diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,188566 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999888609587744, + "eval_steps": 500, + "global_step": 53864, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.713013741863859e-05, + "grad_norm": 0.787381649017334, + "learning_rate": 1.999999993196498e-05, + "loss": 0.683, + "step": 2 + }, + { + "epoch": 7.426027483727717e-05, + "grad_norm": 0.9172313809394836, + "learning_rate": 1.999999972785991e-05, + "loss": 0.5783, + "step": 4 + }, + { + "epoch": 0.00011139041225591575, + "grad_norm": 0.5062880516052246, + "learning_rate": 1.99999993876848e-05, + "loss": 0.3439, + "step": 6 + }, + { + "epoch": 0.00014852054967455435, + "grad_norm": 0.5106889009475708, + "learning_rate": 1.9999998911439648e-05, + "loss": 0.4409, + "step": 8 + }, + { + "epoch": 0.00018565068709319293, + "grad_norm": 0.41305649280548096, + "learning_rate": 1.9999998299124467e-05, + "loss": 0.3783, + "step": 10 + }, + { + "epoch": 0.0002227808245118315, + "grad_norm": 0.4339084327220917, + "learning_rate": 1.999999755073926e-05, + "loss": 0.3818, + "step": 12 + }, + { + "epoch": 0.0002599109619304701, + "grad_norm": 0.5772333741188049, + "learning_rate": 1.9999996666284042e-05, + "loss": 0.481, + "step": 14 + }, + { + "epoch": 0.0002970410993491087, + "grad_norm": 0.4155276417732239, + "learning_rate": 1.9999995645758825e-05, + "loss": 0.3249, + "step": 16 + }, + { + "epoch": 0.0003341712367677473, + "grad_norm": 0.6395503282546997, + "learning_rate": 1.9999994489163616e-05, + "loss": 0.6011, + "step": 18 + }, + { + "epoch": 0.00037130137418638586, + "grad_norm": 0.5251273512840271, + "learning_rate": 1.999999319649844e-05, + "loss": 0.4956, + "step": 20 + }, + { + "epoch": 0.00040843151160502446, + "grad_norm": 0.43630415201187134, + "learning_rate": 1.9999991767763308e-05, + "loss": 0.5067, + "step": 22 + }, + { + "epoch": 0.000445561649023663, + "grad_norm": 0.5417091846466064, + "learning_rate": 1.9999990202958243e-05, + "loss": 0.3808, + "step": 24 + }, + { + "epoch": 0.0004826917864423016, + "grad_norm": 0.6985338926315308, + "learning_rate": 1.9999988502083257e-05, + "loss": 0.4041, + "step": 26 + }, + { + "epoch": 0.0005198219238609402, + "grad_norm": 0.6815202236175537, + "learning_rate": 1.9999986665138392e-05, + "loss": 0.651, + "step": 28 + }, + { + "epoch": 0.0005569520612795788, + "grad_norm": 0.6399803161621094, + "learning_rate": 1.9999984692123657e-05, + "loss": 0.4616, + "step": 30 + }, + { + "epoch": 0.0005940821986982174, + "grad_norm": 0.6329267621040344, + "learning_rate": 1.9999982583039082e-05, + "loss": 0.4948, + "step": 32 + }, + { + "epoch": 0.000631212336116856, + "grad_norm": 0.40591558814048767, + "learning_rate": 1.99999803378847e-05, + "loss": 0.3639, + "step": 34 + }, + { + "epoch": 0.0006683424735354946, + "grad_norm": 0.5665282011032104, + "learning_rate": 1.9999977956660537e-05, + "loss": 0.3756, + "step": 36 + }, + { + "epoch": 0.0007054726109541332, + "grad_norm": 0.5512558817863464, + "learning_rate": 1.9999975439366632e-05, + "loss": 0.3618, + "step": 38 + }, + { + "epoch": 0.0007426027483727717, + "grad_norm": 0.4901846647262573, + "learning_rate": 1.999997278600301e-05, + "loss": 0.4045, + "step": 40 + }, + { + "epoch": 0.0007797328857914103, + "grad_norm": 0.4119759202003479, + "learning_rate": 1.999996999656971e-05, + "loss": 0.329, + "step": 42 + }, + { + "epoch": 0.0008168630232100489, + "grad_norm": 0.5291385650634766, + "learning_rate": 1.9999967071066777e-05, + "loss": 0.3533, + "step": 44 + }, + { + "epoch": 0.0008539931606286875, + "grad_norm": 0.5106402635574341, + "learning_rate": 1.9999964009494243e-05, + "loss": 0.3966, + "step": 46 + }, + { + "epoch": 0.000891123298047326, + "grad_norm": 0.3596688508987427, + "learning_rate": 1.999996081185215e-05, + "loss": 0.2788, + "step": 48 + }, + { + "epoch": 0.0009282534354659647, + "grad_norm": 0.4595358073711395, + "learning_rate": 1.9999957478140545e-05, + "loss": 0.5023, + "step": 50 + }, + { + "epoch": 0.0009653835728846033, + "grad_norm": 0.5351885557174683, + "learning_rate": 1.9999954008359476e-05, + "loss": 0.3821, + "step": 52 + }, + { + "epoch": 0.0010025137103032418, + "grad_norm": 0.49715960025787354, + "learning_rate": 1.9999950402508984e-05, + "loss": 0.313, + "step": 54 + }, + { + "epoch": 0.0010396438477218804, + "grad_norm": 0.5796110033988953, + "learning_rate": 1.9999946660589117e-05, + "loss": 0.3942, + "step": 56 + }, + { + "epoch": 0.001076773985140519, + "grad_norm": 0.39613473415374756, + "learning_rate": 1.9999942782599933e-05, + "loss": 0.2985, + "step": 58 + }, + { + "epoch": 0.0011139041225591577, + "grad_norm": 0.5466755628585815, + "learning_rate": 1.999993876854148e-05, + "loss": 0.4769, + "step": 60 + }, + { + "epoch": 0.0011510342599777962, + "grad_norm": 0.3947904109954834, + "learning_rate": 1.9999934618413816e-05, + "loss": 0.4659, + "step": 62 + }, + { + "epoch": 0.0011881643973964348, + "grad_norm": 0.41004839539527893, + "learning_rate": 1.999993033221699e-05, + "loss": 0.4927, + "step": 64 + }, + { + "epoch": 0.0012252945348150733, + "grad_norm": 0.4754455089569092, + "learning_rate": 1.999992590995107e-05, + "loss": 0.4159, + "step": 66 + }, + { + "epoch": 0.001262424672233712, + "grad_norm": 0.7578725218772888, + "learning_rate": 1.9999921351616108e-05, + "loss": 0.2805, + "step": 68 + }, + { + "epoch": 0.0012995548096523504, + "grad_norm": 0.6312479972839355, + "learning_rate": 1.9999916657212174e-05, + "loss": 0.4298, + "step": 70 + }, + { + "epoch": 0.0013366849470709892, + "grad_norm": 0.3991794288158417, + "learning_rate": 1.9999911826739323e-05, + "loss": 0.4769, + "step": 72 + }, + { + "epoch": 0.0013738150844896278, + "grad_norm": 0.6018489003181458, + "learning_rate": 1.999990686019763e-05, + "loss": 0.5276, + "step": 74 + }, + { + "epoch": 0.0014109452219082663, + "grad_norm": 0.42345333099365234, + "learning_rate": 1.9999901757587153e-05, + "loss": 0.6705, + "step": 76 + }, + { + "epoch": 0.0014480753593269049, + "grad_norm": 0.478071928024292, + "learning_rate": 1.999989651890797e-05, + "loss": 0.5739, + "step": 78 + }, + { + "epoch": 0.0014852054967455434, + "grad_norm": 0.5030924677848816, + "learning_rate": 1.9999891144160146e-05, + "loss": 0.6992, + "step": 80 + }, + { + "epoch": 0.001522335634164182, + "grad_norm": 0.5798288583755493, + "learning_rate": 1.9999885633343757e-05, + "loss": 0.4498, + "step": 82 + }, + { + "epoch": 0.0015594657715828205, + "grad_norm": 0.4327036142349243, + "learning_rate": 1.9999879986458877e-05, + "loss": 0.1748, + "step": 84 + }, + { + "epoch": 0.0015965959090014593, + "grad_norm": 0.4869978129863739, + "learning_rate": 1.999987420350559e-05, + "loss": 0.4885, + "step": 86 + }, + { + "epoch": 0.0016337260464200979, + "grad_norm": 0.3522797226905823, + "learning_rate": 1.999986828448396e-05, + "loss": 0.5144, + "step": 88 + }, + { + "epoch": 0.0016708561838387364, + "grad_norm": 0.34214866161346436, + "learning_rate": 1.9999862229394077e-05, + "loss": 0.4755, + "step": 90 + }, + { + "epoch": 0.001707986321257375, + "grad_norm": 0.530851423740387, + "learning_rate": 1.9999856038236024e-05, + "loss": 0.3345, + "step": 92 + }, + { + "epoch": 0.0017451164586760135, + "grad_norm": 0.4837367832660675, + "learning_rate": 1.9999849711009883e-05, + "loss": 0.3591, + "step": 94 + }, + { + "epoch": 0.001782246596094652, + "grad_norm": 0.7023651599884033, + "learning_rate": 1.9999843247715743e-05, + "loss": 0.3494, + "step": 96 + }, + { + "epoch": 0.0018193767335132906, + "grad_norm": 0.39528682827949524, + "learning_rate": 1.9999836648353685e-05, + "loss": 0.5679, + "step": 98 + }, + { + "epoch": 0.0018565068709319294, + "grad_norm": 0.4950597882270813, + "learning_rate": 1.9999829912923803e-05, + "loss": 0.3934, + "step": 100 + }, + { + "epoch": 0.001893637008350568, + "grad_norm": 0.44712716341018677, + "learning_rate": 1.9999823041426193e-05, + "loss": 0.4103, + "step": 102 + }, + { + "epoch": 0.0019307671457692065, + "grad_norm": 0.6989088654518127, + "learning_rate": 1.9999816033860943e-05, + "loss": 0.337, + "step": 104 + }, + { + "epoch": 0.0019678972831878453, + "grad_norm": 0.4854273796081543, + "learning_rate": 1.999980889022815e-05, + "loss": 0.3561, + "step": 106 + }, + { + "epoch": 0.0020050274206064836, + "grad_norm": 0.39766597747802734, + "learning_rate": 1.9999801610527908e-05, + "loss": 0.4093, + "step": 108 + }, + { + "epoch": 0.0020421575580251224, + "grad_norm": 0.7291239500045776, + "learning_rate": 1.9999794194760323e-05, + "loss": 0.5149, + "step": 110 + }, + { + "epoch": 0.0020792876954437607, + "grad_norm": 0.6360824704170227, + "learning_rate": 1.999978664292549e-05, + "loss": 0.3331, + "step": 112 + }, + { + "epoch": 0.0021164178328623995, + "grad_norm": 0.6387815475463867, + "learning_rate": 1.9999778955023513e-05, + "loss": 0.5065, + "step": 114 + }, + { + "epoch": 0.002153547970281038, + "grad_norm": 0.46091097593307495, + "learning_rate": 1.9999771131054498e-05, + "loss": 0.2113, + "step": 116 + }, + { + "epoch": 0.0021906781076996766, + "grad_norm": 0.4365173578262329, + "learning_rate": 1.999976317101855e-05, + "loss": 0.3731, + "step": 118 + }, + { + "epoch": 0.0022278082451183154, + "grad_norm": 0.5046524405479431, + "learning_rate": 1.999975507491578e-05, + "loss": 0.3662, + "step": 120 + }, + { + "epoch": 0.0022649383825369537, + "grad_norm": 0.357047438621521, + "learning_rate": 1.999974684274629e-05, + "loss": 0.3079, + "step": 122 + }, + { + "epoch": 0.0023020685199555925, + "grad_norm": 0.6360185146331787, + "learning_rate": 1.9999738474510203e-05, + "loss": 0.4996, + "step": 124 + }, + { + "epoch": 0.002339198657374231, + "grad_norm": 0.5160127282142639, + "learning_rate": 1.9999729970207632e-05, + "loss": 0.4182, + "step": 126 + }, + { + "epoch": 0.0023763287947928696, + "grad_norm": 0.5332038402557373, + "learning_rate": 1.999972132983868e-05, + "loss": 0.4211, + "step": 128 + }, + { + "epoch": 0.002413458932211508, + "grad_norm": 0.34042033553123474, + "learning_rate": 1.999971255340348e-05, + "loss": 0.3714, + "step": 130 + }, + { + "epoch": 0.0024505890696301467, + "grad_norm": 0.37444549798965454, + "learning_rate": 1.9999703640902142e-05, + "loss": 0.2689, + "step": 132 + }, + { + "epoch": 0.0024877192070487854, + "grad_norm": 0.41683492064476013, + "learning_rate": 1.999969459233479e-05, + "loss": 0.4776, + "step": 134 + }, + { + "epoch": 0.002524849344467424, + "grad_norm": 0.4115779995918274, + "learning_rate": 1.999968540770155e-05, + "loss": 0.3747, + "step": 136 + }, + { + "epoch": 0.0025619794818860626, + "grad_norm": 0.5225721001625061, + "learning_rate": 1.999967608700254e-05, + "loss": 0.6104, + "step": 138 + }, + { + "epoch": 0.002599109619304701, + "grad_norm": 0.3780425190925598, + "learning_rate": 1.999966663023789e-05, + "loss": 0.5691, + "step": 140 + }, + { + "epoch": 0.0026362397567233397, + "grad_norm": 0.4317651093006134, + "learning_rate": 1.9999657037407733e-05, + "loss": 0.4652, + "step": 142 + }, + { + "epoch": 0.0026733698941419784, + "grad_norm": 0.3654298186302185, + "learning_rate": 1.9999647308512197e-05, + "loss": 0.3942, + "step": 144 + }, + { + "epoch": 0.0027105000315606168, + "grad_norm": 0.6619117259979248, + "learning_rate": 1.9999637443551414e-05, + "loss": 0.3976, + "step": 146 + }, + { + "epoch": 0.0027476301689792555, + "grad_norm": 0.588629424571991, + "learning_rate": 1.9999627442525514e-05, + "loss": 0.4946, + "step": 148 + }, + { + "epoch": 0.002784760306397894, + "grad_norm": 0.4449610114097595, + "learning_rate": 1.999961730543464e-05, + "loss": 0.4977, + "step": 150 + }, + { + "epoch": 0.0028218904438165326, + "grad_norm": 0.44638001918792725, + "learning_rate": 1.9999607032278925e-05, + "loss": 0.3224, + "step": 152 + }, + { + "epoch": 0.002859020581235171, + "grad_norm": 0.33068954944610596, + "learning_rate": 1.9999596623058513e-05, + "loss": 0.6474, + "step": 154 + }, + { + "epoch": 0.0028961507186538098, + "grad_norm": 0.302399605512619, + "learning_rate": 1.9999586077773538e-05, + "loss": 0.5662, + "step": 156 + }, + { + "epoch": 0.0029332808560724485, + "grad_norm": 0.448861300945282, + "learning_rate": 1.9999575396424154e-05, + "loss": 0.3873, + "step": 158 + }, + { + "epoch": 0.002970410993491087, + "grad_norm": 0.6153700351715088, + "learning_rate": 1.99995645790105e-05, + "loss": 0.6218, + "step": 160 + }, + { + "epoch": 0.0030075411309097256, + "grad_norm": 0.8051313161849976, + "learning_rate": 1.9999553625532722e-05, + "loss": 0.2921, + "step": 162 + }, + { + "epoch": 0.003044671268328364, + "grad_norm": 0.5313144326210022, + "learning_rate": 1.9999542535990972e-05, + "loss": 0.4455, + "step": 164 + }, + { + "epoch": 0.0030818014057470027, + "grad_norm": 0.5131443738937378, + "learning_rate": 1.9999531310385403e-05, + "loss": 0.3069, + "step": 166 + }, + { + "epoch": 0.003118931543165641, + "grad_norm": 0.3482806384563446, + "learning_rate": 1.9999519948716162e-05, + "loss": 0.3313, + "step": 168 + }, + { + "epoch": 0.00315606168058428, + "grad_norm": 0.6128836870193481, + "learning_rate": 1.9999508450983407e-05, + "loss": 0.5486, + "step": 170 + }, + { + "epoch": 0.0031931918180029186, + "grad_norm": 0.5387942790985107, + "learning_rate": 1.9999496817187294e-05, + "loss": 0.372, + "step": 172 + }, + { + "epoch": 0.003230321955421557, + "grad_norm": 0.46776172518730164, + "learning_rate": 1.9999485047327982e-05, + "loss": 0.257, + "step": 174 + }, + { + "epoch": 0.0032674520928401957, + "grad_norm": 0.45111018419265747, + "learning_rate": 1.999947314140563e-05, + "loss": 0.347, + "step": 176 + }, + { + "epoch": 0.003304582230258834, + "grad_norm": 0.46320462226867676, + "learning_rate": 1.99994610994204e-05, + "loss": 0.4372, + "step": 178 + }, + { + "epoch": 0.003341712367677473, + "grad_norm": 0.46013370156288147, + "learning_rate": 1.9999448921372457e-05, + "loss": 0.5137, + "step": 180 + }, + { + "epoch": 0.003378842505096111, + "grad_norm": 0.3580809533596039, + "learning_rate": 1.9999436607261968e-05, + "loss": 0.4795, + "step": 182 + }, + { + "epoch": 0.00341597264251475, + "grad_norm": 0.5853656530380249, + "learning_rate": 1.9999424157089093e-05, + "loss": 0.2693, + "step": 184 + }, + { + "epoch": 0.0034531027799333887, + "grad_norm": 0.4293530583381653, + "learning_rate": 1.999941157085401e-05, + "loss": 0.1988, + "step": 186 + }, + { + "epoch": 0.003490232917352027, + "grad_norm": 0.48474812507629395, + "learning_rate": 1.9999398848556886e-05, + "loss": 0.4293, + "step": 188 + }, + { + "epoch": 0.003527363054770666, + "grad_norm": 0.46240234375, + "learning_rate": 1.99993859901979e-05, + "loss": 0.3688, + "step": 190 + }, + { + "epoch": 0.003564493192189304, + "grad_norm": 0.3638309836387634, + "learning_rate": 1.999937299577722e-05, + "loss": 0.4726, + "step": 192 + }, + { + "epoch": 0.003601623329607943, + "grad_norm": 0.47168317437171936, + "learning_rate": 1.9999359865295023e-05, + "loss": 0.5157, + "step": 194 + }, + { + "epoch": 0.0036387534670265812, + "grad_norm": 0.3542977273464203, + "learning_rate": 1.999934659875149e-05, + "loss": 0.4035, + "step": 196 + }, + { + "epoch": 0.00367588360444522, + "grad_norm": 0.41440027952194214, + "learning_rate": 1.9999333196146803e-05, + "loss": 0.3192, + "step": 198 + }, + { + "epoch": 0.003713013741863859, + "grad_norm": 0.547886073589325, + "learning_rate": 1.999931965748114e-05, + "loss": 0.4244, + "step": 200 + }, + { + "epoch": 0.003750143879282497, + "grad_norm": 0.5255728363990784, + "learning_rate": 1.9999305982754694e-05, + "loss": 0.4748, + "step": 202 + }, + { + "epoch": 0.003787274016701136, + "grad_norm": 0.3951583206653595, + "learning_rate": 1.9999292171967636e-05, + "loss": 0.3381, + "step": 204 + }, + { + "epoch": 0.0038244041541197742, + "grad_norm": 0.3999170660972595, + "learning_rate": 1.9999278225120168e-05, + "loss": 0.325, + "step": 206 + }, + { + "epoch": 0.003861534291538413, + "grad_norm": 0.5085821151733398, + "learning_rate": 1.9999264142212476e-05, + "loss": 0.4041, + "step": 208 + }, + { + "epoch": 0.0038986644289570518, + "grad_norm": 0.49167922139167786, + "learning_rate": 1.9999249923244747e-05, + "loss": 0.2427, + "step": 210 + }, + { + "epoch": 0.0039357945663756905, + "grad_norm": 0.4795328974723816, + "learning_rate": 1.999923556821718e-05, + "loss": 0.2821, + "step": 212 + }, + { + "epoch": 0.0039729247037943284, + "grad_norm": 0.646994411945343, + "learning_rate": 1.999922107712997e-05, + "loss": 0.3946, + "step": 214 + }, + { + "epoch": 0.004010054841212967, + "grad_norm": 0.3711841106414795, + "learning_rate": 1.999920644998331e-05, + "loss": 0.3623, + "step": 216 + }, + { + "epoch": 0.004047184978631606, + "grad_norm": 0.48085135221481323, + "learning_rate": 1.99991916867774e-05, + "loss": 0.4661, + "step": 218 + }, + { + "epoch": 0.004084315116050245, + "grad_norm": 0.34979113936424255, + "learning_rate": 1.999917678751244e-05, + "loss": 0.495, + "step": 220 + }, + { + "epoch": 0.004121445253468883, + "grad_norm": 0.49509361386299133, + "learning_rate": 1.9999161752188642e-05, + "loss": 0.3476, + "step": 222 + }, + { + "epoch": 0.004158575390887521, + "grad_norm": 0.4042584300041199, + "learning_rate": 1.9999146580806198e-05, + "loss": 0.2823, + "step": 224 + }, + { + "epoch": 0.00419570552830616, + "grad_norm": 0.4594835937023163, + "learning_rate": 1.9999131273365323e-05, + "loss": 0.5605, + "step": 226 + }, + { + "epoch": 0.004232835665724799, + "grad_norm": 0.47700241208076477, + "learning_rate": 1.999911582986622e-05, + "loss": 0.5574, + "step": 228 + }, + { + "epoch": 0.004269965803143438, + "grad_norm": 0.33174213767051697, + "learning_rate": 1.99991002503091e-05, + "loss": 0.4264, + "step": 230 + }, + { + "epoch": 0.004307095940562076, + "grad_norm": 0.44007235765457153, + "learning_rate": 1.9999084534694182e-05, + "loss": 0.5912, + "step": 232 + }, + { + "epoch": 0.004344226077980714, + "grad_norm": 0.5250499248504639, + "learning_rate": 1.999906868302167e-05, + "loss": 0.2697, + "step": 234 + }, + { + "epoch": 0.004381356215399353, + "grad_norm": 0.4792839288711548, + "learning_rate": 1.9999052695291784e-05, + "loss": 0.3774, + "step": 236 + }, + { + "epoch": 0.004418486352817992, + "grad_norm": 0.5226247906684875, + "learning_rate": 1.9999036571504745e-05, + "loss": 0.2578, + "step": 238 + }, + { + "epoch": 0.004455616490236631, + "grad_norm": 0.5130003690719604, + "learning_rate": 1.9999020311660763e-05, + "loss": 0.4903, + "step": 240 + }, + { + "epoch": 0.004492746627655269, + "grad_norm": 0.5420089960098267, + "learning_rate": 1.999900391576007e-05, + "loss": 0.4147, + "step": 242 + }, + { + "epoch": 0.004529876765073907, + "grad_norm": 0.5653980374336243, + "learning_rate": 1.999898738380288e-05, + "loss": 0.4098, + "step": 244 + }, + { + "epoch": 0.004567006902492546, + "grad_norm": 0.3912901282310486, + "learning_rate": 1.9998970715789428e-05, + "loss": 0.2877, + "step": 246 + }, + { + "epoch": 0.004604137039911185, + "grad_norm": 0.32257866859436035, + "learning_rate": 1.999895391171993e-05, + "loss": 0.4531, + "step": 248 + }, + { + "epoch": 0.004641267177329824, + "grad_norm": 0.47456029057502747, + "learning_rate": 1.9998936971594622e-05, + "loss": 0.3763, + "step": 250 + }, + { + "epoch": 0.004678397314748462, + "grad_norm": 0.4561237096786499, + "learning_rate": 1.999891989541373e-05, + "loss": 0.4848, + "step": 252 + }, + { + "epoch": 0.0047155274521671, + "grad_norm": 0.5080367922782898, + "learning_rate": 1.9998902683177488e-05, + "loss": 0.5258, + "step": 254 + }, + { + "epoch": 0.004752657589585739, + "grad_norm": 0.3973216116428375, + "learning_rate": 1.9998885334886132e-05, + "loss": 0.3827, + "step": 256 + }, + { + "epoch": 0.004789787727004378, + "grad_norm": 0.3797387480735779, + "learning_rate": 1.99988678505399e-05, + "loss": 0.389, + "step": 258 + }, + { + "epoch": 0.004826917864423016, + "grad_norm": 0.34787413477897644, + "learning_rate": 1.999885023013902e-05, + "loss": 0.331, + "step": 260 + }, + { + "epoch": 0.004864048001841655, + "grad_norm": 0.5755864381790161, + "learning_rate": 1.9998832473683745e-05, + "loss": 0.5135, + "step": 262 + }, + { + "epoch": 0.004901178139260293, + "grad_norm": 0.49313580989837646, + "learning_rate": 1.9998814581174307e-05, + "loss": 0.4115, + "step": 264 + }, + { + "epoch": 0.004938308276678932, + "grad_norm": 0.34419214725494385, + "learning_rate": 1.9998796552610954e-05, + "loss": 0.2674, + "step": 266 + }, + { + "epoch": 0.004975438414097571, + "grad_norm": 0.4715988039970398, + "learning_rate": 1.9998778387993923e-05, + "loss": 0.2613, + "step": 268 + }, + { + "epoch": 0.005012568551516209, + "grad_norm": 0.29773658514022827, + "learning_rate": 1.9998760087323476e-05, + "loss": 0.248, + "step": 270 + }, + { + "epoch": 0.005049698688934848, + "grad_norm": 0.28970152139663696, + "learning_rate": 1.999874165059985e-05, + "loss": 0.276, + "step": 272 + }, + { + "epoch": 0.005086828826353486, + "grad_norm": 0.526797354221344, + "learning_rate": 1.9998723077823302e-05, + "loss": 0.3526, + "step": 274 + }, + { + "epoch": 0.005123958963772125, + "grad_norm": 0.34837856888771057, + "learning_rate": 1.999870436899408e-05, + "loss": 0.2613, + "step": 276 + }, + { + "epoch": 0.005161089101190764, + "grad_norm": 0.3190463185310364, + "learning_rate": 1.9998685524112445e-05, + "loss": 0.2662, + "step": 278 + }, + { + "epoch": 0.005198219238609402, + "grad_norm": 0.5121004581451416, + "learning_rate": 1.9998666543178647e-05, + "loss": 0.3837, + "step": 280 + }, + { + "epoch": 0.0052353493760280406, + "grad_norm": 0.46542319655418396, + "learning_rate": 1.999864742619295e-05, + "loss": 0.1741, + "step": 282 + }, + { + "epoch": 0.005272479513446679, + "grad_norm": 0.481201708316803, + "learning_rate": 1.9998628173155607e-05, + "loss": 0.5364, + "step": 284 + }, + { + "epoch": 0.005309609650865318, + "grad_norm": 0.25467976927757263, + "learning_rate": 1.9998608784066883e-05, + "loss": 0.2166, + "step": 286 + }, + { + "epoch": 0.005346739788283957, + "grad_norm": 0.4101756811141968, + "learning_rate": 1.9998589258927044e-05, + "loss": 0.3244, + "step": 288 + }, + { + "epoch": 0.005383869925702595, + "grad_norm": 0.47906413674354553, + "learning_rate": 1.9998569597736355e-05, + "loss": 0.258, + "step": 290 + }, + { + "epoch": 0.0054210000631212335, + "grad_norm": 0.42529603838920593, + "learning_rate": 1.9998549800495083e-05, + "loss": 0.3635, + "step": 292 + }, + { + "epoch": 0.005458130200539872, + "grad_norm": 0.47230392694473267, + "learning_rate": 1.9998529867203496e-05, + "loss": 0.4149, + "step": 294 + }, + { + "epoch": 0.005495260337958511, + "grad_norm": 0.39145946502685547, + "learning_rate": 1.9998509797861868e-05, + "loss": 0.468, + "step": 296 + }, + { + "epoch": 0.005532390475377149, + "grad_norm": 0.38009825348854065, + "learning_rate": 1.999848959247047e-05, + "loss": 0.4095, + "step": 298 + }, + { + "epoch": 0.005569520612795788, + "grad_norm": 0.4906117916107178, + "learning_rate": 1.999846925102958e-05, + "loss": 0.4982, + "step": 300 + }, + { + "epoch": 0.0056066507502144265, + "grad_norm": 0.26200997829437256, + "learning_rate": 1.999844877353947e-05, + "loss": 0.3007, + "step": 302 + }, + { + "epoch": 0.005643780887633065, + "grad_norm": 0.3799205720424652, + "learning_rate": 1.9998428160000422e-05, + "loss": 0.4404, + "step": 304 + }, + { + "epoch": 0.005680911025051704, + "grad_norm": 0.39745423197746277, + "learning_rate": 1.9998407410412715e-05, + "loss": 0.5053, + "step": 306 + }, + { + "epoch": 0.005718041162470342, + "grad_norm": 0.342948853969574, + "learning_rate": 1.9998386524776633e-05, + "loss": 0.3366, + "step": 308 + }, + { + "epoch": 0.005755171299888981, + "grad_norm": 0.5430181622505188, + "learning_rate": 1.9998365503092457e-05, + "loss": 0.3058, + "step": 310 + }, + { + "epoch": 0.0057923014373076195, + "grad_norm": 0.3937097191810608, + "learning_rate": 1.9998344345360478e-05, + "loss": 0.3755, + "step": 312 + }, + { + "epoch": 0.005829431574726258, + "grad_norm": 0.46196967363357544, + "learning_rate": 1.9998323051580977e-05, + "loss": 0.3913, + "step": 314 + }, + { + "epoch": 0.005866561712144897, + "grad_norm": 0.7649854421615601, + "learning_rate": 1.999830162175425e-05, + "loss": 0.2668, + "step": 316 + }, + { + "epoch": 0.005903691849563535, + "grad_norm": 0.5333899855613708, + "learning_rate": 1.9998280055880586e-05, + "loss": 0.4538, + "step": 318 + }, + { + "epoch": 0.005940821986982174, + "grad_norm": 0.44176235795021057, + "learning_rate": 1.999825835396028e-05, + "loss": 0.3859, + "step": 320 + }, + { + "epoch": 0.0059779521244008125, + "grad_norm": 0.27276358008384705, + "learning_rate": 1.9998236515993623e-05, + "loss": 0.3317, + "step": 322 + }, + { + "epoch": 0.006015082261819451, + "grad_norm": 0.40533676743507385, + "learning_rate": 1.9998214541980915e-05, + "loss": 0.4542, + "step": 324 + }, + { + "epoch": 0.006052212399238089, + "grad_norm": 0.4885386526584625, + "learning_rate": 1.999819243192246e-05, + "loss": 0.4974, + "step": 326 + }, + { + "epoch": 0.006089342536656728, + "grad_norm": 0.4333484172821045, + "learning_rate": 1.999817018581855e-05, + "loss": 0.207, + "step": 328 + }, + { + "epoch": 0.006126472674075367, + "grad_norm": 0.4982893466949463, + "learning_rate": 1.9998147803669495e-05, + "loss": 0.4078, + "step": 330 + }, + { + "epoch": 0.0061636028114940055, + "grad_norm": 0.5079017281532288, + "learning_rate": 1.9998125285475595e-05, + "loss": 0.3324, + "step": 332 + }, + { + "epoch": 0.006200732948912644, + "grad_norm": 0.4985852539539337, + "learning_rate": 1.9998102631237155e-05, + "loss": 0.3283, + "step": 334 + }, + { + "epoch": 0.006237863086331282, + "grad_norm": 0.2881741523742676, + "learning_rate": 1.999807984095449e-05, + "loss": 0.2229, + "step": 336 + }, + { + "epoch": 0.006274993223749921, + "grad_norm": 0.37681514024734497, + "learning_rate": 1.99980569146279e-05, + "loss": 0.3103, + "step": 338 + }, + { + "epoch": 0.00631212336116856, + "grad_norm": 0.40298473834991455, + "learning_rate": 1.999803385225771e-05, + "loss": 0.4415, + "step": 340 + }, + { + "epoch": 0.0063492534985871985, + "grad_norm": 0.5242092609405518, + "learning_rate": 1.9998010653844222e-05, + "loss": 0.618, + "step": 342 + }, + { + "epoch": 0.006386383636005837, + "grad_norm": 0.36084824800491333, + "learning_rate": 1.999798731938776e-05, + "loss": 0.298, + "step": 344 + }, + { + "epoch": 0.006423513773424475, + "grad_norm": 0.43370696902275085, + "learning_rate": 1.9997963848888637e-05, + "loss": 0.4926, + "step": 346 + }, + { + "epoch": 0.006460643910843114, + "grad_norm": 0.38683441281318665, + "learning_rate": 1.999794024234717e-05, + "loss": 0.3756, + "step": 348 + }, + { + "epoch": 0.006497774048261753, + "grad_norm": 0.5789943933486938, + "learning_rate": 1.999791649976369e-05, + "loss": 0.3597, + "step": 350 + }, + { + "epoch": 0.0065349041856803914, + "grad_norm": 0.38878417015075684, + "learning_rate": 1.9997892621138512e-05, + "loss": 0.2021, + "step": 352 + }, + { + "epoch": 0.00657203432309903, + "grad_norm": 0.39716699719429016, + "learning_rate": 1.9997868606471958e-05, + "loss": 0.6617, + "step": 354 + }, + { + "epoch": 0.006609164460517668, + "grad_norm": 0.2856186628341675, + "learning_rate": 1.9997844455764364e-05, + "loss": 0.3369, + "step": 356 + }, + { + "epoch": 0.006646294597936307, + "grad_norm": 0.4489436447620392, + "learning_rate": 1.9997820169016048e-05, + "loss": 0.386, + "step": 358 + }, + { + "epoch": 0.006683424735354946, + "grad_norm": 0.37973126769065857, + "learning_rate": 1.9997795746227353e-05, + "loss": 0.5042, + "step": 360 + }, + { + "epoch": 0.006720554872773584, + "grad_norm": 0.4062805473804474, + "learning_rate": 1.9997771187398604e-05, + "loss": 0.2486, + "step": 362 + }, + { + "epoch": 0.006757685010192222, + "grad_norm": 0.5012133121490479, + "learning_rate": 1.9997746492530132e-05, + "loss": 0.3411, + "step": 364 + }, + { + "epoch": 0.006794815147610861, + "grad_norm": 0.39055135846138, + "learning_rate": 1.9997721661622278e-05, + "loss": 0.2739, + "step": 366 + }, + { + "epoch": 0.0068319452850295, + "grad_norm": 0.37451842427253723, + "learning_rate": 1.9997696694675382e-05, + "loss": 0.3286, + "step": 368 + }, + { + "epoch": 0.006869075422448139, + "grad_norm": 0.4690632224082947, + "learning_rate": 1.9997671591689777e-05, + "loss": 0.3944, + "step": 370 + }, + { + "epoch": 0.006906205559866777, + "grad_norm": 0.6265665292739868, + "learning_rate": 1.9997646352665812e-05, + "loss": 0.573, + "step": 372 + }, + { + "epoch": 0.006943335697285415, + "grad_norm": 0.42657193541526794, + "learning_rate": 1.9997620977603823e-05, + "loss": 0.4903, + "step": 374 + }, + { + "epoch": 0.006980465834704054, + "grad_norm": 0.3993040919303894, + "learning_rate": 1.999759546650416e-05, + "loss": 0.3696, + "step": 376 + }, + { + "epoch": 0.007017595972122693, + "grad_norm": 0.5259105563163757, + "learning_rate": 1.999756981936717e-05, + "loss": 0.3929, + "step": 378 + }, + { + "epoch": 0.007054726109541332, + "grad_norm": 0.4576737880706787, + "learning_rate": 1.99975440361932e-05, + "loss": 0.3648, + "step": 380 + }, + { + "epoch": 0.00709185624695997, + "grad_norm": 0.3197403848171234, + "learning_rate": 1.99975181169826e-05, + "loss": 0.2748, + "step": 382 + }, + { + "epoch": 0.007128986384378608, + "grad_norm": 0.38901081681251526, + "learning_rate": 1.9997492061735726e-05, + "loss": 0.3293, + "step": 384 + }, + { + "epoch": 0.007166116521797247, + "grad_norm": 0.32115432620048523, + "learning_rate": 1.999746587045293e-05, + "loss": 0.3153, + "step": 386 + }, + { + "epoch": 0.007203246659215886, + "grad_norm": 0.4031219482421875, + "learning_rate": 1.9997439543134574e-05, + "loss": 0.4489, + "step": 388 + }, + { + "epoch": 0.007240376796634525, + "grad_norm": 0.35943150520324707, + "learning_rate": 1.9997413079781008e-05, + "loss": 0.3411, + "step": 390 + }, + { + "epoch": 0.0072775069340531625, + "grad_norm": 0.639771580696106, + "learning_rate": 1.9997386480392596e-05, + "loss": 0.3086, + "step": 392 + }, + { + "epoch": 0.007314637071471801, + "grad_norm": 0.41053539514541626, + "learning_rate": 1.99973597449697e-05, + "loss": 0.3575, + "step": 394 + }, + { + "epoch": 0.00735176720889044, + "grad_norm": 0.3961491584777832, + "learning_rate": 1.999733287351268e-05, + "loss": 0.311, + "step": 396 + }, + { + "epoch": 0.007388897346309079, + "grad_norm": 0.37600892782211304, + "learning_rate": 1.999730586602191e-05, + "loss": 0.4798, + "step": 398 + }, + { + "epoch": 0.007426027483727718, + "grad_norm": 0.41316813230514526, + "learning_rate": 1.999727872249775e-05, + "loss": 0.4722, + "step": 400 + }, + { + "epoch": 0.0074631576211463555, + "grad_norm": 0.3977676331996918, + "learning_rate": 1.9997251442940574e-05, + "loss": 0.4144, + "step": 402 + }, + { + "epoch": 0.007500287758564994, + "grad_norm": 0.422908753156662, + "learning_rate": 1.999722402735075e-05, + "loss": 0.2458, + "step": 404 + }, + { + "epoch": 0.007537417895983633, + "grad_norm": 0.45909595489501953, + "learning_rate": 1.999719647572865e-05, + "loss": 0.5124, + "step": 406 + }, + { + "epoch": 0.007574548033402272, + "grad_norm": 0.4146987199783325, + "learning_rate": 1.9997168788074655e-05, + "loss": 0.2605, + "step": 408 + }, + { + "epoch": 0.0076116781708209106, + "grad_norm": 0.4815383553504944, + "learning_rate": 1.9997140964389133e-05, + "loss": 0.2757, + "step": 410 + }, + { + "epoch": 0.0076488083082395485, + "grad_norm": 0.6241264939308167, + "learning_rate": 1.9997113004672472e-05, + "loss": 0.4455, + "step": 412 + }, + { + "epoch": 0.007685938445658187, + "grad_norm": 0.5692870020866394, + "learning_rate": 1.9997084908925043e-05, + "loss": 0.3409, + "step": 414 + }, + { + "epoch": 0.007723068583076826, + "grad_norm": 0.30715450644493103, + "learning_rate": 1.9997056677147237e-05, + "loss": 0.2442, + "step": 416 + }, + { + "epoch": 0.007760198720495465, + "grad_norm": 0.4382787346839905, + "learning_rate": 1.9997028309339433e-05, + "loss": 0.486, + "step": 418 + }, + { + "epoch": 0.0077973288579141035, + "grad_norm": 0.5441878437995911, + "learning_rate": 1.9996999805502016e-05, + "loss": 0.5577, + "step": 420 + }, + { + "epoch": 0.007834458995332742, + "grad_norm": 0.2951529026031494, + "learning_rate": 1.999697116563538e-05, + "loss": 0.461, + "step": 422 + }, + { + "epoch": 0.007871589132751381, + "grad_norm": 0.42165622115135193, + "learning_rate": 1.999694238973991e-05, + "loss": 0.547, + "step": 424 + }, + { + "epoch": 0.007908719270170018, + "grad_norm": 0.41160154342651367, + "learning_rate": 1.9996913477816e-05, + "loss": 0.2792, + "step": 426 + }, + { + "epoch": 0.007945849407588657, + "grad_norm": 0.4399774372577667, + "learning_rate": 1.9996884429864038e-05, + "loss": 0.399, + "step": 428 + }, + { + "epoch": 0.007982979545007296, + "grad_norm": 0.4168318808078766, + "learning_rate": 1.9996855245884427e-05, + "loss": 0.279, + "step": 430 + }, + { + "epoch": 0.008020109682425934, + "grad_norm": 0.4693607687950134, + "learning_rate": 1.9996825925877555e-05, + "loss": 0.4932, + "step": 432 + }, + { + "epoch": 0.008057239819844573, + "grad_norm": 0.3294755518436432, + "learning_rate": 1.999679646984383e-05, + "loss": 0.3367, + "step": 434 + }, + { + "epoch": 0.008094369957263212, + "grad_norm": 0.39903226494789124, + "learning_rate": 1.9996766877783647e-05, + "loss": 0.409, + "step": 436 + }, + { + "epoch": 0.00813150009468185, + "grad_norm": 0.3988722264766693, + "learning_rate": 1.9996737149697413e-05, + "loss": 0.3253, + "step": 438 + }, + { + "epoch": 0.00816863023210049, + "grad_norm": 0.3188820481300354, + "learning_rate": 1.9996707285585527e-05, + "loss": 0.2537, + "step": 440 + }, + { + "epoch": 0.008205760369519128, + "grad_norm": 0.37374868988990784, + "learning_rate": 1.99966772854484e-05, + "loss": 0.3668, + "step": 442 + }, + { + "epoch": 0.008242890506937765, + "grad_norm": 0.45591971278190613, + "learning_rate": 1.999664714928644e-05, + "loss": 0.3725, + "step": 444 + }, + { + "epoch": 0.008280020644356404, + "grad_norm": 0.3772117495536804, + "learning_rate": 1.9996616877100053e-05, + "loss": 0.5713, + "step": 446 + }, + { + "epoch": 0.008317150781775043, + "grad_norm": 0.5080755949020386, + "learning_rate": 1.9996586468889653e-05, + "loss": 0.2889, + "step": 448 + }, + { + "epoch": 0.008354280919193682, + "grad_norm": 0.6368932127952576, + "learning_rate": 1.9996555924655658e-05, + "loss": 0.3513, + "step": 450 + }, + { + "epoch": 0.00839141105661232, + "grad_norm": 0.3840327560901642, + "learning_rate": 1.9996525244398478e-05, + "loss": 0.2882, + "step": 452 + }, + { + "epoch": 0.00842854119403096, + "grad_norm": 0.4199620187282562, + "learning_rate": 1.9996494428118533e-05, + "loss": 0.1586, + "step": 454 + }, + { + "epoch": 0.008465671331449598, + "grad_norm": 0.2961635887622833, + "learning_rate": 1.9996463475816245e-05, + "loss": 0.3291, + "step": 456 + }, + { + "epoch": 0.008502801468868237, + "grad_norm": 0.49746212363243103, + "learning_rate": 1.9996432387492025e-05, + "loss": 0.2865, + "step": 458 + }, + { + "epoch": 0.008539931606286875, + "grad_norm": 0.4187294542789459, + "learning_rate": 1.9996401163146308e-05, + "loss": 0.5745, + "step": 460 + }, + { + "epoch": 0.008577061743705514, + "grad_norm": 0.3033667802810669, + "learning_rate": 1.999636980277951e-05, + "loss": 0.4464, + "step": 462 + }, + { + "epoch": 0.008614191881124151, + "grad_norm": 0.3640962839126587, + "learning_rate": 1.9996338306392065e-05, + "loss": 0.284, + "step": 464 + }, + { + "epoch": 0.00865132201854279, + "grad_norm": 0.4276184141635895, + "learning_rate": 1.99963066739844e-05, + "loss": 0.3035, + "step": 466 + }, + { + "epoch": 0.008688452155961429, + "grad_norm": 0.4547252953052521, + "learning_rate": 1.999627490555694e-05, + "loss": 0.4582, + "step": 468 + }, + { + "epoch": 0.008725582293380068, + "grad_norm": 0.48601678013801575, + "learning_rate": 1.999624300111012e-05, + "loss": 0.5454, + "step": 470 + }, + { + "epoch": 0.008762712430798706, + "grad_norm": 0.39640524983406067, + "learning_rate": 1.9996210960644376e-05, + "loss": 0.3945, + "step": 472 + }, + { + "epoch": 0.008799842568217345, + "grad_norm": 0.4193587601184845, + "learning_rate": 1.9996178784160144e-05, + "loss": 0.4255, + "step": 474 + }, + { + "epoch": 0.008836972705635984, + "grad_norm": 0.3313017189502716, + "learning_rate": 1.999614647165786e-05, + "loss": 0.3309, + "step": 476 + }, + { + "epoch": 0.008874102843054623, + "grad_norm": 0.31618499755859375, + "learning_rate": 1.9996114023137967e-05, + "loss": 0.3855, + "step": 478 + }, + { + "epoch": 0.008911232980473261, + "grad_norm": 0.42019137740135193, + "learning_rate": 1.9996081438600902e-05, + "loss": 0.3635, + "step": 480 + }, + { + "epoch": 0.008948363117891898, + "grad_norm": 0.43365445733070374, + "learning_rate": 1.999604871804711e-05, + "loss": 0.4671, + "step": 482 + }, + { + "epoch": 0.008985493255310537, + "grad_norm": 0.7438284754753113, + "learning_rate": 1.9996015861477035e-05, + "loss": 0.4199, + "step": 484 + }, + { + "epoch": 0.009022623392729176, + "grad_norm": 0.42682090401649475, + "learning_rate": 1.9995982868891128e-05, + "loss": 0.4286, + "step": 486 + }, + { + "epoch": 0.009059753530147815, + "grad_norm": 0.3651818335056305, + "learning_rate": 1.9995949740289835e-05, + "loss": 0.5493, + "step": 488 + }, + { + "epoch": 0.009096883667566454, + "grad_norm": 0.385578453540802, + "learning_rate": 1.9995916475673608e-05, + "loss": 0.4674, + "step": 490 + }, + { + "epoch": 0.009134013804985092, + "grad_norm": 0.43144655227661133, + "learning_rate": 1.99958830750429e-05, + "loss": 0.2013, + "step": 492 + }, + { + "epoch": 0.009171143942403731, + "grad_norm": 0.5458886623382568, + "learning_rate": 1.999584953839816e-05, + "loss": 0.3227, + "step": 494 + }, + { + "epoch": 0.00920827407982237, + "grad_norm": 0.5039573311805725, + "learning_rate": 1.9995815865739853e-05, + "loss": 0.4867, + "step": 496 + }, + { + "epoch": 0.009245404217241009, + "grad_norm": 0.3955095112323761, + "learning_rate": 1.9995782057068428e-05, + "loss": 0.3646, + "step": 498 + }, + { + "epoch": 0.009282534354659647, + "grad_norm": 0.42155376076698303, + "learning_rate": 1.9995748112384355e-05, + "loss": 0.4987, + "step": 500 + }, + { + "epoch": 0.009319664492078284, + "grad_norm": 0.4966467618942261, + "learning_rate": 1.9995714031688093e-05, + "loss": 0.4172, + "step": 502 + }, + { + "epoch": 0.009356794629496923, + "grad_norm": 0.4255533814430237, + "learning_rate": 1.9995679814980097e-05, + "loss": 0.3979, + "step": 504 + }, + { + "epoch": 0.009393924766915562, + "grad_norm": 0.37801483273506165, + "learning_rate": 1.9995645462260843e-05, + "loss": 0.313, + "step": 506 + }, + { + "epoch": 0.0094310549043342, + "grad_norm": 0.36350420117378235, + "learning_rate": 1.9995610973530795e-05, + "loss": 0.1838, + "step": 508 + }, + { + "epoch": 0.00946818504175284, + "grad_norm": 0.3894995450973511, + "learning_rate": 1.999557634879042e-05, + "loss": 0.3882, + "step": 510 + }, + { + "epoch": 0.009505315179171478, + "grad_norm": 0.34138813614845276, + "learning_rate": 1.9995541588040193e-05, + "loss": 0.4547, + "step": 512 + }, + { + "epoch": 0.009542445316590117, + "grad_norm": 0.3640008866786957, + "learning_rate": 1.9995506691280585e-05, + "loss": 0.2158, + "step": 514 + }, + { + "epoch": 0.009579575454008756, + "grad_norm": 0.49179133772850037, + "learning_rate": 1.999547165851207e-05, + "loss": 0.1977, + "step": 516 + }, + { + "epoch": 0.009616705591427395, + "grad_norm": 0.4022124111652374, + "learning_rate": 1.9995436489735124e-05, + "loss": 0.3845, + "step": 518 + }, + { + "epoch": 0.009653835728846032, + "grad_norm": 0.3796842098236084, + "learning_rate": 1.9995401184950228e-05, + "loss": 0.4438, + "step": 520 + }, + { + "epoch": 0.00969096586626467, + "grad_norm": 0.4665010869503021, + "learning_rate": 1.9995365744157868e-05, + "loss": 0.298, + "step": 522 + }, + { + "epoch": 0.00972809600368331, + "grad_norm": 0.4929797947406769, + "learning_rate": 1.999533016735851e-05, + "loss": 0.4707, + "step": 524 + }, + { + "epoch": 0.009765226141101948, + "grad_norm": 0.4474024772644043, + "learning_rate": 1.9995294454552654e-05, + "loss": 0.339, + "step": 526 + }, + { + "epoch": 0.009802356278520587, + "grad_norm": 0.6189339756965637, + "learning_rate": 1.9995258605740776e-05, + "loss": 0.323, + "step": 528 + }, + { + "epoch": 0.009839486415939225, + "grad_norm": 0.3080368936061859, + "learning_rate": 1.999522262092337e-05, + "loss": 0.3176, + "step": 530 + }, + { + "epoch": 0.009876616553357864, + "grad_norm": 0.48607268929481506, + "learning_rate": 1.9995186500100923e-05, + "loss": 0.4289, + "step": 532 + }, + { + "epoch": 0.009913746690776503, + "grad_norm": 0.5090469121932983, + "learning_rate": 1.999515024327393e-05, + "loss": 0.3874, + "step": 534 + }, + { + "epoch": 0.009950876828195142, + "grad_norm": 0.42586085200309753, + "learning_rate": 1.9995113850442878e-05, + "loss": 0.3146, + "step": 536 + }, + { + "epoch": 0.00998800696561378, + "grad_norm": 0.41823825240135193, + "learning_rate": 1.9995077321608264e-05, + "loss": 0.3602, + "step": 538 + }, + { + "epoch": 0.010025137103032418, + "grad_norm": 0.4068586528301239, + "learning_rate": 1.9995040656770587e-05, + "loss": 0.482, + "step": 540 + }, + { + "epoch": 0.010062267240451056, + "grad_norm": 0.44053971767425537, + "learning_rate": 1.999500385593035e-05, + "loss": 0.43, + "step": 542 + }, + { + "epoch": 0.010099397377869695, + "grad_norm": 0.3630414605140686, + "learning_rate": 1.9994966919088042e-05, + "loss": 0.304, + "step": 544 + }, + { + "epoch": 0.010136527515288334, + "grad_norm": 0.3131045997142792, + "learning_rate": 1.9994929846244177e-05, + "loss": 0.4051, + "step": 546 + }, + { + "epoch": 0.010173657652706973, + "grad_norm": 0.38520804047584534, + "learning_rate": 1.9994892637399254e-05, + "loss": 0.3213, + "step": 548 + }, + { + "epoch": 0.010210787790125611, + "grad_norm": 0.444692999124527, + "learning_rate": 1.999485529255378e-05, + "loss": 0.4698, + "step": 550 + }, + { + "epoch": 0.01024791792754425, + "grad_norm": 0.34077340364456177, + "learning_rate": 1.9994817811708262e-05, + "loss": 0.4032, + "step": 552 + }, + { + "epoch": 0.010285048064962889, + "grad_norm": 0.3990900218486786, + "learning_rate": 1.9994780194863215e-05, + "loss": 0.4268, + "step": 554 + }, + { + "epoch": 0.010322178202381528, + "grad_norm": 0.34091925621032715, + "learning_rate": 1.9994742442019144e-05, + "loss": 0.38, + "step": 556 + }, + { + "epoch": 0.010359308339800165, + "grad_norm": 0.4409696161746979, + "learning_rate": 1.999470455317657e-05, + "loss": 0.3509, + "step": 558 + }, + { + "epoch": 0.010396438477218804, + "grad_norm": 0.4494052529335022, + "learning_rate": 1.9994666528336e-05, + "loss": 0.2154, + "step": 560 + }, + { + "epoch": 0.010433568614637442, + "grad_norm": 0.2541842460632324, + "learning_rate": 1.9994628367497956e-05, + "loss": 0.5204, + "step": 562 + }, + { + "epoch": 0.010470698752056081, + "grad_norm": 0.3423829674720764, + "learning_rate": 1.999459007066296e-05, + "loss": 0.2155, + "step": 564 + }, + { + "epoch": 0.01050782888947472, + "grad_norm": 0.5962779521942139, + "learning_rate": 1.9994551637831533e-05, + "loss": 0.4892, + "step": 566 + }, + { + "epoch": 0.010544959026893359, + "grad_norm": 0.2996845543384552, + "learning_rate": 1.999451306900419e-05, + "loss": 0.2228, + "step": 568 + }, + { + "epoch": 0.010582089164311997, + "grad_norm": 0.4948548674583435, + "learning_rate": 1.9994474364181465e-05, + "loss": 0.2742, + "step": 570 + }, + { + "epoch": 0.010619219301730636, + "grad_norm": 0.24314939975738525, + "learning_rate": 1.999443552336388e-05, + "loss": 0.3812, + "step": 572 + }, + { + "epoch": 0.010656349439149275, + "grad_norm": 0.4004712402820587, + "learning_rate": 1.9994396546551962e-05, + "loss": 0.5465, + "step": 574 + }, + { + "epoch": 0.010693479576567914, + "grad_norm": 0.30782124400138855, + "learning_rate": 1.9994357433746245e-05, + "loss": 0.4436, + "step": 576 + }, + { + "epoch": 0.01073060971398655, + "grad_norm": 0.3293493092060089, + "learning_rate": 1.999431818494726e-05, + "loss": 0.4877, + "step": 578 + }, + { + "epoch": 0.01076773985140519, + "grad_norm": 0.4713459014892578, + "learning_rate": 1.999427880015554e-05, + "loss": 0.4656, + "step": 580 + }, + { + "epoch": 0.010804869988823828, + "grad_norm": 0.5082594156265259, + "learning_rate": 1.9994239279371624e-05, + "loss": 0.4909, + "step": 582 + }, + { + "epoch": 0.010842000126242467, + "grad_norm": 0.34458476305007935, + "learning_rate": 1.9994199622596047e-05, + "loss": 0.2978, + "step": 584 + }, + { + "epoch": 0.010879130263661106, + "grad_norm": 0.3392041325569153, + "learning_rate": 1.999415982982935e-05, + "loss": 0.3605, + "step": 586 + }, + { + "epoch": 0.010916260401079745, + "grad_norm": 0.39310190081596375, + "learning_rate": 1.9994119901072075e-05, + "loss": 0.294, + "step": 588 + }, + { + "epoch": 0.010953390538498383, + "grad_norm": 0.32911011576652527, + "learning_rate": 1.9994079836324758e-05, + "loss": 0.1853, + "step": 590 + }, + { + "epoch": 0.010990520675917022, + "grad_norm": 0.43393829464912415, + "learning_rate": 1.9994039635587955e-05, + "loss": 0.4223, + "step": 592 + }, + { + "epoch": 0.011027650813335661, + "grad_norm": 0.4050091803073883, + "learning_rate": 1.9993999298862205e-05, + "loss": 0.1945, + "step": 594 + }, + { + "epoch": 0.011064780950754298, + "grad_norm": 0.4151243269443512, + "learning_rate": 1.9993958826148063e-05, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 0.011101911088172937, + "grad_norm": 0.4293023645877838, + "learning_rate": 1.9993918217446078e-05, + "loss": 0.5065, + "step": 598 + }, + { + "epoch": 0.011139041225591576, + "grad_norm": 0.37561461329460144, + "learning_rate": 1.9993877472756798e-05, + "loss": 0.4602, + "step": 600 + }, + { + "epoch": 0.011176171363010214, + "grad_norm": 0.3366253077983856, + "learning_rate": 1.999383659208078e-05, + "loss": 0.3594, + "step": 602 + }, + { + "epoch": 0.011213301500428853, + "grad_norm": 0.38236403465270996, + "learning_rate": 1.9993795575418583e-05, + "loss": 0.3598, + "step": 604 + }, + { + "epoch": 0.011250431637847492, + "grad_norm": 0.3363437354564667, + "learning_rate": 1.9993754422770765e-05, + "loss": 0.206, + "step": 606 + }, + { + "epoch": 0.01128756177526613, + "grad_norm": 0.4060344994068146, + "learning_rate": 1.999371313413788e-05, + "loss": 0.2939, + "step": 608 + }, + { + "epoch": 0.01132469191268477, + "grad_norm": 0.40036308765411377, + "learning_rate": 1.9993671709520494e-05, + "loss": 0.3153, + "step": 610 + }, + { + "epoch": 0.011361822050103408, + "grad_norm": 0.4371497631072998, + "learning_rate": 1.9993630148919173e-05, + "loss": 0.4906, + "step": 612 + }, + { + "epoch": 0.011398952187522045, + "grad_norm": 0.376014769077301, + "learning_rate": 1.999358845233448e-05, + "loss": 0.4481, + "step": 614 + }, + { + "epoch": 0.011436082324940684, + "grad_norm": 0.39666667580604553, + "learning_rate": 1.9993546619766984e-05, + "loss": 0.5355, + "step": 616 + }, + { + "epoch": 0.011473212462359323, + "grad_norm": 0.5106778740882874, + "learning_rate": 1.999350465121725e-05, + "loss": 0.4714, + "step": 618 + }, + { + "epoch": 0.011510342599777961, + "grad_norm": 0.3509008288383484, + "learning_rate": 1.9993462546685852e-05, + "loss": 0.3091, + "step": 620 + }, + { + "epoch": 0.0115474727371966, + "grad_norm": 0.3783280849456787, + "learning_rate": 1.9993420306173363e-05, + "loss": 0.3315, + "step": 622 + }, + { + "epoch": 0.011584602874615239, + "grad_norm": 0.5052120685577393, + "learning_rate": 1.9993377929680355e-05, + "loss": 0.4169, + "step": 624 + }, + { + "epoch": 0.011621733012033878, + "grad_norm": 0.3175913095474243, + "learning_rate": 1.999333541720741e-05, + "loss": 0.4693, + "step": 626 + }, + { + "epoch": 0.011658863149452517, + "grad_norm": 0.3224084675312042, + "learning_rate": 1.9993292768755104e-05, + "loss": 0.3117, + "step": 628 + }, + { + "epoch": 0.011695993286871155, + "grad_norm": 0.298470675945282, + "learning_rate": 1.9993249984324016e-05, + "loss": 0.2555, + "step": 630 + }, + { + "epoch": 0.011733123424289794, + "grad_norm": 0.55222088098526, + "learning_rate": 1.9993207063914727e-05, + "loss": 0.2993, + "step": 632 + }, + { + "epoch": 0.011770253561708431, + "grad_norm": 0.3783060312271118, + "learning_rate": 1.9993164007527824e-05, + "loss": 0.1902, + "step": 634 + }, + { + "epoch": 0.01180738369912707, + "grad_norm": 0.36905935406684875, + "learning_rate": 1.9993120815163893e-05, + "loss": 0.4302, + "step": 636 + }, + { + "epoch": 0.011844513836545709, + "grad_norm": 0.4133588373661041, + "learning_rate": 1.999307748682352e-05, + "loss": 0.3256, + "step": 638 + }, + { + "epoch": 0.011881643973964347, + "grad_norm": 0.5520565509796143, + "learning_rate": 1.9993034022507295e-05, + "loss": 0.2963, + "step": 640 + }, + { + "epoch": 0.011918774111382986, + "grad_norm": 0.5690224170684814, + "learning_rate": 1.9992990422215806e-05, + "loss": 0.4899, + "step": 642 + }, + { + "epoch": 0.011955904248801625, + "grad_norm": 0.5573917031288147, + "learning_rate": 1.9992946685949658e-05, + "loss": 0.4937, + "step": 644 + }, + { + "epoch": 0.011993034386220264, + "grad_norm": 0.533745527267456, + "learning_rate": 1.9992902813709433e-05, + "loss": 0.3984, + "step": 646 + }, + { + "epoch": 0.012030164523638903, + "grad_norm": 0.29780712723731995, + "learning_rate": 1.999285880549573e-05, + "loss": 0.5047, + "step": 648 + }, + { + "epoch": 0.012067294661057541, + "grad_norm": 0.39466410875320435, + "learning_rate": 1.9992814661309157e-05, + "loss": 0.4383, + "step": 650 + }, + { + "epoch": 0.012104424798476178, + "grad_norm": 1.1004207134246826, + "learning_rate": 1.9992770381150305e-05, + "loss": 0.5021, + "step": 652 + }, + { + "epoch": 0.012141554935894817, + "grad_norm": 1.3368676900863647, + "learning_rate": 1.999272596501978e-05, + "loss": 0.4131, + "step": 654 + }, + { + "epoch": 0.012178685073313456, + "grad_norm": 0.4995886981487274, + "learning_rate": 1.9992681412918188e-05, + "loss": 0.3882, + "step": 656 + }, + { + "epoch": 0.012215815210732095, + "grad_norm": 0.3659275770187378, + "learning_rate": 1.999263672484613e-05, + "loss": 0.224, + "step": 658 + }, + { + "epoch": 0.012252945348150733, + "grad_norm": 0.27424532175064087, + "learning_rate": 1.9992591900804223e-05, + "loss": 0.3113, + "step": 660 + }, + { + "epoch": 0.012290075485569372, + "grad_norm": 0.3828636705875397, + "learning_rate": 1.9992546940793065e-05, + "loss": 0.3334, + "step": 662 + }, + { + "epoch": 0.012327205622988011, + "grad_norm": 0.4719410240650177, + "learning_rate": 1.9992501844813275e-05, + "loss": 0.5218, + "step": 664 + }, + { + "epoch": 0.01236433576040665, + "grad_norm": 0.4182148575782776, + "learning_rate": 1.9992456612865473e-05, + "loss": 0.3683, + "step": 666 + }, + { + "epoch": 0.012401465897825288, + "grad_norm": 0.5829437375068665, + "learning_rate": 1.999241124495026e-05, + "loss": 0.4429, + "step": 668 + }, + { + "epoch": 0.012438596035243927, + "grad_norm": 0.7482290863990784, + "learning_rate": 1.9992365741068264e-05, + "loss": 0.4297, + "step": 670 + }, + { + "epoch": 0.012475726172662564, + "grad_norm": 0.38636359572410583, + "learning_rate": 1.99923201012201e-05, + "loss": 0.4251, + "step": 672 + }, + { + "epoch": 0.012512856310081203, + "grad_norm": 0.40750887989997864, + "learning_rate": 1.9992274325406393e-05, + "loss": 0.2977, + "step": 674 + }, + { + "epoch": 0.012549986447499842, + "grad_norm": 0.4604863226413727, + "learning_rate": 1.9992228413627755e-05, + "loss": 0.4084, + "step": 676 + }, + { + "epoch": 0.01258711658491848, + "grad_norm": 0.4410512447357178, + "learning_rate": 1.999218236588483e-05, + "loss": 0.3426, + "step": 678 + }, + { + "epoch": 0.01262424672233712, + "grad_norm": 0.37291786074638367, + "learning_rate": 1.999213618217822e-05, + "loss": 0.355, + "step": 680 + }, + { + "epoch": 0.012661376859755758, + "grad_norm": 0.40774786472320557, + "learning_rate": 1.9992089862508576e-05, + "loss": 0.3963, + "step": 682 + }, + { + "epoch": 0.012698506997174397, + "grad_norm": 0.5577800869941711, + "learning_rate": 1.9992043406876513e-05, + "loss": 0.4311, + "step": 684 + }, + { + "epoch": 0.012735637134593036, + "grad_norm": 0.4383511543273926, + "learning_rate": 1.9991996815282673e-05, + "loss": 0.3518, + "step": 686 + }, + { + "epoch": 0.012772767272011674, + "grad_norm": 0.3836650550365448, + "learning_rate": 1.9991950087727684e-05, + "loss": 0.5306, + "step": 688 + }, + { + "epoch": 0.012809897409430311, + "grad_norm": 0.38333860039711, + "learning_rate": 1.9991903224212185e-05, + "loss": 0.442, + "step": 690 + }, + { + "epoch": 0.01284702754684895, + "grad_norm": 0.5456172227859497, + "learning_rate": 1.999185622473681e-05, + "loss": 0.3217, + "step": 692 + }, + { + "epoch": 0.012884157684267589, + "grad_norm": 0.31290164589881897, + "learning_rate": 1.9991809089302204e-05, + "loss": 0.5287, + "step": 694 + }, + { + "epoch": 0.012921287821686228, + "grad_norm": 0.28537651896476746, + "learning_rate": 1.9991761817909004e-05, + "loss": 0.2736, + "step": 696 + }, + { + "epoch": 0.012958417959104867, + "grad_norm": 0.3639223873615265, + "learning_rate": 1.9991714410557854e-05, + "loss": 0.3133, + "step": 698 + }, + { + "epoch": 0.012995548096523505, + "grad_norm": 0.44784969091415405, + "learning_rate": 1.99916668672494e-05, + "loss": 0.2096, + "step": 700 + }, + { + "epoch": 0.013032678233942144, + "grad_norm": 0.8881627321243286, + "learning_rate": 1.9991619187984287e-05, + "loss": 0.4387, + "step": 702 + }, + { + "epoch": 0.013069808371360783, + "grad_norm": 0.3999682664871216, + "learning_rate": 1.9991571372763165e-05, + "loss": 0.3796, + "step": 704 + }, + { + "epoch": 0.013106938508779422, + "grad_norm": 0.43037664890289307, + "learning_rate": 1.9991523421586688e-05, + "loss": 0.3527, + "step": 706 + }, + { + "epoch": 0.01314406864619806, + "grad_norm": 0.45817339420318604, + "learning_rate": 1.99914753344555e-05, + "loss": 0.5195, + "step": 708 + }, + { + "epoch": 0.013181198783616697, + "grad_norm": 0.402064710855484, + "learning_rate": 1.9991427111370264e-05, + "loss": 0.3444, + "step": 710 + }, + { + "epoch": 0.013218328921035336, + "grad_norm": 0.3542466461658478, + "learning_rate": 1.9991378752331634e-05, + "loss": 0.3361, + "step": 712 + }, + { + "epoch": 0.013255459058453975, + "grad_norm": 0.5512318015098572, + "learning_rate": 1.9991330257340266e-05, + "loss": 0.5111, + "step": 714 + }, + { + "epoch": 0.013292589195872614, + "grad_norm": 0.40329602360725403, + "learning_rate": 1.999128162639682e-05, + "loss": 0.3864, + "step": 716 + }, + { + "epoch": 0.013329719333291253, + "grad_norm": 0.44367092847824097, + "learning_rate": 1.999123285950196e-05, + "loss": 0.3929, + "step": 718 + }, + { + "epoch": 0.013366849470709891, + "grad_norm": 0.4042806625366211, + "learning_rate": 1.9991183956656344e-05, + "loss": 0.4039, + "step": 720 + }, + { + "epoch": 0.01340397960812853, + "grad_norm": 0.3432539999485016, + "learning_rate": 1.9991134917860645e-05, + "loss": 0.3648, + "step": 722 + }, + { + "epoch": 0.013441109745547169, + "grad_norm": 0.38274747133255005, + "learning_rate": 1.9991085743115525e-05, + "loss": 0.4013, + "step": 724 + }, + { + "epoch": 0.013478239882965808, + "grad_norm": 0.32239046692848206, + "learning_rate": 1.9991036432421657e-05, + "loss": 0.3329, + "step": 726 + }, + { + "epoch": 0.013515370020384445, + "grad_norm": 0.36656567454338074, + "learning_rate": 1.999098698577971e-05, + "loss": 0.436, + "step": 728 + }, + { + "epoch": 0.013552500157803083, + "grad_norm": 0.2693771421909332, + "learning_rate": 1.9990937403190353e-05, + "loss": 0.1754, + "step": 730 + }, + { + "epoch": 0.013589630295221722, + "grad_norm": 0.4604509472846985, + "learning_rate": 1.9990887684654264e-05, + "loss": 0.3314, + "step": 732 + }, + { + "epoch": 0.013626760432640361, + "grad_norm": 0.3748721480369568, + "learning_rate": 1.9990837830172122e-05, + "loss": 0.3602, + "step": 734 + }, + { + "epoch": 0.013663890570059, + "grad_norm": 0.28685617446899414, + "learning_rate": 1.9990787839744604e-05, + "loss": 0.3291, + "step": 736 + }, + { + "epoch": 0.013701020707477638, + "grad_norm": 0.4170685410499573, + "learning_rate": 1.9990737713372388e-05, + "loss": 0.3891, + "step": 738 + }, + { + "epoch": 0.013738150844896277, + "grad_norm": 0.4516931474208832, + "learning_rate": 1.999068745105616e-05, + "loss": 0.298, + "step": 740 + }, + { + "epoch": 0.013775280982314916, + "grad_norm": 0.34588298201560974, + "learning_rate": 1.9990637052796596e-05, + "loss": 0.4824, + "step": 742 + }, + { + "epoch": 0.013812411119733555, + "grad_norm": 0.4612199068069458, + "learning_rate": 1.9990586518594394e-05, + "loss": 0.3388, + "step": 744 + }, + { + "epoch": 0.013849541257152194, + "grad_norm": 0.3223876357078552, + "learning_rate": 1.999053584845023e-05, + "loss": 0.1741, + "step": 746 + }, + { + "epoch": 0.01388667139457083, + "grad_norm": 0.5699933767318726, + "learning_rate": 1.99904850423648e-05, + "loss": 0.5256, + "step": 748 + }, + { + "epoch": 0.01392380153198947, + "grad_norm": 0.33922168612480164, + "learning_rate": 1.9990434100338795e-05, + "loss": 0.518, + "step": 750 + }, + { + "epoch": 0.013960931669408108, + "grad_norm": 0.36788812279701233, + "learning_rate": 1.9990383022372903e-05, + "loss": 0.3338, + "step": 752 + }, + { + "epoch": 0.013998061806826747, + "grad_norm": 0.3664512038230896, + "learning_rate": 1.9990331808467824e-05, + "loss": 0.2868, + "step": 754 + }, + { + "epoch": 0.014035191944245386, + "grad_norm": 0.4419233500957489, + "learning_rate": 1.999028045862425e-05, + "loss": 0.4018, + "step": 756 + }, + { + "epoch": 0.014072322081664024, + "grad_norm": 0.41168367862701416, + "learning_rate": 1.999022897284289e-05, + "loss": 0.4652, + "step": 758 + }, + { + "epoch": 0.014109452219082663, + "grad_norm": 0.4428732991218567, + "learning_rate": 1.9990177351124434e-05, + "loss": 0.3778, + "step": 760 + }, + { + "epoch": 0.014146582356501302, + "grad_norm": 0.29541724920272827, + "learning_rate": 1.999012559346959e-05, + "loss": 0.2346, + "step": 762 + }, + { + "epoch": 0.01418371249391994, + "grad_norm": 0.45679134130477905, + "learning_rate": 1.999007369987906e-05, + "loss": 0.3643, + "step": 764 + }, + { + "epoch": 0.014220842631338578, + "grad_norm": 0.4593408405780792, + "learning_rate": 1.999002167035355e-05, + "loss": 0.259, + "step": 766 + }, + { + "epoch": 0.014257972768757217, + "grad_norm": 0.41827812790870667, + "learning_rate": 1.998996950489377e-05, + "loss": 0.4825, + "step": 768 + }, + { + "epoch": 0.014295102906175855, + "grad_norm": 0.3364203870296478, + "learning_rate": 1.9989917203500428e-05, + "loss": 0.2921, + "step": 770 + }, + { + "epoch": 0.014332233043594494, + "grad_norm": 0.444349080324173, + "learning_rate": 1.9989864766174235e-05, + "loss": 0.4103, + "step": 772 + }, + { + "epoch": 0.014369363181013133, + "grad_norm": 0.3217231035232544, + "learning_rate": 1.998981219291591e-05, + "loss": 0.3366, + "step": 774 + }, + { + "epoch": 0.014406493318431772, + "grad_norm": 0.3905393183231354, + "learning_rate": 1.9989759483726158e-05, + "loss": 0.3954, + "step": 776 + }, + { + "epoch": 0.01444362345585041, + "grad_norm": 0.3277907967567444, + "learning_rate": 1.9989706638605707e-05, + "loss": 0.307, + "step": 778 + }, + { + "epoch": 0.01448075359326905, + "grad_norm": 0.4413081705570221, + "learning_rate": 1.998965365755527e-05, + "loss": 0.3412, + "step": 780 + }, + { + "epoch": 0.014517883730687688, + "grad_norm": 0.548041582107544, + "learning_rate": 1.9989600540575566e-05, + "loss": 0.4014, + "step": 782 + }, + { + "epoch": 0.014555013868106325, + "grad_norm": 0.4476698338985443, + "learning_rate": 1.9989547287667325e-05, + "loss": 0.5237, + "step": 784 + }, + { + "epoch": 0.014592144005524964, + "grad_norm": 0.4426359534263611, + "learning_rate": 1.9989493898831265e-05, + "loss": 0.4013, + "step": 786 + }, + { + "epoch": 0.014629274142943603, + "grad_norm": 0.2528916001319885, + "learning_rate": 1.9989440374068115e-05, + "loss": 0.275, + "step": 788 + }, + { + "epoch": 0.014666404280362241, + "grad_norm": 0.4305020570755005, + "learning_rate": 1.9989386713378606e-05, + "loss": 0.3618, + "step": 790 + }, + { + "epoch": 0.01470353441778088, + "grad_norm": 0.3134215176105499, + "learning_rate": 1.9989332916763464e-05, + "loss": 0.2907, + "step": 792 + }, + { + "epoch": 0.014740664555199519, + "grad_norm": 0.31810179352760315, + "learning_rate": 1.9989278984223423e-05, + "loss": 0.4478, + "step": 794 + }, + { + "epoch": 0.014777794692618158, + "grad_norm": 0.2873820960521698, + "learning_rate": 1.9989224915759212e-05, + "loss": 0.3466, + "step": 796 + }, + { + "epoch": 0.014814924830036796, + "grad_norm": 0.39866873621940613, + "learning_rate": 1.9989170711371574e-05, + "loss": 0.374, + "step": 798 + }, + { + "epoch": 0.014852054967455435, + "grad_norm": 0.5070676207542419, + "learning_rate": 1.9989116371061246e-05, + "loss": 0.2803, + "step": 800 + }, + { + "epoch": 0.014889185104874074, + "grad_norm": 0.3221116065979004, + "learning_rate": 1.998906189482896e-05, + "loss": 0.3385, + "step": 802 + }, + { + "epoch": 0.014926315242292711, + "grad_norm": 0.301285982131958, + "learning_rate": 1.9989007282675468e-05, + "loss": 0.4443, + "step": 804 + }, + { + "epoch": 0.01496344537971135, + "grad_norm": 0.9393535256385803, + "learning_rate": 1.9988952534601504e-05, + "loss": 0.4516, + "step": 806 + }, + { + "epoch": 0.015000575517129989, + "grad_norm": 0.36717355251312256, + "learning_rate": 1.998889765060782e-05, + "loss": 0.3675, + "step": 808 + }, + { + "epoch": 0.015037705654548627, + "grad_norm": 0.42650389671325684, + "learning_rate": 1.9988842630695155e-05, + "loss": 0.4048, + "step": 810 + }, + { + "epoch": 0.015074835791967266, + "grad_norm": 0.36395561695098877, + "learning_rate": 1.9988787474864263e-05, + "loss": 0.2658, + "step": 812 + }, + { + "epoch": 0.015111965929385905, + "grad_norm": 0.4178607165813446, + "learning_rate": 1.9988732183115894e-05, + "loss": 0.1977, + "step": 814 + }, + { + "epoch": 0.015149096066804544, + "grad_norm": 0.3648502230644226, + "learning_rate": 1.99886767554508e-05, + "loss": 0.5499, + "step": 816 + }, + { + "epoch": 0.015186226204223182, + "grad_norm": 0.4816143810749054, + "learning_rate": 1.9988621191869735e-05, + "loss": 0.45, + "step": 818 + }, + { + "epoch": 0.015223356341641821, + "grad_norm": 0.3553163707256317, + "learning_rate": 1.9988565492373454e-05, + "loss": 0.3057, + "step": 820 + }, + { + "epoch": 0.015260486479060458, + "grad_norm": 0.354210764169693, + "learning_rate": 1.9988509656962716e-05, + "loss": 0.3653, + "step": 822 + }, + { + "epoch": 0.015297616616479097, + "grad_norm": 0.49755728244781494, + "learning_rate": 1.998845368563828e-05, + "loss": 0.3403, + "step": 824 + }, + { + "epoch": 0.015334746753897736, + "grad_norm": 0.39404791593551636, + "learning_rate": 1.9988397578400912e-05, + "loss": 0.4618, + "step": 826 + }, + { + "epoch": 0.015371876891316374, + "grad_norm": 0.2892735004425049, + "learning_rate": 1.998834133525137e-05, + "loss": 0.2484, + "step": 828 + }, + { + "epoch": 0.015409007028735013, + "grad_norm": 0.3096066415309906, + "learning_rate": 1.9988284956190418e-05, + "loss": 0.3135, + "step": 830 + }, + { + "epoch": 0.015446137166153652, + "grad_norm": 0.4149438738822937, + "learning_rate": 1.9988228441218828e-05, + "loss": 0.3309, + "step": 832 + }, + { + "epoch": 0.01548326730357229, + "grad_norm": 0.2606120705604553, + "learning_rate": 1.9988171790337366e-05, + "loss": 0.4085, + "step": 834 + }, + { + "epoch": 0.01552039744099093, + "grad_norm": 0.3199135363101959, + "learning_rate": 1.9988115003546805e-05, + "loss": 0.3182, + "step": 836 + }, + { + "epoch": 0.015557527578409568, + "grad_norm": 0.5812762379646301, + "learning_rate": 1.9988058080847917e-05, + "loss": 0.5002, + "step": 838 + }, + { + "epoch": 0.015594657715828207, + "grad_norm": 0.42649170756340027, + "learning_rate": 1.9988001022241478e-05, + "loss": 0.5042, + "step": 840 + }, + { + "epoch": 0.015631787853246846, + "grad_norm": 0.409511923789978, + "learning_rate": 1.9987943827728256e-05, + "loss": 0.257, + "step": 842 + }, + { + "epoch": 0.015668917990665485, + "grad_norm": 0.3790913224220276, + "learning_rate": 1.9987886497309043e-05, + "loss": 0.2362, + "step": 844 + }, + { + "epoch": 0.015706048128084123, + "grad_norm": 0.4556298851966858, + "learning_rate": 1.998782903098461e-05, + "loss": 0.3452, + "step": 846 + }, + { + "epoch": 0.015743178265502762, + "grad_norm": 0.48480024933815, + "learning_rate": 1.998777142875574e-05, + "loss": 0.1826, + "step": 848 + }, + { + "epoch": 0.0157803084029214, + "grad_norm": 0.47314268350601196, + "learning_rate": 1.998771369062322e-05, + "loss": 0.4582, + "step": 850 + }, + { + "epoch": 0.015817438540340036, + "grad_norm": 0.3589678704738617, + "learning_rate": 1.998765581658783e-05, + "loss": 0.4627, + "step": 852 + }, + { + "epoch": 0.015854568677758675, + "grad_norm": 0.45375433564186096, + "learning_rate": 1.998759780665036e-05, + "loss": 0.4603, + "step": 854 + }, + { + "epoch": 0.015891698815177314, + "grad_norm": 0.49634289741516113, + "learning_rate": 1.9987539660811603e-05, + "loss": 0.4034, + "step": 856 + }, + { + "epoch": 0.015928828952595953, + "grad_norm": 0.45370957255363464, + "learning_rate": 1.9987481379072346e-05, + "loss": 0.3334, + "step": 858 + }, + { + "epoch": 0.01596595909001459, + "grad_norm": 0.35922300815582275, + "learning_rate": 1.9987422961433384e-05, + "loss": 0.4046, + "step": 860 + }, + { + "epoch": 0.01600308922743323, + "grad_norm": 0.360714852809906, + "learning_rate": 1.998736440789551e-05, + "loss": 0.2845, + "step": 862 + }, + { + "epoch": 0.01604021936485187, + "grad_norm": 0.3543033003807068, + "learning_rate": 1.9987305718459527e-05, + "loss": 0.3602, + "step": 864 + }, + { + "epoch": 0.016077349502270508, + "grad_norm": 0.38280782103538513, + "learning_rate": 1.9987246893126224e-05, + "loss": 0.2207, + "step": 866 + }, + { + "epoch": 0.016114479639689146, + "grad_norm": 0.28148868680000305, + "learning_rate": 1.9987187931896404e-05, + "loss": 0.4137, + "step": 868 + }, + { + "epoch": 0.016151609777107785, + "grad_norm": 0.40491732954978943, + "learning_rate": 1.998712883477088e-05, + "loss": 0.4796, + "step": 870 + }, + { + "epoch": 0.016188739914526424, + "grad_norm": 0.32691875100135803, + "learning_rate": 1.998706960175044e-05, + "loss": 0.4217, + "step": 872 + }, + { + "epoch": 0.016225870051945063, + "grad_norm": 0.4398348331451416, + "learning_rate": 1.99870102328359e-05, + "loss": 0.5123, + "step": 874 + }, + { + "epoch": 0.0162630001893637, + "grad_norm": 0.2852839529514313, + "learning_rate": 1.9986950728028064e-05, + "loss": 0.479, + "step": 876 + }, + { + "epoch": 0.01630013032678234, + "grad_norm": 0.3615904450416565, + "learning_rate": 1.9986891087327744e-05, + "loss": 0.3495, + "step": 878 + }, + { + "epoch": 0.01633726046420098, + "grad_norm": 0.3513162434101105, + "learning_rate": 1.998683131073575e-05, + "loss": 0.3456, + "step": 880 + }, + { + "epoch": 0.016374390601619618, + "grad_norm": 0.3506811857223511, + "learning_rate": 1.9986771398252897e-05, + "loss": 0.3244, + "step": 882 + }, + { + "epoch": 0.016411520739038257, + "grad_norm": 0.3797071874141693, + "learning_rate": 1.9986711349879996e-05, + "loss": 0.524, + "step": 884 + }, + { + "epoch": 0.016448650876456895, + "grad_norm": 0.37659594416618347, + "learning_rate": 1.998665116561787e-05, + "loss": 0.3361, + "step": 886 + }, + { + "epoch": 0.01648578101387553, + "grad_norm": 0.36968308687210083, + "learning_rate": 1.9986590845467334e-05, + "loss": 0.428, + "step": 888 + }, + { + "epoch": 0.01652291115129417, + "grad_norm": 0.5282501578330994, + "learning_rate": 1.998653038942921e-05, + "loss": 0.4258, + "step": 890 + }, + { + "epoch": 0.016560041288712808, + "grad_norm": 0.346874862909317, + "learning_rate": 1.998646979750432e-05, + "loss": 0.4382, + "step": 892 + }, + { + "epoch": 0.016597171426131447, + "grad_norm": 0.3017529249191284, + "learning_rate": 1.998640906969349e-05, + "loss": 0.2046, + "step": 894 + }, + { + "epoch": 0.016634301563550086, + "grad_norm": 0.34062352776527405, + "learning_rate": 1.9986348205997545e-05, + "loss": 0.379, + "step": 896 + }, + { + "epoch": 0.016671431700968724, + "grad_norm": 0.7070618271827698, + "learning_rate": 1.9986287206417314e-05, + "loss": 0.5265, + "step": 898 + }, + { + "epoch": 0.016708561838387363, + "grad_norm": 0.308414101600647, + "learning_rate": 1.998622607095363e-05, + "loss": 0.3012, + "step": 900 + }, + { + "epoch": 0.016745691975806002, + "grad_norm": 0.2565460205078125, + "learning_rate": 1.9986164799607315e-05, + "loss": 0.2275, + "step": 902 + }, + { + "epoch": 0.01678282211322464, + "grad_norm": 0.41305750608444214, + "learning_rate": 1.998610339237921e-05, + "loss": 0.4378, + "step": 904 + }, + { + "epoch": 0.01681995225064328, + "grad_norm": 0.32478028535842896, + "learning_rate": 1.998604184927015e-05, + "loss": 0.3894, + "step": 906 + }, + { + "epoch": 0.01685708238806192, + "grad_norm": 0.3334619700908661, + "learning_rate": 1.9985980170280975e-05, + "loss": 0.3713, + "step": 908 + }, + { + "epoch": 0.016894212525480557, + "grad_norm": 0.284366637468338, + "learning_rate": 1.9985918355412518e-05, + "loss": 0.3807, + "step": 910 + }, + { + "epoch": 0.016931342662899196, + "grad_norm": 0.30971869826316833, + "learning_rate": 1.9985856404665627e-05, + "loss": 0.1992, + "step": 912 + }, + { + "epoch": 0.016968472800317835, + "grad_norm": 0.36958345770835876, + "learning_rate": 1.9985794318041137e-05, + "loss": 0.5721, + "step": 914 + }, + { + "epoch": 0.017005602937736473, + "grad_norm": 0.3124692440032959, + "learning_rate": 1.9985732095539904e-05, + "loss": 0.3025, + "step": 916 + }, + { + "epoch": 0.017042733075155112, + "grad_norm": 0.3706187307834625, + "learning_rate": 1.998566973716276e-05, + "loss": 0.6818, + "step": 918 + }, + { + "epoch": 0.01707986321257375, + "grad_norm": 0.35288944840431213, + "learning_rate": 1.9985607242910567e-05, + "loss": 0.3121, + "step": 920 + }, + { + "epoch": 0.01711699334999239, + "grad_norm": 0.27718523144721985, + "learning_rate": 1.998554461278417e-05, + "loss": 0.3448, + "step": 922 + }, + { + "epoch": 0.01715412348741103, + "grad_norm": 0.4456179440021515, + "learning_rate": 1.9985481846784416e-05, + "loss": 0.3475, + "step": 924 + }, + { + "epoch": 0.017191253624829664, + "grad_norm": 0.4590824544429779, + "learning_rate": 1.9985418944912167e-05, + "loss": 0.5114, + "step": 926 + }, + { + "epoch": 0.017228383762248303, + "grad_norm": 0.3306584358215332, + "learning_rate": 1.9985355907168275e-05, + "loss": 0.2187, + "step": 928 + }, + { + "epoch": 0.01726551389966694, + "grad_norm": 0.39214852452278137, + "learning_rate": 1.9985292733553603e-05, + "loss": 0.3698, + "step": 930 + }, + { + "epoch": 0.01730264403708558, + "grad_norm": 0.40430113673210144, + "learning_rate": 1.9985229424069002e-05, + "loss": 0.2494, + "step": 932 + }, + { + "epoch": 0.01733977417450422, + "grad_norm": 0.3753194510936737, + "learning_rate": 1.9985165978715336e-05, + "loss": 0.4021, + "step": 934 + }, + { + "epoch": 0.017376904311922858, + "grad_norm": 0.5208768844604492, + "learning_rate": 1.9985102397493475e-05, + "loss": 0.4051, + "step": 936 + }, + { + "epoch": 0.017414034449341496, + "grad_norm": 0.35050153732299805, + "learning_rate": 1.9985038680404277e-05, + "loss": 0.3079, + "step": 938 + }, + { + "epoch": 0.017451164586760135, + "grad_norm": 0.2716299891471863, + "learning_rate": 1.9984974827448612e-05, + "loss": 0.3982, + "step": 940 + }, + { + "epoch": 0.017488294724178774, + "grad_norm": 0.30002743005752563, + "learning_rate": 1.9984910838627348e-05, + "loss": 0.2825, + "step": 942 + }, + { + "epoch": 0.017525424861597413, + "grad_norm": 0.3770863115787506, + "learning_rate": 1.9984846713941355e-05, + "loss": 0.424, + "step": 944 + }, + { + "epoch": 0.01756255499901605, + "grad_norm": 0.27097564935684204, + "learning_rate": 1.9984782453391505e-05, + "loss": 0.245, + "step": 946 + }, + { + "epoch": 0.01759968513643469, + "grad_norm": 0.4151020348072052, + "learning_rate": 1.9984718056978674e-05, + "loss": 0.3004, + "step": 948 + }, + { + "epoch": 0.01763681527385333, + "grad_norm": 0.372779905796051, + "learning_rate": 1.9984653524703738e-05, + "loss": 0.3734, + "step": 950 + }, + { + "epoch": 0.017673945411271968, + "grad_norm": 0.3685815632343292, + "learning_rate": 1.9984588856567576e-05, + "loss": 0.4165, + "step": 952 + }, + { + "epoch": 0.017711075548690607, + "grad_norm": 0.2559392750263214, + "learning_rate": 1.998452405257107e-05, + "loss": 0.2398, + "step": 954 + }, + { + "epoch": 0.017748205686109245, + "grad_norm": 0.2744353711605072, + "learning_rate": 1.9984459112715097e-05, + "loss": 0.4297, + "step": 956 + }, + { + "epoch": 0.017785335823527884, + "grad_norm": 0.31595975160598755, + "learning_rate": 1.998439403700054e-05, + "loss": 0.3665, + "step": 958 + }, + { + "epoch": 0.017822465960946523, + "grad_norm": 0.41052547097206116, + "learning_rate": 1.998432882542829e-05, + "loss": 0.3007, + "step": 960 + }, + { + "epoch": 0.01785959609836516, + "grad_norm": 0.28710436820983887, + "learning_rate": 1.9984263477999225e-05, + "loss": 0.4114, + "step": 962 + }, + { + "epoch": 0.017896726235783797, + "grad_norm": 0.46606481075286865, + "learning_rate": 1.9984197994714245e-05, + "loss": 0.4448, + "step": 964 + }, + { + "epoch": 0.017933856373202436, + "grad_norm": 0.42105162143707275, + "learning_rate": 1.9984132375574238e-05, + "loss": 0.2621, + "step": 966 + }, + { + "epoch": 0.017970986510621074, + "grad_norm": 0.36550813913345337, + "learning_rate": 1.9984066620580092e-05, + "loss": 0.3482, + "step": 968 + }, + { + "epoch": 0.018008116648039713, + "grad_norm": 0.4748978316783905, + "learning_rate": 1.9984000729732708e-05, + "loss": 0.4168, + "step": 970 + }, + { + "epoch": 0.018045246785458352, + "grad_norm": 0.4448954463005066, + "learning_rate": 1.9983934703032975e-05, + "loss": 0.3936, + "step": 972 + }, + { + "epoch": 0.01808237692287699, + "grad_norm": 0.38591641187667847, + "learning_rate": 1.9983868540481803e-05, + "loss": 0.4349, + "step": 974 + }, + { + "epoch": 0.01811950706029563, + "grad_norm": 0.29241764545440674, + "learning_rate": 1.998380224208008e-05, + "loss": 0.4832, + "step": 976 + }, + { + "epoch": 0.01815663719771427, + "grad_norm": 0.38096871972084045, + "learning_rate": 1.9983735807828714e-05, + "loss": 0.3898, + "step": 978 + }, + { + "epoch": 0.018193767335132907, + "grad_norm": 0.2944658100605011, + "learning_rate": 1.9983669237728607e-05, + "loss": 0.3274, + "step": 980 + }, + { + "epoch": 0.018230897472551546, + "grad_norm": 0.4047447443008423, + "learning_rate": 1.998360253178067e-05, + "loss": 0.3351, + "step": 982 + }, + { + "epoch": 0.018268027609970185, + "grad_norm": 0.3475852906703949, + "learning_rate": 1.9983535689985807e-05, + "loss": 0.4889, + "step": 984 + }, + { + "epoch": 0.018305157747388823, + "grad_norm": 0.3606351613998413, + "learning_rate": 1.9983468712344926e-05, + "loss": 0.256, + "step": 986 + }, + { + "epoch": 0.018342287884807462, + "grad_norm": 0.35583171248435974, + "learning_rate": 1.998340159885894e-05, + "loss": 0.4103, + "step": 988 + }, + { + "epoch": 0.0183794180222261, + "grad_norm": 0.343099981546402, + "learning_rate": 1.998333434952876e-05, + "loss": 0.4615, + "step": 990 + }, + { + "epoch": 0.01841654815964474, + "grad_norm": 0.41156670451164246, + "learning_rate": 1.9983266964355304e-05, + "loss": 0.434, + "step": 992 + }, + { + "epoch": 0.01845367829706338, + "grad_norm": 0.40091824531555176, + "learning_rate": 1.998319944333949e-05, + "loss": 0.4066, + "step": 994 + }, + { + "epoch": 0.018490808434482017, + "grad_norm": 0.3511546552181244, + "learning_rate": 1.998313178648223e-05, + "loss": 0.3634, + "step": 996 + }, + { + "epoch": 0.018527938571900656, + "grad_norm": 0.5660949945449829, + "learning_rate": 1.9983063993784455e-05, + "loss": 0.4431, + "step": 998 + }, + { + "epoch": 0.018565068709319295, + "grad_norm": 0.4245556890964508, + "learning_rate": 1.9982996065247077e-05, + "loss": 0.243, + "step": 1000 + }, + { + "epoch": 0.01860219884673793, + "grad_norm": 0.47335347533226013, + "learning_rate": 1.998292800087103e-05, + "loss": 0.3385, + "step": 1002 + }, + { + "epoch": 0.01863932898415657, + "grad_norm": 0.23368427157402039, + "learning_rate": 1.9982859800657232e-05, + "loss": 0.2892, + "step": 1004 + }, + { + "epoch": 0.018676459121575208, + "grad_norm": 0.3205662667751312, + "learning_rate": 1.9982791464606614e-05, + "loss": 0.4385, + "step": 1006 + }, + { + "epoch": 0.018713589258993846, + "grad_norm": 0.3772878050804138, + "learning_rate": 1.9982722992720108e-05, + "loss": 0.2819, + "step": 1008 + }, + { + "epoch": 0.018750719396412485, + "grad_norm": 0.45986175537109375, + "learning_rate": 1.998265438499864e-05, + "loss": 0.4632, + "step": 1010 + }, + { + "epoch": 0.018787849533831124, + "grad_norm": 0.32547804713249207, + "learning_rate": 1.9982585641443153e-05, + "loss": 0.3966, + "step": 1012 + }, + { + "epoch": 0.018824979671249763, + "grad_norm": 0.4675557315349579, + "learning_rate": 1.9982516762054574e-05, + "loss": 0.2226, + "step": 1014 + }, + { + "epoch": 0.0188621098086684, + "grad_norm": 0.5159827470779419, + "learning_rate": 1.9982447746833844e-05, + "loss": 0.4455, + "step": 1016 + }, + { + "epoch": 0.01889923994608704, + "grad_norm": 0.3231106400489807, + "learning_rate": 1.9982378595781898e-05, + "loss": 0.6755, + "step": 1018 + }, + { + "epoch": 0.01893637008350568, + "grad_norm": 0.440939724445343, + "learning_rate": 1.9982309308899683e-05, + "loss": 0.3858, + "step": 1020 + }, + { + "epoch": 0.018973500220924318, + "grad_norm": 0.3986174166202545, + "learning_rate": 1.9982239886188136e-05, + "loss": 0.5566, + "step": 1022 + }, + { + "epoch": 0.019010630358342957, + "grad_norm": 0.431723415851593, + "learning_rate": 1.998217032764821e-05, + "loss": 0.4052, + "step": 1024 + }, + { + "epoch": 0.019047760495761595, + "grad_norm": 0.3694363832473755, + "learning_rate": 1.998210063328084e-05, + "loss": 0.4682, + "step": 1026 + }, + { + "epoch": 0.019084890633180234, + "grad_norm": 0.3512609004974365, + "learning_rate": 1.9982030803086982e-05, + "loss": 0.348, + "step": 1028 + }, + { + "epoch": 0.019122020770598873, + "grad_norm": 0.31077277660369873, + "learning_rate": 1.9981960837067584e-05, + "loss": 0.4706, + "step": 1030 + }, + { + "epoch": 0.01915915090801751, + "grad_norm": 0.3602035343647003, + "learning_rate": 1.99818907352236e-05, + "loss": 0.4031, + "step": 1032 + }, + { + "epoch": 0.01919628104543615, + "grad_norm": 0.49307844042778015, + "learning_rate": 1.998182049755598e-05, + "loss": 0.3982, + "step": 1034 + }, + { + "epoch": 0.01923341118285479, + "grad_norm": 0.37214940786361694, + "learning_rate": 1.9981750124065684e-05, + "loss": 0.3833, + "step": 1036 + }, + { + "epoch": 0.019270541320273428, + "grad_norm": 0.3503119945526123, + "learning_rate": 1.9981679614753665e-05, + "loss": 0.3887, + "step": 1038 + }, + { + "epoch": 0.019307671457692063, + "grad_norm": 0.5146250128746033, + "learning_rate": 1.998160896962089e-05, + "loss": 0.4304, + "step": 1040 + }, + { + "epoch": 0.019344801595110702, + "grad_norm": 0.413052499294281, + "learning_rate": 1.998153818866831e-05, + "loss": 0.402, + "step": 1042 + }, + { + "epoch": 0.01938193173252934, + "grad_norm": 0.516735315322876, + "learning_rate": 1.9981467271896897e-05, + "loss": 0.2478, + "step": 1044 + }, + { + "epoch": 0.01941906186994798, + "grad_norm": 0.4669174551963806, + "learning_rate": 1.998139621930761e-05, + "loss": 0.4645, + "step": 1046 + }, + { + "epoch": 0.01945619200736662, + "grad_norm": 0.3801192343235016, + "learning_rate": 1.9981325030901422e-05, + "loss": 0.2726, + "step": 1048 + }, + { + "epoch": 0.019493322144785257, + "grad_norm": 0.344904363155365, + "learning_rate": 1.9981253706679292e-05, + "loss": 0.5221, + "step": 1050 + }, + { + "epoch": 0.019530452282203896, + "grad_norm": 0.3043608069419861, + "learning_rate": 1.99811822466422e-05, + "loss": 0.5012, + "step": 1052 + }, + { + "epoch": 0.019567582419622535, + "grad_norm": 0.3266139328479767, + "learning_rate": 1.9981110650791116e-05, + "loss": 0.539, + "step": 1054 + }, + { + "epoch": 0.019604712557041173, + "grad_norm": 0.342694491147995, + "learning_rate": 1.9981038919127013e-05, + "loss": 0.3943, + "step": 1056 + }, + { + "epoch": 0.019641842694459812, + "grad_norm": 0.28042566776275635, + "learning_rate": 1.9980967051650863e-05, + "loss": 0.3984, + "step": 1058 + }, + { + "epoch": 0.01967897283187845, + "grad_norm": 0.5032764077186584, + "learning_rate": 1.998089504836365e-05, + "loss": 0.5622, + "step": 1060 + }, + { + "epoch": 0.01971610296929709, + "grad_norm": 0.29308953881263733, + "learning_rate": 1.9980822909266352e-05, + "loss": 0.3935, + "step": 1062 + }, + { + "epoch": 0.01975323310671573, + "grad_norm": 0.4919288158416748, + "learning_rate": 1.998075063435995e-05, + "loss": 0.3721, + "step": 1064 + }, + { + "epoch": 0.019790363244134367, + "grad_norm": 0.3275351822376251, + "learning_rate": 1.9980678223645426e-05, + "loss": 0.371, + "step": 1066 + }, + { + "epoch": 0.019827493381553006, + "grad_norm": 0.31691431999206543, + "learning_rate": 1.998060567712377e-05, + "loss": 0.1855, + "step": 1068 + }, + { + "epoch": 0.019864623518971645, + "grad_norm": 0.8908047676086426, + "learning_rate": 1.9980532994795965e-05, + "loss": 0.4193, + "step": 1070 + }, + { + "epoch": 0.019901753656390284, + "grad_norm": 0.2907158136367798, + "learning_rate": 1.9980460176663002e-05, + "loss": 0.4305, + "step": 1072 + }, + { + "epoch": 0.019938883793808922, + "grad_norm": 0.34081071615219116, + "learning_rate": 1.998038722272587e-05, + "loss": 0.4845, + "step": 1074 + }, + { + "epoch": 0.01997601393122756, + "grad_norm": 0.33504238724708557, + "learning_rate": 1.9980314132985563e-05, + "loss": 0.3663, + "step": 1076 + }, + { + "epoch": 0.020013144068646196, + "grad_norm": 0.4377474784851074, + "learning_rate": 1.9980240907443074e-05, + "loss": 0.2243, + "step": 1078 + }, + { + "epoch": 0.020050274206064835, + "grad_norm": 0.4400773346424103, + "learning_rate": 1.99801675460994e-05, + "loss": 0.3021, + "step": 1080 + }, + { + "epoch": 0.020087404343483474, + "grad_norm": 0.48415157198905945, + "learning_rate": 1.9980094048955542e-05, + "loss": 0.6019, + "step": 1082 + }, + { + "epoch": 0.020124534480902113, + "grad_norm": 0.4044879376888275, + "learning_rate": 1.9980020416012497e-05, + "loss": 0.4542, + "step": 1084 + }, + { + "epoch": 0.02016166461832075, + "grad_norm": 0.2885062098503113, + "learning_rate": 1.9979946647271266e-05, + "loss": 0.3487, + "step": 1086 + }, + { + "epoch": 0.02019879475573939, + "grad_norm": 0.5480518937110901, + "learning_rate": 1.9979872742732856e-05, + "loss": 0.3818, + "step": 1088 + }, + { + "epoch": 0.02023592489315803, + "grad_norm": 0.6714348196983337, + "learning_rate": 1.997979870239827e-05, + "loss": 0.374, + "step": 1090 + }, + { + "epoch": 0.020273055030576668, + "grad_norm": 0.4432184398174286, + "learning_rate": 1.997972452626852e-05, + "loss": 0.3846, + "step": 1092 + }, + { + "epoch": 0.020310185167995307, + "grad_norm": 0.33412840962409973, + "learning_rate": 1.997965021434461e-05, + "loss": 0.295, + "step": 1094 + }, + { + "epoch": 0.020347315305413945, + "grad_norm": 0.5967075824737549, + "learning_rate": 1.997957576662755e-05, + "loss": 0.3061, + "step": 1096 + }, + { + "epoch": 0.020384445442832584, + "grad_norm": 0.43157070875167847, + "learning_rate": 1.997950118311836e-05, + "loss": 0.3643, + "step": 1098 + }, + { + "epoch": 0.020421575580251223, + "grad_norm": 0.40424004197120667, + "learning_rate": 1.997942646381805e-05, + "loss": 0.3498, + "step": 1100 + }, + { + "epoch": 0.02045870571766986, + "grad_norm": 0.3167525827884674, + "learning_rate": 1.9979351608727637e-05, + "loss": 0.3574, + "step": 1102 + }, + { + "epoch": 0.0204958358550885, + "grad_norm": 0.4568251073360443, + "learning_rate": 1.997927661784814e-05, + "loss": 0.4542, + "step": 1104 + }, + { + "epoch": 0.02053296599250714, + "grad_norm": 0.41422349214553833, + "learning_rate": 1.9979201491180582e-05, + "loss": 0.2961, + "step": 1106 + }, + { + "epoch": 0.020570096129925778, + "grad_norm": 0.3077956736087799, + "learning_rate": 1.997912622872598e-05, + "loss": 0.5079, + "step": 1108 + }, + { + "epoch": 0.020607226267344417, + "grad_norm": 0.40603139996528625, + "learning_rate": 1.9979050830485362e-05, + "loss": 0.4633, + "step": 1110 + }, + { + "epoch": 0.020644356404763056, + "grad_norm": 0.32469528913497925, + "learning_rate": 1.9978975296459754e-05, + "loss": 0.3549, + "step": 1112 + }, + { + "epoch": 0.020681486542181694, + "grad_norm": 0.4310847818851471, + "learning_rate": 1.9978899626650184e-05, + "loss": 0.4445, + "step": 1114 + }, + { + "epoch": 0.02071861667960033, + "grad_norm": 0.35542115569114685, + "learning_rate": 1.9978823821057674e-05, + "loss": 0.3049, + "step": 1116 + }, + { + "epoch": 0.02075574681701897, + "grad_norm": 0.3398759663105011, + "learning_rate": 1.9978747879683266e-05, + "loss": 0.3315, + "step": 1118 + }, + { + "epoch": 0.020792876954437607, + "grad_norm": 0.2974185347557068, + "learning_rate": 1.9978671802527993e-05, + "loss": 0.2423, + "step": 1120 + }, + { + "epoch": 0.020830007091856246, + "grad_norm": 0.39581137895584106, + "learning_rate": 1.997859558959288e-05, + "loss": 0.402, + "step": 1122 + }, + { + "epoch": 0.020867137229274885, + "grad_norm": 0.42987826466560364, + "learning_rate": 1.9978519240878973e-05, + "loss": 0.3136, + "step": 1124 + }, + { + "epoch": 0.020904267366693523, + "grad_norm": 0.2600070536136627, + "learning_rate": 1.997844275638731e-05, + "loss": 0.2822, + "step": 1126 + }, + { + "epoch": 0.020941397504112162, + "grad_norm": 0.41146188974380493, + "learning_rate": 1.997836613611893e-05, + "loss": 0.3714, + "step": 1128 + }, + { + "epoch": 0.0209785276415308, + "grad_norm": 0.31470268964767456, + "learning_rate": 1.9978289380074872e-05, + "loss": 0.3559, + "step": 1130 + }, + { + "epoch": 0.02101565777894944, + "grad_norm": 0.33213889598846436, + "learning_rate": 1.9978212488256185e-05, + "loss": 0.4429, + "step": 1132 + }, + { + "epoch": 0.02105278791636808, + "grad_norm": 0.33684155344963074, + "learning_rate": 1.9978135460663913e-05, + "loss": 0.3902, + "step": 1134 + }, + { + "epoch": 0.021089918053786717, + "grad_norm": 0.4565243124961853, + "learning_rate": 1.9978058297299108e-05, + "loss": 0.48, + "step": 1136 + }, + { + "epoch": 0.021127048191205356, + "grad_norm": 0.3734474182128906, + "learning_rate": 1.9977980998162815e-05, + "loss": 0.4049, + "step": 1138 + }, + { + "epoch": 0.021164178328623995, + "grad_norm": 0.3784891366958618, + "learning_rate": 1.9977903563256092e-05, + "loss": 0.4839, + "step": 1140 + }, + { + "epoch": 0.021201308466042634, + "grad_norm": 0.31104305386543274, + "learning_rate": 1.9977825992579986e-05, + "loss": 0.3601, + "step": 1142 + }, + { + "epoch": 0.021238438603461272, + "grad_norm": 0.37195491790771484, + "learning_rate": 1.9977748286135558e-05, + "loss": 0.4285, + "step": 1144 + }, + { + "epoch": 0.02127556874087991, + "grad_norm": 0.3654918372631073, + "learning_rate": 1.997767044392386e-05, + "loss": 0.4709, + "step": 1146 + }, + { + "epoch": 0.02131269887829855, + "grad_norm": 0.3603534400463104, + "learning_rate": 1.9977592465945955e-05, + "loss": 0.4, + "step": 1148 + }, + { + "epoch": 0.02134982901571719, + "grad_norm": 0.37144672870635986, + "learning_rate": 1.99775143522029e-05, + "loss": 0.2892, + "step": 1150 + }, + { + "epoch": 0.021386959153135827, + "grad_norm": 0.34551459550857544, + "learning_rate": 1.9977436102695763e-05, + "loss": 0.4734, + "step": 1152 + }, + { + "epoch": 0.021424089290554463, + "grad_norm": 0.3170728087425232, + "learning_rate": 1.997735771742561e-05, + "loss": 0.3201, + "step": 1154 + }, + { + "epoch": 0.0214612194279731, + "grad_norm": 0.44156527519226074, + "learning_rate": 1.9977279196393498e-05, + "loss": 0.3395, + "step": 1156 + }, + { + "epoch": 0.02149834956539174, + "grad_norm": 0.4250219166278839, + "learning_rate": 1.9977200539600502e-05, + "loss": 0.4278, + "step": 1158 + }, + { + "epoch": 0.02153547970281038, + "grad_norm": 0.4389667510986328, + "learning_rate": 1.9977121747047694e-05, + "loss": 0.2627, + "step": 1160 + }, + { + "epoch": 0.021572609840229018, + "grad_norm": 0.35008880496025085, + "learning_rate": 1.9977042818736145e-05, + "loss": 0.4204, + "step": 1162 + }, + { + "epoch": 0.021609739977647657, + "grad_norm": 0.34787076711654663, + "learning_rate": 1.9976963754666926e-05, + "loss": 0.2212, + "step": 1164 + }, + { + "epoch": 0.021646870115066295, + "grad_norm": 0.34879979491233826, + "learning_rate": 1.9976884554841117e-05, + "loss": 0.4398, + "step": 1166 + }, + { + "epoch": 0.021684000252484934, + "grad_norm": 0.403594046831131, + "learning_rate": 1.9976805219259793e-05, + "loss": 0.5654, + "step": 1168 + }, + { + "epoch": 0.021721130389903573, + "grad_norm": 0.368293434381485, + "learning_rate": 1.9976725747924034e-05, + "loss": 0.2842, + "step": 1170 + }, + { + "epoch": 0.02175826052732221, + "grad_norm": 0.2966884970664978, + "learning_rate": 1.9976646140834918e-05, + "loss": 0.282, + "step": 1172 + }, + { + "epoch": 0.02179539066474085, + "grad_norm": 0.40858063101768494, + "learning_rate": 1.9976566397993533e-05, + "loss": 0.3947, + "step": 1174 + }, + { + "epoch": 0.02183252080215949, + "grad_norm": 0.3218763768672943, + "learning_rate": 1.9976486519400965e-05, + "loss": 0.3469, + "step": 1176 + }, + { + "epoch": 0.021869650939578128, + "grad_norm": 0.36078453063964844, + "learning_rate": 1.9976406505058296e-05, + "loss": 0.5874, + "step": 1178 + }, + { + "epoch": 0.021906781076996767, + "grad_norm": 0.3969400227069855, + "learning_rate": 1.997632635496662e-05, + "loss": 0.3521, + "step": 1180 + }, + { + "epoch": 0.021943911214415406, + "grad_norm": 0.28152936697006226, + "learning_rate": 1.9976246069127022e-05, + "loss": 0.2369, + "step": 1182 + }, + { + "epoch": 0.021981041351834044, + "grad_norm": 0.4404222369194031, + "learning_rate": 1.99761656475406e-05, + "loss": 0.3445, + "step": 1184 + }, + { + "epoch": 0.022018171489252683, + "grad_norm": 0.3748471140861511, + "learning_rate": 1.9976085090208442e-05, + "loss": 0.5414, + "step": 1186 + }, + { + "epoch": 0.022055301626671322, + "grad_norm": 0.4265405237674713, + "learning_rate": 1.997600439713165e-05, + "loss": 0.5902, + "step": 1188 + }, + { + "epoch": 0.02209243176408996, + "grad_norm": 0.3155985474586487, + "learning_rate": 1.997592356831132e-05, + "loss": 0.3268, + "step": 1190 + }, + { + "epoch": 0.022129561901508596, + "grad_norm": 0.3741665482521057, + "learning_rate": 1.997584260374855e-05, + "loss": 0.3036, + "step": 1192 + }, + { + "epoch": 0.022166692038927235, + "grad_norm": 0.3819212317466736, + "learning_rate": 1.9975761503444447e-05, + "loss": 0.3594, + "step": 1194 + }, + { + "epoch": 0.022203822176345873, + "grad_norm": 0.2932555377483368, + "learning_rate": 1.9975680267400107e-05, + "loss": 0.3263, + "step": 1196 + }, + { + "epoch": 0.022240952313764512, + "grad_norm": 0.34051573276519775, + "learning_rate": 1.997559889561664e-05, + "loss": 0.5083, + "step": 1198 + }, + { + "epoch": 0.02227808245118315, + "grad_norm": 0.2579888701438904, + "learning_rate": 1.997551738809515e-05, + "loss": 0.3431, + "step": 1200 + }, + { + "epoch": 0.02231521258860179, + "grad_norm": 0.32302817702293396, + "learning_rate": 1.9975435744836753e-05, + "loss": 0.3618, + "step": 1202 + }, + { + "epoch": 0.02235234272602043, + "grad_norm": 0.40014100074768066, + "learning_rate": 1.9975353965842556e-05, + "loss": 0.3728, + "step": 1204 + }, + { + "epoch": 0.022389472863439067, + "grad_norm": 0.35799989104270935, + "learning_rate": 1.9975272051113666e-05, + "loss": 0.3406, + "step": 1206 + }, + { + "epoch": 0.022426603000857706, + "grad_norm": 0.3210586905479431, + "learning_rate": 1.9975190000651206e-05, + "loss": 0.398, + "step": 1208 + }, + { + "epoch": 0.022463733138276345, + "grad_norm": 0.44624197483062744, + "learning_rate": 1.997510781445629e-05, + "loss": 0.3531, + "step": 1210 + }, + { + "epoch": 0.022500863275694984, + "grad_norm": 0.45051196217536926, + "learning_rate": 1.9975025492530034e-05, + "loss": 0.2711, + "step": 1212 + }, + { + "epoch": 0.022537993413113622, + "grad_norm": 0.32351264357566833, + "learning_rate": 1.997494303487356e-05, + "loss": 0.4831, + "step": 1214 + }, + { + "epoch": 0.02257512355053226, + "grad_norm": 0.32683709263801575, + "learning_rate": 1.997486044148799e-05, + "loss": 0.2123, + "step": 1216 + }, + { + "epoch": 0.0226122536879509, + "grad_norm": 0.5519673824310303, + "learning_rate": 1.9974777712374446e-05, + "loss": 0.2138, + "step": 1218 + }, + { + "epoch": 0.02264938382536954, + "grad_norm": 0.4483173191547394, + "learning_rate": 1.9974694847534056e-05, + "loss": 0.467, + "step": 1220 + }, + { + "epoch": 0.022686513962788177, + "grad_norm": 0.36775970458984375, + "learning_rate": 1.997461184696795e-05, + "loss": 0.4628, + "step": 1222 + }, + { + "epoch": 0.022723644100206816, + "grad_norm": 0.27962982654571533, + "learning_rate": 1.997452871067725e-05, + "loss": 0.3177, + "step": 1224 + }, + { + "epoch": 0.022760774237625455, + "grad_norm": 0.3483109772205353, + "learning_rate": 1.9974445438663096e-05, + "loss": 0.3289, + "step": 1226 + }, + { + "epoch": 0.02279790437504409, + "grad_norm": 0.40001213550567627, + "learning_rate": 1.9974362030926612e-05, + "loss": 0.335, + "step": 1228 + }, + { + "epoch": 0.02283503451246273, + "grad_norm": 0.3237497806549072, + "learning_rate": 1.9974278487468938e-05, + "loss": 0.2585, + "step": 1230 + }, + { + "epoch": 0.022872164649881368, + "grad_norm": 0.3753138780593872, + "learning_rate": 1.9974194808291215e-05, + "loss": 0.6566, + "step": 1232 + }, + { + "epoch": 0.022909294787300007, + "grad_norm": 0.301988810300827, + "learning_rate": 1.9974110993394572e-05, + "loss": 0.4063, + "step": 1234 + }, + { + "epoch": 0.022946424924718645, + "grad_norm": 0.28260454535484314, + "learning_rate": 1.9974027042780154e-05, + "loss": 0.3565, + "step": 1236 + }, + { + "epoch": 0.022983555062137284, + "grad_norm": 0.5421139597892761, + "learning_rate": 1.9973942956449103e-05, + "loss": 0.3696, + "step": 1238 + }, + { + "epoch": 0.023020685199555923, + "grad_norm": 0.4133604168891907, + "learning_rate": 1.9973858734402567e-05, + "loss": 0.3346, + "step": 1240 + }, + { + "epoch": 0.02305781533697456, + "grad_norm": 0.33195871114730835, + "learning_rate": 1.9973774376641688e-05, + "loss": 0.2657, + "step": 1242 + }, + { + "epoch": 0.0230949454743932, + "grad_norm": 0.31317609548568726, + "learning_rate": 1.997368988316761e-05, + "loss": 0.3716, + "step": 1244 + }, + { + "epoch": 0.02313207561181184, + "grad_norm": 0.36527132987976074, + "learning_rate": 1.9973605253981496e-05, + "loss": 0.5199, + "step": 1246 + }, + { + "epoch": 0.023169205749230478, + "grad_norm": 0.32158660888671875, + "learning_rate": 1.9973520489084483e-05, + "loss": 0.4862, + "step": 1248 + }, + { + "epoch": 0.023206335886649117, + "grad_norm": 0.41991668939590454, + "learning_rate": 1.9973435588477736e-05, + "loss": 0.4259, + "step": 1250 + }, + { + "epoch": 0.023243466024067756, + "grad_norm": 0.39704686403274536, + "learning_rate": 1.99733505521624e-05, + "loss": 0.3365, + "step": 1252 + }, + { + "epoch": 0.023280596161486394, + "grad_norm": 0.3496972918510437, + "learning_rate": 1.9973265380139633e-05, + "loss": 0.3906, + "step": 1254 + }, + { + "epoch": 0.023317726298905033, + "grad_norm": 0.3918420076370239, + "learning_rate": 1.9973180072410603e-05, + "loss": 0.3035, + "step": 1256 + }, + { + "epoch": 0.023354856436323672, + "grad_norm": 0.3882104158401489, + "learning_rate": 1.9973094628976465e-05, + "loss": 0.4006, + "step": 1258 + }, + { + "epoch": 0.02339198657374231, + "grad_norm": 0.398441344499588, + "learning_rate": 1.997300904983838e-05, + "loss": 0.2586, + "step": 1260 + }, + { + "epoch": 0.02342911671116095, + "grad_norm": 0.8797444105148315, + "learning_rate": 1.9972923334997515e-05, + "loss": 0.4944, + "step": 1262 + }, + { + "epoch": 0.023466246848579588, + "grad_norm": 0.3843643069267273, + "learning_rate": 1.9972837484455033e-05, + "loss": 0.4025, + "step": 1264 + }, + { + "epoch": 0.023503376985998223, + "grad_norm": 0.51822829246521, + "learning_rate": 1.997275149821211e-05, + "loss": 0.3721, + "step": 1266 + }, + { + "epoch": 0.023540507123416862, + "grad_norm": 0.4143613278865814, + "learning_rate": 1.9972665376269908e-05, + "loss": 0.3916, + "step": 1268 + }, + { + "epoch": 0.0235776372608355, + "grad_norm": 0.33080142736434937, + "learning_rate": 1.9972579118629605e-05, + "loss": 0.3852, + "step": 1270 + }, + { + "epoch": 0.02361476739825414, + "grad_norm": 0.4805343449115753, + "learning_rate": 1.9972492725292368e-05, + "loss": 0.2985, + "step": 1272 + }, + { + "epoch": 0.02365189753567278, + "grad_norm": 0.44258785247802734, + "learning_rate": 1.997240619625938e-05, + "loss": 0.545, + "step": 1274 + }, + { + "epoch": 0.023689027673091417, + "grad_norm": 0.3181736171245575, + "learning_rate": 1.997231953153181e-05, + "loss": 0.3476, + "step": 1276 + }, + { + "epoch": 0.023726157810510056, + "grad_norm": 0.3980746865272522, + "learning_rate": 1.9972232731110843e-05, + "loss": 0.3606, + "step": 1278 + }, + { + "epoch": 0.023763287947928695, + "grad_norm": 0.3911648392677307, + "learning_rate": 1.9972145794997658e-05, + "loss": 0.4655, + "step": 1280 + }, + { + "epoch": 0.023800418085347334, + "grad_norm": 0.353172242641449, + "learning_rate": 1.997205872319344e-05, + "loss": 0.5421, + "step": 1282 + }, + { + "epoch": 0.023837548222765972, + "grad_norm": 0.47451725602149963, + "learning_rate": 1.9971971515699376e-05, + "loss": 0.4842, + "step": 1284 + }, + { + "epoch": 0.02387467836018461, + "grad_norm": 0.32680267095565796, + "learning_rate": 1.9971884172516644e-05, + "loss": 0.4906, + "step": 1286 + }, + { + "epoch": 0.02391180849760325, + "grad_norm": 0.25843244791030884, + "learning_rate": 1.9971796693646437e-05, + "loss": 0.266, + "step": 1288 + }, + { + "epoch": 0.02394893863502189, + "grad_norm": 0.40673211216926575, + "learning_rate": 1.997170907908995e-05, + "loss": 0.4269, + "step": 1290 + }, + { + "epoch": 0.023986068772440527, + "grad_norm": 0.39124974608421326, + "learning_rate": 1.997162132884837e-05, + "loss": 0.4103, + "step": 1292 + }, + { + "epoch": 0.024023198909859166, + "grad_norm": 0.3710108697414398, + "learning_rate": 1.9971533442922893e-05, + "loss": 0.219, + "step": 1294 + }, + { + "epoch": 0.024060329047277805, + "grad_norm": 0.3995617628097534, + "learning_rate": 1.9971445421314713e-05, + "loss": 0.3433, + "step": 1296 + }, + { + "epoch": 0.024097459184696444, + "grad_norm": 0.3406704068183899, + "learning_rate": 1.9971357264025027e-05, + "loss": 0.3873, + "step": 1298 + }, + { + "epoch": 0.024134589322115083, + "grad_norm": 0.35661450028419495, + "learning_rate": 1.997126897105504e-05, + "loss": 0.3453, + "step": 1300 + }, + { + "epoch": 0.02417171945953372, + "grad_norm": 0.8108943104743958, + "learning_rate": 1.997118054240595e-05, + "loss": 0.3407, + "step": 1302 + }, + { + "epoch": 0.024208849596952357, + "grad_norm": 0.32169821858406067, + "learning_rate": 1.9971091978078957e-05, + "loss": 0.3548, + "step": 1304 + }, + { + "epoch": 0.024245979734370995, + "grad_norm": 0.3153872489929199, + "learning_rate": 1.9971003278075266e-05, + "loss": 0.34, + "step": 1306 + }, + { + "epoch": 0.024283109871789634, + "grad_norm": 0.36579766869544983, + "learning_rate": 1.9970914442396093e-05, + "loss": 0.1801, + "step": 1308 + }, + { + "epoch": 0.024320240009208273, + "grad_norm": 0.42245689034461975, + "learning_rate": 1.9970825471042635e-05, + "loss": 0.3233, + "step": 1310 + }, + { + "epoch": 0.02435737014662691, + "grad_norm": 0.38223937153816223, + "learning_rate": 1.9970736364016116e-05, + "loss": 0.2972, + "step": 1312 + }, + { + "epoch": 0.02439450028404555, + "grad_norm": 0.43174970149993896, + "learning_rate": 1.9970647121317737e-05, + "loss": 0.4011, + "step": 1314 + }, + { + "epoch": 0.02443163042146419, + "grad_norm": 0.3544157147407532, + "learning_rate": 1.9970557742948716e-05, + "loss": 0.337, + "step": 1316 + }, + { + "epoch": 0.024468760558882828, + "grad_norm": 0.2962047755718231, + "learning_rate": 1.9970468228910268e-05, + "loss": 0.3936, + "step": 1318 + }, + { + "epoch": 0.024505890696301467, + "grad_norm": 0.5097366571426392, + "learning_rate": 1.9970378579203614e-05, + "loss": 0.3446, + "step": 1320 + }, + { + "epoch": 0.024543020833720106, + "grad_norm": 0.3349545896053314, + "learning_rate": 1.997028879382997e-05, + "loss": 0.4444, + "step": 1322 + }, + { + "epoch": 0.024580150971138744, + "grad_norm": 0.2875061333179474, + "learning_rate": 1.9970198872790564e-05, + "loss": 0.4176, + "step": 1324 + }, + { + "epoch": 0.024617281108557383, + "grad_norm": 0.3119542896747589, + "learning_rate": 1.9970108816086614e-05, + "loss": 0.4236, + "step": 1326 + }, + { + "epoch": 0.024654411245976022, + "grad_norm": 0.33009427785873413, + "learning_rate": 1.9970018623719344e-05, + "loss": 0.2892, + "step": 1328 + }, + { + "epoch": 0.02469154138339466, + "grad_norm": 0.34331730008125305, + "learning_rate": 1.996992829568999e-05, + "loss": 0.4261, + "step": 1330 + }, + { + "epoch": 0.0247286715208133, + "grad_norm": 0.4544834792613983, + "learning_rate": 1.9969837831999774e-05, + "loss": 0.3532, + "step": 1332 + }, + { + "epoch": 0.024765801658231938, + "grad_norm": 0.2871183156967163, + "learning_rate": 1.9969747232649923e-05, + "loss": 0.3155, + "step": 1334 + }, + { + "epoch": 0.024802931795650577, + "grad_norm": 0.41414642333984375, + "learning_rate": 1.9969656497641678e-05, + "loss": 0.5466, + "step": 1336 + }, + { + "epoch": 0.024840061933069216, + "grad_norm": 0.29097655415534973, + "learning_rate": 1.996956562697627e-05, + "loss": 0.4122, + "step": 1338 + }, + { + "epoch": 0.024877192070487854, + "grad_norm": 0.3858906030654907, + "learning_rate": 1.996947462065494e-05, + "loss": 0.2427, + "step": 1340 + }, + { + "epoch": 0.02491432220790649, + "grad_norm": 0.4520706236362457, + "learning_rate": 1.9969383478678917e-05, + "loss": 0.5169, + "step": 1342 + }, + { + "epoch": 0.02495145234532513, + "grad_norm": 0.2730240225791931, + "learning_rate": 1.996929220104945e-05, + "loss": 0.2457, + "step": 1344 + }, + { + "epoch": 0.024988582482743767, + "grad_norm": 0.4015446901321411, + "learning_rate": 1.996920078776778e-05, + "loss": 0.4575, + "step": 1346 + }, + { + "epoch": 0.025025712620162406, + "grad_norm": 0.31149327754974365, + "learning_rate": 1.9969109238835142e-05, + "loss": 0.4285, + "step": 1348 + }, + { + "epoch": 0.025062842757581045, + "grad_norm": 0.336830198764801, + "learning_rate": 1.996901755425279e-05, + "loss": 0.269, + "step": 1350 + }, + { + "epoch": 0.025099972894999684, + "grad_norm": 0.3583640456199646, + "learning_rate": 1.9968925734021974e-05, + "loss": 0.2681, + "step": 1352 + }, + { + "epoch": 0.025137103032418322, + "grad_norm": 0.4549861550331116, + "learning_rate": 1.9968833778143935e-05, + "loss": 0.339, + "step": 1354 + }, + { + "epoch": 0.02517423316983696, + "grad_norm": 0.36295655369758606, + "learning_rate": 1.996874168661993e-05, + "loss": 0.3372, + "step": 1356 + }, + { + "epoch": 0.0252113633072556, + "grad_norm": 0.43552401661872864, + "learning_rate": 1.996864945945121e-05, + "loss": 0.4609, + "step": 1358 + }, + { + "epoch": 0.02524849344467424, + "grad_norm": 0.40029749274253845, + "learning_rate": 1.9968557096639032e-05, + "loss": 0.5002, + "step": 1360 + }, + { + "epoch": 0.025285623582092877, + "grad_norm": 0.37713494896888733, + "learning_rate": 1.996846459818465e-05, + "loss": 0.319, + "step": 1362 + }, + { + "epoch": 0.025322753719511516, + "grad_norm": 0.43770843744277954, + "learning_rate": 1.9968371964089323e-05, + "loss": 0.3566, + "step": 1364 + }, + { + "epoch": 0.025359883856930155, + "grad_norm": 0.443180114030838, + "learning_rate": 1.996827919435431e-05, + "loss": 0.49, + "step": 1366 + }, + { + "epoch": 0.025397013994348794, + "grad_norm": 0.3460487127304077, + "learning_rate": 1.9968186288980884e-05, + "loss": 0.53, + "step": 1368 + }, + { + "epoch": 0.025434144131767433, + "grad_norm": 0.34329843521118164, + "learning_rate": 1.996809324797029e-05, + "loss": 0.4505, + "step": 1370 + }, + { + "epoch": 0.02547127426918607, + "grad_norm": 0.3537254333496094, + "learning_rate": 1.9968000071323816e-05, + "loss": 0.5909, + "step": 1372 + }, + { + "epoch": 0.02550840440660471, + "grad_norm": 0.28023451566696167, + "learning_rate": 1.996790675904271e-05, + "loss": 0.3167, + "step": 1374 + }, + { + "epoch": 0.02554553454402335, + "grad_norm": 0.3714352250099182, + "learning_rate": 1.9967813311128254e-05, + "loss": 0.3826, + "step": 1376 + }, + { + "epoch": 0.025582664681441988, + "grad_norm": 0.35274237394332886, + "learning_rate": 1.9967719727581712e-05, + "loss": 0.4447, + "step": 1378 + }, + { + "epoch": 0.025619794818860623, + "grad_norm": 0.32768285274505615, + "learning_rate": 1.9967626008404365e-05, + "loss": 0.2885, + "step": 1380 + }, + { + "epoch": 0.02565692495627926, + "grad_norm": 0.4058002531528473, + "learning_rate": 1.9967532153597484e-05, + "loss": 0.3783, + "step": 1382 + }, + { + "epoch": 0.0256940550936979, + "grad_norm": 0.2554856836795807, + "learning_rate": 1.9967438163162346e-05, + "loss": 0.3025, + "step": 1384 + }, + { + "epoch": 0.02573118523111654, + "grad_norm": 0.3965078592300415, + "learning_rate": 1.9967344037100234e-05, + "loss": 0.3931, + "step": 1386 + }, + { + "epoch": 0.025768315368535178, + "grad_norm": 0.22335362434387207, + "learning_rate": 1.9967249775412424e-05, + "loss": 0.3573, + "step": 1388 + }, + { + "epoch": 0.025805445505953817, + "grad_norm": 0.37355759739875793, + "learning_rate": 1.9967155378100194e-05, + "loss": 0.3631, + "step": 1390 + }, + { + "epoch": 0.025842575643372456, + "grad_norm": 0.3984421491622925, + "learning_rate": 1.996706084516484e-05, + "loss": 0.2944, + "step": 1392 + }, + { + "epoch": 0.025879705780791094, + "grad_norm": 0.34430572390556335, + "learning_rate": 1.996696617660764e-05, + "loss": 0.1699, + "step": 1394 + }, + { + "epoch": 0.025916835918209733, + "grad_norm": 0.4926247298717499, + "learning_rate": 1.9966871372429888e-05, + "loss": 0.3823, + "step": 1396 + }, + { + "epoch": 0.025953966055628372, + "grad_norm": 0.2837347090244293, + "learning_rate": 1.996677643263287e-05, + "loss": 0.3125, + "step": 1398 + }, + { + "epoch": 0.02599109619304701, + "grad_norm": 0.43463966250419617, + "learning_rate": 1.9966681357217878e-05, + "loss": 0.2929, + "step": 1400 + }, + { + "epoch": 0.02602822633046565, + "grad_norm": 0.2875401973724365, + "learning_rate": 1.9966586146186206e-05, + "loss": 0.3651, + "step": 1402 + }, + { + "epoch": 0.026065356467884288, + "grad_norm": 0.4797331392765045, + "learning_rate": 1.9966490799539148e-05, + "loss": 0.4355, + "step": 1404 + }, + { + "epoch": 0.026102486605302927, + "grad_norm": 0.3717973828315735, + "learning_rate": 1.9966395317278005e-05, + "loss": 0.5791, + "step": 1406 + }, + { + "epoch": 0.026139616742721566, + "grad_norm": 0.458346962928772, + "learning_rate": 1.9966299699404076e-05, + "loss": 0.4856, + "step": 1408 + }, + { + "epoch": 0.026176746880140205, + "grad_norm": 0.20421335101127625, + "learning_rate": 1.996620394591866e-05, + "loss": 0.3512, + "step": 1410 + }, + { + "epoch": 0.026213877017558843, + "grad_norm": 0.6964619159698486, + "learning_rate": 1.996610805682306e-05, + "loss": 0.3468, + "step": 1412 + }, + { + "epoch": 0.026251007154977482, + "grad_norm": 0.42472800612449646, + "learning_rate": 1.9966012032118582e-05, + "loss": 0.4975, + "step": 1414 + }, + { + "epoch": 0.02628813729239612, + "grad_norm": 0.4406464993953705, + "learning_rate": 1.9965915871806532e-05, + "loss": 0.4985, + "step": 1416 + }, + { + "epoch": 0.026325267429814756, + "grad_norm": 0.23722697794437408, + "learning_rate": 1.9965819575888217e-05, + "loss": 0.2273, + "step": 1418 + }, + { + "epoch": 0.026362397567233395, + "grad_norm": 0.29463279247283936, + "learning_rate": 1.996572314436495e-05, + "loss": 0.4058, + "step": 1420 + }, + { + "epoch": 0.026399527704652034, + "grad_norm": 0.32248571515083313, + "learning_rate": 1.996562657723804e-05, + "loss": 0.4943, + "step": 1422 + }, + { + "epoch": 0.026436657842070672, + "grad_norm": 0.2820965647697449, + "learning_rate": 1.9965529874508805e-05, + "loss": 0.5057, + "step": 1424 + }, + { + "epoch": 0.02647378797948931, + "grad_norm": 0.31063172221183777, + "learning_rate": 1.9965433036178557e-05, + "loss": 0.207, + "step": 1426 + }, + { + "epoch": 0.02651091811690795, + "grad_norm": 0.3696473240852356, + "learning_rate": 1.9965336062248616e-05, + "loss": 0.2089, + "step": 1428 + }, + { + "epoch": 0.02654804825432659, + "grad_norm": 0.40383201837539673, + "learning_rate": 1.99652389527203e-05, + "loss": 0.3744, + "step": 1430 + }, + { + "epoch": 0.026585178391745228, + "grad_norm": 0.4137421250343323, + "learning_rate": 1.9965141707594934e-05, + "loss": 0.337, + "step": 1432 + }, + { + "epoch": 0.026622308529163866, + "grad_norm": 0.3258547782897949, + "learning_rate": 1.9965044326873833e-05, + "loss": 0.4078, + "step": 1434 + }, + { + "epoch": 0.026659438666582505, + "grad_norm": 0.24489538371562958, + "learning_rate": 1.996494681055833e-05, + "loss": 0.4465, + "step": 1436 + }, + { + "epoch": 0.026696568804001144, + "grad_norm": 0.45460736751556396, + "learning_rate": 1.9964849158649752e-05, + "loss": 0.385, + "step": 1438 + }, + { + "epoch": 0.026733698941419783, + "grad_norm": 0.2975912094116211, + "learning_rate": 1.9964751371149426e-05, + "loss": 0.359, + "step": 1440 + }, + { + "epoch": 0.02677082907883842, + "grad_norm": 0.33785927295684814, + "learning_rate": 1.9964653448058678e-05, + "loss": 0.4944, + "step": 1442 + }, + { + "epoch": 0.02680795921625706, + "grad_norm": 0.331898957490921, + "learning_rate": 1.9964555389378842e-05, + "loss": 0.4063, + "step": 1444 + }, + { + "epoch": 0.0268450893536757, + "grad_norm": 0.45948436856269836, + "learning_rate": 1.996445719511126e-05, + "loss": 0.3422, + "step": 1446 + }, + { + "epoch": 0.026882219491094338, + "grad_norm": 0.44890448451042175, + "learning_rate": 1.9964358865257257e-05, + "loss": 0.3628, + "step": 1448 + }, + { + "epoch": 0.026919349628512976, + "grad_norm": 0.322969913482666, + "learning_rate": 1.996426039981818e-05, + "loss": 0.3265, + "step": 1450 + }, + { + "epoch": 0.026956479765931615, + "grad_norm": 0.34472888708114624, + "learning_rate": 1.9964161798795367e-05, + "loss": 0.2761, + "step": 1452 + }, + { + "epoch": 0.026993609903350254, + "grad_norm": 0.6000632643699646, + "learning_rate": 1.9964063062190155e-05, + "loss": 0.2997, + "step": 1454 + }, + { + "epoch": 0.02703074004076889, + "grad_norm": 0.28989025950431824, + "learning_rate": 1.9963964190003892e-05, + "loss": 0.3571, + "step": 1456 + }, + { + "epoch": 0.027067870178187528, + "grad_norm": 0.4636005759239197, + "learning_rate": 1.9963865182237924e-05, + "loss": 0.2713, + "step": 1458 + }, + { + "epoch": 0.027105000315606167, + "grad_norm": 0.28512948751449585, + "learning_rate": 1.9963766038893592e-05, + "loss": 0.3763, + "step": 1460 + }, + { + "epoch": 0.027142130453024806, + "grad_norm": 0.3537326157093048, + "learning_rate": 1.9963666759972252e-05, + "loss": 0.339, + "step": 1462 + }, + { + "epoch": 0.027179260590443444, + "grad_norm": 0.7664099931716919, + "learning_rate": 1.996356734547525e-05, + "loss": 0.5019, + "step": 1464 + }, + { + "epoch": 0.027216390727862083, + "grad_norm": 0.323015958070755, + "learning_rate": 1.996346779540394e-05, + "loss": 0.4131, + "step": 1466 + }, + { + "epoch": 0.027253520865280722, + "grad_norm": 0.2701569199562073, + "learning_rate": 1.9963368109759683e-05, + "loss": 0.2272, + "step": 1468 + }, + { + "epoch": 0.02729065100269936, + "grad_norm": 0.46091488003730774, + "learning_rate": 1.9963268288543825e-05, + "loss": 0.3593, + "step": 1470 + }, + { + "epoch": 0.027327781140118, + "grad_norm": 0.500520646572113, + "learning_rate": 1.9963168331757728e-05, + "loss": 0.4335, + "step": 1472 + }, + { + "epoch": 0.027364911277536638, + "grad_norm": 0.29205963015556335, + "learning_rate": 1.9963068239402754e-05, + "loss": 0.1835, + "step": 1474 + }, + { + "epoch": 0.027402041414955277, + "grad_norm": 0.39108943939208984, + "learning_rate": 1.9962968011480265e-05, + "loss": 0.3862, + "step": 1476 + }, + { + "epoch": 0.027439171552373916, + "grad_norm": 0.3224082887172699, + "learning_rate": 1.9962867647991624e-05, + "loss": 0.2572, + "step": 1478 + }, + { + "epoch": 0.027476301689792555, + "grad_norm": 0.3712817430496216, + "learning_rate": 1.99627671489382e-05, + "loss": 0.2628, + "step": 1480 + }, + { + "epoch": 0.027513431827211193, + "grad_norm": 0.3820110559463501, + "learning_rate": 1.996266651432135e-05, + "loss": 0.3577, + "step": 1482 + }, + { + "epoch": 0.027550561964629832, + "grad_norm": 0.29978644847869873, + "learning_rate": 1.9962565744142454e-05, + "loss": 0.3791, + "step": 1484 + }, + { + "epoch": 0.02758769210204847, + "grad_norm": 0.3531618118286133, + "learning_rate": 1.9962464838402883e-05, + "loss": 0.2792, + "step": 1486 + }, + { + "epoch": 0.02762482223946711, + "grad_norm": 0.28927019238471985, + "learning_rate": 1.9962363797104e-05, + "loss": 0.4817, + "step": 1488 + }, + { + "epoch": 0.02766195237688575, + "grad_norm": 0.39351004362106323, + "learning_rate": 1.996226262024719e-05, + "loss": 0.3973, + "step": 1490 + }, + { + "epoch": 0.027699082514304387, + "grad_norm": 0.3068031072616577, + "learning_rate": 1.9962161307833826e-05, + "loss": 0.6361, + "step": 1492 + }, + { + "epoch": 0.027736212651723022, + "grad_norm": 0.2599097788333893, + "learning_rate": 1.9962059859865287e-05, + "loss": 0.4596, + "step": 1494 + }, + { + "epoch": 0.02777334278914166, + "grad_norm": 0.34776872396469116, + "learning_rate": 1.9961958276342952e-05, + "loss": 0.4534, + "step": 1496 + }, + { + "epoch": 0.0278104729265603, + "grad_norm": 0.29954656958580017, + "learning_rate": 1.9961856557268206e-05, + "loss": 0.3703, + "step": 1498 + }, + { + "epoch": 0.02784760306397894, + "grad_norm": 0.27080005407333374, + "learning_rate": 1.996175470264243e-05, + "loss": 0.2651, + "step": 1500 + }, + { + "epoch": 0.027884733201397578, + "grad_norm": 0.29802244901657104, + "learning_rate": 1.9961652712467013e-05, + "loss": 0.5031, + "step": 1502 + }, + { + "epoch": 0.027921863338816216, + "grad_norm": 0.28868886828422546, + "learning_rate": 1.996155058674334e-05, + "loss": 0.4347, + "step": 1504 + }, + { + "epoch": 0.027958993476234855, + "grad_norm": 0.31217366456985474, + "learning_rate": 1.99614483254728e-05, + "loss": 0.2961, + "step": 1506 + }, + { + "epoch": 0.027996123613653494, + "grad_norm": 0.37338700890541077, + "learning_rate": 1.996134592865679e-05, + "loss": 0.4172, + "step": 1508 + }, + { + "epoch": 0.028033253751072133, + "grad_norm": 0.371517151594162, + "learning_rate": 1.9961243396296696e-05, + "loss": 0.4035, + "step": 1510 + }, + { + "epoch": 0.02807038388849077, + "grad_norm": 0.39833423495292664, + "learning_rate": 1.996114072839392e-05, + "loss": 0.2867, + "step": 1512 + }, + { + "epoch": 0.02810751402590941, + "grad_norm": 0.37536507844924927, + "learning_rate": 1.9961037924949853e-05, + "loss": 0.303, + "step": 1514 + }, + { + "epoch": 0.02814464416332805, + "grad_norm": 0.38057741522789, + "learning_rate": 1.9960934985965897e-05, + "loss": 0.3201, + "step": 1516 + }, + { + "epoch": 0.028181774300746688, + "grad_norm": 0.3367200493812561, + "learning_rate": 1.9960831911443455e-05, + "loss": 0.2645, + "step": 1518 + }, + { + "epoch": 0.028218904438165326, + "grad_norm": 0.3803890645503998, + "learning_rate": 1.9960728701383924e-05, + "loss": 0.3295, + "step": 1520 + }, + { + "epoch": 0.028256034575583965, + "grad_norm": 0.3274245858192444, + "learning_rate": 1.9960625355788714e-05, + "loss": 0.3953, + "step": 1522 + }, + { + "epoch": 0.028293164713002604, + "grad_norm": 0.5688687562942505, + "learning_rate": 1.9960521874659226e-05, + "loss": 0.4073, + "step": 1524 + }, + { + "epoch": 0.028330294850421243, + "grad_norm": 0.4595238268375397, + "learning_rate": 1.996041825799687e-05, + "loss": 0.5574, + "step": 1526 + }, + { + "epoch": 0.02836742498783988, + "grad_norm": 0.5020874738693237, + "learning_rate": 1.9960314505803056e-05, + "loss": 0.366, + "step": 1528 + }, + { + "epoch": 0.02840455512525852, + "grad_norm": 0.39065760374069214, + "learning_rate": 1.99602106180792e-05, + "loss": 0.3016, + "step": 1530 + }, + { + "epoch": 0.028441685262677156, + "grad_norm": 0.29245397448539734, + "learning_rate": 1.996010659482671e-05, + "loss": 0.2864, + "step": 1532 + }, + { + "epoch": 0.028478815400095794, + "grad_norm": 0.41922682523727417, + "learning_rate": 1.9960002436047002e-05, + "loss": 0.4411, + "step": 1534 + }, + { + "epoch": 0.028515945537514433, + "grad_norm": 0.3665362000465393, + "learning_rate": 1.9959898141741494e-05, + "loss": 0.2283, + "step": 1536 + }, + { + "epoch": 0.028553075674933072, + "grad_norm": 0.3724018931388855, + "learning_rate": 1.9959793711911608e-05, + "loss": 0.4392, + "step": 1538 + }, + { + "epoch": 0.02859020581235171, + "grad_norm": 0.409440279006958, + "learning_rate": 1.995968914655876e-05, + "loss": 0.3358, + "step": 1540 + }, + { + "epoch": 0.02862733594977035, + "grad_norm": 0.512639045715332, + "learning_rate": 1.9959584445684377e-05, + "loss": 0.3464, + "step": 1542 + }, + { + "epoch": 0.028664466087188988, + "grad_norm": 0.3631186783313751, + "learning_rate": 1.995947960928988e-05, + "loss": 0.436, + "step": 1544 + }, + { + "epoch": 0.028701596224607627, + "grad_norm": 0.3963833749294281, + "learning_rate": 1.9959374637376704e-05, + "loss": 0.42, + "step": 1546 + }, + { + "epoch": 0.028738726362026266, + "grad_norm": 0.27118197083473206, + "learning_rate": 1.9959269529946264e-05, + "loss": 0.42, + "step": 1548 + }, + { + "epoch": 0.028775856499444905, + "grad_norm": 0.3797779977321625, + "learning_rate": 1.9959164287e-05, + "loss": 0.4648, + "step": 1550 + }, + { + "epoch": 0.028812986636863543, + "grad_norm": 0.37404870986938477, + "learning_rate": 1.9959058908539342e-05, + "loss": 0.3835, + "step": 1552 + }, + { + "epoch": 0.028850116774282182, + "grad_norm": 0.27369093894958496, + "learning_rate": 1.9958953394565722e-05, + "loss": 0.4301, + "step": 1554 + }, + { + "epoch": 0.02888724691170082, + "grad_norm": 0.37033283710479736, + "learning_rate": 1.9958847745080577e-05, + "loss": 0.3592, + "step": 1556 + }, + { + "epoch": 0.02892437704911946, + "grad_norm": 0.3935405910015106, + "learning_rate": 1.9958741960085345e-05, + "loss": 0.2618, + "step": 1558 + }, + { + "epoch": 0.0289615071865381, + "grad_norm": 0.2885076701641083, + "learning_rate": 1.9958636039581464e-05, + "loss": 0.4715, + "step": 1560 + }, + { + "epoch": 0.028998637323956737, + "grad_norm": 0.24158255755901337, + "learning_rate": 1.9958529983570374e-05, + "loss": 0.318, + "step": 1562 + }, + { + "epoch": 0.029035767461375376, + "grad_norm": 0.2989046573638916, + "learning_rate": 1.9958423792053524e-05, + "loss": 0.4084, + "step": 1564 + }, + { + "epoch": 0.029072897598794015, + "grad_norm": 0.33272868394851685, + "learning_rate": 1.9958317465032352e-05, + "loss": 0.4199, + "step": 1566 + }, + { + "epoch": 0.02911002773621265, + "grad_norm": 0.4965613782405853, + "learning_rate": 1.995821100250831e-05, + "loss": 0.4254, + "step": 1568 + }, + { + "epoch": 0.02914715787363129, + "grad_norm": 0.37236645817756653, + "learning_rate": 1.9958104404482843e-05, + "loss": 0.3838, + "step": 1570 + }, + { + "epoch": 0.029184288011049928, + "grad_norm": 0.3216698467731476, + "learning_rate": 1.9957997670957403e-05, + "loss": 0.4545, + "step": 1572 + }, + { + "epoch": 0.029221418148468566, + "grad_norm": 0.25853726267814636, + "learning_rate": 1.9957890801933444e-05, + "loss": 0.3244, + "step": 1574 + }, + { + "epoch": 0.029258548285887205, + "grad_norm": 0.39867860078811646, + "learning_rate": 1.9957783797412416e-05, + "loss": 0.4424, + "step": 1576 + }, + { + "epoch": 0.029295678423305844, + "grad_norm": 0.3040996789932251, + "learning_rate": 1.995767665739578e-05, + "loss": 0.4894, + "step": 1578 + }, + { + "epoch": 0.029332808560724483, + "grad_norm": 0.3603006601333618, + "learning_rate": 1.9957569381884993e-05, + "loss": 0.3921, + "step": 1580 + }, + { + "epoch": 0.02936993869814312, + "grad_norm": 0.40616369247436523, + "learning_rate": 1.995746197088151e-05, + "loss": 0.5022, + "step": 1582 + }, + { + "epoch": 0.02940706883556176, + "grad_norm": 0.28409871459007263, + "learning_rate": 1.9957354424386797e-05, + "loss": 0.5682, + "step": 1584 + }, + { + "epoch": 0.0294441989729804, + "grad_norm": 0.42425426840782166, + "learning_rate": 1.9957246742402313e-05, + "loss": 0.4215, + "step": 1586 + }, + { + "epoch": 0.029481329110399038, + "grad_norm": 0.4165717661380768, + "learning_rate": 1.9957138924929532e-05, + "loss": 0.5334, + "step": 1588 + }, + { + "epoch": 0.029518459247817676, + "grad_norm": 0.33867260813713074, + "learning_rate": 1.995703097196991e-05, + "loss": 0.4195, + "step": 1590 + }, + { + "epoch": 0.029555589385236315, + "grad_norm": 0.4551469385623932, + "learning_rate": 1.9956922883524925e-05, + "loss": 0.3001, + "step": 1592 + }, + { + "epoch": 0.029592719522654954, + "grad_norm": 0.5343561768531799, + "learning_rate": 1.9956814659596042e-05, + "loss": 0.3407, + "step": 1594 + }, + { + "epoch": 0.029629849660073593, + "grad_norm": 0.4037441313266754, + "learning_rate": 1.9956706300184737e-05, + "loss": 0.4728, + "step": 1596 + }, + { + "epoch": 0.02966697979749223, + "grad_norm": 0.378182053565979, + "learning_rate": 1.9956597805292478e-05, + "loss": 0.3131, + "step": 1598 + }, + { + "epoch": 0.02970410993491087, + "grad_norm": 0.35998398065567017, + "learning_rate": 1.995648917492075e-05, + "loss": 0.4301, + "step": 1600 + }, + { + "epoch": 0.02974124007232951, + "grad_norm": 0.4198257029056549, + "learning_rate": 1.995638040907103e-05, + "loss": 0.3985, + "step": 1602 + }, + { + "epoch": 0.029778370209748148, + "grad_norm": 0.4329060912132263, + "learning_rate": 1.9956271507744794e-05, + "loss": 0.5794, + "step": 1604 + }, + { + "epoch": 0.029815500347166783, + "grad_norm": 0.5906333923339844, + "learning_rate": 1.9956162470943522e-05, + "loss": 0.3322, + "step": 1606 + }, + { + "epoch": 0.029852630484585422, + "grad_norm": 0.3923175632953644, + "learning_rate": 1.9956053298668705e-05, + "loss": 0.5322, + "step": 1608 + }, + { + "epoch": 0.02988976062200406, + "grad_norm": 0.3679203391075134, + "learning_rate": 1.9955943990921823e-05, + "loss": 0.3602, + "step": 1610 + }, + { + "epoch": 0.0299268907594227, + "grad_norm": 0.4073547124862671, + "learning_rate": 1.9955834547704364e-05, + "loss": 0.3927, + "step": 1612 + }, + { + "epoch": 0.029964020896841338, + "grad_norm": 0.458255797624588, + "learning_rate": 1.995572496901782e-05, + "loss": 0.4757, + "step": 1614 + }, + { + "epoch": 0.030001151034259977, + "grad_norm": 0.29495638608932495, + "learning_rate": 1.995561525486368e-05, + "loss": 0.4078, + "step": 1616 + }, + { + "epoch": 0.030038281171678616, + "grad_norm": 0.2971830666065216, + "learning_rate": 1.9955505405243435e-05, + "loss": 0.2955, + "step": 1618 + }, + { + "epoch": 0.030075411309097255, + "grad_norm": 0.354912668466568, + "learning_rate": 1.995539542015858e-05, + "loss": 0.4524, + "step": 1620 + }, + { + "epoch": 0.030112541446515893, + "grad_norm": 0.35809728503227234, + "learning_rate": 1.9955285299610615e-05, + "loss": 0.3758, + "step": 1622 + }, + { + "epoch": 0.030149671583934532, + "grad_norm": 0.43015095591545105, + "learning_rate": 1.9955175043601034e-05, + "loss": 0.5977, + "step": 1624 + }, + { + "epoch": 0.03018680172135317, + "grad_norm": 0.40109366178512573, + "learning_rate": 1.9955064652131347e-05, + "loss": 0.4039, + "step": 1626 + }, + { + "epoch": 0.03022393185877181, + "grad_norm": 0.4162982702255249, + "learning_rate": 1.9954954125203042e-05, + "loss": 0.515, + "step": 1628 + }, + { + "epoch": 0.03026106199619045, + "grad_norm": 0.4334047734737396, + "learning_rate": 1.9954843462817632e-05, + "loss": 0.2991, + "step": 1630 + }, + { + "epoch": 0.030298192133609087, + "grad_norm": 0.35421228408813477, + "learning_rate": 1.9954732664976624e-05, + "loss": 0.4193, + "step": 1632 + }, + { + "epoch": 0.030335322271027726, + "grad_norm": 0.37089452147483826, + "learning_rate": 1.9954621731681518e-05, + "loss": 0.2015, + "step": 1634 + }, + { + "epoch": 0.030372452408446365, + "grad_norm": 0.2774275541305542, + "learning_rate": 1.995451066293383e-05, + "loss": 0.3854, + "step": 1636 + }, + { + "epoch": 0.030409582545865003, + "grad_norm": 0.5215280652046204, + "learning_rate": 1.9954399458735067e-05, + "loss": 0.3936, + "step": 1638 + }, + { + "epoch": 0.030446712683283642, + "grad_norm": 0.34046199917793274, + "learning_rate": 1.9954288119086748e-05, + "loss": 0.5031, + "step": 1640 + }, + { + "epoch": 0.03048384282070228, + "grad_norm": 0.36269626021385193, + "learning_rate": 1.995417664399038e-05, + "loss": 0.3754, + "step": 1642 + }, + { + "epoch": 0.030520972958120916, + "grad_norm": 0.2609393298625946, + "learning_rate": 1.9954065033447488e-05, + "loss": 0.3487, + "step": 1644 + }, + { + "epoch": 0.030558103095539555, + "grad_norm": 0.2629873752593994, + "learning_rate": 1.9953953287459584e-05, + "loss": 0.5352, + "step": 1646 + }, + { + "epoch": 0.030595233232958194, + "grad_norm": 0.33361902832984924, + "learning_rate": 1.9953841406028192e-05, + "loss": 0.2466, + "step": 1648 + }, + { + "epoch": 0.030632363370376833, + "grad_norm": 0.3738601505756378, + "learning_rate": 1.9953729389154835e-05, + "loss": 0.3589, + "step": 1650 + }, + { + "epoch": 0.03066949350779547, + "grad_norm": 0.32117071747779846, + "learning_rate": 1.9953617236841035e-05, + "loss": 0.254, + "step": 1652 + }, + { + "epoch": 0.03070662364521411, + "grad_norm": 0.32607585191726685, + "learning_rate": 1.9953504949088317e-05, + "loss": 0.2791, + "step": 1654 + }, + { + "epoch": 0.03074375378263275, + "grad_norm": 0.2379506230354309, + "learning_rate": 1.9953392525898213e-05, + "loss": 0.2839, + "step": 1656 + }, + { + "epoch": 0.030780883920051388, + "grad_norm": 0.5135214924812317, + "learning_rate": 1.995327996727225e-05, + "loss": 0.2939, + "step": 1658 + }, + { + "epoch": 0.030818014057470026, + "grad_norm": 0.43111538887023926, + "learning_rate": 1.9953167273211958e-05, + "loss": 0.3749, + "step": 1660 + }, + { + "epoch": 0.030855144194888665, + "grad_norm": 0.41557297110557556, + "learning_rate": 1.9953054443718875e-05, + "loss": 0.3819, + "step": 1662 + }, + { + "epoch": 0.030892274332307304, + "grad_norm": 0.30154499411582947, + "learning_rate": 1.9952941478794534e-05, + "loss": 0.2877, + "step": 1664 + }, + { + "epoch": 0.030929404469725943, + "grad_norm": 0.37477779388427734, + "learning_rate": 1.9952828378440473e-05, + "loss": 0.2703, + "step": 1666 + }, + { + "epoch": 0.03096653460714458, + "grad_norm": 0.35994696617126465, + "learning_rate": 1.9952715142658228e-05, + "loss": 0.3038, + "step": 1668 + }, + { + "epoch": 0.03100366474456322, + "grad_norm": 0.374391108751297, + "learning_rate": 1.995260177144934e-05, + "loss": 0.2479, + "step": 1670 + }, + { + "epoch": 0.03104079488198186, + "grad_norm": 0.33741456270217896, + "learning_rate": 1.9952488264815357e-05, + "loss": 0.2619, + "step": 1672 + }, + { + "epoch": 0.031077925019400498, + "grad_norm": 0.29656046628952026, + "learning_rate": 1.9952374622757818e-05, + "loss": 0.2605, + "step": 1674 + }, + { + "epoch": 0.031115055156819137, + "grad_norm": 0.2902997136116028, + "learning_rate": 1.9952260845278274e-05, + "loss": 0.1991, + "step": 1676 + }, + { + "epoch": 0.031152185294237775, + "grad_norm": 0.2407100647687912, + "learning_rate": 1.9952146932378267e-05, + "loss": 0.2439, + "step": 1678 + }, + { + "epoch": 0.031189315431656414, + "grad_norm": 0.40434888005256653, + "learning_rate": 1.995203288405935e-05, + "loss": 0.3774, + "step": 1680 + }, + { + "epoch": 0.03122644556907505, + "grad_norm": 0.358460932970047, + "learning_rate": 1.9951918700323077e-05, + "loss": 0.4143, + "step": 1682 + }, + { + "epoch": 0.03126357570649369, + "grad_norm": 0.33627939224243164, + "learning_rate": 1.9951804381171e-05, + "loss": 0.3964, + "step": 1684 + }, + { + "epoch": 0.03130070584391233, + "grad_norm": 0.26235583424568176, + "learning_rate": 1.9951689926604673e-05, + "loss": 0.3224, + "step": 1686 + }, + { + "epoch": 0.03133783598133097, + "grad_norm": 0.4374541640281677, + "learning_rate": 1.9951575336625658e-05, + "loss": 0.4657, + "step": 1688 + }, + { + "epoch": 0.031374966118749605, + "grad_norm": 0.4052223563194275, + "learning_rate": 1.995146061123551e-05, + "loss": 0.2915, + "step": 1690 + }, + { + "epoch": 0.03141209625616825, + "grad_norm": 0.4146050810813904, + "learning_rate": 1.995134575043579e-05, + "loss": 0.3734, + "step": 1692 + }, + { + "epoch": 0.03144922639358688, + "grad_norm": 0.32710057497024536, + "learning_rate": 1.9951230754228062e-05, + "loss": 0.3419, + "step": 1694 + }, + { + "epoch": 0.031486356531005524, + "grad_norm": 0.36174726486206055, + "learning_rate": 1.995111562261389e-05, + "loss": 0.4012, + "step": 1696 + }, + { + "epoch": 0.03152348666842416, + "grad_norm": 0.3643866777420044, + "learning_rate": 1.9951000355594842e-05, + "loss": 0.3863, + "step": 1698 + }, + { + "epoch": 0.0315606168058428, + "grad_norm": 0.35113590955734253, + "learning_rate": 1.9950884953172486e-05, + "loss": 0.3159, + "step": 1700 + }, + { + "epoch": 0.03159774694326144, + "grad_norm": 0.5454980134963989, + "learning_rate": 1.9950769415348393e-05, + "loss": 0.5577, + "step": 1702 + }, + { + "epoch": 0.03163487708068007, + "grad_norm": 0.33126652240753174, + "learning_rate": 1.9950653742124136e-05, + "loss": 0.3469, + "step": 1704 + }, + { + "epoch": 0.031672007218098715, + "grad_norm": 0.3847205936908722, + "learning_rate": 1.9950537933501285e-05, + "loss": 0.3718, + "step": 1706 + }, + { + "epoch": 0.03170913735551735, + "grad_norm": 0.2673433721065521, + "learning_rate": 1.9950421989481418e-05, + "loss": 0.3175, + "step": 1708 + }, + { + "epoch": 0.03174626749293599, + "grad_norm": 0.26624780893325806, + "learning_rate": 1.995030591006611e-05, + "loss": 0.4444, + "step": 1710 + }, + { + "epoch": 0.03178339763035463, + "grad_norm": 0.3721410632133484, + "learning_rate": 1.9950189695256946e-05, + "loss": 0.3837, + "step": 1712 + }, + { + "epoch": 0.03182052776777327, + "grad_norm": 0.40957459807395935, + "learning_rate": 1.9950073345055507e-05, + "loss": 0.3521, + "step": 1714 + }, + { + "epoch": 0.031857657905191905, + "grad_norm": 0.2849561870098114, + "learning_rate": 1.994995685946337e-05, + "loss": 0.2899, + "step": 1716 + }, + { + "epoch": 0.03189478804261055, + "grad_norm": 0.557837188243866, + "learning_rate": 1.9949840238482126e-05, + "loss": 0.4348, + "step": 1718 + }, + { + "epoch": 0.03193191818002918, + "grad_norm": 0.4172285497188568, + "learning_rate": 1.9949723482113355e-05, + "loss": 0.2374, + "step": 1720 + }, + { + "epoch": 0.031969048317447825, + "grad_norm": 0.22838103771209717, + "learning_rate": 1.9949606590358657e-05, + "loss": 0.2863, + "step": 1722 + }, + { + "epoch": 0.03200617845486646, + "grad_norm": 0.367658793926239, + "learning_rate": 1.9949489563219612e-05, + "loss": 0.2889, + "step": 1724 + }, + { + "epoch": 0.0320433085922851, + "grad_norm": 0.427361398935318, + "learning_rate": 1.994937240069782e-05, + "loss": 0.2958, + "step": 1726 + }, + { + "epoch": 0.03208043872970374, + "grad_norm": 0.33430930972099304, + "learning_rate": 1.9949255102794867e-05, + "loss": 0.3483, + "step": 1728 + }, + { + "epoch": 0.03211756886712238, + "grad_norm": 0.36812496185302734, + "learning_rate": 1.9949137669512354e-05, + "loss": 0.422, + "step": 1730 + }, + { + "epoch": 0.032154699004541015, + "grad_norm": 0.24867336452007294, + "learning_rate": 1.9949020100851876e-05, + "loss": 0.2527, + "step": 1732 + }, + { + "epoch": 0.03219182914195966, + "grad_norm": 0.5246883034706116, + "learning_rate": 1.994890239681504e-05, + "loss": 0.5334, + "step": 1734 + }, + { + "epoch": 0.03222895927937829, + "grad_norm": 0.30807244777679443, + "learning_rate": 1.9948784557403442e-05, + "loss": 0.3576, + "step": 1736 + }, + { + "epoch": 0.03226608941679693, + "grad_norm": 0.35629957914352417, + "learning_rate": 1.9948666582618685e-05, + "loss": 0.2987, + "step": 1738 + }, + { + "epoch": 0.03230321955421557, + "grad_norm": 0.3506420850753784, + "learning_rate": 1.9948548472462376e-05, + "loss": 0.1867, + "step": 1740 + }, + { + "epoch": 0.032340349691634206, + "grad_norm": 0.3471173644065857, + "learning_rate": 1.994843022693612e-05, + "loss": 0.4119, + "step": 1742 + }, + { + "epoch": 0.03237747982905285, + "grad_norm": 0.3845759928226471, + "learning_rate": 1.994831184604153e-05, + "loss": 0.3232, + "step": 1744 + }, + { + "epoch": 0.03241460996647148, + "grad_norm": 0.25755491852760315, + "learning_rate": 1.9948193329780214e-05, + "loss": 0.2737, + "step": 1746 + }, + { + "epoch": 0.032451740103890125, + "grad_norm": 0.3594185411930084, + "learning_rate": 1.9948074678153783e-05, + "loss": 0.334, + "step": 1748 + }, + { + "epoch": 0.03248887024130876, + "grad_norm": 0.4289058744907379, + "learning_rate": 1.9947955891163858e-05, + "loss": 0.3937, + "step": 1750 + }, + { + "epoch": 0.0325260003787274, + "grad_norm": 0.3494783341884613, + "learning_rate": 1.9947836968812047e-05, + "loss": 0.3211, + "step": 1752 + }, + { + "epoch": 0.03256313051614604, + "grad_norm": 0.3607964515686035, + "learning_rate": 1.9947717911099973e-05, + "loss": 0.2496, + "step": 1754 + }, + { + "epoch": 0.03260026065356468, + "grad_norm": 0.36975347995758057, + "learning_rate": 1.9947598718029257e-05, + "loss": 0.4069, + "step": 1756 + }, + { + "epoch": 0.032637390790983316, + "grad_norm": 0.28233110904693604, + "learning_rate": 1.9947479389601516e-05, + "loss": 0.3403, + "step": 1758 + }, + { + "epoch": 0.03267452092840196, + "grad_norm": 0.36703306436538696, + "learning_rate": 1.9947359925818378e-05, + "loss": 0.293, + "step": 1760 + }, + { + "epoch": 0.03271165106582059, + "grad_norm": 0.3206377923488617, + "learning_rate": 1.994724032668147e-05, + "loss": 0.3352, + "step": 1762 + }, + { + "epoch": 0.032748781203239236, + "grad_norm": 0.2851402163505554, + "learning_rate": 1.994712059219241e-05, + "loss": 0.5385, + "step": 1764 + }, + { + "epoch": 0.03278591134065787, + "grad_norm": 0.43175849318504333, + "learning_rate": 1.994700072235284e-05, + "loss": 0.3726, + "step": 1766 + }, + { + "epoch": 0.03282304147807651, + "grad_norm": 0.43918466567993164, + "learning_rate": 1.994688071716438e-05, + "loss": 0.5355, + "step": 1768 + }, + { + "epoch": 0.03286017161549515, + "grad_norm": 0.5312216877937317, + "learning_rate": 1.994676057662867e-05, + "loss": 0.4615, + "step": 1770 + }, + { + "epoch": 0.03289730175291379, + "grad_norm": 0.514531672000885, + "learning_rate": 1.994664030074734e-05, + "loss": 0.3627, + "step": 1772 + }, + { + "epoch": 0.032934431890332426, + "grad_norm": 0.3006580173969269, + "learning_rate": 1.9946519889522036e-05, + "loss": 0.3508, + "step": 1774 + }, + { + "epoch": 0.03297156202775106, + "grad_norm": 0.44068196415901184, + "learning_rate": 1.9946399342954384e-05, + "loss": 0.5224, + "step": 1776 + }, + { + "epoch": 0.033008692165169704, + "grad_norm": 0.4029116928577423, + "learning_rate": 1.9946278661046032e-05, + "loss": 0.2598, + "step": 1778 + }, + { + "epoch": 0.03304582230258834, + "grad_norm": 0.2721407413482666, + "learning_rate": 1.9946157843798618e-05, + "loss": 0.3648, + "step": 1780 + }, + { + "epoch": 0.03308295244000698, + "grad_norm": 0.3865302801132202, + "learning_rate": 1.994603689121379e-05, + "loss": 0.5265, + "step": 1782 + }, + { + "epoch": 0.033120082577425616, + "grad_norm": 0.4034496247768402, + "learning_rate": 1.9945915803293192e-05, + "loss": 0.5124, + "step": 1784 + }, + { + "epoch": 0.03315721271484426, + "grad_norm": 0.34259992837905884, + "learning_rate": 1.994579458003847e-05, + "loss": 0.3764, + "step": 1786 + }, + { + "epoch": 0.033194342852262894, + "grad_norm": 0.30749520659446716, + "learning_rate": 1.9945673221451276e-05, + "loss": 0.3621, + "step": 1788 + }, + { + "epoch": 0.033231472989681536, + "grad_norm": 0.4914582371711731, + "learning_rate": 1.9945551727533258e-05, + "loss": 0.3662, + "step": 1790 + }, + { + "epoch": 0.03326860312710017, + "grad_norm": 0.3235917091369629, + "learning_rate": 1.9945430098286074e-05, + "loss": 0.5538, + "step": 1792 + }, + { + "epoch": 0.033305733264518814, + "grad_norm": 0.4097521901130676, + "learning_rate": 1.9945308333711376e-05, + "loss": 0.254, + "step": 1794 + }, + { + "epoch": 0.03334286340193745, + "grad_norm": 0.3567485511302948, + "learning_rate": 1.994518643381082e-05, + "loss": 0.4842, + "step": 1796 + }, + { + "epoch": 0.03337999353935609, + "grad_norm": 0.36090123653411865, + "learning_rate": 1.9945064398586066e-05, + "loss": 0.2326, + "step": 1798 + }, + { + "epoch": 0.033417123676774727, + "grad_norm": 0.3272109925746918, + "learning_rate": 1.9944942228038773e-05, + "loss": 0.5286, + "step": 1800 + }, + { + "epoch": 0.03345425381419337, + "grad_norm": 0.3510742783546448, + "learning_rate": 1.9944819922170605e-05, + "loss": 0.412, + "step": 1802 + }, + { + "epoch": 0.033491383951612004, + "grad_norm": 0.5229774117469788, + "learning_rate": 1.994469748098323e-05, + "loss": 0.3688, + "step": 1804 + }, + { + "epoch": 0.033528514089030646, + "grad_norm": 0.29095184803009033, + "learning_rate": 1.9944574904478306e-05, + "loss": 0.3508, + "step": 1806 + }, + { + "epoch": 0.03356564422644928, + "grad_norm": 0.3484596312046051, + "learning_rate": 1.994445219265751e-05, + "loss": 0.4964, + "step": 1808 + }, + { + "epoch": 0.033602774363867924, + "grad_norm": 0.385435551404953, + "learning_rate": 1.99443293455225e-05, + "loss": 0.4396, + "step": 1810 + }, + { + "epoch": 0.03363990450128656, + "grad_norm": 0.3210596442222595, + "learning_rate": 1.994420636307496e-05, + "loss": 0.4621, + "step": 1812 + }, + { + "epoch": 0.033677034638705194, + "grad_norm": 0.3639221489429474, + "learning_rate": 1.9944083245316555e-05, + "loss": 0.3999, + "step": 1814 + }, + { + "epoch": 0.03371416477612384, + "grad_norm": 0.4703008532524109, + "learning_rate": 1.9943959992248966e-05, + "loss": 0.2932, + "step": 1816 + }, + { + "epoch": 0.03375129491354247, + "grad_norm": 0.5173124670982361, + "learning_rate": 1.9943836603873863e-05, + "loss": 0.458, + "step": 1818 + }, + { + "epoch": 0.033788425050961114, + "grad_norm": 0.3299204111099243, + "learning_rate": 1.994371308019293e-05, + "loss": 0.3484, + "step": 1820 + }, + { + "epoch": 0.03382555518837975, + "grad_norm": 0.3522112965583801, + "learning_rate": 1.9943589421207848e-05, + "loss": 0.3915, + "step": 1822 + }, + { + "epoch": 0.03386268532579839, + "grad_norm": 0.3242979347705841, + "learning_rate": 1.9943465626920296e-05, + "loss": 0.5231, + "step": 1824 + }, + { + "epoch": 0.03389981546321703, + "grad_norm": 0.30497244000434875, + "learning_rate": 1.9943341697331964e-05, + "loss": 0.3446, + "step": 1826 + }, + { + "epoch": 0.03393694560063567, + "grad_norm": 0.4302389323711395, + "learning_rate": 1.9943217632444535e-05, + "loss": 0.5631, + "step": 1828 + }, + { + "epoch": 0.033974075738054305, + "grad_norm": 0.3600137531757355, + "learning_rate": 1.9943093432259696e-05, + "loss": 0.5228, + "step": 1830 + }, + { + "epoch": 0.03401120587547295, + "grad_norm": 0.4871399700641632, + "learning_rate": 1.994296909677914e-05, + "loss": 0.3173, + "step": 1832 + }, + { + "epoch": 0.03404833601289158, + "grad_norm": 0.4235627055168152, + "learning_rate": 1.9942844626004557e-05, + "loss": 0.3799, + "step": 1834 + }, + { + "epoch": 0.034085466150310224, + "grad_norm": 0.5523370504379272, + "learning_rate": 1.994272001993764e-05, + "loss": 0.4066, + "step": 1836 + }, + { + "epoch": 0.03412259628772886, + "grad_norm": 0.3631313145160675, + "learning_rate": 1.9942595278580087e-05, + "loss": 0.5382, + "step": 1838 + }, + { + "epoch": 0.0341597264251475, + "grad_norm": 0.25497767329216003, + "learning_rate": 1.9942470401933592e-05, + "loss": 0.3474, + "step": 1840 + }, + { + "epoch": 0.03419685656256614, + "grad_norm": 0.2758070230484009, + "learning_rate": 1.9942345389999854e-05, + "loss": 0.2967, + "step": 1842 + }, + { + "epoch": 0.03423398669998478, + "grad_norm": 0.305310994386673, + "learning_rate": 1.994222024278058e-05, + "loss": 0.478, + "step": 1844 + }, + { + "epoch": 0.034271116837403415, + "grad_norm": 0.27458810806274414, + "learning_rate": 1.9942094960277468e-05, + "loss": 0.4244, + "step": 1846 + }, + { + "epoch": 0.03430824697482206, + "grad_norm": 0.2730516791343689, + "learning_rate": 1.994196954249222e-05, + "loss": 0.4479, + "step": 1848 + }, + { + "epoch": 0.03434537711224069, + "grad_norm": 0.5396698713302612, + "learning_rate": 1.9941843989426552e-05, + "loss": 0.4003, + "step": 1850 + }, + { + "epoch": 0.03438250724965933, + "grad_norm": 0.3073475956916809, + "learning_rate": 1.9941718301082162e-05, + "loss": 0.4318, + "step": 1852 + }, + { + "epoch": 0.03441963738707797, + "grad_norm": 0.27815282344818115, + "learning_rate": 1.9941592477460767e-05, + "loss": 0.4288, + "step": 1854 + }, + { + "epoch": 0.034456767524496605, + "grad_norm": 0.39860472083091736, + "learning_rate": 1.9941466518564076e-05, + "loss": 0.375, + "step": 1856 + }, + { + "epoch": 0.03449389766191525, + "grad_norm": 0.33730626106262207, + "learning_rate": 1.9941340424393804e-05, + "loss": 0.1817, + "step": 1858 + }, + { + "epoch": 0.03453102779933388, + "grad_norm": 0.4199099540710449, + "learning_rate": 1.9941214194951666e-05, + "loss": 0.472, + "step": 1860 + }, + { + "epoch": 0.034568157936752525, + "grad_norm": 0.35788774490356445, + "learning_rate": 1.994108783023938e-05, + "loss": 0.3755, + "step": 1862 + }, + { + "epoch": 0.03460528807417116, + "grad_norm": 0.3827565312385559, + "learning_rate": 1.994096133025867e-05, + "loss": 0.3213, + "step": 1864 + }, + { + "epoch": 0.0346424182115898, + "grad_norm": 0.4929308295249939, + "learning_rate": 1.994083469501125e-05, + "loss": 0.4381, + "step": 1866 + }, + { + "epoch": 0.03467954834900844, + "grad_norm": 0.25834909081459045, + "learning_rate": 1.9940707924498843e-05, + "loss": 0.3474, + "step": 1868 + }, + { + "epoch": 0.03471667848642708, + "grad_norm": 0.4430135190486908, + "learning_rate": 1.994058101872318e-05, + "loss": 0.3914, + "step": 1870 + }, + { + "epoch": 0.034753808623845715, + "grad_norm": 0.3168889284133911, + "learning_rate": 1.9940453977685985e-05, + "loss": 0.3955, + "step": 1872 + }, + { + "epoch": 0.03479093876126436, + "grad_norm": 0.3557870388031006, + "learning_rate": 1.9940326801388985e-05, + "loss": 0.3993, + "step": 1874 + }, + { + "epoch": 0.03482806889868299, + "grad_norm": 0.4322146773338318, + "learning_rate": 1.9940199489833912e-05, + "loss": 0.35, + "step": 1876 + }, + { + "epoch": 0.034865199036101635, + "grad_norm": 0.3354968726634979, + "learning_rate": 1.99400720430225e-05, + "loss": 0.4373, + "step": 1878 + }, + { + "epoch": 0.03490232917352027, + "grad_norm": 0.42228370904922485, + "learning_rate": 1.993994446095648e-05, + "loss": 0.4806, + "step": 1880 + }, + { + "epoch": 0.03493945931093891, + "grad_norm": 0.316290020942688, + "learning_rate": 1.993981674363759e-05, + "loss": 0.5521, + "step": 1882 + }, + { + "epoch": 0.03497658944835755, + "grad_norm": 0.3309059739112854, + "learning_rate": 1.993968889106757e-05, + "loss": 0.357, + "step": 1884 + }, + { + "epoch": 0.03501371958577619, + "grad_norm": 0.3444094657897949, + "learning_rate": 1.993956090324815e-05, + "loss": 0.2369, + "step": 1886 + }, + { + "epoch": 0.035050849723194825, + "grad_norm": 0.26614245772361755, + "learning_rate": 1.9939432780181084e-05, + "loss": 0.4925, + "step": 1888 + }, + { + "epoch": 0.03508797986061346, + "grad_norm": 0.370993435382843, + "learning_rate": 1.993930452186811e-05, + "loss": 0.3997, + "step": 1890 + }, + { + "epoch": 0.0351251099980321, + "grad_norm": 0.44502395391464233, + "learning_rate": 1.993917612831097e-05, + "loss": 0.3391, + "step": 1892 + }, + { + "epoch": 0.03516224013545074, + "grad_norm": 0.297002375125885, + "learning_rate": 1.993904759951142e-05, + "loss": 0.5913, + "step": 1894 + }, + { + "epoch": 0.03519937027286938, + "grad_norm": 0.3435524106025696, + "learning_rate": 1.9938918935471196e-05, + "loss": 0.5202, + "step": 1896 + }, + { + "epoch": 0.035236500410288016, + "grad_norm": 0.5185344815254211, + "learning_rate": 1.993879013619206e-05, + "loss": 0.2987, + "step": 1898 + }, + { + "epoch": 0.03527363054770666, + "grad_norm": 0.4222150444984436, + "learning_rate": 1.993866120167576e-05, + "loss": 0.3449, + "step": 1900 + }, + { + "epoch": 0.03531076068512529, + "grad_norm": 0.3053376376628876, + "learning_rate": 1.993853213192405e-05, + "loss": 0.3897, + "step": 1902 + }, + { + "epoch": 0.035347890822543936, + "grad_norm": 0.33486446738243103, + "learning_rate": 1.9938402926938686e-05, + "loss": 0.4871, + "step": 1904 + }, + { + "epoch": 0.03538502095996257, + "grad_norm": 0.3348642885684967, + "learning_rate": 1.9938273586721428e-05, + "loss": 0.3193, + "step": 1906 + }, + { + "epoch": 0.03542215109738121, + "grad_norm": 0.38619720935821533, + "learning_rate": 1.993814411127404e-05, + "loss": 0.3135, + "step": 1908 + }, + { + "epoch": 0.03545928123479985, + "grad_norm": 0.3484084904193878, + "learning_rate": 1.9938014500598274e-05, + "loss": 0.3072, + "step": 1910 + }, + { + "epoch": 0.03549641137221849, + "grad_norm": 0.33404597640037537, + "learning_rate": 1.9937884754695894e-05, + "loss": 0.3448, + "step": 1912 + }, + { + "epoch": 0.035533541509637126, + "grad_norm": 0.34234708547592163, + "learning_rate": 1.993775487356868e-05, + "loss": 0.3552, + "step": 1914 + }, + { + "epoch": 0.03557067164705577, + "grad_norm": 0.2922877371311188, + "learning_rate": 1.9937624857218382e-05, + "loss": 0.4182, + "step": 1916 + }, + { + "epoch": 0.035607801784474404, + "grad_norm": 0.33263513445854187, + "learning_rate": 1.9937494705646777e-05, + "loss": 0.2987, + "step": 1918 + }, + { + "epoch": 0.035644931921893046, + "grad_norm": 0.4626135528087616, + "learning_rate": 1.9937364418855636e-05, + "loss": 0.3211, + "step": 1920 + }, + { + "epoch": 0.03568206205931168, + "grad_norm": 0.28774896264076233, + "learning_rate": 1.9937233996846733e-05, + "loss": 0.266, + "step": 1922 + }, + { + "epoch": 0.03571919219673032, + "grad_norm": 0.2679785490036011, + "learning_rate": 1.993710343962184e-05, + "loss": 0.5221, + "step": 1924 + }, + { + "epoch": 0.03575632233414896, + "grad_norm": 0.34216928482055664, + "learning_rate": 1.993697274718273e-05, + "loss": 0.39, + "step": 1926 + }, + { + "epoch": 0.035793452471567594, + "grad_norm": 0.421111524105072, + "learning_rate": 1.993684191953119e-05, + "loss": 0.3922, + "step": 1928 + }, + { + "epoch": 0.035830582608986236, + "grad_norm": 0.6337347030639648, + "learning_rate": 1.9936710956668995e-05, + "loss": 0.4566, + "step": 1930 + }, + { + "epoch": 0.03586771274640487, + "grad_norm": 0.3697822690010071, + "learning_rate": 1.993657985859793e-05, + "loss": 0.4265, + "step": 1932 + }, + { + "epoch": 0.035904842883823514, + "grad_norm": 0.3487740457057953, + "learning_rate": 1.9936448625319772e-05, + "loss": 0.4625, + "step": 1934 + }, + { + "epoch": 0.03594197302124215, + "grad_norm": 0.34794843196868896, + "learning_rate": 1.993631725683631e-05, + "loss": 0.5031, + "step": 1936 + }, + { + "epoch": 0.03597910315866079, + "grad_norm": 0.36677077412605286, + "learning_rate": 1.993618575314934e-05, + "loss": 0.4152, + "step": 1938 + }, + { + "epoch": 0.036016233296079427, + "grad_norm": 0.3095185160636902, + "learning_rate": 1.9936054114260643e-05, + "loss": 0.4033, + "step": 1940 + }, + { + "epoch": 0.03605336343349807, + "grad_norm": 0.32272282242774963, + "learning_rate": 1.9935922340172006e-05, + "loss": 0.2433, + "step": 1942 + }, + { + "epoch": 0.036090493570916704, + "grad_norm": 0.36465156078338623, + "learning_rate": 1.9935790430885232e-05, + "loss": 0.2754, + "step": 1944 + }, + { + "epoch": 0.036127623708335346, + "grad_norm": 0.4902269244194031, + "learning_rate": 1.9935658386402107e-05, + "loss": 0.2743, + "step": 1946 + }, + { + "epoch": 0.03616475384575398, + "grad_norm": 0.37371352314949036, + "learning_rate": 1.9935526206724435e-05, + "loss": 0.5222, + "step": 1948 + }, + { + "epoch": 0.036201883983172624, + "grad_norm": 0.28851598501205444, + "learning_rate": 1.9935393891854014e-05, + "loss": 0.2028, + "step": 1950 + }, + { + "epoch": 0.03623901412059126, + "grad_norm": 0.37370339035987854, + "learning_rate": 1.9935261441792638e-05, + "loss": 0.2682, + "step": 1952 + }, + { + "epoch": 0.0362761442580099, + "grad_norm": 0.3711865246295929, + "learning_rate": 1.9935128856542117e-05, + "loss": 0.1995, + "step": 1954 + }, + { + "epoch": 0.03631327439542854, + "grad_norm": 0.33272868394851685, + "learning_rate": 1.9934996136104246e-05, + "loss": 0.3704, + "step": 1956 + }, + { + "epoch": 0.03635040453284718, + "grad_norm": 0.2833925485610962, + "learning_rate": 1.9934863280480842e-05, + "loss": 0.4734, + "step": 1958 + }, + { + "epoch": 0.036387534670265814, + "grad_norm": 0.3788817226886749, + "learning_rate": 1.9934730289673704e-05, + "loss": 0.4221, + "step": 1960 + }, + { + "epoch": 0.036424664807684456, + "grad_norm": 0.2599087655544281, + "learning_rate": 1.9934597163684647e-05, + "loss": 0.3282, + "step": 1962 + }, + { + "epoch": 0.03646179494510309, + "grad_norm": 0.3443366587162018, + "learning_rate": 1.9934463902515478e-05, + "loss": 0.367, + "step": 1964 + }, + { + "epoch": 0.03649892508252173, + "grad_norm": 0.3527637720108032, + "learning_rate": 1.9934330506168016e-05, + "loss": 0.5122, + "step": 1966 + }, + { + "epoch": 0.03653605521994037, + "grad_norm": 0.4264635741710663, + "learning_rate": 1.9934196974644067e-05, + "loss": 0.3896, + "step": 1968 + }, + { + "epoch": 0.036573185357359005, + "grad_norm": 0.32263895869255066, + "learning_rate": 1.993406330794546e-05, + "loss": 0.4964, + "step": 1970 + }, + { + "epoch": 0.03661031549477765, + "grad_norm": 0.5727680921554565, + "learning_rate": 1.9933929506074002e-05, + "loss": 0.5659, + "step": 1972 + }, + { + "epoch": 0.03664744563219628, + "grad_norm": 0.2408292442560196, + "learning_rate": 1.993379556903152e-05, + "loss": 0.2288, + "step": 1974 + }, + { + "epoch": 0.036684575769614924, + "grad_norm": 0.5993266105651855, + "learning_rate": 1.9933661496819835e-05, + "loss": 0.5051, + "step": 1976 + }, + { + "epoch": 0.03672170590703356, + "grad_norm": 0.2792007327079773, + "learning_rate": 1.9933527289440776e-05, + "loss": 0.2766, + "step": 1978 + }, + { + "epoch": 0.0367588360444522, + "grad_norm": 0.3402690589427948, + "learning_rate": 1.9933392946896163e-05, + "loss": 0.3342, + "step": 1980 + }, + { + "epoch": 0.03679596618187084, + "grad_norm": 0.3172428607940674, + "learning_rate": 1.9933258469187826e-05, + "loss": 0.3284, + "step": 1982 + }, + { + "epoch": 0.03683309631928948, + "grad_norm": 0.42654192447662354, + "learning_rate": 1.993312385631759e-05, + "loss": 0.3755, + "step": 1984 + }, + { + "epoch": 0.036870226456708115, + "grad_norm": 0.41326162219047546, + "learning_rate": 1.9932989108287294e-05, + "loss": 0.467, + "step": 1986 + }, + { + "epoch": 0.03690735659412676, + "grad_norm": 0.41839542984962463, + "learning_rate": 1.9932854225098773e-05, + "loss": 0.363, + "step": 1988 + }, + { + "epoch": 0.03694448673154539, + "grad_norm": 0.23728081583976746, + "learning_rate": 1.9932719206753853e-05, + "loss": 0.1745, + "step": 1990 + }, + { + "epoch": 0.036981616868964035, + "grad_norm": 0.29624655842781067, + "learning_rate": 1.993258405325438e-05, + "loss": 0.3023, + "step": 1992 + }, + { + "epoch": 0.03701874700638267, + "grad_norm": 0.36446744203567505, + "learning_rate": 1.9932448764602188e-05, + "loss": 0.4905, + "step": 1994 + }, + { + "epoch": 0.03705587714380131, + "grad_norm": 0.2961815595626831, + "learning_rate": 1.993231334079912e-05, + "loss": 0.2249, + "step": 1996 + }, + { + "epoch": 0.03709300728121995, + "grad_norm": 0.8085873126983643, + "learning_rate": 1.993217778184702e-05, + "loss": 0.5056, + "step": 1998 + }, + { + "epoch": 0.03713013741863859, + "grad_norm": 0.46194371581077576, + "learning_rate": 1.9932042087747727e-05, + "loss": 0.4027, + "step": 2000 + }, + { + "epoch": 0.037167267556057225, + "grad_norm": 0.2922123968601227, + "learning_rate": 1.9931906258503093e-05, + "loss": 0.4124, + "step": 2002 + }, + { + "epoch": 0.03720439769347586, + "grad_norm": 0.43618541955947876, + "learning_rate": 1.9931770294114965e-05, + "loss": 0.3928, + "step": 2004 + }, + { + "epoch": 0.0372415278308945, + "grad_norm": 0.37184783816337585, + "learning_rate": 1.9931634194585193e-05, + "loss": 0.2516, + "step": 2006 + }, + { + "epoch": 0.03727865796831314, + "grad_norm": 0.4159109890460968, + "learning_rate": 1.9931497959915624e-05, + "loss": 0.2019, + "step": 2008 + }, + { + "epoch": 0.03731578810573178, + "grad_norm": 0.3476172089576721, + "learning_rate": 1.993136159010812e-05, + "loss": 0.4736, + "step": 2010 + }, + { + "epoch": 0.037352918243150415, + "grad_norm": 0.3119805157184601, + "learning_rate": 1.9931225085164533e-05, + "loss": 0.3696, + "step": 2012 + }, + { + "epoch": 0.03739004838056906, + "grad_norm": 0.37937813997268677, + "learning_rate": 1.993108844508672e-05, + "loss": 0.2255, + "step": 2014 + }, + { + "epoch": 0.03742717851798769, + "grad_norm": 0.38822057843208313, + "learning_rate": 1.9930951669876537e-05, + "loss": 0.3128, + "step": 2016 + }, + { + "epoch": 0.037464308655406335, + "grad_norm": 0.31536105275154114, + "learning_rate": 1.993081475953585e-05, + "loss": 0.3901, + "step": 2018 + }, + { + "epoch": 0.03750143879282497, + "grad_norm": 0.38921961188316345, + "learning_rate": 1.993067771406652e-05, + "loss": 0.2179, + "step": 2020 + }, + { + "epoch": 0.03753856893024361, + "grad_norm": 0.46892279386520386, + "learning_rate": 1.9930540533470415e-05, + "loss": 0.3632, + "step": 2022 + }, + { + "epoch": 0.03757569906766225, + "grad_norm": 0.3340199887752533, + "learning_rate": 1.9930403217749395e-05, + "loss": 0.4507, + "step": 2024 + }, + { + "epoch": 0.03761282920508089, + "grad_norm": 0.31743505597114563, + "learning_rate": 1.9930265766905337e-05, + "loss": 0.3638, + "step": 2026 + }, + { + "epoch": 0.037649959342499525, + "grad_norm": 0.4792838990688324, + "learning_rate": 1.9930128180940103e-05, + "loss": 0.5273, + "step": 2028 + }, + { + "epoch": 0.03768708947991817, + "grad_norm": 0.29156723618507385, + "learning_rate": 1.992999045985557e-05, + "loss": 0.3429, + "step": 2030 + }, + { + "epoch": 0.0377242196173368, + "grad_norm": 0.33172696828842163, + "learning_rate": 1.9929852603653608e-05, + "loss": 0.4972, + "step": 2032 + }, + { + "epoch": 0.037761349754755445, + "grad_norm": 0.23541422188282013, + "learning_rate": 1.9929714612336094e-05, + "loss": 0.1105, + "step": 2034 + }, + { + "epoch": 0.03779847989217408, + "grad_norm": 0.3401322662830353, + "learning_rate": 1.9929576485904913e-05, + "loss": 0.5482, + "step": 2036 + }, + { + "epoch": 0.03783561002959272, + "grad_norm": 0.267124205827713, + "learning_rate": 1.9929438224361935e-05, + "loss": 0.1828, + "step": 2038 + }, + { + "epoch": 0.03787274016701136, + "grad_norm": 0.4598195552825928, + "learning_rate": 1.9929299827709046e-05, + "loss": 0.2551, + "step": 2040 + }, + { + "epoch": 0.03790987030442999, + "grad_norm": 0.34230053424835205, + "learning_rate": 1.9929161295948127e-05, + "loss": 0.3868, + "step": 2042 + }, + { + "epoch": 0.037947000441848636, + "grad_norm": 0.32452279329299927, + "learning_rate": 1.9929022629081065e-05, + "loss": 0.3717, + "step": 2044 + }, + { + "epoch": 0.03798413057926727, + "grad_norm": 0.28409162163734436, + "learning_rate": 1.9928883827109745e-05, + "loss": 0.3212, + "step": 2046 + }, + { + "epoch": 0.03802126071668591, + "grad_norm": 0.3782145380973816, + "learning_rate": 1.992874489003606e-05, + "loss": 0.352, + "step": 2048 + }, + { + "epoch": 0.03805839085410455, + "grad_norm": 0.3633666932582855, + "learning_rate": 1.9928605817861893e-05, + "loss": 0.3211, + "step": 2050 + }, + { + "epoch": 0.03809552099152319, + "grad_norm": 0.29387167096138, + "learning_rate": 1.9928466610589142e-05, + "loss": 0.37, + "step": 2052 + }, + { + "epoch": 0.038132651128941826, + "grad_norm": 0.39630770683288574, + "learning_rate": 1.99283272682197e-05, + "loss": 0.3872, + "step": 2054 + }, + { + "epoch": 0.03816978126636047, + "grad_norm": 0.3808368444442749, + "learning_rate": 1.992818779075546e-05, + "loss": 0.396, + "step": 2056 + }, + { + "epoch": 0.038206911403779104, + "grad_norm": 0.2896334230899811, + "learning_rate": 1.9928048178198325e-05, + "loss": 0.2733, + "step": 2058 + }, + { + "epoch": 0.038244041541197746, + "grad_norm": 0.3005021810531616, + "learning_rate": 1.9927908430550192e-05, + "loss": 0.3716, + "step": 2060 + }, + { + "epoch": 0.03828117167861638, + "grad_norm": 0.4141136407852173, + "learning_rate": 1.9927768547812963e-05, + "loss": 0.3235, + "step": 2062 + }, + { + "epoch": 0.03831830181603502, + "grad_norm": 0.3960406184196472, + "learning_rate": 1.9927628529988537e-05, + "loss": 0.4165, + "step": 2064 + }, + { + "epoch": 0.03835543195345366, + "grad_norm": 0.4837163984775543, + "learning_rate": 1.992748837707883e-05, + "loss": 0.4182, + "step": 2066 + }, + { + "epoch": 0.0383925620908723, + "grad_norm": 0.312007337808609, + "learning_rate": 1.9927348089085738e-05, + "loss": 0.2246, + "step": 2068 + }, + { + "epoch": 0.038429692228290936, + "grad_norm": 0.3793455958366394, + "learning_rate": 1.992720766601118e-05, + "loss": 0.5252, + "step": 2070 + }, + { + "epoch": 0.03846682236570958, + "grad_norm": 0.30731791257858276, + "learning_rate": 1.9927067107857053e-05, + "loss": 0.25, + "step": 2072 + }, + { + "epoch": 0.038503952503128214, + "grad_norm": 0.3175412118434906, + "learning_rate": 1.9926926414625282e-05, + "loss": 0.5231, + "step": 2074 + }, + { + "epoch": 0.038541082640546856, + "grad_norm": 0.3225423991680145, + "learning_rate": 1.9926785586317778e-05, + "loss": 0.2602, + "step": 2076 + }, + { + "epoch": 0.03857821277796549, + "grad_norm": 0.4689269959926605, + "learning_rate": 1.9926644622936453e-05, + "loss": 0.5089, + "step": 2078 + }, + { + "epoch": 0.03861534291538413, + "grad_norm": 0.3424060046672821, + "learning_rate": 1.9926503524483233e-05, + "loss": 0.5296, + "step": 2080 + }, + { + "epoch": 0.03865247305280277, + "grad_norm": 0.3351812958717346, + "learning_rate": 1.9926362290960028e-05, + "loss": 0.3798, + "step": 2082 + }, + { + "epoch": 0.038689603190221404, + "grad_norm": 0.4459707736968994, + "learning_rate": 1.992622092236877e-05, + "loss": 0.3672, + "step": 2084 + }, + { + "epoch": 0.038726733327640046, + "grad_norm": 0.26623523235321045, + "learning_rate": 1.9926079418711376e-05, + "loss": 0.4286, + "step": 2086 + }, + { + "epoch": 0.03876386346505868, + "grad_norm": 0.3457959294319153, + "learning_rate": 1.992593777998977e-05, + "loss": 0.4096, + "step": 2088 + }, + { + "epoch": 0.038800993602477324, + "grad_norm": 0.3527875244617462, + "learning_rate": 1.9925796006205886e-05, + "loss": 0.2598, + "step": 2090 + }, + { + "epoch": 0.03883812373989596, + "grad_norm": 0.30972811579704285, + "learning_rate": 1.9925654097361652e-05, + "loss": 0.5152, + "step": 2092 + }, + { + "epoch": 0.0388752538773146, + "grad_norm": 0.32219821214675903, + "learning_rate": 1.9925512053458994e-05, + "loss": 0.4621, + "step": 2094 + }, + { + "epoch": 0.03891238401473324, + "grad_norm": 0.2656967043876648, + "learning_rate": 1.9925369874499843e-05, + "loss": 0.3362, + "step": 2096 + }, + { + "epoch": 0.03894951415215188, + "grad_norm": 0.24639476835727692, + "learning_rate": 1.9925227560486144e-05, + "loss": 0.4291, + "step": 2098 + }, + { + "epoch": 0.038986644289570514, + "grad_norm": 0.35264506936073303, + "learning_rate": 1.9925085111419824e-05, + "loss": 0.2616, + "step": 2100 + }, + { + "epoch": 0.039023774426989156, + "grad_norm": 0.3753797709941864, + "learning_rate": 1.9924942527302826e-05, + "loss": 0.3108, + "step": 2102 + }, + { + "epoch": 0.03906090456440779, + "grad_norm": 0.4267929196357727, + "learning_rate": 1.992479980813709e-05, + "loss": 0.3397, + "step": 2104 + }, + { + "epoch": 0.039098034701826434, + "grad_norm": 0.3270409107208252, + "learning_rate": 1.9924656953924553e-05, + "loss": 0.2825, + "step": 2106 + }, + { + "epoch": 0.03913516483924507, + "grad_norm": 0.37892788648605347, + "learning_rate": 1.9924513964667166e-05, + "loss": 0.4369, + "step": 2108 + }, + { + "epoch": 0.03917229497666371, + "grad_norm": 0.2995428144931793, + "learning_rate": 1.992437084036687e-05, + "loss": 0.481, + "step": 2110 + }, + { + "epoch": 0.03920942511408235, + "grad_norm": 0.33561640977859497, + "learning_rate": 1.9924227581025613e-05, + "loss": 0.3415, + "step": 2112 + }, + { + "epoch": 0.03924655525150099, + "grad_norm": 0.46927258372306824, + "learning_rate": 1.9924084186645347e-05, + "loss": 0.3322, + "step": 2114 + }, + { + "epoch": 0.039283685388919624, + "grad_norm": 0.5767914652824402, + "learning_rate": 1.992394065722802e-05, + "loss": 0.4341, + "step": 2116 + }, + { + "epoch": 0.03932081552633826, + "grad_norm": 0.3032285273075104, + "learning_rate": 1.9923796992775587e-05, + "loss": 0.3981, + "step": 2118 + }, + { + "epoch": 0.0393579456637569, + "grad_norm": 0.28246644139289856, + "learning_rate": 1.9923653193290003e-05, + "loss": 0.3932, + "step": 2120 + }, + { + "epoch": 0.03939507580117554, + "grad_norm": 0.42464685440063477, + "learning_rate": 1.9923509258773222e-05, + "loss": 0.5505, + "step": 2122 + }, + { + "epoch": 0.03943220593859418, + "grad_norm": 0.3257697522640228, + "learning_rate": 1.9923365189227205e-05, + "loss": 0.3472, + "step": 2124 + }, + { + "epoch": 0.039469336076012815, + "grad_norm": 0.2895216643810272, + "learning_rate": 1.992322098465391e-05, + "loss": 0.1977, + "step": 2126 + }, + { + "epoch": 0.03950646621343146, + "grad_norm": 0.3868873119354248, + "learning_rate": 1.9923076645055305e-05, + "loss": 0.3542, + "step": 2128 + }, + { + "epoch": 0.03954359635085009, + "grad_norm": 0.2838451564311981, + "learning_rate": 1.9922932170433345e-05, + "loss": 0.3035, + "step": 2130 + }, + { + "epoch": 0.039580726488268735, + "grad_norm": 0.3076413571834564, + "learning_rate": 1.9922787560790007e-05, + "loss": 0.4281, + "step": 2132 + }, + { + "epoch": 0.03961785662568737, + "grad_norm": 0.40551063418388367, + "learning_rate": 1.992264281612725e-05, + "loss": 0.2346, + "step": 2134 + }, + { + "epoch": 0.03965498676310601, + "grad_norm": 0.4483836591243744, + "learning_rate": 1.9922497936447042e-05, + "loss": 0.3487, + "step": 2136 + }, + { + "epoch": 0.03969211690052465, + "grad_norm": 0.33309370279312134, + "learning_rate": 1.9922352921751363e-05, + "loss": 0.2908, + "step": 2138 + }, + { + "epoch": 0.03972924703794329, + "grad_norm": 0.32426717877388, + "learning_rate": 1.992220777204218e-05, + "loss": 0.549, + "step": 2140 + }, + { + "epoch": 0.039766377175361925, + "grad_norm": 0.348467081785202, + "learning_rate": 1.992206248732147e-05, + "loss": 0.2715, + "step": 2142 + }, + { + "epoch": 0.03980350731278057, + "grad_norm": 0.38909488916397095, + "learning_rate": 1.9921917067591206e-05, + "loss": 0.335, + "step": 2144 + }, + { + "epoch": 0.0398406374501992, + "grad_norm": 0.6617769002914429, + "learning_rate": 1.9921771512853378e-05, + "loss": 0.326, + "step": 2146 + }, + { + "epoch": 0.039877767587617845, + "grad_norm": 0.3197513818740845, + "learning_rate": 1.9921625823109953e-05, + "loss": 0.2736, + "step": 2148 + }, + { + "epoch": 0.03991489772503648, + "grad_norm": 0.2721817195415497, + "learning_rate": 1.9921479998362917e-05, + "loss": 0.4214, + "step": 2150 + }, + { + "epoch": 0.03995202786245512, + "grad_norm": 0.4922572672367096, + "learning_rate": 1.9921334038614262e-05, + "loss": 0.4888, + "step": 2152 + }, + { + "epoch": 0.03998915799987376, + "grad_norm": 0.3573835790157318, + "learning_rate": 1.9921187943865967e-05, + "loss": 0.4041, + "step": 2154 + }, + { + "epoch": 0.04002628813729239, + "grad_norm": 0.35106274485588074, + "learning_rate": 1.992104171412002e-05, + "loss": 0.3319, + "step": 2156 + }, + { + "epoch": 0.040063418274711035, + "grad_norm": 0.3410296142101288, + "learning_rate": 1.9920895349378415e-05, + "loss": 0.2539, + "step": 2158 + }, + { + "epoch": 0.04010054841212967, + "grad_norm": 0.3853680491447449, + "learning_rate": 1.992074884964314e-05, + "loss": 0.275, + "step": 2160 + }, + { + "epoch": 0.04013767854954831, + "grad_norm": 0.519813597202301, + "learning_rate": 1.9920602214916188e-05, + "loss": 0.4718, + "step": 2162 + }, + { + "epoch": 0.04017480868696695, + "grad_norm": 0.383322536945343, + "learning_rate": 1.9920455445199555e-05, + "loss": 0.2536, + "step": 2164 + }, + { + "epoch": 0.04021193882438559, + "grad_norm": 0.3393462598323822, + "learning_rate": 1.992030854049524e-05, + "loss": 0.3868, + "step": 2166 + }, + { + "epoch": 0.040249068961804225, + "grad_norm": 0.35493627190589905, + "learning_rate": 1.992016150080524e-05, + "loss": 0.3806, + "step": 2168 + }, + { + "epoch": 0.04028619909922287, + "grad_norm": 0.336121529340744, + "learning_rate": 1.9920014326131555e-05, + "loss": 0.4996, + "step": 2170 + }, + { + "epoch": 0.0403233292366415, + "grad_norm": 0.41170138120651245, + "learning_rate": 1.9919867016476192e-05, + "loss": 0.2911, + "step": 2172 + }, + { + "epoch": 0.040360459374060145, + "grad_norm": 0.497999370098114, + "learning_rate": 1.9919719571841152e-05, + "loss": 0.5393, + "step": 2174 + }, + { + "epoch": 0.04039758951147878, + "grad_norm": 0.33208751678466797, + "learning_rate": 1.991957199222844e-05, + "loss": 0.4649, + "step": 2176 + }, + { + "epoch": 0.04043471964889742, + "grad_norm": 0.3563458025455475, + "learning_rate": 1.9919424277640066e-05, + "loss": 0.5845, + "step": 2178 + }, + { + "epoch": 0.04047184978631606, + "grad_norm": 0.6497002243995667, + "learning_rate": 1.991927642807804e-05, + "loss": 0.477, + "step": 2180 + }, + { + "epoch": 0.0405089799237347, + "grad_norm": 0.2925940752029419, + "learning_rate": 1.991912844354437e-05, + "loss": 0.5139, + "step": 2182 + }, + { + "epoch": 0.040546110061153336, + "grad_norm": 0.33827826380729675, + "learning_rate": 1.991898032404108e-05, + "loss": 0.2976, + "step": 2184 + }, + { + "epoch": 0.04058324019857198, + "grad_norm": 0.3400142788887024, + "learning_rate": 1.991883206957017e-05, + "loss": 0.3475, + "step": 2186 + }, + { + "epoch": 0.04062037033599061, + "grad_norm": 0.4645389914512634, + "learning_rate": 1.9918683680133673e-05, + "loss": 0.2604, + "step": 2188 + }, + { + "epoch": 0.040657500473409255, + "grad_norm": 0.46448948979377747, + "learning_rate": 1.99185351557336e-05, + "loss": 0.3129, + "step": 2190 + }, + { + "epoch": 0.04069463061082789, + "grad_norm": 0.3606633245944977, + "learning_rate": 1.991838649637197e-05, + "loss": 0.388, + "step": 2192 + }, + { + "epoch": 0.040731760748246526, + "grad_norm": 0.35943177342414856, + "learning_rate": 1.991823770205081e-05, + "loss": 0.4715, + "step": 2194 + }, + { + "epoch": 0.04076889088566517, + "grad_norm": 0.47583284974098206, + "learning_rate": 1.9918088772772145e-05, + "loss": 0.4741, + "step": 2196 + }, + { + "epoch": 0.040806021023083804, + "grad_norm": 0.3376258313655853, + "learning_rate": 1.9917939708538e-05, + "loss": 0.2445, + "step": 2198 + }, + { + "epoch": 0.040843151160502446, + "grad_norm": 0.3509151041507721, + "learning_rate": 1.9917790509350402e-05, + "loss": 0.24, + "step": 2200 + }, + { + "epoch": 0.04088028129792108, + "grad_norm": 0.370469868183136, + "learning_rate": 1.9917641175211383e-05, + "loss": 0.345, + "step": 2202 + }, + { + "epoch": 0.04091741143533972, + "grad_norm": 0.35861000418663025, + "learning_rate": 1.9917491706122974e-05, + "loss": 0.4812, + "step": 2204 + }, + { + "epoch": 0.04095454157275836, + "grad_norm": 0.4066939353942871, + "learning_rate": 1.9917342102087207e-05, + "loss": 0.3281, + "step": 2206 + }, + { + "epoch": 0.040991671710177, + "grad_norm": 0.49802619218826294, + "learning_rate": 1.9917192363106122e-05, + "loss": 0.4234, + "step": 2208 + }, + { + "epoch": 0.041028801847595636, + "grad_norm": 0.30297183990478516, + "learning_rate": 1.9917042489181755e-05, + "loss": 0.4447, + "step": 2210 + }, + { + "epoch": 0.04106593198501428, + "grad_norm": 0.375549852848053, + "learning_rate": 1.9916892480316142e-05, + "loss": 0.3326, + "step": 2212 + }, + { + "epoch": 0.041103062122432914, + "grad_norm": 0.460097074508667, + "learning_rate": 1.991674233651133e-05, + "loss": 0.2396, + "step": 2214 + }, + { + "epoch": 0.041140192259851556, + "grad_norm": 0.40191009640693665, + "learning_rate": 1.9916592057769358e-05, + "loss": 0.5136, + "step": 2216 + }, + { + "epoch": 0.04117732239727019, + "grad_norm": 0.36045041680336, + "learning_rate": 1.9916441644092273e-05, + "loss": 0.254, + "step": 2218 + }, + { + "epoch": 0.041214452534688834, + "grad_norm": 0.44894397258758545, + "learning_rate": 1.9916291095482117e-05, + "loss": 0.2642, + "step": 2220 + }, + { + "epoch": 0.04125158267210747, + "grad_norm": 0.2763565182685852, + "learning_rate": 1.9916140411940945e-05, + "loss": 0.3243, + "step": 2222 + }, + { + "epoch": 0.04128871280952611, + "grad_norm": 0.4381501376628876, + "learning_rate": 1.9915989593470802e-05, + "loss": 0.4769, + "step": 2224 + }, + { + "epoch": 0.041325842946944746, + "grad_norm": 0.36060407757759094, + "learning_rate": 1.9915838640073746e-05, + "loss": 0.1162, + "step": 2226 + }, + { + "epoch": 0.04136297308436339, + "grad_norm": 0.3952736258506775, + "learning_rate": 1.9915687551751825e-05, + "loss": 0.2006, + "step": 2228 + }, + { + "epoch": 0.041400103221782024, + "grad_norm": 0.29216158390045166, + "learning_rate": 1.9915536328507093e-05, + "loss": 0.4255, + "step": 2230 + }, + { + "epoch": 0.04143723335920066, + "grad_norm": 0.27519091963768005, + "learning_rate": 1.9915384970341617e-05, + "loss": 0.2669, + "step": 2232 + }, + { + "epoch": 0.0414743634966193, + "grad_norm": 0.27323049306869507, + "learning_rate": 1.991523347725745e-05, + "loss": 0.2183, + "step": 2234 + }, + { + "epoch": 0.04151149363403794, + "grad_norm": 0.454260915517807, + "learning_rate": 1.991508184925666e-05, + "loss": 0.1625, + "step": 2236 + }, + { + "epoch": 0.04154862377145658, + "grad_norm": 0.3792349398136139, + "learning_rate": 1.99149300863413e-05, + "loss": 0.456, + "step": 2238 + }, + { + "epoch": 0.041585753908875214, + "grad_norm": 0.4887813925743103, + "learning_rate": 1.991477818851344e-05, + "loss": 0.5328, + "step": 2240 + }, + { + "epoch": 0.041622884046293857, + "grad_norm": 0.3308490216732025, + "learning_rate": 1.991462615577515e-05, + "loss": 0.2673, + "step": 2242 + }, + { + "epoch": 0.04166001418371249, + "grad_norm": 0.298992782831192, + "learning_rate": 1.9914473988128494e-05, + "loss": 0.4752, + "step": 2244 + }, + { + "epoch": 0.041697144321131134, + "grad_norm": 0.2960985600948334, + "learning_rate": 1.9914321685575543e-05, + "loss": 0.2969, + "step": 2246 + }, + { + "epoch": 0.04173427445854977, + "grad_norm": 0.3824763596057892, + "learning_rate": 1.9914169248118375e-05, + "loss": 0.3026, + "step": 2248 + }, + { + "epoch": 0.04177140459596841, + "grad_norm": 0.3746114671230316, + "learning_rate": 1.9914016675759057e-05, + "loss": 0.3488, + "step": 2250 + }, + { + "epoch": 0.04180853473338705, + "grad_norm": 0.33106729388237, + "learning_rate": 1.991386396849967e-05, + "loss": 0.4569, + "step": 2252 + }, + { + "epoch": 0.04184566487080569, + "grad_norm": 0.38767120242118835, + "learning_rate": 1.9913711126342285e-05, + "loss": 0.356, + "step": 2254 + }, + { + "epoch": 0.041882795008224324, + "grad_norm": 0.44010499119758606, + "learning_rate": 1.9913558149288988e-05, + "loss": 0.4348, + "step": 2256 + }, + { + "epoch": 0.04191992514564297, + "grad_norm": 0.30764085054397583, + "learning_rate": 1.9913405037341863e-05, + "loss": 0.4257, + "step": 2258 + }, + { + "epoch": 0.0419570552830616, + "grad_norm": 0.34466466307640076, + "learning_rate": 1.9913251790502985e-05, + "loss": 0.4376, + "step": 2260 + }, + { + "epoch": 0.041994185420480244, + "grad_norm": 0.36967703700065613, + "learning_rate": 1.9913098408774447e-05, + "loss": 0.3009, + "step": 2262 + }, + { + "epoch": 0.04203131555789888, + "grad_norm": 0.357180118560791, + "learning_rate": 1.991294489215833e-05, + "loss": 0.4859, + "step": 2264 + }, + { + "epoch": 0.04206844569531752, + "grad_norm": 0.432104229927063, + "learning_rate": 1.9912791240656726e-05, + "loss": 0.4087, + "step": 2266 + }, + { + "epoch": 0.04210557583273616, + "grad_norm": 0.379543274641037, + "learning_rate": 1.991263745427173e-05, + "loss": 0.4416, + "step": 2268 + }, + { + "epoch": 0.04214270597015479, + "grad_norm": 0.3863603472709656, + "learning_rate": 1.9912483533005426e-05, + "loss": 0.4804, + "step": 2270 + }, + { + "epoch": 0.042179836107573435, + "grad_norm": 0.3551326394081116, + "learning_rate": 1.9912329476859913e-05, + "loss": 0.4418, + "step": 2272 + }, + { + "epoch": 0.04221696624499207, + "grad_norm": 0.3660787045955658, + "learning_rate": 1.9912175285837286e-05, + "loss": 0.4397, + "step": 2274 + }, + { + "epoch": 0.04225409638241071, + "grad_norm": 0.392020583152771, + "learning_rate": 1.9912020959939644e-05, + "loss": 0.2551, + "step": 2276 + }, + { + "epoch": 0.04229122651982935, + "grad_norm": 0.4010932445526123, + "learning_rate": 1.9911866499169088e-05, + "loss": 0.363, + "step": 2278 + }, + { + "epoch": 0.04232835665724799, + "grad_norm": 0.4786839187145233, + "learning_rate": 1.991171190352772e-05, + "loss": 0.5113, + "step": 2280 + }, + { + "epoch": 0.042365486794666625, + "grad_norm": 0.3111361563205719, + "learning_rate": 1.991155717301764e-05, + "loss": 0.373, + "step": 2282 + }, + { + "epoch": 0.04240261693208527, + "grad_norm": 0.33212417364120483, + "learning_rate": 1.9911402307640953e-05, + "loss": 0.3908, + "step": 2284 + }, + { + "epoch": 0.0424397470695039, + "grad_norm": 0.2925907075405121, + "learning_rate": 1.991124730739977e-05, + "loss": 0.5159, + "step": 2286 + }, + { + "epoch": 0.042476877206922545, + "grad_norm": 0.43963900208473206, + "learning_rate": 1.99110921722962e-05, + "loss": 0.3137, + "step": 2288 + }, + { + "epoch": 0.04251400734434118, + "grad_norm": 0.5904701352119446, + "learning_rate": 1.9910936902332355e-05, + "loss": 0.3314, + "step": 2290 + }, + { + "epoch": 0.04255113748175982, + "grad_norm": 0.26255282759666443, + "learning_rate": 1.9910781497510342e-05, + "loss": 0.3621, + "step": 2292 + }, + { + "epoch": 0.04258826761917846, + "grad_norm": 0.22866976261138916, + "learning_rate": 1.991062595783228e-05, + "loss": 0.4155, + "step": 2294 + }, + { + "epoch": 0.0426253977565971, + "grad_norm": 0.35680580139160156, + "learning_rate": 1.9910470283300283e-05, + "loss": 0.3972, + "step": 2296 + }, + { + "epoch": 0.042662527894015735, + "grad_norm": 0.35472118854522705, + "learning_rate": 1.9910314473916475e-05, + "loss": 0.4141, + "step": 2298 + }, + { + "epoch": 0.04269965803143438, + "grad_norm": 0.33780795335769653, + "learning_rate": 1.9910158529682966e-05, + "loss": 0.255, + "step": 2300 + }, + { + "epoch": 0.04273678816885301, + "grad_norm": 0.39178887009620667, + "learning_rate": 1.991000245060189e-05, + "loss": 0.4114, + "step": 2302 + }, + { + "epoch": 0.042773918306271655, + "grad_norm": 0.29477831721305847, + "learning_rate": 1.9909846236675357e-05, + "loss": 0.3544, + "step": 2304 + }, + { + "epoch": 0.04281104844369029, + "grad_norm": 0.36197277903556824, + "learning_rate": 1.9909689887905506e-05, + "loss": 0.3376, + "step": 2306 + }, + { + "epoch": 0.042848178581108926, + "grad_norm": 0.5479002594947815, + "learning_rate": 1.990953340429446e-05, + "loss": 0.3933, + "step": 2308 + }, + { + "epoch": 0.04288530871852757, + "grad_norm": 0.2806760370731354, + "learning_rate": 1.990937678584434e-05, + "loss": 0.4755, + "step": 2310 + }, + { + "epoch": 0.0429224388559462, + "grad_norm": 0.5285623669624329, + "learning_rate": 1.990922003255729e-05, + "loss": 0.4073, + "step": 2312 + }, + { + "epoch": 0.042959568993364845, + "grad_norm": 0.2632940411567688, + "learning_rate": 1.9909063144435432e-05, + "loss": 0.5134, + "step": 2314 + }, + { + "epoch": 0.04299669913078348, + "grad_norm": 0.26469793915748596, + "learning_rate": 1.9908906121480908e-05, + "loss": 0.3295, + "step": 2316 + }, + { + "epoch": 0.04303382926820212, + "grad_norm": 0.44179654121398926, + "learning_rate": 1.9908748963695855e-05, + "loss": 0.4447, + "step": 2318 + }, + { + "epoch": 0.04307095940562076, + "grad_norm": 0.4057632088661194, + "learning_rate": 1.9908591671082408e-05, + "loss": 0.1573, + "step": 2320 + }, + { + "epoch": 0.0431080895430394, + "grad_norm": 0.30882033705711365, + "learning_rate": 1.9908434243642705e-05, + "loss": 0.3017, + "step": 2322 + }, + { + "epoch": 0.043145219680458036, + "grad_norm": 0.3255697786808014, + "learning_rate": 1.990827668137889e-05, + "loss": 0.544, + "step": 2324 + }, + { + "epoch": 0.04318234981787668, + "grad_norm": 0.3034602105617523, + "learning_rate": 1.9908118984293114e-05, + "loss": 0.4561, + "step": 2326 + }, + { + "epoch": 0.04321947995529531, + "grad_norm": 0.31836676597595215, + "learning_rate": 1.9907961152387512e-05, + "loss": 0.4676, + "step": 2328 + }, + { + "epoch": 0.043256610092713955, + "grad_norm": 0.29657238721847534, + "learning_rate": 1.990780318566424e-05, + "loss": 0.5265, + "step": 2330 + }, + { + "epoch": 0.04329374023013259, + "grad_norm": 0.43714040517807007, + "learning_rate": 1.990764508412544e-05, + "loss": 0.37, + "step": 2332 + }, + { + "epoch": 0.04333087036755123, + "grad_norm": 0.436774879693985, + "learning_rate": 1.9907486847773268e-05, + "loss": 0.363, + "step": 2334 + }, + { + "epoch": 0.04336800050496987, + "grad_norm": 0.34877175092697144, + "learning_rate": 1.990732847660988e-05, + "loss": 0.296, + "step": 2336 + }, + { + "epoch": 0.04340513064238851, + "grad_norm": 0.34823158383369446, + "learning_rate": 1.9907169970637423e-05, + "loss": 0.5373, + "step": 2338 + }, + { + "epoch": 0.043442260779807146, + "grad_norm": 0.39081087708473206, + "learning_rate": 1.9907011329858063e-05, + "loss": 0.2426, + "step": 2340 + }, + { + "epoch": 0.04347939091722579, + "grad_norm": 0.3765751123428345, + "learning_rate": 1.990685255427395e-05, + "loss": 0.181, + "step": 2342 + }, + { + "epoch": 0.04351652105464442, + "grad_norm": 0.5395621657371521, + "learning_rate": 1.9906693643887248e-05, + "loss": 0.5283, + "step": 2344 + }, + { + "epoch": 0.04355365119206306, + "grad_norm": 0.44549351930618286, + "learning_rate": 1.990653459870012e-05, + "loss": 0.1863, + "step": 2346 + }, + { + "epoch": 0.0435907813294817, + "grad_norm": 0.3754114508628845, + "learning_rate": 1.9906375418714733e-05, + "loss": 0.4069, + "step": 2348 + }, + { + "epoch": 0.043627911466900336, + "grad_norm": 0.42772772908210754, + "learning_rate": 1.9906216103933246e-05, + "loss": 0.3083, + "step": 2350 + }, + { + "epoch": 0.04366504160431898, + "grad_norm": 0.344629168510437, + "learning_rate": 1.990605665435783e-05, + "loss": 0.3885, + "step": 2352 + }, + { + "epoch": 0.043702171741737614, + "grad_norm": 0.24962933361530304, + "learning_rate": 1.9905897069990655e-05, + "loss": 0.2465, + "step": 2354 + }, + { + "epoch": 0.043739301879156256, + "grad_norm": 0.3382076621055603, + "learning_rate": 1.9905737350833894e-05, + "loss": 0.4076, + "step": 2356 + }, + { + "epoch": 0.04377643201657489, + "grad_norm": 0.32117506861686707, + "learning_rate": 1.990557749688972e-05, + "loss": 0.2161, + "step": 2358 + }, + { + "epoch": 0.043813562153993534, + "grad_norm": 0.3750784993171692, + "learning_rate": 1.9905417508160304e-05, + "loss": 0.3565, + "step": 2360 + }, + { + "epoch": 0.04385069229141217, + "grad_norm": 0.20360666513442993, + "learning_rate": 1.9905257384647825e-05, + "loss": 0.2999, + "step": 2362 + }, + { + "epoch": 0.04388782242883081, + "grad_norm": 0.24929066002368927, + "learning_rate": 1.9905097126354468e-05, + "loss": 0.3624, + "step": 2364 + }, + { + "epoch": 0.043924952566249446, + "grad_norm": 0.24092373251914978, + "learning_rate": 1.9904936733282404e-05, + "loss": 0.3969, + "step": 2366 + }, + { + "epoch": 0.04396208270366809, + "grad_norm": 0.3946561813354492, + "learning_rate": 1.990477620543382e-05, + "loss": 0.385, + "step": 2368 + }, + { + "epoch": 0.043999212841086724, + "grad_norm": 0.4159776568412781, + "learning_rate": 1.99046155428109e-05, + "loss": 0.2834, + "step": 2370 + }, + { + "epoch": 0.044036342978505366, + "grad_norm": 0.3017308712005615, + "learning_rate": 1.9904454745415834e-05, + "loss": 0.2876, + "step": 2372 + }, + { + "epoch": 0.044073473115924, + "grad_norm": 0.3111267387866974, + "learning_rate": 1.9904293813250803e-05, + "loss": 0.4372, + "step": 2374 + }, + { + "epoch": 0.044110603253342644, + "grad_norm": 0.348039448261261, + "learning_rate": 1.9904132746317998e-05, + "loss": 0.3744, + "step": 2376 + }, + { + "epoch": 0.04414773339076128, + "grad_norm": 0.28794625401496887, + "learning_rate": 1.9903971544619615e-05, + "loss": 0.3805, + "step": 2378 + }, + { + "epoch": 0.04418486352817992, + "grad_norm": 0.3251354992389679, + "learning_rate": 1.9903810208157847e-05, + "loss": 0.3764, + "step": 2380 + }, + { + "epoch": 0.04422199366559856, + "grad_norm": 0.34317511320114136, + "learning_rate": 1.9903648736934885e-05, + "loss": 0.3482, + "step": 2382 + }, + { + "epoch": 0.04425912380301719, + "grad_norm": 0.1962006539106369, + "learning_rate": 1.9903487130952928e-05, + "loss": 0.1744, + "step": 2384 + }, + { + "epoch": 0.044296253940435834, + "grad_norm": 0.36925792694091797, + "learning_rate": 1.990332539021418e-05, + "loss": 0.4086, + "step": 2386 + }, + { + "epoch": 0.04433338407785447, + "grad_norm": 0.3420776426792145, + "learning_rate": 1.9903163514720833e-05, + "loss": 0.4173, + "step": 2388 + }, + { + "epoch": 0.04437051421527311, + "grad_norm": 0.2605527341365814, + "learning_rate": 1.990300150447509e-05, + "loss": 0.3825, + "step": 2390 + }, + { + "epoch": 0.04440764435269175, + "grad_norm": 0.33080989122390747, + "learning_rate": 1.990283935947917e-05, + "loss": 0.4515, + "step": 2392 + }, + { + "epoch": 0.04444477449011039, + "grad_norm": 0.41104617714881897, + "learning_rate": 1.990267707973526e-05, + "loss": 0.5782, + "step": 2394 + }, + { + "epoch": 0.044481904627529024, + "grad_norm": 0.2557139992713928, + "learning_rate": 1.9902514665245582e-05, + "loss": 0.2922, + "step": 2396 + }, + { + "epoch": 0.04451903476494767, + "grad_norm": 0.3282010853290558, + "learning_rate": 1.990235211601234e-05, + "loss": 0.3979, + "step": 2398 + }, + { + "epoch": 0.0445561649023663, + "grad_norm": 0.30607494711875916, + "learning_rate": 1.9902189432037744e-05, + "loss": 0.1829, + "step": 2400 + }, + { + "epoch": 0.044593295039784944, + "grad_norm": 0.40720054507255554, + "learning_rate": 1.990202661332401e-05, + "loss": 0.3416, + "step": 2402 + }, + { + "epoch": 0.04463042517720358, + "grad_norm": 0.3787792921066284, + "learning_rate": 1.9901863659873356e-05, + "loss": 0.465, + "step": 2404 + }, + { + "epoch": 0.04466755531462222, + "grad_norm": 0.5011675953865051, + "learning_rate": 1.9901700571687997e-05, + "loss": 0.3458, + "step": 2406 + }, + { + "epoch": 0.04470468545204086, + "grad_norm": 0.33339226245880127, + "learning_rate": 1.9901537348770153e-05, + "loss": 0.3946, + "step": 2408 + }, + { + "epoch": 0.0447418155894595, + "grad_norm": 0.33788949251174927, + "learning_rate": 1.9901373991122042e-05, + "loss": 0.2818, + "step": 2410 + }, + { + "epoch": 0.044778945726878135, + "grad_norm": 0.37276116013526917, + "learning_rate": 1.990121049874589e-05, + "loss": 0.3801, + "step": 2412 + }, + { + "epoch": 0.04481607586429678, + "grad_norm": 0.27786198258399963, + "learning_rate": 1.9901046871643924e-05, + "loss": 0.3944, + "step": 2414 + }, + { + "epoch": 0.04485320600171541, + "grad_norm": 0.9589679837226868, + "learning_rate": 1.990088310981836e-05, + "loss": 0.3325, + "step": 2416 + }, + { + "epoch": 0.04489033613913405, + "grad_norm": 0.30228763818740845, + "learning_rate": 1.990071921327144e-05, + "loss": 0.3787, + "step": 2418 + }, + { + "epoch": 0.04492746627655269, + "grad_norm": 0.4011729955673218, + "learning_rate": 1.9900555182005385e-05, + "loss": 0.4329, + "step": 2420 + }, + { + "epoch": 0.044964596413971325, + "grad_norm": 0.2549164891242981, + "learning_rate": 1.990039101602243e-05, + "loss": 0.2352, + "step": 2422 + }, + { + "epoch": 0.04500172655138997, + "grad_norm": 0.3323017954826355, + "learning_rate": 1.9900226715324807e-05, + "loss": 0.505, + "step": 2424 + }, + { + "epoch": 0.0450388566888086, + "grad_norm": 0.3643483817577362, + "learning_rate": 1.9900062279914755e-05, + "loss": 0.3692, + "step": 2426 + }, + { + "epoch": 0.045075986826227245, + "grad_norm": 0.25162139534950256, + "learning_rate": 1.989989770979451e-05, + "loss": 0.2858, + "step": 2428 + }, + { + "epoch": 0.04511311696364588, + "grad_norm": 0.24252890050411224, + "learning_rate": 1.989973300496631e-05, + "loss": 0.2267, + "step": 2430 + }, + { + "epoch": 0.04515024710106452, + "grad_norm": 0.3379379212856293, + "learning_rate": 1.9899568165432393e-05, + "loss": 0.4047, + "step": 2432 + }, + { + "epoch": 0.04518737723848316, + "grad_norm": 0.44211095571517944, + "learning_rate": 1.989940319119501e-05, + "loss": 0.4375, + "step": 2434 + }, + { + "epoch": 0.0452245073759018, + "grad_norm": 0.2861168682575226, + "learning_rate": 1.98992380822564e-05, + "loss": 0.3605, + "step": 2436 + }, + { + "epoch": 0.045261637513320435, + "grad_norm": 0.3657349944114685, + "learning_rate": 1.9899072838618814e-05, + "loss": 0.4211, + "step": 2438 + }, + { + "epoch": 0.04529876765073908, + "grad_norm": 0.3238188922405243, + "learning_rate": 1.9898907460284493e-05, + "loss": 0.3364, + "step": 2440 + }, + { + "epoch": 0.04533589778815771, + "grad_norm": 0.30089622735977173, + "learning_rate": 1.9898741947255697e-05, + "loss": 0.3663, + "step": 2442 + }, + { + "epoch": 0.045373027925576355, + "grad_norm": 0.37084662914276123, + "learning_rate": 1.989857629953467e-05, + "loss": 0.4073, + "step": 2444 + }, + { + "epoch": 0.04541015806299499, + "grad_norm": 0.36076033115386963, + "learning_rate": 1.9898410517123673e-05, + "loss": 0.42, + "step": 2446 + }, + { + "epoch": 0.04544728820041363, + "grad_norm": 0.5651138424873352, + "learning_rate": 1.9898244600024956e-05, + "loss": 0.2326, + "step": 2448 + }, + { + "epoch": 0.04548441833783227, + "grad_norm": 0.4224671721458435, + "learning_rate": 1.989807854824078e-05, + "loss": 0.4197, + "step": 2450 + }, + { + "epoch": 0.04552154847525091, + "grad_norm": 0.391889750957489, + "learning_rate": 1.98979123617734e-05, + "loss": 0.2539, + "step": 2452 + }, + { + "epoch": 0.045558678612669545, + "grad_norm": 0.4197494387626648, + "learning_rate": 1.9897746040625083e-05, + "loss": 0.4475, + "step": 2454 + }, + { + "epoch": 0.04559580875008818, + "grad_norm": 0.2940739691257477, + "learning_rate": 1.9897579584798086e-05, + "loss": 0.3785, + "step": 2456 + }, + { + "epoch": 0.04563293888750682, + "grad_norm": 0.20279686152935028, + "learning_rate": 1.989741299429468e-05, + "loss": 0.1485, + "step": 2458 + }, + { + "epoch": 0.04567006902492546, + "grad_norm": 0.36077946424484253, + "learning_rate": 1.989724626911713e-05, + "loss": 0.3476, + "step": 2460 + }, + { + "epoch": 0.0457071991623441, + "grad_norm": 0.29293885827064514, + "learning_rate": 1.9897079409267705e-05, + "loss": 0.2808, + "step": 2462 + }, + { + "epoch": 0.045744329299762736, + "grad_norm": 0.4073811173439026, + "learning_rate": 1.989691241474867e-05, + "loss": 0.4207, + "step": 2464 + }, + { + "epoch": 0.04578145943718138, + "grad_norm": 0.34943458437919617, + "learning_rate": 1.9896745285562303e-05, + "loss": 0.3376, + "step": 2466 + }, + { + "epoch": 0.04581858957460001, + "grad_norm": 0.46569836139678955, + "learning_rate": 1.9896578021710883e-05, + "loss": 0.3053, + "step": 2468 + }, + { + "epoch": 0.045855719712018655, + "grad_norm": 0.39764001965522766, + "learning_rate": 1.9896410623196673e-05, + "loss": 0.2863, + "step": 2470 + }, + { + "epoch": 0.04589284984943729, + "grad_norm": 0.32603809237480164, + "learning_rate": 1.989624309002196e-05, + "loss": 0.3841, + "step": 2472 + }, + { + "epoch": 0.04592997998685593, + "grad_norm": 0.36866292357444763, + "learning_rate": 1.9896075422189023e-05, + "loss": 0.2175, + "step": 2474 + }, + { + "epoch": 0.04596711012427457, + "grad_norm": 0.3144623041152954, + "learning_rate": 1.9895907619700136e-05, + "loss": 0.5212, + "step": 2476 + }, + { + "epoch": 0.04600424026169321, + "grad_norm": 0.27605611085891724, + "learning_rate": 1.9895739682557594e-05, + "loss": 0.4272, + "step": 2478 + }, + { + "epoch": 0.046041370399111846, + "grad_norm": 0.427211195230484, + "learning_rate": 1.9895571610763675e-05, + "loss": 0.4338, + "step": 2480 + }, + { + "epoch": 0.04607850053653049, + "grad_norm": 0.7452526688575745, + "learning_rate": 1.9895403404320665e-05, + "loss": 0.5646, + "step": 2482 + }, + { + "epoch": 0.04611563067394912, + "grad_norm": 0.2777949869632721, + "learning_rate": 1.9895235063230855e-05, + "loss": 0.474, + "step": 2484 + }, + { + "epoch": 0.046152760811367766, + "grad_norm": 0.4482880234718323, + "learning_rate": 1.9895066587496535e-05, + "loss": 0.3579, + "step": 2486 + }, + { + "epoch": 0.0461898909487864, + "grad_norm": 0.384441077709198, + "learning_rate": 1.989489797712e-05, + "loss": 0.3721, + "step": 2488 + }, + { + "epoch": 0.04622702108620504, + "grad_norm": 0.3040623068809509, + "learning_rate": 1.9894729232103542e-05, + "loss": 0.2381, + "step": 2490 + }, + { + "epoch": 0.04626415122362368, + "grad_norm": 0.28086578845977783, + "learning_rate": 1.9894560352449455e-05, + "loss": 0.3732, + "step": 2492 + }, + { + "epoch": 0.046301281361042314, + "grad_norm": 0.38532325625419617, + "learning_rate": 1.989439133816004e-05, + "loss": 0.5281, + "step": 2494 + }, + { + "epoch": 0.046338411498460956, + "grad_norm": 0.47509413957595825, + "learning_rate": 1.98942221892376e-05, + "loss": 0.2426, + "step": 2496 + }, + { + "epoch": 0.04637554163587959, + "grad_norm": 0.37172621488571167, + "learning_rate": 1.9894052905684428e-05, + "loss": 0.4302, + "step": 2498 + }, + { + "epoch": 0.046412671773298234, + "grad_norm": 0.2374938279390335, + "learning_rate": 1.9893883487502833e-05, + "loss": 0.5086, + "step": 2500 + }, + { + "epoch": 0.04644980191071687, + "grad_norm": 0.2940489649772644, + "learning_rate": 1.989371393469512e-05, + "loss": 0.1727, + "step": 2502 + }, + { + "epoch": 0.04648693204813551, + "grad_norm": 0.3569496273994446, + "learning_rate": 1.9893544247263597e-05, + "loss": 0.2745, + "step": 2504 + }, + { + "epoch": 0.046524062185554146, + "grad_norm": 0.4116990864276886, + "learning_rate": 1.9893374425210567e-05, + "loss": 0.4684, + "step": 2506 + }, + { + "epoch": 0.04656119232297279, + "grad_norm": 0.41885608434677124, + "learning_rate": 1.9893204468538347e-05, + "loss": 0.6169, + "step": 2508 + }, + { + "epoch": 0.046598322460391424, + "grad_norm": 0.596121072769165, + "learning_rate": 1.989303437724925e-05, + "loss": 0.3405, + "step": 2510 + }, + { + "epoch": 0.046635452597810066, + "grad_norm": 0.36439183354377747, + "learning_rate": 1.989286415134559e-05, + "loss": 0.4145, + "step": 2512 + }, + { + "epoch": 0.0466725827352287, + "grad_norm": 0.2912504971027374, + "learning_rate": 1.9892693790829676e-05, + "loss": 0.4611, + "step": 2514 + }, + { + "epoch": 0.046709712872647344, + "grad_norm": 0.3208148181438446, + "learning_rate": 1.9892523295703833e-05, + "loss": 0.4708, + "step": 2516 + }, + { + "epoch": 0.04674684301006598, + "grad_norm": 0.3281371295452118, + "learning_rate": 1.989235266597038e-05, + "loss": 0.3544, + "step": 2518 + }, + { + "epoch": 0.04678397314748462, + "grad_norm": 0.4005473554134369, + "learning_rate": 1.9892181901631638e-05, + "loss": 0.379, + "step": 2520 + }, + { + "epoch": 0.04682110328490326, + "grad_norm": 0.3310092091560364, + "learning_rate": 1.989201100268993e-05, + "loss": 0.4445, + "step": 2522 + }, + { + "epoch": 0.0468582334223219, + "grad_norm": 0.6983801126480103, + "learning_rate": 1.9891839969147585e-05, + "loss": 0.3418, + "step": 2524 + }, + { + "epoch": 0.046895363559740534, + "grad_norm": 0.34990838170051575, + "learning_rate": 1.9891668801006926e-05, + "loss": 0.278, + "step": 2526 + }, + { + "epoch": 0.046932493697159176, + "grad_norm": 0.26808440685272217, + "learning_rate": 1.9891497498270285e-05, + "loss": 0.2691, + "step": 2528 + }, + { + "epoch": 0.04696962383457781, + "grad_norm": 0.3087778687477112, + "learning_rate": 1.9891326060939987e-05, + "loss": 0.4235, + "step": 2530 + }, + { + "epoch": 0.04700675397199645, + "grad_norm": 0.22401942312717438, + "learning_rate": 1.9891154489018376e-05, + "loss": 0.3899, + "step": 2532 + }, + { + "epoch": 0.04704388410941509, + "grad_norm": 0.4075601100921631, + "learning_rate": 1.9890982782507774e-05, + "loss": 0.2954, + "step": 2534 + }, + { + "epoch": 0.047081014246833724, + "grad_norm": 0.47288867831230164, + "learning_rate": 1.9890810941410524e-05, + "loss": 0.2121, + "step": 2536 + }, + { + "epoch": 0.04711814438425237, + "grad_norm": 0.3600998818874359, + "learning_rate": 1.9890638965728968e-05, + "loss": 0.1265, + "step": 2538 + }, + { + "epoch": 0.047155274521671, + "grad_norm": 0.29128292202949524, + "learning_rate": 1.9890466855465437e-05, + "loss": 0.5616, + "step": 2540 + }, + { + "epoch": 0.047192404659089644, + "grad_norm": 0.343300461769104, + "learning_rate": 1.989029461062228e-05, + "loss": 0.3112, + "step": 2542 + }, + { + "epoch": 0.04722953479650828, + "grad_norm": 0.38288718461990356, + "learning_rate": 1.9890122231201835e-05, + "loss": 0.3432, + "step": 2544 + }, + { + "epoch": 0.04726666493392692, + "grad_norm": 0.2736768126487732, + "learning_rate": 1.9889949717206457e-05, + "loss": 0.3483, + "step": 2546 + }, + { + "epoch": 0.04730379507134556, + "grad_norm": 0.5975361466407776, + "learning_rate": 1.9889777068638482e-05, + "loss": 0.5866, + "step": 2548 + }, + { + "epoch": 0.0473409252087642, + "grad_norm": 0.24607698619365692, + "learning_rate": 1.9889604285500266e-05, + "loss": 0.2626, + "step": 2550 + }, + { + "epoch": 0.047378055346182835, + "grad_norm": 0.429671049118042, + "learning_rate": 1.9889431367794158e-05, + "loss": 0.373, + "step": 2552 + }, + { + "epoch": 0.04741518548360148, + "grad_norm": 0.3735169768333435, + "learning_rate": 1.9889258315522512e-05, + "loss": 0.3311, + "step": 2554 + }, + { + "epoch": 0.04745231562102011, + "grad_norm": 0.3237277567386627, + "learning_rate": 1.988908512868768e-05, + "loss": 0.5051, + "step": 2556 + }, + { + "epoch": 0.047489445758438754, + "grad_norm": 0.3530696630477905, + "learning_rate": 1.988891180729202e-05, + "loss": 0.3225, + "step": 2558 + }, + { + "epoch": 0.04752657589585739, + "grad_norm": 0.35617583990097046, + "learning_rate": 1.9888738351337898e-05, + "loss": 0.2787, + "step": 2560 + }, + { + "epoch": 0.04756370603327603, + "grad_norm": 0.450501024723053, + "learning_rate": 1.988856476082766e-05, + "loss": 0.391, + "step": 2562 + }, + { + "epoch": 0.04760083617069467, + "grad_norm": 0.27211397886276245, + "learning_rate": 1.988839103576368e-05, + "loss": 0.3339, + "step": 2564 + }, + { + "epoch": 0.04763796630811331, + "grad_norm": 0.32609447836875916, + "learning_rate": 1.9888217176148315e-05, + "loss": 0.3408, + "step": 2566 + }, + { + "epoch": 0.047675096445531945, + "grad_norm": 0.4512081742286682, + "learning_rate": 1.9888043181983932e-05, + "loss": 0.2967, + "step": 2568 + }, + { + "epoch": 0.04771222658295058, + "grad_norm": 0.5087811946868896, + "learning_rate": 1.98878690532729e-05, + "loss": 0.5014, + "step": 2570 + }, + { + "epoch": 0.04774935672036922, + "grad_norm": 0.36853674054145813, + "learning_rate": 1.9887694790017587e-05, + "loss": 0.4413, + "step": 2572 + }, + { + "epoch": 0.04778648685778786, + "grad_norm": 0.3287915885448456, + "learning_rate": 1.988752039222037e-05, + "loss": 0.4166, + "step": 2574 + }, + { + "epoch": 0.0478236169952065, + "grad_norm": 0.30874282121658325, + "learning_rate": 1.988734585988361e-05, + "loss": 0.369, + "step": 2576 + }, + { + "epoch": 0.047860747132625135, + "grad_norm": 0.4606678783893585, + "learning_rate": 1.988717119300969e-05, + "loss": 0.3383, + "step": 2578 + }, + { + "epoch": 0.04789787727004378, + "grad_norm": 0.2779238522052765, + "learning_rate": 1.9886996391600987e-05, + "loss": 0.5059, + "step": 2580 + }, + { + "epoch": 0.04793500740746241, + "grad_norm": 0.30820155143737793, + "learning_rate": 1.988682145565988e-05, + "loss": 0.3367, + "step": 2582 + }, + { + "epoch": 0.047972137544881055, + "grad_norm": 0.6014087796211243, + "learning_rate": 1.9886646385188747e-05, + "loss": 0.4306, + "step": 2584 + }, + { + "epoch": 0.04800926768229969, + "grad_norm": 0.3724692761898041, + "learning_rate": 1.9886471180189967e-05, + "loss": 0.3303, + "step": 2586 + }, + { + "epoch": 0.04804639781971833, + "grad_norm": 0.5611693263053894, + "learning_rate": 1.9886295840665932e-05, + "loss": 0.4383, + "step": 2588 + }, + { + "epoch": 0.04808352795713697, + "grad_norm": 0.3220396339893341, + "learning_rate": 1.988612036661902e-05, + "loss": 0.3205, + "step": 2590 + }, + { + "epoch": 0.04812065809455561, + "grad_norm": 0.31981250643730164, + "learning_rate": 1.9885944758051624e-05, + "loss": 0.4138, + "step": 2592 + }, + { + "epoch": 0.048157788231974245, + "grad_norm": 0.3703214228153229, + "learning_rate": 1.988576901496613e-05, + "loss": 0.4337, + "step": 2594 + }, + { + "epoch": 0.04819491836939289, + "grad_norm": 0.5945144891738892, + "learning_rate": 1.9885593137364935e-05, + "loss": 0.3245, + "step": 2596 + }, + { + "epoch": 0.04823204850681152, + "grad_norm": 0.5193402767181396, + "learning_rate": 1.9885417125250427e-05, + "loss": 0.3577, + "step": 2598 + }, + { + "epoch": 0.048269178644230165, + "grad_norm": 0.3410298228263855, + "learning_rate": 1.9885240978625e-05, + "loss": 0.2847, + "step": 2600 + }, + { + "epoch": 0.0483063087816488, + "grad_norm": 0.3336454927921295, + "learning_rate": 1.9885064697491054e-05, + "loss": 0.3829, + "step": 2602 + }, + { + "epoch": 0.04834343891906744, + "grad_norm": 0.306267112493515, + "learning_rate": 1.9884888281850986e-05, + "loss": 0.4026, + "step": 2604 + }, + { + "epoch": 0.04838056905648608, + "grad_norm": 0.3643156588077545, + "learning_rate": 1.98847117317072e-05, + "loss": 0.3708, + "step": 2606 + }, + { + "epoch": 0.04841769919390471, + "grad_norm": 0.3960159122943878, + "learning_rate": 1.9884535047062094e-05, + "loss": 0.2696, + "step": 2608 + }, + { + "epoch": 0.048454829331323356, + "grad_norm": 0.29586654901504517, + "learning_rate": 1.988435822791807e-05, + "loss": 0.519, + "step": 2610 + }, + { + "epoch": 0.04849195946874199, + "grad_norm": 0.44784021377563477, + "learning_rate": 1.9884181274277542e-05, + "loss": 0.5057, + "step": 2612 + }, + { + "epoch": 0.04852908960616063, + "grad_norm": 0.31597235798835754, + "learning_rate": 1.9884004186142913e-05, + "loss": 0.2511, + "step": 2614 + }, + { + "epoch": 0.04856621974357927, + "grad_norm": 0.23702697455883026, + "learning_rate": 1.988382696351659e-05, + "loss": 0.4544, + "step": 2616 + }, + { + "epoch": 0.04860334988099791, + "grad_norm": 0.4404929280281067, + "learning_rate": 1.9883649606400988e-05, + "loss": 0.2467, + "step": 2618 + }, + { + "epoch": 0.048640480018416546, + "grad_norm": 0.3470233082771301, + "learning_rate": 1.9883472114798525e-05, + "loss": 0.5055, + "step": 2620 + }, + { + "epoch": 0.04867761015583519, + "grad_norm": 0.39667776226997375, + "learning_rate": 1.9883294488711607e-05, + "loss": 0.2259, + "step": 2622 + }, + { + "epoch": 0.04871474029325382, + "grad_norm": 0.4350927770137787, + "learning_rate": 1.9883116728142654e-05, + "loss": 0.3607, + "step": 2624 + }, + { + "epoch": 0.048751870430672466, + "grad_norm": 0.22223787009716034, + "learning_rate": 1.9882938833094085e-05, + "loss": 0.3478, + "step": 2626 + }, + { + "epoch": 0.0487890005680911, + "grad_norm": 0.5158822536468506, + "learning_rate": 1.9882760803568325e-05, + "loss": 0.4291, + "step": 2628 + }, + { + "epoch": 0.04882613070550974, + "grad_norm": 0.4217866063117981, + "learning_rate": 1.9882582639567788e-05, + "loss": 0.4478, + "step": 2630 + }, + { + "epoch": 0.04886326084292838, + "grad_norm": 0.2986488938331604, + "learning_rate": 1.988240434109491e-05, + "loss": 0.3516, + "step": 2632 + }, + { + "epoch": 0.04890039098034702, + "grad_norm": 0.272651731967926, + "learning_rate": 1.9882225908152103e-05, + "loss": 0.3126, + "step": 2634 + }, + { + "epoch": 0.048937521117765656, + "grad_norm": 0.4954606890678406, + "learning_rate": 1.9882047340741807e-05, + "loss": 0.2777, + "step": 2636 + }, + { + "epoch": 0.0489746512551843, + "grad_norm": 0.4016689956188202, + "learning_rate": 1.9881868638866445e-05, + "loss": 0.3718, + "step": 2638 + }, + { + "epoch": 0.049011781392602934, + "grad_norm": 0.30067822337150574, + "learning_rate": 1.988168980252845e-05, + "loss": 0.4482, + "step": 2640 + }, + { + "epoch": 0.049048911530021576, + "grad_norm": 0.5616180896759033, + "learning_rate": 1.9881510831730257e-05, + "loss": 0.3068, + "step": 2642 + }, + { + "epoch": 0.04908604166744021, + "grad_norm": 0.3896440863609314, + "learning_rate": 1.9881331726474298e-05, + "loss": 0.4718, + "step": 2644 + }, + { + "epoch": 0.049123171804858846, + "grad_norm": 0.30395743250846863, + "learning_rate": 1.9881152486763015e-05, + "loss": 0.2977, + "step": 2646 + }, + { + "epoch": 0.04916030194227749, + "grad_norm": 0.29402920603752136, + "learning_rate": 1.9880973112598842e-05, + "loss": 0.3432, + "step": 2648 + }, + { + "epoch": 0.049197432079696124, + "grad_norm": 0.38253194093704224, + "learning_rate": 1.9880793603984224e-05, + "loss": 0.36, + "step": 2650 + }, + { + "epoch": 0.049234562217114766, + "grad_norm": 0.3870534896850586, + "learning_rate": 1.98806139609216e-05, + "loss": 0.4471, + "step": 2652 + }, + { + "epoch": 0.0492716923545334, + "grad_norm": 0.46986493468284607, + "learning_rate": 1.9880434183413414e-05, + "loss": 0.3136, + "step": 2654 + }, + { + "epoch": 0.049308822491952044, + "grad_norm": 0.33612990379333496, + "learning_rate": 1.9880254271462117e-05, + "loss": 0.2595, + "step": 2656 + }, + { + "epoch": 0.04934595262937068, + "grad_norm": 0.32249048352241516, + "learning_rate": 1.9880074225070154e-05, + "loss": 0.3207, + "step": 2658 + }, + { + "epoch": 0.04938308276678932, + "grad_norm": 0.24167020618915558, + "learning_rate": 1.9879894044239975e-05, + "loss": 0.2321, + "step": 2660 + }, + { + "epoch": 0.04942021290420796, + "grad_norm": 0.24824786186218262, + "learning_rate": 1.9879713728974028e-05, + "loss": 0.3656, + "step": 2662 + }, + { + "epoch": 0.0494573430416266, + "grad_norm": 0.3819003701210022, + "learning_rate": 1.9879533279274774e-05, + "loss": 0.4395, + "step": 2664 + }, + { + "epoch": 0.049494473179045234, + "grad_norm": 0.29306739568710327, + "learning_rate": 1.9879352695144666e-05, + "loss": 0.5028, + "step": 2666 + }, + { + "epoch": 0.049531603316463876, + "grad_norm": 0.40407779812812805, + "learning_rate": 1.9879171976586157e-05, + "loss": 0.3873, + "step": 2668 + }, + { + "epoch": 0.04956873345388251, + "grad_norm": 0.3330526351928711, + "learning_rate": 1.987899112360171e-05, + "loss": 0.3294, + "step": 2670 + }, + { + "epoch": 0.049605863591301154, + "grad_norm": 0.34792229533195496, + "learning_rate": 1.9878810136193785e-05, + "loss": 0.3261, + "step": 2672 + }, + { + "epoch": 0.04964299372871979, + "grad_norm": 0.458290159702301, + "learning_rate": 1.9878629014364844e-05, + "loss": 0.4611, + "step": 2674 + }, + { + "epoch": 0.04968012386613843, + "grad_norm": 0.31894224882125854, + "learning_rate": 1.9878447758117352e-05, + "loss": 0.4871, + "step": 2676 + }, + { + "epoch": 0.04971725400355707, + "grad_norm": 0.3224238455295563, + "learning_rate": 1.9878266367453775e-05, + "loss": 0.38, + "step": 2678 + }, + { + "epoch": 0.04975438414097571, + "grad_norm": 0.3004835844039917, + "learning_rate": 1.9878084842376585e-05, + "loss": 0.2483, + "step": 2680 + }, + { + "epoch": 0.049791514278394344, + "grad_norm": 0.4765733778476715, + "learning_rate": 1.9877903182888243e-05, + "loss": 0.409, + "step": 2682 + }, + { + "epoch": 0.04982864441581298, + "grad_norm": 0.37976738810539246, + "learning_rate": 1.987772138899123e-05, + "loss": 0.3213, + "step": 2684 + }, + { + "epoch": 0.04986577455323162, + "grad_norm": 0.4271876811981201, + "learning_rate": 1.9877539460688017e-05, + "loss": 0.3077, + "step": 2686 + }, + { + "epoch": 0.04990290469065026, + "grad_norm": 0.5363243818283081, + "learning_rate": 1.9877357397981076e-05, + "loss": 0.5745, + "step": 2688 + }, + { + "epoch": 0.0499400348280689, + "grad_norm": 0.40662485361099243, + "learning_rate": 1.9877175200872886e-05, + "loss": 0.2489, + "step": 2690 + }, + { + "epoch": 0.049977164965487535, + "grad_norm": 0.6903659105300903, + "learning_rate": 1.987699286936593e-05, + "loss": 0.3617, + "step": 2692 + }, + { + "epoch": 0.05001429510290618, + "grad_norm": 0.396749883890152, + "learning_rate": 1.9876810403462684e-05, + "loss": 0.2711, + "step": 2694 + }, + { + "epoch": 0.05005142524032481, + "grad_norm": 0.3562004268169403, + "learning_rate": 1.9876627803165632e-05, + "loss": 0.5901, + "step": 2696 + }, + { + "epoch": 0.050088555377743454, + "grad_norm": 0.29518672823905945, + "learning_rate": 1.9876445068477265e-05, + "loss": 0.4468, + "step": 2698 + }, + { + "epoch": 0.05012568551516209, + "grad_norm": 0.3475431203842163, + "learning_rate": 1.987626219940006e-05, + "loss": 0.3713, + "step": 2700 + }, + { + "epoch": 0.05016281565258073, + "grad_norm": 0.35791468620300293, + "learning_rate": 1.987607919593651e-05, + "loss": 0.5917, + "step": 2702 + }, + { + "epoch": 0.05019994578999937, + "grad_norm": 0.3438893258571625, + "learning_rate": 1.9875896058089102e-05, + "loss": 0.3733, + "step": 2704 + }, + { + "epoch": 0.05023707592741801, + "grad_norm": 0.3380543887615204, + "learning_rate": 1.9875712785860336e-05, + "loss": 0.4842, + "step": 2706 + }, + { + "epoch": 0.050274206064836645, + "grad_norm": 0.33952251076698303, + "learning_rate": 1.9875529379252696e-05, + "loss": 0.3311, + "step": 2708 + }, + { + "epoch": 0.05031133620225529, + "grad_norm": 0.31405842304229736, + "learning_rate": 1.987534583826868e-05, + "loss": 0.3296, + "step": 2710 + }, + { + "epoch": 0.05034846633967392, + "grad_norm": 0.30055299401283264, + "learning_rate": 1.987516216291079e-05, + "loss": 0.2863, + "step": 2712 + }, + { + "epoch": 0.050385596477092565, + "grad_norm": 0.5926289558410645, + "learning_rate": 1.9874978353181526e-05, + "loss": 0.566, + "step": 2714 + }, + { + "epoch": 0.0504227266145112, + "grad_norm": 0.33233633637428284, + "learning_rate": 1.987479440908338e-05, + "loss": 0.1768, + "step": 2716 + }, + { + "epoch": 0.05045985675192984, + "grad_norm": 0.2553347945213318, + "learning_rate": 1.987461033061886e-05, + "loss": 0.4093, + "step": 2718 + }, + { + "epoch": 0.05049698688934848, + "grad_norm": 0.29046761989593506, + "learning_rate": 1.9874426117790474e-05, + "loss": 0.3383, + "step": 2720 + }, + { + "epoch": 0.05053411702676711, + "grad_norm": 0.33573877811431885, + "learning_rate": 1.9874241770600725e-05, + "loss": 0.5166, + "step": 2722 + }, + { + "epoch": 0.050571247164185755, + "grad_norm": 0.3779090344905853, + "learning_rate": 1.9874057289052123e-05, + "loss": 0.3437, + "step": 2724 + }, + { + "epoch": 0.05060837730160439, + "grad_norm": 0.509864091873169, + "learning_rate": 1.987387267314718e-05, + "loss": 0.3331, + "step": 2726 + }, + { + "epoch": 0.05064550743902303, + "grad_norm": 0.30726784467697144, + "learning_rate": 1.98736879228884e-05, + "loss": 0.2722, + "step": 2728 + }, + { + "epoch": 0.05068263757644167, + "grad_norm": 0.31946155428886414, + "learning_rate": 1.9873503038278307e-05, + "loss": 0.4447, + "step": 2730 + }, + { + "epoch": 0.05071976771386031, + "grad_norm": 0.5914859771728516, + "learning_rate": 1.987331801931941e-05, + "loss": 0.4449, + "step": 2732 + }, + { + "epoch": 0.050756897851278945, + "grad_norm": 0.3914320766925812, + "learning_rate": 1.987313286601423e-05, + "loss": 0.1657, + "step": 2734 + }, + { + "epoch": 0.05079402798869759, + "grad_norm": 0.44104456901550293, + "learning_rate": 1.9872947578365283e-05, + "loss": 0.3239, + "step": 2736 + }, + { + "epoch": 0.05083115812611622, + "grad_norm": 0.35505062341690063, + "learning_rate": 1.9872762156375093e-05, + "loss": 0.4602, + "step": 2738 + }, + { + "epoch": 0.050868288263534865, + "grad_norm": 0.2532171607017517, + "learning_rate": 1.9872576600046184e-05, + "loss": 0.3587, + "step": 2740 + }, + { + "epoch": 0.0509054184009535, + "grad_norm": 0.3133249878883362, + "learning_rate": 1.987239090938108e-05, + "loss": 0.3708, + "step": 2742 + }, + { + "epoch": 0.05094254853837214, + "grad_norm": 0.3827840983867645, + "learning_rate": 1.9872205084382306e-05, + "loss": 0.457, + "step": 2744 + }, + { + "epoch": 0.05097967867579078, + "grad_norm": 0.27355948090553284, + "learning_rate": 1.9872019125052388e-05, + "loss": 0.1705, + "step": 2746 + }, + { + "epoch": 0.05101680881320942, + "grad_norm": 0.3710668683052063, + "learning_rate": 1.9871833031393865e-05, + "loss": 0.3192, + "step": 2748 + }, + { + "epoch": 0.051053938950628056, + "grad_norm": 0.4130370020866394, + "learning_rate": 1.987164680340926e-05, + "loss": 0.2568, + "step": 2750 + }, + { + "epoch": 0.0510910690880467, + "grad_norm": 0.24248231947422028, + "learning_rate": 1.9871460441101113e-05, + "loss": 0.1913, + "step": 2752 + }, + { + "epoch": 0.05112819922546533, + "grad_norm": 0.39129525423049927, + "learning_rate": 1.9871273944471962e-05, + "loss": 0.3885, + "step": 2754 + }, + { + "epoch": 0.051165329362883975, + "grad_norm": 0.3475819230079651, + "learning_rate": 1.9871087313524334e-05, + "loss": 0.3656, + "step": 2756 + }, + { + "epoch": 0.05120245950030261, + "grad_norm": 0.3614087998867035, + "learning_rate": 1.9870900548260777e-05, + "loss": 0.337, + "step": 2758 + }, + { + "epoch": 0.051239589637721246, + "grad_norm": 0.3031628727912903, + "learning_rate": 1.987071364868383e-05, + "loss": 0.388, + "step": 2760 + }, + { + "epoch": 0.05127671977513989, + "grad_norm": 0.31338509917259216, + "learning_rate": 1.9870526614796036e-05, + "loss": 0.3311, + "step": 2762 + }, + { + "epoch": 0.05131384991255852, + "grad_norm": 0.47206684947013855, + "learning_rate": 1.9870339446599943e-05, + "loss": 0.3052, + "step": 2764 + }, + { + "epoch": 0.051350980049977166, + "grad_norm": 0.2726995050907135, + "learning_rate": 1.9870152144098092e-05, + "loss": 0.3617, + "step": 2766 + }, + { + "epoch": 0.0513881101873958, + "grad_norm": 0.26517996191978455, + "learning_rate": 1.9869964707293036e-05, + "loss": 0.4083, + "step": 2768 + }, + { + "epoch": 0.05142524032481444, + "grad_norm": 0.4201814830303192, + "learning_rate": 1.9869777136187324e-05, + "loss": 0.5227, + "step": 2770 + }, + { + "epoch": 0.05146237046223308, + "grad_norm": 0.34274962544441223, + "learning_rate": 1.9869589430783512e-05, + "loss": 0.4098, + "step": 2772 + }, + { + "epoch": 0.05149950059965172, + "grad_norm": 0.3870795965194702, + "learning_rate": 1.9869401591084147e-05, + "loss": 0.4049, + "step": 2774 + }, + { + "epoch": 0.051536630737070356, + "grad_norm": 0.3675890862941742, + "learning_rate": 1.9869213617091788e-05, + "loss": 0.4725, + "step": 2776 + }, + { + "epoch": 0.051573760874489, + "grad_norm": 0.4682360291481018, + "learning_rate": 1.9869025508808996e-05, + "loss": 0.408, + "step": 2778 + }, + { + "epoch": 0.051610891011907634, + "grad_norm": 0.32862234115600586, + "learning_rate": 1.9868837266238325e-05, + "loss": 0.3602, + "step": 2780 + }, + { + "epoch": 0.051648021149326276, + "grad_norm": 0.36561235785484314, + "learning_rate": 1.9868648889382344e-05, + "loss": 0.2912, + "step": 2782 + }, + { + "epoch": 0.05168515128674491, + "grad_norm": 0.3661783039569855, + "learning_rate": 1.9868460378243608e-05, + "loss": 0.3873, + "step": 2784 + }, + { + "epoch": 0.05172228142416355, + "grad_norm": 0.3470335304737091, + "learning_rate": 1.9868271732824688e-05, + "loss": 0.4364, + "step": 2786 + }, + { + "epoch": 0.05175941156158219, + "grad_norm": 0.26784074306488037, + "learning_rate": 1.986808295312815e-05, + "loss": 0.3562, + "step": 2788 + }, + { + "epoch": 0.05179654169900083, + "grad_norm": 0.322917103767395, + "learning_rate": 1.9867894039156555e-05, + "loss": 0.3666, + "step": 2790 + }, + { + "epoch": 0.051833671836419466, + "grad_norm": 0.3684297800064087, + "learning_rate": 1.9867704990912485e-05, + "loss": 0.4611, + "step": 2792 + }, + { + "epoch": 0.05187080197383811, + "grad_norm": 0.3939528465270996, + "learning_rate": 1.9867515808398504e-05, + "loss": 0.3503, + "step": 2794 + }, + { + "epoch": 0.051907932111256744, + "grad_norm": 0.332139253616333, + "learning_rate": 1.986732649161719e-05, + "loss": 0.2579, + "step": 2796 + }, + { + "epoch": 0.05194506224867538, + "grad_norm": 0.4107329547405243, + "learning_rate": 1.9867137040571123e-05, + "loss": 0.3221, + "step": 2798 + }, + { + "epoch": 0.05198219238609402, + "grad_norm": 0.300315260887146, + "learning_rate": 1.9866947455262872e-05, + "loss": 0.413, + "step": 2800 + }, + { + "epoch": 0.05201932252351266, + "grad_norm": 0.3489567041397095, + "learning_rate": 1.986675773569502e-05, + "loss": 0.4896, + "step": 2802 + }, + { + "epoch": 0.0520564526609313, + "grad_norm": 0.2821277678012848, + "learning_rate": 1.9866567881870152e-05, + "loss": 0.4117, + "step": 2804 + }, + { + "epoch": 0.052093582798349934, + "grad_norm": 0.3023563623428345, + "learning_rate": 1.986637789379085e-05, + "loss": 0.291, + "step": 2806 + }, + { + "epoch": 0.052130712935768576, + "grad_norm": 0.5156275629997253, + "learning_rate": 1.9866187771459696e-05, + "loss": 0.3509, + "step": 2808 + }, + { + "epoch": 0.05216784307318721, + "grad_norm": 0.41604647040367126, + "learning_rate": 1.986599751487928e-05, + "loss": 0.36, + "step": 2810 + }, + { + "epoch": 0.052204973210605854, + "grad_norm": 0.2888268232345581, + "learning_rate": 1.986580712405219e-05, + "loss": 0.321, + "step": 2812 + }, + { + "epoch": 0.05224210334802449, + "grad_norm": 0.4198257625102997, + "learning_rate": 1.9865616598981018e-05, + "loss": 0.3661, + "step": 2814 + }, + { + "epoch": 0.05227923348544313, + "grad_norm": 0.44610345363616943, + "learning_rate": 1.9865425939668353e-05, + "loss": 0.4834, + "step": 2816 + }, + { + "epoch": 0.05231636362286177, + "grad_norm": 0.39528095722198486, + "learning_rate": 1.986523514611679e-05, + "loss": 0.489, + "step": 2818 + }, + { + "epoch": 0.05235349376028041, + "grad_norm": 0.3856521248817444, + "learning_rate": 1.986504421832893e-05, + "loss": 0.3758, + "step": 2820 + }, + { + "epoch": 0.052390623897699044, + "grad_norm": 0.40536215901374817, + "learning_rate": 1.9864853156307365e-05, + "loss": 0.2779, + "step": 2822 + }, + { + "epoch": 0.05242775403511769, + "grad_norm": 0.339359849691391, + "learning_rate": 1.9864661960054698e-05, + "loss": 0.2139, + "step": 2824 + }, + { + "epoch": 0.05246488417253632, + "grad_norm": 0.31417736411094666, + "learning_rate": 1.986447062957353e-05, + "loss": 0.3379, + "step": 2826 + }, + { + "epoch": 0.052502014309954964, + "grad_norm": 0.3061143159866333, + "learning_rate": 1.9864279164866464e-05, + "loss": 0.4347, + "step": 2828 + }, + { + "epoch": 0.0525391444473736, + "grad_norm": 0.3430987298488617, + "learning_rate": 1.9864087565936105e-05, + "loss": 0.5254, + "step": 2830 + }, + { + "epoch": 0.05257627458479224, + "grad_norm": 0.29536962509155273, + "learning_rate": 1.986389583278506e-05, + "loss": 0.4528, + "step": 2832 + }, + { + "epoch": 0.05261340472221088, + "grad_norm": 0.3575668931007385, + "learning_rate": 1.9863703965415938e-05, + "loss": 0.3813, + "step": 2834 + }, + { + "epoch": 0.05265053485962951, + "grad_norm": 0.3959839642047882, + "learning_rate": 1.986351196383135e-05, + "loss": 0.4202, + "step": 2836 + }, + { + "epoch": 0.052687664997048154, + "grad_norm": 0.2617895007133484, + "learning_rate": 1.9863319828033916e-05, + "loss": 0.5305, + "step": 2838 + }, + { + "epoch": 0.05272479513446679, + "grad_norm": 0.26240354776382446, + "learning_rate": 1.9863127558026238e-05, + "loss": 0.5136, + "step": 2840 + }, + { + "epoch": 0.05276192527188543, + "grad_norm": 0.3180220127105713, + "learning_rate": 1.9862935153810932e-05, + "loss": 0.3635, + "step": 2842 + }, + { + "epoch": 0.05279905540930407, + "grad_norm": 0.4007718563079834, + "learning_rate": 1.986274261539063e-05, + "loss": 0.264, + "step": 2844 + }, + { + "epoch": 0.05283618554672271, + "grad_norm": 0.4117051064968109, + "learning_rate": 1.986254994276794e-05, + "loss": 0.3939, + "step": 2846 + }, + { + "epoch": 0.052873315684141345, + "grad_norm": 0.29042381048202515, + "learning_rate": 1.986235713594549e-05, + "loss": 0.4917, + "step": 2848 + }, + { + "epoch": 0.05291044582155999, + "grad_norm": 0.33027327060699463, + "learning_rate": 1.9862164194925896e-05, + "loss": 0.1821, + "step": 2850 + }, + { + "epoch": 0.05294757595897862, + "grad_norm": 0.36862608790397644, + "learning_rate": 1.9861971119711788e-05, + "loss": 0.2645, + "step": 2852 + }, + { + "epoch": 0.052984706096397265, + "grad_norm": 0.36101314425468445, + "learning_rate": 1.9861777910305794e-05, + "loss": 0.3313, + "step": 2854 + }, + { + "epoch": 0.0530218362338159, + "grad_norm": 0.39052248001098633, + "learning_rate": 1.9861584566710544e-05, + "loss": 0.4555, + "step": 2856 + }, + { + "epoch": 0.05305896637123454, + "grad_norm": 0.3746533691883087, + "learning_rate": 1.9861391088928666e-05, + "loss": 0.413, + "step": 2858 + }, + { + "epoch": 0.05309609650865318, + "grad_norm": 0.3168638050556183, + "learning_rate": 1.9861197476962793e-05, + "loss": 0.5128, + "step": 2860 + }, + { + "epoch": 0.05313322664607182, + "grad_norm": 0.36404016613960266, + "learning_rate": 1.986100373081556e-05, + "loss": 0.3457, + "step": 2862 + }, + { + "epoch": 0.053170356783490455, + "grad_norm": 0.3756319284439087, + "learning_rate": 1.9860809850489603e-05, + "loss": 0.4069, + "step": 2864 + }, + { + "epoch": 0.0532074869209091, + "grad_norm": 0.3924770653247833, + "learning_rate": 1.986061583598756e-05, + "loss": 0.4218, + "step": 2866 + }, + { + "epoch": 0.05324461705832773, + "grad_norm": 0.1839403510093689, + "learning_rate": 1.9860421687312072e-05, + "loss": 0.364, + "step": 2868 + }, + { + "epoch": 0.053281747195746375, + "grad_norm": 0.37121400237083435, + "learning_rate": 1.986022740446578e-05, + "loss": 0.3547, + "step": 2870 + }, + { + "epoch": 0.05331887733316501, + "grad_norm": 0.3728128969669342, + "learning_rate": 1.986003298745133e-05, + "loss": 0.3992, + "step": 2872 + }, + { + "epoch": 0.053356007470583645, + "grad_norm": 0.34892934560775757, + "learning_rate": 1.9859838436271363e-05, + "loss": 0.417, + "step": 2874 + }, + { + "epoch": 0.05339313760800229, + "grad_norm": 0.33326664566993713, + "learning_rate": 1.985964375092853e-05, + "loss": 0.2545, + "step": 2876 + }, + { + "epoch": 0.05343026774542092, + "grad_norm": 0.5060580372810364, + "learning_rate": 1.9859448931425475e-05, + "loss": 0.4769, + "step": 2878 + }, + { + "epoch": 0.053467397882839565, + "grad_norm": 0.4234153628349304, + "learning_rate": 1.9859253977764855e-05, + "loss": 0.7388, + "step": 2880 + }, + { + "epoch": 0.0535045280202582, + "grad_norm": 0.4180959463119507, + "learning_rate": 1.9859058889949324e-05, + "loss": 0.5245, + "step": 2882 + }, + { + "epoch": 0.05354165815767684, + "grad_norm": 0.30391424894332886, + "learning_rate": 1.985886366798153e-05, + "loss": 0.3338, + "step": 2884 + }, + { + "epoch": 0.05357878829509548, + "grad_norm": 0.3675253987312317, + "learning_rate": 1.985866831186413e-05, + "loss": 0.2802, + "step": 2886 + }, + { + "epoch": 0.05361591843251412, + "grad_norm": 0.35947632789611816, + "learning_rate": 1.9858472821599787e-05, + "loss": 0.5354, + "step": 2888 + }, + { + "epoch": 0.053653048569932756, + "grad_norm": 0.32267966866493225, + "learning_rate": 1.9858277197191157e-05, + "loss": 0.3534, + "step": 2890 + }, + { + "epoch": 0.0536901787073514, + "grad_norm": 0.3630273640155792, + "learning_rate": 1.9858081438640904e-05, + "loss": 0.3977, + "step": 2892 + }, + { + "epoch": 0.05372730884477003, + "grad_norm": 0.3030064105987549, + "learning_rate": 1.985788554595169e-05, + "loss": 0.3888, + "step": 2894 + }, + { + "epoch": 0.053764438982188675, + "grad_norm": 0.42525264620780945, + "learning_rate": 1.9857689519126183e-05, + "loss": 0.4263, + "step": 2896 + }, + { + "epoch": 0.05380156911960731, + "grad_norm": 0.31737402081489563, + "learning_rate": 1.9857493358167047e-05, + "loss": 0.5581, + "step": 2898 + }, + { + "epoch": 0.05383869925702595, + "grad_norm": 0.2848888635635376, + "learning_rate": 1.9857297063076953e-05, + "loss": 0.275, + "step": 2900 + }, + { + "epoch": 0.05387582939444459, + "grad_norm": 0.27533063292503357, + "learning_rate": 1.9857100633858576e-05, + "loss": 0.4121, + "step": 2902 + }, + { + "epoch": 0.05391295953186323, + "grad_norm": 0.37541788816452026, + "learning_rate": 1.985690407051458e-05, + "loss": 0.2733, + "step": 2904 + }, + { + "epoch": 0.053950089669281866, + "grad_norm": 0.30114927887916565, + "learning_rate": 1.9856707373047647e-05, + "loss": 0.283, + "step": 2906 + }, + { + "epoch": 0.05398721980670051, + "grad_norm": 0.46774184703826904, + "learning_rate": 1.985651054146045e-05, + "loss": 0.3731, + "step": 2908 + }, + { + "epoch": 0.05402434994411914, + "grad_norm": 0.2963886559009552, + "learning_rate": 1.9856313575755667e-05, + "loss": 0.5117, + "step": 2910 + }, + { + "epoch": 0.05406148008153778, + "grad_norm": 0.3226279020309448, + "learning_rate": 1.9856116475935983e-05, + "loss": 0.1939, + "step": 2912 + }, + { + "epoch": 0.05409861021895642, + "grad_norm": 0.40098875761032104, + "learning_rate": 1.9855919242004075e-05, + "loss": 0.1663, + "step": 2914 + }, + { + "epoch": 0.054135740356375056, + "grad_norm": 0.4085533022880554, + "learning_rate": 1.9855721873962626e-05, + "loss": 0.3628, + "step": 2916 + }, + { + "epoch": 0.0541728704937937, + "grad_norm": 0.2947387993335724, + "learning_rate": 1.9855524371814323e-05, + "loss": 0.4139, + "step": 2918 + }, + { + "epoch": 0.054210000631212334, + "grad_norm": 0.467885822057724, + "learning_rate": 1.9855326735561857e-05, + "loss": 0.4517, + "step": 2920 + }, + { + "epoch": 0.054247130768630976, + "grad_norm": 0.341871976852417, + "learning_rate": 1.9855128965207913e-05, + "loss": 0.5479, + "step": 2922 + }, + { + "epoch": 0.05428426090604961, + "grad_norm": 0.2603185176849365, + "learning_rate": 1.9854931060755183e-05, + "loss": 0.5239, + "step": 2924 + }, + { + "epoch": 0.05432139104346825, + "grad_norm": 0.40321770310401917, + "learning_rate": 1.985473302220636e-05, + "loss": 0.2412, + "step": 2926 + }, + { + "epoch": 0.05435852118088689, + "grad_norm": 0.4545021057128906, + "learning_rate": 1.985453484956414e-05, + "loss": 0.2157, + "step": 2928 + }, + { + "epoch": 0.05439565131830553, + "grad_norm": 0.4130193889141083, + "learning_rate": 1.9854336542831218e-05, + "loss": 0.6308, + "step": 2930 + }, + { + "epoch": 0.054432781455724166, + "grad_norm": 0.30755752325057983, + "learning_rate": 1.9854138102010293e-05, + "loss": 0.2082, + "step": 2932 + }, + { + "epoch": 0.05446991159314281, + "grad_norm": 0.6711914539337158, + "learning_rate": 1.9853939527104066e-05, + "loss": 0.387, + "step": 2934 + }, + { + "epoch": 0.054507041730561444, + "grad_norm": 0.24945802986621857, + "learning_rate": 1.985374081811523e-05, + "loss": 0.3164, + "step": 2936 + }, + { + "epoch": 0.054544171867980086, + "grad_norm": 0.3281954228878021, + "learning_rate": 1.9853541975046505e-05, + "loss": 0.3061, + "step": 2938 + }, + { + "epoch": 0.05458130200539872, + "grad_norm": 0.3453971743583679, + "learning_rate": 1.9853342997900588e-05, + "loss": 0.4193, + "step": 2940 + }, + { + "epoch": 0.054618432142817364, + "grad_norm": 0.3509087860584259, + "learning_rate": 1.9853143886680187e-05, + "loss": 0.2724, + "step": 2942 + }, + { + "epoch": 0.054655562280236, + "grad_norm": 0.4127955436706543, + "learning_rate": 1.985294464138801e-05, + "loss": 0.3478, + "step": 2944 + }, + { + "epoch": 0.05469269241765464, + "grad_norm": 0.2635808289051056, + "learning_rate": 1.9852745262026773e-05, + "loss": 0.3018, + "step": 2946 + }, + { + "epoch": 0.054729822555073276, + "grad_norm": 0.39689818024635315, + "learning_rate": 1.985254574859918e-05, + "loss": 0.4207, + "step": 2948 + }, + { + "epoch": 0.05476695269249191, + "grad_norm": 0.3357161283493042, + "learning_rate": 1.985234610110795e-05, + "loss": 0.4418, + "step": 2950 + }, + { + "epoch": 0.054804082829910554, + "grad_norm": 0.25282159447669983, + "learning_rate": 1.9852146319555805e-05, + "loss": 0.4123, + "step": 2952 + }, + { + "epoch": 0.05484121296732919, + "grad_norm": 0.5032823085784912, + "learning_rate": 1.985194640394546e-05, + "loss": 0.1766, + "step": 2954 + }, + { + "epoch": 0.05487834310474783, + "grad_norm": 0.32633665204048157, + "learning_rate": 1.9851746354279632e-05, + "loss": 0.194, + "step": 2956 + }, + { + "epoch": 0.05491547324216647, + "grad_norm": 0.3969549834728241, + "learning_rate": 1.985154617056105e-05, + "loss": 0.4088, + "step": 2958 + }, + { + "epoch": 0.05495260337958511, + "grad_norm": 0.3090079426765442, + "learning_rate": 1.9851345852792426e-05, + "loss": 0.5859, + "step": 2960 + }, + { + "epoch": 0.054989733517003744, + "grad_norm": 0.3409533202648163, + "learning_rate": 1.9851145400976493e-05, + "loss": 0.3124, + "step": 2962 + }, + { + "epoch": 0.05502686365442239, + "grad_norm": 0.30572324991226196, + "learning_rate": 1.9850944815115984e-05, + "loss": 0.2787, + "step": 2964 + }, + { + "epoch": 0.05506399379184102, + "grad_norm": 0.34997761249542236, + "learning_rate": 1.985074409521362e-05, + "loss": 0.2627, + "step": 2966 + }, + { + "epoch": 0.055101123929259664, + "grad_norm": 0.25141867995262146, + "learning_rate": 1.9850543241272138e-05, + "loss": 0.2682, + "step": 2968 + }, + { + "epoch": 0.0551382540666783, + "grad_norm": 0.3440292179584503, + "learning_rate": 1.9850342253294266e-05, + "loss": 0.3827, + "step": 2970 + }, + { + "epoch": 0.05517538420409694, + "grad_norm": 0.3372299075126648, + "learning_rate": 1.985014113128274e-05, + "loss": 0.2847, + "step": 2972 + }, + { + "epoch": 0.05521251434151558, + "grad_norm": 0.26456281542778015, + "learning_rate": 1.9849939875240304e-05, + "loss": 0.361, + "step": 2974 + }, + { + "epoch": 0.05524964447893422, + "grad_norm": 0.26251885294914246, + "learning_rate": 1.9849738485169686e-05, + "loss": 0.4045, + "step": 2976 + }, + { + "epoch": 0.055286774616352855, + "grad_norm": 0.3801189064979553, + "learning_rate": 1.984953696107363e-05, + "loss": 0.2112, + "step": 2978 + }, + { + "epoch": 0.0553239047537715, + "grad_norm": 0.38826003670692444, + "learning_rate": 1.9849335302954878e-05, + "loss": 0.3879, + "step": 2980 + }, + { + "epoch": 0.05536103489119013, + "grad_norm": 0.3299528658390045, + "learning_rate": 1.9849133510816178e-05, + "loss": 0.4362, + "step": 2982 + }, + { + "epoch": 0.055398165028608774, + "grad_norm": 0.39100533723831177, + "learning_rate": 1.984893158466027e-05, + "loss": 0.5361, + "step": 2984 + }, + { + "epoch": 0.05543529516602741, + "grad_norm": 0.5104507207870483, + "learning_rate": 1.9848729524489904e-05, + "loss": 0.4108, + "step": 2986 + }, + { + "epoch": 0.055472425303446045, + "grad_norm": 0.31418415904045105, + "learning_rate": 1.984852733030783e-05, + "loss": 0.2392, + "step": 2988 + }, + { + "epoch": 0.05550955544086469, + "grad_norm": 0.39942795038223267, + "learning_rate": 1.98483250021168e-05, + "loss": 0.2222, + "step": 2990 + }, + { + "epoch": 0.05554668557828332, + "grad_norm": 0.5871292948722839, + "learning_rate": 1.9848122539919564e-05, + "loss": 0.2719, + "step": 2992 + }, + { + "epoch": 0.055583815715701965, + "grad_norm": 0.33943361043930054, + "learning_rate": 1.984791994371888e-05, + "loss": 0.2929, + "step": 2994 + }, + { + "epoch": 0.0556209458531206, + "grad_norm": 0.3780500888824463, + "learning_rate": 1.9847717213517503e-05, + "loss": 0.4432, + "step": 2996 + }, + { + "epoch": 0.05565807599053924, + "grad_norm": 0.4042004346847534, + "learning_rate": 1.9847514349318194e-05, + "loss": 0.5347, + "step": 2998 + }, + { + "epoch": 0.05569520612795788, + "grad_norm": 0.3743478059768677, + "learning_rate": 1.9847311351123707e-05, + "loss": 0.383, + "step": 3000 + }, + { + "epoch": 0.05573233626537652, + "grad_norm": 0.2726301848888397, + "learning_rate": 1.984710821893681e-05, + "loss": 0.3469, + "step": 3002 + }, + { + "epoch": 0.055769466402795155, + "grad_norm": 0.38962477445602417, + "learning_rate": 1.984690495276027e-05, + "loss": 0.3564, + "step": 3004 + }, + { + "epoch": 0.0558065965402138, + "grad_norm": 0.7716733813285828, + "learning_rate": 1.9846701552596845e-05, + "loss": 0.3652, + "step": 3006 + }, + { + "epoch": 0.05584372667763243, + "grad_norm": 0.2975520193576813, + "learning_rate": 1.9846498018449308e-05, + "loss": 0.4046, + "step": 3008 + }, + { + "epoch": 0.055880856815051075, + "grad_norm": 0.41402673721313477, + "learning_rate": 1.9846294350320423e-05, + "loss": 0.3484, + "step": 3010 + }, + { + "epoch": 0.05591798695246971, + "grad_norm": 0.34582993388175964, + "learning_rate": 1.984609054821297e-05, + "loss": 0.5227, + "step": 3012 + }, + { + "epoch": 0.05595511708988835, + "grad_norm": 0.40721312165260315, + "learning_rate": 1.9845886612129712e-05, + "loss": 0.2801, + "step": 3014 + }, + { + "epoch": 0.05599224722730699, + "grad_norm": 0.331721693277359, + "learning_rate": 1.984568254207343e-05, + "loss": 0.384, + "step": 3016 + }, + { + "epoch": 0.05602937736472563, + "grad_norm": 0.25454097986221313, + "learning_rate": 1.98454783380469e-05, + "loss": 0.513, + "step": 3018 + }, + { + "epoch": 0.056066507502144265, + "grad_norm": 0.2617400884628296, + "learning_rate": 1.9845274000052902e-05, + "loss": 0.4342, + "step": 3020 + }, + { + "epoch": 0.05610363763956291, + "grad_norm": 0.4297643005847931, + "learning_rate": 1.9845069528094213e-05, + "loss": 0.3669, + "step": 3022 + }, + { + "epoch": 0.05614076777698154, + "grad_norm": 0.4833610951900482, + "learning_rate": 1.984486492217362e-05, + "loss": 0.3824, + "step": 3024 + }, + { + "epoch": 0.05617789791440018, + "grad_norm": 0.3023262023925781, + "learning_rate": 1.9844660182293905e-05, + "loss": 0.5026, + "step": 3026 + }, + { + "epoch": 0.05621502805181882, + "grad_norm": 0.28864169120788574, + "learning_rate": 1.984445530845785e-05, + "loss": 0.3047, + "step": 3028 + }, + { + "epoch": 0.056252158189237456, + "grad_norm": 0.2669251263141632, + "learning_rate": 1.9844250300668245e-05, + "loss": 0.4851, + "step": 3030 + }, + { + "epoch": 0.0562892883266561, + "grad_norm": 0.3894294500350952, + "learning_rate": 1.9844045158927882e-05, + "loss": 0.3312, + "step": 3032 + }, + { + "epoch": 0.05632641846407473, + "grad_norm": 0.24255990982055664, + "learning_rate": 1.984383988323955e-05, + "loss": 0.4421, + "step": 3034 + }, + { + "epoch": 0.056363548601493375, + "grad_norm": 0.3418087065219879, + "learning_rate": 1.9843634473606045e-05, + "loss": 0.4233, + "step": 3036 + }, + { + "epoch": 0.05640067873891201, + "grad_norm": 0.32762765884399414, + "learning_rate": 1.984342893003016e-05, + "loss": 0.1734, + "step": 3038 + }, + { + "epoch": 0.05643780887633065, + "grad_norm": 0.30636054277420044, + "learning_rate": 1.984322325251469e-05, + "loss": 0.376, + "step": 3040 + }, + { + "epoch": 0.05647493901374929, + "grad_norm": 0.3261352479457855, + "learning_rate": 1.9843017441062432e-05, + "loss": 0.4137, + "step": 3042 + }, + { + "epoch": 0.05651206915116793, + "grad_norm": 0.297899454832077, + "learning_rate": 1.9842811495676196e-05, + "loss": 0.4331, + "step": 3044 + }, + { + "epoch": 0.056549199288586566, + "grad_norm": 0.48860272765159607, + "learning_rate": 1.9842605416358777e-05, + "loss": 0.2606, + "step": 3046 + }, + { + "epoch": 0.05658632942600521, + "grad_norm": 0.29924604296684265, + "learning_rate": 1.9842399203112977e-05, + "loss": 0.3108, + "step": 3048 + }, + { + "epoch": 0.05662345956342384, + "grad_norm": 0.35606759786605835, + "learning_rate": 1.984219285594161e-05, + "loss": 0.4766, + "step": 3050 + }, + { + "epoch": 0.056660589700842486, + "grad_norm": 0.25527697801589966, + "learning_rate": 1.9841986374847474e-05, + "loss": 0.3813, + "step": 3052 + }, + { + "epoch": 0.05669771983826112, + "grad_norm": 0.40612947940826416, + "learning_rate": 1.9841779759833385e-05, + "loss": 0.2962, + "step": 3054 + }, + { + "epoch": 0.05673484997567976, + "grad_norm": 0.3753218948841095, + "learning_rate": 1.9841573010902157e-05, + "loss": 0.2815, + "step": 3056 + }, + { + "epoch": 0.0567719801130984, + "grad_norm": 0.34082576632499695, + "learning_rate": 1.9841366128056596e-05, + "loss": 0.354, + "step": 3058 + }, + { + "epoch": 0.05680911025051704, + "grad_norm": 0.37212178111076355, + "learning_rate": 1.984115911129952e-05, + "loss": 0.2811, + "step": 3060 + }, + { + "epoch": 0.056846240387935676, + "grad_norm": 0.3254123330116272, + "learning_rate": 1.9840951960633745e-05, + "loss": 0.3702, + "step": 3062 + }, + { + "epoch": 0.05688337052535431, + "grad_norm": 0.3400002717971802, + "learning_rate": 1.9840744676062093e-05, + "loss": 0.3765, + "step": 3064 + }, + { + "epoch": 0.05692050066277295, + "grad_norm": 0.3192213177680969, + "learning_rate": 1.9840537257587383e-05, + "loss": 0.3905, + "step": 3066 + }, + { + "epoch": 0.05695763080019159, + "grad_norm": 0.39725351333618164, + "learning_rate": 1.9840329705212435e-05, + "loss": 0.4894, + "step": 3068 + }, + { + "epoch": 0.05699476093761023, + "grad_norm": 0.3115709722042084, + "learning_rate": 1.9840122018940077e-05, + "loss": 0.4796, + "step": 3070 + }, + { + "epoch": 0.057031891075028866, + "grad_norm": 0.3007223308086395, + "learning_rate": 1.9839914198773128e-05, + "loss": 0.2796, + "step": 3072 + }, + { + "epoch": 0.05706902121244751, + "grad_norm": 0.4191291928291321, + "learning_rate": 1.9839706244714424e-05, + "loss": 0.7142, + "step": 3074 + }, + { + "epoch": 0.057106151349866144, + "grad_norm": 0.30084994435310364, + "learning_rate": 1.983949815676679e-05, + "loss": 0.3169, + "step": 3076 + }, + { + "epoch": 0.057143281487284786, + "grad_norm": 0.41753578186035156, + "learning_rate": 1.9839289934933064e-05, + "loss": 0.3967, + "step": 3078 + }, + { + "epoch": 0.05718041162470342, + "grad_norm": 0.30716875195503235, + "learning_rate": 1.983908157921607e-05, + "loss": 0.3532, + "step": 3080 + }, + { + "epoch": 0.057217541762122064, + "grad_norm": 0.26224958896636963, + "learning_rate": 1.9838873089618647e-05, + "loss": 0.266, + "step": 3082 + }, + { + "epoch": 0.0572546718995407, + "grad_norm": 0.3286738693714142, + "learning_rate": 1.9838664466143632e-05, + "loss": 0.3695, + "step": 3084 + }, + { + "epoch": 0.05729180203695934, + "grad_norm": 0.3400304317474365, + "learning_rate": 1.9838455708793867e-05, + "loss": 0.5041, + "step": 3086 + }, + { + "epoch": 0.057328932174377976, + "grad_norm": 0.3571081757545471, + "learning_rate": 1.9838246817572188e-05, + "loss": 0.3704, + "step": 3088 + }, + { + "epoch": 0.05736606231179662, + "grad_norm": 0.3553451597690582, + "learning_rate": 1.9838037792481434e-05, + "loss": 0.2902, + "step": 3090 + }, + { + "epoch": 0.057403192449215254, + "grad_norm": 0.2603435814380646, + "learning_rate": 1.9837828633524457e-05, + "loss": 0.3933, + "step": 3092 + }, + { + "epoch": 0.057440322586633896, + "grad_norm": 0.39056339859962463, + "learning_rate": 1.9837619340704102e-05, + "loss": 0.3759, + "step": 3094 + }, + { + "epoch": 0.05747745272405253, + "grad_norm": 0.4029666781425476, + "learning_rate": 1.9837409914023213e-05, + "loss": 0.2642, + "step": 3096 + }, + { + "epoch": 0.057514582861471174, + "grad_norm": 0.2690047025680542, + "learning_rate": 1.9837200353484644e-05, + "loss": 0.1703, + "step": 3098 + }, + { + "epoch": 0.05755171299888981, + "grad_norm": 0.41959962248802185, + "learning_rate": 1.9836990659091243e-05, + "loss": 0.3807, + "step": 3100 + }, + { + "epoch": 0.057588843136308444, + "grad_norm": 0.4477682411670685, + "learning_rate": 1.983678083084586e-05, + "loss": 0.2581, + "step": 3102 + }, + { + "epoch": 0.05762597327372709, + "grad_norm": 0.32175272703170776, + "learning_rate": 1.9836570868751357e-05, + "loss": 0.381, + "step": 3104 + }, + { + "epoch": 0.05766310341114572, + "grad_norm": 0.2631858289241791, + "learning_rate": 1.9836360772810594e-05, + "loss": 0.3842, + "step": 3106 + }, + { + "epoch": 0.057700233548564364, + "grad_norm": 0.4008104205131531, + "learning_rate": 1.9836150543026417e-05, + "loss": 0.3837, + "step": 3108 + }, + { + "epoch": 0.057737363685983, + "grad_norm": 0.39099541306495667, + "learning_rate": 1.9835940179401697e-05, + "loss": 0.5785, + "step": 3110 + }, + { + "epoch": 0.05777449382340164, + "grad_norm": 0.3048926591873169, + "learning_rate": 1.983572968193929e-05, + "loss": 0.241, + "step": 3112 + }, + { + "epoch": 0.05781162396082028, + "grad_norm": 0.3737243711948395, + "learning_rate": 1.9835519050642066e-05, + "loss": 0.4143, + "step": 3114 + }, + { + "epoch": 0.05784875409823892, + "grad_norm": 0.3285117447376251, + "learning_rate": 1.983530828551289e-05, + "loss": 0.3916, + "step": 3116 + }, + { + "epoch": 0.057885884235657555, + "grad_norm": 0.4149942398071289, + "learning_rate": 1.9835097386554623e-05, + "loss": 0.4231, + "step": 3118 + }, + { + "epoch": 0.0579230143730762, + "grad_norm": 0.27757516503334045, + "learning_rate": 1.9834886353770145e-05, + "loss": 0.5977, + "step": 3120 + }, + { + "epoch": 0.05796014451049483, + "grad_norm": 0.38600555062294006, + "learning_rate": 1.9834675187162324e-05, + "loss": 0.2501, + "step": 3122 + }, + { + "epoch": 0.057997274647913474, + "grad_norm": 0.3534892499446869, + "learning_rate": 1.983446388673403e-05, + "loss": 0.303, + "step": 3124 + }, + { + "epoch": 0.05803440478533211, + "grad_norm": 0.31189200282096863, + "learning_rate": 1.9834252452488137e-05, + "loss": 0.2883, + "step": 3126 + }, + { + "epoch": 0.05807153492275075, + "grad_norm": 0.47020286321640015, + "learning_rate": 1.983404088442753e-05, + "loss": 0.3679, + "step": 3128 + }, + { + "epoch": 0.05810866506016939, + "grad_norm": 0.43118447065353394, + "learning_rate": 1.983382918255508e-05, + "loss": 0.3649, + "step": 3130 + }, + { + "epoch": 0.05814579519758803, + "grad_norm": 0.31245341897010803, + "learning_rate": 1.983361734687367e-05, + "loss": 0.2826, + "step": 3132 + }, + { + "epoch": 0.058182925335006665, + "grad_norm": 0.3751651346683502, + "learning_rate": 1.9833405377386184e-05, + "loss": 0.2616, + "step": 3134 + }, + { + "epoch": 0.0582200554724253, + "grad_norm": 0.34476155042648315, + "learning_rate": 1.9833193274095507e-05, + "loss": 0.247, + "step": 3136 + }, + { + "epoch": 0.05825718560984394, + "grad_norm": 0.3048694431781769, + "learning_rate": 1.983298103700452e-05, + "loss": 0.202, + "step": 3138 + }, + { + "epoch": 0.05829431574726258, + "grad_norm": 0.2760787606239319, + "learning_rate": 1.9832768666116115e-05, + "loss": 0.4239, + "step": 3140 + }, + { + "epoch": 0.05833144588468122, + "grad_norm": 0.262808620929718, + "learning_rate": 1.9832556161433184e-05, + "loss": 0.4628, + "step": 3142 + }, + { + "epoch": 0.058368576022099855, + "grad_norm": 0.22784578800201416, + "learning_rate": 1.9832343522958613e-05, + "loss": 0.2611, + "step": 3144 + }, + { + "epoch": 0.0584057061595185, + "grad_norm": 0.35843998193740845, + "learning_rate": 1.98321307506953e-05, + "loss": 0.4844, + "step": 3146 + }, + { + "epoch": 0.05844283629693713, + "grad_norm": 0.4081767201423645, + "learning_rate": 1.9831917844646136e-05, + "loss": 0.3774, + "step": 3148 + }, + { + "epoch": 0.058479966434355775, + "grad_norm": 0.41278553009033203, + "learning_rate": 1.983170480481402e-05, + "loss": 0.2661, + "step": 3150 + }, + { + "epoch": 0.05851709657177441, + "grad_norm": 0.2524242401123047, + "learning_rate": 1.983149163120185e-05, + "loss": 0.2968, + "step": 3152 + }, + { + "epoch": 0.05855422670919305, + "grad_norm": 0.3095013201236725, + "learning_rate": 1.983127832381253e-05, + "loss": 0.2998, + "step": 3154 + }, + { + "epoch": 0.05859135684661169, + "grad_norm": 0.3590630888938904, + "learning_rate": 1.983106488264896e-05, + "loss": 0.4321, + "step": 3156 + }, + { + "epoch": 0.05862848698403033, + "grad_norm": 0.4180951714515686, + "learning_rate": 1.9830851307714045e-05, + "loss": 0.3275, + "step": 3158 + }, + { + "epoch": 0.058665617121448965, + "grad_norm": 0.30163106322288513, + "learning_rate": 1.983063759901069e-05, + "loss": 0.5381, + "step": 3160 + }, + { + "epoch": 0.05870274725886761, + "grad_norm": 0.3472830355167389, + "learning_rate": 1.98304237565418e-05, + "loss": 0.4688, + "step": 3162 + }, + { + "epoch": 0.05873987739628624, + "grad_norm": 0.27661651372909546, + "learning_rate": 1.9830209780310295e-05, + "loss": 0.2387, + "step": 3164 + }, + { + "epoch": 0.058777007533704885, + "grad_norm": 0.3514387309551239, + "learning_rate": 1.9829995670319074e-05, + "loss": 0.1432, + "step": 3166 + }, + { + "epoch": 0.05881413767112352, + "grad_norm": 0.4931705892086029, + "learning_rate": 1.982978142657106e-05, + "loss": 0.3628, + "step": 3168 + }, + { + "epoch": 0.05885126780854216, + "grad_norm": 0.31983834505081177, + "learning_rate": 1.982956704906916e-05, + "loss": 0.3613, + "step": 3170 + }, + { + "epoch": 0.0588883979459608, + "grad_norm": 0.4016130566596985, + "learning_rate": 1.98293525378163e-05, + "loss": 0.4634, + "step": 3172 + }, + { + "epoch": 0.05892552808337943, + "grad_norm": 0.3541540205478668, + "learning_rate": 1.982913789281539e-05, + "loss": 0.2865, + "step": 3174 + }, + { + "epoch": 0.058962658220798075, + "grad_norm": 0.29048845171928406, + "learning_rate": 1.982892311406936e-05, + "loss": 0.3115, + "step": 3176 + }, + { + "epoch": 0.05899978835821671, + "grad_norm": 0.36161670088768005, + "learning_rate": 1.9828708201581123e-05, + "loss": 0.1945, + "step": 3178 + }, + { + "epoch": 0.05903691849563535, + "grad_norm": 0.3577595055103302, + "learning_rate": 1.982849315535361e-05, + "loss": 0.3475, + "step": 3180 + }, + { + "epoch": 0.05907404863305399, + "grad_norm": 0.4106470048427582, + "learning_rate": 1.9828277975389743e-05, + "loss": 0.2811, + "step": 3182 + }, + { + "epoch": 0.05911117877047263, + "grad_norm": 0.35093221068382263, + "learning_rate": 1.9828062661692452e-05, + "loss": 0.3538, + "step": 3184 + }, + { + "epoch": 0.059148308907891266, + "grad_norm": 0.2504813075065613, + "learning_rate": 1.9827847214264667e-05, + "loss": 0.5435, + "step": 3186 + }, + { + "epoch": 0.05918543904530991, + "grad_norm": 0.307650625705719, + "learning_rate": 1.9827631633109323e-05, + "loss": 0.2275, + "step": 3188 + }, + { + "epoch": 0.05922256918272854, + "grad_norm": 0.3227376341819763, + "learning_rate": 1.9827415918229346e-05, + "loss": 0.2815, + "step": 3190 + }, + { + "epoch": 0.059259699320147186, + "grad_norm": 0.25418251752853394, + "learning_rate": 1.9827200069627676e-05, + "loss": 0.1264, + "step": 3192 + }, + { + "epoch": 0.05929682945756582, + "grad_norm": 0.4106939435005188, + "learning_rate": 1.9826984087307245e-05, + "loss": 0.3652, + "step": 3194 + }, + { + "epoch": 0.05933395959498446, + "grad_norm": 0.23905248939990997, + "learning_rate": 1.9826767971271e-05, + "loss": 0.3297, + "step": 3196 + }, + { + "epoch": 0.0593710897324031, + "grad_norm": 0.4137873947620392, + "learning_rate": 1.982655172152188e-05, + "loss": 0.3361, + "step": 3198 + }, + { + "epoch": 0.05940821986982174, + "grad_norm": 0.9614133238792419, + "learning_rate": 1.982633533806282e-05, + "loss": 0.3971, + "step": 3200 + }, + { + "epoch": 0.059445350007240376, + "grad_norm": 0.3563469648361206, + "learning_rate": 1.982611882089677e-05, + "loss": 0.4287, + "step": 3202 + }, + { + "epoch": 0.05948248014465902, + "grad_norm": 0.4971561133861542, + "learning_rate": 1.9825902170026677e-05, + "loss": 0.3355, + "step": 3204 + }, + { + "epoch": 0.059519610282077653, + "grad_norm": 0.3778611123561859, + "learning_rate": 1.9825685385455486e-05, + "loss": 0.3412, + "step": 3206 + }, + { + "epoch": 0.059556740419496296, + "grad_norm": 0.5087078809738159, + "learning_rate": 1.982546846718615e-05, + "loss": 0.278, + "step": 3208 + }, + { + "epoch": 0.05959387055691493, + "grad_norm": 0.30923616886138916, + "learning_rate": 1.982525141522162e-05, + "loss": 0.2881, + "step": 3210 + }, + { + "epoch": 0.059631000694333566, + "grad_norm": 0.39335906505584717, + "learning_rate": 1.9825034229564845e-05, + "loss": 0.3954, + "step": 3212 + }, + { + "epoch": 0.05966813083175221, + "grad_norm": 0.3137960433959961, + "learning_rate": 1.9824816910218784e-05, + "loss": 0.4129, + "step": 3214 + }, + { + "epoch": 0.059705260969170844, + "grad_norm": 0.30671659111976624, + "learning_rate": 1.9824599457186396e-05, + "loss": 0.2662, + "step": 3216 + }, + { + "epoch": 0.059742391106589486, + "grad_norm": 0.38358739018440247, + "learning_rate": 1.9824381870470634e-05, + "loss": 0.3645, + "step": 3218 + }, + { + "epoch": 0.05977952124400812, + "grad_norm": 0.3540777266025543, + "learning_rate": 1.9824164150074466e-05, + "loss": 0.3505, + "step": 3220 + }, + { + "epoch": 0.059816651381426764, + "grad_norm": 0.35432547330856323, + "learning_rate": 1.9823946296000848e-05, + "loss": 0.2511, + "step": 3222 + }, + { + "epoch": 0.0598537815188454, + "grad_norm": 0.3370950520038605, + "learning_rate": 1.9823728308252746e-05, + "loss": 0.4815, + "step": 3224 + }, + { + "epoch": 0.05989091165626404, + "grad_norm": 0.2935311794281006, + "learning_rate": 1.982351018683313e-05, + "loss": 0.3035, + "step": 3226 + }, + { + "epoch": 0.059928041793682676, + "grad_norm": 0.26078927516937256, + "learning_rate": 1.9823291931744962e-05, + "loss": 0.3891, + "step": 3228 + }, + { + "epoch": 0.05996517193110132, + "grad_norm": 0.3112160265445709, + "learning_rate": 1.982307354299122e-05, + "loss": 0.1965, + "step": 3230 + }, + { + "epoch": 0.060002302068519954, + "grad_norm": 0.3444506525993347, + "learning_rate": 1.9822855020574864e-05, + "loss": 0.2641, + "step": 3232 + }, + { + "epoch": 0.060039432205938596, + "grad_norm": 0.38369259238243103, + "learning_rate": 1.982263636449888e-05, + "loss": 0.3712, + "step": 3234 + }, + { + "epoch": 0.06007656234335723, + "grad_norm": 0.36025455594062805, + "learning_rate": 1.9822417574766236e-05, + "loss": 0.4428, + "step": 3236 + }, + { + "epoch": 0.060113692480775874, + "grad_norm": 0.44601327180862427, + "learning_rate": 1.982219865137991e-05, + "loss": 0.4692, + "step": 3238 + }, + { + "epoch": 0.06015082261819451, + "grad_norm": 0.24177546799182892, + "learning_rate": 1.982197959434288e-05, + "loss": 0.4257, + "step": 3240 + }, + { + "epoch": 0.06018795275561315, + "grad_norm": 0.3033654987812042, + "learning_rate": 1.982176040365813e-05, + "loss": 0.3768, + "step": 3242 + }, + { + "epoch": 0.06022508289303179, + "grad_norm": 0.2943459451198578, + "learning_rate": 1.9821541079328636e-05, + "loss": 0.3928, + "step": 3244 + }, + { + "epoch": 0.06026221303045043, + "grad_norm": 0.26412588357925415, + "learning_rate": 1.982132162135739e-05, + "loss": 0.4435, + "step": 3246 + }, + { + "epoch": 0.060299343167869064, + "grad_norm": 0.27709636092185974, + "learning_rate": 1.9821102029747378e-05, + "loss": 0.2038, + "step": 3248 + }, + { + "epoch": 0.0603364733052877, + "grad_norm": 0.3554525077342987, + "learning_rate": 1.9820882304501578e-05, + "loss": 0.2629, + "step": 3250 + }, + { + "epoch": 0.06037360344270634, + "grad_norm": 0.4392123818397522, + "learning_rate": 1.982066244562299e-05, + "loss": 0.2653, + "step": 3252 + }, + { + "epoch": 0.06041073358012498, + "grad_norm": 0.38870763778686523, + "learning_rate": 1.9820442453114602e-05, + "loss": 0.2493, + "step": 3254 + }, + { + "epoch": 0.06044786371754362, + "grad_norm": 0.37764909863471985, + "learning_rate": 1.982022232697941e-05, + "loss": 0.3315, + "step": 3256 + }, + { + "epoch": 0.060484993854962255, + "grad_norm": 0.3400952219963074, + "learning_rate": 1.9820002067220408e-05, + "loss": 0.3909, + "step": 3258 + }, + { + "epoch": 0.0605221239923809, + "grad_norm": 0.24018500745296478, + "learning_rate": 1.981978167384059e-05, + "loss": 0.3641, + "step": 3260 + }, + { + "epoch": 0.06055925412979953, + "grad_norm": 0.23341913521289825, + "learning_rate": 1.981956114684296e-05, + "loss": 0.1788, + "step": 3262 + }, + { + "epoch": 0.060596384267218174, + "grad_norm": 0.3269834816455841, + "learning_rate": 1.981934048623051e-05, + "loss": 0.4447, + "step": 3264 + }, + { + "epoch": 0.06063351440463681, + "grad_norm": 0.31003937125205994, + "learning_rate": 1.981911969200625e-05, + "loss": 0.1936, + "step": 3266 + }, + { + "epoch": 0.06067064454205545, + "grad_norm": 0.4288104176521301, + "learning_rate": 1.9818898764173188e-05, + "loss": 0.3103, + "step": 3268 + }, + { + "epoch": 0.06070777467947409, + "grad_norm": 0.394898384809494, + "learning_rate": 1.9818677702734316e-05, + "loss": 0.2631, + "step": 3270 + }, + { + "epoch": 0.06074490481689273, + "grad_norm": 0.2896632254123688, + "learning_rate": 1.9818456507692656e-05, + "loss": 0.3922, + "step": 3272 + }, + { + "epoch": 0.060782034954311365, + "grad_norm": 0.34399017691612244, + "learning_rate": 1.9818235179051217e-05, + "loss": 0.2424, + "step": 3274 + }, + { + "epoch": 0.06081916509173001, + "grad_norm": 0.46302902698516846, + "learning_rate": 1.9818013716812997e-05, + "loss": 0.3542, + "step": 3276 + }, + { + "epoch": 0.06085629522914864, + "grad_norm": 0.3556465804576874, + "learning_rate": 1.9817792120981025e-05, + "loss": 0.381, + "step": 3278 + }, + { + "epoch": 0.060893425366567284, + "grad_norm": 0.2924623489379883, + "learning_rate": 1.9817570391558306e-05, + "loss": 0.3708, + "step": 3280 + }, + { + "epoch": 0.06093055550398592, + "grad_norm": 0.29451170563697815, + "learning_rate": 1.9817348528547863e-05, + "loss": 0.3773, + "step": 3282 + }, + { + "epoch": 0.06096768564140456, + "grad_norm": 0.2974057197570801, + "learning_rate": 1.981712653195271e-05, + "loss": 0.5249, + "step": 3284 + }, + { + "epoch": 0.0610048157788232, + "grad_norm": 0.3566272258758545, + "learning_rate": 1.981690440177588e-05, + "loss": 0.1815, + "step": 3286 + }, + { + "epoch": 0.06104194591624183, + "grad_norm": 0.3027786612510681, + "learning_rate": 1.9816682138020377e-05, + "loss": 0.3064, + "step": 3288 + }, + { + "epoch": 0.061079076053660475, + "grad_norm": 0.4292389750480652, + "learning_rate": 1.981645974068924e-05, + "loss": 0.2159, + "step": 3290 + }, + { + "epoch": 0.06111620619107911, + "grad_norm": 0.37225788831710815, + "learning_rate": 1.9816237209785487e-05, + "loss": 0.2118, + "step": 3292 + }, + { + "epoch": 0.06115333632849775, + "grad_norm": 0.47235774993896484, + "learning_rate": 1.9816014545312147e-05, + "loss": 0.294, + "step": 3294 + }, + { + "epoch": 0.06119046646591639, + "grad_norm": 0.2898741662502289, + "learning_rate": 1.9815791747272252e-05, + "loss": 0.3089, + "step": 3296 + }, + { + "epoch": 0.06122759660333503, + "grad_norm": 0.2759163975715637, + "learning_rate": 1.9815568815668835e-05, + "loss": 0.4878, + "step": 3298 + }, + { + "epoch": 0.061264726740753665, + "grad_norm": 0.41972827911376953, + "learning_rate": 1.9815345750504926e-05, + "loss": 0.2574, + "step": 3300 + }, + { + "epoch": 0.06130185687817231, + "grad_norm": 0.45233675837516785, + "learning_rate": 1.981512255178356e-05, + "loss": 0.3451, + "step": 3302 + }, + { + "epoch": 0.06133898701559094, + "grad_norm": 0.33466336131095886, + "learning_rate": 1.9814899219507778e-05, + "loss": 0.1477, + "step": 3304 + }, + { + "epoch": 0.061376117153009585, + "grad_norm": 0.3279944658279419, + "learning_rate": 1.9814675753680616e-05, + "loss": 0.4011, + "step": 3306 + }, + { + "epoch": 0.06141324729042822, + "grad_norm": 0.4179050624370575, + "learning_rate": 1.9814452154305117e-05, + "loss": 0.413, + "step": 3308 + }, + { + "epoch": 0.06145037742784686, + "grad_norm": 0.5420733690261841, + "learning_rate": 1.981422842138432e-05, + "loss": 0.2935, + "step": 3310 + }, + { + "epoch": 0.0614875075652655, + "grad_norm": 0.37667885422706604, + "learning_rate": 1.981400455492127e-05, + "loss": 0.3755, + "step": 3312 + }, + { + "epoch": 0.06152463770268414, + "grad_norm": 0.30398014187812805, + "learning_rate": 1.9813780554919018e-05, + "loss": 0.201, + "step": 3314 + }, + { + "epoch": 0.061561767840102775, + "grad_norm": 0.3155660629272461, + "learning_rate": 1.9813556421380608e-05, + "loss": 0.4483, + "step": 3316 + }, + { + "epoch": 0.06159889797752142, + "grad_norm": 0.22533252835273743, + "learning_rate": 1.9813332154309087e-05, + "loss": 0.3155, + "step": 3318 + }, + { + "epoch": 0.06163602811494005, + "grad_norm": 0.32661017775535583, + "learning_rate": 1.981310775370751e-05, + "loss": 0.2922, + "step": 3320 + }, + { + "epoch": 0.061673158252358695, + "grad_norm": 0.3552851378917694, + "learning_rate": 1.981288321957893e-05, + "loss": 0.347, + "step": 3322 + }, + { + "epoch": 0.06171028838977733, + "grad_norm": 0.3009335398674011, + "learning_rate": 1.9812658551926403e-05, + "loss": 0.1936, + "step": 3324 + }, + { + "epoch": 0.061747418527195966, + "grad_norm": 0.30215367674827576, + "learning_rate": 1.981243375075299e-05, + "loss": 0.3696, + "step": 3326 + }, + { + "epoch": 0.06178454866461461, + "grad_norm": 0.4027291238307953, + "learning_rate": 1.981220881606174e-05, + "loss": 0.2757, + "step": 3328 + }, + { + "epoch": 0.06182167880203324, + "grad_norm": 0.40032291412353516, + "learning_rate": 1.9811983747855722e-05, + "loss": 0.335, + "step": 3330 + }, + { + "epoch": 0.061858808939451886, + "grad_norm": 0.27187496423721313, + "learning_rate": 1.9811758546137995e-05, + "loss": 0.3502, + "step": 3332 + }, + { + "epoch": 0.06189593907687052, + "grad_norm": 0.3077321946620941, + "learning_rate": 1.9811533210911623e-05, + "loss": 0.4143, + "step": 3334 + }, + { + "epoch": 0.06193306921428916, + "grad_norm": 0.2704857587814331, + "learning_rate": 1.981130774217967e-05, + "loss": 0.2955, + "step": 3336 + }, + { + "epoch": 0.0619701993517078, + "grad_norm": 0.3517553508281708, + "learning_rate": 1.981108213994521e-05, + "loss": 0.3972, + "step": 3338 + }, + { + "epoch": 0.06200732948912644, + "grad_norm": 0.36262568831443787, + "learning_rate": 1.981085640421131e-05, + "loss": 0.3097, + "step": 3340 + }, + { + "epoch": 0.062044459626545076, + "grad_norm": 0.35919928550720215, + "learning_rate": 1.981063053498104e-05, + "loss": 0.2836, + "step": 3342 + }, + { + "epoch": 0.06208158976396372, + "grad_norm": 0.43146777153015137, + "learning_rate": 1.9810404532257476e-05, + "loss": 0.4293, + "step": 3344 + }, + { + "epoch": 0.062118719901382353, + "grad_norm": 0.3592202961444855, + "learning_rate": 1.981017839604369e-05, + "loss": 0.4629, + "step": 3346 + }, + { + "epoch": 0.062155850038800996, + "grad_norm": 0.2898438274860382, + "learning_rate": 1.980995212634276e-05, + "loss": 0.3921, + "step": 3348 + }, + { + "epoch": 0.06219298017621963, + "grad_norm": 0.3588927090167999, + "learning_rate": 1.980972572315777e-05, + "loss": 0.2682, + "step": 3350 + }, + { + "epoch": 0.06223011031363827, + "grad_norm": 0.3686981499195099, + "learning_rate": 1.9809499186491794e-05, + "loss": 0.3534, + "step": 3352 + }, + { + "epoch": 0.06226724045105691, + "grad_norm": 0.274715781211853, + "learning_rate": 1.9809272516347917e-05, + "loss": 0.324, + "step": 3354 + }, + { + "epoch": 0.06230437058847555, + "grad_norm": 0.36873549222946167, + "learning_rate": 1.9809045712729225e-05, + "loss": 0.2721, + "step": 3356 + }, + { + "epoch": 0.062341500725894186, + "grad_norm": 0.37963634729385376, + "learning_rate": 1.9808818775638802e-05, + "loss": 0.4641, + "step": 3358 + }, + { + "epoch": 0.06237863086331283, + "grad_norm": 0.221884623169899, + "learning_rate": 1.9808591705079733e-05, + "loss": 0.2945, + "step": 3360 + }, + { + "epoch": 0.062415761000731464, + "grad_norm": 0.4325578808784485, + "learning_rate": 1.9808364501055113e-05, + "loss": 0.2793, + "step": 3362 + }, + { + "epoch": 0.0624528911381501, + "grad_norm": 0.38638925552368164, + "learning_rate": 1.9808137163568034e-05, + "loss": 0.4063, + "step": 3364 + }, + { + "epoch": 0.06249002127556874, + "grad_norm": 0.35687869787216187, + "learning_rate": 1.980790969262158e-05, + "loss": 0.3402, + "step": 3366 + }, + { + "epoch": 0.06252715141298738, + "grad_norm": 0.34990063309669495, + "learning_rate": 1.9807682088218862e-05, + "loss": 0.6077, + "step": 3368 + }, + { + "epoch": 0.06256428155040601, + "grad_norm": 0.3275136947631836, + "learning_rate": 1.980745435036296e-05, + "loss": 0.3089, + "step": 3370 + }, + { + "epoch": 0.06260141168782465, + "grad_norm": 0.35529887676239014, + "learning_rate": 1.9807226479056987e-05, + "loss": 0.2594, + "step": 3372 + }, + { + "epoch": 0.0626385418252433, + "grad_norm": 0.2974911332130432, + "learning_rate": 1.9806998474304037e-05, + "loss": 0.4894, + "step": 3374 + }, + { + "epoch": 0.06267567196266194, + "grad_norm": 0.45959633588790894, + "learning_rate": 1.980677033610721e-05, + "loss": 0.3815, + "step": 3376 + }, + { + "epoch": 0.06271280210008057, + "grad_norm": 0.484009325504303, + "learning_rate": 1.9806542064469615e-05, + "loss": 0.3984, + "step": 3378 + }, + { + "epoch": 0.06274993223749921, + "grad_norm": 0.41314223408699036, + "learning_rate": 1.9806313659394356e-05, + "loss": 0.3932, + "step": 3380 + }, + { + "epoch": 0.06278706237491785, + "grad_norm": 0.2912636995315552, + "learning_rate": 1.9806085120884543e-05, + "loss": 0.2438, + "step": 3382 + }, + { + "epoch": 0.0628241925123365, + "grad_norm": 0.3224206864833832, + "learning_rate": 1.9805856448943284e-05, + "loss": 0.4137, + "step": 3384 + }, + { + "epoch": 0.06286132264975512, + "grad_norm": 0.3114950656890869, + "learning_rate": 1.980562764357369e-05, + "loss": 0.316, + "step": 3386 + }, + { + "epoch": 0.06289845278717376, + "grad_norm": 0.31566303968429565, + "learning_rate": 1.9805398704778876e-05, + "loss": 0.2337, + "step": 3388 + }, + { + "epoch": 0.0629355829245924, + "grad_norm": 0.5739381313323975, + "learning_rate": 1.9805169632561952e-05, + "loss": 0.2494, + "step": 3390 + }, + { + "epoch": 0.06297271306201105, + "grad_norm": 0.6408292055130005, + "learning_rate": 1.9804940426926042e-05, + "loss": 0.3554, + "step": 3392 + }, + { + "epoch": 0.06300984319942968, + "grad_norm": 0.4740843176841736, + "learning_rate": 1.9804711087874265e-05, + "loss": 0.4606, + "step": 3394 + }, + { + "epoch": 0.06304697333684832, + "grad_norm": 0.23687942326068878, + "learning_rate": 1.9804481615409732e-05, + "loss": 0.3875, + "step": 3396 + }, + { + "epoch": 0.06308410347426696, + "grad_norm": 0.2944756746292114, + "learning_rate": 1.980425200953558e-05, + "loss": 0.2461, + "step": 3398 + }, + { + "epoch": 0.0631212336116856, + "grad_norm": 0.26801395416259766, + "learning_rate": 1.980402227025492e-05, + "loss": 0.2823, + "step": 3400 + }, + { + "epoch": 0.06315836374910423, + "grad_norm": 0.36365678906440735, + "learning_rate": 1.9803792397570882e-05, + "loss": 0.388, + "step": 3402 + }, + { + "epoch": 0.06319549388652287, + "grad_norm": 0.3454486131668091, + "learning_rate": 1.9803562391486598e-05, + "loss": 0.2633, + "step": 3404 + }, + { + "epoch": 0.06323262402394152, + "grad_norm": 0.3128768503665924, + "learning_rate": 1.9803332252005195e-05, + "loss": 0.3173, + "step": 3406 + }, + { + "epoch": 0.06326975416136014, + "grad_norm": 0.335010826587677, + "learning_rate": 1.9803101979129805e-05, + "loss": 0.4111, + "step": 3408 + }, + { + "epoch": 0.06330688429877879, + "grad_norm": 0.39339467883110046, + "learning_rate": 1.980287157286356e-05, + "loss": 0.2494, + "step": 3410 + }, + { + "epoch": 0.06334401443619743, + "grad_norm": 0.4477849304676056, + "learning_rate": 1.9802641033209595e-05, + "loss": 0.521, + "step": 3412 + }, + { + "epoch": 0.06338114457361607, + "grad_norm": 0.427157461643219, + "learning_rate": 1.980241036017105e-05, + "loss": 0.3358, + "step": 3414 + }, + { + "epoch": 0.0634182747110347, + "grad_norm": 0.3436141908168793, + "learning_rate": 1.980217955375106e-05, + "loss": 0.4574, + "step": 3416 + }, + { + "epoch": 0.06345540484845334, + "grad_norm": 0.29612526297569275, + "learning_rate": 1.9801948613952766e-05, + "loss": 0.3696, + "step": 3418 + }, + { + "epoch": 0.06349253498587198, + "grad_norm": 0.313785195350647, + "learning_rate": 1.9801717540779312e-05, + "loss": 0.3888, + "step": 3420 + }, + { + "epoch": 0.06352966512329063, + "grad_norm": 0.5631344318389893, + "learning_rate": 1.9801486334233843e-05, + "loss": 0.431, + "step": 3422 + }, + { + "epoch": 0.06356679526070926, + "grad_norm": 0.5482927560806274, + "learning_rate": 1.9801254994319506e-05, + "loss": 0.4207, + "step": 3424 + }, + { + "epoch": 0.0636039253981279, + "grad_norm": 0.2833101153373718, + "learning_rate": 1.9801023521039445e-05, + "loss": 0.3317, + "step": 3426 + }, + { + "epoch": 0.06364105553554654, + "grad_norm": 0.41085535287857056, + "learning_rate": 1.980079191439681e-05, + "loss": 0.6227, + "step": 3428 + }, + { + "epoch": 0.06367818567296518, + "grad_norm": 0.33912044763565063, + "learning_rate": 1.9800560174394755e-05, + "loss": 0.3248, + "step": 3430 + }, + { + "epoch": 0.06371531581038381, + "grad_norm": 0.5338802933692932, + "learning_rate": 1.9800328301036432e-05, + "loss": 0.3868, + "step": 3432 + }, + { + "epoch": 0.06375244594780245, + "grad_norm": 0.39040011167526245, + "learning_rate": 1.9800096294324995e-05, + "loss": 0.3828, + "step": 3434 + }, + { + "epoch": 0.0637895760852211, + "grad_norm": 0.2740331292152405, + "learning_rate": 1.9799864154263604e-05, + "loss": 0.2468, + "step": 3436 + }, + { + "epoch": 0.06382670622263974, + "grad_norm": 0.29916927218437195, + "learning_rate": 1.979963188085541e-05, + "loss": 0.4015, + "step": 3438 + }, + { + "epoch": 0.06386383636005837, + "grad_norm": 0.31425076723098755, + "learning_rate": 1.9799399474103588e-05, + "loss": 0.3541, + "step": 3440 + }, + { + "epoch": 0.06390096649747701, + "grad_norm": 0.2851545810699463, + "learning_rate": 1.9799166934011287e-05, + "loss": 0.2345, + "step": 3442 + }, + { + "epoch": 0.06393809663489565, + "grad_norm": 0.3720426857471466, + "learning_rate": 1.9798934260581677e-05, + "loss": 0.2959, + "step": 3444 + }, + { + "epoch": 0.06397522677231428, + "grad_norm": 0.3075149655342102, + "learning_rate": 1.9798701453817922e-05, + "loss": 0.2005, + "step": 3446 + }, + { + "epoch": 0.06401235690973292, + "grad_norm": 0.6257651448249817, + "learning_rate": 1.979846851372319e-05, + "loss": 0.3182, + "step": 3448 + }, + { + "epoch": 0.06404948704715156, + "grad_norm": 0.36483991146087646, + "learning_rate": 1.9798235440300654e-05, + "loss": 0.3384, + "step": 3450 + }, + { + "epoch": 0.0640866171845702, + "grad_norm": 0.3266531229019165, + "learning_rate": 1.979800223355348e-05, + "loss": 0.2649, + "step": 3452 + }, + { + "epoch": 0.06412374732198883, + "grad_norm": 0.3313089907169342, + "learning_rate": 1.979776889348485e-05, + "loss": 0.3701, + "step": 3454 + }, + { + "epoch": 0.06416087745940748, + "grad_norm": 0.4297769069671631, + "learning_rate": 1.9797535420097927e-05, + "loss": 0.4376, + "step": 3456 + }, + { + "epoch": 0.06419800759682612, + "grad_norm": 0.3002229332923889, + "learning_rate": 1.9797301813395897e-05, + "loss": 0.5252, + "step": 3458 + }, + { + "epoch": 0.06423513773424476, + "grad_norm": 0.31100016832351685, + "learning_rate": 1.9797068073381933e-05, + "loss": 0.2313, + "step": 3460 + }, + { + "epoch": 0.06427226787166339, + "grad_norm": 0.3578316569328308, + "learning_rate": 1.9796834200059217e-05, + "loss": 0.3925, + "step": 3462 + }, + { + "epoch": 0.06430939800908203, + "grad_norm": 0.36674967408180237, + "learning_rate": 1.979660019343094e-05, + "loss": 0.3483, + "step": 3464 + }, + { + "epoch": 0.06434652814650067, + "grad_norm": 0.3229820132255554, + "learning_rate": 1.979636605350027e-05, + "loss": 0.2578, + "step": 3466 + }, + { + "epoch": 0.06438365828391932, + "grad_norm": 0.2721008062362671, + "learning_rate": 1.9796131780270403e-05, + "loss": 0.3217, + "step": 3468 + }, + { + "epoch": 0.06442078842133794, + "grad_norm": 0.40643110871315, + "learning_rate": 1.979589737374453e-05, + "loss": 0.2806, + "step": 3470 + }, + { + "epoch": 0.06445791855875659, + "grad_norm": 0.5781317353248596, + "learning_rate": 1.9795662833925834e-05, + "loss": 0.4425, + "step": 3472 + }, + { + "epoch": 0.06449504869617523, + "grad_norm": 0.31618189811706543, + "learning_rate": 1.9795428160817506e-05, + "loss": 0.4773, + "step": 3474 + }, + { + "epoch": 0.06453217883359386, + "grad_norm": 0.4076223373413086, + "learning_rate": 1.9795193354422743e-05, + "loss": 0.2774, + "step": 3476 + }, + { + "epoch": 0.0645693089710125, + "grad_norm": 0.3126170337200165, + "learning_rate": 1.9794958414744737e-05, + "loss": 0.2315, + "step": 3478 + }, + { + "epoch": 0.06460643910843114, + "grad_norm": 0.28904321789741516, + "learning_rate": 1.9794723341786686e-05, + "loss": 0.1381, + "step": 3480 + }, + { + "epoch": 0.06464356924584978, + "grad_norm": 0.3527877926826477, + "learning_rate": 1.9794488135551796e-05, + "loss": 0.3644, + "step": 3482 + }, + { + "epoch": 0.06468069938326841, + "grad_norm": 0.3266255855560303, + "learning_rate": 1.979425279604325e-05, + "loss": 0.4152, + "step": 3484 + }, + { + "epoch": 0.06471782952068705, + "grad_norm": 0.4346773624420166, + "learning_rate": 1.979401732326427e-05, + "loss": 0.2437, + "step": 3486 + }, + { + "epoch": 0.0647549596581057, + "grad_norm": 0.3893941640853882, + "learning_rate": 1.9793781717218045e-05, + "loss": 0.296, + "step": 3488 + }, + { + "epoch": 0.06479208979552434, + "grad_norm": 0.3035922646522522, + "learning_rate": 1.9793545977907787e-05, + "loss": 0.446, + "step": 3490 + }, + { + "epoch": 0.06482921993294297, + "grad_norm": 0.39036884903907776, + "learning_rate": 1.9793310105336702e-05, + "loss": 0.3657, + "step": 3492 + }, + { + "epoch": 0.06486635007036161, + "grad_norm": 0.31383204460144043, + "learning_rate": 1.9793074099508005e-05, + "loss": 0.4136, + "step": 3494 + }, + { + "epoch": 0.06490348020778025, + "grad_norm": 0.3066549003124237, + "learning_rate": 1.97928379604249e-05, + "loss": 0.3488, + "step": 3496 + }, + { + "epoch": 0.0649406103451989, + "grad_norm": 0.3661425709724426, + "learning_rate": 1.9792601688090603e-05, + "loss": 0.2929, + "step": 3498 + }, + { + "epoch": 0.06497774048261752, + "grad_norm": 0.33869463205337524, + "learning_rate": 1.9792365282508332e-05, + "loss": 0.4609, + "step": 3500 + }, + { + "epoch": 0.06501487062003616, + "grad_norm": 0.3715529441833496, + "learning_rate": 1.97921287436813e-05, + "loss": 0.453, + "step": 3502 + }, + { + "epoch": 0.0650520007574548, + "grad_norm": 0.37088167667388916, + "learning_rate": 1.9791892071612727e-05, + "loss": 0.3071, + "step": 3504 + }, + { + "epoch": 0.06508913089487345, + "grad_norm": 0.2620362639427185, + "learning_rate": 1.9791655266305833e-05, + "loss": 0.4533, + "step": 3506 + }, + { + "epoch": 0.06512626103229208, + "grad_norm": 0.5008695125579834, + "learning_rate": 1.9791418327763838e-05, + "loss": 0.3663, + "step": 3508 + }, + { + "epoch": 0.06516339116971072, + "grad_norm": 0.6745080947875977, + "learning_rate": 1.979118125598997e-05, + "loss": 0.4937, + "step": 3510 + }, + { + "epoch": 0.06520052130712936, + "grad_norm": 0.7969282269477844, + "learning_rate": 1.9790944050987454e-05, + "loss": 0.4882, + "step": 3512 + }, + { + "epoch": 0.06523765144454799, + "grad_norm": 0.2834987938404083, + "learning_rate": 1.9790706712759516e-05, + "loss": 0.4779, + "step": 3514 + }, + { + "epoch": 0.06527478158196663, + "grad_norm": 0.3522730767726898, + "learning_rate": 1.9790469241309385e-05, + "loss": 0.427, + "step": 3516 + }, + { + "epoch": 0.06531191171938527, + "grad_norm": 0.3356669247150421, + "learning_rate": 1.9790231636640294e-05, + "loss": 0.2438, + "step": 3518 + }, + { + "epoch": 0.06534904185680392, + "grad_norm": 0.23388394713401794, + "learning_rate": 1.9789993898755475e-05, + "loss": 0.4224, + "step": 3520 + }, + { + "epoch": 0.06538617199422254, + "grad_norm": 0.35800936818122864, + "learning_rate": 1.9789756027658165e-05, + "loss": 0.4319, + "step": 3522 + }, + { + "epoch": 0.06542330213164119, + "grad_norm": 0.3816884458065033, + "learning_rate": 1.9789518023351597e-05, + "loss": 0.4366, + "step": 3524 + }, + { + "epoch": 0.06546043226905983, + "grad_norm": 0.48795273900032043, + "learning_rate": 1.9789279885839016e-05, + "loss": 0.4556, + "step": 3526 + }, + { + "epoch": 0.06549756240647847, + "grad_norm": 0.2694101333618164, + "learning_rate": 1.9789041615123653e-05, + "loss": 0.3967, + "step": 3528 + }, + { + "epoch": 0.0655346925438971, + "grad_norm": 0.3457162380218506, + "learning_rate": 1.9788803211208757e-05, + "loss": 0.4285, + "step": 3530 + }, + { + "epoch": 0.06557182268131574, + "grad_norm": 0.36494818329811096, + "learning_rate": 1.978856467409757e-05, + "loss": 0.3815, + "step": 3532 + }, + { + "epoch": 0.06560895281873438, + "grad_norm": 0.3457881808280945, + "learning_rate": 1.978832600379334e-05, + "loss": 0.3317, + "step": 3534 + }, + { + "epoch": 0.06564608295615303, + "grad_norm": 0.3842501640319824, + "learning_rate": 1.978808720029931e-05, + "loss": 0.4328, + "step": 3536 + }, + { + "epoch": 0.06568321309357165, + "grad_norm": 0.34488967061042786, + "learning_rate": 1.9787848263618735e-05, + "loss": 0.2453, + "step": 3538 + }, + { + "epoch": 0.0657203432309903, + "grad_norm": 0.40323707461357117, + "learning_rate": 1.978760919375486e-05, + "loss": 0.3317, + "step": 3540 + }, + { + "epoch": 0.06575747336840894, + "grad_norm": 0.364903062582016, + "learning_rate": 1.9787369990710943e-05, + "loss": 0.4382, + "step": 3542 + }, + { + "epoch": 0.06579460350582758, + "grad_norm": 0.4243890643119812, + "learning_rate": 1.9787130654490237e-05, + "loss": 0.4304, + "step": 3544 + }, + { + "epoch": 0.06583173364324621, + "grad_norm": 0.43857312202453613, + "learning_rate": 1.9786891185096e-05, + "loss": 0.4892, + "step": 3546 + }, + { + "epoch": 0.06586886378066485, + "grad_norm": 0.3423400819301605, + "learning_rate": 1.9786651582531484e-05, + "loss": 0.2354, + "step": 3548 + }, + { + "epoch": 0.0659059939180835, + "grad_norm": 0.3118945062160492, + "learning_rate": 1.9786411846799957e-05, + "loss": 0.3271, + "step": 3550 + }, + { + "epoch": 0.06594312405550212, + "grad_norm": 0.3465314507484436, + "learning_rate": 1.978617197790468e-05, + "loss": 0.3232, + "step": 3552 + }, + { + "epoch": 0.06598025419292076, + "grad_norm": 0.3505212068557739, + "learning_rate": 1.9785931975848913e-05, + "loss": 0.4013, + "step": 3554 + }, + { + "epoch": 0.06601738433033941, + "grad_norm": 0.40989041328430176, + "learning_rate": 1.9785691840635922e-05, + "loss": 0.2751, + "step": 3556 + }, + { + "epoch": 0.06605451446775805, + "grad_norm": 0.24111665785312653, + "learning_rate": 1.9785451572268983e-05, + "loss": 0.1604, + "step": 3558 + }, + { + "epoch": 0.06609164460517668, + "grad_norm": 0.3601658046245575, + "learning_rate": 1.9785211170751354e-05, + "loss": 0.4636, + "step": 3560 + }, + { + "epoch": 0.06612877474259532, + "grad_norm": 0.31617647409439087, + "learning_rate": 1.9784970636086313e-05, + "loss": 0.4174, + "step": 3562 + }, + { + "epoch": 0.06616590488001396, + "grad_norm": 0.4005337655544281, + "learning_rate": 1.978472996827713e-05, + "loss": 0.3572, + "step": 3564 + }, + { + "epoch": 0.0662030350174326, + "grad_norm": 0.27472394704818726, + "learning_rate": 1.9784489167327083e-05, + "loss": 0.3076, + "step": 3566 + }, + { + "epoch": 0.06624016515485123, + "grad_norm": 0.283316969871521, + "learning_rate": 1.9784248233239447e-05, + "loss": 0.4069, + "step": 3568 + }, + { + "epoch": 0.06627729529226987, + "grad_norm": 0.5566747188568115, + "learning_rate": 1.9784007166017497e-05, + "loss": 0.3595, + "step": 3570 + }, + { + "epoch": 0.06631442542968852, + "grad_norm": 0.30584490299224854, + "learning_rate": 1.9783765965664515e-05, + "loss": 0.5073, + "step": 3572 + }, + { + "epoch": 0.06635155556710716, + "grad_norm": 0.7525100111961365, + "learning_rate": 1.9783524632183786e-05, + "loss": 0.6261, + "step": 3574 + }, + { + "epoch": 0.06638868570452579, + "grad_norm": 0.42935389280319214, + "learning_rate": 1.9783283165578594e-05, + "loss": 0.3267, + "step": 3576 + }, + { + "epoch": 0.06642581584194443, + "grad_norm": 0.28410154581069946, + "learning_rate": 1.9783041565852218e-05, + "loss": 0.5157, + "step": 3578 + }, + { + "epoch": 0.06646294597936307, + "grad_norm": 0.2615748345851898, + "learning_rate": 1.9782799833007952e-05, + "loss": 0.392, + "step": 3580 + }, + { + "epoch": 0.06650007611678171, + "grad_norm": 0.34294745326042175, + "learning_rate": 1.9782557967049085e-05, + "loss": 0.4662, + "step": 3582 + }, + { + "epoch": 0.06653720625420034, + "grad_norm": 0.3632686734199524, + "learning_rate": 1.97823159679789e-05, + "loss": 0.5372, + "step": 3584 + }, + { + "epoch": 0.06657433639161899, + "grad_norm": 0.33623144030570984, + "learning_rate": 1.9782073835800705e-05, + "loss": 0.2725, + "step": 3586 + }, + { + "epoch": 0.06661146652903763, + "grad_norm": 0.28490644693374634, + "learning_rate": 1.978183157051778e-05, + "loss": 0.4684, + "step": 3588 + }, + { + "epoch": 0.06664859666645626, + "grad_norm": 0.2859022617340088, + "learning_rate": 1.978158917213343e-05, + "loss": 0.4832, + "step": 3590 + }, + { + "epoch": 0.0666857268038749, + "grad_norm": 0.42026716470718384, + "learning_rate": 1.9781346640650946e-05, + "loss": 0.4421, + "step": 3592 + }, + { + "epoch": 0.06672285694129354, + "grad_norm": 0.32438981533050537, + "learning_rate": 1.9781103976073637e-05, + "loss": 0.5605, + "step": 3594 + }, + { + "epoch": 0.06675998707871218, + "grad_norm": 0.3323333263397217, + "learning_rate": 1.97808611784048e-05, + "loss": 0.3469, + "step": 3596 + }, + { + "epoch": 0.06679711721613081, + "grad_norm": 0.3498179018497467, + "learning_rate": 1.978061824764774e-05, + "loss": 0.3391, + "step": 3598 + }, + { + "epoch": 0.06683424735354945, + "grad_norm": 0.40430712699890137, + "learning_rate": 1.978037518380576e-05, + "loss": 0.2071, + "step": 3600 + }, + { + "epoch": 0.0668713774909681, + "grad_norm": 0.3161903917789459, + "learning_rate": 1.9780131986882168e-05, + "loss": 0.3566, + "step": 3602 + }, + { + "epoch": 0.06690850762838674, + "grad_norm": 0.5689277052879333, + "learning_rate": 1.977988865688028e-05, + "loss": 0.4536, + "step": 3604 + }, + { + "epoch": 0.06694563776580537, + "grad_norm": 0.3722653090953827, + "learning_rate": 1.9779645193803394e-05, + "loss": 0.3497, + "step": 3606 + }, + { + "epoch": 0.06698276790322401, + "grad_norm": 0.4058457016944885, + "learning_rate": 1.9779401597654835e-05, + "loss": 0.2542, + "step": 3608 + }, + { + "epoch": 0.06701989804064265, + "grad_norm": 0.37038716673851013, + "learning_rate": 1.9779157868437915e-05, + "loss": 0.3778, + "step": 3610 + }, + { + "epoch": 0.06705702817806129, + "grad_norm": 0.286577433347702, + "learning_rate": 1.9778914006155944e-05, + "loss": 0.2709, + "step": 3612 + }, + { + "epoch": 0.06709415831547992, + "grad_norm": 0.396380752325058, + "learning_rate": 1.977867001081225e-05, + "loss": 0.4762, + "step": 3614 + }, + { + "epoch": 0.06713128845289856, + "grad_norm": 0.34840935468673706, + "learning_rate": 1.977842588241014e-05, + "loss": 0.478, + "step": 3616 + }, + { + "epoch": 0.0671684185903172, + "grad_norm": 0.4267539381980896, + "learning_rate": 1.977818162095295e-05, + "loss": 0.3074, + "step": 3618 + }, + { + "epoch": 0.06720554872773585, + "grad_norm": 0.2859472632408142, + "learning_rate": 1.9777937226443996e-05, + "loss": 0.2739, + "step": 3620 + }, + { + "epoch": 0.06724267886515448, + "grad_norm": 0.35712242126464844, + "learning_rate": 1.9777692698886604e-05, + "loss": 0.3906, + "step": 3622 + }, + { + "epoch": 0.06727980900257312, + "grad_norm": 0.28851747512817383, + "learning_rate": 1.97774480382841e-05, + "loss": 0.2724, + "step": 3624 + }, + { + "epoch": 0.06731693913999176, + "grad_norm": 0.2951601445674896, + "learning_rate": 1.9777203244639818e-05, + "loss": 0.4262, + "step": 3626 + }, + { + "epoch": 0.06735406927741039, + "grad_norm": 0.3259916603565216, + "learning_rate": 1.9776958317957082e-05, + "loss": 0.1188, + "step": 3628 + }, + { + "epoch": 0.06739119941482903, + "grad_norm": 0.35760271549224854, + "learning_rate": 1.977671325823923e-05, + "loss": 0.283, + "step": 3630 + }, + { + "epoch": 0.06742832955224767, + "grad_norm": 0.3593396842479706, + "learning_rate": 1.9776468065489598e-05, + "loss": 0.5277, + "step": 3632 + }, + { + "epoch": 0.06746545968966632, + "grad_norm": 0.2551288604736328, + "learning_rate": 1.977622273971152e-05, + "loss": 0.3124, + "step": 3634 + }, + { + "epoch": 0.06750258982708494, + "grad_norm": 0.38036009669303894, + "learning_rate": 1.9775977280908328e-05, + "loss": 0.4735, + "step": 3636 + }, + { + "epoch": 0.06753971996450359, + "grad_norm": 0.3669552505016327, + "learning_rate": 1.9775731689083372e-05, + "loss": 0.2819, + "step": 3638 + }, + { + "epoch": 0.06757685010192223, + "grad_norm": 0.32467395067214966, + "learning_rate": 1.9775485964239993e-05, + "loss": 0.3542, + "step": 3640 + }, + { + "epoch": 0.06761398023934087, + "grad_norm": 0.3737581968307495, + "learning_rate": 1.9775240106381527e-05, + "loss": 0.3569, + "step": 3642 + }, + { + "epoch": 0.0676511103767595, + "grad_norm": 0.36326664686203003, + "learning_rate": 1.977499411551132e-05, + "loss": 0.226, + "step": 3644 + }, + { + "epoch": 0.06768824051417814, + "grad_norm": 0.3367057144641876, + "learning_rate": 1.9774747991632724e-05, + "loss": 0.2422, + "step": 3646 + }, + { + "epoch": 0.06772537065159678, + "grad_norm": 0.5563686490058899, + "learning_rate": 1.977450173474909e-05, + "loss": 0.3869, + "step": 3648 + }, + { + "epoch": 0.06776250078901543, + "grad_norm": 0.4534398019313812, + "learning_rate": 1.9774255344863764e-05, + "loss": 0.4673, + "step": 3650 + }, + { + "epoch": 0.06779963092643405, + "grad_norm": 0.25752100348472595, + "learning_rate": 1.97740088219801e-05, + "loss": 0.2981, + "step": 3652 + }, + { + "epoch": 0.0678367610638527, + "grad_norm": 0.3885330259799957, + "learning_rate": 1.977376216610145e-05, + "loss": 0.3304, + "step": 3654 + }, + { + "epoch": 0.06787389120127134, + "grad_norm": 0.3164767920970917, + "learning_rate": 1.977351537723118e-05, + "loss": 0.511, + "step": 3656 + }, + { + "epoch": 0.06791102133868998, + "grad_norm": 0.3914724886417389, + "learning_rate": 1.9773268455372632e-05, + "loss": 0.3287, + "step": 3658 + }, + { + "epoch": 0.06794815147610861, + "grad_norm": 0.44187113642692566, + "learning_rate": 1.977302140052918e-05, + "loss": 0.4321, + "step": 3660 + }, + { + "epoch": 0.06798528161352725, + "grad_norm": 0.3751872479915619, + "learning_rate": 1.9772774212704176e-05, + "loss": 0.4284, + "step": 3662 + }, + { + "epoch": 0.0680224117509459, + "grad_norm": 0.2755732238292694, + "learning_rate": 1.9772526891900993e-05, + "loss": 0.4088, + "step": 3664 + }, + { + "epoch": 0.06805954188836452, + "grad_norm": 0.3075898289680481, + "learning_rate": 1.9772279438122987e-05, + "loss": 0.1782, + "step": 3666 + }, + { + "epoch": 0.06809667202578316, + "grad_norm": 0.3583005368709564, + "learning_rate": 1.977203185137353e-05, + "loss": 0.3419, + "step": 3668 + }, + { + "epoch": 0.0681338021632018, + "grad_norm": 0.36649009585380554, + "learning_rate": 1.9771784131655993e-05, + "loss": 0.3716, + "step": 3670 + }, + { + "epoch": 0.06817093230062045, + "grad_norm": 0.31687599420547485, + "learning_rate": 1.977153627897374e-05, + "loss": 0.3021, + "step": 3672 + }, + { + "epoch": 0.06820806243803908, + "grad_norm": 0.3212026357650757, + "learning_rate": 1.9771288293330154e-05, + "loss": 0.4681, + "step": 3674 + }, + { + "epoch": 0.06824519257545772, + "grad_norm": 0.27392446994781494, + "learning_rate": 1.9771040174728596e-05, + "loss": 0.4048, + "step": 3676 + }, + { + "epoch": 0.06828232271287636, + "grad_norm": 0.306193083524704, + "learning_rate": 1.977079192317245e-05, + "loss": 0.4329, + "step": 3678 + }, + { + "epoch": 0.068319452850295, + "grad_norm": 0.32209184765815735, + "learning_rate": 1.9770543538665095e-05, + "loss": 0.336, + "step": 3680 + }, + { + "epoch": 0.06835658298771363, + "grad_norm": 0.2807345390319824, + "learning_rate": 1.9770295021209907e-05, + "loss": 0.3375, + "step": 3682 + }, + { + "epoch": 0.06839371312513227, + "grad_norm": 0.3570718467235565, + "learning_rate": 1.9770046370810267e-05, + "loss": 0.3185, + "step": 3684 + }, + { + "epoch": 0.06843084326255092, + "grad_norm": 0.3253606855869293, + "learning_rate": 1.9769797587469567e-05, + "loss": 0.5198, + "step": 3686 + }, + { + "epoch": 0.06846797339996956, + "grad_norm": 0.4012141227722168, + "learning_rate": 1.976954867119118e-05, + "loss": 0.316, + "step": 3688 + }, + { + "epoch": 0.06850510353738819, + "grad_norm": 0.3480335474014282, + "learning_rate": 1.97692996219785e-05, + "loss": 0.3325, + "step": 3690 + }, + { + "epoch": 0.06854223367480683, + "grad_norm": 0.46913138031959534, + "learning_rate": 1.9769050439834916e-05, + "loss": 0.4427, + "step": 3692 + }, + { + "epoch": 0.06857936381222547, + "grad_norm": 0.330547571182251, + "learning_rate": 1.9768801124763815e-05, + "loss": 0.282, + "step": 3694 + }, + { + "epoch": 0.06861649394964411, + "grad_norm": 0.3649209439754486, + "learning_rate": 1.9768551676768593e-05, + "loss": 0.3271, + "step": 3696 + }, + { + "epoch": 0.06865362408706274, + "grad_norm": 0.30090102553367615, + "learning_rate": 1.9768302095852644e-05, + "loss": 0.5838, + "step": 3698 + }, + { + "epoch": 0.06869075422448138, + "grad_norm": 0.2098192274570465, + "learning_rate": 1.9768052382019364e-05, + "loss": 0.2257, + "step": 3700 + }, + { + "epoch": 0.06872788436190003, + "grad_norm": 0.340071439743042, + "learning_rate": 1.9767802535272143e-05, + "loss": 0.313, + "step": 3702 + }, + { + "epoch": 0.06876501449931866, + "grad_norm": 0.3772391080856323, + "learning_rate": 1.9767552555614395e-05, + "loss": 0.5547, + "step": 3704 + }, + { + "epoch": 0.0688021446367373, + "grad_norm": 0.3363734185695648, + "learning_rate": 1.976730244304951e-05, + "loss": 0.3533, + "step": 3706 + }, + { + "epoch": 0.06883927477415594, + "grad_norm": 0.3628649115562439, + "learning_rate": 1.9767052197580895e-05, + "loss": 0.2255, + "step": 3708 + }, + { + "epoch": 0.06887640491157458, + "grad_norm": 0.3915059268474579, + "learning_rate": 1.9766801819211958e-05, + "loss": 0.4689, + "step": 3710 + }, + { + "epoch": 0.06891353504899321, + "grad_norm": 0.3537546694278717, + "learning_rate": 1.97665513079461e-05, + "loss": 0.2736, + "step": 3712 + }, + { + "epoch": 0.06895066518641185, + "grad_norm": 0.4346698820590973, + "learning_rate": 1.9766300663786735e-05, + "loss": 0.3228, + "step": 3714 + }, + { + "epoch": 0.0689877953238305, + "grad_norm": 0.23483529686927795, + "learning_rate": 1.9766049886737272e-05, + "loss": 0.3508, + "step": 3716 + }, + { + "epoch": 0.06902492546124914, + "grad_norm": 0.4852348566055298, + "learning_rate": 1.9765798976801123e-05, + "loss": 0.3446, + "step": 3718 + }, + { + "epoch": 0.06906205559866777, + "grad_norm": 0.2716207802295685, + "learning_rate": 1.97655479339817e-05, + "loss": 0.3474, + "step": 3720 + }, + { + "epoch": 0.06909918573608641, + "grad_norm": 0.2739524841308594, + "learning_rate": 1.9765296758282422e-05, + "loss": 0.2856, + "step": 3722 + }, + { + "epoch": 0.06913631587350505, + "grad_norm": 0.3598240613937378, + "learning_rate": 1.9765045449706703e-05, + "loss": 0.4551, + "step": 3724 + }, + { + "epoch": 0.06917344601092369, + "grad_norm": 0.29737603664398193, + "learning_rate": 1.976479400825797e-05, + "loss": 0.1317, + "step": 3726 + }, + { + "epoch": 0.06921057614834232, + "grad_norm": 0.384287565946579, + "learning_rate": 1.9764542433939633e-05, + "loss": 0.3577, + "step": 3728 + }, + { + "epoch": 0.06924770628576096, + "grad_norm": 0.4090096056461334, + "learning_rate": 1.9764290726755125e-05, + "loss": 0.3684, + "step": 3730 + }, + { + "epoch": 0.0692848364231796, + "grad_norm": 0.4387440085411072, + "learning_rate": 1.976403888670787e-05, + "loss": 0.2105, + "step": 3732 + }, + { + "epoch": 0.06932196656059825, + "grad_norm": 0.2417214959859848, + "learning_rate": 1.9763786913801292e-05, + "loss": 0.2489, + "step": 3734 + }, + { + "epoch": 0.06935909669801688, + "grad_norm": 0.3465513586997986, + "learning_rate": 1.9763534808038818e-05, + "loss": 0.471, + "step": 3736 + }, + { + "epoch": 0.06939622683543552, + "grad_norm": 0.28989678621292114, + "learning_rate": 1.976328256942388e-05, + "loss": 0.3433, + "step": 3738 + }, + { + "epoch": 0.06943335697285416, + "grad_norm": 0.9069617390632629, + "learning_rate": 1.976303019795991e-05, + "loss": 0.2912, + "step": 3740 + }, + { + "epoch": 0.06947048711027279, + "grad_norm": 0.41809213161468506, + "learning_rate": 1.9762777693650344e-05, + "loss": 0.6237, + "step": 3742 + }, + { + "epoch": 0.06950761724769143, + "grad_norm": 0.3669723570346832, + "learning_rate": 1.976252505649862e-05, + "loss": 0.3269, + "step": 3744 + }, + { + "epoch": 0.06954474738511007, + "grad_norm": 0.28882896900177, + "learning_rate": 1.9762272286508165e-05, + "loss": 0.2876, + "step": 3746 + }, + { + "epoch": 0.06958187752252872, + "grad_norm": 0.33825916051864624, + "learning_rate": 1.9762019383682432e-05, + "loss": 0.4551, + "step": 3748 + }, + { + "epoch": 0.06961900765994734, + "grad_norm": 0.31834837794303894, + "learning_rate": 1.976176634802485e-05, + "loss": 0.2877, + "step": 3750 + }, + { + "epoch": 0.06965613779736599, + "grad_norm": 0.3667811155319214, + "learning_rate": 1.9761513179538875e-05, + "loss": 0.367, + "step": 3752 + }, + { + "epoch": 0.06969326793478463, + "grad_norm": 0.412523478269577, + "learning_rate": 1.9761259878227942e-05, + "loss": 0.1561, + "step": 3754 + }, + { + "epoch": 0.06973039807220327, + "grad_norm": 0.39357614517211914, + "learning_rate": 1.9761006444095497e-05, + "loss": 0.3947, + "step": 3756 + }, + { + "epoch": 0.0697675282096219, + "grad_norm": 0.2894134819507599, + "learning_rate": 1.9760752877144993e-05, + "loss": 0.2899, + "step": 3758 + }, + { + "epoch": 0.06980465834704054, + "grad_norm": 0.49150723218917847, + "learning_rate": 1.9760499177379882e-05, + "loss": 0.3777, + "step": 3760 + }, + { + "epoch": 0.06984178848445918, + "grad_norm": 0.2904317378997803, + "learning_rate": 1.976024534480361e-05, + "loss": 0.4219, + "step": 3762 + }, + { + "epoch": 0.06987891862187783, + "grad_norm": 0.5349287986755371, + "learning_rate": 1.9759991379419636e-05, + "loss": 0.473, + "step": 3764 + }, + { + "epoch": 0.06991604875929645, + "grad_norm": 0.3040470778942108, + "learning_rate": 1.9759737281231413e-05, + "loss": 0.2991, + "step": 3766 + }, + { + "epoch": 0.0699531788967151, + "grad_norm": 0.3615362346172333, + "learning_rate": 1.97594830502424e-05, + "loss": 0.3554, + "step": 3768 + }, + { + "epoch": 0.06999030903413374, + "grad_norm": 0.5429883003234863, + "learning_rate": 1.9759228686456055e-05, + "loss": 0.5334, + "step": 3770 + }, + { + "epoch": 0.07002743917155238, + "grad_norm": 0.29389920830726624, + "learning_rate": 1.9758974189875843e-05, + "loss": 0.4588, + "step": 3772 + }, + { + "epoch": 0.07006456930897101, + "grad_norm": 0.3935430347919464, + "learning_rate": 1.975871956050522e-05, + "loss": 0.3704, + "step": 3774 + }, + { + "epoch": 0.07010169944638965, + "grad_norm": 0.2728888690471649, + "learning_rate": 1.9758464798347657e-05, + "loss": 0.3344, + "step": 3776 + }, + { + "epoch": 0.0701388295838083, + "grad_norm": 0.2200375199317932, + "learning_rate": 1.9758209903406616e-05, + "loss": 0.1535, + "step": 3778 + }, + { + "epoch": 0.07017595972122692, + "grad_norm": 0.4148258864879608, + "learning_rate": 1.975795487568557e-05, + "loss": 0.3065, + "step": 3780 + }, + { + "epoch": 0.07021308985864556, + "grad_norm": 0.37440967559814453, + "learning_rate": 1.9757699715187986e-05, + "loss": 0.5939, + "step": 3782 + }, + { + "epoch": 0.0702502199960642, + "grad_norm": 0.29972875118255615, + "learning_rate": 1.9757444421917335e-05, + "loss": 0.4825, + "step": 3784 + }, + { + "epoch": 0.07028735013348285, + "grad_norm": 0.3693486452102661, + "learning_rate": 1.975718899587709e-05, + "loss": 0.4411, + "step": 3786 + }, + { + "epoch": 0.07032448027090148, + "grad_norm": 0.2998223304748535, + "learning_rate": 1.9756933437070733e-05, + "loss": 0.3265, + "step": 3788 + }, + { + "epoch": 0.07036161040832012, + "grad_norm": 0.49485644698143005, + "learning_rate": 1.9756677745501734e-05, + "loss": 0.5322, + "step": 3790 + }, + { + "epoch": 0.07039874054573876, + "grad_norm": 0.3595532774925232, + "learning_rate": 1.9756421921173582e-05, + "loss": 0.5159, + "step": 3792 + }, + { + "epoch": 0.0704358706831574, + "grad_norm": 0.3375842273235321, + "learning_rate": 1.9756165964089746e-05, + "loss": 0.5149, + "step": 3794 + }, + { + "epoch": 0.07047300082057603, + "grad_norm": 0.3298659324645996, + "learning_rate": 1.9755909874253716e-05, + "loss": 0.2955, + "step": 3796 + }, + { + "epoch": 0.07051013095799467, + "grad_norm": 0.4905138909816742, + "learning_rate": 1.9755653651668975e-05, + "loss": 0.2938, + "step": 3798 + }, + { + "epoch": 0.07054726109541332, + "grad_norm": 0.32392483949661255, + "learning_rate": 1.975539729633901e-05, + "loss": 0.3268, + "step": 3800 + }, + { + "epoch": 0.07058439123283196, + "grad_norm": 0.33975857496261597, + "learning_rate": 1.9755140808267305e-05, + "loss": 0.461, + "step": 3802 + }, + { + "epoch": 0.07062152137025059, + "grad_norm": 0.301897257566452, + "learning_rate": 1.9754884187457355e-05, + "loss": 0.2697, + "step": 3804 + }, + { + "epoch": 0.07065865150766923, + "grad_norm": 0.3485341966152191, + "learning_rate": 1.9754627433912654e-05, + "loss": 0.5747, + "step": 3806 + }, + { + "epoch": 0.07069578164508787, + "grad_norm": 0.33531078696250916, + "learning_rate": 1.9754370547636687e-05, + "loss": 0.5278, + "step": 3808 + }, + { + "epoch": 0.07073291178250651, + "grad_norm": 0.26730644702911377, + "learning_rate": 1.9754113528632957e-05, + "loss": 0.3198, + "step": 3810 + }, + { + "epoch": 0.07077004191992514, + "grad_norm": 0.3827107548713684, + "learning_rate": 1.9753856376904962e-05, + "loss": 0.4505, + "step": 3812 + }, + { + "epoch": 0.07080717205734378, + "grad_norm": 0.2702629268169403, + "learning_rate": 1.9753599092456195e-05, + "loss": 0.2852, + "step": 3814 + }, + { + "epoch": 0.07084430219476243, + "grad_norm": 0.4730794429779053, + "learning_rate": 1.9753341675290158e-05, + "loss": 0.4951, + "step": 3816 + }, + { + "epoch": 0.07088143233218105, + "grad_norm": 0.2305094450712204, + "learning_rate": 1.975308412541036e-05, + "loss": 0.4, + "step": 3818 + }, + { + "epoch": 0.0709185624695997, + "grad_norm": 0.2829965651035309, + "learning_rate": 1.9752826442820296e-05, + "loss": 0.3778, + "step": 3820 + }, + { + "epoch": 0.07095569260701834, + "grad_norm": 0.2873823940753937, + "learning_rate": 1.975256862752348e-05, + "loss": 0.3472, + "step": 3822 + }, + { + "epoch": 0.07099282274443698, + "grad_norm": 0.23630329966545105, + "learning_rate": 1.9752310679523418e-05, + "loss": 0.2138, + "step": 3824 + }, + { + "epoch": 0.07102995288185561, + "grad_norm": 0.34078922867774963, + "learning_rate": 1.9752052598823615e-05, + "loss": 0.3072, + "step": 3826 + }, + { + "epoch": 0.07106708301927425, + "grad_norm": 0.24479207396507263, + "learning_rate": 1.975179438542759e-05, + "loss": 0.2948, + "step": 3828 + }, + { + "epoch": 0.0711042131566929, + "grad_norm": 0.6064154505729675, + "learning_rate": 1.9751536039338854e-05, + "loss": 0.3746, + "step": 3830 + }, + { + "epoch": 0.07114134329411154, + "grad_norm": 0.9282618761062622, + "learning_rate": 1.975127756056092e-05, + "loss": 0.411, + "step": 3832 + }, + { + "epoch": 0.07117847343153016, + "grad_norm": 0.3278948962688446, + "learning_rate": 1.975101894909731e-05, + "loss": 0.587, + "step": 3834 + }, + { + "epoch": 0.07121560356894881, + "grad_norm": 0.38723716139793396, + "learning_rate": 1.975076020495154e-05, + "loss": 0.2335, + "step": 3836 + }, + { + "epoch": 0.07125273370636745, + "grad_norm": 0.3710725009441376, + "learning_rate": 1.9750501328127126e-05, + "loss": 0.328, + "step": 3838 + }, + { + "epoch": 0.07128986384378609, + "grad_norm": 0.2876928150653839, + "learning_rate": 1.9750242318627595e-05, + "loss": 0.1937, + "step": 3840 + }, + { + "epoch": 0.07132699398120472, + "grad_norm": 0.416787713766098, + "learning_rate": 1.9749983176456474e-05, + "loss": 0.2841, + "step": 3842 + }, + { + "epoch": 0.07136412411862336, + "grad_norm": 0.2601301372051239, + "learning_rate": 1.9749723901617286e-05, + "loss": 0.3131, + "step": 3844 + }, + { + "epoch": 0.071401254256042, + "grad_norm": 0.42258259654045105, + "learning_rate": 1.974946449411356e-05, + "loss": 0.563, + "step": 3846 + }, + { + "epoch": 0.07143838439346065, + "grad_norm": 0.42055875062942505, + "learning_rate": 1.9749204953948825e-05, + "loss": 0.4556, + "step": 3848 + }, + { + "epoch": 0.07147551453087927, + "grad_norm": 0.3523327708244324, + "learning_rate": 1.9748945281126613e-05, + "loss": 0.4599, + "step": 3850 + }, + { + "epoch": 0.07151264466829792, + "grad_norm": 0.24449065327644348, + "learning_rate": 1.974868547565046e-05, + "loss": 0.308, + "step": 3852 + }, + { + "epoch": 0.07154977480571656, + "grad_norm": 0.5089482069015503, + "learning_rate": 1.9748425537523894e-05, + "loss": 0.2091, + "step": 3854 + }, + { + "epoch": 0.07158690494313519, + "grad_norm": 0.3861032724380493, + "learning_rate": 1.9748165466750454e-05, + "loss": 0.3799, + "step": 3856 + }, + { + "epoch": 0.07162403508055383, + "grad_norm": 0.4229990243911743, + "learning_rate": 1.9747905263333688e-05, + "loss": 0.3585, + "step": 3858 + }, + { + "epoch": 0.07166116521797247, + "grad_norm": 0.3257265090942383, + "learning_rate": 1.9747644927277126e-05, + "loss": 0.3603, + "step": 3860 + }, + { + "epoch": 0.07169829535539111, + "grad_norm": 0.31086355447769165, + "learning_rate": 1.974738445858431e-05, + "loss": 0.3673, + "step": 3862 + }, + { + "epoch": 0.07173542549280974, + "grad_norm": 0.28723418712615967, + "learning_rate": 1.9747123857258795e-05, + "loss": 0.3488, + "step": 3864 + }, + { + "epoch": 0.07177255563022839, + "grad_norm": 0.24640463292598724, + "learning_rate": 1.974686312330412e-05, + "loss": 0.318, + "step": 3866 + }, + { + "epoch": 0.07180968576764703, + "grad_norm": 0.417511910200119, + "learning_rate": 1.974660225672383e-05, + "loss": 0.4271, + "step": 3868 + }, + { + "epoch": 0.07184681590506567, + "grad_norm": 0.2622246742248535, + "learning_rate": 1.974634125752148e-05, + "loss": 0.4102, + "step": 3870 + }, + { + "epoch": 0.0718839460424843, + "grad_norm": 0.2264738529920578, + "learning_rate": 1.9746080125700617e-05, + "loss": 0.3768, + "step": 3872 + }, + { + "epoch": 0.07192107617990294, + "grad_norm": 0.31941741704940796, + "learning_rate": 1.9745818861264797e-05, + "loss": 0.3741, + "step": 3874 + }, + { + "epoch": 0.07195820631732158, + "grad_norm": 0.3227205276489258, + "learning_rate": 1.9745557464217574e-05, + "loss": 0.3722, + "step": 3876 + }, + { + "epoch": 0.07199533645474022, + "grad_norm": 0.35105079412460327, + "learning_rate": 1.9745295934562508e-05, + "loss": 0.4296, + "step": 3878 + }, + { + "epoch": 0.07203246659215885, + "grad_norm": 0.45736071467399597, + "learning_rate": 1.974503427230315e-05, + "loss": 0.57, + "step": 3880 + }, + { + "epoch": 0.0720695967295775, + "grad_norm": 0.2562812268733978, + "learning_rate": 1.974477247744307e-05, + "loss": 0.4084, + "step": 3882 + }, + { + "epoch": 0.07210672686699614, + "grad_norm": 0.29160887002944946, + "learning_rate": 1.9744510549985826e-05, + "loss": 0.3286, + "step": 3884 + }, + { + "epoch": 0.07214385700441478, + "grad_norm": 0.2824247181415558, + "learning_rate": 1.9744248489934978e-05, + "loss": 0.5074, + "step": 3886 + }, + { + "epoch": 0.07218098714183341, + "grad_norm": 0.48832982778549194, + "learning_rate": 1.9743986297294098e-05, + "loss": 0.2862, + "step": 3888 + }, + { + "epoch": 0.07221811727925205, + "grad_norm": 0.4221515655517578, + "learning_rate": 1.9743723972066752e-05, + "loss": 0.3733, + "step": 3890 + }, + { + "epoch": 0.07225524741667069, + "grad_norm": 0.3622238039970398, + "learning_rate": 1.9743461514256506e-05, + "loss": 0.2423, + "step": 3892 + }, + { + "epoch": 0.07229237755408932, + "grad_norm": 0.35267168283462524, + "learning_rate": 1.9743198923866935e-05, + "loss": 0.4312, + "step": 3894 + }, + { + "epoch": 0.07232950769150796, + "grad_norm": 0.2510712146759033, + "learning_rate": 1.974293620090161e-05, + "loss": 0.2027, + "step": 3896 + }, + { + "epoch": 0.0723666378289266, + "grad_norm": 0.2505778968334198, + "learning_rate": 1.9742673345364104e-05, + "loss": 0.4079, + "step": 3898 + }, + { + "epoch": 0.07240376796634525, + "grad_norm": 0.4601978659629822, + "learning_rate": 1.9742410357258002e-05, + "loss": 0.3597, + "step": 3900 + }, + { + "epoch": 0.07244089810376388, + "grad_norm": 0.276941180229187, + "learning_rate": 1.9742147236586874e-05, + "loss": 0.2981, + "step": 3902 + }, + { + "epoch": 0.07247802824118252, + "grad_norm": 0.3207315504550934, + "learning_rate": 1.9741883983354303e-05, + "loss": 0.3289, + "step": 3904 + }, + { + "epoch": 0.07251515837860116, + "grad_norm": 0.4371922016143799, + "learning_rate": 1.9741620597563874e-05, + "loss": 0.3835, + "step": 3906 + }, + { + "epoch": 0.0725522885160198, + "grad_norm": 0.3222271502017975, + "learning_rate": 1.9741357079219166e-05, + "loss": 0.3112, + "step": 3908 + }, + { + "epoch": 0.07258941865343843, + "grad_norm": 0.32740986347198486, + "learning_rate": 1.974109342832377e-05, + "loss": 0.2781, + "step": 3910 + }, + { + "epoch": 0.07262654879085707, + "grad_norm": 0.2803072929382324, + "learning_rate": 1.9740829644881265e-05, + "loss": 0.26, + "step": 3912 + }, + { + "epoch": 0.07266367892827572, + "grad_norm": 0.33803749084472656, + "learning_rate": 1.974056572889525e-05, + "loss": 0.456, + "step": 3914 + }, + { + "epoch": 0.07270080906569436, + "grad_norm": 0.3652576506137848, + "learning_rate": 1.9740301680369305e-05, + "loss": 0.3352, + "step": 3916 + }, + { + "epoch": 0.07273793920311299, + "grad_norm": 0.3471367657184601, + "learning_rate": 1.974003749930704e-05, + "loss": 0.4407, + "step": 3918 + }, + { + "epoch": 0.07277506934053163, + "grad_norm": 0.37709423899650574, + "learning_rate": 1.973977318571203e-05, + "loss": 0.4074, + "step": 3920 + }, + { + "epoch": 0.07281219947795027, + "grad_norm": 0.2453843206167221, + "learning_rate": 1.973950873958789e-05, + "loss": 0.2616, + "step": 3922 + }, + { + "epoch": 0.07284932961536891, + "grad_norm": 0.41919073462486267, + "learning_rate": 1.97392441609382e-05, + "loss": 0.2783, + "step": 3924 + }, + { + "epoch": 0.07288645975278754, + "grad_norm": 0.26294803619384766, + "learning_rate": 1.9738979449766575e-05, + "loss": 0.2955, + "step": 3926 + }, + { + "epoch": 0.07292358989020618, + "grad_norm": 0.24029801785945892, + "learning_rate": 1.973871460607661e-05, + "loss": 0.1671, + "step": 3928 + }, + { + "epoch": 0.07296072002762483, + "grad_norm": 0.3198600113391876, + "learning_rate": 1.9738449629871912e-05, + "loss": 0.5012, + "step": 3930 + }, + { + "epoch": 0.07299785016504345, + "grad_norm": 0.3556744158267975, + "learning_rate": 1.9738184521156082e-05, + "loss": 0.3531, + "step": 3932 + }, + { + "epoch": 0.0730349803024621, + "grad_norm": 0.3938334882259369, + "learning_rate": 1.9737919279932732e-05, + "loss": 0.4564, + "step": 3934 + }, + { + "epoch": 0.07307211043988074, + "grad_norm": 0.3393716812133789, + "learning_rate": 1.9737653906205467e-05, + "loss": 0.4631, + "step": 3936 + }, + { + "epoch": 0.07310924057729938, + "grad_norm": 0.2686874270439148, + "learning_rate": 1.97373883999779e-05, + "loss": 0.2978, + "step": 3938 + }, + { + "epoch": 0.07314637071471801, + "grad_norm": 0.38741400837898254, + "learning_rate": 1.9737122761253644e-05, + "loss": 0.3423, + "step": 3940 + }, + { + "epoch": 0.07318350085213665, + "grad_norm": 0.39600151777267456, + "learning_rate": 1.9736856990036314e-05, + "loss": 0.2621, + "step": 3942 + }, + { + "epoch": 0.0732206309895553, + "grad_norm": 0.4025113880634308, + "learning_rate": 1.9736591086329526e-05, + "loss": 0.5492, + "step": 3944 + }, + { + "epoch": 0.07325776112697394, + "grad_norm": 0.4818654954433441, + "learning_rate": 1.9736325050136898e-05, + "loss": 0.3101, + "step": 3946 + }, + { + "epoch": 0.07329489126439256, + "grad_norm": 0.2225620150566101, + "learning_rate": 1.973605888146205e-05, + "loss": 0.3906, + "step": 3948 + }, + { + "epoch": 0.0733320214018112, + "grad_norm": 0.32570746541023254, + "learning_rate": 1.9735792580308598e-05, + "loss": 0.3475, + "step": 3950 + }, + { + "epoch": 0.07336915153922985, + "grad_norm": 0.2975451648235321, + "learning_rate": 1.9735526146680176e-05, + "loss": 0.4291, + "step": 3952 + }, + { + "epoch": 0.07340628167664849, + "grad_norm": 0.3218895196914673, + "learning_rate": 1.9735259580580406e-05, + "loss": 0.3967, + "step": 3954 + }, + { + "epoch": 0.07344341181406712, + "grad_norm": 0.4532105028629303, + "learning_rate": 1.9734992882012912e-05, + "loss": 0.4763, + "step": 3956 + }, + { + "epoch": 0.07348054195148576, + "grad_norm": 0.3328412175178528, + "learning_rate": 1.9734726050981324e-05, + "loss": 0.2871, + "step": 3958 + }, + { + "epoch": 0.0735176720889044, + "grad_norm": 0.41240718960762024, + "learning_rate": 1.9734459087489274e-05, + "loss": 0.307, + "step": 3960 + }, + { + "epoch": 0.07355480222632305, + "grad_norm": 0.43801966309547424, + "learning_rate": 1.973419199154039e-05, + "loss": 0.2506, + "step": 3962 + }, + { + "epoch": 0.07359193236374167, + "grad_norm": 0.2605685293674469, + "learning_rate": 1.9733924763138315e-05, + "loss": 0.3686, + "step": 3964 + }, + { + "epoch": 0.07362906250116032, + "grad_norm": 0.3927134573459625, + "learning_rate": 1.973365740228668e-05, + "loss": 0.1737, + "step": 3966 + }, + { + "epoch": 0.07366619263857896, + "grad_norm": 0.29191920161247253, + "learning_rate": 1.973338990898912e-05, + "loss": 0.4449, + "step": 3968 + }, + { + "epoch": 0.07370332277599759, + "grad_norm": 0.29910171031951904, + "learning_rate": 1.973312228324928e-05, + "loss": 0.45, + "step": 3970 + }, + { + "epoch": 0.07374045291341623, + "grad_norm": 0.43258580565452576, + "learning_rate": 1.97328545250708e-05, + "loss": 0.2607, + "step": 3972 + }, + { + "epoch": 0.07377758305083487, + "grad_norm": 0.2679401934146881, + "learning_rate": 1.9732586634457327e-05, + "loss": 0.225, + "step": 3974 + }, + { + "epoch": 0.07381471318825351, + "grad_norm": 0.4128734767436981, + "learning_rate": 1.9732318611412498e-05, + "loss": 0.3121, + "step": 3976 + }, + { + "epoch": 0.07385184332567214, + "grad_norm": 0.33572301268577576, + "learning_rate": 1.9732050455939963e-05, + "loss": 0.2784, + "step": 3978 + }, + { + "epoch": 0.07388897346309078, + "grad_norm": 0.4437161684036255, + "learning_rate": 1.9731782168043375e-05, + "loss": 0.1874, + "step": 3980 + }, + { + "epoch": 0.07392610360050943, + "grad_norm": 0.40449434518814087, + "learning_rate": 1.9731513747726382e-05, + "loss": 0.3478, + "step": 3982 + }, + { + "epoch": 0.07396323373792807, + "grad_norm": 0.5866910815238953, + "learning_rate": 1.9731245194992637e-05, + "loss": 0.5451, + "step": 3984 + }, + { + "epoch": 0.0740003638753467, + "grad_norm": 0.41867196559906006, + "learning_rate": 1.9730976509845793e-05, + "loss": 0.2274, + "step": 3986 + }, + { + "epoch": 0.07403749401276534, + "grad_norm": 0.2776102125644684, + "learning_rate": 1.9730707692289504e-05, + "loss": 0.4524, + "step": 3988 + }, + { + "epoch": 0.07407462415018398, + "grad_norm": 0.38838690519332886, + "learning_rate": 1.9730438742327428e-05, + "loss": 0.301, + "step": 3990 + }, + { + "epoch": 0.07411175428760262, + "grad_norm": 0.36092332005500793, + "learning_rate": 1.9730169659963235e-05, + "loss": 0.3999, + "step": 3992 + }, + { + "epoch": 0.07414888442502125, + "grad_norm": 0.29829004406929016, + "learning_rate": 1.972990044520057e-05, + "loss": 0.4005, + "step": 3994 + }, + { + "epoch": 0.0741860145624399, + "grad_norm": 0.23319177329540253, + "learning_rate": 1.9729631098043108e-05, + "loss": 0.3205, + "step": 3996 + }, + { + "epoch": 0.07422314469985854, + "grad_norm": 0.2570216953754425, + "learning_rate": 1.972936161849451e-05, + "loss": 0.5175, + "step": 3998 + }, + { + "epoch": 0.07426027483727718, + "grad_norm": 0.2947014570236206, + "learning_rate": 1.9729092006558443e-05, + "loss": 0.3778, + "step": 4000 + }, + { + "epoch": 0.07429740497469581, + "grad_norm": 0.25722771883010864, + "learning_rate": 1.9728822262238575e-05, + "loss": 0.4128, + "step": 4002 + }, + { + "epoch": 0.07433453511211445, + "grad_norm": 0.45795461535453796, + "learning_rate": 1.972855238553858e-05, + "loss": 0.2863, + "step": 4004 + }, + { + "epoch": 0.07437166524953309, + "grad_norm": 0.41443169116973877, + "learning_rate": 1.9728282376462126e-05, + "loss": 0.2423, + "step": 4006 + }, + { + "epoch": 0.07440879538695172, + "grad_norm": 0.34766697883605957, + "learning_rate": 1.9728012235012887e-05, + "loss": 0.2741, + "step": 4008 + }, + { + "epoch": 0.07444592552437036, + "grad_norm": 0.33034998178482056, + "learning_rate": 1.972774196119454e-05, + "loss": 0.3662, + "step": 4010 + }, + { + "epoch": 0.074483055661789, + "grad_norm": 0.26384037733078003, + "learning_rate": 1.9727471555010767e-05, + "loss": 0.189, + "step": 4012 + }, + { + "epoch": 0.07452018579920765, + "grad_norm": 0.38193410634994507, + "learning_rate": 1.972720101646524e-05, + "loss": 0.2962, + "step": 4014 + }, + { + "epoch": 0.07455731593662628, + "grad_norm": 0.45072677731513977, + "learning_rate": 1.9726930345561644e-05, + "loss": 0.2836, + "step": 4016 + }, + { + "epoch": 0.07459444607404492, + "grad_norm": 0.3227638006210327, + "learning_rate": 1.9726659542303663e-05, + "loss": 0.4306, + "step": 4018 + }, + { + "epoch": 0.07463157621146356, + "grad_norm": 0.25836917757987976, + "learning_rate": 1.9726388606694974e-05, + "loss": 0.1992, + "step": 4020 + }, + { + "epoch": 0.0746687063488822, + "grad_norm": 0.3983252942562103, + "learning_rate": 1.9726117538739274e-05, + "loss": 0.3106, + "step": 4022 + }, + { + "epoch": 0.07470583648630083, + "grad_norm": 0.4189784824848175, + "learning_rate": 1.972584633844025e-05, + "loss": 0.4363, + "step": 4024 + }, + { + "epoch": 0.07474296662371947, + "grad_norm": 0.41066834330558777, + "learning_rate": 1.972557500580159e-05, + "loss": 0.4359, + "step": 4026 + }, + { + "epoch": 0.07478009676113812, + "grad_norm": 0.4322146773338318, + "learning_rate": 1.972530354082698e-05, + "loss": 0.4167, + "step": 4028 + }, + { + "epoch": 0.07481722689855676, + "grad_norm": 0.37955912947654724, + "learning_rate": 1.9725031943520127e-05, + "loss": 0.3324, + "step": 4030 + }, + { + "epoch": 0.07485435703597539, + "grad_norm": 0.3345361649990082, + "learning_rate": 1.972476021388471e-05, + "loss": 0.3535, + "step": 4032 + }, + { + "epoch": 0.07489148717339403, + "grad_norm": 0.2872040569782257, + "learning_rate": 1.9724488351924444e-05, + "loss": 0.1472, + "step": 4034 + }, + { + "epoch": 0.07492861731081267, + "grad_norm": 0.4636293649673462, + "learning_rate": 1.9724216357643017e-05, + "loss": 0.4432, + "step": 4036 + }, + { + "epoch": 0.07496574744823131, + "grad_norm": 0.4114382266998291, + "learning_rate": 1.972394423104413e-05, + "loss": 0.5859, + "step": 4038 + }, + { + "epoch": 0.07500287758564994, + "grad_norm": 0.3368780016899109, + "learning_rate": 1.972367197213149e-05, + "loss": 0.3176, + "step": 4040 + }, + { + "epoch": 0.07504000772306858, + "grad_norm": 0.3746050298213959, + "learning_rate": 1.97233995809088e-05, + "loss": 0.1935, + "step": 4042 + }, + { + "epoch": 0.07507713786048723, + "grad_norm": 0.3974166810512543, + "learning_rate": 1.972312705737977e-05, + "loss": 0.5018, + "step": 4044 + }, + { + "epoch": 0.07511426799790585, + "grad_norm": 0.28588545322418213, + "learning_rate": 1.97228544015481e-05, + "loss": 0.1791, + "step": 4046 + }, + { + "epoch": 0.0751513981353245, + "grad_norm": 0.3535286486148834, + "learning_rate": 1.9722581613417508e-05, + "loss": 0.3538, + "step": 4048 + }, + { + "epoch": 0.07518852827274314, + "grad_norm": 0.31911003589630127, + "learning_rate": 1.9722308692991703e-05, + "loss": 0.3646, + "step": 4050 + }, + { + "epoch": 0.07522565841016178, + "grad_norm": 0.4504331946372986, + "learning_rate": 1.9722035640274395e-05, + "loss": 0.2916, + "step": 4052 + }, + { + "epoch": 0.07526278854758041, + "grad_norm": 0.275995135307312, + "learning_rate": 1.9721762455269307e-05, + "loss": 0.2143, + "step": 4054 + }, + { + "epoch": 0.07529991868499905, + "grad_norm": 0.34938135743141174, + "learning_rate": 1.972148913798015e-05, + "loss": 0.2619, + "step": 4056 + }, + { + "epoch": 0.0753370488224177, + "grad_norm": 0.2995147109031677, + "learning_rate": 1.9721215688410647e-05, + "loss": 0.4379, + "step": 4058 + }, + { + "epoch": 0.07537417895983634, + "grad_norm": 0.325248658657074, + "learning_rate": 1.9720942106564515e-05, + "loss": 0.3159, + "step": 4060 + }, + { + "epoch": 0.07541130909725496, + "grad_norm": 0.4562129080295563, + "learning_rate": 1.9720668392445477e-05, + "loss": 0.3147, + "step": 4062 + }, + { + "epoch": 0.0754484392346736, + "grad_norm": 0.3024131655693054, + "learning_rate": 1.972039454605726e-05, + "loss": 0.2656, + "step": 4064 + }, + { + "epoch": 0.07548556937209225, + "grad_norm": 0.44684678316116333, + "learning_rate": 1.9720120567403593e-05, + "loss": 0.2539, + "step": 4066 + }, + { + "epoch": 0.07552269950951089, + "grad_norm": 0.38811197876930237, + "learning_rate": 1.9719846456488193e-05, + "loss": 0.4876, + "step": 4068 + }, + { + "epoch": 0.07555982964692952, + "grad_norm": 0.30226844549179077, + "learning_rate": 1.9719572213314802e-05, + "loss": 0.1709, + "step": 4070 + }, + { + "epoch": 0.07559695978434816, + "grad_norm": 0.3021453022956848, + "learning_rate": 1.9719297837887147e-05, + "loss": 0.2621, + "step": 4072 + }, + { + "epoch": 0.0756340899217668, + "grad_norm": 0.3178151249885559, + "learning_rate": 1.971902333020896e-05, + "loss": 0.3794, + "step": 4074 + }, + { + "epoch": 0.07567122005918545, + "grad_norm": 0.2373242825269699, + "learning_rate": 1.9718748690283974e-05, + "loss": 0.3639, + "step": 4076 + }, + { + "epoch": 0.07570835019660407, + "grad_norm": 0.34492459893226624, + "learning_rate": 1.9718473918115936e-05, + "loss": 0.444, + "step": 4078 + }, + { + "epoch": 0.07574548033402272, + "grad_norm": 0.3752194941043854, + "learning_rate": 1.9718199013708572e-05, + "loss": 0.3515, + "step": 4080 + }, + { + "epoch": 0.07578261047144136, + "grad_norm": 0.37196120619773865, + "learning_rate": 1.9717923977065633e-05, + "loss": 0.4438, + "step": 4082 + }, + { + "epoch": 0.07581974060885999, + "grad_norm": 0.24771851301193237, + "learning_rate": 1.9717648808190856e-05, + "loss": 0.2264, + "step": 4084 + }, + { + "epoch": 0.07585687074627863, + "grad_norm": 0.4009481370449066, + "learning_rate": 1.9717373507087988e-05, + "loss": 0.4171, + "step": 4086 + }, + { + "epoch": 0.07589400088369727, + "grad_norm": 0.32479986548423767, + "learning_rate": 1.971709807376077e-05, + "loss": 0.2447, + "step": 4088 + }, + { + "epoch": 0.07593113102111591, + "grad_norm": 0.3384368419647217, + "learning_rate": 1.9716822508212952e-05, + "loss": 0.4226, + "step": 4090 + }, + { + "epoch": 0.07596826115853454, + "grad_norm": 0.36190101504325867, + "learning_rate": 1.9716546810448292e-05, + "loss": 0.2607, + "step": 4092 + }, + { + "epoch": 0.07600539129595318, + "grad_norm": 0.33590051531791687, + "learning_rate": 1.9716270980470527e-05, + "loss": 0.4025, + "step": 4094 + }, + { + "epoch": 0.07604252143337183, + "grad_norm": 0.34180957078933716, + "learning_rate": 1.971599501828342e-05, + "loss": 0.3514, + "step": 4096 + }, + { + "epoch": 0.07607965157079047, + "grad_norm": 0.27896565198898315, + "learning_rate": 1.9715718923890726e-05, + "loss": 0.2845, + "step": 4098 + }, + { + "epoch": 0.0761167817082091, + "grad_norm": 0.4229014813899994, + "learning_rate": 1.97154426972962e-05, + "loss": 0.3526, + "step": 4100 + }, + { + "epoch": 0.07615391184562774, + "grad_norm": 0.4036638140678406, + "learning_rate": 1.9715166338503596e-05, + "loss": 0.342, + "step": 4102 + }, + { + "epoch": 0.07619104198304638, + "grad_norm": 0.24274615943431854, + "learning_rate": 1.9714889847516677e-05, + "loss": 0.2019, + "step": 4104 + }, + { + "epoch": 0.07622817212046502, + "grad_norm": 0.3785533308982849, + "learning_rate": 1.9714613224339212e-05, + "loss": 0.3266, + "step": 4106 + }, + { + "epoch": 0.07626530225788365, + "grad_norm": 0.3162519931793213, + "learning_rate": 1.971433646897496e-05, + "loss": 0.5281, + "step": 4108 + }, + { + "epoch": 0.0763024323953023, + "grad_norm": 0.34028100967407227, + "learning_rate": 1.9714059581427684e-05, + "loss": 0.1673, + "step": 4110 + }, + { + "epoch": 0.07633956253272094, + "grad_norm": 0.4366309940814972, + "learning_rate": 1.9713782561701152e-05, + "loss": 0.2767, + "step": 4112 + }, + { + "epoch": 0.07637669267013958, + "grad_norm": 0.4714266359806061, + "learning_rate": 1.9713505409799135e-05, + "loss": 0.3825, + "step": 4114 + }, + { + "epoch": 0.07641382280755821, + "grad_norm": 0.259251207113266, + "learning_rate": 1.9713228125725408e-05, + "loss": 0.3008, + "step": 4116 + }, + { + "epoch": 0.07645095294497685, + "grad_norm": 0.3195223808288574, + "learning_rate": 1.971295070948374e-05, + "loss": 0.3289, + "step": 4118 + }, + { + "epoch": 0.07648808308239549, + "grad_norm": 0.41382530331611633, + "learning_rate": 1.971267316107791e-05, + "loss": 0.4593, + "step": 4120 + }, + { + "epoch": 0.07652521321981412, + "grad_norm": 0.272312730550766, + "learning_rate": 1.9712395480511685e-05, + "loss": 0.3231, + "step": 4122 + }, + { + "epoch": 0.07656234335723276, + "grad_norm": 0.3412688672542572, + "learning_rate": 1.9712117667788853e-05, + "loss": 0.315, + "step": 4124 + }, + { + "epoch": 0.0765994734946514, + "grad_norm": 0.35669073462486267, + "learning_rate": 1.9711839722913188e-05, + "loss": 0.3358, + "step": 4126 + }, + { + "epoch": 0.07663660363207005, + "grad_norm": 0.3865607976913452, + "learning_rate": 1.9711561645888477e-05, + "loss": 0.423, + "step": 4128 + }, + { + "epoch": 0.07667373376948868, + "grad_norm": 0.48304542899131775, + "learning_rate": 1.97112834367185e-05, + "loss": 0.3612, + "step": 4130 + }, + { + "epoch": 0.07671086390690732, + "grad_norm": 0.3380900025367737, + "learning_rate": 1.9711005095407045e-05, + "loss": 0.4337, + "step": 4132 + }, + { + "epoch": 0.07674799404432596, + "grad_norm": 0.45825114846229553, + "learning_rate": 1.9710726621957898e-05, + "loss": 0.4028, + "step": 4134 + }, + { + "epoch": 0.0767851241817446, + "grad_norm": 0.33455803990364075, + "learning_rate": 1.971044801637485e-05, + "loss": 0.4201, + "step": 4136 + }, + { + "epoch": 0.07682225431916323, + "grad_norm": 0.4597374498844147, + "learning_rate": 1.971016927866169e-05, + "loss": 0.2897, + "step": 4138 + }, + { + "epoch": 0.07685938445658187, + "grad_norm": 0.31456583738327026, + "learning_rate": 1.9709890408822212e-05, + "loss": 0.3231, + "step": 4140 + }, + { + "epoch": 0.07689651459400051, + "grad_norm": 0.31309837102890015, + "learning_rate": 1.9709611406860206e-05, + "loss": 0.206, + "step": 4142 + }, + { + "epoch": 0.07693364473141916, + "grad_norm": 0.3602977693080902, + "learning_rate": 1.970933227277948e-05, + "loss": 0.427, + "step": 4144 + }, + { + "epoch": 0.07697077486883779, + "grad_norm": 0.6244282126426697, + "learning_rate": 1.9709053006583817e-05, + "loss": 0.4821, + "step": 4146 + }, + { + "epoch": 0.07700790500625643, + "grad_norm": 0.2669077515602112, + "learning_rate": 1.970877360827703e-05, + "loss": 0.2092, + "step": 4148 + }, + { + "epoch": 0.07704503514367507, + "grad_norm": 0.6031359434127808, + "learning_rate": 1.970849407786291e-05, + "loss": 0.2872, + "step": 4150 + }, + { + "epoch": 0.07708216528109371, + "grad_norm": 0.30174508690834045, + "learning_rate": 1.970821441534527e-05, + "loss": 0.4057, + "step": 4152 + }, + { + "epoch": 0.07711929541851234, + "grad_norm": 0.2911519706249237, + "learning_rate": 1.970793462072791e-05, + "loss": 0.4206, + "step": 4154 + }, + { + "epoch": 0.07715642555593098, + "grad_norm": 0.3112300634384155, + "learning_rate": 1.9707654694014638e-05, + "loss": 0.4081, + "step": 4156 + }, + { + "epoch": 0.07719355569334962, + "grad_norm": 0.3850885331630707, + "learning_rate": 1.9707374635209265e-05, + "loss": 0.5078, + "step": 4158 + }, + { + "epoch": 0.07723068583076825, + "grad_norm": 0.7911838889122009, + "learning_rate": 1.97070944443156e-05, + "loss": 0.2587, + "step": 4160 + }, + { + "epoch": 0.0772678159681869, + "grad_norm": 0.35940849781036377, + "learning_rate": 1.9706814121337454e-05, + "loss": 0.2604, + "step": 4162 + }, + { + "epoch": 0.07730494610560554, + "grad_norm": 0.2801651060581207, + "learning_rate": 1.9706533666278646e-05, + "loss": 0.3961, + "step": 4164 + }, + { + "epoch": 0.07734207624302418, + "grad_norm": 0.17348363995552063, + "learning_rate": 1.9706253079142987e-05, + "loss": 0.235, + "step": 4166 + }, + { + "epoch": 0.07737920638044281, + "grad_norm": 0.2846076786518097, + "learning_rate": 1.9705972359934295e-05, + "loss": 0.236, + "step": 4168 + }, + { + "epoch": 0.07741633651786145, + "grad_norm": 0.21596375107765198, + "learning_rate": 1.9705691508656396e-05, + "loss": 0.3377, + "step": 4170 + }, + { + "epoch": 0.07745346665528009, + "grad_norm": 0.27789488434791565, + "learning_rate": 1.9705410525313103e-05, + "loss": 0.4623, + "step": 4172 + }, + { + "epoch": 0.07749059679269873, + "grad_norm": 0.3166808485984802, + "learning_rate": 1.9705129409908247e-05, + "loss": 0.2986, + "step": 4174 + }, + { + "epoch": 0.07752772693011736, + "grad_norm": 0.366046279668808, + "learning_rate": 1.9704848162445652e-05, + "loss": 0.4917, + "step": 4176 + }, + { + "epoch": 0.077564857067536, + "grad_norm": 0.3062710464000702, + "learning_rate": 1.970456678292914e-05, + "loss": 0.3284, + "step": 4178 + }, + { + "epoch": 0.07760198720495465, + "grad_norm": 0.3104027509689331, + "learning_rate": 1.9704285271362545e-05, + "loss": 0.2021, + "step": 4180 + }, + { + "epoch": 0.07763911734237329, + "grad_norm": 0.23581327497959137, + "learning_rate": 1.970400362774969e-05, + "loss": 0.4941, + "step": 4182 + }, + { + "epoch": 0.07767624747979192, + "grad_norm": 0.2966723144054413, + "learning_rate": 1.970372185209442e-05, + "loss": 0.2486, + "step": 4184 + }, + { + "epoch": 0.07771337761721056, + "grad_norm": 0.47806960344314575, + "learning_rate": 1.9703439944400555e-05, + "loss": 0.3719, + "step": 4186 + }, + { + "epoch": 0.0777505077546292, + "grad_norm": 0.33518242835998535, + "learning_rate": 1.970315790467194e-05, + "loss": 0.3162, + "step": 4188 + }, + { + "epoch": 0.07778763789204785, + "grad_norm": 0.5183612704277039, + "learning_rate": 1.970287573291241e-05, + "loss": 0.1895, + "step": 4190 + }, + { + "epoch": 0.07782476802946647, + "grad_norm": 0.5089662075042725, + "learning_rate": 1.9702593429125808e-05, + "loss": 0.3401, + "step": 4192 + }, + { + "epoch": 0.07786189816688512, + "grad_norm": 0.3420623540878296, + "learning_rate": 1.9702310993315968e-05, + "loss": 0.4496, + "step": 4194 + }, + { + "epoch": 0.07789902830430376, + "grad_norm": 0.3703691363334656, + "learning_rate": 1.9702028425486735e-05, + "loss": 0.2048, + "step": 4196 + }, + { + "epoch": 0.07793615844172239, + "grad_norm": 0.41426002979278564, + "learning_rate": 1.9701745725641963e-05, + "loss": 0.4109, + "step": 4198 + }, + { + "epoch": 0.07797328857914103, + "grad_norm": 0.3361063003540039, + "learning_rate": 1.9701462893785487e-05, + "loss": 0.3102, + "step": 4200 + }, + { + "epoch": 0.07801041871655967, + "grad_norm": 0.31623128056526184, + "learning_rate": 1.970117992992116e-05, + "loss": 0.5012, + "step": 4202 + }, + { + "epoch": 0.07804754885397831, + "grad_norm": 0.2824561595916748, + "learning_rate": 1.9700896834052834e-05, + "loss": 0.352, + "step": 4204 + }, + { + "epoch": 0.07808467899139694, + "grad_norm": 0.2287907749414444, + "learning_rate": 1.970061360618436e-05, + "loss": 0.279, + "step": 4206 + }, + { + "epoch": 0.07812180912881558, + "grad_norm": 0.46598443388938904, + "learning_rate": 1.9700330246319594e-05, + "loss": 0.2332, + "step": 4208 + }, + { + "epoch": 0.07815893926623423, + "grad_norm": 0.342803418636322, + "learning_rate": 1.9700046754462384e-05, + "loss": 0.2013, + "step": 4210 + }, + { + "epoch": 0.07819606940365287, + "grad_norm": 0.38624659180641174, + "learning_rate": 1.9699763130616593e-05, + "loss": 0.4011, + "step": 4212 + }, + { + "epoch": 0.0782331995410715, + "grad_norm": 0.3567831516265869, + "learning_rate": 1.9699479374786085e-05, + "loss": 0.2867, + "step": 4214 + }, + { + "epoch": 0.07827032967849014, + "grad_norm": 0.3960648477077484, + "learning_rate": 1.969919548697471e-05, + "loss": 0.1516, + "step": 4216 + }, + { + "epoch": 0.07830745981590878, + "grad_norm": 0.2812891900539398, + "learning_rate": 1.969891146718634e-05, + "loss": 0.3329, + "step": 4218 + }, + { + "epoch": 0.07834458995332742, + "grad_norm": 0.3432716429233551, + "learning_rate": 1.9698627315424836e-05, + "loss": 0.2467, + "step": 4220 + }, + { + "epoch": 0.07838172009074605, + "grad_norm": 0.29508450627326965, + "learning_rate": 1.9698343031694067e-05, + "loss": 0.3327, + "step": 4222 + }, + { + "epoch": 0.0784188502281647, + "grad_norm": 0.39739924669265747, + "learning_rate": 1.9698058615997896e-05, + "loss": 0.3809, + "step": 4224 + }, + { + "epoch": 0.07845598036558334, + "grad_norm": 0.3428349494934082, + "learning_rate": 1.96977740683402e-05, + "loss": 0.2245, + "step": 4226 + }, + { + "epoch": 0.07849311050300198, + "grad_norm": 0.3312964141368866, + "learning_rate": 1.9697489388724845e-05, + "loss": 0.4426, + "step": 4228 + }, + { + "epoch": 0.0785302406404206, + "grad_norm": 0.2318098098039627, + "learning_rate": 1.9697204577155707e-05, + "loss": 0.2867, + "step": 4230 + }, + { + "epoch": 0.07856737077783925, + "grad_norm": 0.5615546703338623, + "learning_rate": 1.969691963363666e-05, + "loss": 0.4904, + "step": 4232 + }, + { + "epoch": 0.07860450091525789, + "grad_norm": 0.3217709958553314, + "learning_rate": 1.969663455817158e-05, + "loss": 0.5586, + "step": 4234 + }, + { + "epoch": 0.07864163105267652, + "grad_norm": 0.29852867126464844, + "learning_rate": 1.9696349350764354e-05, + "loss": 0.2817, + "step": 4236 + }, + { + "epoch": 0.07867876119009516, + "grad_norm": 0.3760199248790741, + "learning_rate": 1.9696064011418855e-05, + "loss": 0.2395, + "step": 4238 + }, + { + "epoch": 0.0787158913275138, + "grad_norm": 0.3407188951969147, + "learning_rate": 1.9695778540138967e-05, + "loss": 0.4005, + "step": 4240 + }, + { + "epoch": 0.07875302146493245, + "grad_norm": 0.35757091641426086, + "learning_rate": 1.9695492936928574e-05, + "loss": 0.3033, + "step": 4242 + }, + { + "epoch": 0.07879015160235107, + "grad_norm": 0.2942734360694885, + "learning_rate": 1.969520720179157e-05, + "loss": 0.1635, + "step": 4244 + }, + { + "epoch": 0.07882728173976972, + "grad_norm": 0.4190855920314789, + "learning_rate": 1.9694921334731833e-05, + "loss": 0.3649, + "step": 4246 + }, + { + "epoch": 0.07886441187718836, + "grad_norm": 0.33898401260375977, + "learning_rate": 1.969463533575325e-05, + "loss": 0.5075, + "step": 4248 + }, + { + "epoch": 0.078901542014607, + "grad_norm": 0.4412969946861267, + "learning_rate": 1.9694349204859726e-05, + "loss": 0.4492, + "step": 4250 + }, + { + "epoch": 0.07893867215202563, + "grad_norm": 0.3370778560638428, + "learning_rate": 1.9694062942055144e-05, + "loss": 0.3274, + "step": 4252 + }, + { + "epoch": 0.07897580228944427, + "grad_norm": 0.34823164343833923, + "learning_rate": 1.969377654734341e-05, + "loss": 0.3578, + "step": 4254 + }, + { + "epoch": 0.07901293242686291, + "grad_norm": 0.5024003386497498, + "learning_rate": 1.9693490020728404e-05, + "loss": 0.3701, + "step": 4256 + }, + { + "epoch": 0.07905006256428156, + "grad_norm": 0.4456743597984314, + "learning_rate": 1.969320336221404e-05, + "loss": 0.2405, + "step": 4258 + }, + { + "epoch": 0.07908719270170018, + "grad_norm": 0.3806392252445221, + "learning_rate": 1.9692916571804207e-05, + "loss": 0.275, + "step": 4260 + }, + { + "epoch": 0.07912432283911883, + "grad_norm": 0.3193275034427643, + "learning_rate": 1.9692629649502815e-05, + "loss": 0.4136, + "step": 4262 + }, + { + "epoch": 0.07916145297653747, + "grad_norm": 0.3430837094783783, + "learning_rate": 1.9692342595313772e-05, + "loss": 0.3763, + "step": 4264 + }, + { + "epoch": 0.07919858311395611, + "grad_norm": 0.3324311673641205, + "learning_rate": 1.9692055409240974e-05, + "loss": 0.2553, + "step": 4266 + }, + { + "epoch": 0.07923571325137474, + "grad_norm": 0.4314964711666107, + "learning_rate": 1.969176809128833e-05, + "loss": 0.2409, + "step": 4268 + }, + { + "epoch": 0.07927284338879338, + "grad_norm": 0.2835943102836609, + "learning_rate": 1.9691480641459753e-05, + "loss": 0.2757, + "step": 4270 + }, + { + "epoch": 0.07930997352621202, + "grad_norm": 0.36165377497673035, + "learning_rate": 1.969119305975916e-05, + "loss": 0.3112, + "step": 4272 + }, + { + "epoch": 0.07934710366363065, + "grad_norm": 0.3235855996608734, + "learning_rate": 1.9690905346190455e-05, + "loss": 0.4472, + "step": 4274 + }, + { + "epoch": 0.0793842338010493, + "grad_norm": 0.33653008937835693, + "learning_rate": 1.9690617500757554e-05, + "loss": 0.336, + "step": 4276 + }, + { + "epoch": 0.07942136393846794, + "grad_norm": 0.3267183005809784, + "learning_rate": 1.969032952346438e-05, + "loss": 0.3028, + "step": 4278 + }, + { + "epoch": 0.07945849407588658, + "grad_norm": 0.3111686706542969, + "learning_rate": 1.969004141431484e-05, + "loss": 0.4869, + "step": 4280 + }, + { + "epoch": 0.07949562421330521, + "grad_norm": 0.3802286386489868, + "learning_rate": 1.9689753173312865e-05, + "loss": 0.3497, + "step": 4282 + }, + { + "epoch": 0.07953275435072385, + "grad_norm": 0.31104668974876404, + "learning_rate": 1.9689464800462374e-05, + "loss": 0.4965, + "step": 4284 + }, + { + "epoch": 0.07956988448814249, + "grad_norm": 0.33291587233543396, + "learning_rate": 1.968917629576729e-05, + "loss": 0.3282, + "step": 4286 + }, + { + "epoch": 0.07960701462556113, + "grad_norm": 0.2918630838394165, + "learning_rate": 1.9688887659231538e-05, + "loss": 0.2416, + "step": 4288 + }, + { + "epoch": 0.07964414476297976, + "grad_norm": 0.4246099293231964, + "learning_rate": 1.9688598890859046e-05, + "loss": 0.343, + "step": 4290 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.39599692821502686, + "learning_rate": 1.9688309990653747e-05, + "loss": 0.2591, + "step": 4292 + }, + { + "epoch": 0.07971840503781705, + "grad_norm": 0.27274757623672485, + "learning_rate": 1.9688020958619567e-05, + "loss": 0.2699, + "step": 4294 + }, + { + "epoch": 0.07975553517523569, + "grad_norm": 0.3805229067802429, + "learning_rate": 1.968773179476044e-05, + "loss": 0.3804, + "step": 4296 + }, + { + "epoch": 0.07979266531265432, + "grad_norm": 0.33901098370552063, + "learning_rate": 1.9687442499080302e-05, + "loss": 0.4679, + "step": 4298 + }, + { + "epoch": 0.07982979545007296, + "grad_norm": 0.3738965392112732, + "learning_rate": 1.968715307158309e-05, + "loss": 0.3382, + "step": 4300 + }, + { + "epoch": 0.0798669255874916, + "grad_norm": 0.3245031237602234, + "learning_rate": 1.968686351227274e-05, + "loss": 0.4692, + "step": 4302 + }, + { + "epoch": 0.07990405572491024, + "grad_norm": 0.354218453168869, + "learning_rate": 1.9686573821153196e-05, + "loss": 0.327, + "step": 4304 + }, + { + "epoch": 0.07994118586232887, + "grad_norm": 0.2955273985862732, + "learning_rate": 1.968628399822839e-05, + "loss": 0.4704, + "step": 4306 + }, + { + "epoch": 0.07997831599974752, + "grad_norm": 0.32826635241508484, + "learning_rate": 1.968599404350228e-05, + "loss": 0.3188, + "step": 4308 + }, + { + "epoch": 0.08001544613716616, + "grad_norm": 0.32851389050483704, + "learning_rate": 1.9685703956978798e-05, + "loss": 0.3172, + "step": 4310 + }, + { + "epoch": 0.08005257627458479, + "grad_norm": 0.3576257526874542, + "learning_rate": 1.9685413738661904e-05, + "loss": 0.3116, + "step": 4312 + }, + { + "epoch": 0.08008970641200343, + "grad_norm": 0.410132497549057, + "learning_rate": 1.9685123388555533e-05, + "loss": 0.2865, + "step": 4314 + }, + { + "epoch": 0.08012683654942207, + "grad_norm": 0.438515305519104, + "learning_rate": 1.9684832906663646e-05, + "loss": 0.3851, + "step": 4316 + }, + { + "epoch": 0.08016396668684071, + "grad_norm": 0.31009963154792786, + "learning_rate": 1.9684542292990193e-05, + "loss": 0.3517, + "step": 4318 + }, + { + "epoch": 0.08020109682425934, + "grad_norm": 0.3820616602897644, + "learning_rate": 1.968425154753913e-05, + "loss": 0.3659, + "step": 4320 + }, + { + "epoch": 0.08023822696167798, + "grad_norm": 0.3518998622894287, + "learning_rate": 1.9683960670314407e-05, + "loss": 0.4084, + "step": 4322 + }, + { + "epoch": 0.08027535709909663, + "grad_norm": 0.43173128366470337, + "learning_rate": 1.9683669661319985e-05, + "loss": 0.4391, + "step": 4324 + }, + { + "epoch": 0.08031248723651527, + "grad_norm": 0.28281405568122864, + "learning_rate": 1.968337852055983e-05, + "loss": 0.3639, + "step": 4326 + }, + { + "epoch": 0.0803496173739339, + "grad_norm": 0.3975304663181305, + "learning_rate": 1.9683087248037897e-05, + "loss": 0.1999, + "step": 4328 + }, + { + "epoch": 0.08038674751135254, + "grad_norm": 0.34033700823783875, + "learning_rate": 1.968279584375815e-05, + "loss": 0.2782, + "step": 4330 + }, + { + "epoch": 0.08042387764877118, + "grad_norm": 0.5377874374389648, + "learning_rate": 1.9682504307724553e-05, + "loss": 0.3351, + "step": 4332 + }, + { + "epoch": 0.08046100778618982, + "grad_norm": 0.25888651609420776, + "learning_rate": 1.9682212639941078e-05, + "loss": 0.1861, + "step": 4334 + }, + { + "epoch": 0.08049813792360845, + "grad_norm": 0.4718436598777771, + "learning_rate": 1.968192084041169e-05, + "loss": 0.4838, + "step": 4336 + }, + { + "epoch": 0.0805352680610271, + "grad_norm": 0.3890031576156616, + "learning_rate": 1.968162890914036e-05, + "loss": 0.4469, + "step": 4338 + }, + { + "epoch": 0.08057239819844574, + "grad_norm": 0.3125271797180176, + "learning_rate": 1.968133684613106e-05, + "loss": 0.4054, + "step": 4340 + }, + { + "epoch": 0.08060952833586438, + "grad_norm": 0.3753014802932739, + "learning_rate": 1.968104465138776e-05, + "loss": 0.4793, + "step": 4342 + }, + { + "epoch": 0.080646658473283, + "grad_norm": 0.2874627709388733, + "learning_rate": 1.9680752324914447e-05, + "loss": 0.3491, + "step": 4344 + }, + { + "epoch": 0.08068378861070165, + "grad_norm": 0.4077173173427582, + "learning_rate": 1.968045986671509e-05, + "loss": 0.3322, + "step": 4346 + }, + { + "epoch": 0.08072091874812029, + "grad_norm": 0.2548542320728302, + "learning_rate": 1.968016727679367e-05, + "loss": 0.3945, + "step": 4348 + }, + { + "epoch": 0.08075804888553892, + "grad_norm": 0.3074374496936798, + "learning_rate": 1.967987455515417e-05, + "loss": 0.3587, + "step": 4350 + }, + { + "epoch": 0.08079517902295756, + "grad_norm": 0.3787061870098114, + "learning_rate": 1.9679581701800568e-05, + "loss": 0.3588, + "step": 4352 + }, + { + "epoch": 0.0808323091603762, + "grad_norm": 0.3199010193347931, + "learning_rate": 1.9679288716736854e-05, + "loss": 0.2033, + "step": 4354 + }, + { + "epoch": 0.08086943929779485, + "grad_norm": 0.3644576370716095, + "learning_rate": 1.967899559996702e-05, + "loss": 0.3484, + "step": 4356 + }, + { + "epoch": 0.08090656943521347, + "grad_norm": 0.30899494886398315, + "learning_rate": 1.967870235149504e-05, + "loss": 0.3544, + "step": 4358 + }, + { + "epoch": 0.08094369957263212, + "grad_norm": 0.409390389919281, + "learning_rate": 1.9678408971324915e-05, + "loss": 0.3397, + "step": 4360 + }, + { + "epoch": 0.08098082971005076, + "grad_norm": 0.3568435609340668, + "learning_rate": 1.9678115459460633e-05, + "loss": 0.3103, + "step": 4362 + }, + { + "epoch": 0.0810179598474694, + "grad_norm": 0.3840464651584625, + "learning_rate": 1.967782181590619e-05, + "loss": 0.4137, + "step": 4364 + }, + { + "epoch": 0.08105508998488803, + "grad_norm": 0.47364285588264465, + "learning_rate": 1.967752804066558e-05, + "loss": 0.2413, + "step": 4366 + }, + { + "epoch": 0.08109222012230667, + "grad_norm": 0.319794237613678, + "learning_rate": 1.96772341337428e-05, + "loss": 0.2091, + "step": 4368 + }, + { + "epoch": 0.08112935025972531, + "grad_norm": 0.36961498856544495, + "learning_rate": 1.9676940095141855e-05, + "loss": 0.5444, + "step": 4370 + }, + { + "epoch": 0.08116648039714396, + "grad_norm": 0.5251149535179138, + "learning_rate": 1.9676645924866737e-05, + "loss": 0.4311, + "step": 4372 + }, + { + "epoch": 0.08120361053456258, + "grad_norm": 0.38542047142982483, + "learning_rate": 1.9676351622921453e-05, + "loss": 0.3008, + "step": 4374 + }, + { + "epoch": 0.08124074067198123, + "grad_norm": 0.3419176936149597, + "learning_rate": 1.967605718931001e-05, + "loss": 0.5594, + "step": 4376 + }, + { + "epoch": 0.08127787080939987, + "grad_norm": 0.4605537950992584, + "learning_rate": 1.9675762624036408e-05, + "loss": 0.2084, + "step": 4378 + }, + { + "epoch": 0.08131500094681851, + "grad_norm": 0.30281922221183777, + "learning_rate": 1.967546792710466e-05, + "loss": 0.3386, + "step": 4380 + }, + { + "epoch": 0.08135213108423714, + "grad_norm": 0.44911879301071167, + "learning_rate": 1.9675173098518775e-05, + "loss": 0.4053, + "step": 4382 + }, + { + "epoch": 0.08138926122165578, + "grad_norm": 0.3575640022754669, + "learning_rate": 1.9674878138282767e-05, + "loss": 0.4251, + "step": 4384 + }, + { + "epoch": 0.08142639135907442, + "grad_norm": 0.3299250304698944, + "learning_rate": 1.9674583046400644e-05, + "loss": 0.2951, + "step": 4386 + }, + { + "epoch": 0.08146352149649305, + "grad_norm": 0.32510480284690857, + "learning_rate": 1.9674287822876425e-05, + "loss": 0.2401, + "step": 4388 + }, + { + "epoch": 0.0815006516339117, + "grad_norm": 0.32736584544181824, + "learning_rate": 1.9673992467714127e-05, + "loss": 0.4181, + "step": 4390 + }, + { + "epoch": 0.08153778177133034, + "grad_norm": 0.3797236382961273, + "learning_rate": 1.967369698091777e-05, + "loss": 0.1149, + "step": 4392 + }, + { + "epoch": 0.08157491190874898, + "grad_norm": 0.2579886019229889, + "learning_rate": 1.967340136249137e-05, + "loss": 0.159, + "step": 4394 + }, + { + "epoch": 0.08161204204616761, + "grad_norm": 0.18991221487522125, + "learning_rate": 1.9673105612438956e-05, + "loss": 0.2736, + "step": 4396 + }, + { + "epoch": 0.08164917218358625, + "grad_norm": 0.34827739000320435, + "learning_rate": 1.9672809730764547e-05, + "loss": 0.3956, + "step": 4398 + }, + { + "epoch": 0.08168630232100489, + "grad_norm": 0.25867408514022827, + "learning_rate": 1.9672513717472174e-05, + "loss": 0.4237, + "step": 4400 + }, + { + "epoch": 0.08172343245842353, + "grad_norm": 0.4546580910682678, + "learning_rate": 1.967221757256586e-05, + "loss": 0.1816, + "step": 4402 + }, + { + "epoch": 0.08176056259584216, + "grad_norm": 0.38588833808898926, + "learning_rate": 1.9671921296049634e-05, + "loss": 0.2671, + "step": 4404 + }, + { + "epoch": 0.0817976927332608, + "grad_norm": 0.34596455097198486, + "learning_rate": 1.967162488792753e-05, + "loss": 0.2353, + "step": 4406 + }, + { + "epoch": 0.08183482287067945, + "grad_norm": 0.3007392883300781, + "learning_rate": 1.9671328348203585e-05, + "loss": 0.3836, + "step": 4408 + }, + { + "epoch": 0.08187195300809809, + "grad_norm": 0.29349493980407715, + "learning_rate": 1.967103167688183e-05, + "loss": 0.1965, + "step": 4410 + }, + { + "epoch": 0.08190908314551672, + "grad_norm": 0.2955908477306366, + "learning_rate": 1.96707348739663e-05, + "loss": 0.4331, + "step": 4412 + }, + { + "epoch": 0.08194621328293536, + "grad_norm": 0.4445996880531311, + "learning_rate": 1.9670437939461032e-05, + "loss": 0.5324, + "step": 4414 + }, + { + "epoch": 0.081983343420354, + "grad_norm": 0.3887993097305298, + "learning_rate": 1.9670140873370074e-05, + "loss": 0.3722, + "step": 4416 + }, + { + "epoch": 0.08202047355777264, + "grad_norm": 0.38987305760383606, + "learning_rate": 1.9669843675697464e-05, + "loss": 0.4525, + "step": 4418 + }, + { + "epoch": 0.08205760369519127, + "grad_norm": 0.35607287287712097, + "learning_rate": 1.9669546346447247e-05, + "loss": 0.4329, + "step": 4420 + }, + { + "epoch": 0.08209473383260991, + "grad_norm": 1.3302582502365112, + "learning_rate": 1.9669248885623466e-05, + "loss": 0.6361, + "step": 4422 + }, + { + "epoch": 0.08213186397002856, + "grad_norm": 0.38887298107147217, + "learning_rate": 1.966895129323017e-05, + "loss": 0.2566, + "step": 4424 + }, + { + "epoch": 0.08216899410744719, + "grad_norm": 0.33553558588027954, + "learning_rate": 1.966865356927141e-05, + "loss": 0.4759, + "step": 4426 + }, + { + "epoch": 0.08220612424486583, + "grad_norm": 0.5017754435539246, + "learning_rate": 1.9668355713751235e-05, + "loss": 0.3574, + "step": 4428 + }, + { + "epoch": 0.08224325438228447, + "grad_norm": 0.27615684270858765, + "learning_rate": 1.96680577266737e-05, + "loss": 0.6518, + "step": 4430 + }, + { + "epoch": 0.08228038451970311, + "grad_norm": 0.4217571020126343, + "learning_rate": 1.9667759608042858e-05, + "loss": 0.4291, + "step": 4432 + }, + { + "epoch": 0.08231751465712174, + "grad_norm": 0.32189127802848816, + "learning_rate": 1.9667461357862768e-05, + "loss": 0.5517, + "step": 4434 + }, + { + "epoch": 0.08235464479454038, + "grad_norm": 0.3140683174133301, + "learning_rate": 1.9667162976137485e-05, + "loss": 0.375, + "step": 4436 + }, + { + "epoch": 0.08239177493195902, + "grad_norm": 0.372673898935318, + "learning_rate": 1.9666864462871066e-05, + "loss": 0.3182, + "step": 4438 + }, + { + "epoch": 0.08242890506937767, + "grad_norm": 0.39809122681617737, + "learning_rate": 1.9666565818067585e-05, + "loss": 0.2904, + "step": 4440 + }, + { + "epoch": 0.0824660352067963, + "grad_norm": 0.3632083237171173, + "learning_rate": 1.9666267041731092e-05, + "loss": 0.3647, + "step": 4442 + }, + { + "epoch": 0.08250316534421494, + "grad_norm": 0.37740546464920044, + "learning_rate": 1.966596813386566e-05, + "loss": 0.2874, + "step": 4444 + }, + { + "epoch": 0.08254029548163358, + "grad_norm": 0.3099859356880188, + "learning_rate": 1.9665669094475354e-05, + "loss": 0.3626, + "step": 4446 + }, + { + "epoch": 0.08257742561905222, + "grad_norm": 0.4261600375175476, + "learning_rate": 1.966536992356425e-05, + "loss": 0.4043, + "step": 4448 + }, + { + "epoch": 0.08261455575647085, + "grad_norm": 0.33482927083969116, + "learning_rate": 1.9665070621136403e-05, + "loss": 0.2671, + "step": 4450 + }, + { + "epoch": 0.08265168589388949, + "grad_norm": 0.32126525044441223, + "learning_rate": 1.96647711871959e-05, + "loss": 0.3348, + "step": 4452 + }, + { + "epoch": 0.08268881603130813, + "grad_norm": 0.3109072744846344, + "learning_rate": 1.966447162174681e-05, + "loss": 0.2859, + "step": 4454 + }, + { + "epoch": 0.08272594616872678, + "grad_norm": 0.37587040662765503, + "learning_rate": 1.966417192479321e-05, + "loss": 0.2943, + "step": 4456 + }, + { + "epoch": 0.0827630763061454, + "grad_norm": 0.3386680483818054, + "learning_rate": 1.9663872096339176e-05, + "loss": 0.2397, + "step": 4458 + }, + { + "epoch": 0.08280020644356405, + "grad_norm": 0.4219374656677246, + "learning_rate": 1.966357213638879e-05, + "loss": 0.4738, + "step": 4460 + }, + { + "epoch": 0.08283733658098269, + "grad_norm": 0.4060429334640503, + "learning_rate": 1.9663272044946135e-05, + "loss": 0.3928, + "step": 4462 + }, + { + "epoch": 0.08287446671840132, + "grad_norm": 0.5281952619552612, + "learning_rate": 1.9662971822015292e-05, + "loss": 0.2972, + "step": 4464 + }, + { + "epoch": 0.08291159685581996, + "grad_norm": 0.27590611577033997, + "learning_rate": 1.9662671467600342e-05, + "loss": 0.3297, + "step": 4466 + }, + { + "epoch": 0.0829487269932386, + "grad_norm": 0.3817589282989502, + "learning_rate": 1.966237098170538e-05, + "loss": 0.3604, + "step": 4468 + }, + { + "epoch": 0.08298585713065725, + "grad_norm": 0.2598402202129364, + "learning_rate": 1.9662070364334492e-05, + "loss": 0.2951, + "step": 4470 + }, + { + "epoch": 0.08302298726807587, + "grad_norm": 0.33865490555763245, + "learning_rate": 1.9661769615491765e-05, + "loss": 0.3759, + "step": 4472 + }, + { + "epoch": 0.08306011740549452, + "grad_norm": 0.3624274730682373, + "learning_rate": 1.96614687351813e-05, + "loss": 0.2334, + "step": 4474 + }, + { + "epoch": 0.08309724754291316, + "grad_norm": 0.33348944783210754, + "learning_rate": 1.9661167723407178e-05, + "loss": 0.4479, + "step": 4476 + }, + { + "epoch": 0.0831343776803318, + "grad_norm": 0.3330913782119751, + "learning_rate": 1.9660866580173503e-05, + "loss": 0.5039, + "step": 4478 + }, + { + "epoch": 0.08317150781775043, + "grad_norm": 0.37822842597961426, + "learning_rate": 1.966056530548437e-05, + "loss": 0.3913, + "step": 4480 + }, + { + "epoch": 0.08320863795516907, + "grad_norm": 0.41285809874534607, + "learning_rate": 1.9660263899343884e-05, + "loss": 0.4721, + "step": 4482 + }, + { + "epoch": 0.08324576809258771, + "grad_norm": 0.4298669695854187, + "learning_rate": 1.965996236175614e-05, + "loss": 0.5382, + "step": 4484 + }, + { + "epoch": 0.08328289823000636, + "grad_norm": 0.2986083924770355, + "learning_rate": 1.9659660692725243e-05, + "loss": 0.5577, + "step": 4486 + }, + { + "epoch": 0.08332002836742498, + "grad_norm": 0.3051527440547943, + "learning_rate": 1.96593588922553e-05, + "loss": 0.2837, + "step": 4488 + }, + { + "epoch": 0.08335715850484363, + "grad_norm": 0.35509946942329407, + "learning_rate": 1.9659056960350417e-05, + "loss": 0.2567, + "step": 4490 + }, + { + "epoch": 0.08339428864226227, + "grad_norm": 0.42667677998542786, + "learning_rate": 1.9658754897014694e-05, + "loss": 0.4816, + "step": 4492 + }, + { + "epoch": 0.08343141877968091, + "grad_norm": 0.3686564564704895, + "learning_rate": 1.9658452702252252e-05, + "loss": 0.3361, + "step": 4494 + }, + { + "epoch": 0.08346854891709954, + "grad_norm": 0.3154265880584717, + "learning_rate": 1.9658150376067203e-05, + "loss": 0.3437, + "step": 4496 + }, + { + "epoch": 0.08350567905451818, + "grad_norm": 0.4197397232055664, + "learning_rate": 1.9657847918463654e-05, + "loss": 0.478, + "step": 4498 + }, + { + "epoch": 0.08354280919193682, + "grad_norm": 0.32215428352355957, + "learning_rate": 1.9657545329445722e-05, + "loss": 0.4424, + "step": 4500 + }, + { + "epoch": 0.08357993932935545, + "grad_norm": 0.44540244340896606, + "learning_rate": 1.9657242609017526e-05, + "loss": 0.3716, + "step": 4502 + }, + { + "epoch": 0.0836170694667741, + "grad_norm": 0.4637117087841034, + "learning_rate": 1.9656939757183187e-05, + "loss": 0.2526, + "step": 4504 + }, + { + "epoch": 0.08365419960419274, + "grad_norm": 0.28792962431907654, + "learning_rate": 1.965663677394682e-05, + "loss": 0.3533, + "step": 4506 + }, + { + "epoch": 0.08369132974161138, + "grad_norm": 0.3542967140674591, + "learning_rate": 1.9656333659312557e-05, + "loss": 0.4268, + "step": 4508 + }, + { + "epoch": 0.08372845987903, + "grad_norm": 0.3405669927597046, + "learning_rate": 1.9656030413284514e-05, + "loss": 0.4391, + "step": 4510 + }, + { + "epoch": 0.08376559001644865, + "grad_norm": 0.5863144397735596, + "learning_rate": 1.9655727035866817e-05, + "loss": 0.2254, + "step": 4512 + }, + { + "epoch": 0.08380272015386729, + "grad_norm": 0.36684104800224304, + "learning_rate": 1.9655423527063603e-05, + "loss": 0.4588, + "step": 4514 + }, + { + "epoch": 0.08383985029128593, + "grad_norm": 0.32833239436149597, + "learning_rate": 1.9655119886878993e-05, + "loss": 0.4541, + "step": 4516 + }, + { + "epoch": 0.08387698042870456, + "grad_norm": 0.34907475113868713, + "learning_rate": 1.9654816115317123e-05, + "loss": 0.3647, + "step": 4518 + }, + { + "epoch": 0.0839141105661232, + "grad_norm": 0.33335158228874207, + "learning_rate": 1.9654512212382123e-05, + "loss": 0.2761, + "step": 4520 + }, + { + "epoch": 0.08395124070354185, + "grad_norm": 0.2944607138633728, + "learning_rate": 1.965420817807813e-05, + "loss": 0.3219, + "step": 4522 + }, + { + "epoch": 0.08398837084096049, + "grad_norm": 0.3625856637954712, + "learning_rate": 1.9653904012409283e-05, + "loss": 0.3374, + "step": 4524 + }, + { + "epoch": 0.08402550097837912, + "grad_norm": 0.36513465642929077, + "learning_rate": 1.9653599715379722e-05, + "loss": 0.2837, + "step": 4526 + }, + { + "epoch": 0.08406263111579776, + "grad_norm": 0.26217037439346313, + "learning_rate": 1.9653295286993584e-05, + "loss": 0.5084, + "step": 4528 + }, + { + "epoch": 0.0840997612532164, + "grad_norm": 0.30461055040359497, + "learning_rate": 1.9652990727255007e-05, + "loss": 0.4252, + "step": 4530 + }, + { + "epoch": 0.08413689139063504, + "grad_norm": 0.32422107458114624, + "learning_rate": 1.9652686036168145e-05, + "loss": 0.1812, + "step": 4532 + }, + { + "epoch": 0.08417402152805367, + "grad_norm": 0.3543776273727417, + "learning_rate": 1.9652381213737136e-05, + "loss": 0.25, + "step": 4534 + }, + { + "epoch": 0.08421115166547231, + "grad_norm": 0.44842255115509033, + "learning_rate": 1.9652076259966133e-05, + "loss": 0.4179, + "step": 4536 + }, + { + "epoch": 0.08424828180289096, + "grad_norm": 0.39853161573410034, + "learning_rate": 1.9651771174859284e-05, + "loss": 0.3174, + "step": 4538 + }, + { + "epoch": 0.08428541194030958, + "grad_norm": 0.35731151700019836, + "learning_rate": 1.965146595842074e-05, + "loss": 0.3676, + "step": 4540 + }, + { + "epoch": 0.08432254207772823, + "grad_norm": 0.3116741478443146, + "learning_rate": 1.9651160610654654e-05, + "loss": 0.1742, + "step": 4542 + }, + { + "epoch": 0.08435967221514687, + "grad_norm": 0.31342506408691406, + "learning_rate": 1.9650855131565178e-05, + "loss": 0.3839, + "step": 4544 + }, + { + "epoch": 0.08439680235256551, + "grad_norm": 0.378753125667572, + "learning_rate": 1.9650549521156474e-05, + "loss": 0.4448, + "step": 4546 + }, + { + "epoch": 0.08443393248998414, + "grad_norm": 0.33333152532577515, + "learning_rate": 1.9650243779432692e-05, + "loss": 0.443, + "step": 4548 + }, + { + "epoch": 0.08447106262740278, + "grad_norm": 0.4335893392562866, + "learning_rate": 1.9649937906398004e-05, + "loss": 0.3738, + "step": 4550 + }, + { + "epoch": 0.08450819276482142, + "grad_norm": 0.4551970958709717, + "learning_rate": 1.9649631902056564e-05, + "loss": 0.55, + "step": 4552 + }, + { + "epoch": 0.08454532290224007, + "grad_norm": 0.27086567878723145, + "learning_rate": 1.9649325766412538e-05, + "loss": 0.5156, + "step": 4554 + }, + { + "epoch": 0.0845824530396587, + "grad_norm": 0.31647568941116333, + "learning_rate": 1.9649019499470094e-05, + "loss": 0.3354, + "step": 4556 + }, + { + "epoch": 0.08461958317707734, + "grad_norm": 0.3380351662635803, + "learning_rate": 1.9648713101233393e-05, + "loss": 0.3419, + "step": 4558 + }, + { + "epoch": 0.08465671331449598, + "grad_norm": 0.39634308218955994, + "learning_rate": 1.964840657170661e-05, + "loss": 0.405, + "step": 4560 + }, + { + "epoch": 0.08469384345191462, + "grad_norm": 0.29830124974250793, + "learning_rate": 1.9648099910893915e-05, + "loss": 0.2347, + "step": 4562 + }, + { + "epoch": 0.08473097358933325, + "grad_norm": 0.27111971378326416, + "learning_rate": 1.9647793118799474e-05, + "loss": 0.3104, + "step": 4564 + }, + { + "epoch": 0.08476810372675189, + "grad_norm": 0.3366991877555847, + "learning_rate": 1.964748619542747e-05, + "loss": 0.5462, + "step": 4566 + }, + { + "epoch": 0.08480523386417053, + "grad_norm": 0.2871280014514923, + "learning_rate": 1.964717914078208e-05, + "loss": 0.4995, + "step": 4568 + }, + { + "epoch": 0.08484236400158918, + "grad_norm": 0.4819769859313965, + "learning_rate": 1.9646871954867476e-05, + "loss": 0.3203, + "step": 4570 + }, + { + "epoch": 0.0848794941390078, + "grad_norm": 0.40006667375564575, + "learning_rate": 1.964656463768784e-05, + "loss": 0.3776, + "step": 4572 + }, + { + "epoch": 0.08491662427642645, + "grad_norm": 0.18466511368751526, + "learning_rate": 1.9646257189247355e-05, + "loss": 0.261, + "step": 4574 + }, + { + "epoch": 0.08495375441384509, + "grad_norm": 0.5172551274299622, + "learning_rate": 1.96459496095502e-05, + "loss": 0.36, + "step": 4576 + }, + { + "epoch": 0.08499088455126372, + "grad_norm": 0.3059728741645813, + "learning_rate": 1.964564189860057e-05, + "loss": 0.3953, + "step": 4578 + }, + { + "epoch": 0.08502801468868236, + "grad_norm": 0.392494261264801, + "learning_rate": 1.964533405640264e-05, + "loss": 0.3833, + "step": 4580 + }, + { + "epoch": 0.085065144826101, + "grad_norm": 0.43828439712524414, + "learning_rate": 1.9645026082960606e-05, + "loss": 0.2643, + "step": 4582 + }, + { + "epoch": 0.08510227496351964, + "grad_norm": 0.40079665184020996, + "learning_rate": 1.964471797827866e-05, + "loss": 0.1689, + "step": 4584 + }, + { + "epoch": 0.08513940510093827, + "grad_norm": 0.3955659866333008, + "learning_rate": 1.9644409742360992e-05, + "loss": 0.3265, + "step": 4586 + }, + { + "epoch": 0.08517653523835692, + "grad_norm": 0.3505641520023346, + "learning_rate": 1.9644101375211794e-05, + "loss": 0.2854, + "step": 4588 + }, + { + "epoch": 0.08521366537577556, + "grad_norm": 0.4161413609981537, + "learning_rate": 1.964379287683526e-05, + "loss": 0.2986, + "step": 4590 + }, + { + "epoch": 0.0852507955131942, + "grad_norm": 0.3062976002693176, + "learning_rate": 1.9643484247235597e-05, + "loss": 0.3982, + "step": 4592 + }, + { + "epoch": 0.08528792565061283, + "grad_norm": 0.29844093322753906, + "learning_rate": 1.9643175486416998e-05, + "loss": 0.4544, + "step": 4594 + }, + { + "epoch": 0.08532505578803147, + "grad_norm": 0.29506438970565796, + "learning_rate": 1.964286659438366e-05, + "loss": 0.2951, + "step": 4596 + }, + { + "epoch": 0.08536218592545011, + "grad_norm": 0.3231109380722046, + "learning_rate": 1.9642557571139798e-05, + "loss": 0.3056, + "step": 4598 + }, + { + "epoch": 0.08539931606286875, + "grad_norm": 0.24248245358467102, + "learning_rate": 1.964224841668961e-05, + "loss": 0.381, + "step": 4600 + }, + { + "epoch": 0.08543644620028738, + "grad_norm": 0.30829092860221863, + "learning_rate": 1.96419391310373e-05, + "loss": 0.4148, + "step": 4602 + }, + { + "epoch": 0.08547357633770603, + "grad_norm": 0.6405683755874634, + "learning_rate": 1.964162971418708e-05, + "loss": 0.3978, + "step": 4604 + }, + { + "epoch": 0.08551070647512467, + "grad_norm": 0.38810399174690247, + "learning_rate": 1.9641320166143157e-05, + "loss": 0.3795, + "step": 4606 + }, + { + "epoch": 0.08554783661254331, + "grad_norm": 0.3913334608078003, + "learning_rate": 1.964101048690975e-05, + "loss": 0.2631, + "step": 4608 + }, + { + "epoch": 0.08558496674996194, + "grad_norm": 0.27588650584220886, + "learning_rate": 1.9640700676491066e-05, + "loss": 0.3818, + "step": 4610 + }, + { + "epoch": 0.08562209688738058, + "grad_norm": 0.21955938637256622, + "learning_rate": 1.9640390734891323e-05, + "loss": 0.2611, + "step": 4612 + }, + { + "epoch": 0.08565922702479922, + "grad_norm": 0.35103267431259155, + "learning_rate": 1.964008066211474e-05, + "loss": 0.2142, + "step": 4614 + }, + { + "epoch": 0.08569635716221785, + "grad_norm": 0.35324332118034363, + "learning_rate": 1.9639770458165535e-05, + "loss": 0.5041, + "step": 4616 + }, + { + "epoch": 0.0857334872996365, + "grad_norm": 0.28122270107269287, + "learning_rate": 1.9639460123047925e-05, + "loss": 0.3081, + "step": 4618 + }, + { + "epoch": 0.08577061743705514, + "grad_norm": 0.3309430480003357, + "learning_rate": 1.963914965676614e-05, + "loss": 0.4823, + "step": 4620 + }, + { + "epoch": 0.08580774757447378, + "grad_norm": 0.3529426157474518, + "learning_rate": 1.9638839059324398e-05, + "loss": 0.6578, + "step": 4622 + }, + { + "epoch": 0.0858448777118924, + "grad_norm": 0.5730276107788086, + "learning_rate": 1.963852833072693e-05, + "loss": 0.3286, + "step": 4624 + }, + { + "epoch": 0.08588200784931105, + "grad_norm": 0.347434401512146, + "learning_rate": 1.9638217470977963e-05, + "loss": 0.4237, + "step": 4626 + }, + { + "epoch": 0.08591913798672969, + "grad_norm": 0.3839362561702728, + "learning_rate": 1.9637906480081727e-05, + "loss": 0.3103, + "step": 4628 + }, + { + "epoch": 0.08595626812414833, + "grad_norm": 0.5454792976379395, + "learning_rate": 1.9637595358042446e-05, + "loss": 0.3282, + "step": 4630 + }, + { + "epoch": 0.08599339826156696, + "grad_norm": 0.28368109464645386, + "learning_rate": 1.9637284104864368e-05, + "loss": 0.4548, + "step": 4632 + }, + { + "epoch": 0.0860305283989856, + "grad_norm": 0.3019779622554779, + "learning_rate": 1.9636972720551716e-05, + "loss": 0.4542, + "step": 4634 + }, + { + "epoch": 0.08606765853640425, + "grad_norm": 0.29099613428115845, + "learning_rate": 1.9636661205108728e-05, + "loss": 0.4649, + "step": 4636 + }, + { + "epoch": 0.08610478867382289, + "grad_norm": 0.4067786633968353, + "learning_rate": 1.9636349558539652e-05, + "loss": 0.4441, + "step": 4638 + }, + { + "epoch": 0.08614191881124152, + "grad_norm": 0.420219361782074, + "learning_rate": 1.963603778084872e-05, + "loss": 0.4623, + "step": 4640 + }, + { + "epoch": 0.08617904894866016, + "grad_norm": 0.3464505672454834, + "learning_rate": 1.963572587204018e-05, + "loss": 0.1677, + "step": 4642 + }, + { + "epoch": 0.0862161790860788, + "grad_norm": 0.3247014582157135, + "learning_rate": 1.963541383211827e-05, + "loss": 0.3703, + "step": 4644 + }, + { + "epoch": 0.08625330922349744, + "grad_norm": 0.3485212028026581, + "learning_rate": 1.963510166108724e-05, + "loss": 0.2499, + "step": 4646 + }, + { + "epoch": 0.08629043936091607, + "grad_norm": 0.3235088586807251, + "learning_rate": 1.9634789358951337e-05, + "loss": 0.4659, + "step": 4648 + }, + { + "epoch": 0.08632756949833471, + "grad_norm": 0.3788321614265442, + "learning_rate": 1.963447692571481e-05, + "loss": 0.3304, + "step": 4650 + }, + { + "epoch": 0.08636469963575336, + "grad_norm": 0.4088015854358673, + "learning_rate": 1.963416436138191e-05, + "loss": 0.4006, + "step": 4652 + }, + { + "epoch": 0.08640182977317198, + "grad_norm": 0.34424135088920593, + "learning_rate": 1.9633851665956894e-05, + "loss": 0.4444, + "step": 4654 + }, + { + "epoch": 0.08643895991059063, + "grad_norm": 0.2938086688518524, + "learning_rate": 1.963353883944401e-05, + "loss": 0.2668, + "step": 4656 + }, + { + "epoch": 0.08647609004800927, + "grad_norm": 0.29691851139068604, + "learning_rate": 1.963322588184752e-05, + "loss": 0.2383, + "step": 4658 + }, + { + "epoch": 0.08651322018542791, + "grad_norm": 0.32453083992004395, + "learning_rate": 1.963291279317168e-05, + "loss": 0.2668, + "step": 4660 + }, + { + "epoch": 0.08655035032284654, + "grad_norm": 0.43395793437957764, + "learning_rate": 1.9632599573420753e-05, + "loss": 0.3599, + "step": 4662 + }, + { + "epoch": 0.08658748046026518, + "grad_norm": 0.3512604236602783, + "learning_rate": 1.9632286222598998e-05, + "loss": 0.4005, + "step": 4664 + }, + { + "epoch": 0.08662461059768382, + "grad_norm": 0.3002413511276245, + "learning_rate": 1.963197274071068e-05, + "loss": 0.3406, + "step": 4666 + }, + { + "epoch": 0.08666174073510247, + "grad_norm": 0.40121015906333923, + "learning_rate": 1.963165912776006e-05, + "loss": 0.416, + "step": 4668 + }, + { + "epoch": 0.0866988708725211, + "grad_norm": 0.3829343318939209, + "learning_rate": 1.9631345383751413e-05, + "loss": 0.2023, + "step": 4670 + }, + { + "epoch": 0.08673600100993974, + "grad_norm": 0.24035698175430298, + "learning_rate": 1.9631031508689e-05, + "loss": 0.204, + "step": 4672 + }, + { + "epoch": 0.08677313114735838, + "grad_norm": 0.30460163950920105, + "learning_rate": 1.96307175025771e-05, + "loss": 0.4296, + "step": 4674 + }, + { + "epoch": 0.08681026128477702, + "grad_norm": 0.2587651014328003, + "learning_rate": 1.9630403365419984e-05, + "loss": 0.3299, + "step": 4676 + }, + { + "epoch": 0.08684739142219565, + "grad_norm": 0.355174720287323, + "learning_rate": 1.9630089097221924e-05, + "loss": 0.1926, + "step": 4678 + }, + { + "epoch": 0.08688452155961429, + "grad_norm": 0.3191797733306885, + "learning_rate": 1.9629774697987195e-05, + "loss": 0.256, + "step": 4680 + }, + { + "epoch": 0.08692165169703293, + "grad_norm": 0.3339892327785492, + "learning_rate": 1.9629460167720075e-05, + "loss": 0.3226, + "step": 4682 + }, + { + "epoch": 0.08695878183445158, + "grad_norm": 0.36235374212265015, + "learning_rate": 1.9629145506424853e-05, + "loss": 0.315, + "step": 4684 + }, + { + "epoch": 0.0869959119718702, + "grad_norm": 0.2823745906352997, + "learning_rate": 1.9628830714105796e-05, + "loss": 0.3854, + "step": 4686 + }, + { + "epoch": 0.08703304210928885, + "grad_norm": 0.3788362145423889, + "learning_rate": 1.9628515790767196e-05, + "loss": 0.313, + "step": 4688 + }, + { + "epoch": 0.08707017224670749, + "grad_norm": 0.2732026278972626, + "learning_rate": 1.9628200736413337e-05, + "loss": 0.5059, + "step": 4690 + }, + { + "epoch": 0.08710730238412612, + "grad_norm": 0.265223890542984, + "learning_rate": 1.962788555104851e-05, + "loss": 0.2701, + "step": 4692 + }, + { + "epoch": 0.08714443252154476, + "grad_norm": 0.2825123369693756, + "learning_rate": 1.9627570234676993e-05, + "loss": 0.3289, + "step": 4694 + }, + { + "epoch": 0.0871815626589634, + "grad_norm": 0.2788553237915039, + "learning_rate": 1.9627254787303086e-05, + "loss": 0.4309, + "step": 4696 + }, + { + "epoch": 0.08721869279638204, + "grad_norm": 0.340683251619339, + "learning_rate": 1.9626939208931078e-05, + "loss": 0.3215, + "step": 4698 + }, + { + "epoch": 0.08725582293380067, + "grad_norm": 0.308350145816803, + "learning_rate": 1.9626623499565266e-05, + "loss": 0.3846, + "step": 4700 + }, + { + "epoch": 0.08729295307121931, + "grad_norm": 0.379747599363327, + "learning_rate": 1.962630765920994e-05, + "loss": 0.553, + "step": 4702 + }, + { + "epoch": 0.08733008320863796, + "grad_norm": 0.42858612537384033, + "learning_rate": 1.9625991687869402e-05, + "loss": 0.3364, + "step": 4704 + }, + { + "epoch": 0.0873672133460566, + "grad_norm": 0.38619449734687805, + "learning_rate": 1.962567558554795e-05, + "loss": 0.2324, + "step": 4706 + }, + { + "epoch": 0.08740434348347523, + "grad_norm": 0.31159231066703796, + "learning_rate": 1.9625359352249888e-05, + "loss": 0.4673, + "step": 4708 + }, + { + "epoch": 0.08744147362089387, + "grad_norm": 0.2789519131183624, + "learning_rate": 1.9625042987979512e-05, + "loss": 0.3444, + "step": 4710 + }, + { + "epoch": 0.08747860375831251, + "grad_norm": 0.31110092997550964, + "learning_rate": 1.9624726492741132e-05, + "loss": 0.3605, + "step": 4712 + }, + { + "epoch": 0.08751573389573115, + "grad_norm": 0.3585788905620575, + "learning_rate": 1.9624409866539058e-05, + "loss": 0.5106, + "step": 4714 + }, + { + "epoch": 0.08755286403314978, + "grad_norm": 0.4569585919380188, + "learning_rate": 1.962409310937759e-05, + "loss": 0.3315, + "step": 4716 + }, + { + "epoch": 0.08758999417056842, + "grad_norm": 0.3330109417438507, + "learning_rate": 1.9623776221261046e-05, + "loss": 0.3405, + "step": 4718 + }, + { + "epoch": 0.08762712430798707, + "grad_norm": 0.43835577368736267, + "learning_rate": 1.962345920219373e-05, + "loss": 0.3185, + "step": 4720 + }, + { + "epoch": 0.08766425444540571, + "grad_norm": 0.34292635321617126, + "learning_rate": 1.962314205217996e-05, + "loss": 0.4202, + "step": 4722 + }, + { + "epoch": 0.08770138458282434, + "grad_norm": 0.44344761967658997, + "learning_rate": 1.9622824771224058e-05, + "loss": 0.3566, + "step": 4724 + }, + { + "epoch": 0.08773851472024298, + "grad_norm": 0.31321951746940613, + "learning_rate": 1.962250735933033e-05, + "loss": 0.4234, + "step": 4726 + }, + { + "epoch": 0.08777564485766162, + "grad_norm": 0.5036094188690186, + "learning_rate": 1.9622189816503098e-05, + "loss": 0.5105, + "step": 4728 + }, + { + "epoch": 0.08781277499508025, + "grad_norm": 0.4505239427089691, + "learning_rate": 1.9621872142746684e-05, + "loss": 0.3808, + "step": 4730 + }, + { + "epoch": 0.08784990513249889, + "grad_norm": 0.2138471156358719, + "learning_rate": 1.9621554338065414e-05, + "loss": 0.401, + "step": 4732 + }, + { + "epoch": 0.08788703526991754, + "grad_norm": 0.4337904453277588, + "learning_rate": 1.9621236402463608e-05, + "loss": 0.3911, + "step": 4734 + }, + { + "epoch": 0.08792416540733618, + "grad_norm": 0.42932403087615967, + "learning_rate": 1.9620918335945593e-05, + "loss": 0.2703, + "step": 4736 + }, + { + "epoch": 0.0879612955447548, + "grad_norm": 1.9188131093978882, + "learning_rate": 1.96206001385157e-05, + "loss": 0.3405, + "step": 4738 + }, + { + "epoch": 0.08799842568217345, + "grad_norm": 0.3674734830856323, + "learning_rate": 1.9620281810178253e-05, + "loss": 0.2512, + "step": 4740 + }, + { + "epoch": 0.08803555581959209, + "grad_norm": 0.35753199458122253, + "learning_rate": 1.961996335093759e-05, + "loss": 0.3655, + "step": 4742 + }, + { + "epoch": 0.08807268595701073, + "grad_norm": 0.327698677778244, + "learning_rate": 1.9619644760798035e-05, + "loss": 0.2822, + "step": 4744 + }, + { + "epoch": 0.08810981609442936, + "grad_norm": 0.44055238366127014, + "learning_rate": 1.9619326039763936e-05, + "loss": 0.3258, + "step": 4746 + }, + { + "epoch": 0.088146946231848, + "grad_norm": 0.33755528926849365, + "learning_rate": 1.9619007187839618e-05, + "loss": 0.1912, + "step": 4748 + }, + { + "epoch": 0.08818407636926665, + "grad_norm": 0.3712221086025238, + "learning_rate": 1.9618688205029427e-05, + "loss": 0.1099, + "step": 4750 + }, + { + "epoch": 0.08822120650668529, + "grad_norm": 0.358967661857605, + "learning_rate": 1.9618369091337698e-05, + "loss": 0.4273, + "step": 4752 + }, + { + "epoch": 0.08825833664410392, + "grad_norm": 0.2960997521877289, + "learning_rate": 1.961804984676878e-05, + "loss": 0.1827, + "step": 4754 + }, + { + "epoch": 0.08829546678152256, + "grad_norm": 0.28012827038764954, + "learning_rate": 1.961773047132701e-05, + "loss": 0.4107, + "step": 4756 + }, + { + "epoch": 0.0883325969189412, + "grad_norm": 0.3843344748020172, + "learning_rate": 1.9617410965016736e-05, + "loss": 0.2733, + "step": 4758 + }, + { + "epoch": 0.08836972705635984, + "grad_norm": 0.33418747782707214, + "learning_rate": 1.9617091327842308e-05, + "loss": 0.4315, + "step": 4760 + }, + { + "epoch": 0.08840685719377847, + "grad_norm": 0.3174058198928833, + "learning_rate": 1.9616771559808075e-05, + "loss": 0.3955, + "step": 4762 + }, + { + "epoch": 0.08844398733119711, + "grad_norm": 0.41320618987083435, + "learning_rate": 1.9616451660918382e-05, + "loss": 0.4344, + "step": 4764 + }, + { + "epoch": 0.08848111746861576, + "grad_norm": 0.3358910381793976, + "learning_rate": 1.961613163117759e-05, + "loss": 0.2313, + "step": 4766 + }, + { + "epoch": 0.08851824760603438, + "grad_norm": 0.3710285425186157, + "learning_rate": 1.9615811470590048e-05, + "loss": 0.3123, + "step": 4768 + }, + { + "epoch": 0.08855537774345303, + "grad_norm": 0.37014228105545044, + "learning_rate": 1.9615491179160117e-05, + "loss": 0.4508, + "step": 4770 + }, + { + "epoch": 0.08859250788087167, + "grad_norm": 0.3562534749507904, + "learning_rate": 1.961517075689215e-05, + "loss": 0.3191, + "step": 4772 + }, + { + "epoch": 0.08862963801829031, + "grad_norm": 0.4657309949398041, + "learning_rate": 1.9614850203790517e-05, + "loss": 0.5744, + "step": 4774 + }, + { + "epoch": 0.08866676815570894, + "grad_norm": 0.30195385217666626, + "learning_rate": 1.9614529519859566e-05, + "loss": 0.3684, + "step": 4776 + }, + { + "epoch": 0.08870389829312758, + "grad_norm": 0.39522865414619446, + "learning_rate": 1.9614208705103667e-05, + "loss": 0.3502, + "step": 4778 + }, + { + "epoch": 0.08874102843054622, + "grad_norm": 0.22946004569530487, + "learning_rate": 1.9613887759527186e-05, + "loss": 0.3307, + "step": 4780 + }, + { + "epoch": 0.08877815856796487, + "grad_norm": 0.5596912503242493, + "learning_rate": 1.961356668313449e-05, + "loss": 0.3137, + "step": 4782 + }, + { + "epoch": 0.0888152887053835, + "grad_norm": 0.37164831161499023, + "learning_rate": 1.961324547592995e-05, + "loss": 0.2118, + "step": 4784 + }, + { + "epoch": 0.08885241884280214, + "grad_norm": 0.33855870366096497, + "learning_rate": 1.9612924137917932e-05, + "loss": 0.3222, + "step": 4786 + }, + { + "epoch": 0.08888954898022078, + "grad_norm": 0.429386168718338, + "learning_rate": 1.961260266910281e-05, + "loss": 0.4907, + "step": 4788 + }, + { + "epoch": 0.08892667911763942, + "grad_norm": 0.3675706088542938, + "learning_rate": 1.961228106948896e-05, + "loss": 0.2544, + "step": 4790 + }, + { + "epoch": 0.08896380925505805, + "grad_norm": 0.31491485238075256, + "learning_rate": 1.9611959339080756e-05, + "loss": 0.376, + "step": 4792 + }, + { + "epoch": 0.08900093939247669, + "grad_norm": 0.4208625555038452, + "learning_rate": 1.9611637477882573e-05, + "loss": 0.1293, + "step": 4794 + }, + { + "epoch": 0.08903806952989533, + "grad_norm": 0.2978513836860657, + "learning_rate": 1.9611315485898798e-05, + "loss": 0.5303, + "step": 4796 + }, + { + "epoch": 0.08907519966731398, + "grad_norm": 0.460877001285553, + "learning_rate": 1.961099336313381e-05, + "loss": 0.2822, + "step": 4798 + }, + { + "epoch": 0.0891123298047326, + "grad_norm": 0.3632570207118988, + "learning_rate": 1.9610671109591988e-05, + "loss": 0.3158, + "step": 4800 + }, + { + "epoch": 0.08914945994215125, + "grad_norm": 0.4490357041358948, + "learning_rate": 1.961034872527772e-05, + "loss": 0.2587, + "step": 4802 + }, + { + "epoch": 0.08918659007956989, + "grad_norm": 0.3445192873477936, + "learning_rate": 1.961002621019539e-05, + "loss": 0.4936, + "step": 4804 + }, + { + "epoch": 0.08922372021698852, + "grad_norm": 0.33543261885643005, + "learning_rate": 1.960970356434939e-05, + "loss": 0.435, + "step": 4806 + }, + { + "epoch": 0.08926085035440716, + "grad_norm": 0.2589396834373474, + "learning_rate": 1.960938078774411e-05, + "loss": 0.2938, + "step": 4808 + }, + { + "epoch": 0.0892979804918258, + "grad_norm": 0.42650461196899414, + "learning_rate": 1.9609057880383942e-05, + "loss": 0.196, + "step": 4810 + }, + { + "epoch": 0.08933511062924444, + "grad_norm": 0.4195377230644226, + "learning_rate": 1.9608734842273276e-05, + "loss": 0.3861, + "step": 4812 + }, + { + "epoch": 0.08937224076666307, + "grad_norm": 0.3228859305381775, + "learning_rate": 1.9608411673416513e-05, + "loss": 0.4203, + "step": 4814 + }, + { + "epoch": 0.08940937090408171, + "grad_norm": 0.3798494040966034, + "learning_rate": 1.9608088373818045e-05, + "loss": 0.5612, + "step": 4816 + }, + { + "epoch": 0.08944650104150036, + "grad_norm": 0.5108641982078552, + "learning_rate": 1.960776494348228e-05, + "loss": 0.3089, + "step": 4818 + }, + { + "epoch": 0.089483631178919, + "grad_norm": 0.45247113704681396, + "learning_rate": 1.9607441382413604e-05, + "loss": 0.2938, + "step": 4820 + }, + { + "epoch": 0.08952076131633763, + "grad_norm": 0.40164345502853394, + "learning_rate": 1.9607117690616432e-05, + "loss": 0.4178, + "step": 4822 + }, + { + "epoch": 0.08955789145375627, + "grad_norm": 0.31786108016967773, + "learning_rate": 1.9606793868095164e-05, + "loss": 0.2715, + "step": 4824 + }, + { + "epoch": 0.08959502159117491, + "grad_norm": 0.432865172624588, + "learning_rate": 1.960646991485421e-05, + "loss": 0.5264, + "step": 4826 + }, + { + "epoch": 0.08963215172859355, + "grad_norm": 0.312656044960022, + "learning_rate": 1.960614583089797e-05, + "loss": 0.4556, + "step": 4828 + }, + { + "epoch": 0.08966928186601218, + "grad_norm": 0.42316365242004395, + "learning_rate": 1.9605821616230867e-05, + "loss": 0.3504, + "step": 4830 + }, + { + "epoch": 0.08970641200343082, + "grad_norm": 0.3008486330509186, + "learning_rate": 1.96054972708573e-05, + "loss": 0.2799, + "step": 4832 + }, + { + "epoch": 0.08974354214084947, + "grad_norm": 0.38963010907173157, + "learning_rate": 1.9605172794781687e-05, + "loss": 0.3733, + "step": 4834 + }, + { + "epoch": 0.0897806722782681, + "grad_norm": 0.3395363390445709, + "learning_rate": 1.960484818800844e-05, + "loss": 0.3424, + "step": 4836 + }, + { + "epoch": 0.08981780241568674, + "grad_norm": 0.332244873046875, + "learning_rate": 1.9604523450541983e-05, + "loss": 0.2607, + "step": 4838 + }, + { + "epoch": 0.08985493255310538, + "grad_norm": 0.33453601598739624, + "learning_rate": 1.960419858238673e-05, + "loss": 0.4416, + "step": 4840 + }, + { + "epoch": 0.08989206269052402, + "grad_norm": 0.38418081402778625, + "learning_rate": 1.96038735835471e-05, + "loss": 0.3032, + "step": 4842 + }, + { + "epoch": 0.08992919282794265, + "grad_norm": 0.38488829135894775, + "learning_rate": 1.960354845402752e-05, + "loss": 0.3229, + "step": 4844 + }, + { + "epoch": 0.08996632296536129, + "grad_norm": 0.5189816355705261, + "learning_rate": 1.960322319383241e-05, + "loss": 0.3619, + "step": 4846 + }, + { + "epoch": 0.09000345310277993, + "grad_norm": 0.4044243395328522, + "learning_rate": 1.9602897802966197e-05, + "loss": 0.2732, + "step": 4848 + }, + { + "epoch": 0.09004058324019858, + "grad_norm": 0.4594970643520355, + "learning_rate": 1.960257228143331e-05, + "loss": 0.4085, + "step": 4850 + }, + { + "epoch": 0.0900777133776172, + "grad_norm": 0.42997562885284424, + "learning_rate": 1.960224662923818e-05, + "loss": 0.3644, + "step": 4852 + }, + { + "epoch": 0.09011484351503585, + "grad_norm": 0.24007850885391235, + "learning_rate": 1.9601920846385232e-05, + "loss": 0.168, + "step": 4854 + }, + { + "epoch": 0.09015197365245449, + "grad_norm": 0.286358505487442, + "learning_rate": 1.9601594932878902e-05, + "loss": 0.2796, + "step": 4856 + }, + { + "epoch": 0.09018910378987313, + "grad_norm": 0.3698863089084625, + "learning_rate": 1.960126888872363e-05, + "loss": 0.3305, + "step": 4858 + }, + { + "epoch": 0.09022623392729176, + "grad_norm": 0.41181281208992004, + "learning_rate": 1.960094271392384e-05, + "loss": 0.4354, + "step": 4860 + }, + { + "epoch": 0.0902633640647104, + "grad_norm": 0.22134605050086975, + "learning_rate": 1.960061640848398e-05, + "loss": 0.3455, + "step": 4862 + }, + { + "epoch": 0.09030049420212904, + "grad_norm": 0.4314284026622772, + "learning_rate": 1.960028997240849e-05, + "loss": 0.2922, + "step": 4864 + }, + { + "epoch": 0.09033762433954769, + "grad_norm": 0.3973320424556732, + "learning_rate": 1.959996340570181e-05, + "loss": 0.3395, + "step": 4866 + }, + { + "epoch": 0.09037475447696632, + "grad_norm": 0.40343576669692993, + "learning_rate": 1.9599636708368385e-05, + "loss": 0.2528, + "step": 4868 + }, + { + "epoch": 0.09041188461438496, + "grad_norm": 0.24007877707481384, + "learning_rate": 1.9599309880412654e-05, + "loss": 0.3304, + "step": 4870 + }, + { + "epoch": 0.0904490147518036, + "grad_norm": 0.3877570331096649, + "learning_rate": 1.9598982921839073e-05, + "loss": 0.4146, + "step": 4872 + }, + { + "epoch": 0.09048614488922223, + "grad_norm": 0.37691619992256165, + "learning_rate": 1.9598655832652082e-05, + "loss": 0.263, + "step": 4874 + }, + { + "epoch": 0.09052327502664087, + "grad_norm": 0.22564902901649475, + "learning_rate": 1.959832861285614e-05, + "loss": 0.3041, + "step": 4876 + }, + { + "epoch": 0.09056040516405951, + "grad_norm": 0.2808085083961487, + "learning_rate": 1.9598001262455694e-05, + "loss": 0.4411, + "step": 4878 + }, + { + "epoch": 0.09059753530147815, + "grad_norm": 0.3611622750759125, + "learning_rate": 1.9597673781455202e-05, + "loss": 0.3413, + "step": 4880 + }, + { + "epoch": 0.09063466543889678, + "grad_norm": 0.38722723722457886, + "learning_rate": 1.9597346169859114e-05, + "loss": 0.1954, + "step": 4882 + }, + { + "epoch": 0.09067179557631543, + "grad_norm": 0.32024624943733215, + "learning_rate": 1.95970184276719e-05, + "loss": 0.1965, + "step": 4884 + }, + { + "epoch": 0.09070892571373407, + "grad_norm": 0.3292338252067566, + "learning_rate": 1.9596690554898003e-05, + "loss": 0.3352, + "step": 4886 + }, + { + "epoch": 0.09074605585115271, + "grad_norm": 0.3589654564857483, + "learning_rate": 1.9596362551541896e-05, + "loss": 0.4066, + "step": 4888 + }, + { + "epoch": 0.09078318598857134, + "grad_norm": 0.4215356707572937, + "learning_rate": 1.9596034417608042e-05, + "loss": 0.4987, + "step": 4890 + }, + { + "epoch": 0.09082031612598998, + "grad_norm": 0.4367898106575012, + "learning_rate": 1.95957061531009e-05, + "loss": 0.399, + "step": 4892 + }, + { + "epoch": 0.09085744626340862, + "grad_norm": 0.26233628392219543, + "learning_rate": 1.959537775802494e-05, + "loss": 0.2887, + "step": 4894 + }, + { + "epoch": 0.09089457640082726, + "grad_norm": 0.32188430428504944, + "learning_rate": 1.959504923238463e-05, + "loss": 0.3507, + "step": 4896 + }, + { + "epoch": 0.0909317065382459, + "grad_norm": 0.2501329779624939, + "learning_rate": 1.9594720576184444e-05, + "loss": 0.3872, + "step": 4898 + }, + { + "epoch": 0.09096883667566454, + "grad_norm": 0.3407454490661621, + "learning_rate": 1.9594391789428847e-05, + "loss": 0.4934, + "step": 4900 + }, + { + "epoch": 0.09100596681308318, + "grad_norm": 0.3168795108795166, + "learning_rate": 1.9594062872122316e-05, + "loss": 0.2462, + "step": 4902 + }, + { + "epoch": 0.09104309695050182, + "grad_norm": 0.4565092921257019, + "learning_rate": 1.959373382426933e-05, + "loss": 0.2282, + "step": 4904 + }, + { + "epoch": 0.09108022708792045, + "grad_norm": 0.3739783465862274, + "learning_rate": 1.9593404645874357e-05, + "loss": 0.2029, + "step": 4906 + }, + { + "epoch": 0.09111735722533909, + "grad_norm": 0.44228729605674744, + "learning_rate": 1.959307533694189e-05, + "loss": 0.3279, + "step": 4908 + }, + { + "epoch": 0.09115448736275773, + "grad_norm": 0.3853660523891449, + "learning_rate": 1.9592745897476398e-05, + "loss": 0.3799, + "step": 4910 + }, + { + "epoch": 0.09119161750017636, + "grad_norm": 0.33759164810180664, + "learning_rate": 1.9592416327482368e-05, + "loss": 0.1575, + "step": 4912 + }, + { + "epoch": 0.091228747637595, + "grad_norm": 0.2535952627658844, + "learning_rate": 1.9592086626964284e-05, + "loss": 0.2389, + "step": 4914 + }, + { + "epoch": 0.09126587777501365, + "grad_norm": 0.3438250720500946, + "learning_rate": 1.9591756795926636e-05, + "loss": 0.4631, + "step": 4916 + }, + { + "epoch": 0.09130300791243229, + "grad_norm": 0.3743865191936493, + "learning_rate": 1.9591426834373906e-05, + "loss": 0.3054, + "step": 4918 + }, + { + "epoch": 0.09134013804985092, + "grad_norm": 0.30598023533821106, + "learning_rate": 1.9591096742310583e-05, + "loss": 0.3011, + "step": 4920 + }, + { + "epoch": 0.09137726818726956, + "grad_norm": 0.39038604497909546, + "learning_rate": 1.9590766519741167e-05, + "loss": 0.2636, + "step": 4922 + }, + { + "epoch": 0.0914143983246882, + "grad_norm": 0.3603297770023346, + "learning_rate": 1.9590436166670148e-05, + "loss": 0.1852, + "step": 4924 + }, + { + "epoch": 0.09145152846210684, + "grad_norm": 0.304485023021698, + "learning_rate": 1.9590105683102014e-05, + "loss": 0.4916, + "step": 4926 + }, + { + "epoch": 0.09148865859952547, + "grad_norm": 0.32898202538490295, + "learning_rate": 1.958977506904127e-05, + "loss": 0.2939, + "step": 4928 + }, + { + "epoch": 0.09152578873694411, + "grad_norm": 0.3155774474143982, + "learning_rate": 1.9589444324492413e-05, + "loss": 0.2508, + "step": 4930 + }, + { + "epoch": 0.09156291887436276, + "grad_norm": 0.41699492931365967, + "learning_rate": 1.958911344945994e-05, + "loss": 0.2851, + "step": 4932 + }, + { + "epoch": 0.0916000490117814, + "grad_norm": 0.35700055956840515, + "learning_rate": 1.9588782443948357e-05, + "loss": 0.5291, + "step": 4934 + }, + { + "epoch": 0.09163717914920003, + "grad_norm": 0.33645716309547424, + "learning_rate": 1.958845130796217e-05, + "loss": 0.2381, + "step": 4936 + }, + { + "epoch": 0.09167430928661867, + "grad_norm": 0.3360303044319153, + "learning_rate": 1.9588120041505876e-05, + "loss": 0.392, + "step": 4938 + }, + { + "epoch": 0.09171143942403731, + "grad_norm": 0.3056754171848297, + "learning_rate": 1.958778864458399e-05, + "loss": 0.4072, + "step": 4940 + }, + { + "epoch": 0.09174856956145595, + "grad_norm": 0.3132539987564087, + "learning_rate": 1.958745711720102e-05, + "loss": 0.4425, + "step": 4942 + }, + { + "epoch": 0.09178569969887458, + "grad_norm": 0.33548447489738464, + "learning_rate": 1.9587125459361474e-05, + "loss": 0.4424, + "step": 4944 + }, + { + "epoch": 0.09182282983629322, + "grad_norm": 0.31488487124443054, + "learning_rate": 1.958679367106987e-05, + "loss": 0.2995, + "step": 4946 + }, + { + "epoch": 0.09185995997371187, + "grad_norm": 0.5252687335014343, + "learning_rate": 1.958646175233072e-05, + "loss": 0.4117, + "step": 4948 + }, + { + "epoch": 0.0918970901111305, + "grad_norm": 0.3897077143192291, + "learning_rate": 1.9586129703148536e-05, + "loss": 0.3536, + "step": 4950 + }, + { + "epoch": 0.09193422024854914, + "grad_norm": 0.47567206621170044, + "learning_rate": 1.9585797523527846e-05, + "loss": 0.3898, + "step": 4952 + }, + { + "epoch": 0.09197135038596778, + "grad_norm": 0.38732656836509705, + "learning_rate": 1.9585465213473162e-05, + "loss": 0.4326, + "step": 4954 + }, + { + "epoch": 0.09200848052338642, + "grad_norm": 0.33776918053627014, + "learning_rate": 1.9585132772989007e-05, + "loss": 0.4891, + "step": 4956 + }, + { + "epoch": 0.09204561066080505, + "grad_norm": 0.2822454273700714, + "learning_rate": 1.958480020207991e-05, + "loss": 0.3671, + "step": 4958 + }, + { + "epoch": 0.09208274079822369, + "grad_norm": 0.5686154365539551, + "learning_rate": 1.958446750075039e-05, + "loss": 0.3593, + "step": 4960 + }, + { + "epoch": 0.09211987093564233, + "grad_norm": 0.3431811034679413, + "learning_rate": 1.9584134669004973e-05, + "loss": 0.4032, + "step": 4962 + }, + { + "epoch": 0.09215700107306098, + "grad_norm": 0.40061676502227783, + "learning_rate": 1.9583801706848195e-05, + "loss": 0.3971, + "step": 4964 + }, + { + "epoch": 0.0921941312104796, + "grad_norm": 0.49755749106407166, + "learning_rate": 1.958346861428458e-05, + "loss": 0.4568, + "step": 4966 + }, + { + "epoch": 0.09223126134789825, + "grad_norm": 0.3119446039199829, + "learning_rate": 1.9583135391318666e-05, + "loss": 0.37, + "step": 4968 + }, + { + "epoch": 0.09226839148531689, + "grad_norm": 0.3328152000904083, + "learning_rate": 1.958280203795498e-05, + "loss": 0.3955, + "step": 4970 + }, + { + "epoch": 0.09230552162273553, + "grad_norm": 0.33143851161003113, + "learning_rate": 1.9582468554198065e-05, + "loss": 0.3534, + "step": 4972 + }, + { + "epoch": 0.09234265176015416, + "grad_norm": 0.34953469038009644, + "learning_rate": 1.9582134940052455e-05, + "loss": 0.2167, + "step": 4974 + }, + { + "epoch": 0.0923797818975728, + "grad_norm": 0.3447701036930084, + "learning_rate": 1.9581801195522688e-05, + "loss": 0.3402, + "step": 4976 + }, + { + "epoch": 0.09241691203499144, + "grad_norm": 0.3659749925136566, + "learning_rate": 1.958146732061331e-05, + "loss": 0.5568, + "step": 4978 + }, + { + "epoch": 0.09245404217241009, + "grad_norm": 0.2932274639606476, + "learning_rate": 1.9581133315328863e-05, + "loss": 0.2979, + "step": 4980 + }, + { + "epoch": 0.09249117230982871, + "grad_norm": 0.3699193596839905, + "learning_rate": 1.9580799179673887e-05, + "loss": 0.3244, + "step": 4982 + }, + { + "epoch": 0.09252830244724736, + "grad_norm": 0.3554528057575226, + "learning_rate": 1.9580464913652934e-05, + "loss": 0.2859, + "step": 4984 + }, + { + "epoch": 0.092565432584666, + "grad_norm": 0.32328295707702637, + "learning_rate": 1.958013051727055e-05, + "loss": 0.3323, + "step": 4986 + }, + { + "epoch": 0.09260256272208463, + "grad_norm": 0.24554848670959473, + "learning_rate": 1.9579795990531284e-05, + "loss": 0.4453, + "step": 4988 + }, + { + "epoch": 0.09263969285950327, + "grad_norm": 0.3720664381980896, + "learning_rate": 1.9579461333439695e-05, + "loss": 0.3158, + "step": 4990 + }, + { + "epoch": 0.09267682299692191, + "grad_norm": 0.40524178743362427, + "learning_rate": 1.9579126546000326e-05, + "loss": 0.3411, + "step": 4992 + }, + { + "epoch": 0.09271395313434055, + "grad_norm": 0.37116512656211853, + "learning_rate": 1.957879162821774e-05, + "loss": 0.488, + "step": 4994 + }, + { + "epoch": 0.09275108327175918, + "grad_norm": 0.31278982758522034, + "learning_rate": 1.957845658009649e-05, + "loss": 0.3637, + "step": 4996 + }, + { + "epoch": 0.09278821340917782, + "grad_norm": 0.37581178545951843, + "learning_rate": 1.9578121401641135e-05, + "loss": 0.362, + "step": 4998 + }, + { + "epoch": 0.09282534354659647, + "grad_norm": 0.36685124039649963, + "learning_rate": 1.957778609285624e-05, + "loss": 0.2361, + "step": 5000 + }, + { + "epoch": 0.09286247368401511, + "grad_norm": 0.410597026348114, + "learning_rate": 1.9577450653746365e-05, + "loss": 0.2514, + "step": 5002 + }, + { + "epoch": 0.09289960382143374, + "grad_norm": 0.2828640043735504, + "learning_rate": 1.9577115084316076e-05, + "loss": 0.275, + "step": 5004 + }, + { + "epoch": 0.09293673395885238, + "grad_norm": 0.3487793803215027, + "learning_rate": 1.957677938456993e-05, + "loss": 0.5032, + "step": 5006 + }, + { + "epoch": 0.09297386409627102, + "grad_norm": 0.2856448292732239, + "learning_rate": 1.957644355451251e-05, + "loss": 0.2998, + "step": 5008 + }, + { + "epoch": 0.09301099423368966, + "grad_norm": 0.48812663555145264, + "learning_rate": 1.9576107594148377e-05, + "loss": 0.3224, + "step": 5010 + }, + { + "epoch": 0.09304812437110829, + "grad_norm": 0.23672828078269958, + "learning_rate": 1.95757715034821e-05, + "loss": 0.5631, + "step": 5012 + }, + { + "epoch": 0.09308525450852694, + "grad_norm": 0.3710114359855652, + "learning_rate": 1.957543528251826e-05, + "loss": 0.4558, + "step": 5014 + }, + { + "epoch": 0.09312238464594558, + "grad_norm": 0.4182606041431427, + "learning_rate": 1.9575098931261425e-05, + "loss": 0.1844, + "step": 5016 + }, + { + "epoch": 0.09315951478336422, + "grad_norm": 0.3306743800640106, + "learning_rate": 1.9574762449716172e-05, + "loss": 0.3151, + "step": 5018 + }, + { + "epoch": 0.09319664492078285, + "grad_norm": 0.4734524190425873, + "learning_rate": 1.9574425837887087e-05, + "loss": 0.3834, + "step": 5020 + }, + { + "epoch": 0.09323377505820149, + "grad_norm": 0.3479258120059967, + "learning_rate": 1.957408909577874e-05, + "loss": 0.1905, + "step": 5022 + }, + { + "epoch": 0.09327090519562013, + "grad_norm": 0.33753538131713867, + "learning_rate": 1.957375222339572e-05, + "loss": 0.3113, + "step": 5024 + }, + { + "epoch": 0.09330803533303876, + "grad_norm": 0.2758099436759949, + "learning_rate": 1.9573415220742613e-05, + "loss": 0.316, + "step": 5026 + }, + { + "epoch": 0.0933451654704574, + "grad_norm": 0.272198349237442, + "learning_rate": 1.9573078087823995e-05, + "loss": 0.3678, + "step": 5028 + }, + { + "epoch": 0.09338229560787605, + "grad_norm": 0.26651960611343384, + "learning_rate": 1.957274082464446e-05, + "loss": 0.2121, + "step": 5030 + }, + { + "epoch": 0.09341942574529469, + "grad_norm": 0.403515487909317, + "learning_rate": 1.95724034312086e-05, + "loss": 0.2852, + "step": 5032 + }, + { + "epoch": 0.09345655588271332, + "grad_norm": 0.22324799001216888, + "learning_rate": 1.9572065907520996e-05, + "loss": 0.2286, + "step": 5034 + }, + { + "epoch": 0.09349368602013196, + "grad_norm": 0.28638893365859985, + "learning_rate": 1.957172825358625e-05, + "loss": 0.3511, + "step": 5036 + }, + { + "epoch": 0.0935308161575506, + "grad_norm": 0.3121299743652344, + "learning_rate": 1.9571390469408956e-05, + "loss": 0.1518, + "step": 5038 + }, + { + "epoch": 0.09356794629496924, + "grad_norm": 0.25477030873298645, + "learning_rate": 1.9571052554993702e-05, + "loss": 0.4506, + "step": 5040 + }, + { + "epoch": 0.09360507643238787, + "grad_norm": 0.40355873107910156, + "learning_rate": 1.9570714510345093e-05, + "loss": 0.3028, + "step": 5042 + }, + { + "epoch": 0.09364220656980651, + "grad_norm": 0.3837955594062805, + "learning_rate": 1.9570376335467727e-05, + "loss": 0.2841, + "step": 5044 + }, + { + "epoch": 0.09367933670722516, + "grad_norm": 0.42483997344970703, + "learning_rate": 1.9570038030366207e-05, + "loss": 0.395, + "step": 5046 + }, + { + "epoch": 0.0937164668446438, + "grad_norm": 0.3975624144077301, + "learning_rate": 1.9569699595045135e-05, + "loss": 0.3393, + "step": 5048 + }, + { + "epoch": 0.09375359698206243, + "grad_norm": 0.36357682943344116, + "learning_rate": 1.9569361029509116e-05, + "loss": 0.2822, + "step": 5050 + }, + { + "epoch": 0.09379072711948107, + "grad_norm": 0.3386733829975128, + "learning_rate": 1.9569022333762757e-05, + "loss": 0.4156, + "step": 5052 + }, + { + "epoch": 0.09382785725689971, + "grad_norm": 0.3303660750389099, + "learning_rate": 1.9568683507810666e-05, + "loss": 0.3708, + "step": 5054 + }, + { + "epoch": 0.09386498739431835, + "grad_norm": 0.2501733601093292, + "learning_rate": 1.956834455165745e-05, + "loss": 0.2368, + "step": 5056 + }, + { + "epoch": 0.09390211753173698, + "grad_norm": 0.2809985876083374, + "learning_rate": 1.956800546530773e-05, + "loss": 0.3797, + "step": 5058 + }, + { + "epoch": 0.09393924766915562, + "grad_norm": 0.39119815826416016, + "learning_rate": 1.9567666248766115e-05, + "loss": 0.3922, + "step": 5060 + }, + { + "epoch": 0.09397637780657427, + "grad_norm": 0.2788567841053009, + "learning_rate": 1.956732690203722e-05, + "loss": 0.4525, + "step": 5062 + }, + { + "epoch": 0.0940135079439929, + "grad_norm": 0.42304113507270813, + "learning_rate": 1.9566987425125665e-05, + "loss": 0.3268, + "step": 5064 + }, + { + "epoch": 0.09405063808141154, + "grad_norm": 0.28994688391685486, + "learning_rate": 1.9566647818036064e-05, + "loss": 0.2162, + "step": 5066 + }, + { + "epoch": 0.09408776821883018, + "grad_norm": 0.23911580443382263, + "learning_rate": 1.9566308080773043e-05, + "loss": 0.3033, + "step": 5068 + }, + { + "epoch": 0.09412489835624882, + "grad_norm": 0.3103496730327606, + "learning_rate": 1.9565968213341224e-05, + "loss": 0.346, + "step": 5070 + }, + { + "epoch": 0.09416202849366745, + "grad_norm": 0.40186089277267456, + "learning_rate": 1.9565628215745232e-05, + "loss": 0.2603, + "step": 5072 + }, + { + "epoch": 0.09419915863108609, + "grad_norm": 0.3488561511039734, + "learning_rate": 1.9565288087989692e-05, + "loss": 0.3838, + "step": 5074 + }, + { + "epoch": 0.09423628876850473, + "grad_norm": 0.39364808797836304, + "learning_rate": 1.9564947830079232e-05, + "loss": 0.4645, + "step": 5076 + }, + { + "epoch": 0.09427341890592338, + "grad_norm": 0.3354980945587158, + "learning_rate": 1.956460744201848e-05, + "loss": 0.3684, + "step": 5078 + }, + { + "epoch": 0.094310549043342, + "grad_norm": 0.40367186069488525, + "learning_rate": 1.956426692381207e-05, + "loss": 0.3756, + "step": 5080 + }, + { + "epoch": 0.09434767918076065, + "grad_norm": 0.40898868441581726, + "learning_rate": 1.956392627546464e-05, + "loss": 0.2665, + "step": 5082 + }, + { + "epoch": 0.09438480931817929, + "grad_norm": 0.26182201504707336, + "learning_rate": 1.956358549698082e-05, + "loss": 0.4749, + "step": 5084 + }, + { + "epoch": 0.09442193945559793, + "grad_norm": 0.38462671637535095, + "learning_rate": 1.9563244588365243e-05, + "loss": 0.1524, + "step": 5086 + }, + { + "epoch": 0.09445906959301656, + "grad_norm": 0.26189687848091125, + "learning_rate": 1.956290354962256e-05, + "loss": 0.346, + "step": 5088 + }, + { + "epoch": 0.0944961997304352, + "grad_norm": 0.4743839502334595, + "learning_rate": 1.9562562380757397e-05, + "loss": 0.4745, + "step": 5090 + }, + { + "epoch": 0.09453332986785384, + "grad_norm": 0.2681953012943268, + "learning_rate": 1.9562221081774405e-05, + "loss": 0.4601, + "step": 5092 + }, + { + "epoch": 0.09457046000527249, + "grad_norm": 0.38592663407325745, + "learning_rate": 1.9561879652678225e-05, + "loss": 0.467, + "step": 5094 + }, + { + "epoch": 0.09460759014269111, + "grad_norm": 0.4928671717643738, + "learning_rate": 1.9561538093473507e-05, + "loss": 0.4582, + "step": 5096 + }, + { + "epoch": 0.09464472028010976, + "grad_norm": 0.38182348012924194, + "learning_rate": 1.9561196404164895e-05, + "loss": 0.2429, + "step": 5098 + }, + { + "epoch": 0.0946818504175284, + "grad_norm": 0.2938053011894226, + "learning_rate": 1.9560854584757037e-05, + "loss": 0.4313, + "step": 5100 + }, + { + "epoch": 0.09471898055494703, + "grad_norm": 0.3560974895954132, + "learning_rate": 1.9560512635254588e-05, + "loss": 0.2333, + "step": 5102 + }, + { + "epoch": 0.09475611069236567, + "grad_norm": 0.2717159688472748, + "learning_rate": 1.95601705556622e-05, + "loss": 0.1733, + "step": 5104 + }, + { + "epoch": 0.09479324082978431, + "grad_norm": 0.4864628314971924, + "learning_rate": 1.9559828345984524e-05, + "loss": 0.4561, + "step": 5106 + }, + { + "epoch": 0.09483037096720295, + "grad_norm": 0.4320840537548065, + "learning_rate": 1.9559486006226224e-05, + "loss": 0.3936, + "step": 5108 + }, + { + "epoch": 0.09486750110462158, + "grad_norm": 0.3660784363746643, + "learning_rate": 1.955914353639195e-05, + "loss": 0.4333, + "step": 5110 + }, + { + "epoch": 0.09490463124204022, + "grad_norm": 0.30558788776397705, + "learning_rate": 1.9558800936486365e-05, + "loss": 0.302, + "step": 5112 + }, + { + "epoch": 0.09494176137945887, + "grad_norm": 0.355805903673172, + "learning_rate": 1.955845820651413e-05, + "loss": 0.4207, + "step": 5114 + }, + { + "epoch": 0.09497889151687751, + "grad_norm": 0.36431872844696045, + "learning_rate": 1.955811534647991e-05, + "loss": 0.4426, + "step": 5116 + }, + { + "epoch": 0.09501602165429614, + "grad_norm": 0.31632253527641296, + "learning_rate": 1.9557772356388375e-05, + "loss": 0.2737, + "step": 5118 + }, + { + "epoch": 0.09505315179171478, + "grad_norm": 0.3225926458835602, + "learning_rate": 1.9557429236244183e-05, + "loss": 0.2434, + "step": 5120 + }, + { + "epoch": 0.09509028192913342, + "grad_norm": 0.4353872835636139, + "learning_rate": 1.9557085986052008e-05, + "loss": 0.4066, + "step": 5122 + }, + { + "epoch": 0.09512741206655206, + "grad_norm": 0.29980993270874023, + "learning_rate": 1.9556742605816517e-05, + "loss": 0.3311, + "step": 5124 + }, + { + "epoch": 0.09516454220397069, + "grad_norm": 0.37580832839012146, + "learning_rate": 1.9556399095542388e-05, + "loss": 0.4116, + "step": 5126 + }, + { + "epoch": 0.09520167234138933, + "grad_norm": 0.3811756670475006, + "learning_rate": 1.955605545523429e-05, + "loss": 0.2633, + "step": 5128 + }, + { + "epoch": 0.09523880247880798, + "grad_norm": 0.5881888270378113, + "learning_rate": 1.9555711684896902e-05, + "loss": 0.2603, + "step": 5130 + }, + { + "epoch": 0.09527593261622662, + "grad_norm": 0.3645700216293335, + "learning_rate": 1.95553677845349e-05, + "loss": 0.3792, + "step": 5132 + }, + { + "epoch": 0.09531306275364525, + "grad_norm": 0.48785102367401123, + "learning_rate": 1.9555023754152964e-05, + "loss": 0.3885, + "step": 5134 + }, + { + "epoch": 0.09535019289106389, + "grad_norm": 0.5230411291122437, + "learning_rate": 1.9554679593755778e-05, + "loss": 0.2939, + "step": 5136 + }, + { + "epoch": 0.09538732302848253, + "grad_norm": 0.48963722586631775, + "learning_rate": 1.9554335303348017e-05, + "loss": 0.3726, + "step": 5138 + }, + { + "epoch": 0.09542445316590116, + "grad_norm": 0.33693617582321167, + "learning_rate": 1.9553990882934377e-05, + "loss": 0.3823, + "step": 5140 + }, + { + "epoch": 0.0954615833033198, + "grad_norm": 0.47429797053337097, + "learning_rate": 1.9553646332519536e-05, + "loss": 0.2852, + "step": 5142 + }, + { + "epoch": 0.09549871344073844, + "grad_norm": 0.29600808024406433, + "learning_rate": 1.9553301652108187e-05, + "loss": 0.3106, + "step": 5144 + }, + { + "epoch": 0.09553584357815709, + "grad_norm": 0.4525470435619354, + "learning_rate": 1.9552956841705012e-05, + "loss": 0.4306, + "step": 5146 + }, + { + "epoch": 0.09557297371557572, + "grad_norm": 0.3995015621185303, + "learning_rate": 1.9552611901314715e-05, + "loss": 0.1079, + "step": 5148 + }, + { + "epoch": 0.09561010385299436, + "grad_norm": 0.3839956521987915, + "learning_rate": 1.955226683094198e-05, + "loss": 0.3808, + "step": 5150 + }, + { + "epoch": 0.095647233990413, + "grad_norm": 0.32153570652008057, + "learning_rate": 1.9551921630591505e-05, + "loss": 0.2056, + "step": 5152 + }, + { + "epoch": 0.09568436412783164, + "grad_norm": 0.4203532338142395, + "learning_rate": 1.955157630026799e-05, + "loss": 0.4167, + "step": 5154 + }, + { + "epoch": 0.09572149426525027, + "grad_norm": 0.3859817087650299, + "learning_rate": 1.955123083997613e-05, + "loss": 0.447, + "step": 5156 + }, + { + "epoch": 0.09575862440266891, + "grad_norm": 0.34617263078689575, + "learning_rate": 1.955088524972063e-05, + "loss": 0.4364, + "step": 5158 + }, + { + "epoch": 0.09579575454008755, + "grad_norm": 0.606687605381012, + "learning_rate": 1.955053952950619e-05, + "loss": 0.2586, + "step": 5160 + }, + { + "epoch": 0.0958328846775062, + "grad_norm": 0.34183990955352783, + "learning_rate": 1.9550193679337512e-05, + "loss": 0.2831, + "step": 5162 + }, + { + "epoch": 0.09587001481492483, + "grad_norm": 0.419951468706131, + "learning_rate": 1.9549847699219304e-05, + "loss": 0.3835, + "step": 5164 + }, + { + "epoch": 0.09590714495234347, + "grad_norm": 0.3562627136707306, + "learning_rate": 1.9549501589156276e-05, + "loss": 0.3247, + "step": 5166 + }, + { + "epoch": 0.09594427508976211, + "grad_norm": 0.43377891182899475, + "learning_rate": 1.9549155349153136e-05, + "loss": 0.2426, + "step": 5168 + }, + { + "epoch": 0.09598140522718075, + "grad_norm": 0.36018314957618713, + "learning_rate": 1.9548808979214594e-05, + "loss": 0.293, + "step": 5170 + }, + { + "epoch": 0.09601853536459938, + "grad_norm": 0.3135817348957062, + "learning_rate": 1.9548462479345364e-05, + "loss": 0.2077, + "step": 5172 + }, + { + "epoch": 0.09605566550201802, + "grad_norm": 0.34467533230781555, + "learning_rate": 1.9548115849550158e-05, + "loss": 0.5506, + "step": 5174 + }, + { + "epoch": 0.09609279563943667, + "grad_norm": 0.40089449286460876, + "learning_rate": 1.9547769089833697e-05, + "loss": 0.2338, + "step": 5176 + }, + { + "epoch": 0.0961299257768553, + "grad_norm": 0.3767624497413635, + "learning_rate": 1.9547422200200698e-05, + "loss": 0.5357, + "step": 5178 + }, + { + "epoch": 0.09616705591427394, + "grad_norm": 0.9575973153114319, + "learning_rate": 1.9547075180655878e-05, + "loss": 0.2131, + "step": 5180 + }, + { + "epoch": 0.09620418605169258, + "grad_norm": 0.2937701642513275, + "learning_rate": 1.9546728031203966e-05, + "loss": 0.1602, + "step": 5182 + }, + { + "epoch": 0.09624131618911122, + "grad_norm": 0.38985559344291687, + "learning_rate": 1.9546380751849678e-05, + "loss": 0.3263, + "step": 5184 + }, + { + "epoch": 0.09627844632652985, + "grad_norm": 0.32413989305496216, + "learning_rate": 1.9546033342597745e-05, + "loss": 0.3023, + "step": 5186 + }, + { + "epoch": 0.09631557646394849, + "grad_norm": 0.30475321412086487, + "learning_rate": 1.954568580345289e-05, + "loss": 0.2922, + "step": 5188 + }, + { + "epoch": 0.09635270660136713, + "grad_norm": 0.36882004141807556, + "learning_rate": 1.9545338134419844e-05, + "loss": 0.4484, + "step": 5190 + }, + { + "epoch": 0.09638983673878578, + "grad_norm": 0.29994648694992065, + "learning_rate": 1.954499033550334e-05, + "loss": 0.2438, + "step": 5192 + }, + { + "epoch": 0.0964269668762044, + "grad_norm": 0.4560507833957672, + "learning_rate": 1.9544642406708105e-05, + "loss": 0.2112, + "step": 5194 + }, + { + "epoch": 0.09646409701362305, + "grad_norm": 0.3699958622455597, + "learning_rate": 1.9544294348038878e-05, + "loss": 0.2584, + "step": 5196 + }, + { + "epoch": 0.09650122715104169, + "grad_norm": 0.3390488922595978, + "learning_rate": 1.9543946159500394e-05, + "loss": 0.1729, + "step": 5198 + }, + { + "epoch": 0.09653835728846033, + "grad_norm": 0.2970775365829468, + "learning_rate": 1.954359784109739e-05, + "loss": 0.3008, + "step": 5200 + }, + { + "epoch": 0.09657548742587896, + "grad_norm": 0.4615436792373657, + "learning_rate": 1.9543249392834603e-05, + "loss": 0.4401, + "step": 5202 + }, + { + "epoch": 0.0966126175632976, + "grad_norm": 0.4490794837474823, + "learning_rate": 1.954290081471678e-05, + "loss": 0.3737, + "step": 5204 + }, + { + "epoch": 0.09664974770071624, + "grad_norm": 0.3618118464946747, + "learning_rate": 1.954255210674866e-05, + "loss": 0.3109, + "step": 5206 + }, + { + "epoch": 0.09668687783813489, + "grad_norm": 0.4270692765712738, + "learning_rate": 1.954220326893499e-05, + "loss": 0.4952, + "step": 5208 + }, + { + "epoch": 0.09672400797555351, + "grad_norm": 0.2871691882610321, + "learning_rate": 1.9541854301280513e-05, + "loss": 0.3625, + "step": 5210 + }, + { + "epoch": 0.09676113811297216, + "grad_norm": 0.31491750478744507, + "learning_rate": 1.954150520378998e-05, + "loss": 0.3116, + "step": 5212 + }, + { + "epoch": 0.0967982682503908, + "grad_norm": 0.5523895025253296, + "learning_rate": 1.9541155976468146e-05, + "loss": 0.3392, + "step": 5214 + }, + { + "epoch": 0.09683539838780943, + "grad_norm": 0.45001524686813354, + "learning_rate": 1.9540806619319755e-05, + "loss": 0.3753, + "step": 5216 + }, + { + "epoch": 0.09687252852522807, + "grad_norm": 0.48795196413993835, + "learning_rate": 1.9540457132349565e-05, + "loss": 0.2818, + "step": 5218 + }, + { + "epoch": 0.09690965866264671, + "grad_norm": 0.34552592039108276, + "learning_rate": 1.954010751556233e-05, + "loss": 0.1735, + "step": 5220 + }, + { + "epoch": 0.09694678880006535, + "grad_norm": 0.4129865765571594, + "learning_rate": 1.953975776896281e-05, + "loss": 0.3015, + "step": 5222 + }, + { + "epoch": 0.09698391893748398, + "grad_norm": 0.43247342109680176, + "learning_rate": 1.9539407892555758e-05, + "loss": 0.2361, + "step": 5224 + }, + { + "epoch": 0.09702104907490262, + "grad_norm": 0.2945445775985718, + "learning_rate": 1.953905788634594e-05, + "loss": 0.4375, + "step": 5226 + }, + { + "epoch": 0.09705817921232127, + "grad_norm": 0.45246607065200806, + "learning_rate": 1.9538707750338115e-05, + "loss": 0.383, + "step": 5228 + }, + { + "epoch": 0.09709530934973991, + "grad_norm": 0.8424841165542603, + "learning_rate": 1.953835748453705e-05, + "loss": 0.2129, + "step": 5230 + }, + { + "epoch": 0.09713243948715854, + "grad_norm": 0.46386075019836426, + "learning_rate": 1.9538007088947513e-05, + "loss": 0.3822, + "step": 5232 + }, + { + "epoch": 0.09716956962457718, + "grad_norm": 0.35502320528030396, + "learning_rate": 1.9537656563574267e-05, + "loss": 0.2684, + "step": 5234 + }, + { + "epoch": 0.09720669976199582, + "grad_norm": 0.31353959441185, + "learning_rate": 1.9537305908422084e-05, + "loss": 0.372, + "step": 5236 + }, + { + "epoch": 0.09724382989941446, + "grad_norm": 0.32439714670181274, + "learning_rate": 1.9536955123495736e-05, + "loss": 0.5392, + "step": 5238 + }, + { + "epoch": 0.09728096003683309, + "grad_norm": 0.38243433833122253, + "learning_rate": 1.9536604208799994e-05, + "loss": 0.4967, + "step": 5240 + }, + { + "epoch": 0.09731809017425173, + "grad_norm": 0.39996588230133057, + "learning_rate": 1.9536253164339634e-05, + "loss": 0.3328, + "step": 5242 + }, + { + "epoch": 0.09735522031167038, + "grad_norm": 0.34999120235443115, + "learning_rate": 1.9535901990119434e-05, + "loss": 0.2745, + "step": 5244 + }, + { + "epoch": 0.09739235044908902, + "grad_norm": 0.5402022004127502, + "learning_rate": 1.953555068614417e-05, + "loss": 0.4193, + "step": 5246 + }, + { + "epoch": 0.09742948058650765, + "grad_norm": 0.3582979142665863, + "learning_rate": 1.9535199252418623e-05, + "loss": 0.3934, + "step": 5248 + }, + { + "epoch": 0.09746661072392629, + "grad_norm": 0.42996945977211, + "learning_rate": 1.9534847688947577e-05, + "loss": 0.2625, + "step": 5250 + }, + { + "epoch": 0.09750374086134493, + "grad_norm": 0.3576291501522064, + "learning_rate": 1.9534495995735816e-05, + "loss": 0.2958, + "step": 5252 + }, + { + "epoch": 0.09754087099876356, + "grad_norm": 0.3733225166797638, + "learning_rate": 1.953414417278812e-05, + "loss": 0.3195, + "step": 5254 + }, + { + "epoch": 0.0975780011361822, + "grad_norm": 0.43775293231010437, + "learning_rate": 1.953379222010928e-05, + "loss": 0.3097, + "step": 5256 + }, + { + "epoch": 0.09761513127360084, + "grad_norm": 0.2613062560558319, + "learning_rate": 1.953344013770409e-05, + "loss": 0.3565, + "step": 5258 + }, + { + "epoch": 0.09765226141101949, + "grad_norm": 0.31612250208854675, + "learning_rate": 1.9533087925577332e-05, + "loss": 0.2586, + "step": 5260 + }, + { + "epoch": 0.09768939154843811, + "grad_norm": 0.3904920220375061, + "learning_rate": 1.95327355837338e-05, + "loss": 0.263, + "step": 5262 + }, + { + "epoch": 0.09772652168585676, + "grad_norm": 0.37386247515678406, + "learning_rate": 1.9532383112178292e-05, + "loss": 0.5239, + "step": 5264 + }, + { + "epoch": 0.0977636518232754, + "grad_norm": 0.5240536332130432, + "learning_rate": 1.9532030510915604e-05, + "loss": 0.1838, + "step": 5266 + }, + { + "epoch": 0.09780078196069404, + "grad_norm": 0.42270922660827637, + "learning_rate": 1.953167777995053e-05, + "loss": 0.453, + "step": 5268 + }, + { + "epoch": 0.09783791209811267, + "grad_norm": 0.4509906470775604, + "learning_rate": 1.9531324919287876e-05, + "loss": 0.3839, + "step": 5270 + }, + { + "epoch": 0.09787504223553131, + "grad_norm": 0.38611701130867004, + "learning_rate": 1.9530971928932438e-05, + "loss": 0.5229, + "step": 5272 + }, + { + "epoch": 0.09791217237294995, + "grad_norm": 0.3556097745895386, + "learning_rate": 1.953061880888902e-05, + "loss": 0.3473, + "step": 5274 + }, + { + "epoch": 0.0979493025103686, + "grad_norm": 0.29100754857063293, + "learning_rate": 1.9530265559162426e-05, + "loss": 0.356, + "step": 5276 + }, + { + "epoch": 0.09798643264778722, + "grad_norm": 0.4549669921398163, + "learning_rate": 1.9529912179757468e-05, + "loss": 0.4548, + "step": 5278 + }, + { + "epoch": 0.09802356278520587, + "grad_norm": 0.47515588998794556, + "learning_rate": 1.9529558670678948e-05, + "loss": 0.3572, + "step": 5280 + }, + { + "epoch": 0.09806069292262451, + "grad_norm": 0.31426772475242615, + "learning_rate": 1.9529205031931678e-05, + "loss": 0.3466, + "step": 5282 + }, + { + "epoch": 0.09809782306004315, + "grad_norm": 0.3713514804840088, + "learning_rate": 1.9528851263520472e-05, + "loss": 0.3487, + "step": 5284 + }, + { + "epoch": 0.09813495319746178, + "grad_norm": 0.40842631459236145, + "learning_rate": 1.9528497365450142e-05, + "loss": 0.4509, + "step": 5286 + }, + { + "epoch": 0.09817208333488042, + "grad_norm": 0.3724248707294464, + "learning_rate": 1.9528143337725505e-05, + "loss": 0.4582, + "step": 5288 + }, + { + "epoch": 0.09820921347229906, + "grad_norm": 0.27923640608787537, + "learning_rate": 1.9527789180351376e-05, + "loss": 0.2932, + "step": 5290 + }, + { + "epoch": 0.09824634360971769, + "grad_norm": 0.3695182800292969, + "learning_rate": 1.9527434893332576e-05, + "loss": 0.4083, + "step": 5292 + }, + { + "epoch": 0.09828347374713634, + "grad_norm": 0.3551189601421356, + "learning_rate": 1.9527080476673924e-05, + "loss": 0.2452, + "step": 5294 + }, + { + "epoch": 0.09832060388455498, + "grad_norm": 0.3665805160999298, + "learning_rate": 1.9526725930380246e-05, + "loss": 0.4893, + "step": 5296 + }, + { + "epoch": 0.09835773402197362, + "grad_norm": 0.41149330139160156, + "learning_rate": 1.9526371254456362e-05, + "loss": 0.2324, + "step": 5298 + }, + { + "epoch": 0.09839486415939225, + "grad_norm": 0.46584969758987427, + "learning_rate": 1.9526016448907102e-05, + "loss": 0.3626, + "step": 5300 + }, + { + "epoch": 0.09843199429681089, + "grad_norm": 0.3277958035469055, + "learning_rate": 1.952566151373729e-05, + "loss": 0.1985, + "step": 5302 + }, + { + "epoch": 0.09846912443422953, + "grad_norm": 0.2779730558395386, + "learning_rate": 1.952530644895176e-05, + "loss": 0.3503, + "step": 5304 + }, + { + "epoch": 0.09850625457164817, + "grad_norm": 0.39081552624702454, + "learning_rate": 1.9524951254555335e-05, + "loss": 0.3922, + "step": 5306 + }, + { + "epoch": 0.0985433847090668, + "grad_norm": 0.26650571823120117, + "learning_rate": 1.9524595930552858e-05, + "loss": 0.3063, + "step": 5308 + }, + { + "epoch": 0.09858051484648545, + "grad_norm": 0.28204792737960815, + "learning_rate": 1.952424047694916e-05, + "loss": 0.3786, + "step": 5310 + }, + { + "epoch": 0.09861764498390409, + "grad_norm": 0.3474501967430115, + "learning_rate": 1.952388489374908e-05, + "loss": 0.2821, + "step": 5312 + }, + { + "epoch": 0.09865477512132273, + "grad_norm": 0.41910436749458313, + "learning_rate": 1.952352918095745e-05, + "loss": 0.3478, + "step": 5314 + }, + { + "epoch": 0.09869190525874136, + "grad_norm": 0.30340978503227234, + "learning_rate": 1.9523173338579112e-05, + "loss": 0.3482, + "step": 5316 + }, + { + "epoch": 0.09872903539616, + "grad_norm": 0.28818002343177795, + "learning_rate": 1.9522817366618914e-05, + "loss": 0.4146, + "step": 5318 + }, + { + "epoch": 0.09876616553357864, + "grad_norm": 0.40755388140678406, + "learning_rate": 1.9522461265081697e-05, + "loss": 0.2265, + "step": 5320 + }, + { + "epoch": 0.09880329567099728, + "grad_norm": 0.2984924912452698, + "learning_rate": 1.9522105033972304e-05, + "loss": 0.1711, + "step": 5322 + }, + { + "epoch": 0.09884042580841591, + "grad_norm": 0.32700037956237793, + "learning_rate": 1.952174867329558e-05, + "loss": 0.3759, + "step": 5324 + }, + { + "epoch": 0.09887755594583456, + "grad_norm": 0.3570864796638489, + "learning_rate": 1.952139218305638e-05, + "loss": 0.3235, + "step": 5326 + }, + { + "epoch": 0.0989146860832532, + "grad_norm": 0.32278236746788025, + "learning_rate": 1.9521035563259553e-05, + "loss": 0.3323, + "step": 5328 + }, + { + "epoch": 0.09895181622067183, + "grad_norm": 0.3525971472263336, + "learning_rate": 1.952067881390995e-05, + "loss": 0.3428, + "step": 5330 + }, + { + "epoch": 0.09898894635809047, + "grad_norm": 0.37292882800102234, + "learning_rate": 1.9520321935012427e-05, + "loss": 0.3548, + "step": 5332 + }, + { + "epoch": 0.09902607649550911, + "grad_norm": 0.4355502426624298, + "learning_rate": 1.951996492657184e-05, + "loss": 0.4244, + "step": 5334 + }, + { + "epoch": 0.09906320663292775, + "grad_norm": 1.4056992530822754, + "learning_rate": 1.9519607788593042e-05, + "loss": 0.4524, + "step": 5336 + }, + { + "epoch": 0.09910033677034638, + "grad_norm": 0.35588961839675903, + "learning_rate": 1.95192505210809e-05, + "loss": 0.4264, + "step": 5338 + }, + { + "epoch": 0.09913746690776502, + "grad_norm": 0.33673223853111267, + "learning_rate": 1.9518893124040272e-05, + "loss": 0.2656, + "step": 5340 + }, + { + "epoch": 0.09917459704518367, + "grad_norm": 0.327662855386734, + "learning_rate": 1.951853559747602e-05, + "loss": 0.185, + "step": 5342 + }, + { + "epoch": 0.09921172718260231, + "grad_norm": 0.3220953047275543, + "learning_rate": 1.951817794139301e-05, + "loss": 0.3434, + "step": 5344 + }, + { + "epoch": 0.09924885732002094, + "grad_norm": 0.30883336067199707, + "learning_rate": 1.9517820155796104e-05, + "loss": 0.2657, + "step": 5346 + }, + { + "epoch": 0.09928598745743958, + "grad_norm": 0.30116966366767883, + "learning_rate": 1.9517462240690182e-05, + "loss": 0.3001, + "step": 5348 + }, + { + "epoch": 0.09932311759485822, + "grad_norm": 0.33159059286117554, + "learning_rate": 1.95171041960801e-05, + "loss": 0.2763, + "step": 5350 + }, + { + "epoch": 0.09936024773227686, + "grad_norm": 0.31253114342689514, + "learning_rate": 1.9516746021970742e-05, + "loss": 0.4359, + "step": 5352 + }, + { + "epoch": 0.09939737786969549, + "grad_norm": 0.38211649656295776, + "learning_rate": 1.951638771836698e-05, + "loss": 0.2949, + "step": 5354 + }, + { + "epoch": 0.09943450800711413, + "grad_norm": 0.36627936363220215, + "learning_rate": 1.951602928527368e-05, + "loss": 0.2508, + "step": 5356 + }, + { + "epoch": 0.09947163814453278, + "grad_norm": 0.33420252799987793, + "learning_rate": 1.9515670722695725e-05, + "loss": 0.3512, + "step": 5358 + }, + { + "epoch": 0.09950876828195142, + "grad_norm": 0.29587164521217346, + "learning_rate": 1.9515312030637997e-05, + "loss": 0.3399, + "step": 5360 + }, + { + "epoch": 0.09954589841937005, + "grad_norm": 0.2664264142513275, + "learning_rate": 1.951495320910537e-05, + "loss": 0.3099, + "step": 5362 + }, + { + "epoch": 0.09958302855678869, + "grad_norm": 0.26353761553764343, + "learning_rate": 1.9514594258102735e-05, + "loss": 0.2825, + "step": 5364 + }, + { + "epoch": 0.09962015869420733, + "grad_norm": 0.4898408353328705, + "learning_rate": 1.9514235177634972e-05, + "loss": 0.3378, + "step": 5366 + }, + { + "epoch": 0.09965728883162596, + "grad_norm": 0.29862871766090393, + "learning_rate": 1.951387596770696e-05, + "loss": 0.2809, + "step": 5368 + }, + { + "epoch": 0.0996944189690446, + "grad_norm": 0.28889936208724976, + "learning_rate": 1.95135166283236e-05, + "loss": 0.5213, + "step": 5370 + }, + { + "epoch": 0.09973154910646324, + "grad_norm": 0.3810235559940338, + "learning_rate": 1.9513157159489772e-05, + "loss": 0.3083, + "step": 5372 + }, + { + "epoch": 0.09976867924388189, + "grad_norm": 0.4826198220252991, + "learning_rate": 1.9512797561210372e-05, + "loss": 0.2303, + "step": 5374 + }, + { + "epoch": 0.09980580938130051, + "grad_norm": 0.2561202645301819, + "learning_rate": 1.951243783349029e-05, + "loss": 0.3414, + "step": 5376 + }, + { + "epoch": 0.09984293951871916, + "grad_norm": 0.4537256062030792, + "learning_rate": 1.9512077976334424e-05, + "loss": 0.4304, + "step": 5378 + }, + { + "epoch": 0.0998800696561378, + "grad_norm": 0.30643871426582336, + "learning_rate": 1.9511717989747666e-05, + "loss": 0.3015, + "step": 5380 + }, + { + "epoch": 0.09991719979355644, + "grad_norm": 0.23798967897891998, + "learning_rate": 1.951135787373492e-05, + "loss": 0.4162, + "step": 5382 + }, + { + "epoch": 0.09995432993097507, + "grad_norm": 0.2725050449371338, + "learning_rate": 1.9510997628301083e-05, + "loss": 0.3167, + "step": 5384 + }, + { + "epoch": 0.09999146006839371, + "grad_norm": 0.22903631627559662, + "learning_rate": 1.9510637253451056e-05, + "loss": 0.3809, + "step": 5386 + }, + { + "epoch": 0.10002859020581235, + "grad_norm": 0.38538771867752075, + "learning_rate": 1.9510276749189742e-05, + "loss": 0.2494, + "step": 5388 + }, + { + "epoch": 0.100065720343231, + "grad_norm": 0.5259299874305725, + "learning_rate": 1.9509916115522048e-05, + "loss": 0.5736, + "step": 5390 + }, + { + "epoch": 0.10010285048064962, + "grad_norm": 0.3985842168331146, + "learning_rate": 1.9509555352452883e-05, + "loss": 0.239, + "step": 5392 + }, + { + "epoch": 0.10013998061806827, + "grad_norm": 0.2761503756046295, + "learning_rate": 1.9509194459987154e-05, + "loss": 0.4563, + "step": 5394 + }, + { + "epoch": 0.10017711075548691, + "grad_norm": 0.433281809091568, + "learning_rate": 1.950883343812977e-05, + "loss": 0.3784, + "step": 5396 + }, + { + "epoch": 0.10021424089290555, + "grad_norm": 0.326143354177475, + "learning_rate": 1.9508472286885646e-05, + "loss": 0.28, + "step": 5398 + }, + { + "epoch": 0.10025137103032418, + "grad_norm": 0.3231518268585205, + "learning_rate": 1.95081110062597e-05, + "loss": 0.3623, + "step": 5400 + }, + { + "epoch": 0.10028850116774282, + "grad_norm": 0.24535638093948364, + "learning_rate": 1.9507749596256836e-05, + "loss": 0.1355, + "step": 5402 + }, + { + "epoch": 0.10032563130516146, + "grad_norm": 0.33904486894607544, + "learning_rate": 1.9507388056881982e-05, + "loss": 0.5071, + "step": 5404 + }, + { + "epoch": 0.10036276144258009, + "grad_norm": 0.2902483642101288, + "learning_rate": 1.9507026388140057e-05, + "loss": 0.3308, + "step": 5406 + }, + { + "epoch": 0.10039989157999873, + "grad_norm": 0.4825877547264099, + "learning_rate": 1.9506664590035974e-05, + "loss": 0.3736, + "step": 5408 + }, + { + "epoch": 0.10043702171741738, + "grad_norm": 0.25483444333076477, + "learning_rate": 1.9506302662574666e-05, + "loss": 0.3465, + "step": 5410 + }, + { + "epoch": 0.10047415185483602, + "grad_norm": 0.40256524085998535, + "learning_rate": 1.9505940605761052e-05, + "loss": 0.3799, + "step": 5412 + }, + { + "epoch": 0.10051128199225465, + "grad_norm": 0.34911879897117615, + "learning_rate": 1.950557841960006e-05, + "loss": 0.4189, + "step": 5414 + }, + { + "epoch": 0.10054841212967329, + "grad_norm": 0.47168174386024475, + "learning_rate": 1.9505216104096617e-05, + "loss": 0.3016, + "step": 5416 + }, + { + "epoch": 0.10058554226709193, + "grad_norm": 0.327131986618042, + "learning_rate": 1.9504853659255655e-05, + "loss": 0.2868, + "step": 5418 + }, + { + "epoch": 0.10062267240451057, + "grad_norm": 0.25280463695526123, + "learning_rate": 1.950449108508211e-05, + "loss": 0.2506, + "step": 5420 + }, + { + "epoch": 0.1006598025419292, + "grad_norm": 0.2926620543003082, + "learning_rate": 1.9504128381580904e-05, + "loss": 0.5166, + "step": 5422 + }, + { + "epoch": 0.10069693267934784, + "grad_norm": 0.470705509185791, + "learning_rate": 1.9503765548756982e-05, + "loss": 0.2277, + "step": 5424 + }, + { + "epoch": 0.10073406281676649, + "grad_norm": 0.5013469457626343, + "learning_rate": 1.9503402586615276e-05, + "loss": 0.3182, + "step": 5426 + }, + { + "epoch": 0.10077119295418513, + "grad_norm": 0.29224976897239685, + "learning_rate": 1.950303949516073e-05, + "loss": 0.4388, + "step": 5428 + }, + { + "epoch": 0.10080832309160376, + "grad_norm": 0.3648039400577545, + "learning_rate": 1.9502676274398278e-05, + "loss": 0.4566, + "step": 5430 + }, + { + "epoch": 0.1008454532290224, + "grad_norm": 0.37001094222068787, + "learning_rate": 1.9502312924332866e-05, + "loss": 0.3175, + "step": 5432 + }, + { + "epoch": 0.10088258336644104, + "grad_norm": 0.28458839654922485, + "learning_rate": 1.950194944496944e-05, + "loss": 0.2475, + "step": 5434 + }, + { + "epoch": 0.10091971350385968, + "grad_norm": 0.30913984775543213, + "learning_rate": 1.950158583631294e-05, + "loss": 0.3869, + "step": 5436 + }, + { + "epoch": 0.10095684364127831, + "grad_norm": 0.3252734839916229, + "learning_rate": 1.9501222098368324e-05, + "loss": 0.3185, + "step": 5438 + }, + { + "epoch": 0.10099397377869695, + "grad_norm": 0.35855549573898315, + "learning_rate": 1.950085823114053e-05, + "loss": 0.3551, + "step": 5440 + }, + { + "epoch": 0.1010311039161156, + "grad_norm": 0.4488309323787689, + "learning_rate": 1.9500494234634514e-05, + "loss": 0.3972, + "step": 5442 + }, + { + "epoch": 0.10106823405353423, + "grad_norm": 0.2795882821083069, + "learning_rate": 1.9500130108855228e-05, + "loss": 0.3448, + "step": 5444 + }, + { + "epoch": 0.10110536419095287, + "grad_norm": 0.32300645112991333, + "learning_rate": 1.9499765853807632e-05, + "loss": 0.4746, + "step": 5446 + }, + { + "epoch": 0.10114249432837151, + "grad_norm": 0.42869728803634644, + "learning_rate": 1.949940146949667e-05, + "loss": 0.3796, + "step": 5448 + }, + { + "epoch": 0.10117962446579015, + "grad_norm": 0.3509422540664673, + "learning_rate": 1.9499036955927316e-05, + "loss": 0.3397, + "step": 5450 + }, + { + "epoch": 0.10121675460320878, + "grad_norm": 0.33761149644851685, + "learning_rate": 1.9498672313104516e-05, + "loss": 0.4286, + "step": 5452 + }, + { + "epoch": 0.10125388474062742, + "grad_norm": 0.35593512654304504, + "learning_rate": 1.949830754103324e-05, + "loss": 0.1968, + "step": 5454 + }, + { + "epoch": 0.10129101487804607, + "grad_norm": 0.5173635482788086, + "learning_rate": 1.949794263971845e-05, + "loss": 0.3539, + "step": 5456 + }, + { + "epoch": 0.10132814501546471, + "grad_norm": 0.2860984206199646, + "learning_rate": 1.9497577609165108e-05, + "loss": 0.3764, + "step": 5458 + }, + { + "epoch": 0.10136527515288334, + "grad_norm": 0.27472686767578125, + "learning_rate": 1.9497212449378185e-05, + "loss": 0.2928, + "step": 5460 + }, + { + "epoch": 0.10140240529030198, + "grad_norm": 0.3337462544441223, + "learning_rate": 1.9496847160362647e-05, + "loss": 0.3737, + "step": 5462 + }, + { + "epoch": 0.10143953542772062, + "grad_norm": 0.36239540576934814, + "learning_rate": 1.9496481742123468e-05, + "loss": 0.4175, + "step": 5464 + }, + { + "epoch": 0.10147666556513926, + "grad_norm": 0.36568373441696167, + "learning_rate": 1.9496116194665615e-05, + "loss": 0.1936, + "step": 5466 + }, + { + "epoch": 0.10151379570255789, + "grad_norm": 0.3121700882911682, + "learning_rate": 1.9495750517994065e-05, + "loss": 0.2305, + "step": 5468 + }, + { + "epoch": 0.10155092583997653, + "grad_norm": 0.4357566833496094, + "learning_rate": 1.9495384712113795e-05, + "loss": 0.3119, + "step": 5470 + }, + { + "epoch": 0.10158805597739518, + "grad_norm": 0.246283158659935, + "learning_rate": 1.9495018777029782e-05, + "loss": 0.3571, + "step": 5472 + }, + { + "epoch": 0.10162518611481382, + "grad_norm": 0.36319881677627563, + "learning_rate": 1.9494652712747e-05, + "loss": 0.2209, + "step": 5474 + }, + { + "epoch": 0.10166231625223245, + "grad_norm": 0.46618902683258057, + "learning_rate": 1.9494286519270437e-05, + "loss": 0.3714, + "step": 5476 + }, + { + "epoch": 0.10169944638965109, + "grad_norm": 0.4208773970603943, + "learning_rate": 1.9493920196605073e-05, + "loss": 0.346, + "step": 5478 + }, + { + "epoch": 0.10173657652706973, + "grad_norm": 0.3264392912387848, + "learning_rate": 1.9493553744755895e-05, + "loss": 0.2895, + "step": 5480 + }, + { + "epoch": 0.10177370666448836, + "grad_norm": 0.44028425216674805, + "learning_rate": 1.9493187163727883e-05, + "loss": 0.341, + "step": 5482 + }, + { + "epoch": 0.101810836801907, + "grad_norm": 0.540948748588562, + "learning_rate": 1.949282045352603e-05, + "loss": 0.444, + "step": 5484 + }, + { + "epoch": 0.10184796693932564, + "grad_norm": 0.437313973903656, + "learning_rate": 1.9492453614155328e-05, + "loss": 0.4575, + "step": 5486 + }, + { + "epoch": 0.10188509707674429, + "grad_norm": 0.37025347352027893, + "learning_rate": 1.949208664562076e-05, + "loss": 0.3717, + "step": 5488 + }, + { + "epoch": 0.10192222721416291, + "grad_norm": 0.38311830163002014, + "learning_rate": 1.949171954792733e-05, + "loss": 0.3728, + "step": 5490 + }, + { + "epoch": 0.10195935735158156, + "grad_norm": 0.34568244218826294, + "learning_rate": 1.9491352321080026e-05, + "loss": 0.3232, + "step": 5492 + }, + { + "epoch": 0.1019964874890002, + "grad_norm": 0.391882985830307, + "learning_rate": 1.9490984965083844e-05, + "loss": 0.1835, + "step": 5494 + }, + { + "epoch": 0.10203361762641884, + "grad_norm": 0.33830374479293823, + "learning_rate": 1.949061747994379e-05, + "loss": 0.331, + "step": 5496 + }, + { + "epoch": 0.10207074776383747, + "grad_norm": 0.305108904838562, + "learning_rate": 1.949024986566486e-05, + "loss": 0.2525, + "step": 5498 + }, + { + "epoch": 0.10210787790125611, + "grad_norm": 0.29561322927474976, + "learning_rate": 1.9489882122252054e-05, + "loss": 0.3222, + "step": 5500 + }, + { + "epoch": 0.10214500803867475, + "grad_norm": 0.31296899914741516, + "learning_rate": 1.9489514249710377e-05, + "loss": 0.2674, + "step": 5502 + }, + { + "epoch": 0.1021821381760934, + "grad_norm": 0.41582176089286804, + "learning_rate": 1.9489146248044836e-05, + "loss": 0.3928, + "step": 5504 + }, + { + "epoch": 0.10221926831351202, + "grad_norm": 0.31509995460510254, + "learning_rate": 1.9488778117260438e-05, + "loss": 0.4059, + "step": 5506 + }, + { + "epoch": 0.10225639845093067, + "grad_norm": 0.4125397503376007, + "learning_rate": 1.948840985736219e-05, + "loss": 0.4757, + "step": 5508 + }, + { + "epoch": 0.10229352858834931, + "grad_norm": 0.30273544788360596, + "learning_rate": 1.948804146835511e-05, + "loss": 0.2823, + "step": 5510 + }, + { + "epoch": 0.10233065872576795, + "grad_norm": 0.46629342436790466, + "learning_rate": 1.9487672950244203e-05, + "loss": 0.4176, + "step": 5512 + }, + { + "epoch": 0.10236778886318658, + "grad_norm": 0.2867383062839508, + "learning_rate": 1.9487304303034483e-05, + "loss": 0.2607, + "step": 5514 + }, + { + "epoch": 0.10240491900060522, + "grad_norm": 0.25056639313697815, + "learning_rate": 1.9486935526730973e-05, + "loss": 0.293, + "step": 5516 + }, + { + "epoch": 0.10244204913802386, + "grad_norm": 0.386088490486145, + "learning_rate": 1.9486566621338685e-05, + "loss": 0.1782, + "step": 5518 + }, + { + "epoch": 0.10247917927544249, + "grad_norm": 0.40196260809898376, + "learning_rate": 1.9486197586862642e-05, + "loss": 0.2662, + "step": 5520 + }, + { + "epoch": 0.10251630941286113, + "grad_norm": 0.7502794861793518, + "learning_rate": 1.9485828423307865e-05, + "loss": 0.4629, + "step": 5522 + }, + { + "epoch": 0.10255343955027978, + "grad_norm": 0.3743736147880554, + "learning_rate": 1.9485459130679375e-05, + "loss": 0.2583, + "step": 5524 + }, + { + "epoch": 0.10259056968769842, + "grad_norm": 0.3248589336872101, + "learning_rate": 1.94850897089822e-05, + "loss": 0.4207, + "step": 5526 + }, + { + "epoch": 0.10262769982511705, + "grad_norm": 0.3544054925441742, + "learning_rate": 1.9484720158221365e-05, + "loss": 0.2415, + "step": 5528 + }, + { + "epoch": 0.10266482996253569, + "grad_norm": 0.6023651361465454, + "learning_rate": 1.9484350478401897e-05, + "loss": 0.2938, + "step": 5530 + }, + { + "epoch": 0.10270196009995433, + "grad_norm": 0.3340618908405304, + "learning_rate": 1.948398066952883e-05, + "loss": 0.2678, + "step": 5532 + }, + { + "epoch": 0.10273909023737297, + "grad_norm": 0.3078499138355255, + "learning_rate": 1.9483610731607193e-05, + "loss": 0.3749, + "step": 5534 + }, + { + "epoch": 0.1027762203747916, + "grad_norm": 0.3383847177028656, + "learning_rate": 1.9483240664642016e-05, + "loss": 0.3723, + "step": 5536 + }, + { + "epoch": 0.10281335051221024, + "grad_norm": 0.3159855306148529, + "learning_rate": 1.9482870468638346e-05, + "loss": 0.3411, + "step": 5538 + }, + { + "epoch": 0.10285048064962889, + "grad_norm": 0.49799844622612, + "learning_rate": 1.948250014360121e-05, + "loss": 0.2778, + "step": 5540 + }, + { + "epoch": 0.10288761078704753, + "grad_norm": 0.41063013672828674, + "learning_rate": 1.9482129689535655e-05, + "loss": 0.2612, + "step": 5542 + }, + { + "epoch": 0.10292474092446616, + "grad_norm": 0.4678843021392822, + "learning_rate": 1.948175910644671e-05, + "loss": 0.3867, + "step": 5544 + }, + { + "epoch": 0.1029618710618848, + "grad_norm": 0.49532073736190796, + "learning_rate": 1.9481388394339428e-05, + "loss": 0.5261, + "step": 5546 + }, + { + "epoch": 0.10299900119930344, + "grad_norm": 0.5383819341659546, + "learning_rate": 1.948101755321885e-05, + "loss": 0.2307, + "step": 5548 + }, + { + "epoch": 0.10303613133672208, + "grad_norm": 0.3777703642845154, + "learning_rate": 1.9480646583090024e-05, + "loss": 0.3078, + "step": 5550 + }, + { + "epoch": 0.10307326147414071, + "grad_norm": 0.3909919261932373, + "learning_rate": 1.9480275483957992e-05, + "loss": 0.3466, + "step": 5552 + }, + { + "epoch": 0.10311039161155935, + "grad_norm": 0.32706278562545776, + "learning_rate": 1.947990425582781e-05, + "loss": 0.2415, + "step": 5554 + }, + { + "epoch": 0.103147521748978, + "grad_norm": 0.44836801290512085, + "learning_rate": 1.9479532898704528e-05, + "loss": 0.273, + "step": 5556 + }, + { + "epoch": 0.10318465188639662, + "grad_norm": 0.31327760219573975, + "learning_rate": 1.9479161412593196e-05, + "loss": 0.2639, + "step": 5558 + }, + { + "epoch": 0.10322178202381527, + "grad_norm": 0.42859169840812683, + "learning_rate": 1.947878979749887e-05, + "loss": 0.2693, + "step": 5560 + }, + { + "epoch": 0.10325891216123391, + "grad_norm": 0.26981452107429504, + "learning_rate": 1.9478418053426605e-05, + "loss": 0.3979, + "step": 5562 + }, + { + "epoch": 0.10329604229865255, + "grad_norm": 0.3653850853443146, + "learning_rate": 1.9478046180381468e-05, + "loss": 0.3371, + "step": 5564 + }, + { + "epoch": 0.10333317243607118, + "grad_norm": 0.29580119252204895, + "learning_rate": 1.947767417836851e-05, + "loss": 0.3759, + "step": 5566 + }, + { + "epoch": 0.10337030257348982, + "grad_norm": 0.34580251574516296, + "learning_rate": 1.947730204739279e-05, + "loss": 0.3574, + "step": 5568 + }, + { + "epoch": 0.10340743271090846, + "grad_norm": 0.22512584924697876, + "learning_rate": 1.947692978745938e-05, + "loss": 0.1099, + "step": 5570 + }, + { + "epoch": 0.1034445628483271, + "grad_norm": 0.4294097125530243, + "learning_rate": 1.9476557398573344e-05, + "loss": 0.2895, + "step": 5572 + }, + { + "epoch": 0.10348169298574574, + "grad_norm": 0.3201062083244324, + "learning_rate": 1.9476184880739747e-05, + "loss": 0.385, + "step": 5574 + }, + { + "epoch": 0.10351882312316438, + "grad_norm": 0.4645669162273407, + "learning_rate": 1.9475812233963656e-05, + "loss": 0.3803, + "step": 5576 + }, + { + "epoch": 0.10355595326058302, + "grad_norm": 0.38869452476501465, + "learning_rate": 1.947543945825015e-05, + "loss": 0.4267, + "step": 5578 + }, + { + "epoch": 0.10359308339800166, + "grad_norm": 0.3133956491947174, + "learning_rate": 1.9475066553604288e-05, + "loss": 0.2642, + "step": 5580 + }, + { + "epoch": 0.10363021353542029, + "grad_norm": 0.4013133943080902, + "learning_rate": 1.9474693520031158e-05, + "loss": 0.3062, + "step": 5582 + }, + { + "epoch": 0.10366734367283893, + "grad_norm": 0.3490278422832489, + "learning_rate": 1.947432035753582e-05, + "loss": 0.3235, + "step": 5584 + }, + { + "epoch": 0.10370447381025757, + "grad_norm": 0.42173048853874207, + "learning_rate": 1.9473947066123368e-05, + "loss": 0.4087, + "step": 5586 + }, + { + "epoch": 0.10374160394767622, + "grad_norm": 0.3163757622241974, + "learning_rate": 1.947357364579887e-05, + "loss": 0.2032, + "step": 5588 + }, + { + "epoch": 0.10377873408509485, + "grad_norm": 0.3753913938999176, + "learning_rate": 1.9473200096567416e-05, + "loss": 0.2795, + "step": 5590 + }, + { + "epoch": 0.10381586422251349, + "grad_norm": 0.3602595329284668, + "learning_rate": 1.947282641843408e-05, + "loss": 0.1914, + "step": 5592 + }, + { + "epoch": 0.10385299435993213, + "grad_norm": 0.25141608715057373, + "learning_rate": 1.9472452611403952e-05, + "loss": 0.3458, + "step": 5594 + }, + { + "epoch": 0.10389012449735076, + "grad_norm": 0.25714215636253357, + "learning_rate": 1.947207867548212e-05, + "loss": 0.3501, + "step": 5596 + }, + { + "epoch": 0.1039272546347694, + "grad_norm": 0.42117518186569214, + "learning_rate": 1.9471704610673668e-05, + "loss": 0.3917, + "step": 5598 + }, + { + "epoch": 0.10396438477218804, + "grad_norm": 0.4055294990539551, + "learning_rate": 1.9471330416983684e-05, + "loss": 0.2976, + "step": 5600 + }, + { + "epoch": 0.10400151490960668, + "grad_norm": 0.23825065791606903, + "learning_rate": 1.9470956094417265e-05, + "loss": 0.3418, + "step": 5602 + }, + { + "epoch": 0.10403864504702531, + "grad_norm": 0.3468947410583496, + "learning_rate": 1.9470581642979506e-05, + "loss": 0.35, + "step": 5604 + }, + { + "epoch": 0.10407577518444396, + "grad_norm": 0.3699585795402527, + "learning_rate": 1.9470207062675496e-05, + "loss": 0.2119, + "step": 5606 + }, + { + "epoch": 0.1041129053218626, + "grad_norm": 0.3218643367290497, + "learning_rate": 1.9469832353510334e-05, + "loss": 0.2598, + "step": 5608 + }, + { + "epoch": 0.10415003545928124, + "grad_norm": 0.25380244851112366, + "learning_rate": 1.9469457515489116e-05, + "loss": 0.4322, + "step": 5610 + }, + { + "epoch": 0.10418716559669987, + "grad_norm": 0.31532925367355347, + "learning_rate": 1.9469082548616952e-05, + "loss": 0.2267, + "step": 5612 + }, + { + "epoch": 0.10422429573411851, + "grad_norm": 0.3594014644622803, + "learning_rate": 1.9468707452898933e-05, + "loss": 0.491, + "step": 5614 + }, + { + "epoch": 0.10426142587153715, + "grad_norm": 0.41942188143730164, + "learning_rate": 1.946833222834017e-05, + "loss": 0.2912, + "step": 5616 + }, + { + "epoch": 0.1042985560089558, + "grad_norm": 0.3073533773422241, + "learning_rate": 1.9467956874945763e-05, + "loss": 0.2299, + "step": 5618 + }, + { + "epoch": 0.10433568614637442, + "grad_norm": 0.5761581659317017, + "learning_rate": 1.9467581392720823e-05, + "loss": 0.3474, + "step": 5620 + }, + { + "epoch": 0.10437281628379307, + "grad_norm": 0.49407994747161865, + "learning_rate": 1.9467205781670463e-05, + "loss": 0.2762, + "step": 5622 + }, + { + "epoch": 0.10440994642121171, + "grad_norm": 0.3751986026763916, + "learning_rate": 1.9466830041799785e-05, + "loss": 0.4026, + "step": 5624 + }, + { + "epoch": 0.10444707655863035, + "grad_norm": 0.30550137162208557, + "learning_rate": 1.9466454173113912e-05, + "loss": 0.3968, + "step": 5626 + }, + { + "epoch": 0.10448420669604898, + "grad_norm": 0.3125455677509308, + "learning_rate": 1.946607817561795e-05, + "loss": 0.3578, + "step": 5628 + }, + { + "epoch": 0.10452133683346762, + "grad_norm": 0.45746296644210815, + "learning_rate": 1.946570204931702e-05, + "loss": 0.4813, + "step": 5630 + }, + { + "epoch": 0.10455846697088626, + "grad_norm": 0.307971328496933, + "learning_rate": 1.9465325794216235e-05, + "loss": 0.2687, + "step": 5632 + }, + { + "epoch": 0.10459559710830489, + "grad_norm": 0.5898893475532532, + "learning_rate": 1.9464949410320718e-05, + "loss": 0.2711, + "step": 5634 + }, + { + "epoch": 0.10463272724572353, + "grad_norm": 0.30872514843940735, + "learning_rate": 1.9464572897635593e-05, + "loss": 0.3561, + "step": 5636 + }, + { + "epoch": 0.10466985738314218, + "grad_norm": 0.9824311137199402, + "learning_rate": 1.946419625616598e-05, + "loss": 0.2251, + "step": 5638 + }, + { + "epoch": 0.10470698752056082, + "grad_norm": 0.42937248945236206, + "learning_rate": 1.9463819485917e-05, + "loss": 0.3925, + "step": 5640 + }, + { + "epoch": 0.10474411765797945, + "grad_norm": 0.3466099202632904, + "learning_rate": 1.9463442586893793e-05, + "loss": 0.2799, + "step": 5642 + }, + { + "epoch": 0.10478124779539809, + "grad_norm": 0.32875850796699524, + "learning_rate": 1.9463065559101472e-05, + "loss": 0.4291, + "step": 5644 + }, + { + "epoch": 0.10481837793281673, + "grad_norm": 0.3984617590904236, + "learning_rate": 1.9462688402545174e-05, + "loss": 0.2584, + "step": 5646 + }, + { + "epoch": 0.10485550807023537, + "grad_norm": 0.5372908711433411, + "learning_rate": 1.9462311117230036e-05, + "loss": 0.4681, + "step": 5648 + }, + { + "epoch": 0.104892638207654, + "grad_norm": 0.34178024530410767, + "learning_rate": 1.946193370316118e-05, + "loss": 0.2364, + "step": 5650 + }, + { + "epoch": 0.10492976834507264, + "grad_norm": 0.3702748417854309, + "learning_rate": 1.9461556160343753e-05, + "loss": 0.3125, + "step": 5652 + }, + { + "epoch": 0.10496689848249129, + "grad_norm": 0.440461665391922, + "learning_rate": 1.9461178488782882e-05, + "loss": 0.1564, + "step": 5654 + }, + { + "epoch": 0.10500402861990993, + "grad_norm": 0.2766466438770294, + "learning_rate": 1.946080068848372e-05, + "loss": 0.2424, + "step": 5656 + }, + { + "epoch": 0.10504115875732856, + "grad_norm": 0.27612540125846863, + "learning_rate": 1.946042275945139e-05, + "loss": 0.3179, + "step": 5658 + }, + { + "epoch": 0.1050782888947472, + "grad_norm": 0.44314730167388916, + "learning_rate": 1.946004470169105e-05, + "loss": 0.3984, + "step": 5660 + }, + { + "epoch": 0.10511541903216584, + "grad_norm": 0.4054076075553894, + "learning_rate": 1.9459666515207834e-05, + "loss": 0.3668, + "step": 5662 + }, + { + "epoch": 0.10515254916958448, + "grad_norm": 0.3379444181919098, + "learning_rate": 1.945928820000689e-05, + "loss": 0.2956, + "step": 5664 + }, + { + "epoch": 0.10518967930700311, + "grad_norm": 0.27987322211265564, + "learning_rate": 1.9458909756093375e-05, + "loss": 0.3556, + "step": 5666 + }, + { + "epoch": 0.10522680944442175, + "grad_norm": 0.38211536407470703, + "learning_rate": 1.9458531183472424e-05, + "loss": 0.3858, + "step": 5668 + }, + { + "epoch": 0.1052639395818404, + "grad_norm": 0.4223131537437439, + "learning_rate": 1.9458152482149197e-05, + "loss": 0.3542, + "step": 5670 + }, + { + "epoch": 0.10530106971925902, + "grad_norm": 0.473664790391922, + "learning_rate": 1.9457773652128845e-05, + "loss": 0.3758, + "step": 5672 + }, + { + "epoch": 0.10533819985667767, + "grad_norm": 0.3229224681854248, + "learning_rate": 1.9457394693416524e-05, + "loss": 0.2796, + "step": 5674 + }, + { + "epoch": 0.10537532999409631, + "grad_norm": 0.34035953879356384, + "learning_rate": 1.9457015606017387e-05, + "loss": 0.3071, + "step": 5676 + }, + { + "epoch": 0.10541246013151495, + "grad_norm": 0.26566579937934875, + "learning_rate": 1.9456636389936596e-05, + "loss": 0.3156, + "step": 5678 + }, + { + "epoch": 0.10544959026893358, + "grad_norm": 0.2555239796638489, + "learning_rate": 1.9456257045179312e-05, + "loss": 0.2193, + "step": 5680 + }, + { + "epoch": 0.10548672040635222, + "grad_norm": 0.2860983908176422, + "learning_rate": 1.9455877571750694e-05, + "loss": 0.5282, + "step": 5682 + }, + { + "epoch": 0.10552385054377086, + "grad_norm": 0.35503295063972473, + "learning_rate": 1.9455497969655903e-05, + "loss": 0.3858, + "step": 5684 + }, + { + "epoch": 0.1055609806811895, + "grad_norm": 0.3632769286632538, + "learning_rate": 1.945511823890011e-05, + "loss": 0.4672, + "step": 5686 + }, + { + "epoch": 0.10559811081860813, + "grad_norm": 0.2467278391122818, + "learning_rate": 1.9454738379488476e-05, + "loss": 0.4479, + "step": 5688 + }, + { + "epoch": 0.10563524095602678, + "grad_norm": 0.5296027064323425, + "learning_rate": 1.9454358391426173e-05, + "loss": 0.4009, + "step": 5690 + }, + { + "epoch": 0.10567237109344542, + "grad_norm": 0.37853768467903137, + "learning_rate": 1.9453978274718372e-05, + "loss": 0.3023, + "step": 5692 + }, + { + "epoch": 0.10570950123086406, + "grad_norm": 0.36298149824142456, + "learning_rate": 1.9453598029370248e-05, + "loss": 0.3133, + "step": 5694 + }, + { + "epoch": 0.10574663136828269, + "grad_norm": 0.4778239130973816, + "learning_rate": 1.9453217655386968e-05, + "loss": 0.2592, + "step": 5696 + }, + { + "epoch": 0.10578376150570133, + "grad_norm": 0.39348888397216797, + "learning_rate": 1.945283715277371e-05, + "loss": 0.377, + "step": 5698 + }, + { + "epoch": 0.10582089164311997, + "grad_norm": 0.3999602198600769, + "learning_rate": 1.945245652153566e-05, + "loss": 0.3594, + "step": 5700 + }, + { + "epoch": 0.10585802178053862, + "grad_norm": 0.4513964056968689, + "learning_rate": 1.9452075761677984e-05, + "loss": 0.3914, + "step": 5702 + }, + { + "epoch": 0.10589515191795724, + "grad_norm": 0.4504709541797638, + "learning_rate": 1.945169487320587e-05, + "loss": 0.2521, + "step": 5704 + }, + { + "epoch": 0.10593228205537589, + "grad_norm": 0.26126736402511597, + "learning_rate": 1.94513138561245e-05, + "loss": 0.3002, + "step": 5706 + }, + { + "epoch": 0.10596941219279453, + "grad_norm": 0.2932392358779907, + "learning_rate": 1.945093271043906e-05, + "loss": 0.24, + "step": 5708 + }, + { + "epoch": 0.10600654233021316, + "grad_norm": 0.2898675799369812, + "learning_rate": 1.9450551436154733e-05, + "loss": 0.2744, + "step": 5710 + }, + { + "epoch": 0.1060436724676318, + "grad_norm": 0.4925738275051117, + "learning_rate": 1.9450170033276706e-05, + "loss": 0.5037, + "step": 5712 + }, + { + "epoch": 0.10608080260505044, + "grad_norm": 0.3793063163757324, + "learning_rate": 1.9449788501810177e-05, + "loss": 0.2955, + "step": 5714 + }, + { + "epoch": 0.10611793274246908, + "grad_norm": 0.29674798250198364, + "learning_rate": 1.9449406841760327e-05, + "loss": 0.4152, + "step": 5716 + }, + { + "epoch": 0.10615506287988771, + "grad_norm": 0.4603341519832611, + "learning_rate": 1.944902505313236e-05, + "loss": 0.3756, + "step": 5718 + }, + { + "epoch": 0.10619219301730635, + "grad_norm": 0.3077545166015625, + "learning_rate": 1.944864313593146e-05, + "loss": 0.3584, + "step": 5720 + }, + { + "epoch": 0.106229323154725, + "grad_norm": 0.33327987790107727, + "learning_rate": 1.9448261090162833e-05, + "loss": 0.4767, + "step": 5722 + }, + { + "epoch": 0.10626645329214364, + "grad_norm": 0.28055399656295776, + "learning_rate": 1.9447878915831667e-05, + "loss": 0.4711, + "step": 5724 + }, + { + "epoch": 0.10630358342956227, + "grad_norm": 0.25300803780555725, + "learning_rate": 1.9447496612943177e-05, + "loss": 0.3375, + "step": 5726 + }, + { + "epoch": 0.10634071356698091, + "grad_norm": 0.33119189739227295, + "learning_rate": 1.944711418150255e-05, + "loss": 0.4009, + "step": 5728 + }, + { + "epoch": 0.10637784370439955, + "grad_norm": 0.32731327414512634, + "learning_rate": 1.9446731621515e-05, + "loss": 0.3118, + "step": 5730 + }, + { + "epoch": 0.1064149738418182, + "grad_norm": 0.39733195304870605, + "learning_rate": 1.9446348932985722e-05, + "loss": 0.3191, + "step": 5732 + }, + { + "epoch": 0.10645210397923682, + "grad_norm": 0.38298898935317993, + "learning_rate": 1.944596611591994e-05, + "loss": 0.2386, + "step": 5734 + }, + { + "epoch": 0.10648923411665547, + "grad_norm": 0.39737269282341003, + "learning_rate": 1.9445583170322848e-05, + "loss": 0.417, + "step": 5736 + }, + { + "epoch": 0.10652636425407411, + "grad_norm": 0.3341575264930725, + "learning_rate": 1.944520009619966e-05, + "loss": 0.2563, + "step": 5738 + }, + { + "epoch": 0.10656349439149275, + "grad_norm": 0.4088996648788452, + "learning_rate": 1.9444816893555596e-05, + "loss": 0.236, + "step": 5740 + }, + { + "epoch": 0.10660062452891138, + "grad_norm": 0.3626585900783539, + "learning_rate": 1.944443356239586e-05, + "loss": 0.378, + "step": 5742 + }, + { + "epoch": 0.10663775466633002, + "grad_norm": 0.42454993724823, + "learning_rate": 1.9444050102725675e-05, + "loss": 0.255, + "step": 5744 + }, + { + "epoch": 0.10667488480374866, + "grad_norm": 0.23644159734249115, + "learning_rate": 1.9443666514550254e-05, + "loss": 0.3274, + "step": 5746 + }, + { + "epoch": 0.10671201494116729, + "grad_norm": 0.36012452840805054, + "learning_rate": 1.944328279787482e-05, + "loss": 0.3883, + "step": 5748 + }, + { + "epoch": 0.10674914507858593, + "grad_norm": 0.34978556632995605, + "learning_rate": 1.944289895270459e-05, + "loss": 0.52, + "step": 5750 + }, + { + "epoch": 0.10678627521600458, + "grad_norm": 0.3932737708091736, + "learning_rate": 1.9442514979044794e-05, + "loss": 0.1831, + "step": 5752 + }, + { + "epoch": 0.10682340535342322, + "grad_norm": 0.3289352357387543, + "learning_rate": 1.9442130876900654e-05, + "loss": 0.1993, + "step": 5754 + }, + { + "epoch": 0.10686053549084185, + "grad_norm": 0.28233715891838074, + "learning_rate": 1.9441746646277394e-05, + "loss": 0.1737, + "step": 5756 + }, + { + "epoch": 0.10689766562826049, + "grad_norm": 0.43340542912483215, + "learning_rate": 1.9441362287180242e-05, + "loss": 0.3667, + "step": 5758 + }, + { + "epoch": 0.10693479576567913, + "grad_norm": 0.38987642526626587, + "learning_rate": 1.944097779961443e-05, + "loss": 0.3164, + "step": 5760 + }, + { + "epoch": 0.10697192590309777, + "grad_norm": 0.3755171597003937, + "learning_rate": 1.9440593183585188e-05, + "loss": 0.3508, + "step": 5762 + }, + { + "epoch": 0.1070090560405164, + "grad_norm": 0.44960302114486694, + "learning_rate": 1.9440208439097752e-05, + "loss": 0.4445, + "step": 5764 + }, + { + "epoch": 0.10704618617793504, + "grad_norm": 0.3175716698169708, + "learning_rate": 1.9439823566157353e-05, + "loss": 0.3236, + "step": 5766 + }, + { + "epoch": 0.10708331631535369, + "grad_norm": 0.30718982219696045, + "learning_rate": 1.9439438564769236e-05, + "loss": 0.2313, + "step": 5768 + }, + { + "epoch": 0.10712044645277233, + "grad_norm": 0.8927876353263855, + "learning_rate": 1.943905343493863e-05, + "loss": 0.3348, + "step": 5770 + }, + { + "epoch": 0.10715757659019096, + "grad_norm": 0.3095676004886627, + "learning_rate": 1.9438668176670785e-05, + "loss": 0.4022, + "step": 5772 + }, + { + "epoch": 0.1071947067276096, + "grad_norm": 0.4106636345386505, + "learning_rate": 1.9438282789970934e-05, + "loss": 0.1842, + "step": 5774 + }, + { + "epoch": 0.10723183686502824, + "grad_norm": 0.26190951466560364, + "learning_rate": 1.9437897274844327e-05, + "loss": 0.1287, + "step": 5776 + }, + { + "epoch": 0.10726896700244688, + "grad_norm": 0.31700852513313293, + "learning_rate": 1.9437511631296205e-05, + "loss": 0.3923, + "step": 5778 + }, + { + "epoch": 0.10730609713986551, + "grad_norm": 0.3483114242553711, + "learning_rate": 1.9437125859331824e-05, + "loss": 0.2641, + "step": 5780 + }, + { + "epoch": 0.10734322727728415, + "grad_norm": 0.2810349762439728, + "learning_rate": 1.943673995895642e-05, + "loss": 0.2339, + "step": 5782 + }, + { + "epoch": 0.1073803574147028, + "grad_norm": 0.4115602970123291, + "learning_rate": 1.9436353930175256e-05, + "loss": 0.3854, + "step": 5784 + }, + { + "epoch": 0.10741748755212142, + "grad_norm": 0.4067830741405487, + "learning_rate": 1.9435967772993582e-05, + "loss": 0.314, + "step": 5786 + }, + { + "epoch": 0.10745461768954007, + "grad_norm": 0.3596358895301819, + "learning_rate": 1.9435581487416645e-05, + "loss": 0.3275, + "step": 5788 + }, + { + "epoch": 0.10749174782695871, + "grad_norm": 0.39568236470222473, + "learning_rate": 1.943519507344971e-05, + "loss": 0.2978, + "step": 5790 + }, + { + "epoch": 0.10752887796437735, + "grad_norm": 0.3731325566768646, + "learning_rate": 1.9434808531098034e-05, + "loss": 0.4138, + "step": 5792 + }, + { + "epoch": 0.10756600810179598, + "grad_norm": 0.38418689370155334, + "learning_rate": 1.943442186036687e-05, + "loss": 0.2724, + "step": 5794 + }, + { + "epoch": 0.10760313823921462, + "grad_norm": 0.3116511404514313, + "learning_rate": 1.9434035061261488e-05, + "loss": 0.1247, + "step": 5796 + }, + { + "epoch": 0.10764026837663326, + "grad_norm": 0.4002717137336731, + "learning_rate": 1.943364813378714e-05, + "loss": 0.4214, + "step": 5798 + }, + { + "epoch": 0.1076773985140519, + "grad_norm": 0.3120017349720001, + "learning_rate": 1.9433261077949107e-05, + "loss": 0.3497, + "step": 5800 + }, + { + "epoch": 0.10771452865147053, + "grad_norm": 0.2992229461669922, + "learning_rate": 1.9432873893752643e-05, + "loss": 0.2889, + "step": 5802 + }, + { + "epoch": 0.10775165878888918, + "grad_norm": 0.3299771547317505, + "learning_rate": 1.943248658120302e-05, + "loss": 0.2584, + "step": 5804 + }, + { + "epoch": 0.10778878892630782, + "grad_norm": 0.3887860178947449, + "learning_rate": 1.9432099140305506e-05, + "loss": 0.3226, + "step": 5806 + }, + { + "epoch": 0.10782591906372646, + "grad_norm": 0.44004178047180176, + "learning_rate": 1.9431711571065377e-05, + "loss": 0.2345, + "step": 5808 + }, + { + "epoch": 0.10786304920114509, + "grad_norm": 0.32659563422203064, + "learning_rate": 1.9431323873487905e-05, + "loss": 0.512, + "step": 5810 + }, + { + "epoch": 0.10790017933856373, + "grad_norm": 0.44951125979423523, + "learning_rate": 1.9430936047578365e-05, + "loss": 0.3856, + "step": 5812 + }, + { + "epoch": 0.10793730947598237, + "grad_norm": 0.28977170586586, + "learning_rate": 1.9430548093342036e-05, + "loss": 0.2726, + "step": 5814 + }, + { + "epoch": 0.10797443961340102, + "grad_norm": 0.3876648247241974, + "learning_rate": 1.943016001078419e-05, + "loss": 0.378, + "step": 5816 + }, + { + "epoch": 0.10801156975081964, + "grad_norm": 0.36916598677635193, + "learning_rate": 1.9429771799910115e-05, + "loss": 0.4125, + "step": 5818 + }, + { + "epoch": 0.10804869988823829, + "grad_norm": 0.3590720593929291, + "learning_rate": 1.9429383460725094e-05, + "loss": 0.261, + "step": 5820 + }, + { + "epoch": 0.10808583002565693, + "grad_norm": 0.30345419049263, + "learning_rate": 1.942899499323441e-05, + "loss": 0.3931, + "step": 5822 + }, + { + "epoch": 0.10812296016307556, + "grad_norm": 0.33233898878097534, + "learning_rate": 1.942860639744334e-05, + "loss": 0.2892, + "step": 5824 + }, + { + "epoch": 0.1081600903004942, + "grad_norm": 0.3375394344329834, + "learning_rate": 1.9428217673357184e-05, + "loss": 0.2629, + "step": 5826 + }, + { + "epoch": 0.10819722043791284, + "grad_norm": 0.2185717672109604, + "learning_rate": 1.9427828820981226e-05, + "loss": 0.1229, + "step": 5828 + }, + { + "epoch": 0.10823435057533148, + "grad_norm": 0.27578356862068176, + "learning_rate": 1.9427439840320756e-05, + "loss": 0.4499, + "step": 5830 + }, + { + "epoch": 0.10827148071275011, + "grad_norm": 0.27450743317604065, + "learning_rate": 1.942705073138107e-05, + "loss": 0.4816, + "step": 5832 + }, + { + "epoch": 0.10830861085016875, + "grad_norm": 0.5252142548561096, + "learning_rate": 1.942666149416746e-05, + "loss": 0.3833, + "step": 5834 + }, + { + "epoch": 0.1083457409875874, + "grad_norm": 0.32267916202545166, + "learning_rate": 1.942627212868522e-05, + "loss": 0.2481, + "step": 5836 + }, + { + "epoch": 0.10838287112500604, + "grad_norm": 0.38306835293769836, + "learning_rate": 1.9425882634939655e-05, + "loss": 0.3806, + "step": 5838 + }, + { + "epoch": 0.10842000126242467, + "grad_norm": 0.33959731459617615, + "learning_rate": 1.942549301293606e-05, + "loss": 0.2882, + "step": 5840 + }, + { + "epoch": 0.10845713139984331, + "grad_norm": 0.3266756534576416, + "learning_rate": 1.9425103262679736e-05, + "loss": 0.3852, + "step": 5842 + }, + { + "epoch": 0.10849426153726195, + "grad_norm": 0.35104936361312866, + "learning_rate": 1.9424713384175994e-05, + "loss": 0.223, + "step": 5844 + }, + { + "epoch": 0.1085313916746806, + "grad_norm": 0.32879096269607544, + "learning_rate": 1.9424323377430125e-05, + "loss": 0.2237, + "step": 5846 + }, + { + "epoch": 0.10856852181209922, + "grad_norm": 0.4024861454963684, + "learning_rate": 1.942393324244745e-05, + "loss": 0.1185, + "step": 5848 + }, + { + "epoch": 0.10860565194951786, + "grad_norm": 0.3981323838233948, + "learning_rate": 1.942354297923327e-05, + "loss": 0.6142, + "step": 5850 + }, + { + "epoch": 0.1086427820869365, + "grad_norm": 0.41584548354148865, + "learning_rate": 1.9423152587792896e-05, + "loss": 0.4369, + "step": 5852 + }, + { + "epoch": 0.10867991222435515, + "grad_norm": 0.39271169900894165, + "learning_rate": 1.9422762068131643e-05, + "loss": 0.5093, + "step": 5854 + }, + { + "epoch": 0.10871704236177378, + "grad_norm": 0.3449486494064331, + "learning_rate": 1.9422371420254817e-05, + "loss": 0.4341, + "step": 5856 + }, + { + "epoch": 0.10875417249919242, + "grad_norm": 0.4763622581958771, + "learning_rate": 1.9421980644167747e-05, + "loss": 0.4724, + "step": 5858 + }, + { + "epoch": 0.10879130263661106, + "grad_norm": 0.32343971729278564, + "learning_rate": 1.9421589739875738e-05, + "loss": 0.5369, + "step": 5860 + }, + { + "epoch": 0.10882843277402969, + "grad_norm": 0.33949631452560425, + "learning_rate": 1.9421198707384114e-05, + "loss": 0.3341, + "step": 5862 + }, + { + "epoch": 0.10886556291144833, + "grad_norm": 0.23112685978412628, + "learning_rate": 1.9420807546698196e-05, + "loss": 0.1913, + "step": 5864 + }, + { + "epoch": 0.10890269304886697, + "grad_norm": 0.40140068531036377, + "learning_rate": 1.9420416257823302e-05, + "loss": 0.4568, + "step": 5866 + }, + { + "epoch": 0.10893982318628562, + "grad_norm": 0.3172307014465332, + "learning_rate": 1.9420024840764767e-05, + "loss": 0.4682, + "step": 5868 + }, + { + "epoch": 0.10897695332370425, + "grad_norm": 0.2731354832649231, + "learning_rate": 1.9419633295527905e-05, + "loss": 0.4819, + "step": 5870 + }, + { + "epoch": 0.10901408346112289, + "grad_norm": 0.34615558385849, + "learning_rate": 1.9419241622118053e-05, + "loss": 0.2828, + "step": 5872 + }, + { + "epoch": 0.10905121359854153, + "grad_norm": 0.5087578296661377, + "learning_rate": 1.9418849820540533e-05, + "loss": 0.2022, + "step": 5874 + }, + { + "epoch": 0.10908834373596017, + "grad_norm": 0.43714675307273865, + "learning_rate": 1.941845789080068e-05, + "loss": 0.2583, + "step": 5876 + }, + { + "epoch": 0.1091254738733788, + "grad_norm": 0.4163441061973572, + "learning_rate": 1.941806583290383e-05, + "loss": 0.3424, + "step": 5878 + }, + { + "epoch": 0.10916260401079744, + "grad_norm": 0.2833601236343384, + "learning_rate": 1.941767364685531e-05, + "loss": 0.3319, + "step": 5880 + }, + { + "epoch": 0.10919973414821608, + "grad_norm": 0.4008776843547821, + "learning_rate": 1.9417281332660463e-05, + "loss": 0.4196, + "step": 5882 + }, + { + "epoch": 0.10923686428563473, + "grad_norm": 0.3623258173465729, + "learning_rate": 1.9416888890324626e-05, + "loss": 0.3032, + "step": 5884 + }, + { + "epoch": 0.10927399442305336, + "grad_norm": 0.3638496994972229, + "learning_rate": 1.9416496319853134e-05, + "loss": 0.3202, + "step": 5886 + }, + { + "epoch": 0.109311124560472, + "grad_norm": 0.5115017890930176, + "learning_rate": 1.9416103621251338e-05, + "loss": 0.4579, + "step": 5888 + }, + { + "epoch": 0.10934825469789064, + "grad_norm": 0.3284580111503601, + "learning_rate": 1.9415710794524574e-05, + "loss": 0.4716, + "step": 5890 + }, + { + "epoch": 0.10938538483530928, + "grad_norm": 0.4640825688838959, + "learning_rate": 1.9415317839678187e-05, + "loss": 0.419, + "step": 5892 + }, + { + "epoch": 0.10942251497272791, + "grad_norm": 0.2682100236415863, + "learning_rate": 1.941492475671753e-05, + "loss": 0.33, + "step": 5894 + }, + { + "epoch": 0.10945964511014655, + "grad_norm": 0.2789626121520996, + "learning_rate": 1.9414531545647945e-05, + "loss": 0.2711, + "step": 5896 + }, + { + "epoch": 0.1094967752475652, + "grad_norm": 0.34911608695983887, + "learning_rate": 1.941413820647479e-05, + "loss": 0.4164, + "step": 5898 + }, + { + "epoch": 0.10953390538498382, + "grad_norm": 0.31880897283554077, + "learning_rate": 1.9413744739203407e-05, + "loss": 0.4297, + "step": 5900 + }, + { + "epoch": 0.10957103552240247, + "grad_norm": 0.3555947542190552, + "learning_rate": 1.941335114383916e-05, + "loss": 0.3629, + "step": 5902 + }, + { + "epoch": 0.10960816565982111, + "grad_norm": 0.29058215022087097, + "learning_rate": 1.9412957420387396e-05, + "loss": 0.4357, + "step": 5904 + }, + { + "epoch": 0.10964529579723975, + "grad_norm": 0.4944290816783905, + "learning_rate": 1.941256356885348e-05, + "loss": 0.3046, + "step": 5906 + }, + { + "epoch": 0.10968242593465838, + "grad_norm": 0.4301474392414093, + "learning_rate": 1.9412169589242768e-05, + "loss": 0.69, + "step": 5908 + }, + { + "epoch": 0.10971955607207702, + "grad_norm": 0.31939250230789185, + "learning_rate": 1.9411775481560617e-05, + "loss": 0.3955, + "step": 5910 + }, + { + "epoch": 0.10975668620949566, + "grad_norm": 0.559546947479248, + "learning_rate": 1.9411381245812397e-05, + "loss": 0.2494, + "step": 5912 + }, + { + "epoch": 0.1097938163469143, + "grad_norm": 0.2580447196960449, + "learning_rate": 1.9410986882003468e-05, + "loss": 0.488, + "step": 5914 + }, + { + "epoch": 0.10983094648433293, + "grad_norm": 0.26733484864234924, + "learning_rate": 1.9410592390139196e-05, + "loss": 0.2879, + "step": 5916 + }, + { + "epoch": 0.10986807662175158, + "grad_norm": 0.31146278977394104, + "learning_rate": 1.941019777022495e-05, + "loss": 0.5469, + "step": 5918 + }, + { + "epoch": 0.10990520675917022, + "grad_norm": 0.34816449880599976, + "learning_rate": 1.9409803022266097e-05, + "loss": 0.348, + "step": 5920 + }, + { + "epoch": 0.10994233689658886, + "grad_norm": 0.3630014657974243, + "learning_rate": 1.940940814626801e-05, + "loss": 0.3334, + "step": 5922 + }, + { + "epoch": 0.10997946703400749, + "grad_norm": 0.2611609697341919, + "learning_rate": 1.9409013142236065e-05, + "loss": 0.3642, + "step": 5924 + }, + { + "epoch": 0.11001659717142613, + "grad_norm": 0.31617316603660583, + "learning_rate": 1.9408618010175636e-05, + "loss": 0.2407, + "step": 5926 + }, + { + "epoch": 0.11005372730884477, + "grad_norm": 0.36851227283477783, + "learning_rate": 1.9408222750092095e-05, + "loss": 0.413, + "step": 5928 + }, + { + "epoch": 0.11009085744626342, + "grad_norm": 0.19425345957279205, + "learning_rate": 1.9407827361990824e-05, + "loss": 0.4462, + "step": 5930 + }, + { + "epoch": 0.11012798758368204, + "grad_norm": 0.247972771525383, + "learning_rate": 1.9407431845877203e-05, + "loss": 0.3072, + "step": 5932 + }, + { + "epoch": 0.11016511772110069, + "grad_norm": 0.37046679854393005, + "learning_rate": 1.940703620175661e-05, + "loss": 0.2117, + "step": 5934 + }, + { + "epoch": 0.11020224785851933, + "grad_norm": 0.3156874179840088, + "learning_rate": 1.9406640429634434e-05, + "loss": 0.4296, + "step": 5936 + }, + { + "epoch": 0.11023937799593796, + "grad_norm": 0.3625591993331909, + "learning_rate": 1.9406244529516058e-05, + "loss": 0.2992, + "step": 5938 + }, + { + "epoch": 0.1102765081333566, + "grad_norm": 0.3214312791824341, + "learning_rate": 1.9405848501406873e-05, + "loss": 0.325, + "step": 5940 + }, + { + "epoch": 0.11031363827077524, + "grad_norm": 0.47295668721199036, + "learning_rate": 1.940545234531226e-05, + "loss": 0.317, + "step": 5942 + }, + { + "epoch": 0.11035076840819388, + "grad_norm": 0.42984381318092346, + "learning_rate": 1.940505606123761e-05, + "loss": 0.4246, + "step": 5944 + }, + { + "epoch": 0.11038789854561251, + "grad_norm": 0.25924059748649597, + "learning_rate": 1.9404659649188323e-05, + "loss": 0.2936, + "step": 5946 + }, + { + "epoch": 0.11042502868303115, + "grad_norm": 0.3624178469181061, + "learning_rate": 1.9404263109169786e-05, + "loss": 0.2656, + "step": 5948 + }, + { + "epoch": 0.1104621588204498, + "grad_norm": 0.2970726191997528, + "learning_rate": 1.9403866441187396e-05, + "loss": 0.3149, + "step": 5950 + }, + { + "epoch": 0.11049928895786844, + "grad_norm": 0.8646212220191956, + "learning_rate": 1.940346964524655e-05, + "loss": 0.5251, + "step": 5952 + }, + { + "epoch": 0.11053641909528707, + "grad_norm": 0.2533368170261383, + "learning_rate": 1.940307272135266e-05, + "loss": 0.3765, + "step": 5954 + }, + { + "epoch": 0.11057354923270571, + "grad_norm": 0.42803817987442017, + "learning_rate": 1.9402675669511106e-05, + "loss": 0.3591, + "step": 5956 + }, + { + "epoch": 0.11061067937012435, + "grad_norm": 0.45353007316589355, + "learning_rate": 1.9402278489727303e-05, + "loss": 0.2383, + "step": 5958 + }, + { + "epoch": 0.110647809507543, + "grad_norm": 0.36995524168014526, + "learning_rate": 1.940188118200665e-05, + "loss": 0.3012, + "step": 5960 + }, + { + "epoch": 0.11068493964496162, + "grad_norm": 0.29975393414497375, + "learning_rate": 1.9401483746354558e-05, + "loss": 0.463, + "step": 5962 + }, + { + "epoch": 0.11072206978238026, + "grad_norm": 0.37595194578170776, + "learning_rate": 1.9401086182776433e-05, + "loss": 0.3645, + "step": 5964 + }, + { + "epoch": 0.1107591999197989, + "grad_norm": 0.31594109535217285, + "learning_rate": 1.9400688491277682e-05, + "loss": 0.4021, + "step": 5966 + }, + { + "epoch": 0.11079633005721755, + "grad_norm": 0.3603358566761017, + "learning_rate": 1.9400290671863726e-05, + "loss": 0.3044, + "step": 5968 + }, + { + "epoch": 0.11083346019463618, + "grad_norm": 0.36992278695106506, + "learning_rate": 1.9399892724539967e-05, + "loss": 0.3763, + "step": 5970 + }, + { + "epoch": 0.11087059033205482, + "grad_norm": 0.4233797788619995, + "learning_rate": 1.939949464931182e-05, + "loss": 0.3312, + "step": 5972 + }, + { + "epoch": 0.11090772046947346, + "grad_norm": 0.23897391557693481, + "learning_rate": 1.939909644618471e-05, + "loss": 0.1467, + "step": 5974 + }, + { + "epoch": 0.11094485060689209, + "grad_norm": 0.31221428513526917, + "learning_rate": 1.9398698115164052e-05, + "loss": 0.2822, + "step": 5976 + }, + { + "epoch": 0.11098198074431073, + "grad_norm": 0.36981192231178284, + "learning_rate": 1.939829965625526e-05, + "loss": 0.3683, + "step": 5978 + }, + { + "epoch": 0.11101911088172937, + "grad_norm": 0.27857425808906555, + "learning_rate": 1.9397901069463768e-05, + "loss": 0.4174, + "step": 5980 + }, + { + "epoch": 0.11105624101914802, + "grad_norm": 0.21214932203292847, + "learning_rate": 1.9397502354794987e-05, + "loss": 0.1752, + "step": 5982 + }, + { + "epoch": 0.11109337115656664, + "grad_norm": 0.5696078538894653, + "learning_rate": 1.9397103512254348e-05, + "loss": 0.5898, + "step": 5984 + }, + { + "epoch": 0.11113050129398529, + "grad_norm": 0.27658164501190186, + "learning_rate": 1.939670454184728e-05, + "loss": 0.2478, + "step": 5986 + }, + { + "epoch": 0.11116763143140393, + "grad_norm": 0.29363882541656494, + "learning_rate": 1.9396305443579208e-05, + "loss": 0.4665, + "step": 5988 + }, + { + "epoch": 0.11120476156882257, + "grad_norm": 0.36517244577407837, + "learning_rate": 1.9395906217455565e-05, + "loss": 0.3164, + "step": 5990 + }, + { + "epoch": 0.1112418917062412, + "grad_norm": 0.5709163546562195, + "learning_rate": 1.9395506863481782e-05, + "loss": 0.5044, + "step": 5992 + }, + { + "epoch": 0.11127902184365984, + "grad_norm": 0.3584378957748413, + "learning_rate": 1.9395107381663294e-05, + "loss": 0.3409, + "step": 5994 + }, + { + "epoch": 0.11131615198107848, + "grad_norm": 0.241102933883667, + "learning_rate": 1.9394707772005535e-05, + "loss": 0.3295, + "step": 5996 + }, + { + "epoch": 0.11135328211849713, + "grad_norm": 0.27340802550315857, + "learning_rate": 1.9394308034513943e-05, + "loss": 0.257, + "step": 5998 + }, + { + "epoch": 0.11139041225591576, + "grad_norm": 0.40493902564048767, + "learning_rate": 1.9393908169193956e-05, + "loss": 0.3485, + "step": 6000 + }, + { + "epoch": 0.1114275423933344, + "grad_norm": 0.27664878964424133, + "learning_rate": 1.9393508176051023e-05, + "loss": 0.136, + "step": 6002 + }, + { + "epoch": 0.11146467253075304, + "grad_norm": 0.3698323667049408, + "learning_rate": 1.9393108055090577e-05, + "loss": 0.3324, + "step": 6004 + }, + { + "epoch": 0.11150180266817168, + "grad_norm": 0.3152863085269928, + "learning_rate": 1.9392707806318066e-05, + "loss": 0.2976, + "step": 6006 + }, + { + "epoch": 0.11153893280559031, + "grad_norm": 0.2979016602039337, + "learning_rate": 1.9392307429738937e-05, + "loss": 0.3829, + "step": 6008 + }, + { + "epoch": 0.11157606294300895, + "grad_norm": 0.26415732502937317, + "learning_rate": 1.9391906925358636e-05, + "loss": 0.2993, + "step": 6010 + }, + { + "epoch": 0.1116131930804276, + "grad_norm": 0.3328465521335602, + "learning_rate": 1.9391506293182615e-05, + "loss": 0.3772, + "step": 6012 + }, + { + "epoch": 0.11165032321784622, + "grad_norm": 0.2960035800933838, + "learning_rate": 1.9391105533216322e-05, + "loss": 0.4613, + "step": 6014 + }, + { + "epoch": 0.11168745335526487, + "grad_norm": 0.3554299473762512, + "learning_rate": 1.9390704645465216e-05, + "loss": 0.3894, + "step": 6016 + }, + { + "epoch": 0.11172458349268351, + "grad_norm": 0.4388827383518219, + "learning_rate": 1.9390303629934745e-05, + "loss": 0.3903, + "step": 6018 + }, + { + "epoch": 0.11176171363010215, + "grad_norm": 0.40241241455078125, + "learning_rate": 1.938990248663037e-05, + "loss": 0.4127, + "step": 6020 + }, + { + "epoch": 0.11179884376752078, + "grad_norm": 0.40406933426856995, + "learning_rate": 1.938950121555755e-05, + "loss": 0.353, + "step": 6022 + }, + { + "epoch": 0.11183597390493942, + "grad_norm": 0.42639797925949097, + "learning_rate": 1.9389099816721742e-05, + "loss": 0.232, + "step": 6024 + }, + { + "epoch": 0.11187310404235806, + "grad_norm": 0.5960007309913635, + "learning_rate": 1.9388698290128406e-05, + "loss": 0.3347, + "step": 6026 + }, + { + "epoch": 0.1119102341797767, + "grad_norm": 0.451932817697525, + "learning_rate": 1.938829663578301e-05, + "loss": 0.3026, + "step": 6028 + }, + { + "epoch": 0.11194736431719533, + "grad_norm": 0.46730172634124756, + "learning_rate": 1.938789485369102e-05, + "loss": 0.2846, + "step": 6030 + }, + { + "epoch": 0.11198449445461398, + "grad_norm": 0.3319157361984253, + "learning_rate": 1.9387492943857902e-05, + "loss": 0.2332, + "step": 6032 + }, + { + "epoch": 0.11202162459203262, + "grad_norm": 0.4053408205509186, + "learning_rate": 1.9387090906289118e-05, + "loss": 0.2424, + "step": 6034 + }, + { + "epoch": 0.11205875472945126, + "grad_norm": 0.31995534896850586, + "learning_rate": 1.938668874099015e-05, + "loss": 0.3918, + "step": 6036 + }, + { + "epoch": 0.11209588486686989, + "grad_norm": 0.8785678744316101, + "learning_rate": 1.9386286447966465e-05, + "loss": 0.2888, + "step": 6038 + }, + { + "epoch": 0.11213301500428853, + "grad_norm": 0.3702390491962433, + "learning_rate": 1.938588402722353e-05, + "loss": 0.1873, + "step": 6040 + }, + { + "epoch": 0.11217014514170717, + "grad_norm": 3.35353422164917, + "learning_rate": 1.9385481478766834e-05, + "loss": 0.3397, + "step": 6042 + }, + { + "epoch": 0.11220727527912581, + "grad_norm": 0.3046782612800598, + "learning_rate": 1.9385078802601846e-05, + "loss": 0.3496, + "step": 6044 + }, + { + "epoch": 0.11224440541654444, + "grad_norm": 0.5030434727668762, + "learning_rate": 1.9384675998734047e-05, + "loss": 0.3792, + "step": 6046 + }, + { + "epoch": 0.11228153555396309, + "grad_norm": 0.3410714566707611, + "learning_rate": 1.938427306716892e-05, + "loss": 0.3238, + "step": 6048 + }, + { + "epoch": 0.11231866569138173, + "grad_norm": 0.4026999771595001, + "learning_rate": 1.9383870007911942e-05, + "loss": 0.4002, + "step": 6050 + }, + { + "epoch": 0.11235579582880036, + "grad_norm": 0.32048147916793823, + "learning_rate": 1.93834668209686e-05, + "loss": 0.4739, + "step": 6052 + }, + { + "epoch": 0.112392925966219, + "grad_norm": 0.27759453654289246, + "learning_rate": 1.9383063506344386e-05, + "loss": 0.2648, + "step": 6054 + }, + { + "epoch": 0.11243005610363764, + "grad_norm": 0.6055189371109009, + "learning_rate": 1.9382660064044782e-05, + "loss": 0.3572, + "step": 6056 + }, + { + "epoch": 0.11246718624105628, + "grad_norm": 0.40630897879600525, + "learning_rate": 1.938225649407528e-05, + "loss": 0.4055, + "step": 6058 + }, + { + "epoch": 0.11250431637847491, + "grad_norm": 0.3931969404220581, + "learning_rate": 1.9381852796441367e-05, + "loss": 0.2969, + "step": 6060 + }, + { + "epoch": 0.11254144651589355, + "grad_norm": 0.40135693550109863, + "learning_rate": 1.938144897114854e-05, + "loss": 0.5507, + "step": 6062 + }, + { + "epoch": 0.1125785766533122, + "grad_norm": 0.2813001871109009, + "learning_rate": 1.9381045018202294e-05, + "loss": 0.2489, + "step": 6064 + }, + { + "epoch": 0.11261570679073084, + "grad_norm": 0.3997330963611603, + "learning_rate": 1.9380640937608123e-05, + "loss": 0.3273, + "step": 6066 + }, + { + "epoch": 0.11265283692814947, + "grad_norm": 0.32363638281822205, + "learning_rate": 1.9380236729371532e-05, + "loss": 0.2368, + "step": 6068 + }, + { + "epoch": 0.11268996706556811, + "grad_norm": 0.5048516392707825, + "learning_rate": 1.937983239349801e-05, + "loss": 0.4573, + "step": 6070 + }, + { + "epoch": 0.11272709720298675, + "grad_norm": 0.38184475898742676, + "learning_rate": 1.937942792999307e-05, + "loss": 0.1789, + "step": 6072 + }, + { + "epoch": 0.11276422734040539, + "grad_norm": 0.24894502758979797, + "learning_rate": 1.937902333886221e-05, + "loss": 0.6135, + "step": 6074 + }, + { + "epoch": 0.11280135747782402, + "grad_norm": 0.29433685541152954, + "learning_rate": 1.937861862011093e-05, + "loss": 0.2062, + "step": 6076 + }, + { + "epoch": 0.11283848761524266, + "grad_norm": 0.34825021028518677, + "learning_rate": 1.937821377374475e-05, + "loss": 0.4458, + "step": 6078 + }, + { + "epoch": 0.1128756177526613, + "grad_norm": 0.39847344160079956, + "learning_rate": 1.9377808799769167e-05, + "loss": 0.2596, + "step": 6080 + }, + { + "epoch": 0.11291274789007995, + "grad_norm": 0.36373013257980347, + "learning_rate": 1.9377403698189695e-05, + "loss": 0.4199, + "step": 6082 + }, + { + "epoch": 0.11294987802749858, + "grad_norm": 0.3375605046749115, + "learning_rate": 1.9376998469011852e-05, + "loss": 0.4184, + "step": 6084 + }, + { + "epoch": 0.11298700816491722, + "grad_norm": 0.4313795268535614, + "learning_rate": 1.9376593112241143e-05, + "loss": 0.211, + "step": 6086 + }, + { + "epoch": 0.11302413830233586, + "grad_norm": 0.4363456666469574, + "learning_rate": 1.937618762788309e-05, + "loss": 0.3217, + "step": 6088 + }, + { + "epoch": 0.11306126843975449, + "grad_norm": 0.24281597137451172, + "learning_rate": 1.937578201594321e-05, + "loss": 0.3087, + "step": 6090 + }, + { + "epoch": 0.11309839857717313, + "grad_norm": 0.29096898436546326, + "learning_rate": 1.937537627642702e-05, + "loss": 0.3114, + "step": 6092 + }, + { + "epoch": 0.11313552871459177, + "grad_norm": 0.29845109581947327, + "learning_rate": 1.937497040934004e-05, + "loss": 0.3343, + "step": 6094 + }, + { + "epoch": 0.11317265885201042, + "grad_norm": 0.3539322316646576, + "learning_rate": 1.9374564414687792e-05, + "loss": 0.2617, + "step": 6096 + }, + { + "epoch": 0.11320978898942904, + "grad_norm": 0.3430907428264618, + "learning_rate": 1.9374158292475805e-05, + "loss": 0.2286, + "step": 6098 + }, + { + "epoch": 0.11324691912684769, + "grad_norm": 0.41140106320381165, + "learning_rate": 1.9373752042709603e-05, + "loss": 0.4725, + "step": 6100 + }, + { + "epoch": 0.11328404926426633, + "grad_norm": 0.3160984218120575, + "learning_rate": 1.937334566539471e-05, + "loss": 0.2723, + "step": 6102 + }, + { + "epoch": 0.11332117940168497, + "grad_norm": 0.3963102400302887, + "learning_rate": 1.9372939160536663e-05, + "loss": 0.3138, + "step": 6104 + }, + { + "epoch": 0.1133583095391036, + "grad_norm": 0.3327803909778595, + "learning_rate": 1.937253252814099e-05, + "loss": 0.4306, + "step": 6106 + }, + { + "epoch": 0.11339543967652224, + "grad_norm": 0.3599388897418976, + "learning_rate": 1.9372125768213217e-05, + "loss": 0.4068, + "step": 6108 + }, + { + "epoch": 0.11343256981394088, + "grad_norm": 0.513262152671814, + "learning_rate": 1.937171888075889e-05, + "loss": 0.2592, + "step": 6110 + }, + { + "epoch": 0.11346969995135953, + "grad_norm": 0.43570563197135925, + "learning_rate": 1.9371311865783538e-05, + "loss": 0.2968, + "step": 6112 + }, + { + "epoch": 0.11350683008877815, + "grad_norm": 0.2661902606487274, + "learning_rate": 1.9370904723292705e-05, + "loss": 0.4131, + "step": 6114 + }, + { + "epoch": 0.1135439602261968, + "grad_norm": 0.23212552070617676, + "learning_rate": 1.9370497453291923e-05, + "loss": 0.3206, + "step": 6116 + }, + { + "epoch": 0.11358109036361544, + "grad_norm": 0.394327312707901, + "learning_rate": 1.9370090055786744e-05, + "loss": 0.4353, + "step": 6118 + }, + { + "epoch": 0.11361822050103408, + "grad_norm": 0.3261694014072418, + "learning_rate": 1.9369682530782703e-05, + "loss": 0.393, + "step": 6120 + }, + { + "epoch": 0.11365535063845271, + "grad_norm": 0.4460222125053406, + "learning_rate": 1.936927487828535e-05, + "loss": 0.3759, + "step": 6122 + }, + { + "epoch": 0.11369248077587135, + "grad_norm": 0.28853529691696167, + "learning_rate": 1.9368867098300227e-05, + "loss": 0.255, + "step": 6124 + }, + { + "epoch": 0.11372961091329, + "grad_norm": 0.2908032238483429, + "learning_rate": 1.9368459190832888e-05, + "loss": 0.3659, + "step": 6126 + }, + { + "epoch": 0.11376674105070862, + "grad_norm": 0.37115544080734253, + "learning_rate": 1.936805115588888e-05, + "loss": 0.2839, + "step": 6128 + }, + { + "epoch": 0.11380387118812726, + "grad_norm": 0.35863929986953735, + "learning_rate": 1.9367642993473755e-05, + "loss": 0.2969, + "step": 6130 + }, + { + "epoch": 0.1138410013255459, + "grad_norm": 0.3698493540287018, + "learning_rate": 1.9367234703593074e-05, + "loss": 0.2526, + "step": 6132 + }, + { + "epoch": 0.11387813146296455, + "grad_norm": 0.35082119703292847, + "learning_rate": 1.9366826286252382e-05, + "loss": 0.3339, + "step": 6134 + }, + { + "epoch": 0.11391526160038318, + "grad_norm": 0.44319337606430054, + "learning_rate": 1.9366417741457244e-05, + "loss": 0.3098, + "step": 6136 + }, + { + "epoch": 0.11395239173780182, + "grad_norm": 0.38933265209198, + "learning_rate": 1.9366009069213212e-05, + "loss": 0.4016, + "step": 6138 + }, + { + "epoch": 0.11398952187522046, + "grad_norm": 0.31285473704338074, + "learning_rate": 1.9365600269525856e-05, + "loss": 0.2897, + "step": 6140 + }, + { + "epoch": 0.1140266520126391, + "grad_norm": 0.2849639058113098, + "learning_rate": 1.936519134240073e-05, + "loss": 0.3427, + "step": 6142 + }, + { + "epoch": 0.11406378215005773, + "grad_norm": 0.3146854639053345, + "learning_rate": 1.9364782287843404e-05, + "loss": 0.2635, + "step": 6144 + }, + { + "epoch": 0.11410091228747637, + "grad_norm": 0.30868664383888245, + "learning_rate": 1.936437310585944e-05, + "loss": 0.3033, + "step": 6146 + }, + { + "epoch": 0.11413804242489502, + "grad_norm": 0.3137667775154114, + "learning_rate": 1.936396379645441e-05, + "loss": 0.2959, + "step": 6148 + }, + { + "epoch": 0.11417517256231366, + "grad_norm": 0.3515711724758148, + "learning_rate": 1.9363554359633875e-05, + "loss": 0.2523, + "step": 6150 + }, + { + "epoch": 0.11421230269973229, + "grad_norm": 0.42071786522865295, + "learning_rate": 1.936314479540342e-05, + "loss": 0.4516, + "step": 6152 + }, + { + "epoch": 0.11424943283715093, + "grad_norm": 0.43328598141670227, + "learning_rate": 1.9362735103768606e-05, + "loss": 0.3506, + "step": 6154 + }, + { + "epoch": 0.11428656297456957, + "grad_norm": 0.35118716955184937, + "learning_rate": 1.9362325284735012e-05, + "loss": 0.314, + "step": 6156 + }, + { + "epoch": 0.11432369311198821, + "grad_norm": 0.33372414112091064, + "learning_rate": 1.9361915338308215e-05, + "loss": 0.4007, + "step": 6158 + }, + { + "epoch": 0.11436082324940684, + "grad_norm": 0.3123158812522888, + "learning_rate": 1.9361505264493794e-05, + "loss": 0.2068, + "step": 6160 + }, + { + "epoch": 0.11439795338682548, + "grad_norm": 0.4641788899898529, + "learning_rate": 1.9361095063297325e-05, + "loss": 0.408, + "step": 6162 + }, + { + "epoch": 0.11443508352424413, + "grad_norm": 0.32063692808151245, + "learning_rate": 1.9360684734724393e-05, + "loss": 0.3507, + "step": 6164 + }, + { + "epoch": 0.11447221366166276, + "grad_norm": 0.3549119234085083, + "learning_rate": 1.936027427878058e-05, + "loss": 0.2362, + "step": 6166 + }, + { + "epoch": 0.1145093437990814, + "grad_norm": 0.2539325952529907, + "learning_rate": 1.935986369547147e-05, + "loss": 0.3978, + "step": 6168 + }, + { + "epoch": 0.11454647393650004, + "grad_norm": 0.2352052628993988, + "learning_rate": 1.9359452984802655e-05, + "loss": 0.3838, + "step": 6170 + }, + { + "epoch": 0.11458360407391868, + "grad_norm": 0.42627835273742676, + "learning_rate": 1.9359042146779714e-05, + "loss": 0.4012, + "step": 6172 + }, + { + "epoch": 0.11462073421133731, + "grad_norm": 0.3156546354293823, + "learning_rate": 1.9358631181408246e-05, + "loss": 0.4486, + "step": 6174 + }, + { + "epoch": 0.11465786434875595, + "grad_norm": 0.3225153386592865, + "learning_rate": 1.9358220088693838e-05, + "loss": 0.3925, + "step": 6176 + }, + { + "epoch": 0.1146949944861746, + "grad_norm": 0.255907267332077, + "learning_rate": 1.935780886864209e-05, + "loss": 0.2269, + "step": 6178 + }, + { + "epoch": 0.11473212462359324, + "grad_norm": 0.3502480089664459, + "learning_rate": 1.935739752125859e-05, + "loss": 0.564, + "step": 6180 + }, + { + "epoch": 0.11476925476101187, + "grad_norm": 0.4209260642528534, + "learning_rate": 1.9356986046548942e-05, + "loss": 0.3796, + "step": 6182 + }, + { + "epoch": 0.11480638489843051, + "grad_norm": 0.4643157422542572, + "learning_rate": 1.9356574444518736e-05, + "loss": 0.2852, + "step": 6184 + }, + { + "epoch": 0.11484351503584915, + "grad_norm": 0.2948068082332611, + "learning_rate": 1.9356162715173582e-05, + "loss": 0.2803, + "step": 6186 + }, + { + "epoch": 0.11488064517326779, + "grad_norm": 0.3148733675479889, + "learning_rate": 1.935575085851908e-05, + "loss": 0.3623, + "step": 6188 + }, + { + "epoch": 0.11491777531068642, + "grad_norm": 0.24655990302562714, + "learning_rate": 1.935533887456083e-05, + "loss": 0.4077, + "step": 6190 + }, + { + "epoch": 0.11495490544810506, + "grad_norm": 0.31325700879096985, + "learning_rate": 1.935492676330444e-05, + "loss": 0.3261, + "step": 6192 + }, + { + "epoch": 0.1149920355855237, + "grad_norm": 0.28601422905921936, + "learning_rate": 1.9354514524755516e-05, + "loss": 0.3074, + "step": 6194 + }, + { + "epoch": 0.11502916572294235, + "grad_norm": 0.3692812919616699, + "learning_rate": 1.935410215891967e-05, + "loss": 0.2409, + "step": 6196 + }, + { + "epoch": 0.11506629586036098, + "grad_norm": 0.47791656851768494, + "learning_rate": 1.9353689665802514e-05, + "loss": 0.3236, + "step": 6198 + }, + { + "epoch": 0.11510342599777962, + "grad_norm": 0.37320658564567566, + "learning_rate": 1.9353277045409657e-05, + "loss": 0.3115, + "step": 6200 + }, + { + "epoch": 0.11514055613519826, + "grad_norm": 0.4265114367008209, + "learning_rate": 1.935286429774672e-05, + "loss": 0.5481, + "step": 6202 + }, + { + "epoch": 0.11517768627261689, + "grad_norm": 0.4482223689556122, + "learning_rate": 1.9352451422819312e-05, + "loss": 0.3222, + "step": 6204 + }, + { + "epoch": 0.11521481641003553, + "grad_norm": 0.44008082151412964, + "learning_rate": 1.9352038420633056e-05, + "loss": 0.3901, + "step": 6206 + }, + { + "epoch": 0.11525194654745417, + "grad_norm": 0.3360796868801117, + "learning_rate": 1.9351625291193564e-05, + "loss": 0.418, + "step": 6208 + }, + { + "epoch": 0.11528907668487282, + "grad_norm": 0.33378854393959045, + "learning_rate": 1.935121203450647e-05, + "loss": 0.4117, + "step": 6210 + }, + { + "epoch": 0.11532620682229144, + "grad_norm": 0.4222334027290344, + "learning_rate": 1.935079865057739e-05, + "loss": 0.5197, + "step": 6212 + }, + { + "epoch": 0.11536333695971009, + "grad_norm": 0.5520004630088806, + "learning_rate": 1.9350385139411946e-05, + "loss": 0.3127, + "step": 6214 + }, + { + "epoch": 0.11540046709712873, + "grad_norm": 0.32125914096832275, + "learning_rate": 1.9349971501015767e-05, + "loss": 0.4435, + "step": 6216 + }, + { + "epoch": 0.11543759723454737, + "grad_norm": 0.3034518361091614, + "learning_rate": 1.9349557735394483e-05, + "loss": 0.1763, + "step": 6218 + }, + { + "epoch": 0.115474727371966, + "grad_norm": 0.29721304774284363, + "learning_rate": 1.9349143842553727e-05, + "loss": 0.2028, + "step": 6220 + }, + { + "epoch": 0.11551185750938464, + "grad_norm": 0.2339998483657837, + "learning_rate": 1.9348729822499125e-05, + "loss": 0.274, + "step": 6222 + }, + { + "epoch": 0.11554898764680328, + "grad_norm": 0.32831862568855286, + "learning_rate": 1.9348315675236312e-05, + "loss": 0.37, + "step": 6224 + }, + { + "epoch": 0.11558611778422193, + "grad_norm": 0.3146616816520691, + "learning_rate": 1.9347901400770923e-05, + "loss": 0.2613, + "step": 6226 + }, + { + "epoch": 0.11562324792164055, + "grad_norm": 0.3267780840396881, + "learning_rate": 1.93474869991086e-05, + "loss": 0.3033, + "step": 6228 + }, + { + "epoch": 0.1156603780590592, + "grad_norm": 0.3073938488960266, + "learning_rate": 1.9347072470254976e-05, + "loss": 0.3981, + "step": 6230 + }, + { + "epoch": 0.11569750819647784, + "grad_norm": 0.5259581804275513, + "learning_rate": 1.9346657814215694e-05, + "loss": 0.3399, + "step": 6232 + }, + { + "epoch": 0.11573463833389647, + "grad_norm": 0.3002197742462158, + "learning_rate": 1.9346243030996394e-05, + "loss": 0.4149, + "step": 6234 + }, + { + "epoch": 0.11577176847131511, + "grad_norm": 0.3518536388874054, + "learning_rate": 1.934582812060272e-05, + "loss": 0.5392, + "step": 6236 + }, + { + "epoch": 0.11580889860873375, + "grad_norm": 0.4209156036376953, + "learning_rate": 1.934541308304032e-05, + "loss": 0.311, + "step": 6238 + }, + { + "epoch": 0.1158460287461524, + "grad_norm": 0.2754109501838684, + "learning_rate": 1.9344997918314846e-05, + "loss": 0.341, + "step": 6240 + }, + { + "epoch": 0.11588315888357102, + "grad_norm": 0.30275246500968933, + "learning_rate": 1.934458262643194e-05, + "loss": 0.2383, + "step": 6242 + }, + { + "epoch": 0.11592028902098966, + "grad_norm": 0.5595771670341492, + "learning_rate": 1.934416720739725e-05, + "loss": 0.2676, + "step": 6244 + }, + { + "epoch": 0.1159574191584083, + "grad_norm": 0.2599327862262726, + "learning_rate": 1.9343751661216435e-05, + "loss": 0.2845, + "step": 6246 + }, + { + "epoch": 0.11599454929582695, + "grad_norm": 0.34935829043388367, + "learning_rate": 1.934333598789515e-05, + "loss": 0.4269, + "step": 6248 + }, + { + "epoch": 0.11603167943324558, + "grad_norm": 0.35757824778556824, + "learning_rate": 1.9342920187439046e-05, + "loss": 0.2328, + "step": 6250 + }, + { + "epoch": 0.11606880957066422, + "grad_norm": 0.34092605113983154, + "learning_rate": 1.9342504259853788e-05, + "loss": 0.4343, + "step": 6252 + }, + { + "epoch": 0.11610593970808286, + "grad_norm": 0.3107389509677887, + "learning_rate": 1.9342088205145027e-05, + "loss": 0.3598, + "step": 6254 + }, + { + "epoch": 0.1161430698455015, + "grad_norm": 0.387880802154541, + "learning_rate": 1.934167202331843e-05, + "loss": 0.3883, + "step": 6256 + }, + { + "epoch": 0.11618019998292013, + "grad_norm": 0.40455198287963867, + "learning_rate": 1.9341255714379657e-05, + "loss": 0.4243, + "step": 6258 + }, + { + "epoch": 0.11621733012033877, + "grad_norm": 0.30802977085113525, + "learning_rate": 1.9340839278334376e-05, + "loss": 0.2851, + "step": 6260 + }, + { + "epoch": 0.11625446025775742, + "grad_norm": 0.30081719160079956, + "learning_rate": 1.934042271518825e-05, + "loss": 0.3266, + "step": 6262 + }, + { + "epoch": 0.11629159039517606, + "grad_norm": 0.4688226878643036, + "learning_rate": 1.9340006024946947e-05, + "loss": 0.4721, + "step": 6264 + }, + { + "epoch": 0.11632872053259469, + "grad_norm": 0.34842449426651, + "learning_rate": 1.9339589207616144e-05, + "loss": 0.4076, + "step": 6266 + }, + { + "epoch": 0.11636585067001333, + "grad_norm": 0.861624538898468, + "learning_rate": 1.93391722632015e-05, + "loss": 0.3153, + "step": 6268 + }, + { + "epoch": 0.11640298080743197, + "grad_norm": 0.35545170307159424, + "learning_rate": 1.93387551917087e-05, + "loss": 0.2755, + "step": 6270 + }, + { + "epoch": 0.1164401109448506, + "grad_norm": 0.3339941203594208, + "learning_rate": 1.9338337993143415e-05, + "loss": 0.1827, + "step": 6272 + }, + { + "epoch": 0.11647724108226924, + "grad_norm": 0.3039368987083435, + "learning_rate": 1.933792066751132e-05, + "loss": 0.3806, + "step": 6274 + }, + { + "epoch": 0.11651437121968788, + "grad_norm": 0.3907169997692108, + "learning_rate": 1.9337503214818097e-05, + "loss": 0.2715, + "step": 6276 + }, + { + "epoch": 0.11655150135710653, + "grad_norm": 0.3075578212738037, + "learning_rate": 1.933708563506942e-05, + "loss": 0.3609, + "step": 6278 + }, + { + "epoch": 0.11658863149452516, + "grad_norm": 0.39553922414779663, + "learning_rate": 1.933666792827098e-05, + "loss": 0.3306, + "step": 6280 + }, + { + "epoch": 0.1166257616319438, + "grad_norm": 0.462171733379364, + "learning_rate": 1.9336250094428456e-05, + "loss": 0.3146, + "step": 6282 + }, + { + "epoch": 0.11666289176936244, + "grad_norm": 0.44297927618026733, + "learning_rate": 1.9335832133547527e-05, + "loss": 0.5232, + "step": 6284 + }, + { + "epoch": 0.11670002190678108, + "grad_norm": 0.32021188735961914, + "learning_rate": 1.9335414045633893e-05, + "loss": 0.3727, + "step": 6286 + }, + { + "epoch": 0.11673715204419971, + "grad_norm": 0.2519427537918091, + "learning_rate": 1.9334995830693234e-05, + "loss": 0.2275, + "step": 6288 + }, + { + "epoch": 0.11677428218161835, + "grad_norm": 0.3117574453353882, + "learning_rate": 1.9334577488731244e-05, + "loss": 0.4021, + "step": 6290 + }, + { + "epoch": 0.116811412319037, + "grad_norm": 0.33603936433792114, + "learning_rate": 1.9334159019753618e-05, + "loss": 0.3638, + "step": 6292 + }, + { + "epoch": 0.11684854245645564, + "grad_norm": 0.43508630990982056, + "learning_rate": 1.9333740423766044e-05, + "loss": 0.3672, + "step": 6294 + }, + { + "epoch": 0.11688567259387427, + "grad_norm": 0.49777570366859436, + "learning_rate": 1.933332170077422e-05, + "loss": 0.2723, + "step": 6296 + }, + { + "epoch": 0.11692280273129291, + "grad_norm": 0.32379472255706787, + "learning_rate": 1.9332902850783844e-05, + "loss": 0.4202, + "step": 6298 + }, + { + "epoch": 0.11695993286871155, + "grad_norm": 0.30513545870780945, + "learning_rate": 1.9332483873800613e-05, + "loss": 0.4536, + "step": 6300 + }, + { + "epoch": 0.11699706300613019, + "grad_norm": 0.400575190782547, + "learning_rate": 1.9332064769830237e-05, + "loss": 0.3342, + "step": 6302 + }, + { + "epoch": 0.11703419314354882, + "grad_norm": 0.34833985567092896, + "learning_rate": 1.9331645538878407e-05, + "loss": 0.4048, + "step": 6304 + }, + { + "epoch": 0.11707132328096746, + "grad_norm": 0.3945656418800354, + "learning_rate": 1.9331226180950835e-05, + "loss": 0.3248, + "step": 6306 + }, + { + "epoch": 0.1171084534183861, + "grad_norm": 0.3738468587398529, + "learning_rate": 1.9330806696053225e-05, + "loss": 0.2076, + "step": 6308 + }, + { + "epoch": 0.11714558355580473, + "grad_norm": 0.28878864645957947, + "learning_rate": 1.9330387084191285e-05, + "loss": 0.3897, + "step": 6310 + }, + { + "epoch": 0.11718271369322338, + "grad_norm": 0.311980277299881, + "learning_rate": 1.9329967345370724e-05, + "loss": 0.2539, + "step": 6312 + }, + { + "epoch": 0.11721984383064202, + "grad_norm": 0.30747777223587036, + "learning_rate": 1.932954747959725e-05, + "loss": 0.2191, + "step": 6314 + }, + { + "epoch": 0.11725697396806066, + "grad_norm": 0.3596014380455017, + "learning_rate": 1.9329127486876586e-05, + "loss": 0.3636, + "step": 6316 + }, + { + "epoch": 0.11729410410547929, + "grad_norm": 0.3481011986732483, + "learning_rate": 1.932870736721444e-05, + "loss": 0.24, + "step": 6318 + }, + { + "epoch": 0.11733123424289793, + "grad_norm": 0.28967753052711487, + "learning_rate": 1.9328287120616526e-05, + "loss": 0.3163, + "step": 6320 + }, + { + "epoch": 0.11736836438031657, + "grad_norm": 0.33237722516059875, + "learning_rate": 1.932786674708857e-05, + "loss": 0.3142, + "step": 6322 + }, + { + "epoch": 0.11740549451773521, + "grad_norm": 0.3374473750591278, + "learning_rate": 1.9327446246636285e-05, + "loss": 0.3372, + "step": 6324 + }, + { + "epoch": 0.11744262465515384, + "grad_norm": 0.2985314726829529, + "learning_rate": 1.9327025619265396e-05, + "loss": 0.261, + "step": 6326 + }, + { + "epoch": 0.11747975479257249, + "grad_norm": 0.3636707663536072, + "learning_rate": 1.9326604864981626e-05, + "loss": 0.3909, + "step": 6328 + }, + { + "epoch": 0.11751688492999113, + "grad_norm": 0.33055102825164795, + "learning_rate": 1.93261839837907e-05, + "loss": 0.3352, + "step": 6330 + }, + { + "epoch": 0.11755401506740977, + "grad_norm": 0.31929707527160645, + "learning_rate": 1.9325762975698346e-05, + "loss": 0.1707, + "step": 6332 + }, + { + "epoch": 0.1175911452048284, + "grad_norm": 0.3036884665489197, + "learning_rate": 1.9325341840710292e-05, + "loss": 0.2002, + "step": 6334 + }, + { + "epoch": 0.11762827534224704, + "grad_norm": 0.3538627624511719, + "learning_rate": 1.9324920578832267e-05, + "loss": 0.3487, + "step": 6336 + }, + { + "epoch": 0.11766540547966568, + "grad_norm": 0.31544238328933716, + "learning_rate": 1.9324499190070004e-05, + "loss": 0.3783, + "step": 6338 + }, + { + "epoch": 0.11770253561708433, + "grad_norm": 0.35008835792541504, + "learning_rate": 1.932407767442924e-05, + "loss": 0.3911, + "step": 6340 + }, + { + "epoch": 0.11773966575450295, + "grad_norm": 0.4592518210411072, + "learning_rate": 1.9323656031915706e-05, + "loss": 0.3756, + "step": 6342 + }, + { + "epoch": 0.1177767958919216, + "grad_norm": 0.4852827787399292, + "learning_rate": 1.932323426253514e-05, + "loss": 0.3631, + "step": 6344 + }, + { + "epoch": 0.11781392602934024, + "grad_norm": 0.39585623145103455, + "learning_rate": 1.932281236629328e-05, + "loss": 0.5045, + "step": 6346 + }, + { + "epoch": 0.11785105616675887, + "grad_norm": 0.2956748604774475, + "learning_rate": 1.9322390343195872e-05, + "loss": 0.4251, + "step": 6348 + }, + { + "epoch": 0.11788818630417751, + "grad_norm": 0.3633421063423157, + "learning_rate": 1.9321968193248654e-05, + "loss": 0.2381, + "step": 6350 + }, + { + "epoch": 0.11792531644159615, + "grad_norm": 0.4885057210922241, + "learning_rate": 1.932154591645737e-05, + "loss": 0.2246, + "step": 6352 + }, + { + "epoch": 0.11796244657901479, + "grad_norm": 0.1861577332019806, + "learning_rate": 1.9321123512827773e-05, + "loss": 0.1933, + "step": 6354 + }, + { + "epoch": 0.11799957671643342, + "grad_norm": 0.3718888759613037, + "learning_rate": 1.93207009823656e-05, + "loss": 0.4148, + "step": 6356 + }, + { + "epoch": 0.11803670685385206, + "grad_norm": 0.2409990131855011, + "learning_rate": 1.9320278325076607e-05, + "loss": 0.4167, + "step": 6358 + }, + { + "epoch": 0.1180738369912707, + "grad_norm": 0.34573447704315186, + "learning_rate": 1.9319855540966545e-05, + "loss": 0.2, + "step": 6360 + }, + { + "epoch": 0.11811096712868935, + "grad_norm": 0.21412226557731628, + "learning_rate": 1.9319432630041162e-05, + "loss": 0.5324, + "step": 6362 + }, + { + "epoch": 0.11814809726610798, + "grad_norm": 0.3289864659309387, + "learning_rate": 1.9319009592306217e-05, + "loss": 0.4534, + "step": 6364 + }, + { + "epoch": 0.11818522740352662, + "grad_norm": 0.32960912585258484, + "learning_rate": 1.9318586427767464e-05, + "loss": 0.5263, + "step": 6366 + }, + { + "epoch": 0.11822235754094526, + "grad_norm": 0.35673004388809204, + "learning_rate": 1.9318163136430665e-05, + "loss": 0.2807, + "step": 6368 + }, + { + "epoch": 0.1182594876783639, + "grad_norm": 0.33887341618537903, + "learning_rate": 1.9317739718301574e-05, + "loss": 0.2443, + "step": 6370 + }, + { + "epoch": 0.11829661781578253, + "grad_norm": 0.39766794443130493, + "learning_rate": 1.931731617338596e-05, + "loss": 0.4343, + "step": 6372 + }, + { + "epoch": 0.11833374795320117, + "grad_norm": 0.38862061500549316, + "learning_rate": 1.9316892501689577e-05, + "loss": 0.4005, + "step": 6374 + }, + { + "epoch": 0.11837087809061982, + "grad_norm": 0.36087262630462646, + "learning_rate": 1.9316468703218194e-05, + "loss": 0.2489, + "step": 6376 + }, + { + "epoch": 0.11840800822803846, + "grad_norm": 0.49925076961517334, + "learning_rate": 1.931604477797758e-05, + "loss": 0.2607, + "step": 6378 + }, + { + "epoch": 0.11844513836545709, + "grad_norm": 0.5335688591003418, + "learning_rate": 1.93156207259735e-05, + "loss": 0.4691, + "step": 6380 + }, + { + "epoch": 0.11848226850287573, + "grad_norm": 0.5479843020439148, + "learning_rate": 1.9315196547211727e-05, + "loss": 0.2915, + "step": 6382 + }, + { + "epoch": 0.11851939864029437, + "grad_norm": 0.32797855138778687, + "learning_rate": 1.931477224169803e-05, + "loss": 0.4484, + "step": 6384 + }, + { + "epoch": 0.118556528777713, + "grad_norm": 0.35125523805618286, + "learning_rate": 1.9314347809438184e-05, + "loss": 0.3748, + "step": 6386 + }, + { + "epoch": 0.11859365891513164, + "grad_norm": 0.40221527218818665, + "learning_rate": 1.9313923250437964e-05, + "loss": 0.4263, + "step": 6388 + }, + { + "epoch": 0.11863078905255028, + "grad_norm": 0.32278522849082947, + "learning_rate": 1.931349856470315e-05, + "loss": 0.4645, + "step": 6390 + }, + { + "epoch": 0.11866791918996893, + "grad_norm": 0.29107481241226196, + "learning_rate": 1.9313073752239514e-05, + "loss": 0.3998, + "step": 6392 + }, + { + "epoch": 0.11870504932738755, + "grad_norm": 0.3763576149940491, + "learning_rate": 1.931264881305284e-05, + "loss": 0.3354, + "step": 6394 + }, + { + "epoch": 0.1187421794648062, + "grad_norm": 0.45447906851768494, + "learning_rate": 1.9312223747148912e-05, + "loss": 0.22, + "step": 6396 + }, + { + "epoch": 0.11877930960222484, + "grad_norm": 0.38563793897628784, + "learning_rate": 1.9311798554533512e-05, + "loss": 0.1965, + "step": 6398 + }, + { + "epoch": 0.11881643973964348, + "grad_norm": 0.26056474447250366, + "learning_rate": 1.9311373235212426e-05, + "loss": 0.3718, + "step": 6400 + }, + { + "epoch": 0.11885356987706211, + "grad_norm": 0.461426705121994, + "learning_rate": 1.931094778919144e-05, + "loss": 0.4587, + "step": 6402 + }, + { + "epoch": 0.11889070001448075, + "grad_norm": 0.31230196356773376, + "learning_rate": 1.9310522216476344e-05, + "loss": 0.174, + "step": 6404 + }, + { + "epoch": 0.1189278301518994, + "grad_norm": 0.3929494321346283, + "learning_rate": 1.931009651707293e-05, + "loss": 0.3449, + "step": 6406 + }, + { + "epoch": 0.11896496028931804, + "grad_norm": 0.5228191018104553, + "learning_rate": 1.930967069098699e-05, + "loss": 0.282, + "step": 6408 + }, + { + "epoch": 0.11900209042673666, + "grad_norm": 0.3462566137313843, + "learning_rate": 1.9309244738224313e-05, + "loss": 0.4301, + "step": 6410 + }, + { + "epoch": 0.11903922056415531, + "grad_norm": 0.3149477243423462, + "learning_rate": 1.9308818658790703e-05, + "loss": 0.138, + "step": 6412 + }, + { + "epoch": 0.11907635070157395, + "grad_norm": 0.31245216727256775, + "learning_rate": 1.9308392452691955e-05, + "loss": 0.3328, + "step": 6414 + }, + { + "epoch": 0.11911348083899259, + "grad_norm": 1.0327508449554443, + "learning_rate": 1.9307966119933864e-05, + "loss": 0.2014, + "step": 6416 + }, + { + "epoch": 0.11915061097641122, + "grad_norm": 0.3724728226661682, + "learning_rate": 1.930753966052224e-05, + "loss": 0.3812, + "step": 6418 + }, + { + "epoch": 0.11918774111382986, + "grad_norm": 0.3237592279911041, + "learning_rate": 1.930711307446288e-05, + "loss": 0.3758, + "step": 6420 + }, + { + "epoch": 0.1192248712512485, + "grad_norm": 0.322488933801651, + "learning_rate": 1.9306686361761583e-05, + "loss": 0.3573, + "step": 6422 + }, + { + "epoch": 0.11926200138866713, + "grad_norm": 0.3465365767478943, + "learning_rate": 1.930625952242417e-05, + "loss": 0.2454, + "step": 6424 + }, + { + "epoch": 0.11929913152608577, + "grad_norm": 0.3229961097240448, + "learning_rate": 1.930583255645643e-05, + "loss": 0.406, + "step": 6426 + }, + { + "epoch": 0.11933626166350442, + "grad_norm": 0.42423954606056213, + "learning_rate": 1.930540546386419e-05, + "loss": 0.2641, + "step": 6428 + }, + { + "epoch": 0.11937339180092306, + "grad_norm": 0.38974571228027344, + "learning_rate": 1.9304978244653255e-05, + "loss": 0.3389, + "step": 6430 + }, + { + "epoch": 0.11941052193834169, + "grad_norm": 0.3108026683330536, + "learning_rate": 1.9304550898829433e-05, + "loss": 0.3768, + "step": 6432 + }, + { + "epoch": 0.11944765207576033, + "grad_norm": 0.29700908064842224, + "learning_rate": 1.9304123426398547e-05, + "loss": 0.2758, + "step": 6434 + }, + { + "epoch": 0.11948478221317897, + "grad_norm": 0.29016920924186707, + "learning_rate": 1.930369582736641e-05, + "loss": 0.3749, + "step": 6436 + }, + { + "epoch": 0.11952191235059761, + "grad_norm": 0.45442160964012146, + "learning_rate": 1.9303268101738843e-05, + "loss": 0.4117, + "step": 6438 + }, + { + "epoch": 0.11955904248801624, + "grad_norm": 0.3652421236038208, + "learning_rate": 1.9302840249521656e-05, + "loss": 0.3141, + "step": 6440 + }, + { + "epoch": 0.11959617262543489, + "grad_norm": 0.34132668375968933, + "learning_rate": 1.9302412270720684e-05, + "loss": 0.4183, + "step": 6442 + }, + { + "epoch": 0.11963330276285353, + "grad_norm": 0.26527708768844604, + "learning_rate": 1.9301984165341748e-05, + "loss": 0.3891, + "step": 6444 + }, + { + "epoch": 0.11967043290027217, + "grad_norm": 0.2945473790168762, + "learning_rate": 1.9301555933390665e-05, + "loss": 0.3966, + "step": 6446 + }, + { + "epoch": 0.1197075630376908, + "grad_norm": 0.2841329574584961, + "learning_rate": 1.930112757487327e-05, + "loss": 0.548, + "step": 6448 + }, + { + "epoch": 0.11974469317510944, + "grad_norm": 0.3590245842933655, + "learning_rate": 1.9300699089795386e-05, + "loss": 0.2249, + "step": 6450 + }, + { + "epoch": 0.11978182331252808, + "grad_norm": 0.47244569659233093, + "learning_rate": 1.930027047816285e-05, + "loss": 0.36, + "step": 6452 + }, + { + "epoch": 0.11981895344994672, + "grad_norm": 0.3336014449596405, + "learning_rate": 1.9299841739981488e-05, + "loss": 0.2288, + "step": 6454 + }, + { + "epoch": 0.11985608358736535, + "grad_norm": 0.2977416217327118, + "learning_rate": 1.9299412875257135e-05, + "loss": 0.3897, + "step": 6456 + }, + { + "epoch": 0.119893213724784, + "grad_norm": 0.38169124722480774, + "learning_rate": 1.929898388399563e-05, + "loss": 0.4117, + "step": 6458 + }, + { + "epoch": 0.11993034386220264, + "grad_norm": 0.3038351833820343, + "learning_rate": 1.9298554766202805e-05, + "loss": 0.3731, + "step": 6460 + }, + { + "epoch": 0.11996747399962127, + "grad_norm": 0.3170458674430847, + "learning_rate": 1.9298125521884506e-05, + "loss": 0.2526, + "step": 6462 + }, + { + "epoch": 0.12000460413703991, + "grad_norm": 0.344821035861969, + "learning_rate": 1.9297696151046567e-05, + "loss": 0.2507, + "step": 6464 + }, + { + "epoch": 0.12004173427445855, + "grad_norm": 0.322478711605072, + "learning_rate": 1.9297266653694834e-05, + "loss": 0.4167, + "step": 6466 + }, + { + "epoch": 0.12007886441187719, + "grad_norm": 0.41737422347068787, + "learning_rate": 1.929683702983515e-05, + "loss": 0.4625, + "step": 6468 + }, + { + "epoch": 0.12011599454929582, + "grad_norm": 0.2380305379629135, + "learning_rate": 1.929640727947336e-05, + "loss": 0.2276, + "step": 6470 + }, + { + "epoch": 0.12015312468671446, + "grad_norm": 0.3471153974533081, + "learning_rate": 1.9295977402615316e-05, + "loss": 0.5055, + "step": 6472 + }, + { + "epoch": 0.1201902548241331, + "grad_norm": 0.32918861508369446, + "learning_rate": 1.9295547399266863e-05, + "loss": 0.3427, + "step": 6474 + }, + { + "epoch": 0.12022738496155175, + "grad_norm": 0.3176921010017395, + "learning_rate": 1.9295117269433853e-05, + "loss": 0.2522, + "step": 6476 + }, + { + "epoch": 0.12026451509897038, + "grad_norm": 0.3586351275444031, + "learning_rate": 1.929468701312214e-05, + "loss": 0.4417, + "step": 6478 + }, + { + "epoch": 0.12030164523638902, + "grad_norm": 0.3228225111961365, + "learning_rate": 1.9294256630337574e-05, + "loss": 0.2962, + "step": 6480 + }, + { + "epoch": 0.12033877537380766, + "grad_norm": 0.3039906322956085, + "learning_rate": 1.9293826121086022e-05, + "loss": 0.2923, + "step": 6482 + }, + { + "epoch": 0.1203759055112263, + "grad_norm": 0.2316453754901886, + "learning_rate": 1.929339548537333e-05, + "loss": 0.4776, + "step": 6484 + }, + { + "epoch": 0.12041303564864493, + "grad_norm": 0.3888653814792633, + "learning_rate": 1.929296472320536e-05, + "loss": 0.3179, + "step": 6486 + }, + { + "epoch": 0.12045016578606357, + "grad_norm": 0.26490679383277893, + "learning_rate": 1.929253383458798e-05, + "loss": 0.4568, + "step": 6488 + }, + { + "epoch": 0.12048729592348222, + "grad_norm": 0.5689661502838135, + "learning_rate": 1.9292102819527046e-05, + "loss": 0.4179, + "step": 6490 + }, + { + "epoch": 0.12052442606090086, + "grad_norm": 0.22351336479187012, + "learning_rate": 1.929167167802843e-05, + "loss": 0.2891, + "step": 6492 + }, + { + "epoch": 0.12056155619831949, + "grad_norm": 0.37877222895622253, + "learning_rate": 1.9291240410097986e-05, + "loss": 0.4748, + "step": 6494 + }, + { + "epoch": 0.12059868633573813, + "grad_norm": 0.46963584423065186, + "learning_rate": 1.92908090157416e-05, + "loss": 0.3359, + "step": 6496 + }, + { + "epoch": 0.12063581647315677, + "grad_norm": 0.4761298894882202, + "learning_rate": 1.9290377494965127e-05, + "loss": 0.3705, + "step": 6498 + }, + { + "epoch": 0.1206729466105754, + "grad_norm": 0.3215329349040985, + "learning_rate": 1.9289945847774443e-05, + "loss": 0.3573, + "step": 6500 + }, + { + "epoch": 0.12071007674799404, + "grad_norm": 0.2952653765678406, + "learning_rate": 1.9289514074175424e-05, + "loss": 0.2789, + "step": 6502 + }, + { + "epoch": 0.12074720688541268, + "grad_norm": 0.35248643159866333, + "learning_rate": 1.9289082174173947e-05, + "loss": 0.3571, + "step": 6504 + }, + { + "epoch": 0.12078433702283133, + "grad_norm": 0.3120441138744354, + "learning_rate": 1.9288650147775882e-05, + "loss": 0.3292, + "step": 6506 + }, + { + "epoch": 0.12082146716024995, + "grad_norm": 0.21641826629638672, + "learning_rate": 1.9288217994987115e-05, + "loss": 0.2532, + "step": 6508 + }, + { + "epoch": 0.1208585972976686, + "grad_norm": 0.5590580701828003, + "learning_rate": 1.928778571581352e-05, + "loss": 0.2118, + "step": 6510 + }, + { + "epoch": 0.12089572743508724, + "grad_norm": 0.4010406732559204, + "learning_rate": 1.928735331026098e-05, + "loss": 0.3141, + "step": 6512 + }, + { + "epoch": 0.12093285757250588, + "grad_norm": 0.2877483367919922, + "learning_rate": 1.928692077833538e-05, + "loss": 0.3298, + "step": 6514 + }, + { + "epoch": 0.12096998770992451, + "grad_norm": 0.29856744408607483, + "learning_rate": 1.928648812004261e-05, + "loss": 0.296, + "step": 6516 + }, + { + "epoch": 0.12100711784734315, + "grad_norm": 0.29913046956062317, + "learning_rate": 1.9286055335388552e-05, + "loss": 0.4295, + "step": 6518 + }, + { + "epoch": 0.1210442479847618, + "grad_norm": 0.33249711990356445, + "learning_rate": 1.9285622424379093e-05, + "loss": 0.3854, + "step": 6520 + }, + { + "epoch": 0.12108137812218044, + "grad_norm": 0.29221323132514954, + "learning_rate": 1.9285189387020127e-05, + "loss": 0.3249, + "step": 6522 + }, + { + "epoch": 0.12111850825959906, + "grad_norm": 0.3873700499534607, + "learning_rate": 1.9284756223317548e-05, + "loss": 0.2973, + "step": 6524 + }, + { + "epoch": 0.1211556383970177, + "grad_norm": 0.35372382402420044, + "learning_rate": 1.928432293327725e-05, + "loss": 0.3401, + "step": 6526 + }, + { + "epoch": 0.12119276853443635, + "grad_norm": 0.40467023849487305, + "learning_rate": 1.9283889516905122e-05, + "loss": 0.1733, + "step": 6528 + }, + { + "epoch": 0.12122989867185499, + "grad_norm": 0.4137405753135681, + "learning_rate": 1.9283455974207066e-05, + "loss": 0.3361, + "step": 6530 + }, + { + "epoch": 0.12126702880927362, + "grad_norm": 0.46695342659950256, + "learning_rate": 1.9283022305188985e-05, + "loss": 0.345, + "step": 6532 + }, + { + "epoch": 0.12130415894669226, + "grad_norm": 0.4510466456413269, + "learning_rate": 1.9282588509856773e-05, + "loss": 0.3132, + "step": 6534 + }, + { + "epoch": 0.1213412890841109, + "grad_norm": 0.27598023414611816, + "learning_rate": 1.9282154588216335e-05, + "loss": 0.3661, + "step": 6536 + }, + { + "epoch": 0.12137841922152953, + "grad_norm": 0.38803061842918396, + "learning_rate": 1.928172054027358e-05, + "loss": 0.417, + "step": 6538 + }, + { + "epoch": 0.12141554935894817, + "grad_norm": 0.24103468656539917, + "learning_rate": 1.928128636603441e-05, + "loss": 0.4514, + "step": 6540 + }, + { + "epoch": 0.12145267949636682, + "grad_norm": 0.31650304794311523, + "learning_rate": 1.9280852065504733e-05, + "loss": 0.2983, + "step": 6542 + }, + { + "epoch": 0.12148980963378546, + "grad_norm": 0.3881564140319824, + "learning_rate": 1.9280417638690455e-05, + "loss": 0.4158, + "step": 6544 + }, + { + "epoch": 0.12152693977120409, + "grad_norm": 0.5013611316680908, + "learning_rate": 1.9279983085597492e-05, + "loss": 0.3397, + "step": 6546 + }, + { + "epoch": 0.12156406990862273, + "grad_norm": 0.6283319592475891, + "learning_rate": 1.927954840623176e-05, + "loss": 0.5536, + "step": 6548 + }, + { + "epoch": 0.12160120004604137, + "grad_norm": 0.304241806268692, + "learning_rate": 1.9279113600599166e-05, + "loss": 0.2746, + "step": 6550 + }, + { + "epoch": 0.12163833018346001, + "grad_norm": 0.5270283818244934, + "learning_rate": 1.9278678668705633e-05, + "loss": 0.3561, + "step": 6552 + }, + { + "epoch": 0.12167546032087864, + "grad_norm": 0.3649897575378418, + "learning_rate": 1.9278243610557075e-05, + "loss": 0.3251, + "step": 6554 + }, + { + "epoch": 0.12171259045829728, + "grad_norm": 0.3196565806865692, + "learning_rate": 1.927780842615941e-05, + "loss": 0.3917, + "step": 6556 + }, + { + "epoch": 0.12174972059571593, + "grad_norm": 0.2557104229927063, + "learning_rate": 1.9277373115518566e-05, + "loss": 0.3491, + "step": 6558 + }, + { + "epoch": 0.12178685073313457, + "grad_norm": 0.3482026755809784, + "learning_rate": 1.9276937678640462e-05, + "loss": 0.4229, + "step": 6560 + }, + { + "epoch": 0.1218239808705532, + "grad_norm": 0.39679408073425293, + "learning_rate": 1.9276502115531025e-05, + "loss": 0.2759, + "step": 6562 + }, + { + "epoch": 0.12186111100797184, + "grad_norm": 0.39061054587364197, + "learning_rate": 1.927606642619618e-05, + "loss": 0.3665, + "step": 6564 + }, + { + "epoch": 0.12189824114539048, + "grad_norm": 0.3834904730319977, + "learning_rate": 1.9275630610641855e-05, + "loss": 0.2948, + "step": 6566 + }, + { + "epoch": 0.12193537128280912, + "grad_norm": 0.3347572982311249, + "learning_rate": 1.927519466887398e-05, + "loss": 0.5463, + "step": 6568 + }, + { + "epoch": 0.12197250142022775, + "grad_norm": 0.3545336425304413, + "learning_rate": 1.927475860089849e-05, + "loss": 0.4329, + "step": 6570 + }, + { + "epoch": 0.1220096315576464, + "grad_norm": 0.3152683675289154, + "learning_rate": 1.9274322406721318e-05, + "loss": 0.4815, + "step": 6572 + }, + { + "epoch": 0.12204676169506504, + "grad_norm": 0.27598464488983154, + "learning_rate": 1.9273886086348396e-05, + "loss": 0.2405, + "step": 6574 + }, + { + "epoch": 0.12208389183248367, + "grad_norm": 0.1969933807849884, + "learning_rate": 1.927344963978566e-05, + "loss": 0.2258, + "step": 6576 + }, + { + "epoch": 0.12212102196990231, + "grad_norm": 0.3946920335292816, + "learning_rate": 1.9273013067039055e-05, + "loss": 0.3941, + "step": 6578 + }, + { + "epoch": 0.12215815210732095, + "grad_norm": 0.24420249462127686, + "learning_rate": 1.927257636811452e-05, + "loss": 0.4051, + "step": 6580 + }, + { + "epoch": 0.12219528224473959, + "grad_norm": 0.3174636662006378, + "learning_rate": 1.927213954301799e-05, + "loss": 0.2926, + "step": 6582 + }, + { + "epoch": 0.12223241238215822, + "grad_norm": 0.31497666239738464, + "learning_rate": 1.927170259175542e-05, + "loss": 0.2266, + "step": 6584 + }, + { + "epoch": 0.12226954251957686, + "grad_norm": 0.3753055930137634, + "learning_rate": 1.9271265514332745e-05, + "loss": 0.5234, + "step": 6586 + }, + { + "epoch": 0.1223066726569955, + "grad_norm": 0.4616703689098358, + "learning_rate": 1.927082831075592e-05, + "loss": 0.4288, + "step": 6588 + }, + { + "epoch": 0.12234380279441415, + "grad_norm": 0.37186765670776367, + "learning_rate": 1.9270390981030887e-05, + "loss": 0.1932, + "step": 6590 + }, + { + "epoch": 0.12238093293183278, + "grad_norm": 0.31633326411247253, + "learning_rate": 1.9269953525163604e-05, + "loss": 0.2587, + "step": 6592 + }, + { + "epoch": 0.12241806306925142, + "grad_norm": 0.4911516308784485, + "learning_rate": 1.926951594316002e-05, + "loss": 0.4691, + "step": 6594 + }, + { + "epoch": 0.12245519320667006, + "grad_norm": 0.44741296768188477, + "learning_rate": 1.9269078235026088e-05, + "loss": 0.2454, + "step": 6596 + }, + { + "epoch": 0.1224923233440887, + "grad_norm": 0.23290644586086273, + "learning_rate": 1.9268640400767767e-05, + "loss": 0.2905, + "step": 6598 + }, + { + "epoch": 0.12252945348150733, + "grad_norm": 0.3089956045150757, + "learning_rate": 1.926820244039101e-05, + "loss": 0.3888, + "step": 6600 + }, + { + "epoch": 0.12256658361892597, + "grad_norm": 0.25630542635917664, + "learning_rate": 1.9267764353901784e-05, + "loss": 0.4193, + "step": 6602 + }, + { + "epoch": 0.12260371375634461, + "grad_norm": 0.32781144976615906, + "learning_rate": 1.9267326141306043e-05, + "loss": 0.3036, + "step": 6604 + }, + { + "epoch": 0.12264084389376326, + "grad_norm": 0.3330346941947937, + "learning_rate": 1.926688780260975e-05, + "loss": 0.4617, + "step": 6606 + }, + { + "epoch": 0.12267797403118189, + "grad_norm": 0.3724346458911896, + "learning_rate": 1.9266449337818877e-05, + "loss": 0.2683, + "step": 6608 + }, + { + "epoch": 0.12271510416860053, + "grad_norm": 0.31460464000701904, + "learning_rate": 1.9266010746939378e-05, + "loss": 0.287, + "step": 6610 + }, + { + "epoch": 0.12275223430601917, + "grad_norm": 0.4137624204158783, + "learning_rate": 1.9265572029977232e-05, + "loss": 0.4386, + "step": 6612 + }, + { + "epoch": 0.1227893644434378, + "grad_norm": 0.39235442876815796, + "learning_rate": 1.9265133186938403e-05, + "loss": 0.5246, + "step": 6614 + }, + { + "epoch": 0.12282649458085644, + "grad_norm": 0.371348112821579, + "learning_rate": 1.9264694217828863e-05, + "loss": 0.2307, + "step": 6616 + }, + { + "epoch": 0.12286362471827508, + "grad_norm": 0.28955206274986267, + "learning_rate": 1.9264255122654587e-05, + "loss": 0.229, + "step": 6618 + }, + { + "epoch": 0.12290075485569373, + "grad_norm": 0.3146589398384094, + "learning_rate": 1.9263815901421547e-05, + "loss": 0.2618, + "step": 6620 + }, + { + "epoch": 0.12293788499311235, + "grad_norm": 0.34420573711395264, + "learning_rate": 1.926337655413572e-05, + "loss": 0.3771, + "step": 6622 + }, + { + "epoch": 0.122975015130531, + "grad_norm": 0.3516557812690735, + "learning_rate": 1.926293708080309e-05, + "loss": 0.3948, + "step": 6624 + }, + { + "epoch": 0.12301214526794964, + "grad_norm": 0.36902934312820435, + "learning_rate": 1.9262497481429626e-05, + "loss": 0.2671, + "step": 6626 + }, + { + "epoch": 0.12304927540536828, + "grad_norm": 0.34632790088653564, + "learning_rate": 1.926205775602132e-05, + "loss": 0.4492, + "step": 6628 + }, + { + "epoch": 0.12308640554278691, + "grad_norm": 0.3344527781009674, + "learning_rate": 1.926161790458415e-05, + "loss": 0.2571, + "step": 6630 + }, + { + "epoch": 0.12312353568020555, + "grad_norm": 0.37171030044555664, + "learning_rate": 1.9261177927124102e-05, + "loss": 0.4355, + "step": 6632 + }, + { + "epoch": 0.1231606658176242, + "grad_norm": 0.2987327575683594, + "learning_rate": 1.9260737823647162e-05, + "loss": 0.2807, + "step": 6634 + }, + { + "epoch": 0.12319779595504284, + "grad_norm": 0.6868311762809753, + "learning_rate": 1.9260297594159322e-05, + "loss": 0.3338, + "step": 6636 + }, + { + "epoch": 0.12323492609246146, + "grad_norm": 0.24660225212574005, + "learning_rate": 1.9259857238666567e-05, + "loss": 0.3914, + "step": 6638 + }, + { + "epoch": 0.1232720562298801, + "grad_norm": 0.2904195189476013, + "learning_rate": 1.9259416757174892e-05, + "loss": 0.2212, + "step": 6640 + }, + { + "epoch": 0.12330918636729875, + "grad_norm": 0.6242729425430298, + "learning_rate": 1.9258976149690286e-05, + "loss": 0.346, + "step": 6642 + }, + { + "epoch": 0.12334631650471739, + "grad_norm": 0.2929135262966156, + "learning_rate": 1.9258535416218754e-05, + "loss": 0.2867, + "step": 6644 + }, + { + "epoch": 0.12338344664213602, + "grad_norm": 0.36296072602272034, + "learning_rate": 1.9258094556766287e-05, + "loss": 0.1159, + "step": 6646 + }, + { + "epoch": 0.12342057677955466, + "grad_norm": 0.40695279836654663, + "learning_rate": 1.9257653571338883e-05, + "loss": 0.3907, + "step": 6648 + }, + { + "epoch": 0.1234577069169733, + "grad_norm": 0.3360540568828583, + "learning_rate": 1.925721245994254e-05, + "loss": 0.3343, + "step": 6650 + }, + { + "epoch": 0.12349483705439193, + "grad_norm": 0.31927624344825745, + "learning_rate": 1.925677122258327e-05, + "loss": 0.2943, + "step": 6652 + }, + { + "epoch": 0.12353196719181057, + "grad_norm": 0.3285635709762573, + "learning_rate": 1.925632985926707e-05, + "loss": 0.502, + "step": 6654 + }, + { + "epoch": 0.12356909732922922, + "grad_norm": 0.31849533319473267, + "learning_rate": 1.9255888369999946e-05, + "loss": 0.4344, + "step": 6656 + }, + { + "epoch": 0.12360622746664786, + "grad_norm": 0.3950842320919037, + "learning_rate": 1.9255446754787906e-05, + "loss": 0.2918, + "step": 6658 + }, + { + "epoch": 0.12364335760406649, + "grad_norm": 0.2804225981235504, + "learning_rate": 1.9255005013636958e-05, + "loss": 0.3223, + "step": 6660 + }, + { + "epoch": 0.12368048774148513, + "grad_norm": 0.3054465353488922, + "learning_rate": 1.9254563146553114e-05, + "loss": 0.3035, + "step": 6662 + }, + { + "epoch": 0.12371761787890377, + "grad_norm": 0.34982171654701233, + "learning_rate": 1.9254121153542384e-05, + "loss": 0.3633, + "step": 6664 + }, + { + "epoch": 0.12375474801632241, + "grad_norm": 0.39346396923065186, + "learning_rate": 1.9253679034610787e-05, + "loss": 0.4899, + "step": 6666 + }, + { + "epoch": 0.12379187815374104, + "grad_norm": 0.4542764723300934, + "learning_rate": 1.9253236789764337e-05, + "loss": 0.3445, + "step": 6668 + }, + { + "epoch": 0.12382900829115968, + "grad_norm": 0.3651010990142822, + "learning_rate": 1.9252794419009052e-05, + "loss": 0.28, + "step": 6670 + }, + { + "epoch": 0.12386613842857833, + "grad_norm": 0.27628299593925476, + "learning_rate": 1.9252351922350946e-05, + "loss": 0.3314, + "step": 6672 + }, + { + "epoch": 0.12390326856599697, + "grad_norm": 0.45232954621315, + "learning_rate": 1.925190929979605e-05, + "loss": 0.4632, + "step": 6674 + }, + { + "epoch": 0.1239403987034156, + "grad_norm": 0.37899380922317505, + "learning_rate": 1.925146655135037e-05, + "loss": 0.4119, + "step": 6676 + }, + { + "epoch": 0.12397752884083424, + "grad_norm": 0.25395578145980835, + "learning_rate": 1.9251023677019952e-05, + "loss": 0.3148, + "step": 6678 + }, + { + "epoch": 0.12401465897825288, + "grad_norm": 0.46651968359947205, + "learning_rate": 1.925058067681081e-05, + "loss": 0.3641, + "step": 6680 + }, + { + "epoch": 0.12405178911567152, + "grad_norm": 0.3114718198776245, + "learning_rate": 1.9250137550728972e-05, + "loss": 0.3062, + "step": 6682 + }, + { + "epoch": 0.12408891925309015, + "grad_norm": 0.31508609652519226, + "learning_rate": 1.924969429878047e-05, + "loss": 0.2802, + "step": 6684 + }, + { + "epoch": 0.1241260493905088, + "grad_norm": 0.38300344347953796, + "learning_rate": 1.9249250920971336e-05, + "loss": 0.3337, + "step": 6686 + }, + { + "epoch": 0.12416317952792744, + "grad_norm": 0.3081687390804291, + "learning_rate": 1.92488074173076e-05, + "loss": 0.4789, + "step": 6688 + }, + { + "epoch": 0.12420030966534606, + "grad_norm": 0.22241467237472534, + "learning_rate": 1.9248363787795297e-05, + "loss": 0.3204, + "step": 6690 + }, + { + "epoch": 0.12423743980276471, + "grad_norm": 0.2904149889945984, + "learning_rate": 1.9247920032440468e-05, + "loss": 0.4287, + "step": 6692 + }, + { + "epoch": 0.12427456994018335, + "grad_norm": 0.3065168559551239, + "learning_rate": 1.9247476151249147e-05, + "loss": 0.2763, + "step": 6694 + }, + { + "epoch": 0.12431170007760199, + "grad_norm": 0.3422154188156128, + "learning_rate": 1.9247032144227374e-05, + "loss": 0.513, + "step": 6696 + }, + { + "epoch": 0.12434883021502062, + "grad_norm": 0.4110053777694702, + "learning_rate": 1.9246588011381192e-05, + "loss": 0.4607, + "step": 6698 + }, + { + "epoch": 0.12438596035243926, + "grad_norm": 0.3829737603664398, + "learning_rate": 1.9246143752716645e-05, + "loss": 0.4163, + "step": 6700 + }, + { + "epoch": 0.1244230904898579, + "grad_norm": 0.32171669602394104, + "learning_rate": 1.9245699368239776e-05, + "loss": 0.7558, + "step": 6702 + }, + { + "epoch": 0.12446022062727655, + "grad_norm": 0.3709343373775482, + "learning_rate": 1.924525485795663e-05, + "loss": 0.4078, + "step": 6704 + }, + { + "epoch": 0.12449735076469517, + "grad_norm": 0.3441333770751953, + "learning_rate": 1.9244810221873264e-05, + "loss": 0.2391, + "step": 6706 + }, + { + "epoch": 0.12453448090211382, + "grad_norm": 0.4153997004032135, + "learning_rate": 1.924436545999572e-05, + "loss": 0.2241, + "step": 6708 + }, + { + "epoch": 0.12457161103953246, + "grad_norm": 0.29028961062431335, + "learning_rate": 1.9243920572330047e-05, + "loss": 0.2125, + "step": 6710 + }, + { + "epoch": 0.1246087411769511, + "grad_norm": 0.2682378888130188, + "learning_rate": 1.924347555888231e-05, + "loss": 0.2812, + "step": 6712 + }, + { + "epoch": 0.12464587131436973, + "grad_norm": 0.3279280662536621, + "learning_rate": 1.9243030419658554e-05, + "loss": 0.2952, + "step": 6714 + }, + { + "epoch": 0.12468300145178837, + "grad_norm": 0.36582037806510925, + "learning_rate": 1.9242585154664845e-05, + "loss": 0.3329, + "step": 6716 + }, + { + "epoch": 0.12472013158920701, + "grad_norm": 0.37971365451812744, + "learning_rate": 1.924213976390723e-05, + "loss": 0.2733, + "step": 6718 + }, + { + "epoch": 0.12475726172662566, + "grad_norm": 0.29044920206069946, + "learning_rate": 1.924169424739178e-05, + "loss": 0.2522, + "step": 6720 + }, + { + "epoch": 0.12479439186404429, + "grad_norm": 0.2184426337480545, + "learning_rate": 1.9241248605124555e-05, + "loss": 0.1542, + "step": 6722 + }, + { + "epoch": 0.12483152200146293, + "grad_norm": 0.3339032232761383, + "learning_rate": 1.924080283711162e-05, + "loss": 0.4775, + "step": 6724 + }, + { + "epoch": 0.12486865213888157, + "grad_norm": 0.36383771896362305, + "learning_rate": 1.924035694335903e-05, + "loss": 0.4332, + "step": 6726 + }, + { + "epoch": 0.1249057822763002, + "grad_norm": 0.32313409447669983, + "learning_rate": 1.9239910923872865e-05, + "loss": 0.3358, + "step": 6728 + }, + { + "epoch": 0.12494291241371884, + "grad_norm": 0.3498036861419678, + "learning_rate": 1.923946477865919e-05, + "loss": 0.2788, + "step": 6730 + }, + { + "epoch": 0.12498004255113748, + "grad_norm": 0.36773255467414856, + "learning_rate": 1.9239018507724074e-05, + "loss": 0.3664, + "step": 6732 + }, + { + "epoch": 0.1250171726885561, + "grad_norm": 0.38692253828048706, + "learning_rate": 1.9238572111073584e-05, + "loss": 0.4895, + "step": 6734 + }, + { + "epoch": 0.12505430282597477, + "grad_norm": 0.4416910409927368, + "learning_rate": 1.9238125588713807e-05, + "loss": 0.4031, + "step": 6736 + }, + { + "epoch": 0.1250914329633934, + "grad_norm": 0.320575475692749, + "learning_rate": 1.923767894065081e-05, + "loss": 0.273, + "step": 6738 + }, + { + "epoch": 0.12512856310081202, + "grad_norm": 0.32703378796577454, + "learning_rate": 1.923723216689067e-05, + "loss": 0.4847, + "step": 6740 + }, + { + "epoch": 0.12516569323823068, + "grad_norm": 0.43252208828926086, + "learning_rate": 1.9236785267439476e-05, + "loss": 0.3917, + "step": 6742 + }, + { + "epoch": 0.1252028233756493, + "grad_norm": 0.39739105105400085, + "learning_rate": 1.9236338242303297e-05, + "loss": 0.3036, + "step": 6744 + }, + { + "epoch": 0.12523995351306796, + "grad_norm": 0.4072605073451996, + "learning_rate": 1.923589109148822e-05, + "loss": 0.2088, + "step": 6746 + }, + { + "epoch": 0.1252770836504866, + "grad_norm": 0.361385315656662, + "learning_rate": 1.9235443815000333e-05, + "loss": 0.2747, + "step": 6748 + }, + { + "epoch": 0.12531421378790522, + "grad_norm": 0.2929092347621918, + "learning_rate": 1.9234996412845716e-05, + "loss": 0.2932, + "step": 6750 + }, + { + "epoch": 0.12535134392532388, + "grad_norm": 0.35604676604270935, + "learning_rate": 1.923454888503046e-05, + "loss": 0.315, + "step": 6752 + }, + { + "epoch": 0.1253884740627425, + "grad_norm": 0.2764253616333008, + "learning_rate": 1.9234101231560656e-05, + "loss": 0.3078, + "step": 6754 + }, + { + "epoch": 0.12542560420016113, + "grad_norm": 0.3609953224658966, + "learning_rate": 1.923365345244239e-05, + "loss": 0.2709, + "step": 6756 + }, + { + "epoch": 0.1254627343375798, + "grad_norm": 0.3713049590587616, + "learning_rate": 1.9233205547681764e-05, + "loss": 0.4512, + "step": 6758 + }, + { + "epoch": 0.12549986447499842, + "grad_norm": 0.34089988470077515, + "learning_rate": 1.9232757517284863e-05, + "loss": 0.4119, + "step": 6760 + }, + { + "epoch": 0.12553699461241707, + "grad_norm": 0.4533332288265228, + "learning_rate": 1.923230936125779e-05, + "loss": 0.5263, + "step": 6762 + }, + { + "epoch": 0.1255741247498357, + "grad_norm": 0.37296736240386963, + "learning_rate": 1.923186107960664e-05, + "loss": 0.2874, + "step": 6764 + }, + { + "epoch": 0.12561125488725433, + "grad_norm": 0.33240365982055664, + "learning_rate": 1.923141267233751e-05, + "loss": 0.2954, + "step": 6766 + }, + { + "epoch": 0.125648385024673, + "grad_norm": 0.2692447304725647, + "learning_rate": 1.9230964139456508e-05, + "loss": 0.469, + "step": 6768 + }, + { + "epoch": 0.12568551516209162, + "grad_norm": 0.29615044593811035, + "learning_rate": 1.9230515480969735e-05, + "loss": 0.4667, + "step": 6770 + }, + { + "epoch": 0.12572264529951024, + "grad_norm": 0.251888245344162, + "learning_rate": 1.9230066696883294e-05, + "loss": 0.3877, + "step": 6772 + }, + { + "epoch": 0.1257597754369289, + "grad_norm": 0.3574179708957672, + "learning_rate": 1.9229617787203293e-05, + "loss": 0.5043, + "step": 6774 + }, + { + "epoch": 0.12579690557434753, + "grad_norm": 0.3815130889415741, + "learning_rate": 1.9229168751935838e-05, + "loss": 0.3192, + "step": 6776 + }, + { + "epoch": 0.12583403571176616, + "grad_norm": 0.32082876563072205, + "learning_rate": 1.922871959108704e-05, + "loss": 0.2146, + "step": 6778 + }, + { + "epoch": 0.1258711658491848, + "grad_norm": 0.30080121755599976, + "learning_rate": 1.9228270304663014e-05, + "loss": 0.4461, + "step": 6780 + }, + { + "epoch": 0.12590829598660344, + "grad_norm": 0.28345707058906555, + "learning_rate": 1.922782089266987e-05, + "loss": 0.3742, + "step": 6782 + }, + { + "epoch": 0.1259454261240221, + "grad_norm": 0.4946710765361786, + "learning_rate": 1.9227371355113727e-05, + "loss": 0.2621, + "step": 6784 + }, + { + "epoch": 0.12598255626144073, + "grad_norm": 0.38305824995040894, + "learning_rate": 1.9226921692000698e-05, + "loss": 0.2707, + "step": 6786 + }, + { + "epoch": 0.12601968639885935, + "grad_norm": 0.26083871722221375, + "learning_rate": 1.9226471903336898e-05, + "loss": 0.3179, + "step": 6788 + }, + { + "epoch": 0.126056816536278, + "grad_norm": 0.32412534952163696, + "learning_rate": 1.9226021989128456e-05, + "loss": 0.3843, + "step": 6790 + }, + { + "epoch": 0.12609394667369664, + "grad_norm": 0.552943229675293, + "learning_rate": 1.922557194938149e-05, + "loss": 0.3888, + "step": 6792 + }, + { + "epoch": 0.12613107681111527, + "grad_norm": 0.4379805624485016, + "learning_rate": 1.9225121784102124e-05, + "loss": 0.4919, + "step": 6794 + }, + { + "epoch": 0.12616820694853392, + "grad_norm": 0.4324335753917694, + "learning_rate": 1.922467149329648e-05, + "loss": 0.451, + "step": 6796 + }, + { + "epoch": 0.12620533708595255, + "grad_norm": 0.46422454714775085, + "learning_rate": 1.922422107697069e-05, + "loss": 0.3353, + "step": 6798 + }, + { + "epoch": 0.1262424672233712, + "grad_norm": 0.466890811920166, + "learning_rate": 1.9223770535130878e-05, + "loss": 0.4307, + "step": 6800 + }, + { + "epoch": 0.12627959736078984, + "grad_norm": 0.27056217193603516, + "learning_rate": 1.9223319867783182e-05, + "loss": 0.426, + "step": 6802 + }, + { + "epoch": 0.12631672749820846, + "grad_norm": 0.28319016098976135, + "learning_rate": 1.922286907493373e-05, + "loss": 0.3493, + "step": 6804 + }, + { + "epoch": 0.12635385763562712, + "grad_norm": 0.4310556948184967, + "learning_rate": 1.922241815658865e-05, + "loss": 0.396, + "step": 6806 + }, + { + "epoch": 0.12639098777304575, + "grad_norm": 0.33706748485565186, + "learning_rate": 1.9221967112754085e-05, + "loss": 0.344, + "step": 6808 + }, + { + "epoch": 0.12642811791046438, + "grad_norm": 0.4150315821170807, + "learning_rate": 1.9221515943436172e-05, + "loss": 0.4606, + "step": 6810 + }, + { + "epoch": 0.12646524804788303, + "grad_norm": 0.33215463161468506, + "learning_rate": 1.922106464864105e-05, + "loss": 0.2838, + "step": 6812 + }, + { + "epoch": 0.12650237818530166, + "grad_norm": 0.33037644624710083, + "learning_rate": 1.9220613228374857e-05, + "loss": 0.2915, + "step": 6814 + }, + { + "epoch": 0.1265395083227203, + "grad_norm": 0.36261504888534546, + "learning_rate": 1.9220161682643736e-05, + "loss": 0.3475, + "step": 6816 + }, + { + "epoch": 0.12657663846013895, + "grad_norm": 0.2907375693321228, + "learning_rate": 1.9219710011453833e-05, + "loss": 0.4063, + "step": 6818 + }, + { + "epoch": 0.12661376859755757, + "grad_norm": 0.44252482056617737, + "learning_rate": 1.9219258214811295e-05, + "loss": 0.4219, + "step": 6820 + }, + { + "epoch": 0.12665089873497623, + "grad_norm": 0.34110334515571594, + "learning_rate": 1.9218806292722263e-05, + "loss": 0.1555, + "step": 6822 + }, + { + "epoch": 0.12668802887239486, + "grad_norm": 0.3758030831813812, + "learning_rate": 1.9218354245192894e-05, + "loss": 0.3817, + "step": 6824 + }, + { + "epoch": 0.1267251590098135, + "grad_norm": 0.40207135677337646, + "learning_rate": 1.9217902072229335e-05, + "loss": 0.3788, + "step": 6826 + }, + { + "epoch": 0.12676228914723214, + "grad_norm": 0.31781160831451416, + "learning_rate": 1.921744977383774e-05, + "loss": 0.3406, + "step": 6828 + }, + { + "epoch": 0.12679941928465077, + "grad_norm": 0.3182947337627411, + "learning_rate": 1.9216997350024264e-05, + "loss": 0.2978, + "step": 6830 + }, + { + "epoch": 0.1268365494220694, + "grad_norm": 0.34986546635627747, + "learning_rate": 1.9216544800795057e-05, + "loss": 0.3198, + "step": 6832 + }, + { + "epoch": 0.12687367955948806, + "grad_norm": 0.2990535795688629, + "learning_rate": 1.921609212615629e-05, + "loss": 0.2211, + "step": 6834 + }, + { + "epoch": 0.12691080969690668, + "grad_norm": 0.39816853404045105, + "learning_rate": 1.9215639326114108e-05, + "loss": 0.3219, + "step": 6836 + }, + { + "epoch": 0.12694793983432534, + "grad_norm": 0.2099897712469101, + "learning_rate": 1.9215186400674682e-05, + "loss": 0.3103, + "step": 6838 + }, + { + "epoch": 0.12698506997174397, + "grad_norm": 0.3141871988773346, + "learning_rate": 1.921473334984417e-05, + "loss": 0.369, + "step": 6840 + }, + { + "epoch": 0.1270222001091626, + "grad_norm": 0.4258623719215393, + "learning_rate": 1.9214280173628742e-05, + "loss": 0.4774, + "step": 6842 + }, + { + "epoch": 0.12705933024658125, + "grad_norm": 0.39120393991470337, + "learning_rate": 1.9213826872034558e-05, + "loss": 0.4062, + "step": 6844 + }, + { + "epoch": 0.12709646038399988, + "grad_norm": 0.3628799617290497, + "learning_rate": 1.921337344506779e-05, + "loss": 0.2356, + "step": 6846 + }, + { + "epoch": 0.1271335905214185, + "grad_norm": 0.383894145488739, + "learning_rate": 1.9212919892734605e-05, + "loss": 0.3235, + "step": 6848 + }, + { + "epoch": 0.12717072065883717, + "grad_norm": 0.5199154019355774, + "learning_rate": 1.9212466215041177e-05, + "loss": 0.3098, + "step": 6850 + }, + { + "epoch": 0.1272078507962558, + "grad_norm": 0.44096851348876953, + "learning_rate": 1.921201241199368e-05, + "loss": 0.3114, + "step": 6852 + }, + { + "epoch": 0.12724498093367442, + "grad_norm": 0.40052562952041626, + "learning_rate": 1.9211558483598285e-05, + "loss": 0.2748, + "step": 6854 + }, + { + "epoch": 0.12728211107109308, + "grad_norm": 0.37798264622688293, + "learning_rate": 1.921110442986117e-05, + "loss": 0.4314, + "step": 6856 + }, + { + "epoch": 0.1273192412085117, + "grad_norm": 0.3125547766685486, + "learning_rate": 1.9210650250788518e-05, + "loss": 0.455, + "step": 6858 + }, + { + "epoch": 0.12735637134593036, + "grad_norm": 0.4118659198284149, + "learning_rate": 1.9210195946386504e-05, + "loss": 0.31, + "step": 6860 + }, + { + "epoch": 0.127393501483349, + "grad_norm": 0.2762649357318878, + "learning_rate": 1.9209741516661308e-05, + "loss": 0.3856, + "step": 6862 + }, + { + "epoch": 0.12743063162076762, + "grad_norm": 0.39093881845474243, + "learning_rate": 1.9209286961619118e-05, + "loss": 0.3167, + "step": 6864 + }, + { + "epoch": 0.12746776175818628, + "grad_norm": 0.385725736618042, + "learning_rate": 1.920883228126612e-05, + "loss": 0.3749, + "step": 6866 + }, + { + "epoch": 0.1275048918956049, + "grad_norm": 0.29993143677711487, + "learning_rate": 1.9208377475608493e-05, + "loss": 0.4906, + "step": 6868 + }, + { + "epoch": 0.12754202203302353, + "grad_norm": 0.24928006529808044, + "learning_rate": 1.9207922544652434e-05, + "loss": 0.4518, + "step": 6870 + }, + { + "epoch": 0.1275791521704422, + "grad_norm": 0.33041566610336304, + "learning_rate": 1.920746748840413e-05, + "loss": 0.2216, + "step": 6872 + }, + { + "epoch": 0.12761628230786082, + "grad_norm": 0.41411134600639343, + "learning_rate": 1.920701230686977e-05, + "loss": 0.3536, + "step": 6874 + }, + { + "epoch": 0.12765341244527947, + "grad_norm": 0.6628223061561584, + "learning_rate": 1.9206557000055554e-05, + "loss": 0.3325, + "step": 6876 + }, + { + "epoch": 0.1276905425826981, + "grad_norm": 0.4323306083679199, + "learning_rate": 1.9206101567967675e-05, + "loss": 0.5036, + "step": 6878 + }, + { + "epoch": 0.12772767272011673, + "grad_norm": 0.3774072229862213, + "learning_rate": 1.9205646010612327e-05, + "loss": 0.3099, + "step": 6880 + }, + { + "epoch": 0.1277648028575354, + "grad_norm": 0.3012276887893677, + "learning_rate": 1.9205190327995714e-05, + "loss": 0.5046, + "step": 6882 + }, + { + "epoch": 0.12780193299495402, + "grad_norm": 0.3034154772758484, + "learning_rate": 1.920473452012403e-05, + "loss": 0.3681, + "step": 6884 + }, + { + "epoch": 0.12783906313237264, + "grad_norm": 0.2771282494068146, + "learning_rate": 1.9204278587003483e-05, + "loss": 0.1678, + "step": 6886 + }, + { + "epoch": 0.1278761932697913, + "grad_norm": 0.3034060299396515, + "learning_rate": 1.9203822528640273e-05, + "loss": 0.3596, + "step": 6888 + }, + { + "epoch": 0.12791332340720993, + "grad_norm": 0.5938284397125244, + "learning_rate": 1.9203366345040606e-05, + "loss": 0.369, + "step": 6890 + }, + { + "epoch": 0.12795045354462856, + "grad_norm": 0.32618650794029236, + "learning_rate": 1.9202910036210692e-05, + "loss": 0.196, + "step": 6892 + }, + { + "epoch": 0.1279875836820472, + "grad_norm": 0.3151942491531372, + "learning_rate": 1.9202453602156738e-05, + "loss": 0.1502, + "step": 6894 + }, + { + "epoch": 0.12802471381946584, + "grad_norm": 0.3502785265445709, + "learning_rate": 1.9201997042884955e-05, + "loss": 0.3537, + "step": 6896 + }, + { + "epoch": 0.1280618439568845, + "grad_norm": 0.45874011516571045, + "learning_rate": 1.9201540358401553e-05, + "loss": 0.3195, + "step": 6898 + }, + { + "epoch": 0.12809897409430313, + "grad_norm": 0.29599541425704956, + "learning_rate": 1.9201083548712753e-05, + "loss": 0.3321, + "step": 6900 + }, + { + "epoch": 0.12813610423172175, + "grad_norm": 0.4213782250881195, + "learning_rate": 1.920062661382476e-05, + "loss": 0.2932, + "step": 6902 + }, + { + "epoch": 0.1281732343691404, + "grad_norm": 0.37992042303085327, + "learning_rate": 1.9200169553743803e-05, + "loss": 0.4813, + "step": 6904 + }, + { + "epoch": 0.12821036450655904, + "grad_norm": 0.3845260441303253, + "learning_rate": 1.9199712368476094e-05, + "loss": 0.2097, + "step": 6906 + }, + { + "epoch": 0.12824749464397767, + "grad_norm": 0.2666932940483093, + "learning_rate": 1.9199255058027857e-05, + "loss": 0.3519, + "step": 6908 + }, + { + "epoch": 0.12828462478139632, + "grad_norm": 0.41257423162460327, + "learning_rate": 1.9198797622405317e-05, + "loss": 0.272, + "step": 6910 + }, + { + "epoch": 0.12832175491881495, + "grad_norm": 0.31530383229255676, + "learning_rate": 1.919834006161469e-05, + "loss": 0.4329, + "step": 6912 + }, + { + "epoch": 0.12835888505623358, + "grad_norm": 0.284525603055954, + "learning_rate": 1.9197882375662208e-05, + "loss": 0.2638, + "step": 6914 + }, + { + "epoch": 0.12839601519365224, + "grad_norm": 0.3915586769580841, + "learning_rate": 1.91974245645541e-05, + "loss": 0.4098, + "step": 6916 + }, + { + "epoch": 0.12843314533107086, + "grad_norm": 0.2885100245475769, + "learning_rate": 1.919696662829659e-05, + "loss": 0.2487, + "step": 6918 + }, + { + "epoch": 0.12847027546848952, + "grad_norm": 0.40493106842041016, + "learning_rate": 1.9196508566895913e-05, + "loss": 0.4005, + "step": 6920 + }, + { + "epoch": 0.12850740560590815, + "grad_norm": 0.3438049554824829, + "learning_rate": 1.91960503803583e-05, + "loss": 0.2687, + "step": 6922 + }, + { + "epoch": 0.12854453574332678, + "grad_norm": 0.6170334219932556, + "learning_rate": 1.919559206868999e-05, + "loss": 0.3819, + "step": 6924 + }, + { + "epoch": 0.12858166588074543, + "grad_norm": 0.5131980776786804, + "learning_rate": 1.9195133631897213e-05, + "loss": 0.2414, + "step": 6926 + }, + { + "epoch": 0.12861879601816406, + "grad_norm": 0.45458874106407166, + "learning_rate": 1.9194675069986212e-05, + "loss": 0.442, + "step": 6928 + }, + { + "epoch": 0.1286559261555827, + "grad_norm": 0.3635667562484741, + "learning_rate": 1.9194216382963223e-05, + "loss": 0.2415, + "step": 6930 + }, + { + "epoch": 0.12869305629300135, + "grad_norm": 0.38065195083618164, + "learning_rate": 1.9193757570834492e-05, + "loss": 0.4177, + "step": 6932 + }, + { + "epoch": 0.12873018643041997, + "grad_norm": 0.3185134530067444, + "learning_rate": 1.9193298633606258e-05, + "loss": 0.3858, + "step": 6934 + }, + { + "epoch": 0.12876731656783863, + "grad_norm": 0.2842084467411041, + "learning_rate": 1.9192839571284763e-05, + "loss": 0.2413, + "step": 6936 + }, + { + "epoch": 0.12880444670525726, + "grad_norm": 0.2919422388076782, + "learning_rate": 1.9192380383876258e-05, + "loss": 0.346, + "step": 6938 + }, + { + "epoch": 0.1288415768426759, + "grad_norm": 0.30367588996887207, + "learning_rate": 1.919192107138699e-05, + "loss": 0.4438, + "step": 6940 + }, + { + "epoch": 0.12887870698009454, + "grad_norm": 0.2364528626203537, + "learning_rate": 1.9191461633823215e-05, + "loss": 0.2612, + "step": 6942 + }, + { + "epoch": 0.12891583711751317, + "grad_norm": 0.2886956036090851, + "learning_rate": 1.9191002071191173e-05, + "loss": 0.2756, + "step": 6944 + }, + { + "epoch": 0.1289529672549318, + "grad_norm": 0.31795135140419006, + "learning_rate": 1.9190542383497125e-05, + "loss": 0.3673, + "step": 6946 + }, + { + "epoch": 0.12899009739235046, + "grad_norm": 0.34532830119132996, + "learning_rate": 1.9190082570747322e-05, + "loss": 0.4501, + "step": 6948 + }, + { + "epoch": 0.12902722752976908, + "grad_norm": 0.34968647360801697, + "learning_rate": 1.9189622632948026e-05, + "loss": 0.3151, + "step": 6950 + }, + { + "epoch": 0.1290643576671877, + "grad_norm": 0.5368404984474182, + "learning_rate": 1.918916257010549e-05, + "loss": 0.4925, + "step": 6952 + }, + { + "epoch": 0.12910148780460637, + "grad_norm": 0.45469728112220764, + "learning_rate": 1.918870238222598e-05, + "loss": 0.2654, + "step": 6954 + }, + { + "epoch": 0.129138617942025, + "grad_norm": 0.3833331763744354, + "learning_rate": 1.918824206931575e-05, + "loss": 0.3778, + "step": 6956 + }, + { + "epoch": 0.12917574807944365, + "grad_norm": 0.9119570851325989, + "learning_rate": 1.9187781631381067e-05, + "loss": 0.4553, + "step": 6958 + }, + { + "epoch": 0.12921287821686228, + "grad_norm": 0.2994924783706665, + "learning_rate": 1.91873210684282e-05, + "loss": 0.3078, + "step": 6960 + }, + { + "epoch": 0.1292500083542809, + "grad_norm": 0.35223865509033203, + "learning_rate": 1.9186860380463407e-05, + "loss": 0.3356, + "step": 6962 + }, + { + "epoch": 0.12928713849169957, + "grad_norm": 0.6104734539985657, + "learning_rate": 1.9186399567492966e-05, + "loss": 0.2372, + "step": 6964 + }, + { + "epoch": 0.1293242686291182, + "grad_norm": 0.24619168043136597, + "learning_rate": 1.9185938629523143e-05, + "loss": 0.2637, + "step": 6966 + }, + { + "epoch": 0.12936139876653682, + "grad_norm": 0.44779258966445923, + "learning_rate": 1.9185477566560208e-05, + "loss": 0.3396, + "step": 6968 + }, + { + "epoch": 0.12939852890395548, + "grad_norm": 0.3527730703353882, + "learning_rate": 1.9185016378610443e-05, + "loss": 0.3316, + "step": 6970 + }, + { + "epoch": 0.1294356590413741, + "grad_norm": 0.5245075821876526, + "learning_rate": 1.918455506568011e-05, + "loss": 0.367, + "step": 6972 + }, + { + "epoch": 0.12947278917879276, + "grad_norm": 0.2701316177845001, + "learning_rate": 1.9184093627775496e-05, + "loss": 0.3796, + "step": 6974 + }, + { + "epoch": 0.1295099193162114, + "grad_norm": 0.3432866036891937, + "learning_rate": 1.918363206490288e-05, + "loss": 0.3815, + "step": 6976 + }, + { + "epoch": 0.12954704945363002, + "grad_norm": 0.2795094847679138, + "learning_rate": 1.918317037706854e-05, + "loss": 0.3651, + "step": 6978 + }, + { + "epoch": 0.12958417959104868, + "grad_norm": 0.25656238198280334, + "learning_rate": 1.9182708564278754e-05, + "loss": 0.2371, + "step": 6980 + }, + { + "epoch": 0.1296213097284673, + "grad_norm": 0.47116556763648987, + "learning_rate": 1.9182246626539812e-05, + "loss": 0.5125, + "step": 6982 + }, + { + "epoch": 0.12965843986588593, + "grad_norm": 0.34104740619659424, + "learning_rate": 1.9181784563857998e-05, + "loss": 0.5172, + "step": 6984 + }, + { + "epoch": 0.1296955700033046, + "grad_norm": 0.32182496786117554, + "learning_rate": 1.9181322376239596e-05, + "loss": 0.3989, + "step": 6986 + }, + { + "epoch": 0.12973270014072322, + "grad_norm": 0.4881784915924072, + "learning_rate": 1.91808600636909e-05, + "loss": 0.3813, + "step": 6988 + }, + { + "epoch": 0.12976983027814185, + "grad_norm": 0.3208380341529846, + "learning_rate": 1.91803976262182e-05, + "loss": 0.3366, + "step": 6990 + }, + { + "epoch": 0.1298069604155605, + "grad_norm": 0.47881579399108887, + "learning_rate": 1.9179935063827783e-05, + "loss": 0.2821, + "step": 6992 + }, + { + "epoch": 0.12984409055297913, + "grad_norm": 0.3987862169742584, + "learning_rate": 1.9179472376525947e-05, + "loss": 0.2486, + "step": 6994 + }, + { + "epoch": 0.1298812206903978, + "grad_norm": 0.3396744132041931, + "learning_rate": 1.9179009564318993e-05, + "loss": 0.2653, + "step": 6996 + }, + { + "epoch": 0.12991835082781641, + "grad_norm": 0.2583022713661194, + "learning_rate": 1.9178546627213205e-05, + "loss": 0.1902, + "step": 6998 + }, + { + "epoch": 0.12995548096523504, + "grad_norm": 0.37820449471473694, + "learning_rate": 1.9178083565214896e-05, + "loss": 0.2452, + "step": 7000 + }, + { + "epoch": 0.1299926111026537, + "grad_norm": 0.3128203749656677, + "learning_rate": 1.9177620378330358e-05, + "loss": 0.4935, + "step": 7002 + }, + { + "epoch": 0.13002974124007233, + "grad_norm": 0.23880182206630707, + "learning_rate": 1.9177157066565903e-05, + "loss": 0.2483, + "step": 7004 + }, + { + "epoch": 0.13006687137749096, + "grad_norm": 0.3715354800224304, + "learning_rate": 1.917669362992782e-05, + "loss": 0.2202, + "step": 7006 + }, + { + "epoch": 0.1301040015149096, + "grad_norm": 0.3140673339366913, + "learning_rate": 1.9176230068422434e-05, + "loss": 0.4123, + "step": 7008 + }, + { + "epoch": 0.13014113165232824, + "grad_norm": 0.28320327401161194, + "learning_rate": 1.9175766382056034e-05, + "loss": 0.2837, + "step": 7010 + }, + { + "epoch": 0.1301782617897469, + "grad_norm": 0.6192690134048462, + "learning_rate": 1.9175302570834942e-05, + "loss": 0.4265, + "step": 7012 + }, + { + "epoch": 0.13021539192716552, + "grad_norm": 0.49688848853111267, + "learning_rate": 1.9174838634765466e-05, + "loss": 0.2787, + "step": 7014 + }, + { + "epoch": 0.13025252206458415, + "grad_norm": 0.3195688724517822, + "learning_rate": 1.9174374573853915e-05, + "loss": 0.2544, + "step": 7016 + }, + { + "epoch": 0.1302896522020028, + "grad_norm": 0.3402513861656189, + "learning_rate": 1.917391038810661e-05, + "loss": 0.402, + "step": 7018 + }, + { + "epoch": 0.13032678233942144, + "grad_norm": 0.31732264161109924, + "learning_rate": 1.9173446077529862e-05, + "loss": 0.1558, + "step": 7020 + }, + { + "epoch": 0.13036391247684007, + "grad_norm": 0.39950212836265564, + "learning_rate": 1.917298164212999e-05, + "loss": 0.3231, + "step": 7022 + }, + { + "epoch": 0.13040104261425872, + "grad_norm": 0.4340563118457794, + "learning_rate": 1.9172517081913317e-05, + "loss": 0.3182, + "step": 7024 + }, + { + "epoch": 0.13043817275167735, + "grad_norm": 0.33968251943588257, + "learning_rate": 1.917205239688616e-05, + "loss": 0.1995, + "step": 7026 + }, + { + "epoch": 0.13047530288909598, + "grad_norm": 0.5604590773582458, + "learning_rate": 1.917158758705484e-05, + "loss": 0.246, + "step": 7028 + }, + { + "epoch": 0.13051243302651463, + "grad_norm": 0.29320746660232544, + "learning_rate": 1.9171122652425688e-05, + "loss": 0.4672, + "step": 7030 + }, + { + "epoch": 0.13054956316393326, + "grad_norm": 0.3579826354980469, + "learning_rate": 1.9170657593005027e-05, + "loss": 0.4568, + "step": 7032 + }, + { + "epoch": 0.13058669330135192, + "grad_norm": 0.2845290005207062, + "learning_rate": 1.9170192408799184e-05, + "loss": 0.213, + "step": 7034 + }, + { + "epoch": 0.13062382343877055, + "grad_norm": 0.4265182912349701, + "learning_rate": 1.9169727099814492e-05, + "loss": 0.3631, + "step": 7036 + }, + { + "epoch": 0.13066095357618918, + "grad_norm": 0.37000492215156555, + "learning_rate": 1.9169261666057283e-05, + "loss": 0.297, + "step": 7038 + }, + { + "epoch": 0.13069808371360783, + "grad_norm": 0.4555400013923645, + "learning_rate": 1.9168796107533883e-05, + "loss": 0.5511, + "step": 7040 + }, + { + "epoch": 0.13073521385102646, + "grad_norm": 0.3653505742549896, + "learning_rate": 1.916833042425063e-05, + "loss": 0.1878, + "step": 7042 + }, + { + "epoch": 0.1307723439884451, + "grad_norm": 0.24913451075553894, + "learning_rate": 1.916786461621387e-05, + "loss": 0.3973, + "step": 7044 + }, + { + "epoch": 0.13080947412586375, + "grad_norm": 0.3549131751060486, + "learning_rate": 1.916739868342993e-05, + "loss": 0.275, + "step": 7046 + }, + { + "epoch": 0.13084660426328237, + "grad_norm": 0.29103508591651917, + "learning_rate": 1.9166932625905152e-05, + "loss": 0.3745, + "step": 7048 + }, + { + "epoch": 0.13088373440070103, + "grad_norm": 0.376519113779068, + "learning_rate": 1.916646644364588e-05, + "loss": 0.3088, + "step": 7050 + }, + { + "epoch": 0.13092086453811966, + "grad_norm": 0.5899032354354858, + "learning_rate": 1.916600013665846e-05, + "loss": 0.3392, + "step": 7052 + }, + { + "epoch": 0.1309579946755383, + "grad_norm": 0.48108527064323425, + "learning_rate": 1.916553370494923e-05, + "loss": 0.2358, + "step": 7054 + }, + { + "epoch": 0.13099512481295694, + "grad_norm": 0.4275568127632141, + "learning_rate": 1.9165067148524537e-05, + "loss": 0.1322, + "step": 7056 + }, + { + "epoch": 0.13103225495037557, + "grad_norm": 0.36434200406074524, + "learning_rate": 1.9164600467390733e-05, + "loss": 0.2768, + "step": 7058 + }, + { + "epoch": 0.1310693850877942, + "grad_norm": 0.34344184398651123, + "learning_rate": 1.9164133661554174e-05, + "loss": 0.5205, + "step": 7060 + }, + { + "epoch": 0.13110651522521286, + "grad_norm": 0.543472170829773, + "learning_rate": 1.9163666731021202e-05, + "loss": 0.2428, + "step": 7062 + }, + { + "epoch": 0.13114364536263148, + "grad_norm": 0.324930876493454, + "learning_rate": 1.9163199675798173e-05, + "loss": 0.4177, + "step": 7064 + }, + { + "epoch": 0.1311807755000501, + "grad_norm": 0.2902285158634186, + "learning_rate": 1.9162732495891447e-05, + "loss": 0.309, + "step": 7066 + }, + { + "epoch": 0.13121790563746877, + "grad_norm": 0.34280896186828613, + "learning_rate": 1.9162265191307377e-05, + "loss": 0.2668, + "step": 7068 + }, + { + "epoch": 0.1312550357748874, + "grad_norm": 0.2957268953323364, + "learning_rate": 1.916179776205232e-05, + "loss": 0.3454, + "step": 7070 + }, + { + "epoch": 0.13129216591230605, + "grad_norm": 0.4251037836074829, + "learning_rate": 1.9161330208132635e-05, + "loss": 0.4571, + "step": 7072 + }, + { + "epoch": 0.13132929604972468, + "grad_norm": 0.3598605692386627, + "learning_rate": 1.9160862529554693e-05, + "loss": 0.4459, + "step": 7074 + }, + { + "epoch": 0.1313664261871433, + "grad_norm": 0.29164984822273254, + "learning_rate": 1.9160394726324847e-05, + "loss": 0.1834, + "step": 7076 + }, + { + "epoch": 0.13140355632456197, + "grad_norm": 0.3958226144313812, + "learning_rate": 1.9159926798449472e-05, + "loss": 0.3798, + "step": 7078 + }, + { + "epoch": 0.1314406864619806, + "grad_norm": 0.359245240688324, + "learning_rate": 1.9159458745934927e-05, + "loss": 0.3254, + "step": 7080 + }, + { + "epoch": 0.13147781659939922, + "grad_norm": 0.3832409381866455, + "learning_rate": 1.9158990568787588e-05, + "loss": 0.4865, + "step": 7082 + }, + { + "epoch": 0.13151494673681788, + "grad_norm": 0.3965170085430145, + "learning_rate": 1.9158522267013818e-05, + "loss": 0.3997, + "step": 7084 + }, + { + "epoch": 0.1315520768742365, + "grad_norm": 0.3076322078704834, + "learning_rate": 1.9158053840619993e-05, + "loss": 0.4112, + "step": 7086 + }, + { + "epoch": 0.13158920701165516, + "grad_norm": 0.27528607845306396, + "learning_rate": 1.915758528961249e-05, + "loss": 0.3343, + "step": 7088 + }, + { + "epoch": 0.1316263371490738, + "grad_norm": 0.3214447498321533, + "learning_rate": 1.915711661399768e-05, + "loss": 0.3168, + "step": 7090 + }, + { + "epoch": 0.13166346728649242, + "grad_norm": 0.3760682940483093, + "learning_rate": 1.9156647813781938e-05, + "loss": 0.3632, + "step": 7092 + }, + { + "epoch": 0.13170059742391108, + "grad_norm": 0.31415367126464844, + "learning_rate": 1.915617888897165e-05, + "loss": 0.3564, + "step": 7094 + }, + { + "epoch": 0.1317377275613297, + "grad_norm": 0.34857919812202454, + "learning_rate": 1.9155709839573194e-05, + "loss": 0.3106, + "step": 7096 + }, + { + "epoch": 0.13177485769874833, + "grad_norm": 0.29351750016212463, + "learning_rate": 1.915524066559295e-05, + "loss": 0.3813, + "step": 7098 + }, + { + "epoch": 0.131811987836167, + "grad_norm": 0.39932531118392944, + "learning_rate": 1.9154771367037305e-05, + "loss": 0.556, + "step": 7100 + }, + { + "epoch": 0.13184911797358562, + "grad_norm": 0.5048282146453857, + "learning_rate": 1.915430194391264e-05, + "loss": 0.2324, + "step": 7102 + }, + { + "epoch": 0.13188624811100425, + "grad_norm": 0.3569514751434326, + "learning_rate": 1.915383239622535e-05, + "loss": 0.7564, + "step": 7104 + }, + { + "epoch": 0.1319233782484229, + "grad_norm": 0.4786880910396576, + "learning_rate": 1.9153362723981816e-05, + "loss": 0.3076, + "step": 7106 + }, + { + "epoch": 0.13196050838584153, + "grad_norm": 0.3879612386226654, + "learning_rate": 1.9152892927188436e-05, + "loss": 0.2427, + "step": 7108 + }, + { + "epoch": 0.13199763852326019, + "grad_norm": 0.21245542168617249, + "learning_rate": 1.9152423005851598e-05, + "loss": 0.3149, + "step": 7110 + }, + { + "epoch": 0.13203476866067881, + "grad_norm": 0.41764116287231445, + "learning_rate": 1.9151952959977697e-05, + "loss": 0.3248, + "step": 7112 + }, + { + "epoch": 0.13207189879809744, + "grad_norm": 0.3896578252315521, + "learning_rate": 1.915148278957313e-05, + "loss": 0.2454, + "step": 7114 + }, + { + "epoch": 0.1321090289355161, + "grad_norm": 0.559226930141449, + "learning_rate": 1.9151012494644296e-05, + "loss": 0.36, + "step": 7116 + }, + { + "epoch": 0.13214615907293473, + "grad_norm": 0.32480138540267944, + "learning_rate": 1.915054207519759e-05, + "loss": 0.3746, + "step": 7118 + }, + { + "epoch": 0.13218328921035336, + "grad_norm": 0.26667365431785583, + "learning_rate": 1.915007153123942e-05, + "loss": 0.2805, + "step": 7120 + }, + { + "epoch": 0.132220419347772, + "grad_norm": 0.5144453048706055, + "learning_rate": 1.914960086277618e-05, + "loss": 0.3858, + "step": 7122 + }, + { + "epoch": 0.13225754948519064, + "grad_norm": 0.2771070897579193, + "learning_rate": 1.9149130069814276e-05, + "loss": 0.4166, + "step": 7124 + }, + { + "epoch": 0.1322946796226093, + "grad_norm": 0.33683663606643677, + "learning_rate": 1.9148659152360122e-05, + "loss": 0.4772, + "step": 7126 + }, + { + "epoch": 0.13233180976002792, + "grad_norm": 0.3622194230556488, + "learning_rate": 1.9148188110420118e-05, + "loss": 0.2595, + "step": 7128 + }, + { + "epoch": 0.13236893989744655, + "grad_norm": 0.3051789700984955, + "learning_rate": 1.9147716944000673e-05, + "loss": 0.3122, + "step": 7130 + }, + { + "epoch": 0.1324060700348652, + "grad_norm": 0.35281604528427124, + "learning_rate": 1.9147245653108205e-05, + "loss": 0.5425, + "step": 7132 + }, + { + "epoch": 0.13244320017228384, + "grad_norm": 0.4206068813800812, + "learning_rate": 1.914677423774912e-05, + "loss": 0.5501, + "step": 7134 + }, + { + "epoch": 0.13248033030970247, + "grad_norm": 0.3616102635860443, + "learning_rate": 1.9146302697929838e-05, + "loss": 0.3116, + "step": 7136 + }, + { + "epoch": 0.13251746044712112, + "grad_norm": 0.263750821352005, + "learning_rate": 1.914583103365677e-05, + "loss": 0.1852, + "step": 7138 + }, + { + "epoch": 0.13255459058453975, + "grad_norm": 0.6301262378692627, + "learning_rate": 1.914535924493634e-05, + "loss": 0.2776, + "step": 7140 + }, + { + "epoch": 0.13259172072195838, + "grad_norm": 0.40337684750556946, + "learning_rate": 1.9144887331774963e-05, + "loss": 0.544, + "step": 7142 + }, + { + "epoch": 0.13262885085937703, + "grad_norm": 0.3808266520500183, + "learning_rate": 1.914441529417906e-05, + "loss": 0.3192, + "step": 7144 + }, + { + "epoch": 0.13266598099679566, + "grad_norm": 0.40872445702552795, + "learning_rate": 1.9143943132155055e-05, + "loss": 0.2908, + "step": 7146 + }, + { + "epoch": 0.13270311113421432, + "grad_norm": 0.49685102701187134, + "learning_rate": 1.9143470845709375e-05, + "loss": 0.5106, + "step": 7148 + }, + { + "epoch": 0.13274024127163295, + "grad_norm": 0.3251996338367462, + "learning_rate": 1.9142998434848447e-05, + "loss": 0.2078, + "step": 7150 + }, + { + "epoch": 0.13277737140905158, + "grad_norm": 0.2899598777294159, + "learning_rate": 1.9142525899578694e-05, + "loss": 0.3076, + "step": 7152 + }, + { + "epoch": 0.13281450154647023, + "grad_norm": 0.39834997057914734, + "learning_rate": 1.9142053239906547e-05, + "loss": 0.2989, + "step": 7154 + }, + { + "epoch": 0.13285163168388886, + "grad_norm": 0.4305490255355835, + "learning_rate": 1.914158045583844e-05, + "loss": 0.4965, + "step": 7156 + }, + { + "epoch": 0.1328887618213075, + "grad_norm": 0.3186148703098297, + "learning_rate": 1.9141107547380806e-05, + "loss": 0.2909, + "step": 7158 + }, + { + "epoch": 0.13292589195872614, + "grad_norm": 0.28367260098457336, + "learning_rate": 1.9140634514540078e-05, + "loss": 0.321, + "step": 7160 + }, + { + "epoch": 0.13296302209614477, + "grad_norm": 0.3300277590751648, + "learning_rate": 1.9140161357322695e-05, + "loss": 0.189, + "step": 7162 + }, + { + "epoch": 0.13300015223356343, + "grad_norm": 0.364374577999115, + "learning_rate": 1.9139688075735092e-05, + "loss": 0.0786, + "step": 7164 + }, + { + "epoch": 0.13303728237098206, + "grad_norm": 0.32289332151412964, + "learning_rate": 1.9139214669783713e-05, + "loss": 0.2853, + "step": 7166 + }, + { + "epoch": 0.13307441250840069, + "grad_norm": 0.4499182403087616, + "learning_rate": 1.9138741139474995e-05, + "loss": 0.3471, + "step": 7168 + }, + { + "epoch": 0.13311154264581934, + "grad_norm": 0.29096126556396484, + "learning_rate": 1.9138267484815387e-05, + "loss": 0.4668, + "step": 7170 + }, + { + "epoch": 0.13314867278323797, + "grad_norm": 0.5043051242828369, + "learning_rate": 1.913779370581133e-05, + "loss": 0.2469, + "step": 7172 + }, + { + "epoch": 0.1331858029206566, + "grad_norm": 0.3818332552909851, + "learning_rate": 1.913731980246927e-05, + "loss": 0.3246, + "step": 7174 + }, + { + "epoch": 0.13322293305807525, + "grad_norm": 0.22566120326519012, + "learning_rate": 1.913684577479566e-05, + "loss": 0.2937, + "step": 7176 + }, + { + "epoch": 0.13326006319549388, + "grad_norm": 0.45355477929115295, + "learning_rate": 1.9136371622796942e-05, + "loss": 0.2392, + "step": 7178 + }, + { + "epoch": 0.1332971933329125, + "grad_norm": 0.3846004605293274, + "learning_rate": 1.913589734647958e-05, + "loss": 0.3306, + "step": 7180 + }, + { + "epoch": 0.13333432347033117, + "grad_norm": 0.37682563066482544, + "learning_rate": 1.913542294585001e-05, + "loss": 0.4291, + "step": 7182 + }, + { + "epoch": 0.1333714536077498, + "grad_norm": 0.46033042669296265, + "learning_rate": 1.9134948420914704e-05, + "loss": 0.2161, + "step": 7184 + }, + { + "epoch": 0.13340858374516845, + "grad_norm": 0.3213376998901367, + "learning_rate": 1.9134473771680114e-05, + "loss": 0.4398, + "step": 7186 + }, + { + "epoch": 0.13344571388258708, + "grad_norm": 0.3502066135406494, + "learning_rate": 1.9133998998152693e-05, + "loss": 0.2088, + "step": 7188 + }, + { + "epoch": 0.1334828440200057, + "grad_norm": 0.41070929169654846, + "learning_rate": 1.9133524100338908e-05, + "loss": 0.4393, + "step": 7190 + }, + { + "epoch": 0.13351997415742436, + "grad_norm": 0.38879847526550293, + "learning_rate": 1.9133049078245216e-05, + "loss": 0.1837, + "step": 7192 + }, + { + "epoch": 0.133557104294843, + "grad_norm": 0.3301045298576355, + "learning_rate": 1.9132573931878083e-05, + "loss": 0.439, + "step": 7194 + }, + { + "epoch": 0.13359423443226162, + "grad_norm": 0.27724719047546387, + "learning_rate": 1.9132098661243975e-05, + "loss": 0.2628, + "step": 7196 + }, + { + "epoch": 0.13363136456968028, + "grad_norm": 0.41109728813171387, + "learning_rate": 1.9131623266349356e-05, + "loss": 0.3366, + "step": 7198 + }, + { + "epoch": 0.1336684947070989, + "grad_norm": 0.3013942837715149, + "learning_rate": 1.9131147747200698e-05, + "loss": 0.2323, + "step": 7200 + }, + { + "epoch": 0.13370562484451756, + "grad_norm": 0.2770337164402008, + "learning_rate": 1.913067210380447e-05, + "loss": 0.4331, + "step": 7202 + }, + { + "epoch": 0.1337427549819362, + "grad_norm": 0.34681665897369385, + "learning_rate": 1.9130196336167147e-05, + "loss": 0.3626, + "step": 7204 + }, + { + "epoch": 0.13377988511935482, + "grad_norm": 0.3675478994846344, + "learning_rate": 1.9129720444295197e-05, + "loss": 0.383, + "step": 7206 + }, + { + "epoch": 0.13381701525677347, + "grad_norm": 0.23802945017814636, + "learning_rate": 1.9129244428195097e-05, + "loss": 0.2759, + "step": 7208 + }, + { + "epoch": 0.1338541453941921, + "grad_norm": 0.4891115427017212, + "learning_rate": 1.912876828787333e-05, + "loss": 0.2184, + "step": 7210 + }, + { + "epoch": 0.13389127553161073, + "grad_norm": 0.3260249197483063, + "learning_rate": 1.9128292023336367e-05, + "loss": 0.4242, + "step": 7212 + }, + { + "epoch": 0.1339284056690294, + "grad_norm": 0.5251856446266174, + "learning_rate": 1.9127815634590692e-05, + "loss": 0.3437, + "step": 7214 + }, + { + "epoch": 0.13396553580644802, + "grad_norm": 0.2991001605987549, + "learning_rate": 1.9127339121642787e-05, + "loss": 0.4452, + "step": 7216 + }, + { + "epoch": 0.13400266594386664, + "grad_norm": 0.3695489168167114, + "learning_rate": 1.9126862484499137e-05, + "loss": 0.4247, + "step": 7218 + }, + { + "epoch": 0.1340397960812853, + "grad_norm": 0.2977232038974762, + "learning_rate": 1.9126385723166226e-05, + "loss": 0.656, + "step": 7220 + }, + { + "epoch": 0.13407692621870393, + "grad_norm": 0.5024280548095703, + "learning_rate": 1.9125908837650544e-05, + "loss": 0.2328, + "step": 7222 + }, + { + "epoch": 0.13411405635612259, + "grad_norm": 0.4551043212413788, + "learning_rate": 1.9125431827958575e-05, + "loss": 0.2692, + "step": 7224 + }, + { + "epoch": 0.1341511864935412, + "grad_norm": 0.285351037979126, + "learning_rate": 1.9124954694096817e-05, + "loss": 0.2059, + "step": 7226 + }, + { + "epoch": 0.13418831663095984, + "grad_norm": 0.2855362892150879, + "learning_rate": 1.9124477436071755e-05, + "loss": 0.4807, + "step": 7228 + }, + { + "epoch": 0.1342254467683785, + "grad_norm": 0.5678279399871826, + "learning_rate": 1.9124000053889885e-05, + "loss": 0.335, + "step": 7230 + }, + { + "epoch": 0.13426257690579713, + "grad_norm": 0.39154475927352905, + "learning_rate": 1.9123522547557707e-05, + "loss": 0.3555, + "step": 7232 + }, + { + "epoch": 0.13429970704321575, + "grad_norm": 0.31161314249038696, + "learning_rate": 1.9123044917081715e-05, + "loss": 0.358, + "step": 7234 + }, + { + "epoch": 0.1343368371806344, + "grad_norm": 0.35746318101882935, + "learning_rate": 1.9122567162468406e-05, + "loss": 0.2357, + "step": 7236 + }, + { + "epoch": 0.13437396731805304, + "grad_norm": 0.36686453223228455, + "learning_rate": 1.9122089283724285e-05, + "loss": 0.2386, + "step": 7238 + }, + { + "epoch": 0.1344110974554717, + "grad_norm": 0.46335890889167786, + "learning_rate": 1.912161128085585e-05, + "loss": 0.2572, + "step": 7240 + }, + { + "epoch": 0.13444822759289032, + "grad_norm": 0.29942893981933594, + "learning_rate": 1.912113315386961e-05, + "loss": 0.2825, + "step": 7242 + }, + { + "epoch": 0.13448535773030895, + "grad_norm": 0.5759248733520508, + "learning_rate": 1.9120654902772068e-05, + "loss": 0.3768, + "step": 7244 + }, + { + "epoch": 0.1345224878677276, + "grad_norm": 0.5424313545227051, + "learning_rate": 1.9120176527569733e-05, + "loss": 0.3264, + "step": 7246 + }, + { + "epoch": 0.13455961800514624, + "grad_norm": 0.35377129912376404, + "learning_rate": 1.9119698028269115e-05, + "loss": 0.4773, + "step": 7248 + }, + { + "epoch": 0.13459674814256486, + "grad_norm": 0.2863421142101288, + "learning_rate": 1.9119219404876718e-05, + "loss": 0.1956, + "step": 7250 + }, + { + "epoch": 0.13463387827998352, + "grad_norm": 0.3096824884414673, + "learning_rate": 1.9118740657399065e-05, + "loss": 0.2662, + "step": 7252 + }, + { + "epoch": 0.13467100841740215, + "grad_norm": 0.391652375459671, + "learning_rate": 1.9118261785842667e-05, + "loss": 0.2517, + "step": 7254 + }, + { + "epoch": 0.13470813855482078, + "grad_norm": 0.3129667043685913, + "learning_rate": 1.9117782790214034e-05, + "loss": 0.3747, + "step": 7256 + }, + { + "epoch": 0.13474526869223943, + "grad_norm": 0.3863779306411743, + "learning_rate": 1.911730367051969e-05, + "loss": 0.2615, + "step": 7258 + }, + { + "epoch": 0.13478239882965806, + "grad_norm": 0.30290815234184265, + "learning_rate": 1.911682442676615e-05, + "loss": 0.3589, + "step": 7260 + }, + { + "epoch": 0.13481952896707672, + "grad_norm": 0.41937631368637085, + "learning_rate": 1.9116345058959942e-05, + "loss": 0.32, + "step": 7262 + }, + { + "epoch": 0.13485665910449535, + "grad_norm": 0.48997122049331665, + "learning_rate": 1.9115865567107585e-05, + "loss": 0.2641, + "step": 7264 + }, + { + "epoch": 0.13489378924191398, + "grad_norm": 0.33411481976509094, + "learning_rate": 1.91153859512156e-05, + "loss": 0.4147, + "step": 7266 + }, + { + "epoch": 0.13493091937933263, + "grad_norm": 0.3630472719669342, + "learning_rate": 1.9114906211290514e-05, + "loss": 0.4086, + "step": 7268 + }, + { + "epoch": 0.13496804951675126, + "grad_norm": 0.30900058150291443, + "learning_rate": 1.911442634733886e-05, + "loss": 0.2582, + "step": 7270 + }, + { + "epoch": 0.1350051796541699, + "grad_norm": 0.4592708349227905, + "learning_rate": 1.911394635936716e-05, + "loss": 0.4602, + "step": 7272 + }, + { + "epoch": 0.13504230979158854, + "grad_norm": 0.26999205350875854, + "learning_rate": 1.9113466247381955e-05, + "loss": 0.3738, + "step": 7274 + }, + { + "epoch": 0.13507943992900717, + "grad_norm": 0.2519771456718445, + "learning_rate": 1.911298601138977e-05, + "loss": 0.3134, + "step": 7276 + }, + { + "epoch": 0.13511657006642583, + "grad_norm": 0.31244611740112305, + "learning_rate": 1.9112505651397143e-05, + "loss": 0.4766, + "step": 7278 + }, + { + "epoch": 0.13515370020384446, + "grad_norm": 0.3162122666835785, + "learning_rate": 1.9112025167410605e-05, + "loss": 0.3996, + "step": 7280 + }, + { + "epoch": 0.13519083034126309, + "grad_norm": 0.3427612781524658, + "learning_rate": 1.91115445594367e-05, + "loss": 0.4997, + "step": 7282 + }, + { + "epoch": 0.13522796047868174, + "grad_norm": 0.30499011278152466, + "learning_rate": 1.911106382748197e-05, + "loss": 0.204, + "step": 7284 + }, + { + "epoch": 0.13526509061610037, + "grad_norm": 0.33350473642349243, + "learning_rate": 1.911058297155295e-05, + "loss": 0.2868, + "step": 7286 + }, + { + "epoch": 0.135302220753519, + "grad_norm": 0.3825935125350952, + "learning_rate": 1.9110101991656184e-05, + "loss": 0.3556, + "step": 7288 + }, + { + "epoch": 0.13533935089093765, + "grad_norm": 0.38847166299819946, + "learning_rate": 1.910962088779822e-05, + "loss": 0.2625, + "step": 7290 + }, + { + "epoch": 0.13537648102835628, + "grad_norm": 0.3060508668422699, + "learning_rate": 1.91091396599856e-05, + "loss": 0.1712, + "step": 7292 + }, + { + "epoch": 0.1354136111657749, + "grad_norm": 0.3461926281452179, + "learning_rate": 1.910865830822487e-05, + "loss": 0.396, + "step": 7294 + }, + { + "epoch": 0.13545074130319357, + "grad_norm": 0.2541234493255615, + "learning_rate": 1.910817683252259e-05, + "loss": 0.1824, + "step": 7296 + }, + { + "epoch": 0.1354878714406122, + "grad_norm": 0.3683461844921112, + "learning_rate": 1.9107695232885305e-05, + "loss": 0.3257, + "step": 7298 + }, + { + "epoch": 0.13552500157803085, + "grad_norm": 0.29616573452949524, + "learning_rate": 1.9107213509319567e-05, + "loss": 0.1357, + "step": 7300 + }, + { + "epoch": 0.13556213171544948, + "grad_norm": 0.23538632690906525, + "learning_rate": 1.9106731661831934e-05, + "loss": 0.3382, + "step": 7302 + }, + { + "epoch": 0.1355992618528681, + "grad_norm": 0.33959728479385376, + "learning_rate": 1.910624969042896e-05, + "loss": 0.2681, + "step": 7304 + }, + { + "epoch": 0.13563639199028676, + "grad_norm": 0.5154351592063904, + "learning_rate": 1.9105767595117203e-05, + "loss": 0.3966, + "step": 7306 + }, + { + "epoch": 0.1356735221277054, + "grad_norm": 0.3378819227218628, + "learning_rate": 1.9105285375903224e-05, + "loss": 0.1423, + "step": 7308 + }, + { + "epoch": 0.13571065226512402, + "grad_norm": 0.4130324125289917, + "learning_rate": 1.9104803032793587e-05, + "loss": 0.6339, + "step": 7310 + }, + { + "epoch": 0.13574778240254268, + "grad_norm": 0.2770974934101105, + "learning_rate": 1.910432056579485e-05, + "loss": 0.2532, + "step": 7312 + }, + { + "epoch": 0.1357849125399613, + "grad_norm": 0.36908838152885437, + "learning_rate": 1.9103837974913583e-05, + "loss": 0.3783, + "step": 7314 + }, + { + "epoch": 0.13582204267737996, + "grad_norm": 0.2566445767879486, + "learning_rate": 1.9103355260156352e-05, + "loss": 0.3144, + "step": 7316 + }, + { + "epoch": 0.1358591728147986, + "grad_norm": 0.34032490849494934, + "learning_rate": 1.910287242152972e-05, + "loss": 0.2979, + "step": 7318 + }, + { + "epoch": 0.13589630295221722, + "grad_norm": 0.25391313433647156, + "learning_rate": 1.910238945904026e-05, + "loss": 0.4646, + "step": 7320 + }, + { + "epoch": 0.13593343308963587, + "grad_norm": 0.25708311796188354, + "learning_rate": 1.9101906372694548e-05, + "loss": 0.2783, + "step": 7322 + }, + { + "epoch": 0.1359705632270545, + "grad_norm": 0.27837061882019043, + "learning_rate": 1.910142316249915e-05, + "loss": 0.2793, + "step": 7324 + }, + { + "epoch": 0.13600769336447313, + "grad_norm": 0.29085224866867065, + "learning_rate": 1.9100939828460644e-05, + "loss": 0.3763, + "step": 7326 + }, + { + "epoch": 0.1360448235018918, + "grad_norm": 0.38636770844459534, + "learning_rate": 1.910045637058561e-05, + "loss": 0.4149, + "step": 7328 + }, + { + "epoch": 0.13608195363931042, + "grad_norm": 0.3627934753894806, + "learning_rate": 1.9099972788880622e-05, + "loss": 0.2178, + "step": 7330 + }, + { + "epoch": 0.13611908377672904, + "grad_norm": 0.6746304631233215, + "learning_rate": 1.9099489083352266e-05, + "loss": 0.4325, + "step": 7332 + }, + { + "epoch": 0.1361562139141477, + "grad_norm": 0.5423709154129028, + "learning_rate": 1.9099005254007117e-05, + "loss": 0.6612, + "step": 7334 + }, + { + "epoch": 0.13619334405156633, + "grad_norm": 0.32637229561805725, + "learning_rate": 1.909852130085176e-05, + "loss": 0.37, + "step": 7336 + }, + { + "epoch": 0.13623047418898498, + "grad_norm": 0.35113540291786194, + "learning_rate": 1.9098037223892784e-05, + "loss": 0.2181, + "step": 7338 + }, + { + "epoch": 0.1362676043264036, + "grad_norm": 0.3909337818622589, + "learning_rate": 1.909755302313677e-05, + "loss": 0.2177, + "step": 7340 + }, + { + "epoch": 0.13630473446382224, + "grad_norm": 0.3278919458389282, + "learning_rate": 1.909706869859031e-05, + "loss": 0.1888, + "step": 7342 + }, + { + "epoch": 0.1363418646012409, + "grad_norm": 0.339663565158844, + "learning_rate": 1.9096584250259996e-05, + "loss": 0.3061, + "step": 7344 + }, + { + "epoch": 0.13637899473865953, + "grad_norm": 0.3417191505432129, + "learning_rate": 1.909609967815242e-05, + "loss": 0.2961, + "step": 7346 + }, + { + "epoch": 0.13641612487607815, + "grad_norm": 0.4168943464756012, + "learning_rate": 1.909561498227417e-05, + "loss": 0.3892, + "step": 7348 + }, + { + "epoch": 0.1364532550134968, + "grad_norm": 0.45449140667915344, + "learning_rate": 1.9095130162631842e-05, + "loss": 0.5291, + "step": 7350 + }, + { + "epoch": 0.13649038515091544, + "grad_norm": 0.7133435010910034, + "learning_rate": 1.909464521923204e-05, + "loss": 0.3539, + "step": 7352 + }, + { + "epoch": 0.1365275152883341, + "grad_norm": 0.36785033345222473, + "learning_rate": 1.9094160152081355e-05, + "loss": 0.2265, + "step": 7354 + }, + { + "epoch": 0.13656464542575272, + "grad_norm": 0.44135814905166626, + "learning_rate": 1.9093674961186394e-05, + "loss": 0.397, + "step": 7356 + }, + { + "epoch": 0.13660177556317135, + "grad_norm": 0.34029603004455566, + "learning_rate": 1.9093189646553752e-05, + "loss": 0.2474, + "step": 7358 + }, + { + "epoch": 0.13663890570059, + "grad_norm": 0.5090249180793762, + "learning_rate": 1.9092704208190038e-05, + "loss": 0.3186, + "step": 7360 + }, + { + "epoch": 0.13667603583800864, + "grad_norm": 0.24662551283836365, + "learning_rate": 1.9092218646101854e-05, + "loss": 0.2738, + "step": 7362 + }, + { + "epoch": 0.13671316597542726, + "grad_norm": 0.3393016755580902, + "learning_rate": 1.909173296029581e-05, + "loss": 0.5807, + "step": 7364 + }, + { + "epoch": 0.13675029611284592, + "grad_norm": 0.36386582255363464, + "learning_rate": 1.9091247150778515e-05, + "loss": 0.2103, + "step": 7366 + }, + { + "epoch": 0.13678742625026455, + "grad_norm": 0.3460341691970825, + "learning_rate": 1.9090761217556574e-05, + "loss": 0.346, + "step": 7368 + }, + { + "epoch": 0.13682455638768318, + "grad_norm": 0.2915242910385132, + "learning_rate": 1.9090275160636608e-05, + "loss": 0.2753, + "step": 7370 + }, + { + "epoch": 0.13686168652510183, + "grad_norm": 0.3440723717212677, + "learning_rate": 1.908978898002522e-05, + "loss": 0.3107, + "step": 7372 + }, + { + "epoch": 0.13689881666252046, + "grad_norm": 0.29323211312294006, + "learning_rate": 1.9089302675729034e-05, + "loss": 0.464, + "step": 7374 + }, + { + "epoch": 0.13693594679993912, + "grad_norm": 0.3394409418106079, + "learning_rate": 1.9088816247754663e-05, + "loss": 0.1878, + "step": 7376 + }, + { + "epoch": 0.13697307693735775, + "grad_norm": 0.4465654194355011, + "learning_rate": 1.908832969610873e-05, + "loss": 0.3614, + "step": 7378 + }, + { + "epoch": 0.13701020707477637, + "grad_norm": 0.41250550746917725, + "learning_rate": 1.908784302079785e-05, + "loss": 0.5246, + "step": 7380 + }, + { + "epoch": 0.13704733721219503, + "grad_norm": 0.3276500105857849, + "learning_rate": 1.9087356221828645e-05, + "loss": 0.2576, + "step": 7382 + }, + { + "epoch": 0.13708446734961366, + "grad_norm": 0.23176829516887665, + "learning_rate": 1.9086869299207745e-05, + "loss": 0.2113, + "step": 7384 + }, + { + "epoch": 0.1371215974870323, + "grad_norm": 0.48694881796836853, + "learning_rate": 1.908638225294177e-05, + "loss": 0.3808, + "step": 7386 + }, + { + "epoch": 0.13715872762445094, + "grad_norm": 0.3344497084617615, + "learning_rate": 1.9085895083037353e-05, + "loss": 0.4387, + "step": 7388 + }, + { + "epoch": 0.13719585776186957, + "grad_norm": 0.42727771401405334, + "learning_rate": 1.9085407789501115e-05, + "loss": 0.4894, + "step": 7390 + }, + { + "epoch": 0.13723298789928823, + "grad_norm": 0.42960286140441895, + "learning_rate": 1.9084920372339697e-05, + "loss": 0.51, + "step": 7392 + }, + { + "epoch": 0.13727011803670686, + "grad_norm": 0.3123345971107483, + "learning_rate": 1.908443283155972e-05, + "loss": 0.2165, + "step": 7394 + }, + { + "epoch": 0.13730724817412548, + "grad_norm": 0.37721654772758484, + "learning_rate": 1.9083945167167822e-05, + "loss": 0.4204, + "step": 7396 + }, + { + "epoch": 0.13734437831154414, + "grad_norm": 0.4325771927833557, + "learning_rate": 1.9083457379170647e-05, + "loss": 0.3588, + "step": 7398 + }, + { + "epoch": 0.13738150844896277, + "grad_norm": 0.45864030718803406, + "learning_rate": 1.908296946757482e-05, + "loss": 0.4754, + "step": 7400 + }, + { + "epoch": 0.1374186385863814, + "grad_norm": 0.29926881194114685, + "learning_rate": 1.9082481432386982e-05, + "loss": 0.2923, + "step": 7402 + }, + { + "epoch": 0.13745576872380005, + "grad_norm": 0.4547715485095978, + "learning_rate": 1.9081993273613782e-05, + "loss": 0.3827, + "step": 7404 + }, + { + "epoch": 0.13749289886121868, + "grad_norm": 0.2662661075592041, + "learning_rate": 1.908150499126186e-05, + "loss": 0.3679, + "step": 7406 + }, + { + "epoch": 0.1375300289986373, + "grad_norm": 0.2919824719429016, + "learning_rate": 1.9081016585337852e-05, + "loss": 0.2957, + "step": 7408 + }, + { + "epoch": 0.13756715913605597, + "grad_norm": 0.3491005301475525, + "learning_rate": 1.9080528055848407e-05, + "loss": 0.3699, + "step": 7410 + }, + { + "epoch": 0.1376042892734746, + "grad_norm": 0.407593309879303, + "learning_rate": 1.908003940280018e-05, + "loss": 0.2733, + "step": 7412 + }, + { + "epoch": 0.13764141941089325, + "grad_norm": 0.3127366602420807, + "learning_rate": 1.9079550626199812e-05, + "loss": 0.2969, + "step": 7414 + }, + { + "epoch": 0.13767854954831188, + "grad_norm": 0.27980494499206543, + "learning_rate": 1.9079061726053957e-05, + "loss": 0.1842, + "step": 7416 + }, + { + "epoch": 0.1377156796857305, + "grad_norm": 0.26316460967063904, + "learning_rate": 1.9078572702369268e-05, + "loss": 0.1766, + "step": 7418 + }, + { + "epoch": 0.13775280982314916, + "grad_norm": 0.4010971784591675, + "learning_rate": 1.9078083555152397e-05, + "loss": 0.3542, + "step": 7420 + }, + { + "epoch": 0.1377899399605678, + "grad_norm": 0.38183778524398804, + "learning_rate": 1.9077594284410003e-05, + "loss": 0.3271, + "step": 7422 + }, + { + "epoch": 0.13782707009798642, + "grad_norm": 0.32891348004341125, + "learning_rate": 1.9077104890148735e-05, + "loss": 0.2374, + "step": 7424 + }, + { + "epoch": 0.13786420023540508, + "grad_norm": 0.2728039622306824, + "learning_rate": 1.907661537237526e-05, + "loss": 0.3228, + "step": 7426 + }, + { + "epoch": 0.1379013303728237, + "grad_norm": 0.3456726372241974, + "learning_rate": 1.907612573109624e-05, + "loss": 0.316, + "step": 7428 + }, + { + "epoch": 0.13793846051024236, + "grad_norm": 0.335875540971756, + "learning_rate": 1.9075635966318337e-05, + "loss": 0.2688, + "step": 7430 + }, + { + "epoch": 0.137975590647661, + "grad_norm": 0.3106536865234375, + "learning_rate": 1.9075146078048213e-05, + "loss": 0.3185, + "step": 7432 + }, + { + "epoch": 0.13801272078507962, + "grad_norm": 0.3902144134044647, + "learning_rate": 1.907465606629253e-05, + "loss": 0.345, + "step": 7434 + }, + { + "epoch": 0.13804985092249827, + "grad_norm": 0.29771289229393005, + "learning_rate": 1.907416593105796e-05, + "loss": 0.257, + "step": 7436 + }, + { + "epoch": 0.1380869810599169, + "grad_norm": 0.2500526010990143, + "learning_rate": 1.9073675672351174e-05, + "loss": 0.2903, + "step": 7438 + }, + { + "epoch": 0.13812411119733553, + "grad_norm": 0.3253353238105774, + "learning_rate": 1.9073185290178845e-05, + "loss": 0.2841, + "step": 7440 + }, + { + "epoch": 0.1381612413347542, + "grad_norm": 0.3276876211166382, + "learning_rate": 1.9072694784547633e-05, + "loss": 0.3302, + "step": 7442 + }, + { + "epoch": 0.13819837147217282, + "grad_norm": 0.5529167056083679, + "learning_rate": 1.907220415546423e-05, + "loss": 0.2302, + "step": 7444 + }, + { + "epoch": 0.13823550160959144, + "grad_norm": 0.46558377146720886, + "learning_rate": 1.9071713402935295e-05, + "loss": 0.3398, + "step": 7446 + }, + { + "epoch": 0.1382726317470101, + "grad_norm": 0.4772559404373169, + "learning_rate": 1.9071222526967516e-05, + "loss": 0.2164, + "step": 7448 + }, + { + "epoch": 0.13830976188442873, + "grad_norm": 0.3378334641456604, + "learning_rate": 1.907073152756757e-05, + "loss": 0.3474, + "step": 7450 + }, + { + "epoch": 0.13834689202184738, + "grad_norm": 0.5119785070419312, + "learning_rate": 1.9070240404742136e-05, + "loss": 0.4383, + "step": 7452 + }, + { + "epoch": 0.138384022159266, + "grad_norm": 0.3765888810157776, + "learning_rate": 1.9069749158497905e-05, + "loss": 0.353, + "step": 7454 + }, + { + "epoch": 0.13842115229668464, + "grad_norm": 0.40379026532173157, + "learning_rate": 1.906925778884155e-05, + "loss": 0.3834, + "step": 7456 + }, + { + "epoch": 0.1384582824341033, + "grad_norm": 0.4134158790111542, + "learning_rate": 1.906876629577976e-05, + "loss": 0.1882, + "step": 7458 + }, + { + "epoch": 0.13849541257152193, + "grad_norm": 0.29162201285362244, + "learning_rate": 1.9068274679319228e-05, + "loss": 0.2168, + "step": 7460 + }, + { + "epoch": 0.13853254270894055, + "grad_norm": 0.4610622227191925, + "learning_rate": 1.9067782939466642e-05, + "loss": 0.38, + "step": 7462 + }, + { + "epoch": 0.1385696728463592, + "grad_norm": 0.3794720768928528, + "learning_rate": 1.9067291076228688e-05, + "loss": 0.3199, + "step": 7464 + }, + { + "epoch": 0.13860680298377784, + "grad_norm": 0.28741660714149475, + "learning_rate": 1.9066799089612063e-05, + "loss": 0.4323, + "step": 7466 + }, + { + "epoch": 0.1386439331211965, + "grad_norm": 0.2328682839870453, + "learning_rate": 1.906630697962346e-05, + "loss": 0.2428, + "step": 7468 + }, + { + "epoch": 0.13868106325861512, + "grad_norm": 0.3834840655326843, + "learning_rate": 1.9065814746269577e-05, + "loss": 0.3697, + "step": 7470 + }, + { + "epoch": 0.13871819339603375, + "grad_norm": 0.46875759959220886, + "learning_rate": 1.906532238955711e-05, + "loss": 0.3185, + "step": 7472 + }, + { + "epoch": 0.1387553235334524, + "grad_norm": 0.505338191986084, + "learning_rate": 1.906482990949276e-05, + "loss": 0.5381, + "step": 7474 + }, + { + "epoch": 0.13879245367087104, + "grad_norm": 0.44858452677726746, + "learning_rate": 1.9064337306083227e-05, + "loss": 0.377, + "step": 7476 + }, + { + "epoch": 0.13882958380828966, + "grad_norm": 0.2833554148674011, + "learning_rate": 1.906384457933521e-05, + "loss": 0.4701, + "step": 7478 + }, + { + "epoch": 0.13886671394570832, + "grad_norm": 0.32994261384010315, + "learning_rate": 1.9063351729255425e-05, + "loss": 0.3555, + "step": 7480 + }, + { + "epoch": 0.13890384408312695, + "grad_norm": 0.3297823965549469, + "learning_rate": 1.9062858755850567e-05, + "loss": 0.3037, + "step": 7482 + }, + { + "epoch": 0.13894097422054558, + "grad_norm": 0.4257313013076782, + "learning_rate": 1.9062365659127348e-05, + "loss": 0.1939, + "step": 7484 + }, + { + "epoch": 0.13897810435796423, + "grad_norm": 0.21694734692573547, + "learning_rate": 1.9061872439092476e-05, + "loss": 0.306, + "step": 7486 + }, + { + "epoch": 0.13901523449538286, + "grad_norm": 0.43100449442863464, + "learning_rate": 1.9061379095752666e-05, + "loss": 0.5689, + "step": 7488 + }, + { + "epoch": 0.13905236463280152, + "grad_norm": 0.2504644989967346, + "learning_rate": 1.9060885629114624e-05, + "loss": 0.3323, + "step": 7490 + }, + { + "epoch": 0.13908949477022015, + "grad_norm": 0.24499230086803436, + "learning_rate": 1.9060392039185075e-05, + "loss": 0.3681, + "step": 7492 + }, + { + "epoch": 0.13912662490763877, + "grad_norm": 0.22481974959373474, + "learning_rate": 1.9059898325970724e-05, + "loss": 0.2706, + "step": 7494 + }, + { + "epoch": 0.13916375504505743, + "grad_norm": 0.3223819434642792, + "learning_rate": 1.90594044894783e-05, + "loss": 0.3544, + "step": 7496 + }, + { + "epoch": 0.13920088518247606, + "grad_norm": 0.4236181676387787, + "learning_rate": 1.9058910529714512e-05, + "loss": 0.2414, + "step": 7498 + }, + { + "epoch": 0.1392380153198947, + "grad_norm": 0.27202969789505005, + "learning_rate": 1.9058416446686088e-05, + "loss": 0.2601, + "step": 7500 + }, + { + "epoch": 0.13927514545731334, + "grad_norm": 0.3573920428752899, + "learning_rate": 1.9057922240399747e-05, + "loss": 0.337, + "step": 7502 + }, + { + "epoch": 0.13931227559473197, + "grad_norm": 0.39434346556663513, + "learning_rate": 1.905742791086222e-05, + "loss": 0.7151, + "step": 7504 + }, + { + "epoch": 0.13934940573215063, + "grad_norm": 0.4281597435474396, + "learning_rate": 1.9056933458080226e-05, + "loss": 0.2846, + "step": 7506 + }, + { + "epoch": 0.13938653586956926, + "grad_norm": 0.4525904655456543, + "learning_rate": 1.90564388820605e-05, + "loss": 0.2701, + "step": 7508 + }, + { + "epoch": 0.13942366600698788, + "grad_norm": 0.35705578327178955, + "learning_rate": 1.9055944182809763e-05, + "loss": 0.4215, + "step": 7510 + }, + { + "epoch": 0.13946079614440654, + "grad_norm": 0.20357581973075867, + "learning_rate": 1.9055449360334756e-05, + "loss": 0.4587, + "step": 7512 + }, + { + "epoch": 0.13949792628182517, + "grad_norm": 0.38579848408699036, + "learning_rate": 1.9054954414642205e-05, + "loss": 0.4127, + "step": 7514 + }, + { + "epoch": 0.1395350564192438, + "grad_norm": 0.34190812706947327, + "learning_rate": 1.905445934573885e-05, + "loss": 0.3555, + "step": 7516 + }, + { + "epoch": 0.13957218655666245, + "grad_norm": 0.2952379286289215, + "learning_rate": 1.905396415363142e-05, + "loss": 0.2554, + "step": 7518 + }, + { + "epoch": 0.13960931669408108, + "grad_norm": 0.37091872096061707, + "learning_rate": 1.905346883832666e-05, + "loss": 0.3732, + "step": 7520 + }, + { + "epoch": 0.1396464468314997, + "grad_norm": 0.2575719654560089, + "learning_rate": 1.9052973399831306e-05, + "loss": 0.4769, + "step": 7522 + }, + { + "epoch": 0.13968357696891837, + "grad_norm": 0.31771451234817505, + "learning_rate": 1.9052477838152103e-05, + "loss": 0.2886, + "step": 7524 + }, + { + "epoch": 0.139720707106337, + "grad_norm": 0.6454352140426636, + "learning_rate": 1.9051982153295793e-05, + "loss": 0.3109, + "step": 7526 + }, + { + "epoch": 0.13975783724375565, + "grad_norm": 0.2928260564804077, + "learning_rate": 1.9051486345269115e-05, + "loss": 0.1593, + "step": 7528 + }, + { + "epoch": 0.13979496738117428, + "grad_norm": 0.37625187635421753, + "learning_rate": 1.9050990414078826e-05, + "loss": 0.2461, + "step": 7530 + }, + { + "epoch": 0.1398320975185929, + "grad_norm": 0.5237709283828735, + "learning_rate": 1.9050494359731667e-05, + "loss": 0.4091, + "step": 7532 + }, + { + "epoch": 0.13986922765601156, + "grad_norm": 0.23249033093452454, + "learning_rate": 1.9049998182234385e-05, + "loss": 0.2453, + "step": 7534 + }, + { + "epoch": 0.1399063577934302, + "grad_norm": 0.3015519976615906, + "learning_rate": 1.904950188159374e-05, + "loss": 0.3016, + "step": 7536 + }, + { + "epoch": 0.13994348793084882, + "grad_norm": 0.4378632605075836, + "learning_rate": 1.9049005457816477e-05, + "loss": 0.3656, + "step": 7538 + }, + { + "epoch": 0.13998061806826748, + "grad_norm": 0.31358802318573, + "learning_rate": 1.9048508910909356e-05, + "loss": 0.3226, + "step": 7540 + }, + { + "epoch": 0.1400177482056861, + "grad_norm": 0.34746092557907104, + "learning_rate": 1.9048012240879132e-05, + "loss": 0.3952, + "step": 7542 + }, + { + "epoch": 0.14005487834310476, + "grad_norm": 0.3674270510673523, + "learning_rate": 1.9047515447732564e-05, + "loss": 0.4364, + "step": 7544 + }, + { + "epoch": 0.1400920084805234, + "grad_norm": 0.49495619535446167, + "learning_rate": 1.9047018531476415e-05, + "loss": 0.3908, + "step": 7546 + }, + { + "epoch": 0.14012913861794202, + "grad_norm": 0.31911754608154297, + "learning_rate": 1.9046521492117437e-05, + "loss": 0.4131, + "step": 7548 + }, + { + "epoch": 0.14016626875536067, + "grad_norm": 0.5316864848136902, + "learning_rate": 1.90460243296624e-05, + "loss": 0.3893, + "step": 7550 + }, + { + "epoch": 0.1402033988927793, + "grad_norm": 0.2884799540042877, + "learning_rate": 1.904552704411807e-05, + "loss": 0.2538, + "step": 7552 + }, + { + "epoch": 0.14024052903019793, + "grad_norm": 0.4590071141719818, + "learning_rate": 1.904502963549121e-05, + "loss": 0.3412, + "step": 7554 + }, + { + "epoch": 0.1402776591676166, + "grad_norm": 0.35984379053115845, + "learning_rate": 1.9044532103788588e-05, + "loss": 0.3171, + "step": 7556 + }, + { + "epoch": 0.14031478930503521, + "grad_norm": 0.5221495628356934, + "learning_rate": 1.9044034449016975e-05, + "loss": 0.4594, + "step": 7558 + }, + { + "epoch": 0.14035191944245384, + "grad_norm": 0.3746601343154907, + "learning_rate": 1.904353667118315e-05, + "loss": 0.1844, + "step": 7560 + }, + { + "epoch": 0.1403890495798725, + "grad_norm": 0.3336004316806793, + "learning_rate": 1.9043038770293874e-05, + "loss": 0.5946, + "step": 7562 + }, + { + "epoch": 0.14042617971729113, + "grad_norm": 0.4773891866207123, + "learning_rate": 1.9042540746355928e-05, + "loss": 0.4231, + "step": 7564 + }, + { + "epoch": 0.14046330985470978, + "grad_norm": 0.3812766671180725, + "learning_rate": 1.904204259937609e-05, + "loss": 0.4146, + "step": 7566 + }, + { + "epoch": 0.1405004399921284, + "grad_norm": 0.5302526950836182, + "learning_rate": 1.9041544329361136e-05, + "loss": 0.3401, + "step": 7568 + }, + { + "epoch": 0.14053757012954704, + "grad_norm": 0.4042723774909973, + "learning_rate": 1.9041045936317842e-05, + "loss": 0.3921, + "step": 7570 + }, + { + "epoch": 0.1405747002669657, + "grad_norm": 0.42343172430992126, + "learning_rate": 1.9040547420252998e-05, + "loss": 0.1814, + "step": 7572 + }, + { + "epoch": 0.14061183040438432, + "grad_norm": 0.3352805972099304, + "learning_rate": 1.9040048781173383e-05, + "loss": 0.4003, + "step": 7574 + }, + { + "epoch": 0.14064896054180295, + "grad_norm": 0.30000630021095276, + "learning_rate": 1.9039550019085776e-05, + "loss": 0.3686, + "step": 7576 + }, + { + "epoch": 0.1406860906792216, + "grad_norm": 0.45026734471321106, + "learning_rate": 1.9039051133996978e-05, + "loss": 0.387, + "step": 7578 + }, + { + "epoch": 0.14072322081664024, + "grad_norm": 0.6084704399108887, + "learning_rate": 1.9038552125913768e-05, + "loss": 0.4046, + "step": 7580 + }, + { + "epoch": 0.1407603509540589, + "grad_norm": 0.3534768223762512, + "learning_rate": 1.9038052994842933e-05, + "loss": 0.2335, + "step": 7582 + }, + { + "epoch": 0.14079748109147752, + "grad_norm": 0.3005528151988983, + "learning_rate": 1.9037553740791272e-05, + "loss": 0.4198, + "step": 7584 + }, + { + "epoch": 0.14083461122889615, + "grad_norm": 0.2921445965766907, + "learning_rate": 1.9037054363765572e-05, + "loss": 0.3079, + "step": 7586 + }, + { + "epoch": 0.1408717413663148, + "grad_norm": 0.41042840480804443, + "learning_rate": 1.9036554863772637e-05, + "loss": 0.4258, + "step": 7588 + }, + { + "epoch": 0.14090887150373343, + "grad_norm": 0.22219650447368622, + "learning_rate": 1.9036055240819252e-05, + "loss": 0.2717, + "step": 7590 + }, + { + "epoch": 0.14094600164115206, + "grad_norm": 0.27416983246803284, + "learning_rate": 1.9035555494912225e-05, + "loss": 0.3806, + "step": 7592 + }, + { + "epoch": 0.14098313177857072, + "grad_norm": 0.2190663367509842, + "learning_rate": 1.903505562605835e-05, + "loss": 0.3256, + "step": 7594 + }, + { + "epoch": 0.14102026191598935, + "grad_norm": 0.381106972694397, + "learning_rate": 1.903455563426443e-05, + "loss": 0.4424, + "step": 7596 + }, + { + "epoch": 0.14105739205340798, + "grad_norm": 0.4182669520378113, + "learning_rate": 1.9034055519537272e-05, + "loss": 0.4043, + "step": 7598 + }, + { + "epoch": 0.14109452219082663, + "grad_norm": 0.3130188286304474, + "learning_rate": 1.903355528188368e-05, + "loss": 0.4642, + "step": 7600 + }, + { + "epoch": 0.14113165232824526, + "grad_norm": 0.33640605211257935, + "learning_rate": 1.903305492131046e-05, + "loss": 0.2603, + "step": 7602 + }, + { + "epoch": 0.14116878246566392, + "grad_norm": 0.27698177099227905, + "learning_rate": 1.9032554437824414e-05, + "loss": 0.3162, + "step": 7604 + }, + { + "epoch": 0.14120591260308255, + "grad_norm": 0.2489544302225113, + "learning_rate": 1.903205383143236e-05, + "loss": 0.3388, + "step": 7606 + }, + { + "epoch": 0.14124304274050117, + "grad_norm": 0.3171347975730896, + "learning_rate": 1.903155310214111e-05, + "loss": 0.3501, + "step": 7608 + }, + { + "epoch": 0.14128017287791983, + "grad_norm": 0.4473964273929596, + "learning_rate": 1.903105224995747e-05, + "loss": 0.2558, + "step": 7610 + }, + { + "epoch": 0.14131730301533846, + "grad_norm": 0.37853822112083435, + "learning_rate": 1.9030551274888266e-05, + "loss": 0.376, + "step": 7612 + }, + { + "epoch": 0.1413544331527571, + "grad_norm": 0.31770774722099304, + "learning_rate": 1.9030050176940306e-05, + "loss": 0.2787, + "step": 7614 + }, + { + "epoch": 0.14139156329017574, + "grad_norm": 0.47328582406044006, + "learning_rate": 1.902954895612041e-05, + "loss": 0.3568, + "step": 7616 + }, + { + "epoch": 0.14142869342759437, + "grad_norm": 0.3312424123287201, + "learning_rate": 1.90290476124354e-05, + "loss": 0.3017, + "step": 7618 + }, + { + "epoch": 0.14146582356501303, + "grad_norm": 0.3369753360748291, + "learning_rate": 1.90285461458921e-05, + "loss": 0.3316, + "step": 7620 + }, + { + "epoch": 0.14150295370243166, + "grad_norm": 0.4140273630619049, + "learning_rate": 1.9028044556497324e-05, + "loss": 0.2932, + "step": 7622 + }, + { + "epoch": 0.14154008383985028, + "grad_norm": 0.37320852279663086, + "learning_rate": 1.902754284425791e-05, + "loss": 0.3398, + "step": 7624 + }, + { + "epoch": 0.14157721397726894, + "grad_norm": 0.30271226167678833, + "learning_rate": 1.9027041009180676e-05, + "loss": 0.4219, + "step": 7626 + }, + { + "epoch": 0.14161434411468757, + "grad_norm": 0.48476535081863403, + "learning_rate": 1.9026539051272454e-05, + "loss": 0.5501, + "step": 7628 + }, + { + "epoch": 0.1416514742521062, + "grad_norm": 0.24782590568065643, + "learning_rate": 1.902603697054007e-05, + "loss": 0.2794, + "step": 7630 + }, + { + "epoch": 0.14168860438952485, + "grad_norm": 0.2944287955760956, + "learning_rate": 1.9025534766990362e-05, + "loss": 0.3361, + "step": 7632 + }, + { + "epoch": 0.14172573452694348, + "grad_norm": 0.38036850094795227, + "learning_rate": 1.9025032440630164e-05, + "loss": 0.3949, + "step": 7634 + }, + { + "epoch": 0.1417628646643621, + "grad_norm": 0.5582762956619263, + "learning_rate": 1.90245299914663e-05, + "loss": 0.5108, + "step": 7636 + }, + { + "epoch": 0.14179999480178077, + "grad_norm": 0.34962642192840576, + "learning_rate": 1.902402741950562e-05, + "loss": 0.2645, + "step": 7638 + }, + { + "epoch": 0.1418371249391994, + "grad_norm": 0.3124140799045563, + "learning_rate": 1.9023524724754953e-05, + "loss": 0.1926, + "step": 7640 + }, + { + "epoch": 0.14187425507661805, + "grad_norm": 0.3975899815559387, + "learning_rate": 1.9023021907221145e-05, + "loss": 0.2962, + "step": 7642 + }, + { + "epoch": 0.14191138521403668, + "grad_norm": 0.34952908754348755, + "learning_rate": 1.9022518966911036e-05, + "loss": 0.4175, + "step": 7644 + }, + { + "epoch": 0.1419485153514553, + "grad_norm": 0.30632108449935913, + "learning_rate": 1.902201590383147e-05, + "loss": 0.3112, + "step": 7646 + }, + { + "epoch": 0.14198564548887396, + "grad_norm": 0.5652284622192383, + "learning_rate": 1.902151271798929e-05, + "loss": 0.5193, + "step": 7648 + }, + { + "epoch": 0.1420227756262926, + "grad_norm": 0.27054890990257263, + "learning_rate": 1.9021009409391346e-05, + "loss": 0.2578, + "step": 7650 + }, + { + "epoch": 0.14205990576371122, + "grad_norm": 0.2990904152393341, + "learning_rate": 1.9020505978044484e-05, + "loss": 0.5334, + "step": 7652 + }, + { + "epoch": 0.14209703590112988, + "grad_norm": 0.3570253849029541, + "learning_rate": 1.9020002423955555e-05, + "loss": 0.3386, + "step": 7654 + }, + { + "epoch": 0.1421341660385485, + "grad_norm": 0.37413713335990906, + "learning_rate": 1.9019498747131412e-05, + "loss": 0.3042, + "step": 7656 + }, + { + "epoch": 0.14217129617596716, + "grad_norm": 0.3607179522514343, + "learning_rate": 1.9018994947578905e-05, + "loss": 0.3253, + "step": 7658 + }, + { + "epoch": 0.1422084263133858, + "grad_norm": 0.4070660471916199, + "learning_rate": 1.9018491025304895e-05, + "loss": 0.3294, + "step": 7660 + }, + { + "epoch": 0.14224555645080442, + "grad_norm": 0.32453879714012146, + "learning_rate": 1.9017986980316236e-05, + "loss": 0.1705, + "step": 7662 + }, + { + "epoch": 0.14228268658822307, + "grad_norm": 0.3114584982395172, + "learning_rate": 1.9017482812619784e-05, + "loss": 0.6314, + "step": 7664 + }, + { + "epoch": 0.1423198167256417, + "grad_norm": 0.4178317189216614, + "learning_rate": 1.9016978522222403e-05, + "loss": 0.3268, + "step": 7666 + }, + { + "epoch": 0.14235694686306033, + "grad_norm": 0.2957448661327362, + "learning_rate": 1.9016474109130955e-05, + "loss": 0.3073, + "step": 7668 + }, + { + "epoch": 0.14239407700047899, + "grad_norm": 0.33956751227378845, + "learning_rate": 1.9015969573352296e-05, + "loss": 0.3471, + "step": 7670 + }, + { + "epoch": 0.14243120713789761, + "grad_norm": 0.3486991226673126, + "learning_rate": 1.90154649148933e-05, + "loss": 0.1757, + "step": 7672 + }, + { + "epoch": 0.14246833727531624, + "grad_norm": 0.32016023993492126, + "learning_rate": 1.901496013376083e-05, + "loss": 0.3254, + "step": 7674 + }, + { + "epoch": 0.1425054674127349, + "grad_norm": 0.5558412075042725, + "learning_rate": 1.9014455229961757e-05, + "loss": 0.4424, + "step": 7676 + }, + { + "epoch": 0.14254259755015353, + "grad_norm": 0.35902220010757446, + "learning_rate": 1.901395020350295e-05, + "loss": 0.2228, + "step": 7678 + }, + { + "epoch": 0.14257972768757218, + "grad_norm": 0.3260502517223358, + "learning_rate": 1.901344505439128e-05, + "loss": 0.4339, + "step": 7680 + }, + { + "epoch": 0.1426168578249908, + "grad_norm": 0.3270975649356842, + "learning_rate": 1.9012939782633624e-05, + "loss": 0.4883, + "step": 7682 + }, + { + "epoch": 0.14265398796240944, + "grad_norm": 0.33395111560821533, + "learning_rate": 1.901243438823685e-05, + "loss": 0.3784, + "step": 7684 + }, + { + "epoch": 0.1426911180998281, + "grad_norm": 0.36998942494392395, + "learning_rate": 1.901192887120784e-05, + "loss": 0.2678, + "step": 7686 + }, + { + "epoch": 0.14272824823724672, + "grad_norm": 0.3290286362171173, + "learning_rate": 1.9011423231553473e-05, + "loss": 0.4979, + "step": 7688 + }, + { + "epoch": 0.14276537837466535, + "grad_norm": 0.2865745723247528, + "learning_rate": 1.9010917469280628e-05, + "loss": 0.3269, + "step": 7690 + }, + { + "epoch": 0.142802508512084, + "grad_norm": 0.4057199954986572, + "learning_rate": 1.901041158439619e-05, + "loss": 0.297, + "step": 7692 + }, + { + "epoch": 0.14283963864950264, + "grad_norm": 0.31791216135025024, + "learning_rate": 1.9009905576907035e-05, + "loss": 0.223, + "step": 7694 + }, + { + "epoch": 0.1428767687869213, + "grad_norm": 0.3224818706512451, + "learning_rate": 1.9009399446820056e-05, + "loss": 0.4132, + "step": 7696 + }, + { + "epoch": 0.14291389892433992, + "grad_norm": 0.3537513017654419, + "learning_rate": 1.9008893194142135e-05, + "loss": 0.4522, + "step": 7698 + }, + { + "epoch": 0.14295102906175855, + "grad_norm": 0.7173958420753479, + "learning_rate": 1.900838681888016e-05, + "loss": 0.3052, + "step": 7700 + }, + { + "epoch": 0.1429881591991772, + "grad_norm": 0.4466204345226288, + "learning_rate": 1.900788032104103e-05, + "loss": 0.3026, + "step": 7702 + }, + { + "epoch": 0.14302528933659583, + "grad_norm": 0.2805536687374115, + "learning_rate": 1.9007373700631627e-05, + "loss": 0.3117, + "step": 7704 + }, + { + "epoch": 0.14306241947401446, + "grad_norm": 0.436399906873703, + "learning_rate": 1.9006866957658848e-05, + "loss": 0.2908, + "step": 7706 + }, + { + "epoch": 0.14309954961143312, + "grad_norm": 0.4815303683280945, + "learning_rate": 1.900636009212959e-05, + "loss": 0.3602, + "step": 7708 + }, + { + "epoch": 0.14313667974885175, + "grad_norm": 0.3500632643699646, + "learning_rate": 1.9005853104050745e-05, + "loss": 0.2676, + "step": 7710 + }, + { + "epoch": 0.14317380988627038, + "grad_norm": 0.4023997485637665, + "learning_rate": 1.9005345993429218e-05, + "loss": 0.3419, + "step": 7712 + }, + { + "epoch": 0.14321094002368903, + "grad_norm": 0.46251416206359863, + "learning_rate": 1.9004838760271903e-05, + "loss": 0.324, + "step": 7714 + }, + { + "epoch": 0.14324807016110766, + "grad_norm": 0.4099741578102112, + "learning_rate": 1.900433140458571e-05, + "loss": 0.1832, + "step": 7716 + }, + { + "epoch": 0.14328520029852632, + "grad_norm": 0.3889768421649933, + "learning_rate": 1.9003823926377534e-05, + "loss": 0.3035, + "step": 7718 + }, + { + "epoch": 0.14332233043594494, + "grad_norm": 0.33130908012390137, + "learning_rate": 1.9003316325654287e-05, + "loss": 0.2448, + "step": 7720 + }, + { + "epoch": 0.14335946057336357, + "grad_norm": 0.3851718306541443, + "learning_rate": 1.900280860242287e-05, + "loss": 0.3418, + "step": 7722 + }, + { + "epoch": 0.14339659071078223, + "grad_norm": 0.26829975843429565, + "learning_rate": 1.9002300756690195e-05, + "loss": 0.3561, + "step": 7724 + }, + { + "epoch": 0.14343372084820086, + "grad_norm": 0.276117205619812, + "learning_rate": 1.900179278846317e-05, + "loss": 0.3257, + "step": 7726 + }, + { + "epoch": 0.14347085098561949, + "grad_norm": 0.37600189447402954, + "learning_rate": 1.9001284697748713e-05, + "loss": 0.3427, + "step": 7728 + }, + { + "epoch": 0.14350798112303814, + "grad_norm": 0.3589591979980469, + "learning_rate": 1.900077648455373e-05, + "loss": 0.2937, + "step": 7730 + }, + { + "epoch": 0.14354511126045677, + "grad_norm": 0.26378333568573, + "learning_rate": 1.900026814888514e-05, + "loss": 0.2918, + "step": 7732 + }, + { + "epoch": 0.14358224139787543, + "grad_norm": 0.3817266523838043, + "learning_rate": 1.8999759690749858e-05, + "loss": 0.3178, + "step": 7734 + }, + { + "epoch": 0.14361937153529405, + "grad_norm": 0.34049347043037415, + "learning_rate": 1.8999251110154807e-05, + "loss": 0.1398, + "step": 7736 + }, + { + "epoch": 0.14365650167271268, + "grad_norm": 0.34080126881599426, + "learning_rate": 1.8998742407106904e-05, + "loss": 0.2845, + "step": 7738 + }, + { + "epoch": 0.14369363181013134, + "grad_norm": 0.8032690286636353, + "learning_rate": 1.8998233581613067e-05, + "loss": 0.2119, + "step": 7740 + }, + { + "epoch": 0.14373076194754997, + "grad_norm": 0.44622287154197693, + "learning_rate": 1.8997724633680227e-05, + "loss": 0.29, + "step": 7742 + }, + { + "epoch": 0.1437678920849686, + "grad_norm": 0.37246716022491455, + "learning_rate": 1.8997215563315307e-05, + "loss": 0.3218, + "step": 7744 + }, + { + "epoch": 0.14380502222238725, + "grad_norm": 0.308791846036911, + "learning_rate": 1.8996706370525232e-05, + "loss": 0.3083, + "step": 7746 + }, + { + "epoch": 0.14384215235980588, + "grad_norm": 0.4257645905017853, + "learning_rate": 1.899619705531693e-05, + "loss": 0.3996, + "step": 7748 + }, + { + "epoch": 0.1438792824972245, + "grad_norm": 0.3255974352359772, + "learning_rate": 1.8995687617697336e-05, + "loss": 0.2728, + "step": 7750 + }, + { + "epoch": 0.14391641263464316, + "grad_norm": 0.6185017824172974, + "learning_rate": 1.899517805767338e-05, + "loss": 0.2815, + "step": 7752 + }, + { + "epoch": 0.1439535427720618, + "grad_norm": 0.550037145614624, + "learning_rate": 1.8994668375251986e-05, + "loss": 0.3242, + "step": 7754 + }, + { + "epoch": 0.14399067290948045, + "grad_norm": 0.41134992241859436, + "learning_rate": 1.8994158570440105e-05, + "loss": 0.3645, + "step": 7756 + }, + { + "epoch": 0.14402780304689908, + "grad_norm": 0.4989023506641388, + "learning_rate": 1.8993648643244666e-05, + "loss": 0.2693, + "step": 7758 + }, + { + "epoch": 0.1440649331843177, + "grad_norm": 0.3724779784679413, + "learning_rate": 1.8993138593672603e-05, + "loss": 0.3641, + "step": 7760 + }, + { + "epoch": 0.14410206332173636, + "grad_norm": 0.39235255122184753, + "learning_rate": 1.899262842173087e-05, + "loss": 0.253, + "step": 7762 + }, + { + "epoch": 0.144139193459155, + "grad_norm": 0.4052632749080658, + "learning_rate": 1.899211812742639e-05, + "loss": 0.2497, + "step": 7764 + }, + { + "epoch": 0.14417632359657362, + "grad_norm": 0.38518375158309937, + "learning_rate": 1.899160771076612e-05, + "loss": 0.4068, + "step": 7766 + }, + { + "epoch": 0.14421345373399228, + "grad_norm": 0.3408479690551758, + "learning_rate": 1.8991097171757008e-05, + "loss": 0.3415, + "step": 7768 + }, + { + "epoch": 0.1442505838714109, + "grad_norm": 0.34612441062927246, + "learning_rate": 1.899058651040599e-05, + "loss": 0.2504, + "step": 7770 + }, + { + "epoch": 0.14428771400882956, + "grad_norm": 0.3262399137020111, + "learning_rate": 1.899007572672002e-05, + "loss": 0.2993, + "step": 7772 + }, + { + "epoch": 0.1443248441462482, + "grad_norm": 0.29352858662605286, + "learning_rate": 1.898956482070605e-05, + "loss": 0.2231, + "step": 7774 + }, + { + "epoch": 0.14436197428366682, + "grad_norm": 0.4054524600505829, + "learning_rate": 1.8989053792371023e-05, + "loss": 0.2445, + "step": 7776 + }, + { + "epoch": 0.14439910442108547, + "grad_norm": 0.3525488078594208, + "learning_rate": 1.8988542641721906e-05, + "loss": 0.5101, + "step": 7778 + }, + { + "epoch": 0.1444362345585041, + "grad_norm": 0.33733558654785156, + "learning_rate": 1.8988031368765646e-05, + "loss": 0.1283, + "step": 7780 + }, + { + "epoch": 0.14447336469592273, + "grad_norm": 0.383801132440567, + "learning_rate": 1.89875199735092e-05, + "loss": 0.2063, + "step": 7782 + }, + { + "epoch": 0.14451049483334139, + "grad_norm": 0.31123608350753784, + "learning_rate": 1.8987008455959528e-05, + "loss": 0.2958, + "step": 7784 + }, + { + "epoch": 0.14454762497076, + "grad_norm": 0.3273412585258484, + "learning_rate": 1.898649681612359e-05, + "loss": 0.4372, + "step": 7786 + }, + { + "epoch": 0.14458475510817864, + "grad_norm": 0.5262473225593567, + "learning_rate": 1.898598505400835e-05, + "loss": 0.1877, + "step": 7788 + }, + { + "epoch": 0.1446218852455973, + "grad_norm": 0.5094627141952515, + "learning_rate": 1.898547316962077e-05, + "loss": 0.513, + "step": 7790 + }, + { + "epoch": 0.14465901538301593, + "grad_norm": 0.46677839756011963, + "learning_rate": 1.898496116296781e-05, + "loss": 0.257, + "step": 7792 + }, + { + "epoch": 0.14469614552043458, + "grad_norm": 0.3048141300678253, + "learning_rate": 1.898444903405645e-05, + "loss": 0.3845, + "step": 7794 + }, + { + "epoch": 0.1447332756578532, + "grad_norm": 0.2790718972682953, + "learning_rate": 1.898393678289364e-05, + "loss": 0.4282, + "step": 7796 + }, + { + "epoch": 0.14477040579527184, + "grad_norm": 0.38363999128341675, + "learning_rate": 1.8983424409486366e-05, + "loss": 0.3478, + "step": 7798 + }, + { + "epoch": 0.1448075359326905, + "grad_norm": 0.32744741439819336, + "learning_rate": 1.8982911913841594e-05, + "loss": 0.3119, + "step": 7800 + }, + { + "epoch": 0.14484466607010912, + "grad_norm": 0.2869434654712677, + "learning_rate": 1.8982399295966295e-05, + "loss": 0.2012, + "step": 7802 + }, + { + "epoch": 0.14488179620752775, + "grad_norm": 0.46112629771232605, + "learning_rate": 1.8981886555867452e-05, + "loss": 0.2723, + "step": 7804 + }, + { + "epoch": 0.1449189263449464, + "grad_norm": 0.36200079321861267, + "learning_rate": 1.898137369355203e-05, + "loss": 0.4567, + "step": 7806 + }, + { + "epoch": 0.14495605648236504, + "grad_norm": 0.3622796833515167, + "learning_rate": 1.898086070902702e-05, + "loss": 0.2471, + "step": 7808 + }, + { + "epoch": 0.1449931866197837, + "grad_norm": 0.45017170906066895, + "learning_rate": 1.8980347602299396e-05, + "loss": 0.4459, + "step": 7810 + }, + { + "epoch": 0.14503031675720232, + "grad_norm": 0.34005579352378845, + "learning_rate": 1.8979834373376142e-05, + "loss": 0.2367, + "step": 7812 + }, + { + "epoch": 0.14506744689462095, + "grad_norm": 0.3467685878276825, + "learning_rate": 1.8979321022264234e-05, + "loss": 0.5057, + "step": 7814 + }, + { + "epoch": 0.1451045770320396, + "grad_norm": 0.24002090096473694, + "learning_rate": 1.8978807548970667e-05, + "loss": 0.5069, + "step": 7816 + }, + { + "epoch": 0.14514170716945823, + "grad_norm": 0.29576870799064636, + "learning_rate": 1.8978293953502422e-05, + "loss": 0.3225, + "step": 7818 + }, + { + "epoch": 0.14517883730687686, + "grad_norm": 0.26339441537857056, + "learning_rate": 1.8977780235866494e-05, + "loss": 0.1784, + "step": 7820 + }, + { + "epoch": 0.14521596744429552, + "grad_norm": 0.4551510512828827, + "learning_rate": 1.897726639606986e-05, + "loss": 0.4412, + "step": 7822 + }, + { + "epoch": 0.14525309758171415, + "grad_norm": 0.41436058282852173, + "learning_rate": 1.897675243411953e-05, + "loss": 0.4262, + "step": 7824 + }, + { + "epoch": 0.14529022771913278, + "grad_norm": 0.4043031334877014, + "learning_rate": 1.897623835002248e-05, + "loss": 0.3014, + "step": 7826 + }, + { + "epoch": 0.14532735785655143, + "grad_norm": 0.4691769480705261, + "learning_rate": 1.8975724143785717e-05, + "loss": 0.4732, + "step": 7828 + }, + { + "epoch": 0.14536448799397006, + "grad_norm": 0.309151291847229, + "learning_rate": 1.8975209815416235e-05, + "loss": 0.3494, + "step": 7830 + }, + { + "epoch": 0.14540161813138872, + "grad_norm": 0.3898021876811981, + "learning_rate": 1.897469536492103e-05, + "loss": 0.3605, + "step": 7832 + }, + { + "epoch": 0.14543874826880734, + "grad_norm": 0.34558334946632385, + "learning_rate": 1.89741807923071e-05, + "loss": 0.5045, + "step": 7834 + }, + { + "epoch": 0.14547587840622597, + "grad_norm": 0.3210196793079376, + "learning_rate": 1.8973666097581456e-05, + "loss": 0.3742, + "step": 7836 + }, + { + "epoch": 0.14551300854364463, + "grad_norm": 0.35887858271598816, + "learning_rate": 1.8973151280751092e-05, + "loss": 0.276, + "step": 7838 + }, + { + "epoch": 0.14555013868106326, + "grad_norm": 0.298946738243103, + "learning_rate": 1.897263634182302e-05, + "loss": 0.3235, + "step": 7840 + }, + { + "epoch": 0.14558726881848189, + "grad_norm": 0.45819589495658875, + "learning_rate": 1.8972121280804238e-05, + "loss": 0.2479, + "step": 7842 + }, + { + "epoch": 0.14562439895590054, + "grad_norm": 0.3384646475315094, + "learning_rate": 1.8971606097701764e-05, + "loss": 0.4018, + "step": 7844 + }, + { + "epoch": 0.14566152909331917, + "grad_norm": 0.28389620780944824, + "learning_rate": 1.8971090792522604e-05, + "loss": 0.5037, + "step": 7846 + }, + { + "epoch": 0.14569865923073783, + "grad_norm": 0.4108288288116455, + "learning_rate": 1.897057536527377e-05, + "loss": 0.1672, + "step": 7848 + }, + { + "epoch": 0.14573578936815645, + "grad_norm": 0.33314409852027893, + "learning_rate": 1.8970059815962272e-05, + "loss": 0.2858, + "step": 7850 + }, + { + "epoch": 0.14577291950557508, + "grad_norm": 0.4274868071079254, + "learning_rate": 1.896954414459513e-05, + "loss": 0.2463, + "step": 7852 + }, + { + "epoch": 0.14581004964299374, + "grad_norm": 0.4678952991962433, + "learning_rate": 1.8969028351179363e-05, + "loss": 0.3779, + "step": 7854 + }, + { + "epoch": 0.14584717978041237, + "grad_norm": 0.3344060480594635, + "learning_rate": 1.896851243572198e-05, + "loss": 0.387, + "step": 7856 + }, + { + "epoch": 0.145884309917831, + "grad_norm": 0.37126004695892334, + "learning_rate": 1.896799639823001e-05, + "loss": 0.3157, + "step": 7858 + }, + { + "epoch": 0.14592144005524965, + "grad_norm": 0.3887842297554016, + "learning_rate": 1.896748023871047e-05, + "loss": 0.5238, + "step": 7860 + }, + { + "epoch": 0.14595857019266828, + "grad_norm": 0.3083222508430481, + "learning_rate": 1.8966963957170383e-05, + "loss": 0.3956, + "step": 7862 + }, + { + "epoch": 0.1459957003300869, + "grad_norm": 0.3145461082458496, + "learning_rate": 1.8966447553616777e-05, + "loss": 0.3687, + "step": 7864 + }, + { + "epoch": 0.14603283046750556, + "grad_norm": 0.5283592343330383, + "learning_rate": 1.8965931028056678e-05, + "loss": 0.4006, + "step": 7866 + }, + { + "epoch": 0.1460699606049242, + "grad_norm": 0.3143438696861267, + "learning_rate": 1.8965414380497115e-05, + "loss": 0.1904, + "step": 7868 + }, + { + "epoch": 0.14610709074234285, + "grad_norm": 0.41060465574264526, + "learning_rate": 1.8964897610945116e-05, + "loss": 0.3466, + "step": 7870 + }, + { + "epoch": 0.14614422087976148, + "grad_norm": 0.38744065165519714, + "learning_rate": 1.896438071940771e-05, + "loss": 0.4229, + "step": 7872 + }, + { + "epoch": 0.1461813510171801, + "grad_norm": 0.3486790955066681, + "learning_rate": 1.8963863705891937e-05, + "loss": 0.3158, + "step": 7874 + }, + { + "epoch": 0.14621848115459876, + "grad_norm": 0.407792866230011, + "learning_rate": 1.896334657040483e-05, + "loss": 0.2977, + "step": 7876 + }, + { + "epoch": 0.1462556112920174, + "grad_norm": 0.2871744632720947, + "learning_rate": 1.8962829312953422e-05, + "loss": 0.2173, + "step": 7878 + }, + { + "epoch": 0.14629274142943602, + "grad_norm": 0.3657383918762207, + "learning_rate": 1.8962311933544755e-05, + "loss": 0.2121, + "step": 7880 + }, + { + "epoch": 0.14632987156685467, + "grad_norm": 0.35087794065475464, + "learning_rate": 1.8961794432185868e-05, + "loss": 0.3991, + "step": 7882 + }, + { + "epoch": 0.1463670017042733, + "grad_norm": 0.47817355394363403, + "learning_rate": 1.8961276808883805e-05, + "loss": 0.3414, + "step": 7884 + }, + { + "epoch": 0.14640413184169196, + "grad_norm": 0.2610751986503601, + "learning_rate": 1.8960759063645603e-05, + "loss": 0.3385, + "step": 7886 + }, + { + "epoch": 0.1464412619791106, + "grad_norm": 0.31454387307167053, + "learning_rate": 1.8960241196478312e-05, + "loss": 0.4764, + "step": 7888 + }, + { + "epoch": 0.14647839211652922, + "grad_norm": 0.2645203471183777, + "learning_rate": 1.8959723207388977e-05, + "loss": 0.2859, + "step": 7890 + }, + { + "epoch": 0.14651552225394787, + "grad_norm": 0.3764110803604126, + "learning_rate": 1.8959205096384645e-05, + "loss": 0.3828, + "step": 7892 + }, + { + "epoch": 0.1465526523913665, + "grad_norm": 0.4874105751514435, + "learning_rate": 1.895868686347237e-05, + "loss": 0.3197, + "step": 7894 + }, + { + "epoch": 0.14658978252878513, + "grad_norm": 0.49774765968322754, + "learning_rate": 1.89581685086592e-05, + "loss": 0.3244, + "step": 7896 + }, + { + "epoch": 0.14662691266620378, + "grad_norm": 0.4108216464519501, + "learning_rate": 1.895765003195219e-05, + "loss": 0.2732, + "step": 7898 + }, + { + "epoch": 0.1466640428036224, + "grad_norm": 0.353057861328125, + "learning_rate": 1.8957131433358397e-05, + "loss": 0.3649, + "step": 7900 + }, + { + "epoch": 0.14670117294104104, + "grad_norm": 0.42483264207839966, + "learning_rate": 1.895661271288487e-05, + "loss": 0.2648, + "step": 7902 + }, + { + "epoch": 0.1467383030784597, + "grad_norm": 0.569980800151825, + "learning_rate": 1.895609387053867e-05, + "loss": 0.1865, + "step": 7904 + }, + { + "epoch": 0.14677543321587833, + "grad_norm": 0.2782089114189148, + "learning_rate": 1.8955574906326867e-05, + "loss": 0.1886, + "step": 7906 + }, + { + "epoch": 0.14681256335329698, + "grad_norm": 0.49411752820014954, + "learning_rate": 1.895505582025651e-05, + "loss": 0.419, + "step": 7908 + }, + { + "epoch": 0.1468496934907156, + "grad_norm": 0.5169767141342163, + "learning_rate": 1.8954536612334668e-05, + "loss": 0.3793, + "step": 7910 + }, + { + "epoch": 0.14688682362813424, + "grad_norm": 0.2284662127494812, + "learning_rate": 1.8954017282568404e-05, + "loss": 0.4272, + "step": 7912 + }, + { + "epoch": 0.1469239537655529, + "grad_norm": 0.3215291202068329, + "learning_rate": 1.8953497830964786e-05, + "loss": 0.2995, + "step": 7914 + }, + { + "epoch": 0.14696108390297152, + "grad_norm": 0.40153321623802185, + "learning_rate": 1.895297825753088e-05, + "loss": 0.4652, + "step": 7916 + }, + { + "epoch": 0.14699821404039015, + "grad_norm": 0.2182285189628601, + "learning_rate": 1.895245856227376e-05, + "loss": 0.2078, + "step": 7918 + }, + { + "epoch": 0.1470353441778088, + "grad_norm": 0.5507566928863525, + "learning_rate": 1.8951938745200493e-05, + "loss": 0.2655, + "step": 7920 + }, + { + "epoch": 0.14707247431522744, + "grad_norm": 0.3515758514404297, + "learning_rate": 1.8951418806318153e-05, + "loss": 0.3281, + "step": 7922 + }, + { + "epoch": 0.1471096044526461, + "grad_norm": 0.26673653721809387, + "learning_rate": 1.8950898745633813e-05, + "loss": 0.2365, + "step": 7924 + }, + { + "epoch": 0.14714673459006472, + "grad_norm": 0.327364057302475, + "learning_rate": 1.8950378563154558e-05, + "loss": 0.2789, + "step": 7926 + }, + { + "epoch": 0.14718386472748335, + "grad_norm": 0.3492949903011322, + "learning_rate": 1.8949858258887456e-05, + "loss": 0.2403, + "step": 7928 + }, + { + "epoch": 0.147220994864902, + "grad_norm": 0.5095848441123962, + "learning_rate": 1.8949337832839592e-05, + "loss": 0.3239, + "step": 7930 + }, + { + "epoch": 0.14725812500232063, + "grad_norm": 0.35220614075660706, + "learning_rate": 1.8948817285018046e-05, + "loss": 0.407, + "step": 7932 + }, + { + "epoch": 0.14729525513973926, + "grad_norm": 0.5153085589408875, + "learning_rate": 1.8948296615429903e-05, + "loss": 0.4116, + "step": 7934 + }, + { + "epoch": 0.14733238527715792, + "grad_norm": 0.4465144872665405, + "learning_rate": 1.8947775824082247e-05, + "loss": 0.3227, + "step": 7936 + }, + { + "epoch": 0.14736951541457655, + "grad_norm": 0.3701249361038208, + "learning_rate": 1.894725491098216e-05, + "loss": 0.4431, + "step": 7938 + }, + { + "epoch": 0.14740664555199517, + "grad_norm": 0.3109250068664551, + "learning_rate": 1.8946733876136738e-05, + "loss": 0.2046, + "step": 7940 + }, + { + "epoch": 0.14744377568941383, + "grad_norm": 0.3355062007904053, + "learning_rate": 1.8946212719553067e-05, + "loss": 0.2802, + "step": 7942 + }, + { + "epoch": 0.14748090582683246, + "grad_norm": 0.29765698313713074, + "learning_rate": 1.8945691441238235e-05, + "loss": 0.5689, + "step": 7944 + }, + { + "epoch": 0.14751803596425112, + "grad_norm": 0.5657996535301208, + "learning_rate": 1.8945170041199338e-05, + "loss": 0.4415, + "step": 7946 + }, + { + "epoch": 0.14755516610166974, + "grad_norm": 0.48015791177749634, + "learning_rate": 1.8944648519443473e-05, + "loss": 0.3703, + "step": 7948 + }, + { + "epoch": 0.14759229623908837, + "grad_norm": 0.36339807510375977, + "learning_rate": 1.894412687597773e-05, + "loss": 0.4124, + "step": 7950 + }, + { + "epoch": 0.14762942637650703, + "grad_norm": 0.33174842596054077, + "learning_rate": 1.8943605110809216e-05, + "loss": 0.4065, + "step": 7952 + }, + { + "epoch": 0.14766655651392566, + "grad_norm": 0.4281650185585022, + "learning_rate": 1.894308322394502e-05, + "loss": 0.4237, + "step": 7954 + }, + { + "epoch": 0.14770368665134428, + "grad_norm": 0.2378285676240921, + "learning_rate": 1.8942561215392253e-05, + "loss": 0.3911, + "step": 7956 + }, + { + "epoch": 0.14774081678876294, + "grad_norm": 0.427030473947525, + "learning_rate": 1.8942039085158012e-05, + "loss": 0.2956, + "step": 7958 + }, + { + "epoch": 0.14777794692618157, + "grad_norm": 0.2923468053340912, + "learning_rate": 1.8941516833249406e-05, + "loss": 0.238, + "step": 7960 + }, + { + "epoch": 0.14781507706360023, + "grad_norm": 0.3950183689594269, + "learning_rate": 1.8940994459673536e-05, + "loss": 0.2453, + "step": 7962 + }, + { + "epoch": 0.14785220720101885, + "grad_norm": 0.541778028011322, + "learning_rate": 1.8940471964437514e-05, + "loss": 0.3373, + "step": 7964 + }, + { + "epoch": 0.14788933733843748, + "grad_norm": 0.5443995594978333, + "learning_rate": 1.893994934754845e-05, + "loss": 0.2392, + "step": 7966 + }, + { + "epoch": 0.14792646747585614, + "grad_norm": 0.24573831260204315, + "learning_rate": 1.8939426609013448e-05, + "loss": 0.2077, + "step": 7968 + }, + { + "epoch": 0.14796359761327477, + "grad_norm": 0.25011658668518066, + "learning_rate": 1.8938903748839634e-05, + "loss": 0.4232, + "step": 7970 + }, + { + "epoch": 0.1480007277506934, + "grad_norm": 0.35290372371673584, + "learning_rate": 1.893838076703411e-05, + "loss": 0.4335, + "step": 7972 + }, + { + "epoch": 0.14803785788811205, + "grad_norm": 0.42270928621292114, + "learning_rate": 1.8937857663604e-05, + "loss": 0.2595, + "step": 7974 + }, + { + "epoch": 0.14807498802553068, + "grad_norm": 0.3626469075679779, + "learning_rate": 1.893733443855642e-05, + "loss": 0.3947, + "step": 7976 + }, + { + "epoch": 0.1481121181629493, + "grad_norm": 0.2349478304386139, + "learning_rate": 1.8936811091898485e-05, + "loss": 0.3459, + "step": 7978 + }, + { + "epoch": 0.14814924830036796, + "grad_norm": 0.3554820716381073, + "learning_rate": 1.8936287623637323e-05, + "loss": 0.2615, + "step": 7980 + }, + { + "epoch": 0.1481863784377866, + "grad_norm": 0.36123916506767273, + "learning_rate": 1.893576403378005e-05, + "loss": 0.1804, + "step": 7982 + }, + { + "epoch": 0.14822350857520525, + "grad_norm": 0.32091060280799866, + "learning_rate": 1.8935240322333798e-05, + "loss": 0.4323, + "step": 7984 + }, + { + "epoch": 0.14826063871262388, + "grad_norm": 0.26613056659698486, + "learning_rate": 1.893471648930569e-05, + "loss": 0.4224, + "step": 7986 + }, + { + "epoch": 0.1482977688500425, + "grad_norm": 0.4161258637905121, + "learning_rate": 1.893419253470285e-05, + "loss": 0.1924, + "step": 7988 + }, + { + "epoch": 0.14833489898746116, + "grad_norm": 0.32795459032058716, + "learning_rate": 1.893366845853241e-05, + "loss": 0.4959, + "step": 7990 + }, + { + "epoch": 0.1483720291248798, + "grad_norm": 0.6610921025276184, + "learning_rate": 1.8933144260801502e-05, + "loss": 0.2338, + "step": 7992 + }, + { + "epoch": 0.14840915926229842, + "grad_norm": 0.22595588862895966, + "learning_rate": 1.8932619941517264e-05, + "loss": 0.36, + "step": 7994 + }, + { + "epoch": 0.14844628939971707, + "grad_norm": 0.4424133002758026, + "learning_rate": 1.893209550068682e-05, + "loss": 0.2464, + "step": 7996 + }, + { + "epoch": 0.1484834195371357, + "grad_norm": 0.343784362077713, + "learning_rate": 1.893157093831731e-05, + "loss": 0.2872, + "step": 7998 + }, + { + "epoch": 0.14852054967455436, + "grad_norm": 0.3268868923187256, + "learning_rate": 1.8931046254415875e-05, + "loss": 0.2923, + "step": 8000 + }, + { + "epoch": 0.148557679811973, + "grad_norm": 0.3321167528629303, + "learning_rate": 1.8930521448989653e-05, + "loss": 0.3032, + "step": 8002 + }, + { + "epoch": 0.14859480994939162, + "grad_norm": 0.24509146809577942, + "learning_rate": 1.892999652204578e-05, + "loss": 0.2059, + "step": 8004 + }, + { + "epoch": 0.14863194008681027, + "grad_norm": 0.25860974192619324, + "learning_rate": 1.8929471473591404e-05, + "loss": 0.2338, + "step": 8006 + }, + { + "epoch": 0.1486690702242289, + "grad_norm": 0.472266286611557, + "learning_rate": 1.892894630363367e-05, + "loss": 0.33, + "step": 8008 + }, + { + "epoch": 0.14870620036164753, + "grad_norm": 0.22727616131305695, + "learning_rate": 1.8928421012179725e-05, + "loss": 0.1486, + "step": 8010 + }, + { + "epoch": 0.14874333049906618, + "grad_norm": 0.3413729667663574, + "learning_rate": 1.8927895599236706e-05, + "loss": 0.2891, + "step": 8012 + }, + { + "epoch": 0.1487804606364848, + "grad_norm": 0.3149247169494629, + "learning_rate": 1.892737006481178e-05, + "loss": 0.3343, + "step": 8014 + }, + { + "epoch": 0.14881759077390344, + "grad_norm": 0.5944235920906067, + "learning_rate": 1.892684440891208e-05, + "loss": 0.3546, + "step": 8016 + }, + { + "epoch": 0.1488547209113221, + "grad_norm": 0.313130259513855, + "learning_rate": 1.8926318631544768e-05, + "loss": 0.4429, + "step": 8018 + }, + { + "epoch": 0.14889185104874073, + "grad_norm": 0.30029743909835815, + "learning_rate": 1.8925792732717e-05, + "loss": 0.2523, + "step": 8020 + }, + { + "epoch": 0.14892898118615938, + "grad_norm": 0.41819971799850464, + "learning_rate": 1.8925266712435926e-05, + "loss": 0.2928, + "step": 8022 + }, + { + "epoch": 0.148966111323578, + "grad_norm": 0.3444328308105469, + "learning_rate": 1.8924740570708707e-05, + "loss": 0.4183, + "step": 8024 + }, + { + "epoch": 0.14900324146099664, + "grad_norm": 0.2936622202396393, + "learning_rate": 1.89242143075425e-05, + "loss": 0.5569, + "step": 8026 + }, + { + "epoch": 0.1490403715984153, + "grad_norm": 0.3413342535495758, + "learning_rate": 1.8923687922944468e-05, + "loss": 0.3514, + "step": 8028 + }, + { + "epoch": 0.14907750173583392, + "grad_norm": 0.30846622586250305, + "learning_rate": 1.8923161416921775e-05, + "loss": 0.2658, + "step": 8030 + }, + { + "epoch": 0.14911463187325255, + "grad_norm": 0.3375438153743744, + "learning_rate": 1.8922634789481582e-05, + "loss": 0.2853, + "step": 8032 + }, + { + "epoch": 0.1491517620106712, + "grad_norm": 0.42673662304878235, + "learning_rate": 1.8922108040631054e-05, + "loss": 0.4383, + "step": 8034 + }, + { + "epoch": 0.14918889214808984, + "grad_norm": 0.43633684515953064, + "learning_rate": 1.8921581170377364e-05, + "loss": 0.3557, + "step": 8036 + }, + { + "epoch": 0.1492260222855085, + "grad_norm": 0.5316792130470276, + "learning_rate": 1.8921054178727677e-05, + "loss": 0.3184, + "step": 8038 + }, + { + "epoch": 0.14926315242292712, + "grad_norm": 0.4390234649181366, + "learning_rate": 1.892052706568916e-05, + "loss": 0.3402, + "step": 8040 + }, + { + "epoch": 0.14930028256034575, + "grad_norm": 0.3417345881462097, + "learning_rate": 1.8919999831268992e-05, + "loss": 0.3539, + "step": 8042 + }, + { + "epoch": 0.1493374126977644, + "grad_norm": 0.39099442958831787, + "learning_rate": 1.8919472475474346e-05, + "loss": 0.3704, + "step": 8044 + }, + { + "epoch": 0.14937454283518303, + "grad_norm": 0.6233531832695007, + "learning_rate": 1.8918944998312398e-05, + "loss": 0.4361, + "step": 8046 + }, + { + "epoch": 0.14941167297260166, + "grad_norm": 0.36886996030807495, + "learning_rate": 1.891841739979032e-05, + "loss": 0.3245, + "step": 8048 + }, + { + "epoch": 0.14944880311002032, + "grad_norm": 0.28392186760902405, + "learning_rate": 1.89178896799153e-05, + "loss": 0.2779, + "step": 8050 + }, + { + "epoch": 0.14948593324743895, + "grad_norm": 0.2642804682254791, + "learning_rate": 1.8917361838694507e-05, + "loss": 0.2492, + "step": 8052 + }, + { + "epoch": 0.14952306338485757, + "grad_norm": 0.39111533761024475, + "learning_rate": 1.8916833876135134e-05, + "loss": 0.2636, + "step": 8054 + }, + { + "epoch": 0.14956019352227623, + "grad_norm": 0.3611973524093628, + "learning_rate": 1.8916305792244357e-05, + "loss": 0.2987, + "step": 8056 + }, + { + "epoch": 0.14959732365969486, + "grad_norm": 0.47690239548683167, + "learning_rate": 1.891577758702937e-05, + "loss": 0.3012, + "step": 8058 + }, + { + "epoch": 0.14963445379711351, + "grad_norm": 0.2648635804653168, + "learning_rate": 1.891524926049736e-05, + "loss": 0.2043, + "step": 8060 + }, + { + "epoch": 0.14967158393453214, + "grad_norm": 0.4561508297920227, + "learning_rate": 1.8914720812655504e-05, + "loss": 0.4975, + "step": 8062 + }, + { + "epoch": 0.14970871407195077, + "grad_norm": 0.3444630801677704, + "learning_rate": 1.8914192243511e-05, + "loss": 0.2683, + "step": 8064 + }, + { + "epoch": 0.14974584420936943, + "grad_norm": 0.24170279502868652, + "learning_rate": 1.8913663553071044e-05, + "loss": 0.2297, + "step": 8066 + }, + { + "epoch": 0.14978297434678806, + "grad_norm": 0.3343278765678406, + "learning_rate": 1.891313474134283e-05, + "loss": 0.4293, + "step": 8068 + }, + { + "epoch": 0.14982010448420668, + "grad_norm": 0.3095451593399048, + "learning_rate": 1.8912605808333543e-05, + "loss": 0.2463, + "step": 8070 + }, + { + "epoch": 0.14985723462162534, + "grad_norm": 0.38149741291999817, + "learning_rate": 1.8912076754050392e-05, + "loss": 0.2944, + "step": 8072 + }, + { + "epoch": 0.14989436475904397, + "grad_norm": 0.39498743414878845, + "learning_rate": 1.8911547578500567e-05, + "loss": 0.478, + "step": 8074 + }, + { + "epoch": 0.14993149489646262, + "grad_norm": 0.31849828362464905, + "learning_rate": 1.8911018281691278e-05, + "loss": 0.3367, + "step": 8076 + }, + { + "epoch": 0.14996862503388125, + "grad_norm": 0.3129948377609253, + "learning_rate": 1.891048886362972e-05, + "loss": 0.3156, + "step": 8078 + }, + { + "epoch": 0.15000575517129988, + "grad_norm": 0.24439303576946259, + "learning_rate": 1.8909959324323096e-05, + "loss": 0.2103, + "step": 8080 + }, + { + "epoch": 0.15004288530871854, + "grad_norm": 0.33535176515579224, + "learning_rate": 1.8909429663778618e-05, + "loss": 0.1637, + "step": 8082 + }, + { + "epoch": 0.15008001544613717, + "grad_norm": 0.3443775177001953, + "learning_rate": 1.8908899882003484e-05, + "loss": 0.1485, + "step": 8084 + }, + { + "epoch": 0.1501171455835558, + "grad_norm": 0.44087567925453186, + "learning_rate": 1.890836997900491e-05, + "loss": 0.3482, + "step": 8086 + }, + { + "epoch": 0.15015427572097445, + "grad_norm": 0.37624138593673706, + "learning_rate": 1.890783995479011e-05, + "loss": 0.4073, + "step": 8088 + }, + { + "epoch": 0.15019140585839308, + "grad_norm": 0.42156079411506653, + "learning_rate": 1.8907309809366282e-05, + "loss": 0.3459, + "step": 8090 + }, + { + "epoch": 0.1502285359958117, + "grad_norm": 0.4333178997039795, + "learning_rate": 1.890677954274065e-05, + "loss": 0.3595, + "step": 8092 + }, + { + "epoch": 0.15026566613323036, + "grad_norm": 0.3249876797199249, + "learning_rate": 1.890624915492043e-05, + "loss": 0.1614, + "step": 8094 + }, + { + "epoch": 0.150302796270649, + "grad_norm": 0.33350321650505066, + "learning_rate": 1.8905718645912835e-05, + "loss": 0.2572, + "step": 8096 + }, + { + "epoch": 0.15033992640806765, + "grad_norm": 0.42675504088401794, + "learning_rate": 1.8905188015725086e-05, + "loss": 0.4396, + "step": 8098 + }, + { + "epoch": 0.15037705654548628, + "grad_norm": 0.3597617745399475, + "learning_rate": 1.8904657264364398e-05, + "loss": 0.4652, + "step": 8100 + }, + { + "epoch": 0.1504141866829049, + "grad_norm": 0.34647828340530396, + "learning_rate": 1.8904126391838002e-05, + "loss": 0.3756, + "step": 8102 + }, + { + "epoch": 0.15045131682032356, + "grad_norm": 0.39308691024780273, + "learning_rate": 1.8903595398153115e-05, + "loss": 0.419, + "step": 8104 + }, + { + "epoch": 0.1504884469577422, + "grad_norm": 0.6328529715538025, + "learning_rate": 1.8903064283316963e-05, + "loss": 0.3136, + "step": 8106 + }, + { + "epoch": 0.15052557709516082, + "grad_norm": 0.26503661274909973, + "learning_rate": 1.8902533047336776e-05, + "loss": 0.4365, + "step": 8108 + }, + { + "epoch": 0.15056270723257947, + "grad_norm": 0.3519168198108673, + "learning_rate": 1.890200169021978e-05, + "loss": 0.4202, + "step": 8110 + }, + { + "epoch": 0.1505998373699981, + "grad_norm": 0.6732722520828247, + "learning_rate": 1.8901470211973203e-05, + "loss": 0.4758, + "step": 8112 + }, + { + "epoch": 0.15063696750741676, + "grad_norm": 0.32329288125038147, + "learning_rate": 1.890093861260428e-05, + "loss": 0.2723, + "step": 8114 + }, + { + "epoch": 0.1506740976448354, + "grad_norm": 1.740921974182129, + "learning_rate": 1.8900406892120246e-05, + "loss": 0.3484, + "step": 8116 + }, + { + "epoch": 0.15071122778225401, + "grad_norm": 0.35415127873420715, + "learning_rate": 1.889987505052833e-05, + "loss": 0.3531, + "step": 8118 + }, + { + "epoch": 0.15074835791967267, + "grad_norm": 0.5177008509635925, + "learning_rate": 1.8899343087835776e-05, + "loss": 0.3243, + "step": 8120 + }, + { + "epoch": 0.1507854880570913, + "grad_norm": 0.5174015760421753, + "learning_rate": 1.8898811004049823e-05, + "loss": 0.6162, + "step": 8122 + }, + { + "epoch": 0.15082261819450993, + "grad_norm": 0.356586754322052, + "learning_rate": 1.88982787991777e-05, + "loss": 0.2525, + "step": 8124 + }, + { + "epoch": 0.15085974833192858, + "grad_norm": 0.5085174441337585, + "learning_rate": 1.889774647322666e-05, + "loss": 0.4053, + "step": 8126 + }, + { + "epoch": 0.1508968784693472, + "grad_norm": 0.2928535044193268, + "learning_rate": 1.8897214026203944e-05, + "loss": 0.3478, + "step": 8128 + }, + { + "epoch": 0.15093400860676584, + "grad_norm": 0.3939286470413208, + "learning_rate": 1.8896681458116793e-05, + "loss": 0.2601, + "step": 8130 + }, + { + "epoch": 0.1509711387441845, + "grad_norm": 0.3399118185043335, + "learning_rate": 1.8896148768972456e-05, + "loss": 0.4998, + "step": 8132 + }, + { + "epoch": 0.15100826888160312, + "grad_norm": 0.288313627243042, + "learning_rate": 1.889561595877818e-05, + "loss": 0.2168, + "step": 8134 + }, + { + "epoch": 0.15104539901902178, + "grad_norm": 0.354743093252182, + "learning_rate": 1.8895083027541217e-05, + "loss": 0.1563, + "step": 8136 + }, + { + "epoch": 0.1510825291564404, + "grad_norm": 0.3141016662120819, + "learning_rate": 1.889454997526882e-05, + "loss": 0.5022, + "step": 8138 + }, + { + "epoch": 0.15111965929385904, + "grad_norm": 0.33479002118110657, + "learning_rate": 1.889401680196824e-05, + "loss": 0.1907, + "step": 8140 + }, + { + "epoch": 0.1511567894312777, + "grad_norm": 0.40596503019332886, + "learning_rate": 1.8893483507646725e-05, + "loss": 0.1372, + "step": 8142 + }, + { + "epoch": 0.15119391956869632, + "grad_norm": 0.3666452467441559, + "learning_rate": 1.8892950092311545e-05, + "loss": 0.3383, + "step": 8144 + }, + { + "epoch": 0.15123104970611495, + "grad_norm": 0.3360214829444885, + "learning_rate": 1.8892416555969952e-05, + "loss": 0.2423, + "step": 8146 + }, + { + "epoch": 0.1512681798435336, + "grad_norm": 0.35804012417793274, + "learning_rate": 1.8891882898629202e-05, + "loss": 0.1988, + "step": 8148 + }, + { + "epoch": 0.15130530998095224, + "grad_norm": 0.5613224506378174, + "learning_rate": 1.8891349120296565e-05, + "loss": 0.2715, + "step": 8150 + }, + { + "epoch": 0.1513424401183709, + "grad_norm": 0.4466422498226166, + "learning_rate": 1.8890815220979295e-05, + "loss": 0.2171, + "step": 8152 + }, + { + "epoch": 0.15137957025578952, + "grad_norm": 0.2291431874036789, + "learning_rate": 1.8890281200684662e-05, + "loss": 0.4429, + "step": 8154 + }, + { + "epoch": 0.15141670039320815, + "grad_norm": 0.3232664167881012, + "learning_rate": 1.8889747059419932e-05, + "loss": 0.2818, + "step": 8156 + }, + { + "epoch": 0.1514538305306268, + "grad_norm": 0.6342013478279114, + "learning_rate": 1.8889212797192375e-05, + "loss": 0.254, + "step": 8158 + }, + { + "epoch": 0.15149096066804543, + "grad_norm": 0.36274418234825134, + "learning_rate": 1.8888678414009255e-05, + "loss": 0.3168, + "step": 8160 + }, + { + "epoch": 0.15152809080546406, + "grad_norm": 0.3029654920101166, + "learning_rate": 1.8888143909877846e-05, + "loss": 0.1745, + "step": 8162 + }, + { + "epoch": 0.15156522094288272, + "grad_norm": 0.3898391127586365, + "learning_rate": 1.888760928480542e-05, + "loss": 0.3324, + "step": 8164 + }, + { + "epoch": 0.15160235108030135, + "grad_norm": 0.2669164836406708, + "learning_rate": 1.8887074538799253e-05, + "loss": 0.3229, + "step": 8166 + }, + { + "epoch": 0.15163948121771997, + "grad_norm": 0.4950024485588074, + "learning_rate": 1.8886539671866625e-05, + "loss": 0.3934, + "step": 8168 + }, + { + "epoch": 0.15167661135513863, + "grad_norm": 0.25776857137680054, + "learning_rate": 1.8886004684014812e-05, + "loss": 0.4377, + "step": 8170 + }, + { + "epoch": 0.15171374149255726, + "grad_norm": 0.4564487338066101, + "learning_rate": 1.888546957525109e-05, + "loss": 0.3171, + "step": 8172 + }, + { + "epoch": 0.15175087162997591, + "grad_norm": 0.31179043650627136, + "learning_rate": 1.888493434558274e-05, + "loss": 0.2832, + "step": 8174 + }, + { + "epoch": 0.15178800176739454, + "grad_norm": 0.27408334612846375, + "learning_rate": 1.8884398995017046e-05, + "loss": 0.2731, + "step": 8176 + }, + { + "epoch": 0.15182513190481317, + "grad_norm": 0.3307841420173645, + "learning_rate": 1.8883863523561293e-05, + "loss": 0.3481, + "step": 8178 + }, + { + "epoch": 0.15186226204223183, + "grad_norm": 0.3532922565937042, + "learning_rate": 1.8883327931222774e-05, + "loss": 0.356, + "step": 8180 + }, + { + "epoch": 0.15189939217965046, + "grad_norm": 0.27513787150382996, + "learning_rate": 1.8882792218008764e-05, + "loss": 0.3857, + "step": 8182 + }, + { + "epoch": 0.15193652231706908, + "grad_norm": 0.44716960191726685, + "learning_rate": 1.8882256383926564e-05, + "loss": 0.2656, + "step": 8184 + }, + { + "epoch": 0.15197365245448774, + "grad_norm": 0.33391496539115906, + "learning_rate": 1.8881720428983455e-05, + "loss": 0.2694, + "step": 8186 + }, + { + "epoch": 0.15201078259190637, + "grad_norm": 0.4720102846622467, + "learning_rate": 1.888118435318674e-05, + "loss": 0.3618, + "step": 8188 + }, + { + "epoch": 0.15204791272932502, + "grad_norm": 0.2996935546398163, + "learning_rate": 1.8880648156543704e-05, + "loss": 0.5037, + "step": 8190 + }, + { + "epoch": 0.15208504286674365, + "grad_norm": 0.4064074754714966, + "learning_rate": 1.888011183906165e-05, + "loss": 0.3693, + "step": 8192 + }, + { + "epoch": 0.15212217300416228, + "grad_norm": 0.3001393973827362, + "learning_rate": 1.8879575400747872e-05, + "loss": 0.5317, + "step": 8194 + }, + { + "epoch": 0.15215930314158094, + "grad_norm": 0.4057982265949249, + "learning_rate": 1.887903884160967e-05, + "loss": 0.246, + "step": 8196 + }, + { + "epoch": 0.15219643327899957, + "grad_norm": 0.2689710557460785, + "learning_rate": 1.8878502161654345e-05, + "loss": 0.2674, + "step": 8198 + }, + { + "epoch": 0.1522335634164182, + "grad_norm": 0.34354719519615173, + "learning_rate": 1.8877965360889197e-05, + "loss": 0.2977, + "step": 8200 + }, + { + "epoch": 0.15227069355383685, + "grad_norm": 0.4297869801521301, + "learning_rate": 1.887742843932154e-05, + "loss": 0.1582, + "step": 8202 + }, + { + "epoch": 0.15230782369125548, + "grad_norm": 0.3412962555885315, + "learning_rate": 1.8876891396958668e-05, + "loss": 0.3881, + "step": 8204 + }, + { + "epoch": 0.1523449538286741, + "grad_norm": 0.3971596360206604, + "learning_rate": 1.8876354233807892e-05, + "loss": 0.4237, + "step": 8206 + }, + { + "epoch": 0.15238208396609276, + "grad_norm": 0.30143117904663086, + "learning_rate": 1.8875816949876527e-05, + "loss": 0.3968, + "step": 8208 + }, + { + "epoch": 0.1524192141035114, + "grad_norm": 0.9673166871070862, + "learning_rate": 1.8875279545171877e-05, + "loss": 0.3169, + "step": 8210 + }, + { + "epoch": 0.15245634424093005, + "grad_norm": 0.41814717650413513, + "learning_rate": 1.8874742019701257e-05, + "loss": 0.2595, + "step": 8212 + }, + { + "epoch": 0.15249347437834868, + "grad_norm": 0.3168085515499115, + "learning_rate": 1.8874204373471982e-05, + "loss": 0.2242, + "step": 8214 + }, + { + "epoch": 0.1525306045157673, + "grad_norm": 0.4080829322338104, + "learning_rate": 1.8873666606491364e-05, + "loss": 0.4444, + "step": 8216 + }, + { + "epoch": 0.15256773465318596, + "grad_norm": 0.30472931265830994, + "learning_rate": 1.8873128718766728e-05, + "loss": 0.4663, + "step": 8218 + }, + { + "epoch": 0.1526048647906046, + "grad_norm": 0.3709179759025574, + "learning_rate": 1.8872590710305386e-05, + "loss": 0.3474, + "step": 8220 + }, + { + "epoch": 0.15264199492802322, + "grad_norm": 0.358796089887619, + "learning_rate": 1.887205258111466e-05, + "loss": 0.4593, + "step": 8222 + }, + { + "epoch": 0.15267912506544187, + "grad_norm": 0.31223952770233154, + "learning_rate": 1.8871514331201872e-05, + "loss": 0.277, + "step": 8224 + }, + { + "epoch": 0.1527162552028605, + "grad_norm": 0.3244241774082184, + "learning_rate": 1.887097596057435e-05, + "loss": 0.6544, + "step": 8226 + }, + { + "epoch": 0.15275338534027916, + "grad_norm": 0.3357715904712677, + "learning_rate": 1.8870437469239416e-05, + "loss": 0.2969, + "step": 8228 + }, + { + "epoch": 0.15279051547769779, + "grad_norm": 0.642245352268219, + "learning_rate": 1.8869898857204395e-05, + "loss": 0.312, + "step": 8230 + }, + { + "epoch": 0.15282764561511641, + "grad_norm": 0.44083335995674133, + "learning_rate": 1.8869360124476624e-05, + "loss": 0.2675, + "step": 8232 + }, + { + "epoch": 0.15286477575253507, + "grad_norm": 0.2946237623691559, + "learning_rate": 1.8868821271063427e-05, + "loss": 0.3552, + "step": 8234 + }, + { + "epoch": 0.1529019058899537, + "grad_norm": 0.39548438787460327, + "learning_rate": 1.8868282296972138e-05, + "loss": 0.3445, + "step": 8236 + }, + { + "epoch": 0.15293903602737233, + "grad_norm": 0.376122385263443, + "learning_rate": 1.886774320221009e-05, + "loss": 0.3051, + "step": 8238 + }, + { + "epoch": 0.15297616616479098, + "grad_norm": 0.357554167509079, + "learning_rate": 1.8867203986784618e-05, + "loss": 0.3374, + "step": 8240 + }, + { + "epoch": 0.1530132963022096, + "grad_norm": 0.4153965413570404, + "learning_rate": 1.8866664650703058e-05, + "loss": 0.1452, + "step": 8242 + }, + { + "epoch": 0.15305042643962824, + "grad_norm": 0.327599436044693, + "learning_rate": 1.8866125193972755e-05, + "loss": 0.2772, + "step": 8244 + }, + { + "epoch": 0.1530875565770469, + "grad_norm": 0.2960450053215027, + "learning_rate": 1.8865585616601042e-05, + "loss": 0.3275, + "step": 8246 + }, + { + "epoch": 0.15312468671446552, + "grad_norm": 0.28462567925453186, + "learning_rate": 1.8865045918595267e-05, + "loss": 0.1539, + "step": 8248 + }, + { + "epoch": 0.15316181685188418, + "grad_norm": 0.35535839200019836, + "learning_rate": 1.886450609996277e-05, + "loss": 0.3838, + "step": 8250 + }, + { + "epoch": 0.1531989469893028, + "grad_norm": 0.3786665201187134, + "learning_rate": 1.88639661607109e-05, + "loss": 0.4171, + "step": 8252 + }, + { + "epoch": 0.15323607712672144, + "grad_norm": 0.275909960269928, + "learning_rate": 1.8863426100846998e-05, + "loss": 0.3196, + "step": 8254 + }, + { + "epoch": 0.1532732072641401, + "grad_norm": 0.32498323917388916, + "learning_rate": 1.8862885920378415e-05, + "loss": 0.3672, + "step": 8256 + }, + { + "epoch": 0.15331033740155872, + "grad_norm": 0.275538831949234, + "learning_rate": 1.8862345619312504e-05, + "loss": 0.4554, + "step": 8258 + }, + { + "epoch": 0.15334746753897735, + "grad_norm": 0.3281930088996887, + "learning_rate": 1.8861805197656612e-05, + "loss": 0.4325, + "step": 8260 + }, + { + "epoch": 0.153384597676396, + "grad_norm": 0.2855262756347656, + "learning_rate": 1.88612646554181e-05, + "loss": 0.2941, + "step": 8262 + }, + { + "epoch": 0.15342172781381463, + "grad_norm": 0.48509934544563293, + "learning_rate": 1.8860723992604314e-05, + "loss": 0.2736, + "step": 8264 + }, + { + "epoch": 0.1534588579512333, + "grad_norm": 0.6197798848152161, + "learning_rate": 1.8860183209222616e-05, + "loss": 0.3455, + "step": 8266 + }, + { + "epoch": 0.15349598808865192, + "grad_norm": 0.3569723069667816, + "learning_rate": 1.8859642305280367e-05, + "loss": 0.3243, + "step": 8268 + }, + { + "epoch": 0.15353311822607055, + "grad_norm": 0.29191651940345764, + "learning_rate": 1.8859101280784923e-05, + "loss": 0.4646, + "step": 8270 + }, + { + "epoch": 0.1535702483634892, + "grad_norm": 0.4912727475166321, + "learning_rate": 1.8858560135743647e-05, + "loss": 0.3712, + "step": 8272 + }, + { + "epoch": 0.15360737850090783, + "grad_norm": 0.3839099407196045, + "learning_rate": 1.88580188701639e-05, + "loss": 0.2462, + "step": 8274 + }, + { + "epoch": 0.15364450863832646, + "grad_norm": 0.2939319312572479, + "learning_rate": 1.8857477484053052e-05, + "loss": 0.437, + "step": 8276 + }, + { + "epoch": 0.15368163877574512, + "grad_norm": 0.3470343053340912, + "learning_rate": 1.885693597741846e-05, + "loss": 0.4802, + "step": 8278 + }, + { + "epoch": 0.15371876891316374, + "grad_norm": 0.3169183135032654, + "learning_rate": 1.8856394350267505e-05, + "loss": 0.2843, + "step": 8280 + }, + { + "epoch": 0.15375589905058237, + "grad_norm": 0.3366777300834656, + "learning_rate": 1.8855852602607552e-05, + "loss": 0.3752, + "step": 8282 + }, + { + "epoch": 0.15379302918800103, + "grad_norm": 0.28098663687705994, + "learning_rate": 1.8855310734445968e-05, + "loss": 0.2157, + "step": 8284 + }, + { + "epoch": 0.15383015932541966, + "grad_norm": 0.47254812717437744, + "learning_rate": 1.8854768745790132e-05, + "loss": 0.4557, + "step": 8286 + }, + { + "epoch": 0.1538672894628383, + "grad_norm": 0.5211777091026306, + "learning_rate": 1.8854226636647413e-05, + "loss": 0.4324, + "step": 8288 + }, + { + "epoch": 0.15390441960025694, + "grad_norm": 0.7771388292312622, + "learning_rate": 1.885368440702519e-05, + "loss": 0.4232, + "step": 8290 + }, + { + "epoch": 0.15394154973767557, + "grad_norm": 0.3036107122898102, + "learning_rate": 1.885314205693085e-05, + "loss": 0.5038, + "step": 8292 + }, + { + "epoch": 0.15397867987509423, + "grad_norm": 0.32725098729133606, + "learning_rate": 1.8852599586371758e-05, + "loss": 0.3583, + "step": 8294 + }, + { + "epoch": 0.15401581001251285, + "grad_norm": 0.41499611735343933, + "learning_rate": 1.8852056995355306e-05, + "loss": 0.4218, + "step": 8296 + }, + { + "epoch": 0.15405294014993148, + "grad_norm": 0.37212952971458435, + "learning_rate": 1.885151428388887e-05, + "loss": 0.5429, + "step": 8298 + }, + { + "epoch": 0.15409007028735014, + "grad_norm": 0.33431586623191833, + "learning_rate": 1.885097145197984e-05, + "loss": 0.3511, + "step": 8300 + }, + { + "epoch": 0.15412720042476877, + "grad_norm": 0.24516381323337555, + "learning_rate": 1.88504284996356e-05, + "loss": 0.3869, + "step": 8302 + }, + { + "epoch": 0.15416433056218742, + "grad_norm": 0.3198567032814026, + "learning_rate": 1.8849885426863537e-05, + "loss": 0.2599, + "step": 8304 + }, + { + "epoch": 0.15420146069960605, + "grad_norm": 0.35716384649276733, + "learning_rate": 1.8849342233671044e-05, + "loss": 0.315, + "step": 8306 + }, + { + "epoch": 0.15423859083702468, + "grad_norm": 0.38285890221595764, + "learning_rate": 1.8848798920065506e-05, + "loss": 0.1891, + "step": 8308 + }, + { + "epoch": 0.15427572097444334, + "grad_norm": 0.3787875175476074, + "learning_rate": 1.8848255486054326e-05, + "loss": 0.4427, + "step": 8310 + }, + { + "epoch": 0.15431285111186197, + "grad_norm": 0.45547017455101013, + "learning_rate": 1.8847711931644887e-05, + "loss": 0.4486, + "step": 8312 + }, + { + "epoch": 0.1543499812492806, + "grad_norm": 0.3472912907600403, + "learning_rate": 1.884716825684459e-05, + "loss": 0.4075, + "step": 8314 + }, + { + "epoch": 0.15438711138669925, + "grad_norm": 0.38502001762390137, + "learning_rate": 1.884662446166084e-05, + "loss": 0.2494, + "step": 8316 + }, + { + "epoch": 0.15442424152411788, + "grad_norm": 0.23485706746578217, + "learning_rate": 1.8846080546101024e-05, + "loss": 0.2826, + "step": 8318 + }, + { + "epoch": 0.1544613716615365, + "grad_norm": 0.3629828095436096, + "learning_rate": 1.884553651017255e-05, + "loss": 0.4393, + "step": 8320 + }, + { + "epoch": 0.15449850179895516, + "grad_norm": 0.28902557492256165, + "learning_rate": 1.884499235388282e-05, + "loss": 0.4056, + "step": 8322 + }, + { + "epoch": 0.1545356319363738, + "grad_norm": 0.33223941922187805, + "learning_rate": 1.884444807723924e-05, + "loss": 0.2088, + "step": 8324 + }, + { + "epoch": 0.15457276207379245, + "grad_norm": 0.42679888010025024, + "learning_rate": 1.884390368024921e-05, + "loss": 0.1811, + "step": 8326 + }, + { + "epoch": 0.15460989221121108, + "grad_norm": 0.30499541759490967, + "learning_rate": 1.8843359162920144e-05, + "loss": 0.4709, + "step": 8328 + }, + { + "epoch": 0.1546470223486297, + "grad_norm": 0.30179184675216675, + "learning_rate": 1.884281452525945e-05, + "loss": 0.3519, + "step": 8330 + }, + { + "epoch": 0.15468415248604836, + "grad_norm": 0.5158125162124634, + "learning_rate": 1.8842269767274535e-05, + "loss": 0.3181, + "step": 8332 + }, + { + "epoch": 0.154721282623467, + "grad_norm": 0.31337445974349976, + "learning_rate": 1.8841724888972813e-05, + "loss": 0.343, + "step": 8334 + }, + { + "epoch": 0.15475841276088562, + "grad_norm": 0.30121591687202454, + "learning_rate": 1.8841179890361702e-05, + "loss": 0.193, + "step": 8336 + }, + { + "epoch": 0.15479554289830427, + "grad_norm": 0.38722893595695496, + "learning_rate": 1.8840634771448613e-05, + "loss": 0.4006, + "step": 8338 + }, + { + "epoch": 0.1548326730357229, + "grad_norm": 0.4912799894809723, + "learning_rate": 1.8840089532240968e-05, + "loss": 0.2056, + "step": 8340 + }, + { + "epoch": 0.15486980317314156, + "grad_norm": 0.29853376746177673, + "learning_rate": 1.883954417274618e-05, + "loss": 0.2034, + "step": 8342 + }, + { + "epoch": 0.15490693331056019, + "grad_norm": 0.38545364141464233, + "learning_rate": 1.8838998692971676e-05, + "loss": 0.3032, + "step": 8344 + }, + { + "epoch": 0.1549440634479788, + "grad_norm": 0.34697243571281433, + "learning_rate": 1.883845309292488e-05, + "loss": 0.2253, + "step": 8346 + }, + { + "epoch": 0.15498119358539747, + "grad_norm": 0.2644882798194885, + "learning_rate": 1.8837907372613206e-05, + "loss": 0.493, + "step": 8348 + }, + { + "epoch": 0.1550183237228161, + "grad_norm": 0.39763158559799194, + "learning_rate": 1.8837361532044085e-05, + "loss": 0.3732, + "step": 8350 + }, + { + "epoch": 0.15505545386023473, + "grad_norm": 0.35202133655548096, + "learning_rate": 1.8836815571224945e-05, + "loss": 0.367, + "step": 8352 + }, + { + "epoch": 0.15509258399765338, + "grad_norm": 0.3993526101112366, + "learning_rate": 1.8836269490163218e-05, + "loss": 0.6087, + "step": 8354 + }, + { + "epoch": 0.155129714135072, + "grad_norm": 0.3769938349723816, + "learning_rate": 1.883572328886633e-05, + "loss": 0.4694, + "step": 8356 + }, + { + "epoch": 0.15516684427249064, + "grad_norm": 0.31535157561302185, + "learning_rate": 1.883517696734171e-05, + "loss": 0.2156, + "step": 8358 + }, + { + "epoch": 0.1552039744099093, + "grad_norm": 0.32255110144615173, + "learning_rate": 1.8834630525596798e-05, + "loss": 0.2437, + "step": 8360 + }, + { + "epoch": 0.15524110454732792, + "grad_norm": 0.3419147729873657, + "learning_rate": 1.8834083963639026e-05, + "loss": 0.4095, + "step": 8362 + }, + { + "epoch": 0.15527823468474658, + "grad_norm": 0.28262636065483093, + "learning_rate": 1.8833537281475835e-05, + "loss": 0.3426, + "step": 8364 + }, + { + "epoch": 0.1553153648221652, + "grad_norm": 0.4587547481060028, + "learning_rate": 1.8832990479114656e-05, + "loss": 0.4772, + "step": 8366 + }, + { + "epoch": 0.15535249495958384, + "grad_norm": 0.36408957839012146, + "learning_rate": 1.883244355656294e-05, + "loss": 0.3623, + "step": 8368 + }, + { + "epoch": 0.1553896250970025, + "grad_norm": 0.4597477912902832, + "learning_rate": 1.8831896513828123e-05, + "loss": 0.3255, + "step": 8370 + }, + { + "epoch": 0.15542675523442112, + "grad_norm": 0.25715145468711853, + "learning_rate": 1.8831349350917647e-05, + "loss": 0.451, + "step": 8372 + }, + { + "epoch": 0.15546388537183975, + "grad_norm": 0.3800475001335144, + "learning_rate": 1.8830802067838962e-05, + "loss": 0.3383, + "step": 8374 + }, + { + "epoch": 0.1555010155092584, + "grad_norm": 0.6488123536109924, + "learning_rate": 1.883025466459951e-05, + "loss": 0.3889, + "step": 8376 + }, + { + "epoch": 0.15553814564667703, + "grad_norm": 0.30191728472709656, + "learning_rate": 1.882970714120674e-05, + "loss": 0.3073, + "step": 8378 + }, + { + "epoch": 0.1555752757840957, + "grad_norm": 0.3120136260986328, + "learning_rate": 1.8829159497668107e-05, + "loss": 0.4175, + "step": 8380 + }, + { + "epoch": 0.15561240592151432, + "grad_norm": 0.5108128190040588, + "learning_rate": 1.882861173399106e-05, + "loss": 0.2471, + "step": 8382 + }, + { + "epoch": 0.15564953605893295, + "grad_norm": 0.28883299231529236, + "learning_rate": 1.8828063850183048e-05, + "loss": 0.212, + "step": 8384 + }, + { + "epoch": 0.1556866661963516, + "grad_norm": 0.5487533211708069, + "learning_rate": 1.8827515846251532e-05, + "loss": 0.348, + "step": 8386 + }, + { + "epoch": 0.15572379633377023, + "grad_norm": 0.4988667964935303, + "learning_rate": 1.8826967722203968e-05, + "loss": 0.1381, + "step": 8388 + }, + { + "epoch": 0.15576092647118886, + "grad_norm": 0.2011573165655136, + "learning_rate": 1.8826419478047813e-05, + "loss": 0.2634, + "step": 8390 + }, + { + "epoch": 0.15579805660860752, + "grad_norm": 0.41278955340385437, + "learning_rate": 1.8825871113790528e-05, + "loss": 0.2462, + "step": 8392 + }, + { + "epoch": 0.15583518674602614, + "grad_norm": 0.4164373278617859, + "learning_rate": 1.882532262943957e-05, + "loss": 0.24, + "step": 8394 + }, + { + "epoch": 0.15587231688344477, + "grad_norm": 0.543787956237793, + "learning_rate": 1.882477402500241e-05, + "loss": 0.4038, + "step": 8396 + }, + { + "epoch": 0.15590944702086343, + "grad_norm": 0.2890948951244354, + "learning_rate": 1.8824225300486504e-05, + "loss": 0.3602, + "step": 8398 + }, + { + "epoch": 0.15594657715828206, + "grad_norm": 0.4376918077468872, + "learning_rate": 1.8823676455899328e-05, + "loss": 0.384, + "step": 8400 + }, + { + "epoch": 0.1559837072957007, + "grad_norm": 0.38471558690071106, + "learning_rate": 1.8823127491248345e-05, + "loss": 0.1924, + "step": 8402 + }, + { + "epoch": 0.15602083743311934, + "grad_norm": 0.28494715690612793, + "learning_rate": 1.882257840654102e-05, + "loss": 0.2271, + "step": 8404 + }, + { + "epoch": 0.15605796757053797, + "grad_norm": 0.3399980366230011, + "learning_rate": 1.8822029201784837e-05, + "loss": 0.2603, + "step": 8406 + }, + { + "epoch": 0.15609509770795663, + "grad_norm": 0.3538065254688263, + "learning_rate": 1.8821479876987256e-05, + "loss": 0.2999, + "step": 8408 + }, + { + "epoch": 0.15613222784537525, + "grad_norm": 0.4185173213481903, + "learning_rate": 1.882093043215576e-05, + "loss": 0.4075, + "step": 8410 + }, + { + "epoch": 0.15616935798279388, + "grad_norm": 0.2962680757045746, + "learning_rate": 1.8820380867297823e-05, + "loss": 0.3695, + "step": 8412 + }, + { + "epoch": 0.15620648812021254, + "grad_norm": 0.3709704875946045, + "learning_rate": 1.881983118242092e-05, + "loss": 0.4406, + "step": 8414 + }, + { + "epoch": 0.15624361825763117, + "grad_norm": 0.357825368642807, + "learning_rate": 1.8819281377532538e-05, + "loss": 0.3227, + "step": 8416 + }, + { + "epoch": 0.15628074839504982, + "grad_norm": 0.40552014112472534, + "learning_rate": 1.8818731452640148e-05, + "loss": 0.4656, + "step": 8418 + }, + { + "epoch": 0.15631787853246845, + "grad_norm": 0.328622967004776, + "learning_rate": 1.8818181407751236e-05, + "loss": 0.2863, + "step": 8420 + }, + { + "epoch": 0.15635500866988708, + "grad_norm": 0.44142547249794006, + "learning_rate": 1.8817631242873293e-05, + "loss": 0.388, + "step": 8422 + }, + { + "epoch": 0.15639213880730574, + "grad_norm": 0.48155131936073303, + "learning_rate": 1.88170809580138e-05, + "loss": 0.2765, + "step": 8424 + }, + { + "epoch": 0.15642926894472436, + "grad_norm": 0.3157564699649811, + "learning_rate": 1.8816530553180244e-05, + "loss": 0.2506, + "step": 8426 + }, + { + "epoch": 0.156466399082143, + "grad_norm": 0.28496500849723816, + "learning_rate": 1.881598002838012e-05, + "loss": 0.2898, + "step": 8428 + }, + { + "epoch": 0.15650352921956165, + "grad_norm": 0.31027498841285706, + "learning_rate": 1.8815429383620912e-05, + "loss": 0.2977, + "step": 8430 + }, + { + "epoch": 0.15654065935698028, + "grad_norm": 0.2794686555862427, + "learning_rate": 1.8814878618910117e-05, + "loss": 0.4871, + "step": 8432 + }, + { + "epoch": 0.1565777894943989, + "grad_norm": 0.29024404287338257, + "learning_rate": 1.8814327734255224e-05, + "loss": 0.3747, + "step": 8434 + }, + { + "epoch": 0.15661491963181756, + "grad_norm": 0.21158207952976227, + "learning_rate": 1.8813776729663736e-05, + "loss": 0.2859, + "step": 8436 + }, + { + "epoch": 0.1566520497692362, + "grad_norm": 0.2505621016025543, + "learning_rate": 1.8813225605143143e-05, + "loss": 0.452, + "step": 8438 + }, + { + "epoch": 0.15668917990665485, + "grad_norm": 0.41810840368270874, + "learning_rate": 1.881267436070095e-05, + "loss": 0.5954, + "step": 8440 + }, + { + "epoch": 0.15672631004407347, + "grad_norm": 0.4492923617362976, + "learning_rate": 1.881212299634466e-05, + "loss": 0.1924, + "step": 8442 + }, + { + "epoch": 0.1567634401814921, + "grad_norm": 0.3148758113384247, + "learning_rate": 1.8811571512081766e-05, + "loss": 0.1384, + "step": 8444 + }, + { + "epoch": 0.15680057031891076, + "grad_norm": 0.40080761909484863, + "learning_rate": 1.8811019907919775e-05, + "loss": 0.1698, + "step": 8446 + }, + { + "epoch": 0.1568377004563294, + "grad_norm": 0.3242115080356598, + "learning_rate": 1.8810468183866206e-05, + "loss": 0.2909, + "step": 8448 + }, + { + "epoch": 0.15687483059374802, + "grad_norm": 0.29555776715278625, + "learning_rate": 1.8809916339928547e-05, + "loss": 0.2006, + "step": 8450 + }, + { + "epoch": 0.15691196073116667, + "grad_norm": 0.48480233550071716, + "learning_rate": 1.8809364376114312e-05, + "loss": 0.3939, + "step": 8452 + }, + { + "epoch": 0.1569490908685853, + "grad_norm": 0.3605828583240509, + "learning_rate": 1.8808812292431025e-05, + "loss": 0.2524, + "step": 8454 + }, + { + "epoch": 0.15698622100600396, + "grad_norm": 0.4065260589122772, + "learning_rate": 1.8808260088886182e-05, + "loss": 0.4643, + "step": 8456 + }, + { + "epoch": 0.15702335114342258, + "grad_norm": 0.3973235487937927, + "learning_rate": 1.8807707765487305e-05, + "loss": 0.2947, + "step": 8458 + }, + { + "epoch": 0.1570604812808412, + "grad_norm": 0.2911217212677002, + "learning_rate": 1.8807155322241906e-05, + "loss": 0.2751, + "step": 8460 + }, + { + "epoch": 0.15709761141825987, + "grad_norm": 0.25099101662635803, + "learning_rate": 1.8806602759157503e-05, + "loss": 0.3877, + "step": 8462 + }, + { + "epoch": 0.1571347415556785, + "grad_norm": 0.2811022996902466, + "learning_rate": 1.8806050076241618e-05, + "loss": 0.285, + "step": 8464 + }, + { + "epoch": 0.15717187169309713, + "grad_norm": 0.43172702193260193, + "learning_rate": 1.8805497273501768e-05, + "loss": 0.2786, + "step": 8466 + }, + { + "epoch": 0.15720900183051578, + "grad_norm": 0.3037837743759155, + "learning_rate": 1.8804944350945473e-05, + "loss": 0.2659, + "step": 8468 + }, + { + "epoch": 0.1572461319679344, + "grad_norm": 0.43164506554603577, + "learning_rate": 1.8804391308580263e-05, + "loss": 0.3052, + "step": 8470 + }, + { + "epoch": 0.15728326210535304, + "grad_norm": 0.42627617716789246, + "learning_rate": 1.8803838146413658e-05, + "loss": 0.2732, + "step": 8472 + }, + { + "epoch": 0.1573203922427717, + "grad_norm": 0.3359403908252716, + "learning_rate": 1.8803284864453186e-05, + "loss": 0.3816, + "step": 8474 + }, + { + "epoch": 0.15735752238019032, + "grad_norm": 0.34421515464782715, + "learning_rate": 1.8802731462706374e-05, + "loss": 0.4073, + "step": 8476 + }, + { + "epoch": 0.15739465251760898, + "grad_norm": 0.3191753625869751, + "learning_rate": 1.8802177941180756e-05, + "loss": 0.1902, + "step": 8478 + }, + { + "epoch": 0.1574317826550276, + "grad_norm": 0.2970689833164215, + "learning_rate": 1.8801624299883862e-05, + "loss": 0.3047, + "step": 8480 + }, + { + "epoch": 0.15746891279244624, + "grad_norm": 0.41219812631607056, + "learning_rate": 1.8801070538823224e-05, + "loss": 0.3404, + "step": 8482 + }, + { + "epoch": 0.1575060429298649, + "grad_norm": 0.4366006553173065, + "learning_rate": 1.880051665800638e-05, + "loss": 0.3478, + "step": 8484 + }, + { + "epoch": 0.15754317306728352, + "grad_norm": 0.44011735916137695, + "learning_rate": 1.8799962657440864e-05, + "loss": 0.2409, + "step": 8486 + }, + { + "epoch": 0.15758030320470215, + "grad_norm": 0.34648698568344116, + "learning_rate": 1.8799408537134214e-05, + "loss": 0.3225, + "step": 8488 + }, + { + "epoch": 0.1576174333421208, + "grad_norm": 0.4016639292240143, + "learning_rate": 1.8798854297093975e-05, + "loss": 0.2632, + "step": 8490 + }, + { + "epoch": 0.15765456347953943, + "grad_norm": 0.2704181969165802, + "learning_rate": 1.879829993732768e-05, + "loss": 0.2534, + "step": 8492 + }, + { + "epoch": 0.1576916936169581, + "grad_norm": 0.354867160320282, + "learning_rate": 1.8797745457842878e-05, + "loss": 0.3467, + "step": 8494 + }, + { + "epoch": 0.15772882375437672, + "grad_norm": 0.7613488435745239, + "learning_rate": 1.8797190858647114e-05, + "loss": 0.1332, + "step": 8496 + }, + { + "epoch": 0.15776595389179535, + "grad_norm": 0.21838530898094177, + "learning_rate": 1.879663613974793e-05, + "loss": 0.2148, + "step": 8498 + }, + { + "epoch": 0.157803084029214, + "grad_norm": 0.27511149644851685, + "learning_rate": 1.879608130115288e-05, + "loss": 0.3795, + "step": 8500 + }, + { + "epoch": 0.15784021416663263, + "grad_norm": 0.5101073980331421, + "learning_rate": 1.879552634286951e-05, + "loss": 0.1914, + "step": 8502 + }, + { + "epoch": 0.15787734430405126, + "grad_norm": 0.35078731179237366, + "learning_rate": 1.8794971264905373e-05, + "loss": 0.3513, + "step": 8504 + }, + { + "epoch": 0.15791447444146992, + "grad_norm": 0.33056655526161194, + "learning_rate": 1.879441606726802e-05, + "loss": 0.4958, + "step": 8506 + }, + { + "epoch": 0.15795160457888854, + "grad_norm": 0.482127845287323, + "learning_rate": 1.8793860749965008e-05, + "loss": 0.3168, + "step": 8508 + }, + { + "epoch": 0.15798873471630717, + "grad_norm": 0.46538054943084717, + "learning_rate": 1.879330531300389e-05, + "loss": 0.6215, + "step": 8510 + }, + { + "epoch": 0.15802586485372583, + "grad_norm": 0.36674946546554565, + "learning_rate": 1.8792749756392227e-05, + "loss": 0.2752, + "step": 8512 + }, + { + "epoch": 0.15806299499114446, + "grad_norm": 0.3260144889354706, + "learning_rate": 1.8792194080137577e-05, + "loss": 0.2367, + "step": 8514 + }, + { + "epoch": 0.1581001251285631, + "grad_norm": 0.4890153408050537, + "learning_rate": 1.8791638284247498e-05, + "loss": 0.3441, + "step": 8516 + }, + { + "epoch": 0.15813725526598174, + "grad_norm": 0.4065719544887543, + "learning_rate": 1.879108236872956e-05, + "loss": 0.3227, + "step": 8518 + }, + { + "epoch": 0.15817438540340037, + "grad_norm": 0.4680102467536926, + "learning_rate": 1.8790526333591326e-05, + "loss": 0.2918, + "step": 8520 + }, + { + "epoch": 0.15821151554081903, + "grad_norm": 0.3054845631122589, + "learning_rate": 1.8789970178840354e-05, + "loss": 0.1613, + "step": 8522 + }, + { + "epoch": 0.15824864567823765, + "grad_norm": 0.3788398504257202, + "learning_rate": 1.878941390448422e-05, + "loss": 0.3803, + "step": 8524 + }, + { + "epoch": 0.15828577581565628, + "grad_norm": 0.3817039430141449, + "learning_rate": 1.8788857510530486e-05, + "loss": 0.367, + "step": 8526 + }, + { + "epoch": 0.15832290595307494, + "grad_norm": 0.558929979801178, + "learning_rate": 1.8788300996986728e-05, + "loss": 0.3611, + "step": 8528 + }, + { + "epoch": 0.15836003609049357, + "grad_norm": 0.24036526679992676, + "learning_rate": 1.878774436386052e-05, + "loss": 0.5088, + "step": 8530 + }, + { + "epoch": 0.15839716622791222, + "grad_norm": 0.3546796143054962, + "learning_rate": 1.8787187611159435e-05, + "loss": 0.3854, + "step": 8532 + }, + { + "epoch": 0.15843429636533085, + "grad_norm": 0.5018741488456726, + "learning_rate": 1.8786630738891044e-05, + "loss": 0.4239, + "step": 8534 + }, + { + "epoch": 0.15847142650274948, + "grad_norm": 0.3488694429397583, + "learning_rate": 1.8786073747062926e-05, + "loss": 0.3024, + "step": 8536 + }, + { + "epoch": 0.15850855664016814, + "grad_norm": 0.3052525520324707, + "learning_rate": 1.8785516635682667e-05, + "loss": 0.2005, + "step": 8538 + }, + { + "epoch": 0.15854568677758676, + "grad_norm": 0.3745570778846741, + "learning_rate": 1.8784959404757836e-05, + "loss": 0.3843, + "step": 8540 + }, + { + "epoch": 0.1585828169150054, + "grad_norm": 0.40408721566200256, + "learning_rate": 1.8784402054296026e-05, + "loss": 0.3659, + "step": 8542 + }, + { + "epoch": 0.15861994705242405, + "grad_norm": 0.27932727336883545, + "learning_rate": 1.8783844584304816e-05, + "loss": 0.3886, + "step": 8544 + }, + { + "epoch": 0.15865707718984268, + "grad_norm": 0.43143123388290405, + "learning_rate": 1.878328699479179e-05, + "loss": 0.3531, + "step": 8546 + }, + { + "epoch": 0.1586942073272613, + "grad_norm": 0.32952263951301575, + "learning_rate": 1.878272928576454e-05, + "loss": 0.2384, + "step": 8548 + }, + { + "epoch": 0.15873133746467996, + "grad_norm": 0.225346639752388, + "learning_rate": 1.8782171457230647e-05, + "loss": 0.2715, + "step": 8550 + }, + { + "epoch": 0.1587684676020986, + "grad_norm": 0.3140149414539337, + "learning_rate": 1.878161350919771e-05, + "loss": 0.4038, + "step": 8552 + }, + { + "epoch": 0.15880559773951725, + "grad_norm": 0.3829936981201172, + "learning_rate": 1.8781055441673315e-05, + "loss": 0.4015, + "step": 8554 + }, + { + "epoch": 0.15884272787693587, + "grad_norm": 0.3526516854763031, + "learning_rate": 1.878049725466506e-05, + "loss": 0.4421, + "step": 8556 + }, + { + "epoch": 0.1588798580143545, + "grad_norm": 0.5073131918907166, + "learning_rate": 1.8779938948180536e-05, + "loss": 0.464, + "step": 8558 + }, + { + "epoch": 0.15891698815177316, + "grad_norm": 0.31822317838668823, + "learning_rate": 1.8779380522227342e-05, + "loss": 0.4428, + "step": 8560 + }, + { + "epoch": 0.1589541182891918, + "grad_norm": 0.3263387382030487, + "learning_rate": 1.877882197681308e-05, + "loss": 0.2352, + "step": 8562 + }, + { + "epoch": 0.15899124842661042, + "grad_norm": 0.3192360997200012, + "learning_rate": 1.8778263311945343e-05, + "loss": 0.385, + "step": 8564 + }, + { + "epoch": 0.15902837856402907, + "grad_norm": 0.4619692265987396, + "learning_rate": 1.8777704527631736e-05, + "loss": 0.4057, + "step": 8566 + }, + { + "epoch": 0.1590655087014477, + "grad_norm": 0.37241238355636597, + "learning_rate": 1.877714562387986e-05, + "loss": 0.374, + "step": 8568 + }, + { + "epoch": 0.15910263883886636, + "grad_norm": 0.29747694730758667, + "learning_rate": 1.8776586600697332e-05, + "loss": 0.0751, + "step": 8570 + }, + { + "epoch": 0.15913976897628498, + "grad_norm": 0.3293072283267975, + "learning_rate": 1.877602745809174e-05, + "loss": 0.463, + "step": 8572 + }, + { + "epoch": 0.1591768991137036, + "grad_norm": 0.31524768471717834, + "learning_rate": 1.8775468196070705e-05, + "loss": 0.1662, + "step": 8574 + }, + { + "epoch": 0.15921402925112227, + "grad_norm": 0.3459037244319916, + "learning_rate": 1.8774908814641835e-05, + "loss": 0.3044, + "step": 8576 + }, + { + "epoch": 0.1592511593885409, + "grad_norm": 0.3256380259990692, + "learning_rate": 1.8774349313812735e-05, + "loss": 0.3346, + "step": 8578 + }, + { + "epoch": 0.15928828952595953, + "grad_norm": 0.4057963192462921, + "learning_rate": 1.8773789693591028e-05, + "loss": 0.383, + "step": 8580 + }, + { + "epoch": 0.15932541966337818, + "grad_norm": 0.4462374448776245, + "learning_rate": 1.8773229953984322e-05, + "loss": 0.3459, + "step": 8582 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.3705790936946869, + "learning_rate": 1.8772670095000232e-05, + "loss": 0.2999, + "step": 8584 + }, + { + "epoch": 0.15939967993821544, + "grad_norm": 0.6001185178756714, + "learning_rate": 1.8772110116646388e-05, + "loss": 0.3258, + "step": 8586 + }, + { + "epoch": 0.1594368100756341, + "grad_norm": 0.32519131898880005, + "learning_rate": 1.8771550018930392e-05, + "loss": 0.4422, + "step": 8588 + }, + { + "epoch": 0.15947394021305272, + "grad_norm": 0.3848697245121002, + "learning_rate": 1.8770989801859877e-05, + "loss": 0.316, + "step": 8590 + }, + { + "epoch": 0.15951107035047138, + "grad_norm": 0.29297640919685364, + "learning_rate": 1.8770429465442465e-05, + "loss": 0.3697, + "step": 8592 + }, + { + "epoch": 0.15954820048789, + "grad_norm": 0.2892846167087555, + "learning_rate": 1.8769869009685778e-05, + "loss": 0.3376, + "step": 8594 + }, + { + "epoch": 0.15958533062530864, + "grad_norm": 0.44709864258766174, + "learning_rate": 1.876930843459744e-05, + "loss": 0.387, + "step": 8596 + }, + { + "epoch": 0.1596224607627273, + "grad_norm": 0.38853031396865845, + "learning_rate": 1.8768747740185085e-05, + "loss": 0.698, + "step": 8598 + }, + { + "epoch": 0.15965959090014592, + "grad_norm": 0.4344070553779602, + "learning_rate": 1.8768186926456336e-05, + "loss": 0.4328, + "step": 8600 + }, + { + "epoch": 0.15969672103756455, + "grad_norm": 0.4778657555580139, + "learning_rate": 1.876762599341883e-05, + "loss": 0.3448, + "step": 8602 + }, + { + "epoch": 0.1597338511749832, + "grad_norm": 0.31848129630088806, + "learning_rate": 1.876706494108019e-05, + "loss": 0.4954, + "step": 8604 + }, + { + "epoch": 0.15977098131240183, + "grad_norm": 0.3195416331291199, + "learning_rate": 1.876650376944806e-05, + "loss": 0.2679, + "step": 8606 + }, + { + "epoch": 0.1598081114498205, + "grad_norm": 0.44452306628227234, + "learning_rate": 1.8765942478530072e-05, + "loss": 0.3155, + "step": 8608 + }, + { + "epoch": 0.15984524158723912, + "grad_norm": 0.35568225383758545, + "learning_rate": 1.8765381068333864e-05, + "loss": 0.2622, + "step": 8610 + }, + { + "epoch": 0.15988237172465775, + "grad_norm": 0.39536669850349426, + "learning_rate": 1.8764819538867076e-05, + "loss": 0.4116, + "step": 8612 + }, + { + "epoch": 0.1599195018620764, + "grad_norm": 0.4562740921974182, + "learning_rate": 1.8764257890137348e-05, + "loss": 0.3428, + "step": 8614 + }, + { + "epoch": 0.15995663199949503, + "grad_norm": 0.2941872477531433, + "learning_rate": 1.876369612215232e-05, + "loss": 0.3196, + "step": 8616 + }, + { + "epoch": 0.15999376213691366, + "grad_norm": 0.35525792837142944, + "learning_rate": 1.876313423491964e-05, + "loss": 0.2997, + "step": 8618 + }, + { + "epoch": 0.16003089227433231, + "grad_norm": 0.3040013015270233, + "learning_rate": 1.876257222844695e-05, + "loss": 0.3572, + "step": 8620 + }, + { + "epoch": 0.16006802241175094, + "grad_norm": 0.3851323425769806, + "learning_rate": 1.87620101027419e-05, + "loss": 0.4544, + "step": 8622 + }, + { + "epoch": 0.16010515254916957, + "grad_norm": 0.31959977746009827, + "learning_rate": 1.8761447857812138e-05, + "loss": 0.5239, + "step": 8624 + }, + { + "epoch": 0.16014228268658823, + "grad_norm": 0.3255581259727478, + "learning_rate": 1.8760885493665314e-05, + "loss": 0.3746, + "step": 8626 + }, + { + "epoch": 0.16017941282400686, + "grad_norm": 0.345718115568161, + "learning_rate": 1.876032301030908e-05, + "loss": 0.2424, + "step": 8628 + }, + { + "epoch": 0.1602165429614255, + "grad_norm": 0.4276328682899475, + "learning_rate": 1.875976040775109e-05, + "loss": 0.2984, + "step": 8630 + }, + { + "epoch": 0.16025367309884414, + "grad_norm": 0.4385695457458496, + "learning_rate": 1.8759197685999e-05, + "loss": 0.1855, + "step": 8632 + }, + { + "epoch": 0.16029080323626277, + "grad_norm": 0.3090505599975586, + "learning_rate": 1.8758634845060465e-05, + "loss": 0.3593, + "step": 8634 + }, + { + "epoch": 0.16032793337368142, + "grad_norm": 0.7547240853309631, + "learning_rate": 1.8758071884943147e-05, + "loss": 0.2509, + "step": 8636 + }, + { + "epoch": 0.16036506351110005, + "grad_norm": 0.4626752734184265, + "learning_rate": 1.87575088056547e-05, + "loss": 0.4202, + "step": 8638 + }, + { + "epoch": 0.16040219364851868, + "grad_norm": 0.4105139672756195, + "learning_rate": 1.8756945607202795e-05, + "loss": 0.2204, + "step": 8640 + }, + { + "epoch": 0.16043932378593734, + "grad_norm": 0.3552491366863251, + "learning_rate": 1.875638228959509e-05, + "loss": 0.4521, + "step": 8642 + }, + { + "epoch": 0.16047645392335597, + "grad_norm": 0.35555294156074524, + "learning_rate": 1.8755818852839246e-05, + "loss": 0.2935, + "step": 8644 + }, + { + "epoch": 0.16051358406077462, + "grad_norm": 0.40929824113845825, + "learning_rate": 1.8755255296942934e-05, + "loss": 0.492, + "step": 8646 + }, + { + "epoch": 0.16055071419819325, + "grad_norm": 0.33090144395828247, + "learning_rate": 1.8754691621913827e-05, + "loss": 0.2716, + "step": 8648 + }, + { + "epoch": 0.16058784433561188, + "grad_norm": 0.5216090083122253, + "learning_rate": 1.875412782775959e-05, + "loss": 0.3294, + "step": 8650 + }, + { + "epoch": 0.16062497447303054, + "grad_norm": 0.3737295866012573, + "learning_rate": 1.8753563914487895e-05, + "loss": 0.335, + "step": 8652 + }, + { + "epoch": 0.16066210461044916, + "grad_norm": 0.3304767310619354, + "learning_rate": 1.8752999882106412e-05, + "loss": 0.359, + "step": 8654 + }, + { + "epoch": 0.1606992347478678, + "grad_norm": 0.3724282681941986, + "learning_rate": 1.875243573062282e-05, + "loss": 0.3107, + "step": 8656 + }, + { + "epoch": 0.16073636488528645, + "grad_norm": 0.2931241989135742, + "learning_rate": 1.8751871460044796e-05, + "loss": 0.5784, + "step": 8658 + }, + { + "epoch": 0.16077349502270508, + "grad_norm": 0.4210611879825592, + "learning_rate": 1.8751307070380015e-05, + "loss": 0.5042, + "step": 8660 + }, + { + "epoch": 0.1608106251601237, + "grad_norm": 0.45726799964904785, + "learning_rate": 1.8750742561636158e-05, + "loss": 0.3166, + "step": 8662 + }, + { + "epoch": 0.16084775529754236, + "grad_norm": 0.42124277353286743, + "learning_rate": 1.8750177933820904e-05, + "loss": 0.2216, + "step": 8664 + }, + { + "epoch": 0.160884885434961, + "grad_norm": 0.4111817181110382, + "learning_rate": 1.8749613186941943e-05, + "loss": 0.3943, + "step": 8666 + }, + { + "epoch": 0.16092201557237965, + "grad_norm": 0.406029611825943, + "learning_rate": 1.874904832100695e-05, + "loss": 0.3875, + "step": 8668 + }, + { + "epoch": 0.16095914570979827, + "grad_norm": 0.36605018377304077, + "learning_rate": 1.874848333602362e-05, + "loss": 0.2387, + "step": 8670 + }, + { + "epoch": 0.1609962758472169, + "grad_norm": 0.3154708743095398, + "learning_rate": 1.8747918231999634e-05, + "loss": 0.271, + "step": 8672 + }, + { + "epoch": 0.16103340598463556, + "grad_norm": 0.32795828580856323, + "learning_rate": 1.8747353008942683e-05, + "loss": 0.2265, + "step": 8674 + }, + { + "epoch": 0.1610705361220542, + "grad_norm": 0.46512681245803833, + "learning_rate": 1.874678766686046e-05, + "loss": 0.2286, + "step": 8676 + }, + { + "epoch": 0.16110766625947281, + "grad_norm": 0.35453662276268005, + "learning_rate": 1.8746222205760654e-05, + "loss": 0.259, + "step": 8678 + }, + { + "epoch": 0.16114479639689147, + "grad_norm": 0.3772389888763428, + "learning_rate": 1.8745656625650968e-05, + "loss": 0.2606, + "step": 8680 + }, + { + "epoch": 0.1611819265343101, + "grad_norm": 0.39723682403564453, + "learning_rate": 1.874509092653909e-05, + "loss": 0.3513, + "step": 8682 + }, + { + "epoch": 0.16121905667172876, + "grad_norm": 0.5562801957130432, + "learning_rate": 1.8744525108432718e-05, + "loss": 0.2881, + "step": 8684 + }, + { + "epoch": 0.16125618680914738, + "grad_norm": 0.24145297706127167, + "learning_rate": 1.874395917133955e-05, + "loss": 0.4559, + "step": 8686 + }, + { + "epoch": 0.161293316946566, + "grad_norm": 0.34071019291877747, + "learning_rate": 1.8743393115267288e-05, + "loss": 0.244, + "step": 8688 + }, + { + "epoch": 0.16133044708398467, + "grad_norm": 0.34396877884864807, + "learning_rate": 1.8742826940223638e-05, + "loss": 0.3514, + "step": 8690 + }, + { + "epoch": 0.1613675772214033, + "grad_norm": 0.4700627624988556, + "learning_rate": 1.87422606462163e-05, + "loss": 0.454, + "step": 8692 + }, + { + "epoch": 0.16140470735882192, + "grad_norm": 0.27930131554603577, + "learning_rate": 1.8741694233252983e-05, + "loss": 0.2178, + "step": 8694 + }, + { + "epoch": 0.16144183749624058, + "grad_norm": 0.39671099185943604, + "learning_rate": 1.874112770134139e-05, + "loss": 0.3771, + "step": 8696 + }, + { + "epoch": 0.1614789676336592, + "grad_norm": 0.515316903591156, + "learning_rate": 1.874056105048923e-05, + "loss": 0.4473, + "step": 8698 + }, + { + "epoch": 0.16151609777107784, + "grad_norm": 0.25630247592926025, + "learning_rate": 1.8739994280704215e-05, + "loss": 0.2924, + "step": 8700 + }, + { + "epoch": 0.1615532279084965, + "grad_norm": 0.32032832503318787, + "learning_rate": 1.873942739199406e-05, + "loss": 0.4812, + "step": 8702 + }, + { + "epoch": 0.16159035804591512, + "grad_norm": 0.34187644720077515, + "learning_rate": 1.8738860384366474e-05, + "loss": 0.2431, + "step": 8704 + }, + { + "epoch": 0.16162748818333378, + "grad_norm": 0.3880484998226166, + "learning_rate": 1.8738293257829176e-05, + "loss": 0.3024, + "step": 8706 + }, + { + "epoch": 0.1616646183207524, + "grad_norm": 0.3767837882041931, + "learning_rate": 1.873772601238988e-05, + "loss": 0.3059, + "step": 8708 + }, + { + "epoch": 0.16170174845817104, + "grad_norm": 0.3146280348300934, + "learning_rate": 1.8737158648056305e-05, + "loss": 0.3483, + "step": 8710 + }, + { + "epoch": 0.1617388785955897, + "grad_norm": 0.44261911511421204, + "learning_rate": 1.8736591164836174e-05, + "loss": 0.2981, + "step": 8712 + }, + { + "epoch": 0.16177600873300832, + "grad_norm": 0.32656797766685486, + "learning_rate": 1.8736023562737205e-05, + "loss": 0.3056, + "step": 8714 + }, + { + "epoch": 0.16181313887042695, + "grad_norm": 0.3982129991054535, + "learning_rate": 1.873545584176712e-05, + "loss": 0.3988, + "step": 8716 + }, + { + "epoch": 0.1618502690078456, + "grad_norm": 0.3760051429271698, + "learning_rate": 1.873488800193365e-05, + "loss": 0.3991, + "step": 8718 + }, + { + "epoch": 0.16188739914526423, + "grad_norm": 0.27282649278640747, + "learning_rate": 1.8734320043244518e-05, + "loss": 0.3566, + "step": 8720 + }, + { + "epoch": 0.1619245292826829, + "grad_norm": 0.4922017455101013, + "learning_rate": 1.873375196570745e-05, + "loss": 0.5588, + "step": 8722 + }, + { + "epoch": 0.16196165942010152, + "grad_norm": 0.3886096179485321, + "learning_rate": 1.8733183769330182e-05, + "loss": 0.5044, + "step": 8724 + }, + { + "epoch": 0.16199878955752015, + "grad_norm": 0.38123759627342224, + "learning_rate": 1.873261545412044e-05, + "loss": 0.5065, + "step": 8726 + }, + { + "epoch": 0.1620359196949388, + "grad_norm": 0.39078447222709656, + "learning_rate": 1.8732047020085958e-05, + "loss": 0.3562, + "step": 8728 + }, + { + "epoch": 0.16207304983235743, + "grad_norm": 0.22509169578552246, + "learning_rate": 1.873147846723447e-05, + "loss": 0.3349, + "step": 8730 + }, + { + "epoch": 0.16211017996977606, + "grad_norm": 0.48230093717575073, + "learning_rate": 1.873090979557372e-05, + "loss": 0.2265, + "step": 8732 + }, + { + "epoch": 0.16214731010719471, + "grad_norm": 0.3128286898136139, + "learning_rate": 1.8730341005111432e-05, + "loss": 0.458, + "step": 8734 + }, + { + "epoch": 0.16218444024461334, + "grad_norm": 0.28908175230026245, + "learning_rate": 1.872977209585536e-05, + "loss": 0.456, + "step": 8736 + }, + { + "epoch": 0.16222157038203197, + "grad_norm": 0.3584655821323395, + "learning_rate": 1.8729203067813233e-05, + "loss": 0.2529, + "step": 8738 + }, + { + "epoch": 0.16225870051945063, + "grad_norm": 0.29987919330596924, + "learning_rate": 1.8728633920992804e-05, + "loss": 0.4907, + "step": 8740 + }, + { + "epoch": 0.16229583065686926, + "grad_norm": 0.5201492309570312, + "learning_rate": 1.872806465540181e-05, + "loss": 0.5543, + "step": 8742 + }, + { + "epoch": 0.1623329607942879, + "grad_norm": 0.44429242610931396, + "learning_rate": 1.8727495271048e-05, + "loss": 0.3141, + "step": 8744 + }, + { + "epoch": 0.16237009093170654, + "grad_norm": 0.32017919421195984, + "learning_rate": 1.8726925767939122e-05, + "loss": 0.4184, + "step": 8746 + }, + { + "epoch": 0.16240722106912517, + "grad_norm": 0.38378721475601196, + "learning_rate": 1.872635614608292e-05, + "loss": 0.3663, + "step": 8748 + }, + { + "epoch": 0.16244435120654382, + "grad_norm": 0.49947452545166016, + "learning_rate": 1.872578640548716e-05, + "loss": 0.3633, + "step": 8750 + }, + { + "epoch": 0.16248148134396245, + "grad_norm": 0.4004424512386322, + "learning_rate": 1.8725216546159575e-05, + "loss": 0.2664, + "step": 8752 + }, + { + "epoch": 0.16251861148138108, + "grad_norm": 0.3957572877407074, + "learning_rate": 1.8724646568107928e-05, + "loss": 0.2754, + "step": 8754 + }, + { + "epoch": 0.16255574161879974, + "grad_norm": 0.2752881646156311, + "learning_rate": 1.8724076471339978e-05, + "loss": 0.3604, + "step": 8756 + }, + { + "epoch": 0.16259287175621837, + "grad_norm": 0.31430885195732117, + "learning_rate": 1.8723506255863477e-05, + "loss": 0.1757, + "step": 8758 + }, + { + "epoch": 0.16263000189363702, + "grad_norm": 0.2804718613624573, + "learning_rate": 1.8722935921686188e-05, + "loss": 0.32, + "step": 8760 + }, + { + "epoch": 0.16266713203105565, + "grad_norm": 0.28155380487442017, + "learning_rate": 1.8722365468815863e-05, + "loss": 0.2424, + "step": 8762 + }, + { + "epoch": 0.16270426216847428, + "grad_norm": 0.30976906418800354, + "learning_rate": 1.8721794897260278e-05, + "loss": 0.3799, + "step": 8764 + }, + { + "epoch": 0.16274139230589293, + "grad_norm": 0.5113062262535095, + "learning_rate": 1.8721224207027187e-05, + "loss": 0.4051, + "step": 8766 + }, + { + "epoch": 0.16277852244331156, + "grad_norm": 0.3286890685558319, + "learning_rate": 1.8720653398124357e-05, + "loss": 0.4777, + "step": 8768 + }, + { + "epoch": 0.1628156525807302, + "grad_norm": 0.3769436776638031, + "learning_rate": 1.8720082470559553e-05, + "loss": 0.3091, + "step": 8770 + }, + { + "epoch": 0.16285278271814885, + "grad_norm": 0.3779613971710205, + "learning_rate": 1.871951142434055e-05, + "loss": 0.4016, + "step": 8772 + }, + { + "epoch": 0.16288991285556748, + "grad_norm": 0.4506412446498871, + "learning_rate": 1.871894025947511e-05, + "loss": 0.2935, + "step": 8774 + }, + { + "epoch": 0.1629270429929861, + "grad_norm": 0.30236580967903137, + "learning_rate": 1.8718368975971012e-05, + "loss": 0.2349, + "step": 8776 + }, + { + "epoch": 0.16296417313040476, + "grad_norm": 0.31908392906188965, + "learning_rate": 1.871779757383603e-05, + "loss": 0.4673, + "step": 8778 + }, + { + "epoch": 0.1630013032678234, + "grad_norm": 0.3405972421169281, + "learning_rate": 1.8717226053077933e-05, + "loss": 0.536, + "step": 8780 + }, + { + "epoch": 0.16303843340524204, + "grad_norm": 0.3148418068885803, + "learning_rate": 1.87166544137045e-05, + "loss": 0.4419, + "step": 8782 + }, + { + "epoch": 0.16307556354266067, + "grad_norm": 0.31018543243408203, + "learning_rate": 1.871608265572351e-05, + "loss": 0.3121, + "step": 8784 + }, + { + "epoch": 0.1631126936800793, + "grad_norm": 0.41335955262184143, + "learning_rate": 1.8715510779142746e-05, + "loss": 0.261, + "step": 8786 + }, + { + "epoch": 0.16314982381749796, + "grad_norm": 0.4860115051269531, + "learning_rate": 1.871493878396998e-05, + "loss": 0.4225, + "step": 8788 + }, + { + "epoch": 0.16318695395491659, + "grad_norm": 0.32487449049949646, + "learning_rate": 1.8714366670213005e-05, + "loss": 0.4146, + "step": 8790 + }, + { + "epoch": 0.16322408409233521, + "grad_norm": 0.37052568793296814, + "learning_rate": 1.8713794437879605e-05, + "loss": 0.286, + "step": 8792 + }, + { + "epoch": 0.16326121422975387, + "grad_norm": 0.34378519654273987, + "learning_rate": 1.8713222086977557e-05, + "loss": 0.3397, + "step": 8794 + }, + { + "epoch": 0.1632983443671725, + "grad_norm": 0.3928300142288208, + "learning_rate": 1.871264961751466e-05, + "loss": 0.4875, + "step": 8796 + }, + { + "epoch": 0.16333547450459115, + "grad_norm": 0.26039910316467285, + "learning_rate": 1.8712077029498702e-05, + "loss": 0.362, + "step": 8798 + }, + { + "epoch": 0.16337260464200978, + "grad_norm": 0.4242274761199951, + "learning_rate": 1.8711504322937464e-05, + "loss": 0.393, + "step": 8800 + }, + { + "epoch": 0.1634097347794284, + "grad_norm": 0.3434769809246063, + "learning_rate": 1.8710931497838748e-05, + "loss": 0.377, + "step": 8802 + }, + { + "epoch": 0.16344686491684707, + "grad_norm": 0.32703617215156555, + "learning_rate": 1.8710358554210352e-05, + "loss": 0.2256, + "step": 8804 + }, + { + "epoch": 0.1634839950542657, + "grad_norm": 0.3237664997577667, + "learning_rate": 1.870978549206006e-05, + "loss": 0.3185, + "step": 8806 + }, + { + "epoch": 0.16352112519168432, + "grad_norm": 0.40504515171051025, + "learning_rate": 1.870921231139568e-05, + "loss": 0.3962, + "step": 8808 + }, + { + "epoch": 0.16355825532910298, + "grad_norm": 0.3916057050228119, + "learning_rate": 1.870863901222501e-05, + "loss": 0.3832, + "step": 8810 + }, + { + "epoch": 0.1635953854665216, + "grad_norm": 0.4852084517478943, + "learning_rate": 1.8708065594555843e-05, + "loss": 0.4308, + "step": 8812 + }, + { + "epoch": 0.16363251560394024, + "grad_norm": 0.3387133777141571, + "learning_rate": 1.8707492058395988e-05, + "loss": 0.2143, + "step": 8814 + }, + { + "epoch": 0.1636696457413589, + "grad_norm": 0.3047282099723816, + "learning_rate": 1.8706918403753248e-05, + "loss": 0.2391, + "step": 8816 + }, + { + "epoch": 0.16370677587877752, + "grad_norm": 0.3141844570636749, + "learning_rate": 1.870634463063543e-05, + "loss": 0.3684, + "step": 8818 + }, + { + "epoch": 0.16374390601619618, + "grad_norm": 0.5523303747177124, + "learning_rate": 1.8705770739050342e-05, + "loss": 0.452, + "step": 8820 + }, + { + "epoch": 0.1637810361536148, + "grad_norm": 0.3234608769416809, + "learning_rate": 1.8705196729005788e-05, + "loss": 0.3602, + "step": 8822 + }, + { + "epoch": 0.16381816629103343, + "grad_norm": 0.33608299493789673, + "learning_rate": 1.8704622600509582e-05, + "loss": 0.2711, + "step": 8824 + }, + { + "epoch": 0.1638552964284521, + "grad_norm": 0.32217055559158325, + "learning_rate": 1.8704048353569537e-05, + "loss": 0.3961, + "step": 8826 + }, + { + "epoch": 0.16389242656587072, + "grad_norm": 0.3437250554561615, + "learning_rate": 1.8703473988193466e-05, + "loss": 0.2902, + "step": 8828 + }, + { + "epoch": 0.16392955670328935, + "grad_norm": 0.36695992946624756, + "learning_rate": 1.870289950438918e-05, + "loss": 0.312, + "step": 8830 + }, + { + "epoch": 0.163966686840708, + "grad_norm": 0.3539976179599762, + "learning_rate": 1.8702324902164507e-05, + "loss": 0.2968, + "step": 8832 + }, + { + "epoch": 0.16400381697812663, + "grad_norm": 0.38439294695854187, + "learning_rate": 1.8701750181527252e-05, + "loss": 0.2262, + "step": 8834 + }, + { + "epoch": 0.1640409471155453, + "grad_norm": 0.39139512181282043, + "learning_rate": 1.8701175342485244e-05, + "loss": 0.1911, + "step": 8836 + }, + { + "epoch": 0.16407807725296392, + "grad_norm": 0.3429873585700989, + "learning_rate": 1.8700600385046302e-05, + "loss": 0.2391, + "step": 8838 + }, + { + "epoch": 0.16411520739038254, + "grad_norm": 0.42418980598449707, + "learning_rate": 1.8700025309218253e-05, + "loss": 0.2896, + "step": 8840 + }, + { + "epoch": 0.1641523375278012, + "grad_norm": 0.295303612947464, + "learning_rate": 1.8699450115008917e-05, + "loss": 0.3443, + "step": 8842 + }, + { + "epoch": 0.16418946766521983, + "grad_norm": 0.30014926195144653, + "learning_rate": 1.8698874802426122e-05, + "loss": 0.2854, + "step": 8844 + }, + { + "epoch": 0.16422659780263846, + "grad_norm": 0.270679235458374, + "learning_rate": 1.86982993714777e-05, + "loss": 0.3117, + "step": 8846 + }, + { + "epoch": 0.1642637279400571, + "grad_norm": 0.37993142008781433, + "learning_rate": 1.8697723822171475e-05, + "loss": 0.4493, + "step": 8848 + }, + { + "epoch": 0.16430085807747574, + "grad_norm": 0.5098891258239746, + "learning_rate": 1.869714815451528e-05, + "loss": 0.415, + "step": 8850 + }, + { + "epoch": 0.16433798821489437, + "grad_norm": 0.2503422796726227, + "learning_rate": 1.8696572368516954e-05, + "loss": 0.368, + "step": 8852 + }, + { + "epoch": 0.16437511835231303, + "grad_norm": 0.3472348153591156, + "learning_rate": 1.8695996464184325e-05, + "loss": 0.2042, + "step": 8854 + }, + { + "epoch": 0.16441224848973165, + "grad_norm": 0.36320799589157104, + "learning_rate": 1.8695420441525233e-05, + "loss": 0.3655, + "step": 8856 + }, + { + "epoch": 0.1644493786271503, + "grad_norm": 0.3802759349346161, + "learning_rate": 1.8694844300547513e-05, + "loss": 0.2609, + "step": 8858 + }, + { + "epoch": 0.16448650876456894, + "grad_norm": 0.4180498719215393, + "learning_rate": 1.8694268041259006e-05, + "loss": 0.2869, + "step": 8860 + }, + { + "epoch": 0.16452363890198757, + "grad_norm": 0.30235522985458374, + "learning_rate": 1.8693691663667555e-05, + "loss": 0.2089, + "step": 8862 + }, + { + "epoch": 0.16456076903940622, + "grad_norm": 0.39410197734832764, + "learning_rate": 1.8693115167780998e-05, + "loss": 0.1876, + "step": 8864 + }, + { + "epoch": 0.16459789917682485, + "grad_norm": 0.3708670735359192, + "learning_rate": 1.8692538553607183e-05, + "loss": 0.2305, + "step": 8866 + }, + { + "epoch": 0.16463502931424348, + "grad_norm": 0.5014273524284363, + "learning_rate": 1.869196182115396e-05, + "loss": 0.3745, + "step": 8868 + }, + { + "epoch": 0.16467215945166214, + "grad_norm": 0.39320728182792664, + "learning_rate": 1.8691384970429166e-05, + "loss": 0.3298, + "step": 8870 + }, + { + "epoch": 0.16470928958908077, + "grad_norm": 0.5245968103408813, + "learning_rate": 1.869080800144066e-05, + "loss": 0.3161, + "step": 8872 + }, + { + "epoch": 0.16474641972649942, + "grad_norm": 0.3268929123878479, + "learning_rate": 1.8690230914196292e-05, + "loss": 0.2662, + "step": 8874 + }, + { + "epoch": 0.16478354986391805, + "grad_norm": 0.3765755295753479, + "learning_rate": 1.8689653708703905e-05, + "loss": 0.2974, + "step": 8876 + }, + { + "epoch": 0.16482068000133668, + "grad_norm": 0.4465888738632202, + "learning_rate": 1.8689076384971363e-05, + "loss": 0.3252, + "step": 8878 + }, + { + "epoch": 0.16485781013875533, + "grad_norm": 0.3358520567417145, + "learning_rate": 1.868849894300652e-05, + "loss": 0.3511, + "step": 8880 + }, + { + "epoch": 0.16489494027617396, + "grad_norm": 0.3509506583213806, + "learning_rate": 1.8687921382817227e-05, + "loss": 0.6462, + "step": 8882 + }, + { + "epoch": 0.1649320704135926, + "grad_norm": 0.35972586274147034, + "learning_rate": 1.868734370441135e-05, + "loss": 0.2916, + "step": 8884 + }, + { + "epoch": 0.16496920055101125, + "grad_norm": 0.32858628034591675, + "learning_rate": 1.8686765907796746e-05, + "loss": 0.2517, + "step": 8886 + }, + { + "epoch": 0.16500633068842988, + "grad_norm": 0.9542116522789001, + "learning_rate": 1.8686187992981276e-05, + "loss": 0.4535, + "step": 8888 + }, + { + "epoch": 0.1650434608258485, + "grad_norm": 0.40352657437324524, + "learning_rate": 1.8685609959972808e-05, + "loss": 0.2336, + "step": 8890 + }, + { + "epoch": 0.16508059096326716, + "grad_norm": 0.4392412602901459, + "learning_rate": 1.8685031808779204e-05, + "loss": 0.3026, + "step": 8892 + }, + { + "epoch": 0.1651177211006858, + "grad_norm": 0.38800475001335144, + "learning_rate": 1.8684453539408333e-05, + "loss": 0.3553, + "step": 8894 + }, + { + "epoch": 0.16515485123810444, + "grad_norm": 0.35486483573913574, + "learning_rate": 1.868387515186806e-05, + "loss": 0.2214, + "step": 8896 + }, + { + "epoch": 0.16519198137552307, + "grad_norm": 0.397891104221344, + "learning_rate": 1.868329664616626e-05, + "loss": 0.2955, + "step": 8898 + }, + { + "epoch": 0.1652291115129417, + "grad_norm": 0.29122766852378845, + "learning_rate": 1.8682718022310803e-05, + "loss": 0.5533, + "step": 8900 + }, + { + "epoch": 0.16526624165036036, + "grad_norm": 0.3675747215747833, + "learning_rate": 1.8682139280309557e-05, + "loss": 0.2951, + "step": 8902 + }, + { + "epoch": 0.16530337178777899, + "grad_norm": 0.4462049901485443, + "learning_rate": 1.8681560420170407e-05, + "loss": 0.372, + "step": 8904 + }, + { + "epoch": 0.1653405019251976, + "grad_norm": 0.3874303698539734, + "learning_rate": 1.868098144190122e-05, + "loss": 0.2961, + "step": 8906 + }, + { + "epoch": 0.16537763206261627, + "grad_norm": 0.3423820436000824, + "learning_rate": 1.868040234550988e-05, + "loss": 0.4155, + "step": 8908 + }, + { + "epoch": 0.1654147622000349, + "grad_norm": 0.5025599002838135, + "learning_rate": 1.8679823131004264e-05, + "loss": 0.3226, + "step": 8910 + }, + { + "epoch": 0.16545189233745355, + "grad_norm": 0.48700037598609924, + "learning_rate": 1.8679243798392253e-05, + "loss": 0.4387, + "step": 8912 + }, + { + "epoch": 0.16548902247487218, + "grad_norm": 0.4817005395889282, + "learning_rate": 1.8678664347681732e-05, + "loss": 0.3622, + "step": 8914 + }, + { + "epoch": 0.1655261526122908, + "grad_norm": 0.5320210456848145, + "learning_rate": 1.8678084778880584e-05, + "loss": 0.3755, + "step": 8916 + }, + { + "epoch": 0.16556328274970947, + "grad_norm": 0.4524260461330414, + "learning_rate": 1.8677505091996698e-05, + "loss": 0.2338, + "step": 8918 + }, + { + "epoch": 0.1656004128871281, + "grad_norm": 0.3629036843776703, + "learning_rate": 1.867692528703796e-05, + "loss": 0.1626, + "step": 8920 + }, + { + "epoch": 0.16563754302454672, + "grad_norm": 0.36559760570526123, + "learning_rate": 1.8676345364012256e-05, + "loss": 0.2821, + "step": 8922 + }, + { + "epoch": 0.16567467316196538, + "grad_norm": 0.35414788126945496, + "learning_rate": 1.8675765322927485e-05, + "loss": 0.3282, + "step": 8924 + }, + { + "epoch": 0.165711803299384, + "grad_norm": 0.23960210382938385, + "learning_rate": 1.8675185163791533e-05, + "loss": 0.2255, + "step": 8926 + }, + { + "epoch": 0.16574893343680264, + "grad_norm": 0.4486822783946991, + "learning_rate": 1.8674604886612293e-05, + "loss": 0.5776, + "step": 8928 + }, + { + "epoch": 0.1657860635742213, + "grad_norm": 0.35357430577278137, + "learning_rate": 1.8674024491397667e-05, + "loss": 0.3932, + "step": 8930 + }, + { + "epoch": 0.16582319371163992, + "grad_norm": 0.4018774628639221, + "learning_rate": 1.867344397815555e-05, + "loss": 0.3245, + "step": 8932 + }, + { + "epoch": 0.16586032384905858, + "grad_norm": 0.29945969581604004, + "learning_rate": 1.867286334689384e-05, + "loss": 0.4212, + "step": 8934 + }, + { + "epoch": 0.1658974539864772, + "grad_norm": 0.47677671909332275, + "learning_rate": 1.8672282597620438e-05, + "loss": 0.2737, + "step": 8936 + }, + { + "epoch": 0.16593458412389583, + "grad_norm": 0.40573689341545105, + "learning_rate": 1.8671701730343244e-05, + "loss": 0.3739, + "step": 8938 + }, + { + "epoch": 0.1659717142613145, + "grad_norm": 0.3261779546737671, + "learning_rate": 1.8671120745070167e-05, + "loss": 0.3029, + "step": 8940 + }, + { + "epoch": 0.16600884439873312, + "grad_norm": 0.3106573224067688, + "learning_rate": 1.867053964180911e-05, + "loss": 0.4091, + "step": 8942 + }, + { + "epoch": 0.16604597453615175, + "grad_norm": 0.2213493138551712, + "learning_rate": 1.8669958420567977e-05, + "loss": 0.1955, + "step": 8944 + }, + { + "epoch": 0.1660831046735704, + "grad_norm": 0.423654168844223, + "learning_rate": 1.866937708135468e-05, + "loss": 0.3466, + "step": 8946 + }, + { + "epoch": 0.16612023481098903, + "grad_norm": 0.3835442066192627, + "learning_rate": 1.8668795624177128e-05, + "loss": 0.2034, + "step": 8948 + }, + { + "epoch": 0.1661573649484077, + "grad_norm": 0.3940131366252899, + "learning_rate": 1.866821404904324e-05, + "loss": 0.2347, + "step": 8950 + }, + { + "epoch": 0.16619449508582632, + "grad_norm": 0.3909975290298462, + "learning_rate": 1.8667632355960915e-05, + "loss": 0.1534, + "step": 8952 + }, + { + "epoch": 0.16623162522324494, + "grad_norm": 0.5471821427345276, + "learning_rate": 1.866705054493808e-05, + "loss": 0.1818, + "step": 8954 + }, + { + "epoch": 0.1662687553606636, + "grad_norm": 0.38158008456230164, + "learning_rate": 1.8666468615982648e-05, + "loss": 0.2384, + "step": 8956 + }, + { + "epoch": 0.16630588549808223, + "grad_norm": 0.3104810118675232, + "learning_rate": 1.8665886569102537e-05, + "loss": 0.3025, + "step": 8958 + }, + { + "epoch": 0.16634301563550086, + "grad_norm": 0.35220158100128174, + "learning_rate": 1.8665304404305665e-05, + "loss": 0.293, + "step": 8960 + }, + { + "epoch": 0.1663801457729195, + "grad_norm": 0.3959876298904419, + "learning_rate": 1.866472212159996e-05, + "loss": 0.3165, + "step": 8962 + }, + { + "epoch": 0.16641727591033814, + "grad_norm": 0.2942659258842468, + "learning_rate": 1.8664139720993337e-05, + "loss": 0.4532, + "step": 8964 + }, + { + "epoch": 0.16645440604775677, + "grad_norm": 0.417222261428833, + "learning_rate": 1.8663557202493724e-05, + "loss": 0.4038, + "step": 8966 + }, + { + "epoch": 0.16649153618517543, + "grad_norm": 0.44982826709747314, + "learning_rate": 1.8662974566109052e-05, + "loss": 0.3952, + "step": 8968 + }, + { + "epoch": 0.16652866632259405, + "grad_norm": 0.2330973595380783, + "learning_rate": 1.866239181184724e-05, + "loss": 0.2441, + "step": 8970 + }, + { + "epoch": 0.1665657964600127, + "grad_norm": 0.2412310093641281, + "learning_rate": 1.8661808939716227e-05, + "loss": 0.3406, + "step": 8972 + }, + { + "epoch": 0.16660292659743134, + "grad_norm": 0.5149389505386353, + "learning_rate": 1.8661225949723937e-05, + "loss": 0.3873, + "step": 8974 + }, + { + "epoch": 0.16664005673484997, + "grad_norm": 0.5088759660720825, + "learning_rate": 1.8660642841878305e-05, + "loss": 0.4967, + "step": 8976 + }, + { + "epoch": 0.16667718687226862, + "grad_norm": 0.4076996445655823, + "learning_rate": 1.8660059616187266e-05, + "loss": 0.3042, + "step": 8978 + }, + { + "epoch": 0.16671431700968725, + "grad_norm": 0.294471800327301, + "learning_rate": 1.865947627265875e-05, + "loss": 0.5449, + "step": 8980 + }, + { + "epoch": 0.16675144714710588, + "grad_norm": 0.3518579304218292, + "learning_rate": 1.865889281130071e-05, + "loss": 0.2504, + "step": 8982 + }, + { + "epoch": 0.16678857728452454, + "grad_norm": 0.4149520993232727, + "learning_rate": 1.8658309232121067e-05, + "loss": 0.466, + "step": 8984 + }, + { + "epoch": 0.16682570742194316, + "grad_norm": 0.3717808425426483, + "learning_rate": 1.8657725535127777e-05, + "loss": 0.298, + "step": 8986 + }, + { + "epoch": 0.16686283755936182, + "grad_norm": 0.587266743183136, + "learning_rate": 1.8657141720328772e-05, + "loss": 0.3896, + "step": 8988 + }, + { + "epoch": 0.16689996769678045, + "grad_norm": 0.5814943313598633, + "learning_rate": 1.8656557787731997e-05, + "loss": 0.6717, + "step": 8990 + }, + { + "epoch": 0.16693709783419908, + "grad_norm": 0.3730618953704834, + "learning_rate": 1.8655973737345404e-05, + "loss": 0.293, + "step": 8992 + }, + { + "epoch": 0.16697422797161773, + "grad_norm": 0.43233779072761536, + "learning_rate": 1.8655389569176932e-05, + "loss": 0.2715, + "step": 8994 + }, + { + "epoch": 0.16701135810903636, + "grad_norm": 0.6507939696311951, + "learning_rate": 1.865480528323454e-05, + "loss": 0.3389, + "step": 8996 + }, + { + "epoch": 0.167048488246455, + "grad_norm": 0.29440972208976746, + "learning_rate": 1.8654220879526164e-05, + "loss": 0.1907, + "step": 8998 + }, + { + "epoch": 0.16708561838387365, + "grad_norm": 0.2581784129142761, + "learning_rate": 1.865363635805977e-05, + "loss": 0.3709, + "step": 9000 + }, + { + "epoch": 0.16712274852129227, + "grad_norm": 0.2679079473018646, + "learning_rate": 1.8653051718843305e-05, + "loss": 0.2634, + "step": 9002 + }, + { + "epoch": 0.1671598786587109, + "grad_norm": 0.390916109085083, + "learning_rate": 1.8652466961884725e-05, + "loss": 0.3326, + "step": 9004 + }, + { + "epoch": 0.16719700879612956, + "grad_norm": 0.379873663187027, + "learning_rate": 1.8651882087191983e-05, + "loss": 0.4046, + "step": 9006 + }, + { + "epoch": 0.1672341389335482, + "grad_norm": 0.28242987394332886, + "learning_rate": 1.8651297094773047e-05, + "loss": 0.2639, + "step": 9008 + }, + { + "epoch": 0.16727126907096684, + "grad_norm": 0.2469072788953781, + "learning_rate": 1.8650711984635866e-05, + "loss": 0.2049, + "step": 9010 + }, + { + "epoch": 0.16730839920838547, + "grad_norm": 0.335746169090271, + "learning_rate": 1.865012675678841e-05, + "loss": 0.3211, + "step": 9012 + }, + { + "epoch": 0.1673455293458041, + "grad_norm": 0.3432886302471161, + "learning_rate": 1.864954141123864e-05, + "loss": 0.2362, + "step": 9014 + }, + { + "epoch": 0.16738265948322276, + "grad_norm": 0.23973342776298523, + "learning_rate": 1.864895594799452e-05, + "loss": 0.3048, + "step": 9016 + }, + { + "epoch": 0.16741978962064138, + "grad_norm": 0.2192869335412979, + "learning_rate": 1.8648370367064012e-05, + "loss": 0.3243, + "step": 9018 + }, + { + "epoch": 0.16745691975806, + "grad_norm": 0.27877163887023926, + "learning_rate": 1.864778466845509e-05, + "loss": 0.1152, + "step": 9020 + }, + { + "epoch": 0.16749404989547867, + "grad_norm": 0.3916303217411041, + "learning_rate": 1.8647198852175723e-05, + "loss": 0.2086, + "step": 9022 + }, + { + "epoch": 0.1675311800328973, + "grad_norm": 0.2963021993637085, + "learning_rate": 1.8646612918233878e-05, + "loss": 0.2917, + "step": 9024 + }, + { + "epoch": 0.16756831017031595, + "grad_norm": 0.20817649364471436, + "learning_rate": 1.8646026866637533e-05, + "loss": 0.3202, + "step": 9026 + }, + { + "epoch": 0.16760544030773458, + "grad_norm": 0.35939204692840576, + "learning_rate": 1.864544069739466e-05, + "loss": 0.4557, + "step": 9028 + }, + { + "epoch": 0.1676425704451532, + "grad_norm": 0.34954309463500977, + "learning_rate": 1.8644854410513236e-05, + "loss": 0.3979, + "step": 9030 + }, + { + "epoch": 0.16767970058257187, + "grad_norm": 0.31985536217689514, + "learning_rate": 1.8644268006001235e-05, + "loss": 0.2936, + "step": 9032 + }, + { + "epoch": 0.1677168307199905, + "grad_norm": 0.3944746255874634, + "learning_rate": 1.8643681483866643e-05, + "loss": 0.4014, + "step": 9034 + }, + { + "epoch": 0.16775396085740912, + "grad_norm": 0.42706286907196045, + "learning_rate": 1.8643094844117434e-05, + "loss": 0.291, + "step": 9036 + }, + { + "epoch": 0.16779109099482778, + "grad_norm": 0.7891225218772888, + "learning_rate": 1.8642508086761593e-05, + "loss": 0.1962, + "step": 9038 + }, + { + "epoch": 0.1678282211322464, + "grad_norm": 0.41168656945228577, + "learning_rate": 1.8641921211807107e-05, + "loss": 0.2771, + "step": 9040 + }, + { + "epoch": 0.16786535126966504, + "grad_norm": 0.4221994876861572, + "learning_rate": 1.8641334219261956e-05, + "loss": 0.2993, + "step": 9042 + }, + { + "epoch": 0.1679024814070837, + "grad_norm": 0.39930611848831177, + "learning_rate": 1.864074710913413e-05, + "loss": 0.1684, + "step": 9044 + }, + { + "epoch": 0.16793961154450232, + "grad_norm": 0.3187035620212555, + "learning_rate": 1.8640159881431622e-05, + "loss": 0.3943, + "step": 9046 + }, + { + "epoch": 0.16797674168192098, + "grad_norm": 0.4324028193950653, + "learning_rate": 1.8639572536162417e-05, + "loss": 0.3532, + "step": 9048 + }, + { + "epoch": 0.1680138718193396, + "grad_norm": 0.3073504865169525, + "learning_rate": 1.8638985073334504e-05, + "loss": 0.2467, + "step": 9050 + }, + { + "epoch": 0.16805100195675823, + "grad_norm": 0.3276194930076599, + "learning_rate": 1.8638397492955885e-05, + "loss": 0.1607, + "step": 9052 + }, + { + "epoch": 0.1680881320941769, + "grad_norm": 0.39510953426361084, + "learning_rate": 1.8637809795034548e-05, + "loss": 0.2899, + "step": 9054 + }, + { + "epoch": 0.16812526223159552, + "grad_norm": 0.3415541648864746, + "learning_rate": 1.8637221979578495e-05, + "loss": 0.2519, + "step": 9056 + }, + { + "epoch": 0.16816239236901415, + "grad_norm": 0.36961629986763, + "learning_rate": 1.8636634046595723e-05, + "loss": 0.3486, + "step": 9058 + }, + { + "epoch": 0.1681995225064328, + "grad_norm": 0.3302825093269348, + "learning_rate": 1.863604599609423e-05, + "loss": 0.1825, + "step": 9060 + }, + { + "epoch": 0.16823665264385143, + "grad_norm": 0.4261574447154999, + "learning_rate": 1.863545782808202e-05, + "loss": 0.3456, + "step": 9062 + }, + { + "epoch": 0.1682737827812701, + "grad_norm": 0.3397577106952667, + "learning_rate": 1.8634869542567092e-05, + "loss": 0.2923, + "step": 9064 + }, + { + "epoch": 0.16831091291868872, + "grad_norm": 0.32736897468566895, + "learning_rate": 1.863428113955746e-05, + "loss": 0.5217, + "step": 9066 + }, + { + "epoch": 0.16834804305610734, + "grad_norm": 0.28635403513908386, + "learning_rate": 1.863369261906112e-05, + "loss": 0.3189, + "step": 9068 + }, + { + "epoch": 0.168385173193526, + "grad_norm": 0.3244496285915375, + "learning_rate": 1.8633103981086083e-05, + "loss": 0.2182, + "step": 9070 + }, + { + "epoch": 0.16842230333094463, + "grad_norm": 0.3476366698741913, + "learning_rate": 1.8632515225640364e-05, + "loss": 0.3319, + "step": 9072 + }, + { + "epoch": 0.16845943346836326, + "grad_norm": 0.34608879685401917, + "learning_rate": 1.8631926352731968e-05, + "loss": 0.3374, + "step": 9074 + }, + { + "epoch": 0.1684965636057819, + "grad_norm": 0.3689146935939789, + "learning_rate": 1.863133736236891e-05, + "loss": 0.447, + "step": 9076 + }, + { + "epoch": 0.16853369374320054, + "grad_norm": 0.37709754705429077, + "learning_rate": 1.8630748254559207e-05, + "loss": 0.5672, + "step": 9078 + }, + { + "epoch": 0.16857082388061917, + "grad_norm": 0.3143155574798584, + "learning_rate": 1.8630159029310868e-05, + "loss": 0.1586, + "step": 9080 + }, + { + "epoch": 0.16860795401803783, + "grad_norm": 0.3054594099521637, + "learning_rate": 1.8629569686631918e-05, + "loss": 0.381, + "step": 9082 + }, + { + "epoch": 0.16864508415545645, + "grad_norm": 0.31075263023376465, + "learning_rate": 1.8628980226530372e-05, + "loss": 0.3457, + "step": 9084 + }, + { + "epoch": 0.1686822142928751, + "grad_norm": 0.2756892442703247, + "learning_rate": 1.862839064901425e-05, + "loss": 0.232, + "step": 9086 + }, + { + "epoch": 0.16871934443029374, + "grad_norm": 0.3617435097694397, + "learning_rate": 1.862780095409158e-05, + "loss": 0.3282, + "step": 9088 + }, + { + "epoch": 0.16875647456771237, + "grad_norm": 0.3648868501186371, + "learning_rate": 1.862721114177038e-05, + "loss": 0.3284, + "step": 9090 + }, + { + "epoch": 0.16879360470513102, + "grad_norm": 0.23681136965751648, + "learning_rate": 1.862662121205868e-05, + "loss": 0.1978, + "step": 9092 + }, + { + "epoch": 0.16883073484254965, + "grad_norm": 0.40135663747787476, + "learning_rate": 1.86260311649645e-05, + "loss": 0.3602, + "step": 9094 + }, + { + "epoch": 0.16886786497996828, + "grad_norm": 0.38004037737846375, + "learning_rate": 1.862544100049588e-05, + "loss": 0.3164, + "step": 9096 + }, + { + "epoch": 0.16890499511738694, + "grad_norm": 0.39326241612434387, + "learning_rate": 1.8624850718660837e-05, + "loss": 0.2491, + "step": 9098 + }, + { + "epoch": 0.16894212525480556, + "grad_norm": 0.4096834361553192, + "learning_rate": 1.8624260319467415e-05, + "loss": 0.2213, + "step": 9100 + }, + { + "epoch": 0.16897925539222422, + "grad_norm": 0.372112900018692, + "learning_rate": 1.8623669802923647e-05, + "loss": 0.372, + "step": 9102 + }, + { + "epoch": 0.16901638552964285, + "grad_norm": 0.30282333493232727, + "learning_rate": 1.8623079169037558e-05, + "loss": 0.3609, + "step": 9104 + }, + { + "epoch": 0.16905351566706148, + "grad_norm": 0.3284953534603119, + "learning_rate": 1.862248841781719e-05, + "loss": 0.3442, + "step": 9106 + }, + { + "epoch": 0.16909064580448013, + "grad_norm": 0.28249475359916687, + "learning_rate": 1.8621897549270584e-05, + "loss": 0.2931, + "step": 9108 + }, + { + "epoch": 0.16912777594189876, + "grad_norm": 0.4184150695800781, + "learning_rate": 1.8621306563405778e-05, + "loss": 0.2894, + "step": 9110 + }, + { + "epoch": 0.1691649060793174, + "grad_norm": 0.30237600207328796, + "learning_rate": 1.8620715460230812e-05, + "loss": 0.255, + "step": 9112 + }, + { + "epoch": 0.16920203621673605, + "grad_norm": 0.35291510820388794, + "learning_rate": 1.8620124239753733e-05, + "loss": 0.3163, + "step": 9114 + }, + { + "epoch": 0.16923916635415467, + "grad_norm": 0.3508572280406952, + "learning_rate": 1.861953290198258e-05, + "loss": 0.3648, + "step": 9116 + }, + { + "epoch": 0.1692762964915733, + "grad_norm": 0.2748846709728241, + "learning_rate": 1.8618941446925406e-05, + "loss": 0.1888, + "step": 9118 + }, + { + "epoch": 0.16931342662899196, + "grad_norm": 0.43892136216163635, + "learning_rate": 1.8618349874590254e-05, + "loss": 0.2888, + "step": 9120 + }, + { + "epoch": 0.1693505567664106, + "grad_norm": 0.2610081732273102, + "learning_rate": 1.8617758184985174e-05, + "loss": 0.3631, + "step": 9122 + }, + { + "epoch": 0.16938768690382924, + "grad_norm": 0.44517678022384644, + "learning_rate": 1.861716637811822e-05, + "loss": 0.3621, + "step": 9124 + }, + { + "epoch": 0.16942481704124787, + "grad_norm": 0.3575249910354614, + "learning_rate": 1.8616574453997445e-05, + "loss": 0.2717, + "step": 9126 + }, + { + "epoch": 0.1694619471786665, + "grad_norm": 0.35729482769966125, + "learning_rate": 1.8615982412630898e-05, + "loss": 0.263, + "step": 9128 + }, + { + "epoch": 0.16949907731608516, + "grad_norm": 0.3210984766483307, + "learning_rate": 1.861539025402664e-05, + "loss": 0.2798, + "step": 9130 + }, + { + "epoch": 0.16953620745350378, + "grad_norm": 0.32401925325393677, + "learning_rate": 1.8614797978192725e-05, + "loss": 0.1603, + "step": 9132 + }, + { + "epoch": 0.1695733375909224, + "grad_norm": 0.488510400056839, + "learning_rate": 1.8614205585137217e-05, + "loss": 0.3016, + "step": 9134 + }, + { + "epoch": 0.16961046772834107, + "grad_norm": 0.42595547437667847, + "learning_rate": 1.8613613074868168e-05, + "loss": 0.4085, + "step": 9136 + }, + { + "epoch": 0.1696475978657597, + "grad_norm": 0.6518195271492004, + "learning_rate": 1.861302044739365e-05, + "loss": 0.2394, + "step": 9138 + }, + { + "epoch": 0.16968472800317835, + "grad_norm": 0.27355557680130005, + "learning_rate": 1.8612427702721724e-05, + "loss": 0.2291, + "step": 9140 + }, + { + "epoch": 0.16972185814059698, + "grad_norm": 0.3529863953590393, + "learning_rate": 1.861183484086045e-05, + "loss": 0.2818, + "step": 9142 + }, + { + "epoch": 0.1697589882780156, + "grad_norm": 0.48271316289901733, + "learning_rate": 1.8611241861817902e-05, + "loss": 0.3834, + "step": 9144 + }, + { + "epoch": 0.16979611841543427, + "grad_norm": 0.2798055112361908, + "learning_rate": 1.8610648765602143e-05, + "loss": 0.2391, + "step": 9146 + }, + { + "epoch": 0.1698332485528529, + "grad_norm": 0.45579269528388977, + "learning_rate": 1.861005555222125e-05, + "loss": 0.4511, + "step": 9148 + }, + { + "epoch": 0.16987037869027152, + "grad_norm": 0.5225679874420166, + "learning_rate": 1.860946222168329e-05, + "loss": 0.4077, + "step": 9150 + }, + { + "epoch": 0.16990750882769018, + "grad_norm": 0.4053509831428528, + "learning_rate": 1.8608868773996335e-05, + "loss": 0.3867, + "step": 9152 + }, + { + "epoch": 0.1699446389651088, + "grad_norm": 0.37855228781700134, + "learning_rate": 1.8608275209168465e-05, + "loss": 0.276, + "step": 9154 + }, + { + "epoch": 0.16998176910252744, + "grad_norm": 0.3076510429382324, + "learning_rate": 1.8607681527207756e-05, + "loss": 0.3981, + "step": 9156 + }, + { + "epoch": 0.1700188992399461, + "grad_norm": 0.41040322184562683, + "learning_rate": 1.860708772812228e-05, + "loss": 0.5165, + "step": 9158 + }, + { + "epoch": 0.17005602937736472, + "grad_norm": 0.21080558001995087, + "learning_rate": 1.8606493811920124e-05, + "loss": 0.3915, + "step": 9160 + }, + { + "epoch": 0.17009315951478338, + "grad_norm": 0.3336891829967499, + "learning_rate": 1.8605899778609363e-05, + "loss": 0.3784, + "step": 9162 + }, + { + "epoch": 0.170130289652202, + "grad_norm": 0.3784641623497009, + "learning_rate": 1.860530562819809e-05, + "loss": 0.1508, + "step": 9164 + }, + { + "epoch": 0.17016741978962063, + "grad_norm": 0.35133096575737, + "learning_rate": 1.860471136069438e-05, + "loss": 0.5316, + "step": 9166 + }, + { + "epoch": 0.1702045499270393, + "grad_norm": 0.3064520061016083, + "learning_rate": 1.8604116976106322e-05, + "loss": 0.3887, + "step": 9168 + }, + { + "epoch": 0.17024168006445792, + "grad_norm": 0.2978232502937317, + "learning_rate": 1.8603522474442e-05, + "loss": 0.3191, + "step": 9170 + }, + { + "epoch": 0.17027881020187655, + "grad_norm": 0.32241135835647583, + "learning_rate": 1.8602927855709514e-05, + "loss": 0.3096, + "step": 9172 + }, + { + "epoch": 0.1703159403392952, + "grad_norm": 0.3563089966773987, + "learning_rate": 1.8602333119916948e-05, + "loss": 0.3661, + "step": 9174 + }, + { + "epoch": 0.17035307047671383, + "grad_norm": 0.3349653482437134, + "learning_rate": 1.8601738267072394e-05, + "loss": 0.2875, + "step": 9176 + }, + { + "epoch": 0.1703902006141325, + "grad_norm": 0.25343212485313416, + "learning_rate": 1.8601143297183947e-05, + "loss": 0.2169, + "step": 9178 + }, + { + "epoch": 0.17042733075155111, + "grad_norm": 0.3305390179157257, + "learning_rate": 1.8600548210259704e-05, + "loss": 0.2846, + "step": 9180 + }, + { + "epoch": 0.17046446088896974, + "grad_norm": 0.34964269399642944, + "learning_rate": 1.859995300630776e-05, + "loss": 0.3253, + "step": 9182 + }, + { + "epoch": 0.1705015910263884, + "grad_norm": 0.29429739713668823, + "learning_rate": 1.859935768533622e-05, + "loss": 0.3509, + "step": 9184 + }, + { + "epoch": 0.17053872116380703, + "grad_norm": 0.36354726552963257, + "learning_rate": 1.8598762247353175e-05, + "loss": 0.263, + "step": 9186 + }, + { + "epoch": 0.17057585130122566, + "grad_norm": 0.42745673656463623, + "learning_rate": 1.859816669236673e-05, + "loss": 0.4561, + "step": 9188 + }, + { + "epoch": 0.1706129814386443, + "grad_norm": 0.3214625120162964, + "learning_rate": 1.8597571020384997e-05, + "loss": 0.2996, + "step": 9190 + }, + { + "epoch": 0.17065011157606294, + "grad_norm": 0.39977049827575684, + "learning_rate": 1.8596975231416072e-05, + "loss": 0.1782, + "step": 9192 + }, + { + "epoch": 0.17068724171348157, + "grad_norm": 0.25968509912490845, + "learning_rate": 1.8596379325468066e-05, + "loss": 0.2999, + "step": 9194 + }, + { + "epoch": 0.17072437185090023, + "grad_norm": 0.29702845215797424, + "learning_rate": 1.8595783302549085e-05, + "loss": 0.2405, + "step": 9196 + }, + { + "epoch": 0.17076150198831885, + "grad_norm": 0.24404948949813843, + "learning_rate": 1.8595187162667242e-05, + "loss": 0.203, + "step": 9198 + }, + { + "epoch": 0.1707986321257375, + "grad_norm": 0.2870927155017853, + "learning_rate": 1.8594590905830646e-05, + "loss": 0.1675, + "step": 9200 + }, + { + "epoch": 0.17083576226315614, + "grad_norm": 0.3394530117511749, + "learning_rate": 1.8593994532047414e-05, + "loss": 0.3978, + "step": 9202 + }, + { + "epoch": 0.17087289240057477, + "grad_norm": 0.39679962396621704, + "learning_rate": 1.8593398041325655e-05, + "loss": 0.3616, + "step": 9204 + }, + { + "epoch": 0.17091002253799342, + "grad_norm": 0.3744213581085205, + "learning_rate": 1.859280143367349e-05, + "loss": 0.3222, + "step": 9206 + }, + { + "epoch": 0.17094715267541205, + "grad_norm": 0.3758876621723175, + "learning_rate": 1.8592204709099038e-05, + "loss": 0.4096, + "step": 9208 + }, + { + "epoch": 0.17098428281283068, + "grad_norm": 0.31115368008613586, + "learning_rate": 1.8591607867610416e-05, + "loss": 0.2859, + "step": 9210 + }, + { + "epoch": 0.17102141295024934, + "grad_norm": 0.40578708052635193, + "learning_rate": 1.8591010909215743e-05, + "loss": 0.2259, + "step": 9212 + }, + { + "epoch": 0.17105854308766796, + "grad_norm": 0.4514785408973694, + "learning_rate": 1.859041383392315e-05, + "loss": 0.3643, + "step": 9214 + }, + { + "epoch": 0.17109567322508662, + "grad_norm": 0.3591819405555725, + "learning_rate": 1.858981664174075e-05, + "loss": 0.4375, + "step": 9216 + }, + { + "epoch": 0.17113280336250525, + "grad_norm": 0.3094094693660736, + "learning_rate": 1.858921933267668e-05, + "loss": 0.3128, + "step": 9218 + }, + { + "epoch": 0.17116993349992388, + "grad_norm": 0.3282987177371979, + "learning_rate": 1.858862190673906e-05, + "loss": 0.2824, + "step": 9220 + }, + { + "epoch": 0.17120706363734253, + "grad_norm": 0.3291833698749542, + "learning_rate": 1.8588024363936018e-05, + "loss": 0.4219, + "step": 9222 + }, + { + "epoch": 0.17124419377476116, + "grad_norm": 0.3618098497390747, + "learning_rate": 1.8587426704275694e-05, + "loss": 0.2411, + "step": 9224 + }, + { + "epoch": 0.1712813239121798, + "grad_norm": 0.44711795449256897, + "learning_rate": 1.858682892776621e-05, + "loss": 0.4277, + "step": 9226 + }, + { + "epoch": 0.17131845404959845, + "grad_norm": 0.2986961901187897, + "learning_rate": 1.858623103441571e-05, + "loss": 0.4113, + "step": 9228 + }, + { + "epoch": 0.17135558418701707, + "grad_norm": 0.3988238275051117, + "learning_rate": 1.8585633024232322e-05, + "loss": 0.2077, + "step": 9230 + }, + { + "epoch": 0.1713927143244357, + "grad_norm": 0.3558047413825989, + "learning_rate": 1.8585034897224185e-05, + "loss": 0.486, + "step": 9232 + }, + { + "epoch": 0.17142984446185436, + "grad_norm": 0.44968381524086, + "learning_rate": 1.858443665339944e-05, + "loss": 0.2421, + "step": 9234 + }, + { + "epoch": 0.171466974599273, + "grad_norm": 0.3278856575489044, + "learning_rate": 1.8583838292766225e-05, + "loss": 0.349, + "step": 9236 + }, + { + "epoch": 0.17150410473669164, + "grad_norm": 0.29003486037254333, + "learning_rate": 1.8583239815332684e-05, + "loss": 0.4458, + "step": 9238 + }, + { + "epoch": 0.17154123487411027, + "grad_norm": 0.25758254528045654, + "learning_rate": 1.8582641221106956e-05, + "loss": 0.392, + "step": 9240 + }, + { + "epoch": 0.1715783650115289, + "grad_norm": 0.328154981136322, + "learning_rate": 1.858204251009719e-05, + "loss": 0.2883, + "step": 9242 + }, + { + "epoch": 0.17161549514894756, + "grad_norm": 0.32784155011177063, + "learning_rate": 1.858144368231153e-05, + "loss": 0.3765, + "step": 9244 + }, + { + "epoch": 0.17165262528636618, + "grad_norm": 0.3521912395954132, + "learning_rate": 1.858084473775813e-05, + "loss": 0.2789, + "step": 9246 + }, + { + "epoch": 0.1716897554237848, + "grad_norm": 0.2367892861366272, + "learning_rate": 1.858024567644513e-05, + "loss": 0.419, + "step": 9248 + }, + { + "epoch": 0.17172688556120347, + "grad_norm": 0.41230475902557373, + "learning_rate": 1.857964649838069e-05, + "loss": 0.2091, + "step": 9250 + }, + { + "epoch": 0.1717640156986221, + "grad_norm": 0.342324435710907, + "learning_rate": 1.8579047203572962e-05, + "loss": 0.2902, + "step": 9252 + }, + { + "epoch": 0.17180114583604075, + "grad_norm": 0.4646769165992737, + "learning_rate": 1.8578447792030097e-05, + "loss": 0.3464, + "step": 9254 + }, + { + "epoch": 0.17183827597345938, + "grad_norm": 0.3777949810028076, + "learning_rate": 1.857784826376025e-05, + "loss": 0.2406, + "step": 9256 + }, + { + "epoch": 0.171875406110878, + "grad_norm": 0.43188735842704773, + "learning_rate": 1.857724861877159e-05, + "loss": 0.1858, + "step": 9258 + }, + { + "epoch": 0.17191253624829667, + "grad_norm": 0.5867998003959656, + "learning_rate": 1.8576648857072262e-05, + "loss": 0.3568, + "step": 9260 + }, + { + "epoch": 0.1719496663857153, + "grad_norm": 0.7872071862220764, + "learning_rate": 1.8576048978670433e-05, + "loss": 0.3128, + "step": 9262 + }, + { + "epoch": 0.17198679652313392, + "grad_norm": 0.24838481843471527, + "learning_rate": 1.857544898357427e-05, + "loss": 0.1923, + "step": 9264 + }, + { + "epoch": 0.17202392666055258, + "grad_norm": 0.40683993697166443, + "learning_rate": 1.857484887179193e-05, + "loss": 0.3205, + "step": 9266 + }, + { + "epoch": 0.1720610567979712, + "grad_norm": 0.3763538897037506, + "learning_rate": 1.857424864333158e-05, + "loss": 0.2685, + "step": 9268 + }, + { + "epoch": 0.17209818693538984, + "grad_norm": 0.43546822667121887, + "learning_rate": 1.857364829820139e-05, + "loss": 0.2427, + "step": 9270 + }, + { + "epoch": 0.1721353170728085, + "grad_norm": 0.5783769488334656, + "learning_rate": 1.8573047836409526e-05, + "loss": 0.5151, + "step": 9272 + }, + { + "epoch": 0.17217244721022712, + "grad_norm": 0.4253793954849243, + "learning_rate": 1.8572447257964162e-05, + "loss": 0.3271, + "step": 9274 + }, + { + "epoch": 0.17220957734764578, + "grad_norm": 0.3916061818599701, + "learning_rate": 1.8571846562873468e-05, + "loss": 0.3526, + "step": 9276 + }, + { + "epoch": 0.1722467074850644, + "grad_norm": 0.5011011362075806, + "learning_rate": 1.857124575114562e-05, + "loss": 0.4339, + "step": 9278 + }, + { + "epoch": 0.17228383762248303, + "grad_norm": 0.38932064175605774, + "learning_rate": 1.8570644822788786e-05, + "loss": 0.3874, + "step": 9280 + }, + { + "epoch": 0.1723209677599017, + "grad_norm": 0.47515982389450073, + "learning_rate": 1.8570043777811153e-05, + "loss": 0.2489, + "step": 9282 + }, + { + "epoch": 0.17235809789732032, + "grad_norm": 0.575239360332489, + "learning_rate": 1.856944261622089e-05, + "loss": 0.5481, + "step": 9284 + }, + { + "epoch": 0.17239522803473895, + "grad_norm": 0.43876922130584717, + "learning_rate": 1.8568841338026183e-05, + "loss": 0.3287, + "step": 9286 + }, + { + "epoch": 0.1724323581721576, + "grad_norm": 0.30297747254371643, + "learning_rate": 1.8568239943235215e-05, + "loss": 0.3305, + "step": 9288 + }, + { + "epoch": 0.17246948830957623, + "grad_norm": 0.4020748436450958, + "learning_rate": 1.8567638431856166e-05, + "loss": 0.3738, + "step": 9290 + }, + { + "epoch": 0.17250661844699489, + "grad_norm": 0.5123699903488159, + "learning_rate": 1.8567036803897217e-05, + "loss": 0.2138, + "step": 9292 + }, + { + "epoch": 0.17254374858441351, + "grad_norm": 0.34203192591667175, + "learning_rate": 1.8566435059366562e-05, + "loss": 0.3698, + "step": 9294 + }, + { + "epoch": 0.17258087872183214, + "grad_norm": 0.31366607546806335, + "learning_rate": 1.8565833198272383e-05, + "loss": 0.2416, + "step": 9296 + }, + { + "epoch": 0.1726180088592508, + "grad_norm": 0.3314725458621979, + "learning_rate": 1.856523122062287e-05, + "loss": 0.399, + "step": 9298 + }, + { + "epoch": 0.17265513899666943, + "grad_norm": 0.37554433941841125, + "learning_rate": 1.856462912642622e-05, + "loss": 0.3267, + "step": 9300 + }, + { + "epoch": 0.17269226913408806, + "grad_norm": 0.311726450920105, + "learning_rate": 1.8564026915690624e-05, + "loss": 0.2463, + "step": 9302 + }, + { + "epoch": 0.1727293992715067, + "grad_norm": 0.43950992822647095, + "learning_rate": 1.856342458842427e-05, + "loss": 0.5294, + "step": 9304 + }, + { + "epoch": 0.17276652940892534, + "grad_norm": 0.3451043963432312, + "learning_rate": 1.8562822144635356e-05, + "loss": 0.4019, + "step": 9306 + }, + { + "epoch": 0.17280365954634397, + "grad_norm": 0.291781485080719, + "learning_rate": 1.8562219584332084e-05, + "loss": 0.3192, + "step": 9308 + }, + { + "epoch": 0.17284078968376262, + "grad_norm": 0.3367330729961395, + "learning_rate": 1.856161690752265e-05, + "loss": 0.3019, + "step": 9310 + }, + { + "epoch": 0.17287791982118125, + "grad_norm": 0.25980302691459656, + "learning_rate": 1.8561014114215253e-05, + "loss": 0.2993, + "step": 9312 + }, + { + "epoch": 0.1729150499585999, + "grad_norm": 0.34610578417778015, + "learning_rate": 1.85604112044181e-05, + "loss": 0.2862, + "step": 9314 + }, + { + "epoch": 0.17295218009601854, + "grad_norm": 0.44273290038108826, + "learning_rate": 1.855980817813939e-05, + "loss": 0.4501, + "step": 9316 + }, + { + "epoch": 0.17298931023343717, + "grad_norm": 0.3156743347644806, + "learning_rate": 1.855920503538733e-05, + "loss": 0.4705, + "step": 9318 + }, + { + "epoch": 0.17302644037085582, + "grad_norm": 0.2643009424209595, + "learning_rate": 1.855860177617013e-05, + "loss": 0.2435, + "step": 9320 + }, + { + "epoch": 0.17306357050827445, + "grad_norm": 0.35216060280799866, + "learning_rate": 1.855799840049599e-05, + "loss": 0.2701, + "step": 9322 + }, + { + "epoch": 0.17310070064569308, + "grad_norm": 0.45826292037963867, + "learning_rate": 1.8557394908373132e-05, + "loss": 0.2661, + "step": 9324 + }, + { + "epoch": 0.17313783078311173, + "grad_norm": 0.3726547062397003, + "learning_rate": 1.8556791299809758e-05, + "loss": 0.2983, + "step": 9326 + }, + { + "epoch": 0.17317496092053036, + "grad_norm": 0.32667383551597595, + "learning_rate": 1.855618757481409e-05, + "loss": 0.4411, + "step": 9328 + }, + { + "epoch": 0.17321209105794902, + "grad_norm": 0.32809633016586304, + "learning_rate": 1.8555583733394332e-05, + "loss": 0.2844, + "step": 9330 + }, + { + "epoch": 0.17324922119536765, + "grad_norm": 0.3578638434410095, + "learning_rate": 1.8554979775558708e-05, + "loss": 0.5059, + "step": 9332 + }, + { + "epoch": 0.17328635133278628, + "grad_norm": 0.3978993892669678, + "learning_rate": 1.8554375701315438e-05, + "loss": 0.6142, + "step": 9334 + }, + { + "epoch": 0.17332348147020493, + "grad_norm": 0.26432088017463684, + "learning_rate": 1.8553771510672734e-05, + "loss": 0.2792, + "step": 9336 + }, + { + "epoch": 0.17336061160762356, + "grad_norm": 0.24240751564502716, + "learning_rate": 1.855316720363882e-05, + "loss": 0.2333, + "step": 9338 + }, + { + "epoch": 0.1733977417450422, + "grad_norm": 0.31753164529800415, + "learning_rate": 1.8552562780221924e-05, + "loss": 0.2924, + "step": 9340 + }, + { + "epoch": 0.17343487188246084, + "grad_norm": 0.4367333650588989, + "learning_rate": 1.8551958240430264e-05, + "loss": 0.4729, + "step": 9342 + }, + { + "epoch": 0.17347200201987947, + "grad_norm": 0.3188006579875946, + "learning_rate": 1.855135358427207e-05, + "loss": 0.3794, + "step": 9344 + }, + { + "epoch": 0.1735091321572981, + "grad_norm": 0.38418668508529663, + "learning_rate": 1.8550748811755566e-05, + "loss": 0.4431, + "step": 9346 + }, + { + "epoch": 0.17354626229471676, + "grad_norm": 0.35711127519607544, + "learning_rate": 1.8550143922888984e-05, + "loss": 0.4132, + "step": 9348 + }, + { + "epoch": 0.17358339243213539, + "grad_norm": 0.4196261763572693, + "learning_rate": 1.8549538917680553e-05, + "loss": 0.3282, + "step": 9350 + }, + { + "epoch": 0.17362052256955404, + "grad_norm": 0.4077802002429962, + "learning_rate": 1.854893379613851e-05, + "loss": 0.2437, + "step": 9352 + }, + { + "epoch": 0.17365765270697267, + "grad_norm": 0.3750348389148712, + "learning_rate": 1.854832855827108e-05, + "loss": 0.3686, + "step": 9354 + }, + { + "epoch": 0.1736947828443913, + "grad_norm": 0.3244530260562897, + "learning_rate": 1.854772320408651e-05, + "loss": 0.3124, + "step": 9356 + }, + { + "epoch": 0.17373191298180996, + "grad_norm": 0.28786423802375793, + "learning_rate": 1.8547117733593024e-05, + "loss": 0.2356, + "step": 9358 + }, + { + "epoch": 0.17376904311922858, + "grad_norm": 0.3978523015975952, + "learning_rate": 1.854651214679887e-05, + "loss": 0.3694, + "step": 9360 + }, + { + "epoch": 0.1738061732566472, + "grad_norm": 0.32264500856399536, + "learning_rate": 1.8545906443712285e-05, + "loss": 0.2208, + "step": 9362 + }, + { + "epoch": 0.17384330339406587, + "grad_norm": 0.41607165336608887, + "learning_rate": 1.8545300624341507e-05, + "loss": 0.2328, + "step": 9364 + }, + { + "epoch": 0.1738804335314845, + "grad_norm": 0.5304431915283203, + "learning_rate": 1.854469468869479e-05, + "loss": 0.3716, + "step": 9366 + }, + { + "epoch": 0.17391756366890315, + "grad_norm": 0.4470873773097992, + "learning_rate": 1.854408863678037e-05, + "loss": 0.2912, + "step": 9368 + }, + { + "epoch": 0.17395469380632178, + "grad_norm": 0.4217226505279541, + "learning_rate": 1.8543482468606498e-05, + "loss": 0.3014, + "step": 9370 + }, + { + "epoch": 0.1739918239437404, + "grad_norm": 0.3370877504348755, + "learning_rate": 1.854287618418142e-05, + "loss": 0.2875, + "step": 9372 + }, + { + "epoch": 0.17402895408115907, + "grad_norm": 0.3335394263267517, + "learning_rate": 1.8542269783513386e-05, + "loss": 0.4537, + "step": 9374 + }, + { + "epoch": 0.1740660842185777, + "grad_norm": 0.32001960277557373, + "learning_rate": 1.8541663266610645e-05, + "loss": 0.3942, + "step": 9376 + }, + { + "epoch": 0.17410321435599632, + "grad_norm": 0.432697057723999, + "learning_rate": 1.8541056633481454e-05, + "loss": 0.2389, + "step": 9378 + }, + { + "epoch": 0.17414034449341498, + "grad_norm": 0.35600340366363525, + "learning_rate": 1.8540449884134065e-05, + "loss": 0.3917, + "step": 9380 + }, + { + "epoch": 0.1741774746308336, + "grad_norm": 0.47467970848083496, + "learning_rate": 1.8539843018576736e-05, + "loss": 0.3246, + "step": 9382 + }, + { + "epoch": 0.17421460476825223, + "grad_norm": 0.3445054590702057, + "learning_rate": 1.8539236036817722e-05, + "loss": 0.3018, + "step": 9384 + }, + { + "epoch": 0.1742517349056709, + "grad_norm": 0.3301033079624176, + "learning_rate": 1.8538628938865284e-05, + "loss": 0.3655, + "step": 9386 + }, + { + "epoch": 0.17428886504308952, + "grad_norm": 0.32374081015586853, + "learning_rate": 1.8538021724727683e-05, + "loss": 0.4083, + "step": 9388 + }, + { + "epoch": 0.17432599518050818, + "grad_norm": 0.4189296364784241, + "learning_rate": 1.853741439441318e-05, + "loss": 0.2392, + "step": 9390 + }, + { + "epoch": 0.1743631253179268, + "grad_norm": 0.3894954323768616, + "learning_rate": 1.8536806947930036e-05, + "loss": 0.3614, + "step": 9392 + }, + { + "epoch": 0.17440025545534543, + "grad_norm": 0.3289856016635895, + "learning_rate": 1.8536199385286524e-05, + "loss": 0.2692, + "step": 9394 + }, + { + "epoch": 0.1744373855927641, + "grad_norm": 0.4606802463531494, + "learning_rate": 1.8535591706490907e-05, + "loss": 0.4084, + "step": 9396 + }, + { + "epoch": 0.17447451573018272, + "grad_norm": 0.5313446521759033, + "learning_rate": 1.8534983911551455e-05, + "loss": 0.4132, + "step": 9398 + }, + { + "epoch": 0.17451164586760134, + "grad_norm": 0.3066141903400421, + "learning_rate": 1.8534376000476437e-05, + "loss": 0.3225, + "step": 9400 + }, + { + "epoch": 0.17454877600502, + "grad_norm": 0.34293287992477417, + "learning_rate": 1.8533767973274123e-05, + "loss": 0.3153, + "step": 9402 + }, + { + "epoch": 0.17458590614243863, + "grad_norm": 0.398378849029541, + "learning_rate": 1.853315982995279e-05, + "loss": 0.174, + "step": 9404 + }, + { + "epoch": 0.17462303627985729, + "grad_norm": 0.3824893534183502, + "learning_rate": 1.853255157052071e-05, + "loss": 0.3311, + "step": 9406 + }, + { + "epoch": 0.1746601664172759, + "grad_norm": 0.4293367862701416, + "learning_rate": 1.853194319498616e-05, + "loss": 0.2168, + "step": 9408 + }, + { + "epoch": 0.17469729655469454, + "grad_norm": 0.4046095013618469, + "learning_rate": 1.8531334703357423e-05, + "loss": 0.435, + "step": 9410 + }, + { + "epoch": 0.1747344266921132, + "grad_norm": 0.21468330919742584, + "learning_rate": 1.8530726095642772e-05, + "loss": 0.2694, + "step": 9412 + }, + { + "epoch": 0.17477155682953183, + "grad_norm": 0.3662661910057068, + "learning_rate": 1.8530117371850493e-05, + "loss": 0.2405, + "step": 9414 + }, + { + "epoch": 0.17480868696695046, + "grad_norm": 0.6266947984695435, + "learning_rate": 1.8529508531988866e-05, + "loss": 0.2137, + "step": 9416 + }, + { + "epoch": 0.1748458171043691, + "grad_norm": 0.26765987277030945, + "learning_rate": 1.8528899576066178e-05, + "loss": 0.3102, + "step": 9418 + }, + { + "epoch": 0.17488294724178774, + "grad_norm": 0.5364091396331787, + "learning_rate": 1.8528290504090713e-05, + "loss": 0.3093, + "step": 9420 + }, + { + "epoch": 0.17492007737920637, + "grad_norm": 0.3082817494869232, + "learning_rate": 1.8527681316070758e-05, + "loss": 0.3934, + "step": 9422 + }, + { + "epoch": 0.17495720751662502, + "grad_norm": 0.3861907720565796, + "learning_rate": 1.8527072012014608e-05, + "loss": 0.3434, + "step": 9424 + }, + { + "epoch": 0.17499433765404365, + "grad_norm": 0.3344472050666809, + "learning_rate": 1.8526462591930546e-05, + "loss": 0.2315, + "step": 9426 + }, + { + "epoch": 0.1750314677914623, + "grad_norm": 0.37833088636398315, + "learning_rate": 1.8525853055826867e-05, + "loss": 0.221, + "step": 9428 + }, + { + "epoch": 0.17506859792888094, + "grad_norm": 0.2597125768661499, + "learning_rate": 1.852524340371187e-05, + "loss": 0.2226, + "step": 9430 + }, + { + "epoch": 0.17510572806629957, + "grad_norm": 0.3074120581150055, + "learning_rate": 1.8524633635593844e-05, + "loss": 0.215, + "step": 9432 + }, + { + "epoch": 0.17514285820371822, + "grad_norm": 0.26205897331237793, + "learning_rate": 1.8524023751481085e-05, + "loss": 0.4043, + "step": 9434 + }, + { + "epoch": 0.17517998834113685, + "grad_norm": 0.40058374404907227, + "learning_rate": 1.85234137513819e-05, + "loss": 0.3174, + "step": 9436 + }, + { + "epoch": 0.17521711847855548, + "grad_norm": 0.31757161021232605, + "learning_rate": 1.8522803635304583e-05, + "loss": 0.5511, + "step": 9438 + }, + { + "epoch": 0.17525424861597413, + "grad_norm": 0.37255167961120605, + "learning_rate": 1.8522193403257436e-05, + "loss": 0.1876, + "step": 9440 + }, + { + "epoch": 0.17529137875339276, + "grad_norm": 0.4025558531284332, + "learning_rate": 1.8521583055248763e-05, + "loss": 0.3859, + "step": 9442 + }, + { + "epoch": 0.17532850889081142, + "grad_norm": 0.2851197123527527, + "learning_rate": 1.852097259128687e-05, + "loss": 0.3216, + "step": 9444 + }, + { + "epoch": 0.17536563902823005, + "grad_norm": 0.3579910099506378, + "learning_rate": 1.852036201138007e-05, + "loss": 0.3721, + "step": 9446 + }, + { + "epoch": 0.17540276916564868, + "grad_norm": 0.4212762713432312, + "learning_rate": 1.8519751315536657e-05, + "loss": 0.4328, + "step": 9448 + }, + { + "epoch": 0.17543989930306733, + "grad_norm": 0.44826266169548035, + "learning_rate": 1.851914050376495e-05, + "loss": 0.3397, + "step": 9450 + }, + { + "epoch": 0.17547702944048596, + "grad_norm": 0.5340867042541504, + "learning_rate": 1.8518529576073262e-05, + "loss": 0.2108, + "step": 9452 + }, + { + "epoch": 0.1755141595779046, + "grad_norm": 0.38614746928215027, + "learning_rate": 1.8517918532469895e-05, + "loss": 0.3878, + "step": 9454 + }, + { + "epoch": 0.17555128971532324, + "grad_norm": 0.4035106599330902, + "learning_rate": 1.8517307372963178e-05, + "loss": 0.3643, + "step": 9456 + }, + { + "epoch": 0.17558841985274187, + "grad_norm": 0.29411834478378296, + "learning_rate": 1.8516696097561415e-05, + "loss": 0.2756, + "step": 9458 + }, + { + "epoch": 0.1756255499901605, + "grad_norm": 0.31375500559806824, + "learning_rate": 1.851608470627293e-05, + "loss": 0.2443, + "step": 9460 + }, + { + "epoch": 0.17566268012757916, + "grad_norm": 0.36916181445121765, + "learning_rate": 1.8515473199106043e-05, + "loss": 0.4535, + "step": 9462 + }, + { + "epoch": 0.17569981026499779, + "grad_norm": 0.3658238351345062, + "learning_rate": 1.851486157606907e-05, + "loss": 0.25, + "step": 9464 + }, + { + "epoch": 0.17573694040241644, + "grad_norm": 0.3895134925842285, + "learning_rate": 1.851424983717034e-05, + "loss": 0.2615, + "step": 9466 + }, + { + "epoch": 0.17577407053983507, + "grad_norm": 0.3768514096736908, + "learning_rate": 1.851363798241817e-05, + "loss": 0.4519, + "step": 9468 + }, + { + "epoch": 0.1758112006772537, + "grad_norm": 0.4980388283729553, + "learning_rate": 1.851302601182089e-05, + "loss": 0.3016, + "step": 9470 + }, + { + "epoch": 0.17584833081467235, + "grad_norm": 0.43198224902153015, + "learning_rate": 1.851241392538682e-05, + "loss": 0.2536, + "step": 9472 + }, + { + "epoch": 0.17588546095209098, + "grad_norm": 0.3454064726829529, + "learning_rate": 1.85118017231243e-05, + "loss": 0.244, + "step": 9474 + }, + { + "epoch": 0.1759225910895096, + "grad_norm": 0.27406659722328186, + "learning_rate": 1.8511189405041648e-05, + "loss": 0.5167, + "step": 9476 + }, + { + "epoch": 0.17595972122692827, + "grad_norm": 0.314929723739624, + "learning_rate": 1.851057697114721e-05, + "loss": 0.2578, + "step": 9478 + }, + { + "epoch": 0.1759968513643469, + "grad_norm": 0.26748126745224, + "learning_rate": 1.8509964421449305e-05, + "loss": 0.3624, + "step": 9480 + }, + { + "epoch": 0.17603398150176555, + "grad_norm": 0.427721232175827, + "learning_rate": 1.850935175595628e-05, + "loss": 0.2797, + "step": 9482 + }, + { + "epoch": 0.17607111163918418, + "grad_norm": 0.2944505214691162, + "learning_rate": 1.850873897467646e-05, + "loss": 0.2098, + "step": 9484 + }, + { + "epoch": 0.1761082417766028, + "grad_norm": 0.5448423027992249, + "learning_rate": 1.8508126077618197e-05, + "loss": 0.4087, + "step": 9486 + }, + { + "epoch": 0.17614537191402146, + "grad_norm": 0.3299909830093384, + "learning_rate": 1.850751306478982e-05, + "loss": 0.3594, + "step": 9488 + }, + { + "epoch": 0.1761825020514401, + "grad_norm": 0.46255943179130554, + "learning_rate": 1.850689993619967e-05, + "loss": 0.377, + "step": 9490 + }, + { + "epoch": 0.17621963218885872, + "grad_norm": 0.38724610209465027, + "learning_rate": 1.8506286691856092e-05, + "loss": 0.3468, + "step": 9492 + }, + { + "epoch": 0.17625676232627738, + "grad_norm": 0.3927869498729706, + "learning_rate": 1.8505673331767434e-05, + "loss": 0.45, + "step": 9494 + }, + { + "epoch": 0.176293892463696, + "grad_norm": 0.4074196219444275, + "learning_rate": 1.850505985594204e-05, + "loss": 0.2707, + "step": 9496 + }, + { + "epoch": 0.17633102260111463, + "grad_norm": 0.3120250701904297, + "learning_rate": 1.8504446264388257e-05, + "loss": 0.5151, + "step": 9498 + }, + { + "epoch": 0.1763681527385333, + "grad_norm": 0.42595574259757996, + "learning_rate": 1.850383255711443e-05, + "loss": 0.5885, + "step": 9500 + }, + { + "epoch": 0.17640528287595192, + "grad_norm": 0.5752061605453491, + "learning_rate": 1.850321873412892e-05, + "loss": 0.3706, + "step": 9502 + }, + { + "epoch": 0.17644241301337057, + "grad_norm": 0.3946669101715088, + "learning_rate": 1.8502604795440068e-05, + "loss": 0.3707, + "step": 9504 + }, + { + "epoch": 0.1764795431507892, + "grad_norm": 0.44369637966156006, + "learning_rate": 1.8501990741056236e-05, + "loss": 0.2307, + "step": 9506 + }, + { + "epoch": 0.17651667328820783, + "grad_norm": 0.40915265679359436, + "learning_rate": 1.8501376570985777e-05, + "loss": 0.2256, + "step": 9508 + }, + { + "epoch": 0.1765538034256265, + "grad_norm": 0.43708500266075134, + "learning_rate": 1.8500762285237048e-05, + "loss": 0.2529, + "step": 9510 + }, + { + "epoch": 0.17659093356304512, + "grad_norm": 0.3137120008468628, + "learning_rate": 1.8500147883818404e-05, + "loss": 0.3213, + "step": 9512 + }, + { + "epoch": 0.17662806370046374, + "grad_norm": 0.2800552248954773, + "learning_rate": 1.849953336673821e-05, + "loss": 0.2088, + "step": 9514 + }, + { + "epoch": 0.1766651938378824, + "grad_norm": 0.20511212944984436, + "learning_rate": 1.8498918734004826e-05, + "loss": 0.2332, + "step": 9516 + }, + { + "epoch": 0.17670232397530103, + "grad_norm": 0.328767329454422, + "learning_rate": 1.8498303985626613e-05, + "loss": 0.2591, + "step": 9518 + }, + { + "epoch": 0.17673945411271968, + "grad_norm": 0.4095207750797272, + "learning_rate": 1.849768912161194e-05, + "loss": 0.4157, + "step": 9520 + }, + { + "epoch": 0.1767765842501383, + "grad_norm": 0.2723335325717926, + "learning_rate": 1.849707414196917e-05, + "loss": 0.4392, + "step": 9522 + }, + { + "epoch": 0.17681371438755694, + "grad_norm": 0.5100218653678894, + "learning_rate": 1.8496459046706677e-05, + "loss": 0.3481, + "step": 9524 + }, + { + "epoch": 0.1768508445249756, + "grad_norm": 0.23778165876865387, + "learning_rate": 1.849584383583282e-05, + "loss": 0.2599, + "step": 9526 + }, + { + "epoch": 0.17688797466239423, + "grad_norm": 0.2638942003250122, + "learning_rate": 1.849522850935598e-05, + "loss": 0.1997, + "step": 9528 + }, + { + "epoch": 0.17692510479981285, + "grad_norm": 0.2838019132614136, + "learning_rate": 1.849461306728453e-05, + "loss": 0.4762, + "step": 9530 + }, + { + "epoch": 0.1769622349372315, + "grad_norm": 0.21618467569351196, + "learning_rate": 1.8493997509626834e-05, + "loss": 0.2855, + "step": 9532 + }, + { + "epoch": 0.17699936507465014, + "grad_norm": 0.32158729434013367, + "learning_rate": 1.8493381836391275e-05, + "loss": 0.3031, + "step": 9534 + }, + { + "epoch": 0.17703649521206877, + "grad_norm": 0.3213442265987396, + "learning_rate": 1.849276604758623e-05, + "loss": 0.1866, + "step": 9536 + }, + { + "epoch": 0.17707362534948742, + "grad_norm": 0.5821731090545654, + "learning_rate": 1.8492150143220084e-05, + "loss": 0.4691, + "step": 9538 + }, + { + "epoch": 0.17711075548690605, + "grad_norm": 0.3224603533744812, + "learning_rate": 1.84915341233012e-05, + "loss": 0.3409, + "step": 9540 + }, + { + "epoch": 0.1771478856243247, + "grad_norm": 0.32279685139656067, + "learning_rate": 1.849091798783798e-05, + "loss": 0.3744, + "step": 9542 + }, + { + "epoch": 0.17718501576174334, + "grad_norm": 0.5420823097229004, + "learning_rate": 1.8490301736838797e-05, + "loss": 0.2885, + "step": 9544 + }, + { + "epoch": 0.17722214589916196, + "grad_norm": 0.3984422981739044, + "learning_rate": 1.8489685370312043e-05, + "loss": 0.3111, + "step": 9546 + }, + { + "epoch": 0.17725927603658062, + "grad_norm": 0.38931065797805786, + "learning_rate": 1.8489068888266096e-05, + "loss": 0.1416, + "step": 9548 + }, + { + "epoch": 0.17729640617399925, + "grad_norm": 0.3846571147441864, + "learning_rate": 1.848845229070935e-05, + "loss": 0.322, + "step": 9550 + }, + { + "epoch": 0.17733353631141788, + "grad_norm": 0.3965075612068176, + "learning_rate": 1.8487835577650194e-05, + "loss": 0.3832, + "step": 9552 + }, + { + "epoch": 0.17737066644883653, + "grad_norm": 0.38376057147979736, + "learning_rate": 1.848721874909702e-05, + "loss": 0.4807, + "step": 9554 + }, + { + "epoch": 0.17740779658625516, + "grad_norm": 0.408200740814209, + "learning_rate": 1.8486601805058222e-05, + "loss": 0.4089, + "step": 9556 + }, + { + "epoch": 0.17744492672367382, + "grad_norm": 0.38661542534828186, + "learning_rate": 1.8485984745542193e-05, + "loss": 0.3836, + "step": 9558 + }, + { + "epoch": 0.17748205686109245, + "grad_norm": 0.3075103759765625, + "learning_rate": 1.848536757055733e-05, + "loss": 0.4346, + "step": 9560 + }, + { + "epoch": 0.17751918699851107, + "grad_norm": 0.5638620257377625, + "learning_rate": 1.848475028011203e-05, + "loss": 0.4128, + "step": 9562 + }, + { + "epoch": 0.17755631713592973, + "grad_norm": 0.4374730885028839, + "learning_rate": 1.8484132874214698e-05, + "loss": 0.4658, + "step": 9564 + }, + { + "epoch": 0.17759344727334836, + "grad_norm": 0.2996465563774109, + "learning_rate": 1.8483515352873724e-05, + "loss": 0.3067, + "step": 9566 + }, + { + "epoch": 0.177630577410767, + "grad_norm": 0.4137851297855377, + "learning_rate": 1.848289771609752e-05, + "loss": 0.4094, + "step": 9568 + }, + { + "epoch": 0.17766770754818564, + "grad_norm": 0.6291019320487976, + "learning_rate": 1.8482279963894488e-05, + "loss": 0.2141, + "step": 9570 + }, + { + "epoch": 0.17770483768560427, + "grad_norm": 0.5233544111251831, + "learning_rate": 1.8481662096273035e-05, + "loss": 0.3076, + "step": 9572 + }, + { + "epoch": 0.1777419678230229, + "grad_norm": 0.35800623893737793, + "learning_rate": 1.8481044113241564e-05, + "loss": 0.4914, + "step": 9574 + }, + { + "epoch": 0.17777909796044156, + "grad_norm": 0.2738491892814636, + "learning_rate": 1.8480426014808485e-05, + "loss": 0.3562, + "step": 9576 + }, + { + "epoch": 0.17781622809786019, + "grad_norm": 0.6201164126396179, + "learning_rate": 1.847980780098221e-05, + "loss": 0.2276, + "step": 9578 + }, + { + "epoch": 0.17785335823527884, + "grad_norm": 0.24048741161823273, + "learning_rate": 1.8479189471771152e-05, + "loss": 0.1767, + "step": 9580 + }, + { + "epoch": 0.17789048837269747, + "grad_norm": 0.41604742407798767, + "learning_rate": 1.8478571027183724e-05, + "loss": 0.1528, + "step": 9582 + }, + { + "epoch": 0.1779276185101161, + "grad_norm": 0.36952152848243713, + "learning_rate": 1.847795246722834e-05, + "loss": 0.2571, + "step": 9584 + }, + { + "epoch": 0.17796474864753475, + "grad_norm": 0.3412088453769684, + "learning_rate": 1.8477333791913417e-05, + "loss": 0.336, + "step": 9586 + }, + { + "epoch": 0.17800187878495338, + "grad_norm": 0.4281190037727356, + "learning_rate": 1.8476715001247374e-05, + "loss": 0.5419, + "step": 9588 + }, + { + "epoch": 0.178039008922372, + "grad_norm": 0.3360917270183563, + "learning_rate": 1.847609609523863e-05, + "loss": 0.3863, + "step": 9590 + }, + { + "epoch": 0.17807613905979067, + "grad_norm": 0.3602268099784851, + "learning_rate": 1.8475477073895608e-05, + "loss": 0.2263, + "step": 9592 + }, + { + "epoch": 0.1781132691972093, + "grad_norm": 0.3859279453754425, + "learning_rate": 1.847485793722673e-05, + "loss": 0.451, + "step": 9594 + }, + { + "epoch": 0.17815039933462795, + "grad_norm": 0.3367448151111603, + "learning_rate": 1.847423868524042e-05, + "loss": 0.5692, + "step": 9596 + }, + { + "epoch": 0.17818752947204658, + "grad_norm": 0.3100263476371765, + "learning_rate": 1.8473619317945103e-05, + "loss": 0.2304, + "step": 9598 + }, + { + "epoch": 0.1782246596094652, + "grad_norm": 0.3648507595062256, + "learning_rate": 1.8472999835349213e-05, + "loss": 0.322, + "step": 9600 + }, + { + "epoch": 0.17826178974688386, + "grad_norm": 0.8895721435546875, + "learning_rate": 1.847238023746117e-05, + "loss": 0.4908, + "step": 9602 + }, + { + "epoch": 0.1782989198843025, + "grad_norm": 0.3271710276603699, + "learning_rate": 1.8471760524289412e-05, + "loss": 0.167, + "step": 9604 + }, + { + "epoch": 0.17833605002172112, + "grad_norm": 0.26065754890441895, + "learning_rate": 1.8471140695842367e-05, + "loss": 0.2159, + "step": 9606 + }, + { + "epoch": 0.17837318015913978, + "grad_norm": 0.4074278175830841, + "learning_rate": 1.8470520752128472e-05, + "loss": 0.3071, + "step": 9608 + }, + { + "epoch": 0.1784103102965584, + "grad_norm": 0.4186624586582184, + "learning_rate": 1.8469900693156166e-05, + "loss": 0.355, + "step": 9610 + }, + { + "epoch": 0.17844744043397703, + "grad_norm": 0.314352810382843, + "learning_rate": 1.8469280518933876e-05, + "loss": 0.2123, + "step": 9612 + }, + { + "epoch": 0.1784845705713957, + "grad_norm": 0.3136752247810364, + "learning_rate": 1.846866022947005e-05, + "loss": 0.4403, + "step": 9614 + }, + { + "epoch": 0.17852170070881432, + "grad_norm": 0.3026633858680725, + "learning_rate": 1.8468039824773125e-05, + "loss": 0.3442, + "step": 9616 + }, + { + "epoch": 0.17855883084623297, + "grad_norm": 0.297133207321167, + "learning_rate": 1.8467419304851543e-05, + "loss": 0.4821, + "step": 9618 + }, + { + "epoch": 0.1785959609836516, + "grad_norm": 0.3496973216533661, + "learning_rate": 1.8466798669713744e-05, + "loss": 0.337, + "step": 9620 + }, + { + "epoch": 0.17863309112107023, + "grad_norm": 0.404927521944046, + "learning_rate": 1.846617791936818e-05, + "loss": 0.3329, + "step": 9622 + }, + { + "epoch": 0.1786702212584889, + "grad_norm": 0.3777688145637512, + "learning_rate": 1.846555705382329e-05, + "loss": 0.2465, + "step": 9624 + }, + { + "epoch": 0.17870735139590752, + "grad_norm": 0.5088377594947815, + "learning_rate": 1.8464936073087527e-05, + "loss": 0.3038, + "step": 9626 + }, + { + "epoch": 0.17874448153332614, + "grad_norm": 0.39530888199806213, + "learning_rate": 1.846431497716934e-05, + "loss": 0.2303, + "step": 9628 + }, + { + "epoch": 0.1787816116707448, + "grad_norm": 0.3287244141101837, + "learning_rate": 1.846369376607718e-05, + "loss": 0.2175, + "step": 9630 + }, + { + "epoch": 0.17881874180816343, + "grad_norm": 0.3361397683620453, + "learning_rate": 1.84630724398195e-05, + "loss": 0.2259, + "step": 9632 + }, + { + "epoch": 0.17885587194558208, + "grad_norm": 0.3792361319065094, + "learning_rate": 1.846245099840475e-05, + "loss": 0.3851, + "step": 9634 + }, + { + "epoch": 0.1788930020830007, + "grad_norm": 0.3108764588832855, + "learning_rate": 1.8461829441841394e-05, + "loss": 0.4233, + "step": 9636 + }, + { + "epoch": 0.17893013222041934, + "grad_norm": 0.3521835505962372, + "learning_rate": 1.8461207770137885e-05, + "loss": 0.337, + "step": 9638 + }, + { + "epoch": 0.178967262357838, + "grad_norm": 0.36800017952919006, + "learning_rate": 1.846058598330268e-05, + "loss": 0.3447, + "step": 9640 + }, + { + "epoch": 0.17900439249525663, + "grad_norm": 0.29860007762908936, + "learning_rate": 1.8459964081344248e-05, + "loss": 0.2888, + "step": 9642 + }, + { + "epoch": 0.17904152263267525, + "grad_norm": 0.545779287815094, + "learning_rate": 1.8459342064271037e-05, + "loss": 0.4612, + "step": 9644 + }, + { + "epoch": 0.1790786527700939, + "grad_norm": 0.37817618250846863, + "learning_rate": 1.8458719932091523e-05, + "loss": 0.3164, + "step": 9646 + }, + { + "epoch": 0.17911578290751254, + "grad_norm": 0.23531462252140045, + "learning_rate": 1.8458097684814168e-05, + "loss": 0.2905, + "step": 9648 + }, + { + "epoch": 0.17915291304493117, + "grad_norm": 0.31386569142341614, + "learning_rate": 1.8457475322447437e-05, + "loss": 0.1614, + "step": 9650 + }, + { + "epoch": 0.17919004318234982, + "grad_norm": 0.2847346067428589, + "learning_rate": 1.8456852844999805e-05, + "loss": 0.4167, + "step": 9652 + }, + { + "epoch": 0.17922717331976845, + "grad_norm": 0.38693252205848694, + "learning_rate": 1.8456230252479732e-05, + "loss": 0.1707, + "step": 9654 + }, + { + "epoch": 0.1792643034571871, + "grad_norm": 0.35181039571762085, + "learning_rate": 1.8455607544895694e-05, + "loss": 0.4361, + "step": 9656 + }, + { + "epoch": 0.17930143359460574, + "grad_norm": 0.30536600947380066, + "learning_rate": 1.845498472225617e-05, + "loss": 0.3256, + "step": 9658 + }, + { + "epoch": 0.17933856373202436, + "grad_norm": 0.4238933324813843, + "learning_rate": 1.8454361784569625e-05, + "loss": 0.3954, + "step": 9660 + }, + { + "epoch": 0.17937569386944302, + "grad_norm": 0.2870017886161804, + "learning_rate": 1.845373873184454e-05, + "loss": 0.1442, + "step": 9662 + }, + { + "epoch": 0.17941282400686165, + "grad_norm": 0.3694000244140625, + "learning_rate": 1.8453115564089396e-05, + "loss": 0.5079, + "step": 9664 + }, + { + "epoch": 0.17944995414428028, + "grad_norm": 0.36335745453834534, + "learning_rate": 1.8452492281312667e-05, + "loss": 0.2248, + "step": 9666 + }, + { + "epoch": 0.17948708428169893, + "grad_norm": 0.33692434430122375, + "learning_rate": 1.8451868883522833e-05, + "loss": 0.2997, + "step": 9668 + }, + { + "epoch": 0.17952421441911756, + "grad_norm": 0.2719217836856842, + "learning_rate": 1.8451245370728384e-05, + "loss": 0.3224, + "step": 9670 + }, + { + "epoch": 0.1795613445565362, + "grad_norm": 0.2950933575630188, + "learning_rate": 1.84506217429378e-05, + "loss": 0.258, + "step": 9672 + }, + { + "epoch": 0.17959847469395485, + "grad_norm": 0.8864665031433105, + "learning_rate": 1.8449998000159567e-05, + "loss": 0.3255, + "step": 9674 + }, + { + "epoch": 0.17963560483137347, + "grad_norm": 0.5421977043151855, + "learning_rate": 1.844937414240217e-05, + "loss": 0.3919, + "step": 9676 + }, + { + "epoch": 0.17967273496879213, + "grad_norm": 0.4338848888874054, + "learning_rate": 1.8448750169674098e-05, + "loss": 0.3483, + "step": 9678 + }, + { + "epoch": 0.17970986510621076, + "grad_norm": 0.2878280282020569, + "learning_rate": 1.844812608198385e-05, + "loss": 0.3473, + "step": 9680 + }, + { + "epoch": 0.1797469952436294, + "grad_norm": 0.37720227241516113, + "learning_rate": 1.8447501879339904e-05, + "loss": 0.2399, + "step": 9682 + }, + { + "epoch": 0.17978412538104804, + "grad_norm": 0.3899800181388855, + "learning_rate": 1.844687756175076e-05, + "loss": 0.2252, + "step": 9684 + }, + { + "epoch": 0.17982125551846667, + "grad_norm": 0.36731138825416565, + "learning_rate": 1.8446253129224916e-05, + "loss": 0.3274, + "step": 9686 + }, + { + "epoch": 0.1798583856558853, + "grad_norm": 0.3342880308628082, + "learning_rate": 1.844562858177087e-05, + "loss": 0.3391, + "step": 9688 + }, + { + "epoch": 0.17989551579330396, + "grad_norm": 0.39947667717933655, + "learning_rate": 1.8445003919397115e-05, + "loss": 0.2343, + "step": 9690 + }, + { + "epoch": 0.17993264593072258, + "grad_norm": 0.41179266571998596, + "learning_rate": 1.844437914211215e-05, + "loss": 0.3604, + "step": 9692 + }, + { + "epoch": 0.17996977606814124, + "grad_norm": 0.42948979139328003, + "learning_rate": 1.844375424992448e-05, + "loss": 0.4247, + "step": 9694 + }, + { + "epoch": 0.18000690620555987, + "grad_norm": 0.4198618531227112, + "learning_rate": 1.8443129242842607e-05, + "loss": 0.43, + "step": 9696 + }, + { + "epoch": 0.1800440363429785, + "grad_norm": 0.4670703411102295, + "learning_rate": 1.8442504120875036e-05, + "loss": 0.4865, + "step": 9698 + }, + { + "epoch": 0.18008116648039715, + "grad_norm": 0.38360267877578735, + "learning_rate": 1.844187888403027e-05, + "loss": 0.3198, + "step": 9700 + }, + { + "epoch": 0.18011829661781578, + "grad_norm": 0.329893559217453, + "learning_rate": 1.844125353231682e-05, + "loss": 0.2644, + "step": 9702 + }, + { + "epoch": 0.1801554267552344, + "grad_norm": 0.4611571431159973, + "learning_rate": 1.844062806574319e-05, + "loss": 0.3441, + "step": 9704 + }, + { + "epoch": 0.18019255689265307, + "grad_norm": 0.29672861099243164, + "learning_rate": 1.84400024843179e-05, + "loss": 0.2753, + "step": 9706 + }, + { + "epoch": 0.1802296870300717, + "grad_norm": 0.28903207182884216, + "learning_rate": 1.8439376788049455e-05, + "loss": 0.3827, + "step": 9708 + }, + { + "epoch": 0.18026681716749032, + "grad_norm": 0.3726979196071625, + "learning_rate": 1.8438750976946372e-05, + "loss": 0.3596, + "step": 9710 + }, + { + "epoch": 0.18030394730490898, + "grad_norm": 0.6121364235877991, + "learning_rate": 1.843812505101716e-05, + "loss": 0.532, + "step": 9712 + }, + { + "epoch": 0.1803410774423276, + "grad_norm": 0.43849310278892517, + "learning_rate": 1.8437499010270347e-05, + "loss": 0.3487, + "step": 9714 + }, + { + "epoch": 0.18037820757974626, + "grad_norm": 0.35089248418807983, + "learning_rate": 1.8436872854714445e-05, + "loss": 0.3169, + "step": 9716 + }, + { + "epoch": 0.1804153377171649, + "grad_norm": 0.288174569606781, + "learning_rate": 1.843624658435797e-05, + "loss": 0.0789, + "step": 9718 + }, + { + "epoch": 0.18045246785458352, + "grad_norm": 0.22463294863700867, + "learning_rate": 1.8435620199209455e-05, + "loss": 0.2058, + "step": 9720 + }, + { + "epoch": 0.18048959799200218, + "grad_norm": 0.3337746858596802, + "learning_rate": 1.8434993699277414e-05, + "loss": 0.2351, + "step": 9722 + }, + { + "epoch": 0.1805267281294208, + "grad_norm": 0.3516903817653656, + "learning_rate": 1.8434367084570372e-05, + "loss": 0.4076, + "step": 9724 + }, + { + "epoch": 0.18056385826683943, + "grad_norm": 0.29438966512680054, + "learning_rate": 1.843374035509686e-05, + "loss": 0.1668, + "step": 9726 + }, + { + "epoch": 0.1806009884042581, + "grad_norm": 0.31906697154045105, + "learning_rate": 1.8433113510865405e-05, + "loss": 0.3355, + "step": 9728 + }, + { + "epoch": 0.18063811854167672, + "grad_norm": 0.2787906229496002, + "learning_rate": 1.8432486551884535e-05, + "loss": 0.218, + "step": 9730 + }, + { + "epoch": 0.18067524867909537, + "grad_norm": 0.2658388912677765, + "learning_rate": 1.843185947816278e-05, + "loss": 0.2716, + "step": 9732 + }, + { + "epoch": 0.180712378816514, + "grad_norm": 0.398564875125885, + "learning_rate": 1.843123228970867e-05, + "loss": 0.2116, + "step": 9734 + }, + { + "epoch": 0.18074950895393263, + "grad_norm": 0.2886311113834381, + "learning_rate": 1.843060498653075e-05, + "loss": 0.2342, + "step": 9736 + }, + { + "epoch": 0.1807866390913513, + "grad_norm": 0.2524523138999939, + "learning_rate": 1.8429977568637546e-05, + "loss": 0.3968, + "step": 9738 + }, + { + "epoch": 0.18082376922876991, + "grad_norm": 0.27005019783973694, + "learning_rate": 1.84293500360376e-05, + "loss": 0.4382, + "step": 9740 + }, + { + "epoch": 0.18086089936618854, + "grad_norm": 0.37713804841041565, + "learning_rate": 1.8428722388739444e-05, + "loss": 0.3396, + "step": 9742 + }, + { + "epoch": 0.1808980295036072, + "grad_norm": 0.35463401675224304, + "learning_rate": 1.842809462675163e-05, + "loss": 0.4273, + "step": 9744 + }, + { + "epoch": 0.18093515964102583, + "grad_norm": 0.28021538257598877, + "learning_rate": 1.8427466750082684e-05, + "loss": 0.3907, + "step": 9746 + }, + { + "epoch": 0.18097228977844446, + "grad_norm": 0.27935490012168884, + "learning_rate": 1.8426838758741165e-05, + "loss": 0.3083, + "step": 9748 + }, + { + "epoch": 0.1810094199158631, + "grad_norm": 0.3651592433452606, + "learning_rate": 1.842621065273561e-05, + "loss": 0.226, + "step": 9750 + }, + { + "epoch": 0.18104655005328174, + "grad_norm": 0.3888011574745178, + "learning_rate": 1.842558243207457e-05, + "loss": 0.3569, + "step": 9752 + }, + { + "epoch": 0.1810836801907004, + "grad_norm": 0.3161230981349945, + "learning_rate": 1.8424954096766584e-05, + "loss": 0.3683, + "step": 9754 + }, + { + "epoch": 0.18112081032811903, + "grad_norm": 0.40960893034935, + "learning_rate": 1.8424325646820214e-05, + "loss": 0.2456, + "step": 9756 + }, + { + "epoch": 0.18115794046553765, + "grad_norm": 0.378567099571228, + "learning_rate": 1.842369708224401e-05, + "loss": 0.3213, + "step": 9758 + }, + { + "epoch": 0.1811950706029563, + "grad_norm": 0.5387058258056641, + "learning_rate": 1.8423068403046512e-05, + "loss": 0.2883, + "step": 9760 + }, + { + "epoch": 0.18123220074037494, + "grad_norm": 0.3814791738986969, + "learning_rate": 1.8422439609236286e-05, + "loss": 0.2495, + "step": 9762 + }, + { + "epoch": 0.18126933087779357, + "grad_norm": 0.4276221692562103, + "learning_rate": 1.842181070082189e-05, + "loss": 0.2367, + "step": 9764 + }, + { + "epoch": 0.18130646101521222, + "grad_norm": 0.523210883140564, + "learning_rate": 1.842118167781187e-05, + "loss": 0.2761, + "step": 9766 + }, + { + "epoch": 0.18134359115263085, + "grad_norm": 0.5031771659851074, + "learning_rate": 1.842055254021479e-05, + "loss": 0.3539, + "step": 9768 + }, + { + "epoch": 0.1813807212900495, + "grad_norm": 0.4519246220588684, + "learning_rate": 1.8419923288039218e-05, + "loss": 0.301, + "step": 9770 + }, + { + "epoch": 0.18141785142746814, + "grad_norm": 0.42654770612716675, + "learning_rate": 1.8419293921293707e-05, + "loss": 0.3605, + "step": 9772 + }, + { + "epoch": 0.18145498156488676, + "grad_norm": 0.4720914363861084, + "learning_rate": 1.8418664439986828e-05, + "loss": 0.2795, + "step": 9774 + }, + { + "epoch": 0.18149211170230542, + "grad_norm": 0.33781060576438904, + "learning_rate": 1.841803484412714e-05, + "loss": 0.2397, + "step": 9776 + }, + { + "epoch": 0.18152924183972405, + "grad_norm": 0.34768402576446533, + "learning_rate": 1.8417405133723214e-05, + "loss": 0.6441, + "step": 9778 + }, + { + "epoch": 0.18156637197714268, + "grad_norm": 0.3594615161418915, + "learning_rate": 1.8416775308783616e-05, + "loss": 0.3614, + "step": 9780 + }, + { + "epoch": 0.18160350211456133, + "grad_norm": 0.35014021396636963, + "learning_rate": 1.841614536931692e-05, + "loss": 0.2191, + "step": 9782 + }, + { + "epoch": 0.18164063225197996, + "grad_norm": 0.3346107602119446, + "learning_rate": 1.8415515315331692e-05, + "loss": 0.3816, + "step": 9784 + }, + { + "epoch": 0.1816777623893986, + "grad_norm": 0.3111545741558075, + "learning_rate": 1.841488514683651e-05, + "loss": 0.2995, + "step": 9786 + }, + { + "epoch": 0.18171489252681725, + "grad_norm": 0.2627500593662262, + "learning_rate": 1.8414254863839946e-05, + "loss": 0.3501, + "step": 9788 + }, + { + "epoch": 0.18175202266423587, + "grad_norm": 0.35551393032073975, + "learning_rate": 1.8413624466350576e-05, + "loss": 0.3958, + "step": 9790 + }, + { + "epoch": 0.18178915280165453, + "grad_norm": 0.3552328050136566, + "learning_rate": 1.8412993954376978e-05, + "loss": 0.3843, + "step": 9792 + }, + { + "epoch": 0.18182628293907316, + "grad_norm": 0.38312020897865295, + "learning_rate": 1.8412363327927734e-05, + "loss": 0.3268, + "step": 9794 + }, + { + "epoch": 0.1818634130764918, + "grad_norm": 0.22952710092067719, + "learning_rate": 1.8411732587011423e-05, + "loss": 0.3035, + "step": 9796 + }, + { + "epoch": 0.18190054321391044, + "grad_norm": 0.35503703355789185, + "learning_rate": 1.8411101731636628e-05, + "loss": 0.2711, + "step": 9798 + }, + { + "epoch": 0.18193767335132907, + "grad_norm": 0.33516064286231995, + "learning_rate": 1.8410470761811933e-05, + "loss": 0.4079, + "step": 9800 + }, + { + "epoch": 0.1819748034887477, + "grad_norm": 0.3932841420173645, + "learning_rate": 1.8409839677545918e-05, + "loss": 0.3557, + "step": 9802 + }, + { + "epoch": 0.18201193362616636, + "grad_norm": 0.3025108277797699, + "learning_rate": 1.8409208478847178e-05, + "loss": 0.3723, + "step": 9804 + }, + { + "epoch": 0.18204906376358498, + "grad_norm": 0.34415170550346375, + "learning_rate": 1.8408577165724302e-05, + "loss": 0.5636, + "step": 9806 + }, + { + "epoch": 0.18208619390100364, + "grad_norm": 0.2993118464946747, + "learning_rate": 1.8407945738185876e-05, + "loss": 0.1903, + "step": 9808 + }, + { + "epoch": 0.18212332403842227, + "grad_norm": 0.3082931339740753, + "learning_rate": 1.8407314196240492e-05, + "loss": 0.4864, + "step": 9810 + }, + { + "epoch": 0.1821604541758409, + "grad_norm": 0.4172440469264984, + "learning_rate": 1.8406682539896746e-05, + "loss": 0.3378, + "step": 9812 + }, + { + "epoch": 0.18219758431325955, + "grad_norm": 0.2985750138759613, + "learning_rate": 1.840605076916323e-05, + "loss": 0.5089, + "step": 9814 + }, + { + "epoch": 0.18223471445067818, + "grad_norm": 0.42213085293769836, + "learning_rate": 1.8405418884048542e-05, + "loss": 0.3455, + "step": 9816 + }, + { + "epoch": 0.1822718445880968, + "grad_norm": 0.48405712842941284, + "learning_rate": 1.8404786884561283e-05, + "loss": 0.2788, + "step": 9818 + }, + { + "epoch": 0.18230897472551547, + "grad_norm": 0.3284052908420563, + "learning_rate": 1.8404154770710047e-05, + "loss": 0.3127, + "step": 9820 + }, + { + "epoch": 0.1823461048629341, + "grad_norm": 0.24915306270122528, + "learning_rate": 1.840352254250344e-05, + "loss": 0.2529, + "step": 9822 + }, + { + "epoch": 0.18238323500035272, + "grad_norm": 0.3323463201522827, + "learning_rate": 1.840289019995006e-05, + "loss": 0.3785, + "step": 9824 + }, + { + "epoch": 0.18242036513777138, + "grad_norm": 0.4031302034854889, + "learning_rate": 1.8402257743058515e-05, + "loss": 0.379, + "step": 9826 + }, + { + "epoch": 0.18245749527519, + "grad_norm": 0.4071623980998993, + "learning_rate": 1.8401625171837413e-05, + "loss": 0.2915, + "step": 9828 + }, + { + "epoch": 0.18249462541260866, + "grad_norm": 0.4241139590740204, + "learning_rate": 1.8400992486295354e-05, + "loss": 0.4581, + "step": 9830 + }, + { + "epoch": 0.1825317555500273, + "grad_norm": 0.3751186430454254, + "learning_rate": 1.840035968644095e-05, + "loss": 0.4105, + "step": 9832 + }, + { + "epoch": 0.18256888568744592, + "grad_norm": 0.32899799942970276, + "learning_rate": 1.839972677228282e-05, + "loss": 0.2431, + "step": 9834 + }, + { + "epoch": 0.18260601582486458, + "grad_norm": 0.32996708154678345, + "learning_rate": 1.839909374382956e-05, + "loss": 0.346, + "step": 9836 + }, + { + "epoch": 0.1826431459622832, + "grad_norm": 0.2801746428012848, + "learning_rate": 1.83984606010898e-05, + "loss": 0.2736, + "step": 9838 + }, + { + "epoch": 0.18268027609970183, + "grad_norm": 0.3736366331577301, + "learning_rate": 1.8397827344072145e-05, + "loss": 0.3387, + "step": 9840 + }, + { + "epoch": 0.1827174062371205, + "grad_norm": 0.35635530948638916, + "learning_rate": 1.839719397278521e-05, + "loss": 0.2634, + "step": 9842 + }, + { + "epoch": 0.18275453637453912, + "grad_norm": 0.2773335874080658, + "learning_rate": 1.8396560487237624e-05, + "loss": 0.3752, + "step": 9844 + }, + { + "epoch": 0.18279166651195777, + "grad_norm": 0.302762895822525, + "learning_rate": 1.8395926887437993e-05, + "loss": 0.5539, + "step": 9846 + }, + { + "epoch": 0.1828287966493764, + "grad_norm": 0.5084289312362671, + "learning_rate": 1.839529317339495e-05, + "loss": 0.2642, + "step": 9848 + }, + { + "epoch": 0.18286592678679503, + "grad_norm": 0.3113343417644501, + "learning_rate": 1.8394659345117113e-05, + "loss": 0.4062, + "step": 9850 + }, + { + "epoch": 0.18290305692421369, + "grad_norm": 0.2898481786251068, + "learning_rate": 1.839402540261311e-05, + "loss": 0.4576, + "step": 9852 + }, + { + "epoch": 0.18294018706163231, + "grad_norm": 0.41298696398735046, + "learning_rate": 1.8393391345891563e-05, + "loss": 0.5739, + "step": 9854 + }, + { + "epoch": 0.18297731719905094, + "grad_norm": 0.42617517709732056, + "learning_rate": 1.8392757174961096e-05, + "loss": 0.2863, + "step": 9856 + }, + { + "epoch": 0.1830144473364696, + "grad_norm": 0.45977580547332764, + "learning_rate": 1.8392122889830347e-05, + "loss": 0.398, + "step": 9858 + }, + { + "epoch": 0.18305157747388823, + "grad_norm": 0.30564093589782715, + "learning_rate": 1.8391488490507946e-05, + "loss": 0.3398, + "step": 9860 + }, + { + "epoch": 0.18308870761130686, + "grad_norm": 0.35091835260391235, + "learning_rate": 1.8390853977002518e-05, + "loss": 0.3235, + "step": 9862 + }, + { + "epoch": 0.1831258377487255, + "grad_norm": 0.6922197341918945, + "learning_rate": 1.8390219349322704e-05, + "loss": 0.397, + "step": 9864 + }, + { + "epoch": 0.18316296788614414, + "grad_norm": 0.37593111395835876, + "learning_rate": 1.8389584607477133e-05, + "loss": 0.4523, + "step": 9866 + }, + { + "epoch": 0.1832000980235628, + "grad_norm": 0.3268374502658844, + "learning_rate": 1.8388949751474444e-05, + "loss": 0.2368, + "step": 9868 + }, + { + "epoch": 0.18323722816098142, + "grad_norm": 0.6514426469802856, + "learning_rate": 1.838831478132328e-05, + "loss": 0.2353, + "step": 9870 + }, + { + "epoch": 0.18327435829840005, + "grad_norm": 0.38410860300064087, + "learning_rate": 1.8387679697032275e-05, + "loss": 0.3626, + "step": 9872 + }, + { + "epoch": 0.1833114884358187, + "grad_norm": 0.5829281806945801, + "learning_rate": 1.8387044498610077e-05, + "loss": 0.3657, + "step": 9874 + }, + { + "epoch": 0.18334861857323734, + "grad_norm": 0.33428290486335754, + "learning_rate": 1.838640918606532e-05, + "loss": 0.286, + "step": 9876 + }, + { + "epoch": 0.18338574871065597, + "grad_norm": 0.30670416355133057, + "learning_rate": 1.838577375940666e-05, + "loss": 0.5656, + "step": 9878 + }, + { + "epoch": 0.18342287884807462, + "grad_norm": 0.518703281879425, + "learning_rate": 1.8385138218642736e-05, + "loss": 0.1887, + "step": 9880 + }, + { + "epoch": 0.18346000898549325, + "grad_norm": 0.38538694381713867, + "learning_rate": 1.8384502563782198e-05, + "loss": 0.0978, + "step": 9882 + }, + { + "epoch": 0.1834971391229119, + "grad_norm": 0.34368419647216797, + "learning_rate": 1.838386679483369e-05, + "loss": 0.279, + "step": 9884 + }, + { + "epoch": 0.18353426926033053, + "grad_norm": 0.38312453031539917, + "learning_rate": 1.838323091180587e-05, + "loss": 0.3464, + "step": 9886 + }, + { + "epoch": 0.18357139939774916, + "grad_norm": 0.2549024224281311, + "learning_rate": 1.838259491470739e-05, + "loss": 0.3489, + "step": 9888 + }, + { + "epoch": 0.18360852953516782, + "grad_norm": 0.29596665501594543, + "learning_rate": 1.83819588035469e-05, + "loss": 0.3395, + "step": 9890 + }, + { + "epoch": 0.18364565967258645, + "grad_norm": 0.35860276222229004, + "learning_rate": 1.838132257833306e-05, + "loss": 0.4298, + "step": 9892 + }, + { + "epoch": 0.18368278981000508, + "grad_norm": 0.49010443687438965, + "learning_rate": 1.8380686239074518e-05, + "loss": 0.3149, + "step": 9894 + }, + { + "epoch": 0.18371991994742373, + "grad_norm": 0.24262480437755585, + "learning_rate": 1.8380049785779947e-05, + "loss": 0.3188, + "step": 9896 + }, + { + "epoch": 0.18375705008484236, + "grad_norm": 0.23179686069488525, + "learning_rate": 1.8379413218457994e-05, + "loss": 0.3206, + "step": 9898 + }, + { + "epoch": 0.183794180222261, + "grad_norm": 0.3632320463657379, + "learning_rate": 1.837877653711733e-05, + "loss": 0.3353, + "step": 9900 + }, + { + "epoch": 0.18383131035967964, + "grad_norm": 0.47408539056777954, + "learning_rate": 1.8378139741766613e-05, + "loss": 0.3434, + "step": 9902 + }, + { + "epoch": 0.18386844049709827, + "grad_norm": 0.30511316657066345, + "learning_rate": 1.837750283241451e-05, + "loss": 0.3216, + "step": 9904 + }, + { + "epoch": 0.18390557063451693, + "grad_norm": 0.4280533194541931, + "learning_rate": 1.8376865809069687e-05, + "loss": 0.4023, + "step": 9906 + }, + { + "epoch": 0.18394270077193556, + "grad_norm": 0.2546878457069397, + "learning_rate": 1.8376228671740812e-05, + "loss": 0.4436, + "step": 9908 + }, + { + "epoch": 0.1839798309093542, + "grad_norm": 0.34061458706855774, + "learning_rate": 1.8375591420436556e-05, + "loss": 0.2533, + "step": 9910 + }, + { + "epoch": 0.18401696104677284, + "grad_norm": 0.42443713545799255, + "learning_rate": 1.837495405516559e-05, + "loss": 0.4974, + "step": 9912 + }, + { + "epoch": 0.18405409118419147, + "grad_norm": 0.5118205547332764, + "learning_rate": 1.8374316575936578e-05, + "loss": 0.4737, + "step": 9914 + }, + { + "epoch": 0.1840912213216101, + "grad_norm": 0.37152597308158875, + "learning_rate": 1.8373678982758206e-05, + "loss": 0.3769, + "step": 9916 + }, + { + "epoch": 0.18412835145902876, + "grad_norm": 0.2654774487018585, + "learning_rate": 1.8373041275639145e-05, + "loss": 0.3671, + "step": 9918 + }, + { + "epoch": 0.18416548159644738, + "grad_norm": 0.24857978522777557, + "learning_rate": 1.8372403454588073e-05, + "loss": 0.3383, + "step": 9920 + }, + { + "epoch": 0.18420261173386604, + "grad_norm": 0.3159993588924408, + "learning_rate": 1.8371765519613666e-05, + "loss": 0.48, + "step": 9922 + }, + { + "epoch": 0.18423974187128467, + "grad_norm": 0.3612730801105499, + "learning_rate": 1.8371127470724606e-05, + "loss": 0.3143, + "step": 9924 + }, + { + "epoch": 0.1842768720087033, + "grad_norm": 0.5210133194923401, + "learning_rate": 1.837048930792958e-05, + "loss": 0.3155, + "step": 9926 + }, + { + "epoch": 0.18431400214612195, + "grad_norm": 0.3854641020298004, + "learning_rate": 1.8369851031237265e-05, + "loss": 0.2234, + "step": 9928 + }, + { + "epoch": 0.18435113228354058, + "grad_norm": 0.2614252269268036, + "learning_rate": 1.8369212640656348e-05, + "loss": 0.2337, + "step": 9930 + }, + { + "epoch": 0.1843882624209592, + "grad_norm": 0.3065553903579712, + "learning_rate": 1.8368574136195513e-05, + "loss": 0.2027, + "step": 9932 + }, + { + "epoch": 0.18442539255837787, + "grad_norm": 0.44947549700737, + "learning_rate": 1.8367935517863455e-05, + "loss": 0.3199, + "step": 9934 + }, + { + "epoch": 0.1844625226957965, + "grad_norm": 0.8503955006599426, + "learning_rate": 1.8367296785668858e-05, + "loss": 0.5328, + "step": 9936 + }, + { + "epoch": 0.18449965283321512, + "grad_norm": 0.41352495551109314, + "learning_rate": 1.8366657939620415e-05, + "loss": 0.4103, + "step": 9938 + }, + { + "epoch": 0.18453678297063378, + "grad_norm": 0.31638580560684204, + "learning_rate": 1.8366018979726817e-05, + "loss": 0.3238, + "step": 9940 + }, + { + "epoch": 0.1845739131080524, + "grad_norm": 0.5232476592063904, + "learning_rate": 1.8365379905996762e-05, + "loss": 0.3791, + "step": 9942 + }, + { + "epoch": 0.18461104324547106, + "grad_norm": 0.418901264667511, + "learning_rate": 1.836474071843894e-05, + "loss": 0.4642, + "step": 9944 + }, + { + "epoch": 0.1846481733828897, + "grad_norm": 0.29190734028816223, + "learning_rate": 1.8364101417062054e-05, + "loss": 0.4276, + "step": 9946 + }, + { + "epoch": 0.18468530352030832, + "grad_norm": 0.3254503309726715, + "learning_rate": 1.8363462001874803e-05, + "loss": 0.257, + "step": 9948 + }, + { + "epoch": 0.18472243365772698, + "grad_norm": 0.36586275696754456, + "learning_rate": 1.8362822472885887e-05, + "loss": 0.2655, + "step": 9950 + }, + { + "epoch": 0.1847595637951456, + "grad_norm": 0.4781222343444824, + "learning_rate": 1.8362182830104e-05, + "loss": 0.4539, + "step": 9952 + }, + { + "epoch": 0.18479669393256423, + "grad_norm": 0.396098256111145, + "learning_rate": 1.8361543073537857e-05, + "loss": 0.2428, + "step": 9954 + }, + { + "epoch": 0.1848338240699829, + "grad_norm": 0.31001099944114685, + "learning_rate": 1.8360903203196157e-05, + "loss": 0.3677, + "step": 9956 + }, + { + "epoch": 0.18487095420740152, + "grad_norm": 0.26083776354789734, + "learning_rate": 1.836026321908761e-05, + "loss": 0.3047, + "step": 9958 + }, + { + "epoch": 0.18490808434482017, + "grad_norm": 0.35777002573013306, + "learning_rate": 1.835962312122092e-05, + "loss": 0.3121, + "step": 9960 + }, + { + "epoch": 0.1849452144822388, + "grad_norm": 0.399875670671463, + "learning_rate": 1.83589829096048e-05, + "loss": 0.4987, + "step": 9962 + }, + { + "epoch": 0.18498234461965743, + "grad_norm": 0.3539074957370758, + "learning_rate": 1.835834258424796e-05, + "loss": 0.2322, + "step": 9964 + }, + { + "epoch": 0.18501947475707609, + "grad_norm": 0.45889416337013245, + "learning_rate": 1.835770214515912e-05, + "loss": 0.3029, + "step": 9966 + }, + { + "epoch": 0.18505660489449471, + "grad_norm": 0.32760050892829895, + "learning_rate": 1.835706159234698e-05, + "loss": 0.3015, + "step": 9968 + }, + { + "epoch": 0.18509373503191334, + "grad_norm": 0.33777618408203125, + "learning_rate": 1.835642092582027e-05, + "loss": 0.2423, + "step": 9970 + }, + { + "epoch": 0.185130865169332, + "grad_norm": 0.4052596986293793, + "learning_rate": 1.8355780145587698e-05, + "loss": 0.2568, + "step": 9972 + }, + { + "epoch": 0.18516799530675063, + "grad_norm": 0.379072904586792, + "learning_rate": 1.8355139251657985e-05, + "loss": 0.4143, + "step": 9974 + }, + { + "epoch": 0.18520512544416926, + "grad_norm": 0.4473394453525543, + "learning_rate": 1.8354498244039858e-05, + "loss": 0.1469, + "step": 9976 + }, + { + "epoch": 0.1852422555815879, + "grad_norm": 0.4131547212600708, + "learning_rate": 1.835385712274203e-05, + "loss": 0.3964, + "step": 9978 + }, + { + "epoch": 0.18527938571900654, + "grad_norm": 0.4453961253166199, + "learning_rate": 1.835321588777323e-05, + "loss": 0.4611, + "step": 9980 + }, + { + "epoch": 0.1853165158564252, + "grad_norm": 0.35863277316093445, + "learning_rate": 1.8352574539142187e-05, + "loss": 0.2339, + "step": 9982 + }, + { + "epoch": 0.18535364599384382, + "grad_norm": 0.41370517015457153, + "learning_rate": 1.8351933076857618e-05, + "loss": 0.2234, + "step": 9984 + }, + { + "epoch": 0.18539077613126245, + "grad_norm": 0.3429970443248749, + "learning_rate": 1.835129150092826e-05, + "loss": 0.2499, + "step": 9986 + }, + { + "epoch": 0.1854279062686811, + "grad_norm": 0.2927241921424866, + "learning_rate": 1.835064981136284e-05, + "loss": 0.1489, + "step": 9988 + }, + { + "epoch": 0.18546503640609974, + "grad_norm": 0.2844732403755188, + "learning_rate": 1.8350008008170084e-05, + "loss": 0.1451, + "step": 9990 + }, + { + "epoch": 0.18550216654351837, + "grad_norm": 0.3491942286491394, + "learning_rate": 1.8349366091358735e-05, + "loss": 0.2921, + "step": 9992 + }, + { + "epoch": 0.18553929668093702, + "grad_norm": 0.3390982747077942, + "learning_rate": 1.8348724060937524e-05, + "loss": 0.3201, + "step": 9994 + }, + { + "epoch": 0.18557642681835565, + "grad_norm": 0.5280259847640991, + "learning_rate": 1.834808191691518e-05, + "loss": 0.2777, + "step": 9996 + }, + { + "epoch": 0.1856135569557743, + "grad_norm": 0.3064058721065521, + "learning_rate": 1.834743965930045e-05, + "loss": 0.4065, + "step": 9998 + }, + { + "epoch": 0.18565068709319293, + "grad_norm": 0.33439329266548157, + "learning_rate": 1.8346797288102072e-05, + "loss": 0.2729, + "step": 10000 + }, + { + "epoch": 0.18568781723061156, + "grad_norm": 0.28253093361854553, + "learning_rate": 1.8346154803328783e-05, + "loss": 0.2426, + "step": 10002 + }, + { + "epoch": 0.18572494736803022, + "grad_norm": 0.44935092329978943, + "learning_rate": 1.8345512204989324e-05, + "loss": 0.2209, + "step": 10004 + }, + { + "epoch": 0.18576207750544885, + "grad_norm": 0.283279150724411, + "learning_rate": 1.8344869493092444e-05, + "loss": 0.3066, + "step": 10006 + }, + { + "epoch": 0.18579920764286748, + "grad_norm": 0.38343068957328796, + "learning_rate": 1.8344226667646884e-05, + "loss": 0.3254, + "step": 10008 + }, + { + "epoch": 0.18583633778028613, + "grad_norm": 0.4531508982181549, + "learning_rate": 1.83435837286614e-05, + "loss": 0.3926, + "step": 10010 + }, + { + "epoch": 0.18587346791770476, + "grad_norm": 0.3899098038673401, + "learning_rate": 1.8342940676144724e-05, + "loss": 0.2624, + "step": 10012 + }, + { + "epoch": 0.1859105980551234, + "grad_norm": 0.385444313287735, + "learning_rate": 1.834229751010562e-05, + "loss": 0.243, + "step": 10014 + }, + { + "epoch": 0.18594772819254204, + "grad_norm": 0.3875812292098999, + "learning_rate": 1.8341654230552836e-05, + "loss": 0.2728, + "step": 10016 + }, + { + "epoch": 0.18598485832996067, + "grad_norm": 0.45712265372276306, + "learning_rate": 1.834101083749512e-05, + "loss": 0.5489, + "step": 10018 + }, + { + "epoch": 0.18602198846737933, + "grad_norm": 0.3582898676395416, + "learning_rate": 1.8340367330941232e-05, + "loss": 0.4963, + "step": 10020 + }, + { + "epoch": 0.18605911860479796, + "grad_norm": 0.3622925877571106, + "learning_rate": 1.833972371089993e-05, + "loss": 0.3348, + "step": 10022 + }, + { + "epoch": 0.18609624874221659, + "grad_norm": 0.331865131855011, + "learning_rate": 1.8339079977379965e-05, + "loss": 0.3835, + "step": 10024 + }, + { + "epoch": 0.18613337887963524, + "grad_norm": 0.40517669916152954, + "learning_rate": 1.8338436130390103e-05, + "loss": 0.4138, + "step": 10026 + }, + { + "epoch": 0.18617050901705387, + "grad_norm": 0.3718627393245697, + "learning_rate": 1.83377921699391e-05, + "loss": 0.3598, + "step": 10028 + }, + { + "epoch": 0.1862076391544725, + "grad_norm": 0.45984190702438354, + "learning_rate": 1.833714809603572e-05, + "loss": 0.2331, + "step": 10030 + }, + { + "epoch": 0.18624476929189115, + "grad_norm": 0.6453419923782349, + "learning_rate": 1.8336503908688727e-05, + "loss": 0.2367, + "step": 10032 + }, + { + "epoch": 0.18628189942930978, + "grad_norm": 0.378261536359787, + "learning_rate": 1.8335859607906886e-05, + "loss": 0.2027, + "step": 10034 + }, + { + "epoch": 0.18631902956672844, + "grad_norm": 0.3476535677909851, + "learning_rate": 1.8335215193698967e-05, + "loss": 0.2969, + "step": 10036 + }, + { + "epoch": 0.18635615970414707, + "grad_norm": 0.607456386089325, + "learning_rate": 1.8334570666073733e-05, + "loss": 0.412, + "step": 10038 + }, + { + "epoch": 0.1863932898415657, + "grad_norm": 0.5959854125976562, + "learning_rate": 1.833392602503996e-05, + "loss": 0.4672, + "step": 10040 + }, + { + "epoch": 0.18643041997898435, + "grad_norm": 0.3668532073497772, + "learning_rate": 1.8333281270606414e-05, + "loss": 0.2478, + "step": 10042 + }, + { + "epoch": 0.18646755011640298, + "grad_norm": 0.44977787137031555, + "learning_rate": 1.8332636402781876e-05, + "loss": 0.3463, + "step": 10044 + }, + { + "epoch": 0.1865046802538216, + "grad_norm": 0.42468881607055664, + "learning_rate": 1.833199142157511e-05, + "loss": 0.4086, + "step": 10046 + }, + { + "epoch": 0.18654181039124026, + "grad_norm": 0.36195921897888184, + "learning_rate": 1.8331346326994902e-05, + "loss": 0.2674, + "step": 10048 + }, + { + "epoch": 0.1865789405286589, + "grad_norm": 0.3577727973461151, + "learning_rate": 1.8330701119050023e-05, + "loss": 0.2756, + "step": 10050 + }, + { + "epoch": 0.18661607066607752, + "grad_norm": 0.37022027373313904, + "learning_rate": 1.8330055797749256e-05, + "loss": 0.2044, + "step": 10052 + }, + { + "epoch": 0.18665320080349618, + "grad_norm": 0.3330825865268707, + "learning_rate": 1.832941036310138e-05, + "loss": 0.4059, + "step": 10054 + }, + { + "epoch": 0.1866903309409148, + "grad_norm": 0.6666103601455688, + "learning_rate": 1.8328764815115182e-05, + "loss": 0.2432, + "step": 10056 + }, + { + "epoch": 0.18672746107833346, + "grad_norm": 0.3342370390892029, + "learning_rate": 1.832811915379944e-05, + "loss": 0.3715, + "step": 10058 + }, + { + "epoch": 0.1867645912157521, + "grad_norm": 0.34751439094543457, + "learning_rate": 1.832747337916294e-05, + "loss": 0.3832, + "step": 10060 + }, + { + "epoch": 0.18680172135317072, + "grad_norm": 0.30360522866249084, + "learning_rate": 1.8326827491214473e-05, + "loss": 0.4883, + "step": 10062 + }, + { + "epoch": 0.18683885149058937, + "grad_norm": 0.40708357095718384, + "learning_rate": 1.8326181489962826e-05, + "loss": 0.4136, + "step": 10064 + }, + { + "epoch": 0.186875981628008, + "grad_norm": 0.418451726436615, + "learning_rate": 1.8325535375416788e-05, + "loss": 0.1855, + "step": 10066 + }, + { + "epoch": 0.18691311176542663, + "grad_norm": 0.3928448259830475, + "learning_rate": 1.8324889147585152e-05, + "loss": 0.2512, + "step": 10068 + }, + { + "epoch": 0.1869502419028453, + "grad_norm": 0.24479812383651733, + "learning_rate": 1.832424280647671e-05, + "loss": 0.5165, + "step": 10070 + }, + { + "epoch": 0.18698737204026392, + "grad_norm": 0.3074891269207001, + "learning_rate": 1.8323596352100257e-05, + "loss": 0.2978, + "step": 10072 + }, + { + "epoch": 0.18702450217768257, + "grad_norm": 0.450447678565979, + "learning_rate": 1.8322949784464593e-05, + "loss": 0.3733, + "step": 10074 + }, + { + "epoch": 0.1870616323151012, + "grad_norm": 0.5366356372833252, + "learning_rate": 1.8322303103578506e-05, + "loss": 0.2737, + "step": 10076 + }, + { + "epoch": 0.18709876245251983, + "grad_norm": 0.5472540855407715, + "learning_rate": 1.8321656309450806e-05, + "loss": 0.2986, + "step": 10078 + }, + { + "epoch": 0.18713589258993849, + "grad_norm": 0.3829568326473236, + "learning_rate": 1.832100940209029e-05, + "loss": 0.4872, + "step": 10080 + }, + { + "epoch": 0.1871730227273571, + "grad_norm": 0.49750491976737976, + "learning_rate": 1.832036238150576e-05, + "loss": 0.2229, + "step": 10082 + }, + { + "epoch": 0.18721015286477574, + "grad_norm": 0.5529249906539917, + "learning_rate": 1.831971524770602e-05, + "loss": 0.2392, + "step": 10084 + }, + { + "epoch": 0.1872472830021944, + "grad_norm": 0.33275121450424194, + "learning_rate": 1.8319068000699878e-05, + "loss": 0.2293, + "step": 10086 + }, + { + "epoch": 0.18728441313961303, + "grad_norm": 0.36916640400886536, + "learning_rate": 1.831842064049614e-05, + "loss": 0.3386, + "step": 10088 + }, + { + "epoch": 0.18732154327703165, + "grad_norm": 0.5965574979782104, + "learning_rate": 1.8317773167103607e-05, + "loss": 0.2376, + "step": 10090 + }, + { + "epoch": 0.1873586734144503, + "grad_norm": 0.4161369502544403, + "learning_rate": 1.83171255805311e-05, + "loss": 0.3249, + "step": 10092 + }, + { + "epoch": 0.18739580355186894, + "grad_norm": 0.34742462635040283, + "learning_rate": 1.8316477880787427e-05, + "loss": 0.443, + "step": 10094 + }, + { + "epoch": 0.1874329336892876, + "grad_norm": 0.3939172625541687, + "learning_rate": 1.83158300678814e-05, + "loss": 0.2988, + "step": 10096 + }, + { + "epoch": 0.18747006382670622, + "grad_norm": 0.4368314743041992, + "learning_rate": 1.8315182141821834e-05, + "loss": 0.475, + "step": 10098 + }, + { + "epoch": 0.18750719396412485, + "grad_norm": 0.3249127268791199, + "learning_rate": 1.831453410261755e-05, + "loss": 0.5569, + "step": 10100 + }, + { + "epoch": 0.1875443241015435, + "grad_norm": 0.3012632727622986, + "learning_rate": 1.8313885950277356e-05, + "loss": 0.278, + "step": 10102 + }, + { + "epoch": 0.18758145423896214, + "grad_norm": 0.36531540751457214, + "learning_rate": 1.8313237684810075e-05, + "loss": 0.448, + "step": 10104 + }, + { + "epoch": 0.18761858437638076, + "grad_norm": 0.41080960631370544, + "learning_rate": 1.8312589306224536e-05, + "loss": 0.3589, + "step": 10106 + }, + { + "epoch": 0.18765571451379942, + "grad_norm": 0.37441620230674744, + "learning_rate": 1.8311940814529555e-05, + "loss": 0.3877, + "step": 10108 + }, + { + "epoch": 0.18769284465121805, + "grad_norm": 0.3083757758140564, + "learning_rate": 1.8311292209733957e-05, + "loss": 0.4681, + "step": 10110 + }, + { + "epoch": 0.1877299747886367, + "grad_norm": 0.3810519874095917, + "learning_rate": 1.8310643491846565e-05, + "loss": 0.3453, + "step": 10112 + }, + { + "epoch": 0.18776710492605533, + "grad_norm": 0.3464382290840149, + "learning_rate": 1.8309994660876213e-05, + "loss": 0.3911, + "step": 10114 + }, + { + "epoch": 0.18780423506347396, + "grad_norm": 0.40538454055786133, + "learning_rate": 1.830934571683172e-05, + "loss": 0.2661, + "step": 10116 + }, + { + "epoch": 0.18784136520089262, + "grad_norm": 0.3924916386604309, + "learning_rate": 1.830869665972192e-05, + "loss": 0.2532, + "step": 10118 + }, + { + "epoch": 0.18787849533831125, + "grad_norm": 0.24053512513637543, + "learning_rate": 1.8308047489555648e-05, + "loss": 0.3108, + "step": 10120 + }, + { + "epoch": 0.18791562547572987, + "grad_norm": 0.32990771532058716, + "learning_rate": 1.8307398206341737e-05, + "loss": 0.3901, + "step": 10122 + }, + { + "epoch": 0.18795275561314853, + "grad_norm": 0.6292337775230408, + "learning_rate": 1.8306748810089023e-05, + "loss": 0.2062, + "step": 10124 + }, + { + "epoch": 0.18798988575056716, + "grad_norm": 0.37493157386779785, + "learning_rate": 1.830609930080633e-05, + "loss": 0.3672, + "step": 10126 + }, + { + "epoch": 0.1880270158879858, + "grad_norm": 0.3033231496810913, + "learning_rate": 1.8305449678502512e-05, + "loss": 0.2681, + "step": 10128 + }, + { + "epoch": 0.18806414602540444, + "grad_norm": 0.28835123777389526, + "learning_rate": 1.83047999431864e-05, + "loss": 0.2672, + "step": 10130 + }, + { + "epoch": 0.18810127616282307, + "grad_norm": 0.3232995569705963, + "learning_rate": 1.8304150094866836e-05, + "loss": 0.3445, + "step": 10132 + }, + { + "epoch": 0.18813840630024173, + "grad_norm": 0.320734441280365, + "learning_rate": 1.8303500133552662e-05, + "loss": 0.1707, + "step": 10134 + }, + { + "epoch": 0.18817553643766036, + "grad_norm": 0.343945175409317, + "learning_rate": 1.8302850059252723e-05, + "loss": 0.1589, + "step": 10136 + }, + { + "epoch": 0.18821266657507899, + "grad_norm": 0.45969587564468384, + "learning_rate": 1.8302199871975865e-05, + "loss": 0.1531, + "step": 10138 + }, + { + "epoch": 0.18824979671249764, + "grad_norm": 0.30570536851882935, + "learning_rate": 1.8301549571730937e-05, + "loss": 0.4095, + "step": 10140 + }, + { + "epoch": 0.18828692684991627, + "grad_norm": 0.28152766823768616, + "learning_rate": 1.8300899158526783e-05, + "loss": 0.5259, + "step": 10142 + }, + { + "epoch": 0.1883240569873349, + "grad_norm": 0.3860793113708496, + "learning_rate": 1.8300248632372257e-05, + "loss": 0.2524, + "step": 10144 + }, + { + "epoch": 0.18836118712475355, + "grad_norm": 0.25214534997940063, + "learning_rate": 1.829959799327621e-05, + "loss": 0.3168, + "step": 10146 + }, + { + "epoch": 0.18839831726217218, + "grad_norm": 0.3945486843585968, + "learning_rate": 1.829894724124749e-05, + "loss": 0.4294, + "step": 10148 + }, + { + "epoch": 0.18843544739959084, + "grad_norm": 0.4058411419391632, + "learning_rate": 1.8298296376294962e-05, + "loss": 0.4185, + "step": 10150 + }, + { + "epoch": 0.18847257753700947, + "grad_norm": 0.29185596108436584, + "learning_rate": 1.8297645398427475e-05, + "loss": 0.3392, + "step": 10152 + }, + { + "epoch": 0.1885097076744281, + "grad_norm": 0.3090771734714508, + "learning_rate": 1.8296994307653888e-05, + "loss": 0.1249, + "step": 10154 + }, + { + "epoch": 0.18854683781184675, + "grad_norm": 0.40176692605018616, + "learning_rate": 1.829634310398306e-05, + "loss": 0.2969, + "step": 10156 + }, + { + "epoch": 0.18858396794926538, + "grad_norm": 0.35776567459106445, + "learning_rate": 1.829569178742385e-05, + "loss": 0.3368, + "step": 10158 + }, + { + "epoch": 0.188621098086684, + "grad_norm": 0.3186683654785156, + "learning_rate": 1.829504035798513e-05, + "loss": 0.221, + "step": 10160 + }, + { + "epoch": 0.18865822822410266, + "grad_norm": 0.4903828799724579, + "learning_rate": 1.8294388815675753e-05, + "loss": 0.1931, + "step": 10162 + }, + { + "epoch": 0.1886953583615213, + "grad_norm": 0.35308709740638733, + "learning_rate": 1.8293737160504587e-05, + "loss": 0.1733, + "step": 10164 + }, + { + "epoch": 0.18873248849893992, + "grad_norm": 0.34929269552230835, + "learning_rate": 1.8293085392480504e-05, + "loss": 0.3794, + "step": 10166 + }, + { + "epoch": 0.18876961863635858, + "grad_norm": 0.34631383419036865, + "learning_rate": 1.829243351161237e-05, + "loss": 0.308, + "step": 10168 + }, + { + "epoch": 0.1888067487737772, + "grad_norm": 0.3294782042503357, + "learning_rate": 1.829178151790905e-05, + "loss": 0.2645, + "step": 10170 + }, + { + "epoch": 0.18884387891119586, + "grad_norm": 0.5012518167495728, + "learning_rate": 1.8291129411379428e-05, + "loss": 0.325, + "step": 10172 + }, + { + "epoch": 0.1888810090486145, + "grad_norm": 0.3634997606277466, + "learning_rate": 1.829047719203236e-05, + "loss": 0.5125, + "step": 10174 + }, + { + "epoch": 0.18891813918603312, + "grad_norm": 0.3600502610206604, + "learning_rate": 1.8289824859876736e-05, + "loss": 0.314, + "step": 10176 + }, + { + "epoch": 0.18895526932345177, + "grad_norm": 0.36589720845222473, + "learning_rate": 1.828917241492143e-05, + "loss": 0.2664, + "step": 10178 + }, + { + "epoch": 0.1889923994608704, + "grad_norm": 0.2958115041255951, + "learning_rate": 1.828851985717531e-05, + "loss": 0.3417, + "step": 10180 + }, + { + "epoch": 0.18902952959828903, + "grad_norm": 0.3177456855773926, + "learning_rate": 1.8287867186647265e-05, + "loss": 0.3357, + "step": 10182 + }, + { + "epoch": 0.1890666597357077, + "grad_norm": 0.35406264662742615, + "learning_rate": 1.828721440334617e-05, + "loss": 0.2551, + "step": 10184 + }, + { + "epoch": 0.18910378987312632, + "grad_norm": 0.3648502230644226, + "learning_rate": 1.8286561507280912e-05, + "loss": 0.4824, + "step": 10186 + }, + { + "epoch": 0.18914092001054497, + "grad_norm": 0.3534731864929199, + "learning_rate": 1.8285908498460372e-05, + "loss": 0.2023, + "step": 10188 + }, + { + "epoch": 0.1891780501479636, + "grad_norm": 0.2903887629508972, + "learning_rate": 1.828525537689344e-05, + "loss": 0.175, + "step": 10190 + }, + { + "epoch": 0.18921518028538223, + "grad_norm": 0.4702399969100952, + "learning_rate": 1.8284602142588998e-05, + "loss": 0.1905, + "step": 10192 + }, + { + "epoch": 0.18925231042280088, + "grad_norm": 0.2802911400794983, + "learning_rate": 1.8283948795555934e-05, + "loss": 0.2727, + "step": 10194 + }, + { + "epoch": 0.1892894405602195, + "grad_norm": 0.34536829590797424, + "learning_rate": 1.828329533580314e-05, + "loss": 0.5455, + "step": 10196 + }, + { + "epoch": 0.18932657069763814, + "grad_norm": 0.24384969472885132, + "learning_rate": 1.828264176333951e-05, + "loss": 0.3243, + "step": 10198 + }, + { + "epoch": 0.1893637008350568, + "grad_norm": 0.29854816198349, + "learning_rate": 1.8281988078173934e-05, + "loss": 0.4068, + "step": 10200 + }, + { + "epoch": 0.18940083097247543, + "grad_norm": 0.28660720586776733, + "learning_rate": 1.8281334280315305e-05, + "loss": 0.104, + "step": 10202 + }, + { + "epoch": 0.18943796110989405, + "grad_norm": 0.37110835313796997, + "learning_rate": 1.8280680369772526e-05, + "loss": 0.3304, + "step": 10204 + }, + { + "epoch": 0.1894750912473127, + "grad_norm": 0.3357993960380554, + "learning_rate": 1.828002634655449e-05, + "loss": 0.0829, + "step": 10206 + }, + { + "epoch": 0.18951222138473134, + "grad_norm": 0.3750956356525421, + "learning_rate": 1.8279372210670098e-05, + "loss": 0.162, + "step": 10208 + }, + { + "epoch": 0.18954935152215, + "grad_norm": 0.3358430862426758, + "learning_rate": 1.8278717962128246e-05, + "loss": 0.2304, + "step": 10210 + }, + { + "epoch": 0.18958648165956862, + "grad_norm": 0.3922125995159149, + "learning_rate": 1.827806360093784e-05, + "loss": 0.4568, + "step": 10212 + }, + { + "epoch": 0.18962361179698725, + "grad_norm": 0.3021722733974457, + "learning_rate": 1.8277409127107787e-05, + "loss": 0.2651, + "step": 10214 + }, + { + "epoch": 0.1896607419344059, + "grad_norm": 0.3054031431674957, + "learning_rate": 1.827675454064699e-05, + "loss": 0.3145, + "step": 10216 + }, + { + "epoch": 0.18969787207182454, + "grad_norm": 0.36788761615753174, + "learning_rate": 1.8276099841564353e-05, + "loss": 0.2488, + "step": 10218 + }, + { + "epoch": 0.18973500220924316, + "grad_norm": 0.33292147517204285, + "learning_rate": 1.8275445029868788e-05, + "loss": 0.1316, + "step": 10220 + }, + { + "epoch": 0.18977213234666182, + "grad_norm": 0.4965655207633972, + "learning_rate": 1.8274790105569205e-05, + "loss": 0.4831, + "step": 10222 + }, + { + "epoch": 0.18980926248408045, + "grad_norm": 0.3368760049343109, + "learning_rate": 1.827413506867451e-05, + "loss": 0.2015, + "step": 10224 + }, + { + "epoch": 0.1898463926214991, + "grad_norm": 0.37184277176856995, + "learning_rate": 1.827347991919363e-05, + "loss": 0.388, + "step": 10226 + }, + { + "epoch": 0.18988352275891773, + "grad_norm": 0.2848564088344574, + "learning_rate": 1.8272824657135463e-05, + "loss": 0.2419, + "step": 10228 + }, + { + "epoch": 0.18992065289633636, + "grad_norm": 0.4152754545211792, + "learning_rate": 1.827216928250893e-05, + "loss": 0.3988, + "step": 10230 + }, + { + "epoch": 0.18995778303375502, + "grad_norm": 0.3655049204826355, + "learning_rate": 1.8271513795322956e-05, + "loss": 0.2914, + "step": 10232 + }, + { + "epoch": 0.18999491317117365, + "grad_norm": 0.3804266154766083, + "learning_rate": 1.8270858195586453e-05, + "loss": 0.4918, + "step": 10234 + }, + { + "epoch": 0.19003204330859227, + "grad_norm": 0.44650667905807495, + "learning_rate": 1.827020248330835e-05, + "loss": 0.5099, + "step": 10236 + }, + { + "epoch": 0.19006917344601093, + "grad_norm": 0.5515437722206116, + "learning_rate": 1.8269546658497556e-05, + "loss": 0.2801, + "step": 10238 + }, + { + "epoch": 0.19010630358342956, + "grad_norm": 0.4348154067993164, + "learning_rate": 1.8268890721163007e-05, + "loss": 0.3094, + "step": 10240 + }, + { + "epoch": 0.1901434337208482, + "grad_norm": 0.4531700909137726, + "learning_rate": 1.826823467131362e-05, + "loss": 0.3214, + "step": 10242 + }, + { + "epoch": 0.19018056385826684, + "grad_norm": 0.6516050100326538, + "learning_rate": 1.8267578508958324e-05, + "loss": 0.3586, + "step": 10244 + }, + { + "epoch": 0.19021769399568547, + "grad_norm": 0.47056853771209717, + "learning_rate": 1.8266922234106052e-05, + "loss": 0.2819, + "step": 10246 + }, + { + "epoch": 0.19025482413310413, + "grad_norm": 0.42877843976020813, + "learning_rate": 1.826626584676573e-05, + "loss": 0.3191, + "step": 10248 + }, + { + "epoch": 0.19029195427052276, + "grad_norm": 0.35493627190589905, + "learning_rate": 1.8265609346946292e-05, + "loss": 0.2854, + "step": 10250 + }, + { + "epoch": 0.19032908440794138, + "grad_norm": 0.38031259179115295, + "learning_rate": 1.8264952734656667e-05, + "loss": 0.3722, + "step": 10252 + }, + { + "epoch": 0.19036621454536004, + "grad_norm": 0.37702032923698425, + "learning_rate": 1.8264296009905792e-05, + "loss": 0.3663, + "step": 10254 + }, + { + "epoch": 0.19040334468277867, + "grad_norm": 0.31660595536231995, + "learning_rate": 1.82636391727026e-05, + "loss": 0.2031, + "step": 10256 + }, + { + "epoch": 0.1904404748201973, + "grad_norm": 0.4001532196998596, + "learning_rate": 1.8262982223056035e-05, + "loss": 0.297, + "step": 10258 + }, + { + "epoch": 0.19047760495761595, + "grad_norm": 0.2738375961780548, + "learning_rate": 1.8262325160975032e-05, + "loss": 0.2355, + "step": 10260 + }, + { + "epoch": 0.19051473509503458, + "grad_norm": 0.3207013010978699, + "learning_rate": 1.8261667986468534e-05, + "loss": 0.3822, + "step": 10262 + }, + { + "epoch": 0.19055186523245324, + "grad_norm": 0.3424219787120819, + "learning_rate": 1.8261010699545477e-05, + "loss": 0.4224, + "step": 10264 + }, + { + "epoch": 0.19058899536987187, + "grad_norm": 0.5419808626174927, + "learning_rate": 1.826035330021481e-05, + "loss": 0.3046, + "step": 10266 + }, + { + "epoch": 0.1906261255072905, + "grad_norm": 0.4402081370353699, + "learning_rate": 1.8259695788485478e-05, + "loss": 0.183, + "step": 10268 + }, + { + "epoch": 0.19066325564470915, + "grad_norm": 0.42075181007385254, + "learning_rate": 1.8259038164366428e-05, + "loss": 0.3375, + "step": 10270 + }, + { + "epoch": 0.19070038578212778, + "grad_norm": 0.3891755938529968, + "learning_rate": 1.8258380427866608e-05, + "loss": 0.3046, + "step": 10272 + }, + { + "epoch": 0.1907375159195464, + "grad_norm": 0.29503315687179565, + "learning_rate": 1.825772257899496e-05, + "loss": 0.2603, + "step": 10274 + }, + { + "epoch": 0.19077464605696506, + "grad_norm": 0.5955208539962769, + "learning_rate": 1.825706461776045e-05, + "loss": 0.2174, + "step": 10276 + }, + { + "epoch": 0.1908117761943837, + "grad_norm": 0.5589277744293213, + "learning_rate": 1.8256406544172024e-05, + "loss": 0.2449, + "step": 10278 + }, + { + "epoch": 0.19084890633180232, + "grad_norm": 0.35382336378097534, + "learning_rate": 1.8255748358238633e-05, + "loss": 0.3601, + "step": 10280 + }, + { + "epoch": 0.19088603646922098, + "grad_norm": 0.4089867174625397, + "learning_rate": 1.8255090059969235e-05, + "loss": 0.324, + "step": 10282 + }, + { + "epoch": 0.1909231666066396, + "grad_norm": 0.2896486520767212, + "learning_rate": 1.825443164937279e-05, + "loss": 0.2961, + "step": 10284 + }, + { + "epoch": 0.19096029674405826, + "grad_norm": 0.5487931966781616, + "learning_rate": 1.825377312645825e-05, + "loss": 0.1843, + "step": 10286 + }, + { + "epoch": 0.1909974268814769, + "grad_norm": 0.3776894807815552, + "learning_rate": 1.8253114491234587e-05, + "loss": 0.2858, + "step": 10288 + }, + { + "epoch": 0.19103455701889552, + "grad_norm": 0.5502543449401855, + "learning_rate": 1.8252455743710752e-05, + "loss": 0.4484, + "step": 10290 + }, + { + "epoch": 0.19107168715631417, + "grad_norm": 0.3481937348842621, + "learning_rate": 1.8251796883895715e-05, + "loss": 0.2355, + "step": 10292 + }, + { + "epoch": 0.1911088172937328, + "grad_norm": 0.3464677929878235, + "learning_rate": 1.825113791179844e-05, + "loss": 0.4775, + "step": 10294 + }, + { + "epoch": 0.19114594743115143, + "grad_norm": 0.3349195122718811, + "learning_rate": 1.8250478827427893e-05, + "loss": 0.3036, + "step": 10296 + }, + { + "epoch": 0.1911830775685701, + "grad_norm": 0.3453654646873474, + "learning_rate": 1.8249819630793044e-05, + "loss": 0.2553, + "step": 10298 + }, + { + "epoch": 0.19122020770598872, + "grad_norm": 0.40591198205947876, + "learning_rate": 1.824916032190286e-05, + "loss": 0.3128, + "step": 10300 + }, + { + "epoch": 0.19125733784340737, + "grad_norm": 0.521061360836029, + "learning_rate": 1.8248500900766312e-05, + "loss": 0.3342, + "step": 10302 + }, + { + "epoch": 0.191294467980826, + "grad_norm": 0.5649065375328064, + "learning_rate": 1.8247841367392373e-05, + "loss": 0.3777, + "step": 10304 + }, + { + "epoch": 0.19133159811824463, + "grad_norm": 0.4497404396533966, + "learning_rate": 1.824718172179002e-05, + "loss": 0.3101, + "step": 10306 + }, + { + "epoch": 0.19136872825566328, + "grad_norm": 0.4043754041194916, + "learning_rate": 1.8246521963968224e-05, + "loss": 0.2547, + "step": 10308 + }, + { + "epoch": 0.1914058583930819, + "grad_norm": 0.318892240524292, + "learning_rate": 1.8245862093935968e-05, + "loss": 0.3594, + "step": 10310 + }, + { + "epoch": 0.19144298853050054, + "grad_norm": 0.28512245416641235, + "learning_rate": 1.8245202111702228e-05, + "loss": 0.3597, + "step": 10312 + }, + { + "epoch": 0.1914801186679192, + "grad_norm": 0.3797457218170166, + "learning_rate": 1.8244542017275985e-05, + "loss": 0.3199, + "step": 10314 + }, + { + "epoch": 0.19151724880533783, + "grad_norm": 0.23189327120780945, + "learning_rate": 1.8243881810666216e-05, + "loss": 0.3123, + "step": 10316 + }, + { + "epoch": 0.19155437894275645, + "grad_norm": 0.2917740046977997, + "learning_rate": 1.824322149188191e-05, + "loss": 0.3349, + "step": 10318 + }, + { + "epoch": 0.1915915090801751, + "grad_norm": 0.4230726957321167, + "learning_rate": 1.8242561060932054e-05, + "loss": 0.397, + "step": 10320 + }, + { + "epoch": 0.19162863921759374, + "grad_norm": 0.22096633911132812, + "learning_rate": 1.8241900517825636e-05, + "loss": 0.3303, + "step": 10322 + }, + { + "epoch": 0.1916657693550124, + "grad_norm": 0.3386573791503906, + "learning_rate": 1.824123986257163e-05, + "loss": 0.3678, + "step": 10324 + }, + { + "epoch": 0.19170289949243102, + "grad_norm": 0.3953254520893097, + "learning_rate": 1.824057909517904e-05, + "loss": 0.2924, + "step": 10326 + }, + { + "epoch": 0.19174002962984965, + "grad_norm": 0.1699492484331131, + "learning_rate": 1.823991821565685e-05, + "loss": 0.2123, + "step": 10328 + }, + { + "epoch": 0.1917771597672683, + "grad_norm": 0.38965439796447754, + "learning_rate": 1.8239257224014054e-05, + "loss": 0.364, + "step": 10330 + }, + { + "epoch": 0.19181428990468694, + "grad_norm": 0.42472290992736816, + "learning_rate": 1.8238596120259648e-05, + "loss": 0.4063, + "step": 10332 + }, + { + "epoch": 0.19185142004210556, + "grad_norm": 0.34790465235710144, + "learning_rate": 1.8237934904402624e-05, + "loss": 0.4023, + "step": 10334 + }, + { + "epoch": 0.19188855017952422, + "grad_norm": 0.3114390969276428, + "learning_rate": 1.8237273576451984e-05, + "loss": 0.6155, + "step": 10336 + }, + { + "epoch": 0.19192568031694285, + "grad_norm": 0.29550909996032715, + "learning_rate": 1.8236612136416723e-05, + "loss": 0.562, + "step": 10338 + }, + { + "epoch": 0.1919628104543615, + "grad_norm": 0.46728673577308655, + "learning_rate": 1.8235950584305844e-05, + "loss": 0.4387, + "step": 10340 + }, + { + "epoch": 0.19199994059178013, + "grad_norm": 0.4299033284187317, + "learning_rate": 1.8235288920128345e-05, + "loss": 0.3679, + "step": 10342 + }, + { + "epoch": 0.19203707072919876, + "grad_norm": 0.42623013257980347, + "learning_rate": 1.8234627143893232e-05, + "loss": 0.3048, + "step": 10344 + }, + { + "epoch": 0.19207420086661742, + "grad_norm": 0.31752580404281616, + "learning_rate": 1.8233965255609508e-05, + "loss": 0.3704, + "step": 10346 + }, + { + "epoch": 0.19211133100403605, + "grad_norm": 0.36908331513404846, + "learning_rate": 1.8233303255286185e-05, + "loss": 0.2221, + "step": 10348 + }, + { + "epoch": 0.19214846114145467, + "grad_norm": 0.4320656955242157, + "learning_rate": 1.823264114293226e-05, + "loss": 0.2494, + "step": 10350 + }, + { + "epoch": 0.19218559127887333, + "grad_norm": 0.515095055103302, + "learning_rate": 1.8231978918556752e-05, + "loss": 0.2313, + "step": 10352 + }, + { + "epoch": 0.19222272141629196, + "grad_norm": 0.41074809432029724, + "learning_rate": 1.823131658216867e-05, + "loss": 0.3836, + "step": 10354 + }, + { + "epoch": 0.1922598515537106, + "grad_norm": 0.4301954507827759, + "learning_rate": 1.823065413377702e-05, + "loss": 0.4249, + "step": 10356 + }, + { + "epoch": 0.19229698169112924, + "grad_norm": 0.4090089201927185, + "learning_rate": 1.8229991573390828e-05, + "loss": 0.2946, + "step": 10358 + }, + { + "epoch": 0.19233411182854787, + "grad_norm": 0.40900731086730957, + "learning_rate": 1.8229328901019095e-05, + "loss": 0.3216, + "step": 10360 + }, + { + "epoch": 0.19237124196596653, + "grad_norm": 0.3969988524913788, + "learning_rate": 1.822866611667085e-05, + "loss": 0.4612, + "step": 10362 + }, + { + "epoch": 0.19240837210338516, + "grad_norm": 0.37536633014678955, + "learning_rate": 1.8228003220355103e-05, + "loss": 0.3757, + "step": 10364 + }, + { + "epoch": 0.19244550224080378, + "grad_norm": 0.22825933992862701, + "learning_rate": 1.8227340212080885e-05, + "loss": 0.3799, + "step": 10366 + }, + { + "epoch": 0.19248263237822244, + "grad_norm": 0.4094143211841583, + "learning_rate": 1.8226677091857203e-05, + "loss": 0.1981, + "step": 10368 + }, + { + "epoch": 0.19251976251564107, + "grad_norm": 0.32274430990219116, + "learning_rate": 1.822601385969309e-05, + "loss": 0.3365, + "step": 10370 + }, + { + "epoch": 0.1925568926530597, + "grad_norm": 0.36306002736091614, + "learning_rate": 1.8225350515597568e-05, + "loss": 0.3687, + "step": 10372 + }, + { + "epoch": 0.19259402279047835, + "grad_norm": 0.37292805314064026, + "learning_rate": 1.8224687059579667e-05, + "loss": 0.2193, + "step": 10374 + }, + { + "epoch": 0.19263115292789698, + "grad_norm": 0.3921237885951996, + "learning_rate": 1.8224023491648406e-05, + "loss": 0.3511, + "step": 10376 + }, + { + "epoch": 0.19266828306531564, + "grad_norm": 0.31636109948158264, + "learning_rate": 1.822335981181282e-05, + "loss": 0.1889, + "step": 10378 + }, + { + "epoch": 0.19270541320273427, + "grad_norm": 0.40478530526161194, + "learning_rate": 1.822269602008194e-05, + "loss": 0.3274, + "step": 10380 + }, + { + "epoch": 0.1927425433401529, + "grad_norm": 0.43385058641433716, + "learning_rate": 1.8222032116464798e-05, + "loss": 0.2702, + "step": 10382 + }, + { + "epoch": 0.19277967347757155, + "grad_norm": 0.47108379006385803, + "learning_rate": 1.8221368100970424e-05, + "loss": 0.2186, + "step": 10384 + }, + { + "epoch": 0.19281680361499018, + "grad_norm": 0.35603728890419006, + "learning_rate": 1.8220703973607857e-05, + "loss": 0.3223, + "step": 10386 + }, + { + "epoch": 0.1928539337524088, + "grad_norm": 0.22544145584106445, + "learning_rate": 1.8220039734386136e-05, + "loss": 0.3447, + "step": 10388 + }, + { + "epoch": 0.19289106388982746, + "grad_norm": 0.5928962826728821, + "learning_rate": 1.821937538331429e-05, + "loss": 0.2882, + "step": 10390 + }, + { + "epoch": 0.1929281940272461, + "grad_norm": 0.3926587998867035, + "learning_rate": 1.821871092040137e-05, + "loss": 0.2627, + "step": 10392 + }, + { + "epoch": 0.19296532416466472, + "grad_norm": 0.3659384846687317, + "learning_rate": 1.821804634565641e-05, + "loss": 0.2562, + "step": 10394 + }, + { + "epoch": 0.19300245430208338, + "grad_norm": 0.26231399178504944, + "learning_rate": 1.8217381659088453e-05, + "loss": 0.3167, + "step": 10396 + }, + { + "epoch": 0.193039584439502, + "grad_norm": 0.3123832046985626, + "learning_rate": 1.8216716860706548e-05, + "loss": 0.2726, + "step": 10398 + }, + { + "epoch": 0.19307671457692066, + "grad_norm": 0.4036448299884796, + "learning_rate": 1.821605195051974e-05, + "loss": 0.2602, + "step": 10400 + }, + { + "epoch": 0.1931138447143393, + "grad_norm": 0.3967619836330414, + "learning_rate": 1.8215386928537073e-05, + "loss": 0.2901, + "step": 10402 + }, + { + "epoch": 0.19315097485175792, + "grad_norm": 0.29481208324432373, + "learning_rate": 1.8214721794767597e-05, + "loss": 0.3315, + "step": 10404 + }, + { + "epoch": 0.19318810498917657, + "grad_norm": 0.3918724060058594, + "learning_rate": 1.8214056549220366e-05, + "loss": 0.3605, + "step": 10406 + }, + { + "epoch": 0.1932252351265952, + "grad_norm": 0.4407624900341034, + "learning_rate": 1.8213391191904424e-05, + "loss": 0.37, + "step": 10408 + }, + { + "epoch": 0.19326236526401383, + "grad_norm": 0.17345456779003143, + "learning_rate": 1.8212725722828838e-05, + "loss": 0.1097, + "step": 10410 + }, + { + "epoch": 0.1932994954014325, + "grad_norm": 0.3316115140914917, + "learning_rate": 1.821206014200265e-05, + "loss": 0.2392, + "step": 10412 + }, + { + "epoch": 0.19333662553885111, + "grad_norm": 0.39465439319610596, + "learning_rate": 1.821139444943492e-05, + "loss": 0.2555, + "step": 10414 + }, + { + "epoch": 0.19337375567626977, + "grad_norm": 0.4078032374382019, + "learning_rate": 1.821072864513471e-05, + "loss": 0.4078, + "step": 10416 + }, + { + "epoch": 0.1934108858136884, + "grad_norm": 0.340839684009552, + "learning_rate": 1.8210062729111077e-05, + "loss": 0.1896, + "step": 10418 + }, + { + "epoch": 0.19344801595110703, + "grad_norm": 0.2955494225025177, + "learning_rate": 1.820939670137308e-05, + "loss": 0.2723, + "step": 10420 + }, + { + "epoch": 0.19348514608852568, + "grad_norm": 0.39263802766799927, + "learning_rate": 1.8208730561929788e-05, + "loss": 0.295, + "step": 10422 + }, + { + "epoch": 0.1935222762259443, + "grad_norm": 0.34953704476356506, + "learning_rate": 1.820806431079026e-05, + "loss": 0.36, + "step": 10424 + }, + { + "epoch": 0.19355940636336294, + "grad_norm": 0.44238829612731934, + "learning_rate": 1.820739794796356e-05, + "loss": 0.2015, + "step": 10426 + }, + { + "epoch": 0.1935965365007816, + "grad_norm": 0.34842196106910706, + "learning_rate": 1.8206731473458757e-05, + "loss": 0.2409, + "step": 10428 + }, + { + "epoch": 0.19363366663820022, + "grad_norm": 0.40157032012939453, + "learning_rate": 1.8206064887284925e-05, + "loss": 0.3887, + "step": 10430 + }, + { + "epoch": 0.19367079677561885, + "grad_norm": 0.3731132447719574, + "learning_rate": 1.8205398189451128e-05, + "loss": 0.289, + "step": 10432 + }, + { + "epoch": 0.1937079269130375, + "grad_norm": 0.35805171728134155, + "learning_rate": 1.8204731379966437e-05, + "loss": 0.4443, + "step": 10434 + }, + { + "epoch": 0.19374505705045614, + "grad_norm": 0.4544253945350647, + "learning_rate": 1.8204064458839932e-05, + "loss": 0.4354, + "step": 10436 + }, + { + "epoch": 0.1937821871878748, + "grad_norm": 0.4127708971500397, + "learning_rate": 1.8203397426080684e-05, + "loss": 0.3682, + "step": 10438 + }, + { + "epoch": 0.19381931732529342, + "grad_norm": 0.3067456781864166, + "learning_rate": 1.8202730281697767e-05, + "loss": 0.2851, + "step": 10440 + }, + { + "epoch": 0.19385644746271205, + "grad_norm": 0.4010867774486542, + "learning_rate": 1.8202063025700262e-05, + "loss": 0.3113, + "step": 10442 + }, + { + "epoch": 0.1938935776001307, + "grad_norm": 0.35063600540161133, + "learning_rate": 1.8201395658097245e-05, + "loss": 0.236, + "step": 10444 + }, + { + "epoch": 0.19393070773754933, + "grad_norm": 0.32037413120269775, + "learning_rate": 1.82007281788978e-05, + "loss": 0.2814, + "step": 10446 + }, + { + "epoch": 0.19396783787496796, + "grad_norm": 0.3340356647968292, + "learning_rate": 1.820006058811101e-05, + "loss": 0.5214, + "step": 10448 + }, + { + "epoch": 0.19400496801238662, + "grad_norm": 0.2888498306274414, + "learning_rate": 1.8199392885745957e-05, + "loss": 0.2936, + "step": 10450 + }, + { + "epoch": 0.19404209814980525, + "grad_norm": 0.3157704770565033, + "learning_rate": 1.8198725071811725e-05, + "loss": 0.1533, + "step": 10452 + }, + { + "epoch": 0.1940792282872239, + "grad_norm": 0.31509166955947876, + "learning_rate": 1.8198057146317407e-05, + "loss": 0.3485, + "step": 10454 + }, + { + "epoch": 0.19411635842464253, + "grad_norm": 0.42326149344444275, + "learning_rate": 1.819738910927208e-05, + "loss": 0.3972, + "step": 10456 + }, + { + "epoch": 0.19415348856206116, + "grad_norm": 0.3669712245464325, + "learning_rate": 1.819672096068485e-05, + "loss": 0.2087, + "step": 10458 + }, + { + "epoch": 0.19419061869947982, + "grad_norm": 0.3648996651172638, + "learning_rate": 1.819605270056479e-05, + "loss": 0.2442, + "step": 10460 + }, + { + "epoch": 0.19422774883689845, + "grad_norm": 0.27890896797180176, + "learning_rate": 1.819538432892101e-05, + "loss": 0.1599, + "step": 10462 + }, + { + "epoch": 0.19426487897431707, + "grad_norm": 0.3570995032787323, + "learning_rate": 1.8194715845762596e-05, + "loss": 0.4027, + "step": 10464 + }, + { + "epoch": 0.19430200911173573, + "grad_norm": 0.2493124157190323, + "learning_rate": 1.8194047251098646e-05, + "loss": 0.3901, + "step": 10466 + }, + { + "epoch": 0.19433913924915436, + "grad_norm": 0.43931853771209717, + "learning_rate": 1.8193378544938254e-05, + "loss": 0.1128, + "step": 10468 + }, + { + "epoch": 0.194376269386573, + "grad_norm": 0.39804863929748535, + "learning_rate": 1.8192709727290526e-05, + "loss": 0.2803, + "step": 10470 + }, + { + "epoch": 0.19441339952399164, + "grad_norm": 0.3732399642467499, + "learning_rate": 1.8192040798164554e-05, + "loss": 0.2847, + "step": 10472 + }, + { + "epoch": 0.19445052966141027, + "grad_norm": 0.2559812366962433, + "learning_rate": 1.819137175756945e-05, + "loss": 0.3249, + "step": 10474 + }, + { + "epoch": 0.19448765979882893, + "grad_norm": 0.2886560261249542, + "learning_rate": 1.8190702605514307e-05, + "loss": 0.3991, + "step": 10476 + }, + { + "epoch": 0.19452478993624756, + "grad_norm": 0.32769614458084106, + "learning_rate": 1.819003334200824e-05, + "loss": 0.2013, + "step": 10478 + }, + { + "epoch": 0.19456192007366618, + "grad_norm": 0.42437949776649475, + "learning_rate": 1.8189363967060346e-05, + "loss": 0.3614, + "step": 10480 + }, + { + "epoch": 0.19459905021108484, + "grad_norm": 0.3976858854293823, + "learning_rate": 1.8188694480679743e-05, + "loss": 0.44, + "step": 10482 + }, + { + "epoch": 0.19463618034850347, + "grad_norm": 0.4024472236633301, + "learning_rate": 1.8188024882875534e-05, + "loss": 0.3528, + "step": 10484 + }, + { + "epoch": 0.1946733104859221, + "grad_norm": 0.4627920389175415, + "learning_rate": 1.8187355173656836e-05, + "loss": 0.1862, + "step": 10486 + }, + { + "epoch": 0.19471044062334075, + "grad_norm": 0.413394570350647, + "learning_rate": 1.8186685353032753e-05, + "loss": 0.3557, + "step": 10488 + }, + { + "epoch": 0.19474757076075938, + "grad_norm": 0.4072570502758026, + "learning_rate": 1.8186015421012406e-05, + "loss": 0.2602, + "step": 10490 + }, + { + "epoch": 0.19478470089817804, + "grad_norm": 0.42197901010513306, + "learning_rate": 1.818534537760491e-05, + "loss": 0.3215, + "step": 10492 + }, + { + "epoch": 0.19482183103559667, + "grad_norm": 0.3577510714530945, + "learning_rate": 1.818467522281938e-05, + "loss": 0.2728, + "step": 10494 + }, + { + "epoch": 0.1948589611730153, + "grad_norm": 0.344599187374115, + "learning_rate": 1.8184004956664938e-05, + "loss": 0.4647, + "step": 10496 + }, + { + "epoch": 0.19489609131043395, + "grad_norm": 0.4934121370315552, + "learning_rate": 1.8183334579150703e-05, + "loss": 0.3121, + "step": 10498 + }, + { + "epoch": 0.19493322144785258, + "grad_norm": 0.24618367850780487, + "learning_rate": 1.8182664090285797e-05, + "loss": 0.4257, + "step": 10500 + }, + { + "epoch": 0.1949703515852712, + "grad_norm": 0.20650441944599152, + "learning_rate": 1.818199349007934e-05, + "loss": 0.1588, + "step": 10502 + }, + { + "epoch": 0.19500748172268986, + "grad_norm": 0.3634934723377228, + "learning_rate": 1.8181322778540455e-05, + "loss": 0.3875, + "step": 10504 + }, + { + "epoch": 0.1950446118601085, + "grad_norm": 0.3191303312778473, + "learning_rate": 1.818065195567828e-05, + "loss": 0.1349, + "step": 10506 + }, + { + "epoch": 0.19508174199752712, + "grad_norm": 0.46026158332824707, + "learning_rate": 1.8179981021501935e-05, + "loss": 0.3438, + "step": 10508 + }, + { + "epoch": 0.19511887213494578, + "grad_norm": 0.3885361850261688, + "learning_rate": 1.8179309976020546e-05, + "loss": 0.4297, + "step": 10510 + }, + { + "epoch": 0.1951560022723644, + "grad_norm": 0.32798442244529724, + "learning_rate": 1.8178638819243253e-05, + "loss": 0.3166, + "step": 10512 + }, + { + "epoch": 0.19519313240978306, + "grad_norm": 0.31509965658187866, + "learning_rate": 1.817796755117918e-05, + "loss": 0.2445, + "step": 10514 + }, + { + "epoch": 0.1952302625472017, + "grad_norm": 0.31694459915161133, + "learning_rate": 1.8177296171837468e-05, + "loss": 0.3619, + "step": 10516 + }, + { + "epoch": 0.19526739268462032, + "grad_norm": 0.618583619594574, + "learning_rate": 1.8176624681227248e-05, + "loss": 0.3338, + "step": 10518 + }, + { + "epoch": 0.19530452282203897, + "grad_norm": 0.3839055001735687, + "learning_rate": 1.8175953079357654e-05, + "loss": 0.225, + "step": 10520 + }, + { + "epoch": 0.1953416529594576, + "grad_norm": 0.4258374571800232, + "learning_rate": 1.8175281366237832e-05, + "loss": 0.3165, + "step": 10522 + }, + { + "epoch": 0.19537878309687623, + "grad_norm": 0.23897075653076172, + "learning_rate": 1.817460954187692e-05, + "loss": 0.3364, + "step": 10524 + }, + { + "epoch": 0.19541591323429489, + "grad_norm": 0.3382510840892792, + "learning_rate": 1.8173937606284053e-05, + "loss": 0.1694, + "step": 10526 + }, + { + "epoch": 0.19545304337171351, + "grad_norm": 0.6898068189620972, + "learning_rate": 1.817326555946838e-05, + "loss": 0.3891, + "step": 10528 + }, + { + "epoch": 0.19549017350913217, + "grad_norm": 0.3281741440296173, + "learning_rate": 1.8172593401439048e-05, + "loss": 0.342, + "step": 10530 + }, + { + "epoch": 0.1955273036465508, + "grad_norm": 0.4617529511451721, + "learning_rate": 1.8171921132205196e-05, + "loss": 0.3762, + "step": 10532 + }, + { + "epoch": 0.19556443378396943, + "grad_norm": 0.3313845098018646, + "learning_rate": 1.8171248751775975e-05, + "loss": 0.3025, + "step": 10534 + }, + { + "epoch": 0.19560156392138808, + "grad_norm": 0.3868240416049957, + "learning_rate": 1.8170576260160535e-05, + "loss": 0.3407, + "step": 10536 + }, + { + "epoch": 0.1956386940588067, + "grad_norm": 0.3786194622516632, + "learning_rate": 1.816990365736803e-05, + "loss": 0.4106, + "step": 10538 + }, + { + "epoch": 0.19567582419622534, + "grad_norm": 0.295289546251297, + "learning_rate": 1.8169230943407605e-05, + "loss": 0.4138, + "step": 10540 + }, + { + "epoch": 0.195712954333644, + "grad_norm": 0.38305091857910156, + "learning_rate": 1.8168558118288415e-05, + "loss": 0.1935, + "step": 10542 + }, + { + "epoch": 0.19575008447106262, + "grad_norm": 0.27974823117256165, + "learning_rate": 1.8167885182019616e-05, + "loss": 0.3582, + "step": 10544 + }, + { + "epoch": 0.19578721460848125, + "grad_norm": 0.33655616641044617, + "learning_rate": 1.8167212134610368e-05, + "loss": 0.321, + "step": 10546 + }, + { + "epoch": 0.1958243447458999, + "grad_norm": 0.35231056809425354, + "learning_rate": 1.8166538976069825e-05, + "loss": 0.3232, + "step": 10548 + }, + { + "epoch": 0.19586147488331854, + "grad_norm": 0.3734198808670044, + "learning_rate": 1.8165865706407147e-05, + "loss": 0.3575, + "step": 10550 + }, + { + "epoch": 0.1958986050207372, + "grad_norm": 0.32524755597114563, + "learning_rate": 1.81651923256315e-05, + "loss": 0.2206, + "step": 10552 + }, + { + "epoch": 0.19593573515815582, + "grad_norm": 0.3715626299381256, + "learning_rate": 1.816451883375204e-05, + "loss": 0.3282, + "step": 10554 + }, + { + "epoch": 0.19597286529557445, + "grad_norm": 0.4585845470428467, + "learning_rate": 1.816384523077794e-05, + "loss": 0.3723, + "step": 10556 + }, + { + "epoch": 0.1960099954329931, + "grad_norm": 0.4712212085723877, + "learning_rate": 1.816317151671835e-05, + "loss": 0.3035, + "step": 10558 + }, + { + "epoch": 0.19604712557041173, + "grad_norm": 0.41906657814979553, + "learning_rate": 1.8162497691582455e-05, + "loss": 0.3311, + "step": 10560 + }, + { + "epoch": 0.19608425570783036, + "grad_norm": 0.34197336435317993, + "learning_rate": 1.8161823755379414e-05, + "loss": 0.4088, + "step": 10562 + }, + { + "epoch": 0.19612138584524902, + "grad_norm": 0.4386694133281708, + "learning_rate": 1.8161149708118397e-05, + "loss": 0.2722, + "step": 10564 + }, + { + "epoch": 0.19615851598266765, + "grad_norm": 0.2498355656862259, + "learning_rate": 1.816047554980858e-05, + "loss": 0.2435, + "step": 10566 + }, + { + "epoch": 0.1961956461200863, + "grad_norm": 0.29070866107940674, + "learning_rate": 1.8159801280459132e-05, + "loss": 0.4648, + "step": 10568 + }, + { + "epoch": 0.19623277625750493, + "grad_norm": 0.32423922419548035, + "learning_rate": 1.8159126900079234e-05, + "loss": 0.4451, + "step": 10570 + }, + { + "epoch": 0.19626990639492356, + "grad_norm": 0.3282385766506195, + "learning_rate": 1.8158452408678057e-05, + "loss": 0.2586, + "step": 10572 + }, + { + "epoch": 0.19630703653234222, + "grad_norm": 0.5702826380729675, + "learning_rate": 1.8157777806264775e-05, + "loss": 0.2844, + "step": 10574 + }, + { + "epoch": 0.19634416666976084, + "grad_norm": 0.31377604603767395, + "learning_rate": 1.815710309284858e-05, + "loss": 0.4656, + "step": 10576 + }, + { + "epoch": 0.19638129680717947, + "grad_norm": 0.4903489649295807, + "learning_rate": 1.815642826843864e-05, + "loss": 0.2663, + "step": 10578 + }, + { + "epoch": 0.19641842694459813, + "grad_norm": 0.4795646369457245, + "learning_rate": 1.8155753333044144e-05, + "loss": 0.4595, + "step": 10580 + }, + { + "epoch": 0.19645555708201676, + "grad_norm": 0.26642128825187683, + "learning_rate": 1.8155078286674274e-05, + "loss": 0.3076, + "step": 10582 + }, + { + "epoch": 0.19649268721943539, + "grad_norm": 0.2655535936355591, + "learning_rate": 1.815440312933822e-05, + "loss": 0.1803, + "step": 10584 + }, + { + "epoch": 0.19652981735685404, + "grad_norm": 0.39043155312538147, + "learning_rate": 1.815372786104516e-05, + "loss": 0.3942, + "step": 10586 + }, + { + "epoch": 0.19656694749427267, + "grad_norm": 0.3451046049594879, + "learning_rate": 1.8153052481804286e-05, + "loss": 0.1798, + "step": 10588 + }, + { + "epoch": 0.19660407763169133, + "grad_norm": 0.27846482396125793, + "learning_rate": 1.8152376991624795e-05, + "loss": 0.4045, + "step": 10590 + }, + { + "epoch": 0.19664120776910995, + "grad_norm": 0.310319721698761, + "learning_rate": 1.8151701390515867e-05, + "loss": 0.3243, + "step": 10592 + }, + { + "epoch": 0.19667833790652858, + "grad_norm": 0.3808184564113617, + "learning_rate": 1.81510256784867e-05, + "loss": 0.3467, + "step": 10594 + }, + { + "epoch": 0.19671546804394724, + "grad_norm": 0.32306984066963196, + "learning_rate": 1.8150349855546494e-05, + "loss": 0.4589, + "step": 10596 + }, + { + "epoch": 0.19675259818136587, + "grad_norm": 0.6907929182052612, + "learning_rate": 1.8149673921704437e-05, + "loss": 0.3387, + "step": 10598 + }, + { + "epoch": 0.1967897283187845, + "grad_norm": 0.41490477323532104, + "learning_rate": 1.8148997876969728e-05, + "loss": 0.2975, + "step": 10600 + }, + { + "epoch": 0.19682685845620315, + "grad_norm": 0.354691743850708, + "learning_rate": 1.8148321721351565e-05, + "loss": 0.2463, + "step": 10602 + }, + { + "epoch": 0.19686398859362178, + "grad_norm": 0.326217919588089, + "learning_rate": 1.8147645454859154e-05, + "loss": 0.3271, + "step": 10604 + }, + { + "epoch": 0.19690111873104044, + "grad_norm": 0.4011254906654358, + "learning_rate": 1.814696907750169e-05, + "loss": 0.2554, + "step": 10606 + }, + { + "epoch": 0.19693824886845906, + "grad_norm": 0.49285557866096497, + "learning_rate": 1.814629258928838e-05, + "loss": 0.487, + "step": 10608 + }, + { + "epoch": 0.1969753790058777, + "grad_norm": 0.4032011926174164, + "learning_rate": 1.8145615990228433e-05, + "loss": 0.3717, + "step": 10610 + }, + { + "epoch": 0.19701250914329635, + "grad_norm": 0.2815262973308563, + "learning_rate": 1.814493928033105e-05, + "loss": 0.2501, + "step": 10612 + }, + { + "epoch": 0.19704963928071498, + "grad_norm": 0.4225890338420868, + "learning_rate": 1.814426245960544e-05, + "loss": 0.3522, + "step": 10614 + }, + { + "epoch": 0.1970867694181336, + "grad_norm": 0.3081131875514984, + "learning_rate": 1.814358552806081e-05, + "loss": 0.3521, + "step": 10616 + }, + { + "epoch": 0.19712389955555226, + "grad_norm": 0.25342971086502075, + "learning_rate": 1.8142908485706375e-05, + "loss": 0.2319, + "step": 10618 + }, + { + "epoch": 0.1971610296929709, + "grad_norm": 0.40041080117225647, + "learning_rate": 1.8142231332551348e-05, + "loss": 0.4048, + "step": 10620 + }, + { + "epoch": 0.19719815983038952, + "grad_norm": 0.42423558235168457, + "learning_rate": 1.8141554068604942e-05, + "loss": 0.2455, + "step": 10622 + }, + { + "epoch": 0.19723528996780818, + "grad_norm": 0.2633681297302246, + "learning_rate": 1.8140876693876372e-05, + "loss": 0.23, + "step": 10624 + }, + { + "epoch": 0.1972724201052268, + "grad_norm": 0.3154236078262329, + "learning_rate": 1.8140199208374852e-05, + "loss": 0.2243, + "step": 10626 + }, + { + "epoch": 0.19730955024264546, + "grad_norm": 0.4470406174659729, + "learning_rate": 1.8139521612109605e-05, + "loss": 0.2863, + "step": 10628 + }, + { + "epoch": 0.1973466803800641, + "grad_norm": 0.3068430423736572, + "learning_rate": 1.813884390508985e-05, + "loss": 0.274, + "step": 10630 + }, + { + "epoch": 0.19738381051748272, + "grad_norm": 0.2704326808452606, + "learning_rate": 1.8138166087324806e-05, + "loss": 0.2433, + "step": 10632 + }, + { + "epoch": 0.19742094065490137, + "grad_norm": 0.2957181930541992, + "learning_rate": 1.8137488158823703e-05, + "loss": 0.2194, + "step": 10634 + }, + { + "epoch": 0.19745807079232, + "grad_norm": 0.41069671511650085, + "learning_rate": 1.8136810119595756e-05, + "loss": 0.525, + "step": 10636 + }, + { + "epoch": 0.19749520092973863, + "grad_norm": 0.3662183880805969, + "learning_rate": 1.8136131969650198e-05, + "loss": 0.3967, + "step": 10638 + }, + { + "epoch": 0.19753233106715729, + "grad_norm": 0.9215697646141052, + "learning_rate": 1.8135453708996254e-05, + "loss": 0.5995, + "step": 10640 + }, + { + "epoch": 0.1975694612045759, + "grad_norm": 0.37060633301734924, + "learning_rate": 1.8134775337643155e-05, + "loss": 0.4379, + "step": 10642 + }, + { + "epoch": 0.19760659134199457, + "grad_norm": 0.2897200286388397, + "learning_rate": 1.8134096855600125e-05, + "loss": 0.2779, + "step": 10644 + }, + { + "epoch": 0.1976437214794132, + "grad_norm": 0.3277202546596527, + "learning_rate": 1.8133418262876405e-05, + "loss": 0.4352, + "step": 10646 + }, + { + "epoch": 0.19768085161683183, + "grad_norm": 0.31786489486694336, + "learning_rate": 1.8132739559481227e-05, + "loss": 0.2168, + "step": 10648 + }, + { + "epoch": 0.19771798175425048, + "grad_norm": 0.3507799208164215, + "learning_rate": 1.813206074542382e-05, + "loss": 0.3223, + "step": 10650 + }, + { + "epoch": 0.1977551118916691, + "grad_norm": 0.2890516519546509, + "learning_rate": 1.8131381820713426e-05, + "loss": 0.2937, + "step": 10652 + }, + { + "epoch": 0.19779224202908774, + "grad_norm": 0.40991395711898804, + "learning_rate": 1.8130702785359284e-05, + "loss": 0.3344, + "step": 10654 + }, + { + "epoch": 0.1978293721665064, + "grad_norm": 0.4502514600753784, + "learning_rate": 1.813002363937063e-05, + "loss": 0.3834, + "step": 10656 + }, + { + "epoch": 0.19786650230392502, + "grad_norm": 0.5486236214637756, + "learning_rate": 1.8129344382756702e-05, + "loss": 0.3912, + "step": 10658 + }, + { + "epoch": 0.19790363244134365, + "grad_norm": 0.4866623282432556, + "learning_rate": 1.8128665015526753e-05, + "loss": 0.2315, + "step": 10660 + }, + { + "epoch": 0.1979407625787623, + "grad_norm": 0.37923917174339294, + "learning_rate": 1.812798553769002e-05, + "loss": 0.1825, + "step": 10662 + }, + { + "epoch": 0.19797789271618094, + "grad_norm": 0.35919103026390076, + "learning_rate": 1.812730594925575e-05, + "loss": 0.4337, + "step": 10664 + }, + { + "epoch": 0.1980150228535996, + "grad_norm": 0.3857702612876892, + "learning_rate": 1.8126626250233185e-05, + "loss": 0.3478, + "step": 10666 + }, + { + "epoch": 0.19805215299101822, + "grad_norm": 0.44642049074172974, + "learning_rate": 1.8125946440631582e-05, + "loss": 0.355, + "step": 10668 + }, + { + "epoch": 0.19808928312843685, + "grad_norm": 0.3609551787376404, + "learning_rate": 1.8125266520460192e-05, + "loss": 0.3635, + "step": 10670 + }, + { + "epoch": 0.1981264132658555, + "grad_norm": 0.4047122001647949, + "learning_rate": 1.812458648972826e-05, + "loss": 0.4582, + "step": 10672 + }, + { + "epoch": 0.19816354340327413, + "grad_norm": 0.4076749384403229, + "learning_rate": 1.812390634844504e-05, + "loss": 0.2525, + "step": 10674 + }, + { + "epoch": 0.19820067354069276, + "grad_norm": 0.3619585335254669, + "learning_rate": 1.8123226096619792e-05, + "loss": 0.3451, + "step": 10676 + }, + { + "epoch": 0.19823780367811142, + "grad_norm": 0.5135860443115234, + "learning_rate": 1.8122545734261767e-05, + "loss": 0.4424, + "step": 10678 + }, + { + "epoch": 0.19827493381553005, + "grad_norm": 0.36556708812713623, + "learning_rate": 1.8121865261380223e-05, + "loss": 0.465, + "step": 10680 + }, + { + "epoch": 0.1983120639529487, + "grad_norm": 0.3625973165035248, + "learning_rate": 1.8121184677984424e-05, + "loss": 0.3302, + "step": 10682 + }, + { + "epoch": 0.19834919409036733, + "grad_norm": 0.43512579798698425, + "learning_rate": 1.8120503984083622e-05, + "loss": 0.2786, + "step": 10684 + }, + { + "epoch": 0.19838632422778596, + "grad_norm": 0.6174240708351135, + "learning_rate": 1.8119823179687088e-05, + "loss": 0.2174, + "step": 10686 + }, + { + "epoch": 0.19842345436520462, + "grad_norm": 0.42891427874565125, + "learning_rate": 1.811914226480408e-05, + "loss": 0.4079, + "step": 10688 + }, + { + "epoch": 0.19846058450262324, + "grad_norm": 0.3629204332828522, + "learning_rate": 1.811846123944387e-05, + "loss": 0.1746, + "step": 10690 + }, + { + "epoch": 0.19849771464004187, + "grad_norm": 0.3608040511608124, + "learning_rate": 1.8117780103615717e-05, + "loss": 0.2701, + "step": 10692 + }, + { + "epoch": 0.19853484477746053, + "grad_norm": 0.4338761270046234, + "learning_rate": 1.8117098857328895e-05, + "loss": 0.308, + "step": 10694 + }, + { + "epoch": 0.19857197491487916, + "grad_norm": 0.4181887209415436, + "learning_rate": 1.811641750059267e-05, + "loss": 0.4115, + "step": 10696 + }, + { + "epoch": 0.19860910505229779, + "grad_norm": 0.4565739333629608, + "learning_rate": 1.8115736033416312e-05, + "loss": 0.4606, + "step": 10698 + }, + { + "epoch": 0.19864623518971644, + "grad_norm": 0.3283848464488983, + "learning_rate": 1.8115054455809096e-05, + "loss": 0.2638, + "step": 10700 + }, + { + "epoch": 0.19868336532713507, + "grad_norm": 0.27011847496032715, + "learning_rate": 1.8114372767780297e-05, + "loss": 0.3762, + "step": 10702 + }, + { + "epoch": 0.19872049546455373, + "grad_norm": 0.272344172000885, + "learning_rate": 1.811369096933919e-05, + "loss": 0.4051, + "step": 10704 + }, + { + "epoch": 0.19875762560197235, + "grad_norm": 0.2958928942680359, + "learning_rate": 1.811300906049505e-05, + "loss": 0.2722, + "step": 10706 + }, + { + "epoch": 0.19879475573939098, + "grad_norm": 0.33428341150283813, + "learning_rate": 1.8112327041257162e-05, + "loss": 0.3913, + "step": 10708 + }, + { + "epoch": 0.19883188587680964, + "grad_norm": 0.3786318898200989, + "learning_rate": 1.8111644911634803e-05, + "loss": 0.2783, + "step": 10710 + }, + { + "epoch": 0.19886901601422827, + "grad_norm": 0.3685378134250641, + "learning_rate": 1.811096267163725e-05, + "loss": 0.4935, + "step": 10712 + }, + { + "epoch": 0.1989061461516469, + "grad_norm": 0.30812156200408936, + "learning_rate": 1.8110280321273793e-05, + "loss": 0.3269, + "step": 10714 + }, + { + "epoch": 0.19894327628906555, + "grad_norm": 0.9871928095817566, + "learning_rate": 1.8109597860553713e-05, + "loss": 0.3227, + "step": 10716 + }, + { + "epoch": 0.19898040642648418, + "grad_norm": 0.3444797098636627, + "learning_rate": 1.8108915289486296e-05, + "loss": 0.3791, + "step": 10718 + }, + { + "epoch": 0.19901753656390284, + "grad_norm": 0.3381890654563904, + "learning_rate": 1.8108232608080834e-05, + "loss": 0.431, + "step": 10720 + }, + { + "epoch": 0.19905466670132146, + "grad_norm": 0.25154224038124084, + "learning_rate": 1.810754981634661e-05, + "loss": 0.3114, + "step": 10722 + }, + { + "epoch": 0.1990917968387401, + "grad_norm": 0.33820077776908875, + "learning_rate": 1.8106866914292918e-05, + "loss": 0.3455, + "step": 10724 + }, + { + "epoch": 0.19912892697615875, + "grad_norm": 0.252514123916626, + "learning_rate": 1.810618390192905e-05, + "loss": 0.3195, + "step": 10726 + }, + { + "epoch": 0.19916605711357738, + "grad_norm": 0.5573863983154297, + "learning_rate": 1.8105500779264302e-05, + "loss": 0.1973, + "step": 10728 + }, + { + "epoch": 0.199203187250996, + "grad_norm": 0.3413804769515991, + "learning_rate": 1.8104817546307967e-05, + "loss": 0.3683, + "step": 10730 + }, + { + "epoch": 0.19924031738841466, + "grad_norm": 0.4128609895706177, + "learning_rate": 1.810413420306934e-05, + "loss": 0.296, + "step": 10732 + }, + { + "epoch": 0.1992774475258333, + "grad_norm": 0.43613719940185547, + "learning_rate": 1.8103450749557724e-05, + "loss": 0.2528, + "step": 10734 + }, + { + "epoch": 0.19931457766325192, + "grad_norm": 0.26584532856941223, + "learning_rate": 1.8102767185782415e-05, + "loss": 0.3336, + "step": 10736 + }, + { + "epoch": 0.19935170780067057, + "grad_norm": 0.441381573677063, + "learning_rate": 1.8102083511752717e-05, + "loss": 0.3325, + "step": 10738 + }, + { + "epoch": 0.1993888379380892, + "grad_norm": 0.32035815715789795, + "learning_rate": 1.8101399727477926e-05, + "loss": 0.2888, + "step": 10740 + }, + { + "epoch": 0.19942596807550786, + "grad_norm": 0.42634662985801697, + "learning_rate": 1.8100715832967354e-05, + "loss": 0.3069, + "step": 10742 + }, + { + "epoch": 0.1994630982129265, + "grad_norm": 0.3308328688144684, + "learning_rate": 1.81000318282303e-05, + "loss": 0.3464, + "step": 10744 + }, + { + "epoch": 0.19950022835034512, + "grad_norm": 0.3660121262073517, + "learning_rate": 1.8099347713276084e-05, + "loss": 0.3299, + "step": 10746 + }, + { + "epoch": 0.19953735848776377, + "grad_norm": 0.35178038477897644, + "learning_rate": 1.8098663488114e-05, + "loss": 0.4313, + "step": 10748 + }, + { + "epoch": 0.1995744886251824, + "grad_norm": 0.5728922486305237, + "learning_rate": 1.8097979152753364e-05, + "loss": 0.4727, + "step": 10750 + }, + { + "epoch": 0.19961161876260103, + "grad_norm": 0.4055405259132385, + "learning_rate": 1.809729470720349e-05, + "loss": 0.3252, + "step": 10752 + }, + { + "epoch": 0.19964874890001968, + "grad_norm": 0.34874027967453003, + "learning_rate": 1.8096610151473685e-05, + "loss": 0.2923, + "step": 10754 + }, + { + "epoch": 0.1996858790374383, + "grad_norm": 0.48665115237236023, + "learning_rate": 1.8095925485573274e-05, + "loss": 0.3447, + "step": 10756 + }, + { + "epoch": 0.19972300917485697, + "grad_norm": 0.3688524663448334, + "learning_rate": 1.8095240709511563e-05, + "loss": 0.4582, + "step": 10758 + }, + { + "epoch": 0.1997601393122756, + "grad_norm": 0.3206796646118164, + "learning_rate": 1.8094555823297876e-05, + "loss": 0.3614, + "step": 10760 + }, + { + "epoch": 0.19979726944969423, + "grad_norm": 0.3029753565788269, + "learning_rate": 1.809387082694153e-05, + "loss": 0.4177, + "step": 10762 + }, + { + "epoch": 0.19983439958711288, + "grad_norm": 0.303425133228302, + "learning_rate": 1.8093185720451846e-05, + "loss": 0.2802, + "step": 10764 + }, + { + "epoch": 0.1998715297245315, + "grad_norm": 0.3895516097545624, + "learning_rate": 1.8092500503838145e-05, + "loss": 0.1207, + "step": 10766 + }, + { + "epoch": 0.19990865986195014, + "grad_norm": 0.24686263501644135, + "learning_rate": 1.8091815177109754e-05, + "loss": 0.5278, + "step": 10768 + }, + { + "epoch": 0.1999457899993688, + "grad_norm": 0.35627856850624084, + "learning_rate": 1.8091129740275994e-05, + "loss": 0.3638, + "step": 10770 + }, + { + "epoch": 0.19998292013678742, + "grad_norm": 0.2892895042896271, + "learning_rate": 1.8090444193346196e-05, + "loss": 0.2866, + "step": 10772 + }, + { + "epoch": 0.20002005027420605, + "grad_norm": 0.4437684714794159, + "learning_rate": 1.8089758536329688e-05, + "loss": 0.174, + "step": 10774 + }, + { + "epoch": 0.2000571804116247, + "grad_norm": 0.48527154326438904, + "learning_rate": 1.8089072769235793e-05, + "loss": 0.3254, + "step": 10776 + }, + { + "epoch": 0.20009431054904334, + "grad_norm": 0.45915573835372925, + "learning_rate": 1.808838689207385e-05, + "loss": 0.2423, + "step": 10778 + }, + { + "epoch": 0.200131440686462, + "grad_norm": 0.5010533928871155, + "learning_rate": 1.8087700904853188e-05, + "loss": 0.2444, + "step": 10780 + }, + { + "epoch": 0.20016857082388062, + "grad_norm": 0.329974502325058, + "learning_rate": 1.8087014807583143e-05, + "loss": 0.2029, + "step": 10782 + }, + { + "epoch": 0.20020570096129925, + "grad_norm": 0.418027400970459, + "learning_rate": 1.808632860027305e-05, + "loss": 0.1309, + "step": 10784 + }, + { + "epoch": 0.2002428310987179, + "grad_norm": 0.3399648666381836, + "learning_rate": 1.8085642282932247e-05, + "loss": 0.3519, + "step": 10786 + }, + { + "epoch": 0.20027996123613653, + "grad_norm": 0.642388641834259, + "learning_rate": 1.808495585557007e-05, + "loss": 0.5265, + "step": 10788 + }, + { + "epoch": 0.20031709137355516, + "grad_norm": 0.4723513722419739, + "learning_rate": 1.808426931819586e-05, + "loss": 0.2839, + "step": 10790 + }, + { + "epoch": 0.20035422151097382, + "grad_norm": 0.3475300967693329, + "learning_rate": 1.8083582670818966e-05, + "loss": 0.2425, + "step": 10792 + }, + { + "epoch": 0.20039135164839245, + "grad_norm": 0.41961824893951416, + "learning_rate": 1.8082895913448718e-05, + "loss": 0.2641, + "step": 10794 + }, + { + "epoch": 0.2004284817858111, + "grad_norm": 0.4302493929862976, + "learning_rate": 1.808220904609447e-05, + "loss": 0.2446, + "step": 10796 + }, + { + "epoch": 0.20046561192322973, + "grad_norm": 0.34174981713294983, + "learning_rate": 1.8081522068765567e-05, + "loss": 0.2751, + "step": 10798 + }, + { + "epoch": 0.20050274206064836, + "grad_norm": 0.33729425072669983, + "learning_rate": 1.808083498147135e-05, + "loss": 0.3074, + "step": 10800 + }, + { + "epoch": 0.20053987219806702, + "grad_norm": 0.44991937279701233, + "learning_rate": 1.808014778422118e-05, + "loss": 0.3619, + "step": 10802 + }, + { + "epoch": 0.20057700233548564, + "grad_norm": 0.3150615990161896, + "learning_rate": 1.80794604770244e-05, + "loss": 0.1592, + "step": 10804 + }, + { + "epoch": 0.20061413247290427, + "grad_norm": 0.5023438930511475, + "learning_rate": 1.8078773059890366e-05, + "loss": 0.1777, + "step": 10806 + }, + { + "epoch": 0.20065126261032293, + "grad_norm": 0.39736321568489075, + "learning_rate": 1.8078085532828425e-05, + "loss": 0.3187, + "step": 10808 + }, + { + "epoch": 0.20068839274774156, + "grad_norm": 0.6400244235992432, + "learning_rate": 1.807739789584794e-05, + "loss": 0.3765, + "step": 10810 + }, + { + "epoch": 0.20072552288516018, + "grad_norm": 0.32258108258247375, + "learning_rate": 1.8076710148958262e-05, + "loss": 0.1767, + "step": 10812 + }, + { + "epoch": 0.20076265302257884, + "grad_norm": 0.40697264671325684, + "learning_rate": 1.807602229216875e-05, + "loss": 0.2256, + "step": 10814 + }, + { + "epoch": 0.20079978315999747, + "grad_norm": 0.40953925251960754, + "learning_rate": 1.8075334325488767e-05, + "loss": 0.3952, + "step": 10816 + }, + { + "epoch": 0.20083691329741613, + "grad_norm": 0.4370473325252533, + "learning_rate": 1.8074646248927673e-05, + "loss": 0.2824, + "step": 10818 + }, + { + "epoch": 0.20087404343483475, + "grad_norm": 0.3332976996898651, + "learning_rate": 1.8073958062494828e-05, + "loss": 0.4775, + "step": 10820 + }, + { + "epoch": 0.20091117357225338, + "grad_norm": 0.3931434750556946, + "learning_rate": 1.80732697661996e-05, + "loss": 0.3925, + "step": 10822 + }, + { + "epoch": 0.20094830370967204, + "grad_norm": 0.42469334602355957, + "learning_rate": 1.8072581360051353e-05, + "loss": 0.3629, + "step": 10824 + }, + { + "epoch": 0.20098543384709067, + "grad_norm": 0.27338218688964844, + "learning_rate": 1.8071892844059452e-05, + "loss": 0.2915, + "step": 10826 + }, + { + "epoch": 0.2010225639845093, + "grad_norm": 0.3425963819026947, + "learning_rate": 1.807120421823327e-05, + "loss": 0.1143, + "step": 10828 + }, + { + "epoch": 0.20105969412192795, + "grad_norm": 0.4951414167881012, + "learning_rate": 1.8070515482582175e-05, + "loss": 0.3085, + "step": 10830 + }, + { + "epoch": 0.20109682425934658, + "grad_norm": 0.3356459438800812, + "learning_rate": 1.8069826637115535e-05, + "loss": 0.4145, + "step": 10832 + }, + { + "epoch": 0.20113395439676524, + "grad_norm": 0.555823802947998, + "learning_rate": 1.806913768184273e-05, + "loss": 0.2103, + "step": 10834 + }, + { + "epoch": 0.20117108453418386, + "grad_norm": 0.47968658804893494, + "learning_rate": 1.8068448616773125e-05, + "loss": 0.3189, + "step": 10836 + }, + { + "epoch": 0.2012082146716025, + "grad_norm": 0.3954299986362457, + "learning_rate": 1.8067759441916107e-05, + "loss": 0.3065, + "step": 10838 + }, + { + "epoch": 0.20124534480902115, + "grad_norm": 0.29290133714675903, + "learning_rate": 1.8067070157281052e-05, + "loss": 0.3637, + "step": 10840 + }, + { + "epoch": 0.20128247494643978, + "grad_norm": 0.3015665113925934, + "learning_rate": 1.806638076287733e-05, + "loss": 0.3295, + "step": 10842 + }, + { + "epoch": 0.2013196050838584, + "grad_norm": 0.3737410008907318, + "learning_rate": 1.8065691258714326e-05, + "loss": 0.3986, + "step": 10844 + }, + { + "epoch": 0.20135673522127706, + "grad_norm": 0.3320861756801605, + "learning_rate": 1.806500164480143e-05, + "loss": 0.3409, + "step": 10846 + }, + { + "epoch": 0.2013938653586957, + "grad_norm": 0.43762192130088806, + "learning_rate": 1.8064311921148014e-05, + "loss": 0.2743, + "step": 10848 + }, + { + "epoch": 0.20143099549611432, + "grad_norm": 0.4020223617553711, + "learning_rate": 1.806362208776347e-05, + "loss": 0.2362, + "step": 10850 + }, + { + "epoch": 0.20146812563353297, + "grad_norm": 0.4083857238292694, + "learning_rate": 1.8062932144657185e-05, + "loss": 0.4352, + "step": 10852 + }, + { + "epoch": 0.2015052557709516, + "grad_norm": 0.49567094445228577, + "learning_rate": 1.806224209183854e-05, + "loss": 0.2241, + "step": 10854 + }, + { + "epoch": 0.20154238590837026, + "grad_norm": 0.3750990927219391, + "learning_rate": 1.8061551929316932e-05, + "loss": 0.2615, + "step": 10856 + }, + { + "epoch": 0.2015795160457889, + "grad_norm": 0.45427751541137695, + "learning_rate": 1.806086165710175e-05, + "loss": 0.3357, + "step": 10858 + }, + { + "epoch": 0.20161664618320752, + "grad_norm": 0.36389946937561035, + "learning_rate": 1.806017127520238e-05, + "loss": 0.394, + "step": 10860 + }, + { + "epoch": 0.20165377632062617, + "grad_norm": 0.49820196628570557, + "learning_rate": 1.8059480783628232e-05, + "loss": 0.2714, + "step": 10862 + }, + { + "epoch": 0.2016909064580448, + "grad_norm": 0.39231058955192566, + "learning_rate": 1.8058790182388687e-05, + "loss": 0.2723, + "step": 10864 + }, + { + "epoch": 0.20172803659546343, + "grad_norm": 0.2937910556793213, + "learning_rate": 1.8058099471493145e-05, + "loss": 0.2701, + "step": 10866 + }, + { + "epoch": 0.20176516673288208, + "grad_norm": 0.33409932255744934, + "learning_rate": 1.8057408650951006e-05, + "loss": 0.3141, + "step": 10868 + }, + { + "epoch": 0.2018022968703007, + "grad_norm": 0.31985020637512207, + "learning_rate": 1.805671772077167e-05, + "loss": 0.4062, + "step": 10870 + }, + { + "epoch": 0.20183942700771937, + "grad_norm": 0.49771395325660706, + "learning_rate": 1.805602668096454e-05, + "loss": 0.2688, + "step": 10872 + }, + { + "epoch": 0.201876557145138, + "grad_norm": 0.3047142028808594, + "learning_rate": 1.8055335531539018e-05, + "loss": 0.2343, + "step": 10874 + }, + { + "epoch": 0.20191368728255663, + "grad_norm": 0.5055795907974243, + "learning_rate": 1.8054644272504504e-05, + "loss": 0.5337, + "step": 10876 + }, + { + "epoch": 0.20195081741997528, + "grad_norm": 0.3464277684688568, + "learning_rate": 1.8053952903870412e-05, + "loss": 0.3591, + "step": 10878 + }, + { + "epoch": 0.2019879475573939, + "grad_norm": 0.6054584383964539, + "learning_rate": 1.8053261425646144e-05, + "loss": 0.2056, + "step": 10880 + }, + { + "epoch": 0.20202507769481254, + "grad_norm": 0.3776136338710785, + "learning_rate": 1.805256983784111e-05, + "loss": 0.2072, + "step": 10882 + }, + { + "epoch": 0.2020622078322312, + "grad_norm": 0.4827113151550293, + "learning_rate": 1.8051878140464723e-05, + "loss": 0.2769, + "step": 10884 + }, + { + "epoch": 0.20209933796964982, + "grad_norm": 0.2429111748933792, + "learning_rate": 1.805118633352639e-05, + "loss": 0.3155, + "step": 10886 + }, + { + "epoch": 0.20213646810706845, + "grad_norm": 0.3328652083873749, + "learning_rate": 1.805049441703553e-05, + "loss": 0.3116, + "step": 10888 + }, + { + "epoch": 0.2021735982444871, + "grad_norm": 0.3258818984031677, + "learning_rate": 1.804980239100155e-05, + "loss": 0.1621, + "step": 10890 + }, + { + "epoch": 0.20221072838190574, + "grad_norm": 0.5889344811439514, + "learning_rate": 1.8049110255433875e-05, + "loss": 0.4156, + "step": 10892 + }, + { + "epoch": 0.2022478585193244, + "grad_norm": 0.34815457463264465, + "learning_rate": 1.8048418010341917e-05, + "loss": 0.5055, + "step": 10894 + }, + { + "epoch": 0.20228498865674302, + "grad_norm": 0.3929527997970581, + "learning_rate": 1.80477256557351e-05, + "loss": 0.1796, + "step": 10896 + }, + { + "epoch": 0.20232211879416165, + "grad_norm": 0.2645559310913086, + "learning_rate": 1.8047033191622843e-05, + "loss": 0.1441, + "step": 10898 + }, + { + "epoch": 0.2023592489315803, + "grad_norm": 0.43833860754966736, + "learning_rate": 1.8046340618014567e-05, + "loss": 0.2069, + "step": 10900 + }, + { + "epoch": 0.20239637906899893, + "grad_norm": 0.38086459040641785, + "learning_rate": 1.8045647934919697e-05, + "loss": 0.4202, + "step": 10902 + }, + { + "epoch": 0.20243350920641756, + "grad_norm": 0.5325655341148376, + "learning_rate": 1.804495514234766e-05, + "loss": 0.4256, + "step": 10904 + }, + { + "epoch": 0.20247063934383622, + "grad_norm": 0.5525509119033813, + "learning_rate": 1.8044262240307874e-05, + "loss": 0.254, + "step": 10906 + }, + { + "epoch": 0.20250776948125485, + "grad_norm": 0.37684616446495056, + "learning_rate": 1.804356922880978e-05, + "loss": 0.3692, + "step": 10908 + }, + { + "epoch": 0.2025448996186735, + "grad_norm": 0.3446234464645386, + "learning_rate": 1.8042876107862802e-05, + "loss": 0.3716, + "step": 10910 + }, + { + "epoch": 0.20258202975609213, + "grad_norm": 0.46539050340652466, + "learning_rate": 1.8042182877476367e-05, + "loss": 0.2808, + "step": 10912 + }, + { + "epoch": 0.20261915989351076, + "grad_norm": 0.26811131834983826, + "learning_rate": 1.8041489537659916e-05, + "loss": 0.3519, + "step": 10914 + }, + { + "epoch": 0.20265629003092941, + "grad_norm": 0.2790672779083252, + "learning_rate": 1.804079608842288e-05, + "loss": 0.3262, + "step": 10916 + }, + { + "epoch": 0.20269342016834804, + "grad_norm": 0.26005733013153076, + "learning_rate": 1.8040102529774693e-05, + "loss": 0.2662, + "step": 10918 + }, + { + "epoch": 0.20273055030576667, + "grad_norm": 0.30791449546813965, + "learning_rate": 1.8039408861724795e-05, + "loss": 0.2685, + "step": 10920 + }, + { + "epoch": 0.20276768044318533, + "grad_norm": 0.4886121451854706, + "learning_rate": 1.803871508428262e-05, + "loss": 0.5012, + "step": 10922 + }, + { + "epoch": 0.20280481058060396, + "grad_norm": 0.2542363703250885, + "learning_rate": 1.8038021197457613e-05, + "loss": 0.3042, + "step": 10924 + }, + { + "epoch": 0.20284194071802258, + "grad_norm": 0.38627511262893677, + "learning_rate": 1.8037327201259214e-05, + "loss": 0.3797, + "step": 10926 + }, + { + "epoch": 0.20287907085544124, + "grad_norm": 0.34510016441345215, + "learning_rate": 1.8036633095696867e-05, + "loss": 0.2813, + "step": 10928 + }, + { + "epoch": 0.20291620099285987, + "grad_norm": 0.3160102665424347, + "learning_rate": 1.8035938880780016e-05, + "loss": 0.2398, + "step": 10930 + }, + { + "epoch": 0.20295333113027852, + "grad_norm": 0.3049541115760803, + "learning_rate": 1.803524455651811e-05, + "loss": 0.3977, + "step": 10932 + }, + { + "epoch": 0.20299046126769715, + "grad_norm": 0.3099616467952728, + "learning_rate": 1.803455012292059e-05, + "loss": 0.3029, + "step": 10934 + }, + { + "epoch": 0.20302759140511578, + "grad_norm": 0.29621782898902893, + "learning_rate": 1.8033855579996907e-05, + "loss": 0.1594, + "step": 10936 + }, + { + "epoch": 0.20306472154253444, + "grad_norm": 0.39212656021118164, + "learning_rate": 1.8033160927756518e-05, + "loss": 0.2957, + "step": 10938 + }, + { + "epoch": 0.20310185167995307, + "grad_norm": 0.37854981422424316, + "learning_rate": 1.803246616620887e-05, + "loss": 0.5551, + "step": 10940 + }, + { + "epoch": 0.2031389818173717, + "grad_norm": 0.3617917001247406, + "learning_rate": 1.8031771295363417e-05, + "loss": 0.284, + "step": 10942 + }, + { + "epoch": 0.20317611195479035, + "grad_norm": 0.28693732619285583, + "learning_rate": 1.8031076315229615e-05, + "loss": 0.3541, + "step": 10944 + }, + { + "epoch": 0.20321324209220898, + "grad_norm": 0.3539438247680664, + "learning_rate": 1.803038122581692e-05, + "loss": 0.1728, + "step": 10946 + }, + { + "epoch": 0.20325037222962763, + "grad_norm": 0.4100123345851898, + "learning_rate": 1.802968602713479e-05, + "loss": 0.1783, + "step": 10948 + }, + { + "epoch": 0.20328750236704626, + "grad_norm": 0.26993250846862793, + "learning_rate": 1.8028990719192686e-05, + "loss": 0.3275, + "step": 10950 + }, + { + "epoch": 0.2033246325044649, + "grad_norm": 0.564293622970581, + "learning_rate": 1.8028295302000068e-05, + "loss": 0.3285, + "step": 10952 + }, + { + "epoch": 0.20336176264188355, + "grad_norm": 0.3827095925807953, + "learning_rate": 1.8027599775566396e-05, + "loss": 0.2651, + "step": 10954 + }, + { + "epoch": 0.20339889277930218, + "grad_norm": 0.4387151896953583, + "learning_rate": 1.802690413990114e-05, + "loss": 0.3062, + "step": 10956 + }, + { + "epoch": 0.2034360229167208, + "grad_norm": 0.39288821816444397, + "learning_rate": 1.8026208395013756e-05, + "loss": 0.3374, + "step": 10958 + }, + { + "epoch": 0.20347315305413946, + "grad_norm": 0.36687755584716797, + "learning_rate": 1.8025512540913724e-05, + "loss": 0.1945, + "step": 10960 + }, + { + "epoch": 0.2035102831915581, + "grad_norm": 0.23342613875865936, + "learning_rate": 1.8024816577610504e-05, + "loss": 0.1826, + "step": 10962 + }, + { + "epoch": 0.20354741332897672, + "grad_norm": 0.23447753489017487, + "learning_rate": 1.8024120505113563e-05, + "loss": 0.3601, + "step": 10964 + }, + { + "epoch": 0.20358454346639537, + "grad_norm": 0.40237846970558167, + "learning_rate": 1.8023424323432382e-05, + "loss": 0.2262, + "step": 10966 + }, + { + "epoch": 0.203621673603814, + "grad_norm": 0.4993864893913269, + "learning_rate": 1.8022728032576426e-05, + "loss": 0.2022, + "step": 10968 + }, + { + "epoch": 0.20365880374123266, + "grad_norm": 0.2714662551879883, + "learning_rate": 1.802203163255517e-05, + "loss": 0.3509, + "step": 10970 + }, + { + "epoch": 0.2036959338786513, + "grad_norm": 0.3502453565597534, + "learning_rate": 1.80213351233781e-05, + "loss": 0.2396, + "step": 10972 + }, + { + "epoch": 0.20373306401606991, + "grad_norm": 0.38922733068466187, + "learning_rate": 1.802063850505468e-05, + "loss": 0.211, + "step": 10974 + }, + { + "epoch": 0.20377019415348857, + "grad_norm": 0.4161888659000397, + "learning_rate": 1.8019941777594395e-05, + "loss": 0.4479, + "step": 10976 + }, + { + "epoch": 0.2038073242909072, + "grad_norm": 0.43013548851013184, + "learning_rate": 1.8019244941006725e-05, + "loss": 0.3909, + "step": 10978 + }, + { + "epoch": 0.20384445442832583, + "grad_norm": 0.42400020360946655, + "learning_rate": 1.8018547995301154e-05, + "loss": 0.4307, + "step": 10980 + }, + { + "epoch": 0.20388158456574448, + "grad_norm": 0.3903316855430603, + "learning_rate": 1.8017850940487162e-05, + "loss": 0.1936, + "step": 10982 + }, + { + "epoch": 0.2039187147031631, + "grad_norm": 0.34987080097198486, + "learning_rate": 1.8017153776574235e-05, + "loss": 0.3128, + "step": 10984 + }, + { + "epoch": 0.20395584484058177, + "grad_norm": 0.37024426460266113, + "learning_rate": 1.8016456503571857e-05, + "loss": 0.1905, + "step": 10986 + }, + { + "epoch": 0.2039929749780004, + "grad_norm": 0.3467206358909607, + "learning_rate": 1.801575912148952e-05, + "loss": 0.4905, + "step": 10988 + }, + { + "epoch": 0.20403010511541902, + "grad_norm": 0.2291179895401001, + "learning_rate": 1.801506163033671e-05, + "loss": 0.3076, + "step": 10990 + }, + { + "epoch": 0.20406723525283768, + "grad_norm": 0.3153413236141205, + "learning_rate": 1.801436403012292e-05, + "loss": 0.3613, + "step": 10992 + }, + { + "epoch": 0.2041043653902563, + "grad_norm": 0.3223901689052582, + "learning_rate": 1.8013666320857646e-05, + "loss": 0.172, + "step": 10994 + }, + { + "epoch": 0.20414149552767494, + "grad_norm": 0.36323311924934387, + "learning_rate": 1.801296850255037e-05, + "loss": 0.3457, + "step": 10996 + }, + { + "epoch": 0.2041786256650936, + "grad_norm": 0.3450501561164856, + "learning_rate": 1.8012270575210596e-05, + "loss": 0.441, + "step": 10998 + }, + { + "epoch": 0.20421575580251222, + "grad_norm": 0.48479440808296204, + "learning_rate": 1.801157253884782e-05, + "loss": 0.1534, + "step": 11000 + }, + { + "epoch": 0.20425288593993085, + "grad_norm": 0.42609670758247375, + "learning_rate": 1.801087439347154e-05, + "loss": 0.4914, + "step": 11002 + }, + { + "epoch": 0.2042900160773495, + "grad_norm": 0.3819350600242615, + "learning_rate": 1.8010176139091257e-05, + "loss": 0.2517, + "step": 11004 + }, + { + "epoch": 0.20432714621476813, + "grad_norm": 0.25776270031929016, + "learning_rate": 1.8009477775716468e-05, + "loss": 0.2487, + "step": 11006 + }, + { + "epoch": 0.2043642763521868, + "grad_norm": 0.365556538105011, + "learning_rate": 1.8008779303356673e-05, + "loss": 0.3783, + "step": 11008 + }, + { + "epoch": 0.20440140648960542, + "grad_norm": 0.871935248374939, + "learning_rate": 1.8008080722021387e-05, + "loss": 0.218, + "step": 11010 + }, + { + "epoch": 0.20443853662702405, + "grad_norm": 0.2795582711696625, + "learning_rate": 1.8007382031720104e-05, + "loss": 0.313, + "step": 11012 + }, + { + "epoch": 0.2044756667644427, + "grad_norm": 0.7396692037582397, + "learning_rate": 1.800668323246234e-05, + "loss": 0.3901, + "step": 11014 + }, + { + "epoch": 0.20451279690186133, + "grad_norm": 0.3855896592140198, + "learning_rate": 1.80059843242576e-05, + "loss": 0.3641, + "step": 11016 + }, + { + "epoch": 0.20454992703927996, + "grad_norm": 0.6135772466659546, + "learning_rate": 1.800528530711539e-05, + "loss": 0.2708, + "step": 11018 + }, + { + "epoch": 0.20458705717669862, + "grad_norm": 0.2560223340988159, + "learning_rate": 1.800458618104523e-05, + "loss": 0.3619, + "step": 11020 + }, + { + "epoch": 0.20462418731411725, + "grad_norm": 0.3569188117980957, + "learning_rate": 1.800388694605663e-05, + "loss": 0.3182, + "step": 11022 + }, + { + "epoch": 0.2046613174515359, + "grad_norm": 0.3389948010444641, + "learning_rate": 1.80031876021591e-05, + "loss": 0.2331, + "step": 11024 + }, + { + "epoch": 0.20469844758895453, + "grad_norm": 0.36397939920425415, + "learning_rate": 1.8002488149362158e-05, + "loss": 0.2084, + "step": 11026 + }, + { + "epoch": 0.20473557772637316, + "grad_norm": 0.21178899705410004, + "learning_rate": 1.8001788587675323e-05, + "loss": 0.1995, + "step": 11028 + }, + { + "epoch": 0.20477270786379181, + "grad_norm": 0.33609259128570557, + "learning_rate": 1.8001088917108115e-05, + "loss": 0.2748, + "step": 11030 + }, + { + "epoch": 0.20480983800121044, + "grad_norm": 0.4405866265296936, + "learning_rate": 1.8000389137670055e-05, + "loss": 0.3556, + "step": 11032 + }, + { + "epoch": 0.20484696813862907, + "grad_norm": 0.3714745342731476, + "learning_rate": 1.7999689249370657e-05, + "loss": 0.3512, + "step": 11034 + }, + { + "epoch": 0.20488409827604773, + "grad_norm": 0.2890619933605194, + "learning_rate": 1.7998989252219454e-05, + "loss": 0.4128, + "step": 11036 + }, + { + "epoch": 0.20492122841346636, + "grad_norm": 0.36238807439804077, + "learning_rate": 1.7998289146225968e-05, + "loss": 0.2808, + "step": 11038 + }, + { + "epoch": 0.20495835855088498, + "grad_norm": 0.3752381205558777, + "learning_rate": 1.799758893139972e-05, + "loss": 0.3256, + "step": 11040 + }, + { + "epoch": 0.20499548868830364, + "grad_norm": 0.39415037631988525, + "learning_rate": 1.7996888607750244e-05, + "loss": 0.2595, + "step": 11042 + }, + { + "epoch": 0.20503261882572227, + "grad_norm": 0.30676236748695374, + "learning_rate": 1.799618817528707e-05, + "loss": 0.2643, + "step": 11044 + }, + { + "epoch": 0.20506974896314092, + "grad_norm": 0.4020739793777466, + "learning_rate": 1.7995487634019725e-05, + "loss": 0.1626, + "step": 11046 + }, + { + "epoch": 0.20510687910055955, + "grad_norm": 0.39993715286254883, + "learning_rate": 1.7994786983957745e-05, + "loss": 0.407, + "step": 11048 + }, + { + "epoch": 0.20514400923797818, + "grad_norm": 0.3869505822658539, + "learning_rate": 1.7994086225110657e-05, + "loss": 0.4255, + "step": 11050 + }, + { + "epoch": 0.20518113937539684, + "grad_norm": 0.3084717392921448, + "learning_rate": 1.7993385357488e-05, + "loss": 0.3249, + "step": 11052 + }, + { + "epoch": 0.20521826951281547, + "grad_norm": 0.35271546244621277, + "learning_rate": 1.7992684381099314e-05, + "loss": 0.3688, + "step": 11054 + }, + { + "epoch": 0.2052553996502341, + "grad_norm": 0.6537727117538452, + "learning_rate": 1.7991983295954135e-05, + "loss": 0.2483, + "step": 11056 + }, + { + "epoch": 0.20529252978765275, + "grad_norm": 0.35688623785972595, + "learning_rate": 1.7991282102062e-05, + "loss": 0.3329, + "step": 11058 + }, + { + "epoch": 0.20532965992507138, + "grad_norm": 0.33994412422180176, + "learning_rate": 1.7990580799432452e-05, + "loss": 0.2438, + "step": 11060 + }, + { + "epoch": 0.20536679006249003, + "grad_norm": 0.3979966342449188, + "learning_rate": 1.7989879388075037e-05, + "loss": 0.3546, + "step": 11062 + }, + { + "epoch": 0.20540392019990866, + "grad_norm": 0.23355227708816528, + "learning_rate": 1.7989177867999294e-05, + "loss": 0.1189, + "step": 11064 + }, + { + "epoch": 0.2054410503373273, + "grad_norm": 0.45612430572509766, + "learning_rate": 1.798847623921477e-05, + "loss": 0.362, + "step": 11066 + }, + { + "epoch": 0.20547818047474595, + "grad_norm": 0.31561651825904846, + "learning_rate": 1.798777450173101e-05, + "loss": 0.2421, + "step": 11068 + }, + { + "epoch": 0.20551531061216458, + "grad_norm": 0.31964311003685, + "learning_rate": 1.798707265555757e-05, + "loss": 0.1353, + "step": 11070 + }, + { + "epoch": 0.2055524407495832, + "grad_norm": 0.28680485486984253, + "learning_rate": 1.7986370700703993e-05, + "loss": 0.4415, + "step": 11072 + }, + { + "epoch": 0.20558957088700186, + "grad_norm": 0.31005164980888367, + "learning_rate": 1.7985668637179835e-05, + "loss": 0.2767, + "step": 11074 + }, + { + "epoch": 0.2056267010244205, + "grad_norm": 0.2682952880859375, + "learning_rate": 1.7984966464994645e-05, + "loss": 0.303, + "step": 11076 + }, + { + "epoch": 0.20566383116183912, + "grad_norm": 0.48930829763412476, + "learning_rate": 1.7984264184157978e-05, + "loss": 0.2874, + "step": 11078 + }, + { + "epoch": 0.20570096129925777, + "grad_norm": 0.46756598353385925, + "learning_rate": 1.7983561794679394e-05, + "loss": 0.2312, + "step": 11080 + }, + { + "epoch": 0.2057380914366764, + "grad_norm": 0.35370391607284546, + "learning_rate": 1.7982859296568445e-05, + "loss": 0.1852, + "step": 11082 + }, + { + "epoch": 0.20577522157409506, + "grad_norm": 0.45003122091293335, + "learning_rate": 1.7982156689834693e-05, + "loss": 0.1659, + "step": 11084 + }, + { + "epoch": 0.20581235171151369, + "grad_norm": 0.24225486814975739, + "learning_rate": 1.7981453974487698e-05, + "loss": 0.2772, + "step": 11086 + }, + { + "epoch": 0.20584948184893231, + "grad_norm": 0.4070354700088501, + "learning_rate": 1.798075115053702e-05, + "loss": 0.2216, + "step": 11088 + }, + { + "epoch": 0.20588661198635097, + "grad_norm": 0.27683866024017334, + "learning_rate": 1.798004821799223e-05, + "loss": 0.3173, + "step": 11090 + }, + { + "epoch": 0.2059237421237696, + "grad_norm": 0.4069429636001587, + "learning_rate": 1.797934517686288e-05, + "loss": 0.4082, + "step": 11092 + }, + { + "epoch": 0.20596087226118823, + "grad_norm": 0.331180602312088, + "learning_rate": 1.7978642027158545e-05, + "loss": 0.2893, + "step": 11094 + }, + { + "epoch": 0.20599800239860688, + "grad_norm": 0.2940187454223633, + "learning_rate": 1.797793876888879e-05, + "loss": 0.3266, + "step": 11096 + }, + { + "epoch": 0.2060351325360255, + "grad_norm": 0.6095760464668274, + "learning_rate": 1.797723540206319e-05, + "loss": 0.2674, + "step": 11098 + }, + { + "epoch": 0.20607226267344417, + "grad_norm": 0.5362911820411682, + "learning_rate": 1.7976531926691305e-05, + "loss": 0.4348, + "step": 11100 + }, + { + "epoch": 0.2061093928108628, + "grad_norm": 0.47066736221313477, + "learning_rate": 1.7975828342782712e-05, + "loss": 0.4524, + "step": 11102 + }, + { + "epoch": 0.20614652294828142, + "grad_norm": 0.3122949004173279, + "learning_rate": 1.797512465034699e-05, + "loss": 0.323, + "step": 11104 + }, + { + "epoch": 0.20618365308570008, + "grad_norm": 0.31107062101364136, + "learning_rate": 1.797442084939371e-05, + "loss": 0.2409, + "step": 11106 + }, + { + "epoch": 0.2062207832231187, + "grad_norm": 0.8616113662719727, + "learning_rate": 1.7973716939932445e-05, + "loss": 0.4116, + "step": 11108 + }, + { + "epoch": 0.20625791336053734, + "grad_norm": 0.23543886840343475, + "learning_rate": 1.797301292197278e-05, + "loss": 0.3517, + "step": 11110 + }, + { + "epoch": 0.206295043497956, + "grad_norm": 0.45217669010162354, + "learning_rate": 1.7972308795524287e-05, + "loss": 0.2879, + "step": 11112 + }, + { + "epoch": 0.20633217363537462, + "grad_norm": 0.32104748487472534, + "learning_rate": 1.7971604560596552e-05, + "loss": 0.3071, + "step": 11114 + }, + { + "epoch": 0.20636930377279325, + "grad_norm": 0.37868815660476685, + "learning_rate": 1.797090021719916e-05, + "loss": 0.3469, + "step": 11116 + }, + { + "epoch": 0.2064064339102119, + "grad_norm": 0.4879947006702423, + "learning_rate": 1.797019576534169e-05, + "loss": 0.2358, + "step": 11118 + }, + { + "epoch": 0.20644356404763053, + "grad_norm": 0.430711954832077, + "learning_rate": 1.796949120503373e-05, + "loss": 0.3454, + "step": 11120 + }, + { + "epoch": 0.2064806941850492, + "grad_norm": 0.2504064738750458, + "learning_rate": 1.7968786536284865e-05, + "loss": 0.4941, + "step": 11122 + }, + { + "epoch": 0.20651782432246782, + "grad_norm": 0.3445931375026703, + "learning_rate": 1.7968081759104686e-05, + "loss": 0.358, + "step": 11124 + }, + { + "epoch": 0.20655495445988645, + "grad_norm": 0.5050252079963684, + "learning_rate": 1.796737687350278e-05, + "loss": 0.286, + "step": 11126 + }, + { + "epoch": 0.2065920845973051, + "grad_norm": 0.297049343585968, + "learning_rate": 1.796667187948874e-05, + "loss": 0.3362, + "step": 11128 + }, + { + "epoch": 0.20662921473472373, + "grad_norm": 0.32343947887420654, + "learning_rate": 1.7965966777072153e-05, + "loss": 0.3937, + "step": 11130 + }, + { + "epoch": 0.20666634487214236, + "grad_norm": 0.41604095697402954, + "learning_rate": 1.796526156626263e-05, + "loss": 0.2948, + "step": 11132 + }, + { + "epoch": 0.20670347500956102, + "grad_norm": 0.47810590267181396, + "learning_rate": 1.7964556247069747e-05, + "loss": 0.1511, + "step": 11134 + }, + { + "epoch": 0.20674060514697964, + "grad_norm": 0.33088523149490356, + "learning_rate": 1.7963850819503115e-05, + "loss": 0.1819, + "step": 11136 + }, + { + "epoch": 0.2067777352843983, + "grad_norm": 0.28317898511886597, + "learning_rate": 1.7963145283572327e-05, + "loss": 0.339, + "step": 11138 + }, + { + "epoch": 0.20681486542181693, + "grad_norm": 0.26996004581451416, + "learning_rate": 1.796243963928698e-05, + "loss": 0.2007, + "step": 11140 + }, + { + "epoch": 0.20685199555923556, + "grad_norm": 0.4394034445285797, + "learning_rate": 1.7961733886656682e-05, + "loss": 0.3707, + "step": 11142 + }, + { + "epoch": 0.2068891256966542, + "grad_norm": 0.34104758501052856, + "learning_rate": 1.7961028025691037e-05, + "loss": 0.2973, + "step": 11144 + }, + { + "epoch": 0.20692625583407284, + "grad_norm": 0.41780200600624084, + "learning_rate": 1.7960322056399643e-05, + "loss": 0.2402, + "step": 11146 + }, + { + "epoch": 0.20696338597149147, + "grad_norm": 0.2939709722995758, + "learning_rate": 1.7959615978792115e-05, + "loss": 0.293, + "step": 11148 + }, + { + "epoch": 0.20700051610891013, + "grad_norm": 0.23490868508815765, + "learning_rate": 1.7958909792878048e-05, + "loss": 0.2134, + "step": 11150 + }, + { + "epoch": 0.20703764624632875, + "grad_norm": 0.34482479095458984, + "learning_rate": 1.7958203498667065e-05, + "loss": 0.3631, + "step": 11152 + }, + { + "epoch": 0.20707477638374738, + "grad_norm": 0.3382614254951477, + "learning_rate": 1.7957497096168767e-05, + "loss": 0.4784, + "step": 11154 + }, + { + "epoch": 0.20711190652116604, + "grad_norm": 0.34633511304855347, + "learning_rate": 1.7956790585392764e-05, + "loss": 0.3552, + "step": 11156 + }, + { + "epoch": 0.20714903665858467, + "grad_norm": 0.394165575504303, + "learning_rate": 1.7956083966348684e-05, + "loss": 0.364, + "step": 11158 + }, + { + "epoch": 0.20718616679600332, + "grad_norm": 0.2904904782772064, + "learning_rate": 1.7955377239046126e-05, + "loss": 0.2963, + "step": 11160 + }, + { + "epoch": 0.20722329693342195, + "grad_norm": 0.26285892724990845, + "learning_rate": 1.7954670403494713e-05, + "loss": 0.2799, + "step": 11162 + }, + { + "epoch": 0.20726042707084058, + "grad_norm": 0.34330615401268005, + "learning_rate": 1.7953963459704066e-05, + "loss": 0.2612, + "step": 11164 + }, + { + "epoch": 0.20729755720825924, + "grad_norm": 0.5931466221809387, + "learning_rate": 1.7953256407683797e-05, + "loss": 0.2651, + "step": 11166 + }, + { + "epoch": 0.20733468734567786, + "grad_norm": 0.34845098853111267, + "learning_rate": 1.7952549247443537e-05, + "loss": 0.3494, + "step": 11168 + }, + { + "epoch": 0.2073718174830965, + "grad_norm": 0.48305806517601013, + "learning_rate": 1.79518419789929e-05, + "loss": 0.45, + "step": 11170 + }, + { + "epoch": 0.20740894762051515, + "grad_norm": 0.3579391837120056, + "learning_rate": 1.7951134602341512e-05, + "loss": 0.4734, + "step": 11172 + }, + { + "epoch": 0.20744607775793378, + "grad_norm": 0.24750709533691406, + "learning_rate": 1.7950427117498995e-05, + "loss": 0.3436, + "step": 11174 + }, + { + "epoch": 0.20748320789535243, + "grad_norm": 1.1222401857376099, + "learning_rate": 1.7949719524474982e-05, + "loss": 0.3056, + "step": 11176 + }, + { + "epoch": 0.20752033803277106, + "grad_norm": 0.27495741844177246, + "learning_rate": 1.7949011823279098e-05, + "loss": 0.2536, + "step": 11178 + }, + { + "epoch": 0.2075574681701897, + "grad_norm": 0.40407535433769226, + "learning_rate": 1.7948304013920976e-05, + "loss": 0.2484, + "step": 11180 + }, + { + "epoch": 0.20759459830760835, + "grad_norm": 0.4311932921409607, + "learning_rate": 1.7947596096410242e-05, + "loss": 0.4077, + "step": 11182 + }, + { + "epoch": 0.20763172844502698, + "grad_norm": 0.32951274514198303, + "learning_rate": 1.794688807075653e-05, + "loss": 0.297, + "step": 11184 + }, + { + "epoch": 0.2076688585824456, + "grad_norm": 0.4299672245979309, + "learning_rate": 1.7946179936969476e-05, + "loss": 0.2706, + "step": 11186 + }, + { + "epoch": 0.20770598871986426, + "grad_norm": 0.47778764367103577, + "learning_rate": 1.7945471695058713e-05, + "loss": 0.3769, + "step": 11188 + }, + { + "epoch": 0.2077431188572829, + "grad_norm": 0.4164954721927643, + "learning_rate": 1.794476334503388e-05, + "loss": 0.4864, + "step": 11190 + }, + { + "epoch": 0.20778024899470152, + "grad_norm": 0.4768127202987671, + "learning_rate": 1.794405488690462e-05, + "loss": 0.3166, + "step": 11192 + }, + { + "epoch": 0.20781737913212017, + "grad_norm": 0.46884313225746155, + "learning_rate": 1.7943346320680562e-05, + "loss": 0.3376, + "step": 11194 + }, + { + "epoch": 0.2078545092695388, + "grad_norm": 0.5495001673698425, + "learning_rate": 1.7942637646371358e-05, + "loss": 0.4091, + "step": 11196 + }, + { + "epoch": 0.20789163940695746, + "grad_norm": 0.2996853291988373, + "learning_rate": 1.7941928863986645e-05, + "loss": 0.3333, + "step": 11198 + }, + { + "epoch": 0.20792876954437609, + "grad_norm": 0.42002248764038086, + "learning_rate": 1.794121997353607e-05, + "loss": 0.2899, + "step": 11200 + }, + { + "epoch": 0.2079658996817947, + "grad_norm": 0.39984187483787537, + "learning_rate": 1.7940510975029278e-05, + "loss": 0.3451, + "step": 11202 + }, + { + "epoch": 0.20800302981921337, + "grad_norm": 0.435496985912323, + "learning_rate": 1.793980186847591e-05, + "loss": 0.2027, + "step": 11204 + }, + { + "epoch": 0.208040159956632, + "grad_norm": 0.4816204607486725, + "learning_rate": 1.7939092653885627e-05, + "loss": 0.527, + "step": 11206 + }, + { + "epoch": 0.20807729009405063, + "grad_norm": 0.38474929332733154, + "learning_rate": 1.793838333126807e-05, + "loss": 0.1692, + "step": 11208 + }, + { + "epoch": 0.20811442023146928, + "grad_norm": 0.39252087473869324, + "learning_rate": 1.7937673900632903e-05, + "loss": 0.3937, + "step": 11210 + }, + { + "epoch": 0.2081515503688879, + "grad_norm": 0.35301506519317627, + "learning_rate": 1.7936964361989758e-05, + "loss": 0.2033, + "step": 11212 + }, + { + "epoch": 0.20818868050630657, + "grad_norm": 0.2924462556838989, + "learning_rate": 1.793625471534831e-05, + "loss": 0.3416, + "step": 11214 + }, + { + "epoch": 0.2082258106437252, + "grad_norm": 0.37668949365615845, + "learning_rate": 1.7935544960718204e-05, + "loss": 0.1794, + "step": 11216 + }, + { + "epoch": 0.20826294078114382, + "grad_norm": 0.2842382490634918, + "learning_rate": 1.79348350981091e-05, + "loss": 0.3269, + "step": 11218 + }, + { + "epoch": 0.20830007091856248, + "grad_norm": 0.27826449275016785, + "learning_rate": 1.7934125127530663e-05, + "loss": 0.3411, + "step": 11220 + }, + { + "epoch": 0.2083372010559811, + "grad_norm": 0.4437578320503235, + "learning_rate": 1.7933415048992543e-05, + "loss": 0.3274, + "step": 11222 + }, + { + "epoch": 0.20837433119339974, + "grad_norm": 0.395443856716156, + "learning_rate": 1.793270486250441e-05, + "loss": 0.3947, + "step": 11224 + }, + { + "epoch": 0.2084114613308184, + "grad_norm": 0.5515168905258179, + "learning_rate": 1.7931994568075925e-05, + "loss": 0.1667, + "step": 11226 + }, + { + "epoch": 0.20844859146823702, + "grad_norm": 0.2250809669494629, + "learning_rate": 1.7931284165716753e-05, + "loss": 0.1925, + "step": 11228 + }, + { + "epoch": 0.20848572160565565, + "grad_norm": 0.46520328521728516, + "learning_rate": 1.793057365543656e-05, + "loss": 0.2073, + "step": 11230 + }, + { + "epoch": 0.2085228517430743, + "grad_norm": 0.22988620400428772, + "learning_rate": 1.7929863037245012e-05, + "loss": 0.295, + "step": 11232 + }, + { + "epoch": 0.20855998188049293, + "grad_norm": 0.27971047163009644, + "learning_rate": 1.7929152311151786e-05, + "loss": 0.406, + "step": 11234 + }, + { + "epoch": 0.2085971120179116, + "grad_norm": 0.34872183203697205, + "learning_rate": 1.7928441477166543e-05, + "loss": 0.3845, + "step": 11236 + }, + { + "epoch": 0.20863424215533022, + "grad_norm": 0.37287500500679016, + "learning_rate": 1.792773053529896e-05, + "loss": 0.2862, + "step": 11238 + }, + { + "epoch": 0.20867137229274885, + "grad_norm": 0.2915292978286743, + "learning_rate": 1.7927019485558715e-05, + "loss": 0.2374, + "step": 11240 + }, + { + "epoch": 0.2087085024301675, + "grad_norm": 0.4057576358318329, + "learning_rate": 1.7926308327955474e-05, + "loss": 0.3982, + "step": 11242 + }, + { + "epoch": 0.20874563256758613, + "grad_norm": 0.49636590480804443, + "learning_rate": 1.7925597062498924e-05, + "loss": 0.3781, + "step": 11244 + }, + { + "epoch": 0.20878276270500476, + "grad_norm": 0.324169397354126, + "learning_rate": 1.7924885689198733e-05, + "loss": 0.3575, + "step": 11246 + }, + { + "epoch": 0.20881989284242342, + "grad_norm": 0.40219929814338684, + "learning_rate": 1.7924174208064586e-05, + "loss": 0.2937, + "step": 11248 + }, + { + "epoch": 0.20885702297984204, + "grad_norm": 0.3900527060031891, + "learning_rate": 1.7923462619106168e-05, + "loss": 0.3158, + "step": 11250 + }, + { + "epoch": 0.2088941531172607, + "grad_norm": 0.2828623354434967, + "learning_rate": 1.7922750922333154e-05, + "loss": 0.3131, + "step": 11252 + }, + { + "epoch": 0.20893128325467933, + "grad_norm": 0.30541741847991943, + "learning_rate": 1.792203911775523e-05, + "loss": 0.2422, + "step": 11254 + }, + { + "epoch": 0.20896841339209796, + "grad_norm": 0.3387317657470703, + "learning_rate": 1.7921327205382086e-05, + "loss": 0.2074, + "step": 11256 + }, + { + "epoch": 0.2090055435295166, + "grad_norm": 0.4591515064239502, + "learning_rate": 1.7920615185223402e-05, + "loss": 0.4596, + "step": 11258 + }, + { + "epoch": 0.20904267366693524, + "grad_norm": 0.368550181388855, + "learning_rate": 1.7919903057288873e-05, + "loss": 0.2687, + "step": 11260 + }, + { + "epoch": 0.20907980380435387, + "grad_norm": 0.4759654402732849, + "learning_rate": 1.7919190821588185e-05, + "loss": 0.516, + "step": 11262 + }, + { + "epoch": 0.20911693394177253, + "grad_norm": 0.3504283130168915, + "learning_rate": 1.7918478478131035e-05, + "loss": 0.1249, + "step": 11264 + }, + { + "epoch": 0.20915406407919115, + "grad_norm": 0.30504560470581055, + "learning_rate": 1.7917766026927106e-05, + "loss": 0.1936, + "step": 11266 + }, + { + "epoch": 0.20919119421660978, + "grad_norm": 0.4589231312274933, + "learning_rate": 1.79170534679861e-05, + "loss": 0.3473, + "step": 11268 + }, + { + "epoch": 0.20922832435402844, + "grad_norm": 0.29722461104393005, + "learning_rate": 1.791634080131771e-05, + "loss": 0.3495, + "step": 11270 + }, + { + "epoch": 0.20926545449144707, + "grad_norm": 0.316755086183548, + "learning_rate": 1.7915628026931634e-05, + "loss": 0.2927, + "step": 11272 + }, + { + "epoch": 0.20930258462886572, + "grad_norm": 0.32517385482788086, + "learning_rate": 1.7914915144837573e-05, + "loss": 0.3272, + "step": 11274 + }, + { + "epoch": 0.20933971476628435, + "grad_norm": 0.24355337023735046, + "learning_rate": 1.7914202155045225e-05, + "loss": 0.344, + "step": 11276 + }, + { + "epoch": 0.20937684490370298, + "grad_norm": 0.25505974888801575, + "learning_rate": 1.791348905756429e-05, + "loss": 0.381, + "step": 11278 + }, + { + "epoch": 0.20941397504112164, + "grad_norm": 0.3841293156147003, + "learning_rate": 1.7912775852404468e-05, + "loss": 0.4498, + "step": 11280 + }, + { + "epoch": 0.20945110517854026, + "grad_norm": 0.4953913390636444, + "learning_rate": 1.7912062539575475e-05, + "loss": 0.3866, + "step": 11282 + }, + { + "epoch": 0.2094882353159589, + "grad_norm": 0.40921443700790405, + "learning_rate": 1.7911349119087005e-05, + "loss": 0.257, + "step": 11284 + }, + { + "epoch": 0.20952536545337755, + "grad_norm": 0.30752265453338623, + "learning_rate": 1.791063559094877e-05, + "loss": 0.2584, + "step": 11286 + }, + { + "epoch": 0.20956249559079618, + "grad_norm": 0.43422332406044006, + "learning_rate": 1.7909921955170486e-05, + "loss": 0.4647, + "step": 11288 + }, + { + "epoch": 0.20959962572821483, + "grad_norm": 0.31949397921562195, + "learning_rate": 1.7909208211761852e-05, + "loss": 0.3347, + "step": 11290 + }, + { + "epoch": 0.20963675586563346, + "grad_norm": 0.3028753697872162, + "learning_rate": 1.7908494360732586e-05, + "loss": 0.2717, + "step": 11292 + }, + { + "epoch": 0.2096738860030521, + "grad_norm": 0.3990941047668457, + "learning_rate": 1.7907780402092405e-05, + "loss": 0.3538, + "step": 11294 + }, + { + "epoch": 0.20971101614047075, + "grad_norm": 0.5162336826324463, + "learning_rate": 1.7907066335851017e-05, + "loss": 0.1953, + "step": 11296 + }, + { + "epoch": 0.20974814627788937, + "grad_norm": 0.2976301908493042, + "learning_rate": 1.790635216201814e-05, + "loss": 0.234, + "step": 11298 + }, + { + "epoch": 0.209785276415308, + "grad_norm": 0.2512798011302948, + "learning_rate": 1.7905637880603487e-05, + "loss": 0.4178, + "step": 11300 + }, + { + "epoch": 0.20982240655272666, + "grad_norm": 0.3172019422054291, + "learning_rate": 1.790492349161679e-05, + "loss": 0.4056, + "step": 11302 + }, + { + "epoch": 0.2098595366901453, + "grad_norm": 0.3491508960723877, + "learning_rate": 1.7904208995067756e-05, + "loss": 0.2989, + "step": 11304 + }, + { + "epoch": 0.20989666682756392, + "grad_norm": 0.3441779315471649, + "learning_rate": 1.7903494390966118e-05, + "loss": 0.2475, + "step": 11306 + }, + { + "epoch": 0.20993379696498257, + "grad_norm": 0.3297508656978607, + "learning_rate": 1.7902779679321593e-05, + "loss": 0.2369, + "step": 11308 + }, + { + "epoch": 0.2099709271024012, + "grad_norm": 0.31899791955947876, + "learning_rate": 1.790206486014391e-05, + "loss": 0.3274, + "step": 11310 + }, + { + "epoch": 0.21000805723981986, + "grad_norm": 0.38884031772613525, + "learning_rate": 1.7901349933442795e-05, + "loss": 0.3682, + "step": 11312 + }, + { + "epoch": 0.21004518737723848, + "grad_norm": 0.2976062595844269, + "learning_rate": 1.790063489922797e-05, + "loss": 0.2472, + "step": 11314 + }, + { + "epoch": 0.2100823175146571, + "grad_norm": 0.4382602274417877, + "learning_rate": 1.789991975750917e-05, + "loss": 0.3804, + "step": 11316 + }, + { + "epoch": 0.21011944765207577, + "grad_norm": 0.40436607599258423, + "learning_rate": 1.7899204508296127e-05, + "loss": 0.4625, + "step": 11318 + }, + { + "epoch": 0.2101565777894944, + "grad_norm": 0.39966943860054016, + "learning_rate": 1.789848915159857e-05, + "loss": 0.365, + "step": 11320 + }, + { + "epoch": 0.21019370792691303, + "grad_norm": 0.2898668348789215, + "learning_rate": 1.7897773687426237e-05, + "loss": 0.3255, + "step": 11322 + }, + { + "epoch": 0.21023083806433168, + "grad_norm": 0.46234452724456787, + "learning_rate": 1.789705811578886e-05, + "loss": 0.3227, + "step": 11324 + }, + { + "epoch": 0.2102679682017503, + "grad_norm": 0.3670711815357208, + "learning_rate": 1.7896342436696176e-05, + "loss": 0.4924, + "step": 11326 + }, + { + "epoch": 0.21030509833916897, + "grad_norm": 0.3221372663974762, + "learning_rate": 1.7895626650157923e-05, + "loss": 0.3204, + "step": 11328 + }, + { + "epoch": 0.2103422284765876, + "grad_norm": 0.3623506724834442, + "learning_rate": 1.789491075618384e-05, + "loss": 0.4174, + "step": 11330 + }, + { + "epoch": 0.21037935861400622, + "grad_norm": 0.28841155767440796, + "learning_rate": 1.7894194754783672e-05, + "loss": 0.4641, + "step": 11332 + }, + { + "epoch": 0.21041648875142488, + "grad_norm": 0.4555869996547699, + "learning_rate": 1.789347864596716e-05, + "loss": 0.3762, + "step": 11334 + }, + { + "epoch": 0.2104536188888435, + "grad_norm": 0.30810022354125977, + "learning_rate": 1.7892762429744045e-05, + "loss": 0.1977, + "step": 11336 + }, + { + "epoch": 0.21049074902626214, + "grad_norm": 0.3749142289161682, + "learning_rate": 1.7892046106124077e-05, + "loss": 0.3887, + "step": 11338 + }, + { + "epoch": 0.2105278791636808, + "grad_norm": 0.3263684809207916, + "learning_rate": 1.7891329675116998e-05, + "loss": 0.4671, + "step": 11340 + }, + { + "epoch": 0.21056500930109942, + "grad_norm": 0.3980671465396881, + "learning_rate": 1.789061313673256e-05, + "loss": 0.3209, + "step": 11342 + }, + { + "epoch": 0.21060213943851805, + "grad_norm": 0.37517815828323364, + "learning_rate": 1.7889896490980515e-05, + "loss": 0.3681, + "step": 11344 + }, + { + "epoch": 0.2106392695759367, + "grad_norm": 0.30738428235054016, + "learning_rate": 1.788917973787061e-05, + "loss": 0.2078, + "step": 11346 + }, + { + "epoch": 0.21067639971335533, + "grad_norm": 0.3333374261856079, + "learning_rate": 1.78884628774126e-05, + "loss": 0.2817, + "step": 11348 + }, + { + "epoch": 0.210713529850774, + "grad_norm": 0.2693551480770111, + "learning_rate": 1.788774590961624e-05, + "loss": 0.2098, + "step": 11350 + }, + { + "epoch": 0.21075065998819262, + "grad_norm": 0.27048808336257935, + "learning_rate": 1.788702883449128e-05, + "loss": 0.379, + "step": 11352 + }, + { + "epoch": 0.21078779012561125, + "grad_norm": 0.2991126477718353, + "learning_rate": 1.788631165204749e-05, + "loss": 0.4559, + "step": 11354 + }, + { + "epoch": 0.2108249202630299, + "grad_norm": 0.3492969572544098, + "learning_rate": 1.7885594362294613e-05, + "loss": 0.2681, + "step": 11356 + }, + { + "epoch": 0.21086205040044853, + "grad_norm": 0.36768993735313416, + "learning_rate": 1.788487696524242e-05, + "loss": 0.5137, + "step": 11358 + }, + { + "epoch": 0.21089918053786716, + "grad_norm": 0.4367530643939972, + "learning_rate": 1.788415946090067e-05, + "loss": 0.3998, + "step": 11360 + }, + { + "epoch": 0.21093631067528582, + "grad_norm": 0.3286070227622986, + "learning_rate": 1.7883441849279124e-05, + "loss": 0.299, + "step": 11362 + }, + { + "epoch": 0.21097344081270444, + "grad_norm": 0.34484922885894775, + "learning_rate": 1.788272413038755e-05, + "loss": 0.1799, + "step": 11364 + }, + { + "epoch": 0.2110105709501231, + "grad_norm": 0.3885658085346222, + "learning_rate": 1.788200630423571e-05, + "loss": 0.3552, + "step": 11366 + }, + { + "epoch": 0.21104770108754173, + "grad_norm": 0.447185754776001, + "learning_rate": 1.7881288370833374e-05, + "loss": 0.3397, + "step": 11368 + }, + { + "epoch": 0.21108483122496036, + "grad_norm": 0.5488476753234863, + "learning_rate": 1.788057033019031e-05, + "loss": 0.3098, + "step": 11370 + }, + { + "epoch": 0.211121961362379, + "grad_norm": 0.20774076879024506, + "learning_rate": 1.787985218231629e-05, + "loss": 0.2344, + "step": 11372 + }, + { + "epoch": 0.21115909149979764, + "grad_norm": 0.3104729950428009, + "learning_rate": 1.7879133927221085e-05, + "loss": 0.3938, + "step": 11374 + }, + { + "epoch": 0.21119622163721627, + "grad_norm": 0.33838367462158203, + "learning_rate": 1.7878415564914468e-05, + "loss": 0.2054, + "step": 11376 + }, + { + "epoch": 0.21123335177463493, + "grad_norm": 0.36007073521614075, + "learning_rate": 1.7877697095406215e-05, + "loss": 0.3108, + "step": 11378 + }, + { + "epoch": 0.21127048191205355, + "grad_norm": 0.2878221869468689, + "learning_rate": 1.78769785187061e-05, + "loss": 0.2944, + "step": 11380 + }, + { + "epoch": 0.21130761204947218, + "grad_norm": 0.5970224142074585, + "learning_rate": 1.7876259834823897e-05, + "loss": 0.4931, + "step": 11382 + }, + { + "epoch": 0.21134474218689084, + "grad_norm": 0.3691718280315399, + "learning_rate": 1.7875541043769395e-05, + "loss": 0.3745, + "step": 11384 + }, + { + "epoch": 0.21138187232430947, + "grad_norm": 0.42721423506736755, + "learning_rate": 1.7874822145552367e-05, + "loss": 0.1797, + "step": 11386 + }, + { + "epoch": 0.21141900246172812, + "grad_norm": 0.3286542594432831, + "learning_rate": 1.7874103140182598e-05, + "loss": 0.22, + "step": 11388 + }, + { + "epoch": 0.21145613259914675, + "grad_norm": 0.3122023046016693, + "learning_rate": 1.787338402766987e-05, + "loss": 0.2754, + "step": 11390 + }, + { + "epoch": 0.21149326273656538, + "grad_norm": 0.2577419579029083, + "learning_rate": 1.7872664808023974e-05, + "loss": 0.4035, + "step": 11392 + }, + { + "epoch": 0.21153039287398404, + "grad_norm": 0.48411402106285095, + "learning_rate": 1.7871945481254685e-05, + "loss": 0.2051, + "step": 11394 + }, + { + "epoch": 0.21156752301140266, + "grad_norm": 0.3355330228805542, + "learning_rate": 1.7871226047371802e-05, + "loss": 0.3139, + "step": 11396 + }, + { + "epoch": 0.2116046531488213, + "grad_norm": 0.33480045199394226, + "learning_rate": 1.7870506506385106e-05, + "loss": 0.3558, + "step": 11398 + }, + { + "epoch": 0.21164178328623995, + "grad_norm": 0.36884260177612305, + "learning_rate": 1.7869786858304392e-05, + "loss": 0.2811, + "step": 11400 + }, + { + "epoch": 0.21167891342365858, + "grad_norm": 0.42031824588775635, + "learning_rate": 1.7869067103139452e-05, + "loss": 0.369, + "step": 11402 + }, + { + "epoch": 0.21171604356107723, + "grad_norm": 0.4800170063972473, + "learning_rate": 1.786834724090008e-05, + "loss": 0.4371, + "step": 11404 + }, + { + "epoch": 0.21175317369849586, + "grad_norm": 0.41438165307044983, + "learning_rate": 1.786762727159607e-05, + "loss": 0.4753, + "step": 11406 + }, + { + "epoch": 0.2117903038359145, + "grad_norm": 0.37987223267555237, + "learning_rate": 1.786690719523722e-05, + "loss": 0.2602, + "step": 11408 + }, + { + "epoch": 0.21182743397333315, + "grad_norm": 0.2816537618637085, + "learning_rate": 1.7866187011833328e-05, + "loss": 0.2211, + "step": 11410 + }, + { + "epoch": 0.21186456411075177, + "grad_norm": 0.4368533790111542, + "learning_rate": 1.786546672139419e-05, + "loss": 0.1751, + "step": 11412 + }, + { + "epoch": 0.2119016942481704, + "grad_norm": 0.274362176656723, + "learning_rate": 1.786474632392961e-05, + "loss": 0.4659, + "step": 11414 + }, + { + "epoch": 0.21193882438558906, + "grad_norm": 0.2870135009288788, + "learning_rate": 1.7864025819449393e-05, + "loss": 0.1647, + "step": 11416 + }, + { + "epoch": 0.2119759545230077, + "grad_norm": 0.3261653184890747, + "learning_rate": 1.786330520796334e-05, + "loss": 0.1533, + "step": 11418 + }, + { + "epoch": 0.21201308466042632, + "grad_norm": 0.5299916863441467, + "learning_rate": 1.7862584489481252e-05, + "loss": 0.379, + "step": 11420 + }, + { + "epoch": 0.21205021479784497, + "grad_norm": 0.26870614290237427, + "learning_rate": 1.786186366401294e-05, + "loss": 0.5631, + "step": 11422 + }, + { + "epoch": 0.2120873449352636, + "grad_norm": 0.25638487935066223, + "learning_rate": 1.7861142731568217e-05, + "loss": 0.2603, + "step": 11424 + }, + { + "epoch": 0.21212447507268226, + "grad_norm": 0.3180806636810303, + "learning_rate": 1.7860421692156883e-05, + "loss": 0.3897, + "step": 11426 + }, + { + "epoch": 0.21216160521010088, + "grad_norm": 0.3051069974899292, + "learning_rate": 1.785970054578876e-05, + "loss": 0.3624, + "step": 11428 + }, + { + "epoch": 0.2121987353475195, + "grad_norm": 0.317199170589447, + "learning_rate": 1.7858979292473652e-05, + "loss": 0.3877, + "step": 11430 + }, + { + "epoch": 0.21223586548493817, + "grad_norm": 0.4193689823150635, + "learning_rate": 1.785825793222138e-05, + "loss": 0.3963, + "step": 11432 + }, + { + "epoch": 0.2122729956223568, + "grad_norm": 0.397788405418396, + "learning_rate": 1.7857536465041754e-05, + "loss": 0.1269, + "step": 11434 + }, + { + "epoch": 0.21231012575977543, + "grad_norm": 0.420858234167099, + "learning_rate": 1.7856814890944592e-05, + "loss": 0.3157, + "step": 11436 + }, + { + "epoch": 0.21234725589719408, + "grad_norm": 0.4014202356338501, + "learning_rate": 1.7856093209939712e-05, + "loss": 0.5307, + "step": 11438 + }, + { + "epoch": 0.2123843860346127, + "grad_norm": 0.4186968207359314, + "learning_rate": 1.7855371422036937e-05, + "loss": 0.3368, + "step": 11440 + }, + { + "epoch": 0.21242151617203137, + "grad_norm": 0.4789351224899292, + "learning_rate": 1.7854649527246084e-05, + "loss": 0.2259, + "step": 11442 + }, + { + "epoch": 0.21245864630945, + "grad_norm": 0.44176778197288513, + "learning_rate": 1.785392752557698e-05, + "loss": 0.402, + "step": 11444 + }, + { + "epoch": 0.21249577644686862, + "grad_norm": 0.3761562705039978, + "learning_rate": 1.7853205417039447e-05, + "loss": 0.1352, + "step": 11446 + }, + { + "epoch": 0.21253290658428728, + "grad_norm": 0.3794058561325073, + "learning_rate": 1.7852483201643314e-05, + "loss": 0.195, + "step": 11448 + }, + { + "epoch": 0.2125700367217059, + "grad_norm": 0.28392860293388367, + "learning_rate": 1.7851760879398403e-05, + "loss": 0.3234, + "step": 11450 + }, + { + "epoch": 0.21260716685912454, + "grad_norm": 0.4184112548828125, + "learning_rate": 1.7851038450314546e-05, + "loss": 0.2738, + "step": 11452 + }, + { + "epoch": 0.2126442969965432, + "grad_norm": 0.36601537466049194, + "learning_rate": 1.785031591440157e-05, + "loss": 0.2953, + "step": 11454 + }, + { + "epoch": 0.21268142713396182, + "grad_norm": 0.30151498317718506, + "learning_rate": 1.784959327166931e-05, + "loss": 0.1882, + "step": 11456 + }, + { + "epoch": 0.21271855727138045, + "grad_norm": 0.29366907477378845, + "learning_rate": 1.7848870522127598e-05, + "loss": 0.2897, + "step": 11458 + }, + { + "epoch": 0.2127556874087991, + "grad_norm": 0.5849746465682983, + "learning_rate": 1.7848147665786272e-05, + "loss": 0.343, + "step": 11460 + }, + { + "epoch": 0.21279281754621773, + "grad_norm": 0.3080577552318573, + "learning_rate": 1.7847424702655162e-05, + "loss": 0.3845, + "step": 11462 + }, + { + "epoch": 0.2128299476836364, + "grad_norm": 0.5569397211074829, + "learning_rate": 1.7846701632744104e-05, + "loss": 0.5116, + "step": 11464 + }, + { + "epoch": 0.21286707782105502, + "grad_norm": 0.35806289315223694, + "learning_rate": 1.7845978456062944e-05, + "loss": 0.3764, + "step": 11466 + }, + { + "epoch": 0.21290420795847365, + "grad_norm": 0.33247724175453186, + "learning_rate": 1.7845255172621517e-05, + "loss": 0.2466, + "step": 11468 + }, + { + "epoch": 0.2129413380958923, + "grad_norm": 0.4600261449813843, + "learning_rate": 1.784453178242967e-05, + "loss": 0.4468, + "step": 11470 + }, + { + "epoch": 0.21297846823331093, + "grad_norm": 0.49692943692207336, + "learning_rate": 1.7843808285497238e-05, + "loss": 0.2575, + "step": 11472 + }, + { + "epoch": 0.21301559837072956, + "grad_norm": 0.3204275369644165, + "learning_rate": 1.784308468183407e-05, + "loss": 0.2747, + "step": 11474 + }, + { + "epoch": 0.21305272850814821, + "grad_norm": 0.35899871587753296, + "learning_rate": 1.7842360971450013e-05, + "loss": 0.2242, + "step": 11476 + }, + { + "epoch": 0.21308985864556684, + "grad_norm": 0.3258490562438965, + "learning_rate": 1.7841637154354916e-05, + "loss": 0.3945, + "step": 11478 + }, + { + "epoch": 0.2131269887829855, + "grad_norm": 0.40194201469421387, + "learning_rate": 1.7840913230558624e-05, + "loss": 0.2938, + "step": 11480 + }, + { + "epoch": 0.21316411892040413, + "grad_norm": 0.48156869411468506, + "learning_rate": 1.784018920007099e-05, + "loss": 0.3341, + "step": 11482 + }, + { + "epoch": 0.21320124905782276, + "grad_norm": 0.48903074860572815, + "learning_rate": 1.7839465062901865e-05, + "loss": 0.4978, + "step": 11484 + }, + { + "epoch": 0.2132383791952414, + "grad_norm": 0.46952056884765625, + "learning_rate": 1.7838740819061102e-05, + "loss": 0.1329, + "step": 11486 + }, + { + "epoch": 0.21327550933266004, + "grad_norm": 0.4599023461341858, + "learning_rate": 1.7838016468558557e-05, + "loss": 0.297, + "step": 11488 + }, + { + "epoch": 0.21331263947007867, + "grad_norm": 0.40903759002685547, + "learning_rate": 1.7837292011404083e-05, + "loss": 0.2725, + "step": 11490 + }, + { + "epoch": 0.21334976960749732, + "grad_norm": 0.391618937253952, + "learning_rate": 1.7836567447607542e-05, + "loss": 0.3916, + "step": 11492 + }, + { + "epoch": 0.21338689974491595, + "grad_norm": 2.1403417587280273, + "learning_rate": 1.7835842777178792e-05, + "loss": 0.3439, + "step": 11494 + }, + { + "epoch": 0.21342402988233458, + "grad_norm": 0.2944311499595642, + "learning_rate": 1.7835118000127693e-05, + "loss": 0.4008, + "step": 11496 + }, + { + "epoch": 0.21346116001975324, + "grad_norm": 0.42583248019218445, + "learning_rate": 1.7834393116464107e-05, + "loss": 0.2525, + "step": 11498 + }, + { + "epoch": 0.21349829015717187, + "grad_norm": 0.43102923035621643, + "learning_rate": 1.7833668126197895e-05, + "loss": 0.2459, + "step": 11500 + }, + { + "epoch": 0.21353542029459052, + "grad_norm": 0.31348541378974915, + "learning_rate": 1.7832943029338925e-05, + "loss": 0.1969, + "step": 11502 + }, + { + "epoch": 0.21357255043200915, + "grad_norm": 0.4670495092868805, + "learning_rate": 1.7832217825897065e-05, + "loss": 0.2661, + "step": 11504 + }, + { + "epoch": 0.21360968056942778, + "grad_norm": 0.44927075505256653, + "learning_rate": 1.783149251588218e-05, + "loss": 0.4168, + "step": 11506 + }, + { + "epoch": 0.21364681070684644, + "grad_norm": 0.3722849488258362, + "learning_rate": 1.7830767099304135e-05, + "loss": 0.3584, + "step": 11508 + }, + { + "epoch": 0.21368394084426506, + "grad_norm": 0.28457188606262207, + "learning_rate": 1.7830041576172813e-05, + "loss": 0.1731, + "step": 11510 + }, + { + "epoch": 0.2137210709816837, + "grad_norm": 0.31138989329338074, + "learning_rate": 1.7829315946498075e-05, + "loss": 0.4616, + "step": 11512 + }, + { + "epoch": 0.21375820111910235, + "grad_norm": 0.3556770980358124, + "learning_rate": 1.7828590210289797e-05, + "loss": 0.3717, + "step": 11514 + }, + { + "epoch": 0.21379533125652098, + "grad_norm": 0.35543492436408997, + "learning_rate": 1.7827864367557856e-05, + "loss": 0.2778, + "step": 11516 + }, + { + "epoch": 0.21383246139393963, + "grad_norm": 0.6743049621582031, + "learning_rate": 1.7827138418312132e-05, + "loss": 0.3329, + "step": 11518 + }, + { + "epoch": 0.21386959153135826, + "grad_norm": 0.2347886711359024, + "learning_rate": 1.7826412362562497e-05, + "loss": 0.3199, + "step": 11520 + }, + { + "epoch": 0.2139067216687769, + "grad_norm": 0.3072112798690796, + "learning_rate": 1.7825686200318833e-05, + "loss": 0.4713, + "step": 11522 + }, + { + "epoch": 0.21394385180619555, + "grad_norm": 0.26356202363967896, + "learning_rate": 1.782495993159102e-05, + "loss": 0.3933, + "step": 11524 + }, + { + "epoch": 0.21398098194361417, + "grad_norm": 0.30975306034088135, + "learning_rate": 1.782423355638894e-05, + "loss": 0.3039, + "step": 11526 + }, + { + "epoch": 0.2140181120810328, + "grad_norm": 0.5246520042419434, + "learning_rate": 1.7823507074722477e-05, + "loss": 0.3466, + "step": 11528 + }, + { + "epoch": 0.21405524221845146, + "grad_norm": 0.245462104678154, + "learning_rate": 1.782278048660152e-05, + "loss": 0.1666, + "step": 11530 + }, + { + "epoch": 0.2140923723558701, + "grad_norm": 0.5315453410148621, + "learning_rate": 1.782205379203595e-05, + "loss": 0.3585, + "step": 11532 + }, + { + "epoch": 0.21412950249328871, + "grad_norm": 0.5651087760925293, + "learning_rate": 1.7821326991035656e-05, + "loss": 0.2643, + "step": 11534 + }, + { + "epoch": 0.21416663263070737, + "grad_norm": 0.4269042909145355, + "learning_rate": 1.7820600083610534e-05, + "loss": 0.6047, + "step": 11536 + }, + { + "epoch": 0.214203762768126, + "grad_norm": 0.29089438915252686, + "learning_rate": 1.7819873069770464e-05, + "loss": 0.3744, + "step": 11538 + }, + { + "epoch": 0.21424089290554466, + "grad_norm": 0.5004473924636841, + "learning_rate": 1.781914594952535e-05, + "loss": 0.433, + "step": 11540 + }, + { + "epoch": 0.21427802304296328, + "grad_norm": 0.24751044809818268, + "learning_rate": 1.781841872288508e-05, + "loss": 0.2993, + "step": 11542 + }, + { + "epoch": 0.2143151531803819, + "grad_norm": 0.3154331147670746, + "learning_rate": 1.781769138985955e-05, + "loss": 0.2875, + "step": 11544 + }, + { + "epoch": 0.21435228331780057, + "grad_norm": 0.43083828687667847, + "learning_rate": 1.7816963950458656e-05, + "loss": 0.248, + "step": 11546 + }, + { + "epoch": 0.2143894134552192, + "grad_norm": 0.4418305456638336, + "learning_rate": 1.7816236404692294e-05, + "loss": 0.335, + "step": 11548 + }, + { + "epoch": 0.21442654359263782, + "grad_norm": 0.45585572719573975, + "learning_rate": 1.7815508752570374e-05, + "loss": 0.2524, + "step": 11550 + }, + { + "epoch": 0.21446367373005648, + "grad_norm": 0.4467269480228424, + "learning_rate": 1.7814780994102786e-05, + "loss": 0.2737, + "step": 11552 + }, + { + "epoch": 0.2145008038674751, + "grad_norm": 0.45708996057510376, + "learning_rate": 1.7814053129299435e-05, + "loss": 0.481, + "step": 11554 + }, + { + "epoch": 0.21453793400489377, + "grad_norm": 0.3335057497024536, + "learning_rate": 1.781332515817023e-05, + "loss": 0.4306, + "step": 11556 + }, + { + "epoch": 0.2145750641423124, + "grad_norm": 0.4115939140319824, + "learning_rate": 1.781259708072507e-05, + "loss": 0.2511, + "step": 11558 + }, + { + "epoch": 0.21461219427973102, + "grad_norm": 0.5011863708496094, + "learning_rate": 1.7811868896973868e-05, + "loss": 0.3784, + "step": 11560 + }, + { + "epoch": 0.21464932441714968, + "grad_norm": 0.37148159742355347, + "learning_rate": 1.781114060692653e-05, + "loss": 0.1917, + "step": 11562 + }, + { + "epoch": 0.2146864545545683, + "grad_norm": 0.44088369607925415, + "learning_rate": 1.781041221059296e-05, + "loss": 0.174, + "step": 11564 + }, + { + "epoch": 0.21472358469198694, + "grad_norm": 0.3805510103702545, + "learning_rate": 1.780968370798308e-05, + "loss": 0.4689, + "step": 11566 + }, + { + "epoch": 0.2147607148294056, + "grad_norm": 0.3723870515823364, + "learning_rate": 1.7808955099106797e-05, + "loss": 0.1415, + "step": 11568 + }, + { + "epoch": 0.21479784496682422, + "grad_norm": 0.3652951717376709, + "learning_rate": 1.7808226383974022e-05, + "loss": 0.3654, + "step": 11570 + }, + { + "epoch": 0.21483497510424285, + "grad_norm": 0.42163509130477905, + "learning_rate": 1.780749756259468e-05, + "loss": 0.3133, + "step": 11572 + }, + { + "epoch": 0.2148721052416615, + "grad_norm": 0.3045058250427246, + "learning_rate": 1.780676863497868e-05, + "loss": 0.2281, + "step": 11574 + }, + { + "epoch": 0.21490923537908013, + "grad_norm": 0.3182167112827301, + "learning_rate": 1.780603960113594e-05, + "loss": 0.3289, + "step": 11576 + }, + { + "epoch": 0.2149463655164988, + "grad_norm": 0.38915860652923584, + "learning_rate": 1.7805310461076386e-05, + "loss": 0.449, + "step": 11578 + }, + { + "epoch": 0.21498349565391742, + "grad_norm": 0.5140718817710876, + "learning_rate": 1.7804581214809934e-05, + "loss": 0.4113, + "step": 11580 + }, + { + "epoch": 0.21502062579133605, + "grad_norm": 0.3171132802963257, + "learning_rate": 1.780385186234651e-05, + "loss": 0.2031, + "step": 11582 + }, + { + "epoch": 0.2150577559287547, + "grad_norm": 0.32669177651405334, + "learning_rate": 1.7803122403696037e-05, + "loss": 0.4019, + "step": 11584 + }, + { + "epoch": 0.21509488606617333, + "grad_norm": 0.36177048087120056, + "learning_rate": 1.7802392838868443e-05, + "loss": 0.2707, + "step": 11586 + }, + { + "epoch": 0.21513201620359196, + "grad_norm": 0.47265127301216125, + "learning_rate": 1.7801663167873654e-05, + "loss": 0.4945, + "step": 11588 + }, + { + "epoch": 0.21516914634101061, + "grad_norm": 0.48518139123916626, + "learning_rate": 1.7800933390721592e-05, + "loss": 0.4166, + "step": 11590 + }, + { + "epoch": 0.21520627647842924, + "grad_norm": 0.34591609239578247, + "learning_rate": 1.7800203507422196e-05, + "loss": 0.2916, + "step": 11592 + }, + { + "epoch": 0.2152434066158479, + "grad_norm": 0.3669845461845398, + "learning_rate": 1.7799473517985398e-05, + "loss": 0.4412, + "step": 11594 + }, + { + "epoch": 0.21528053675326653, + "grad_norm": 0.32830438017845154, + "learning_rate": 1.7798743422421125e-05, + "loss": 0.2918, + "step": 11596 + }, + { + "epoch": 0.21531766689068516, + "grad_norm": 0.3107183873653412, + "learning_rate": 1.7798013220739317e-05, + "loss": 0.3959, + "step": 11598 + }, + { + "epoch": 0.2153547970281038, + "grad_norm": 0.7919156551361084, + "learning_rate": 1.7797282912949902e-05, + "loss": 0.236, + "step": 11600 + }, + { + "epoch": 0.21539192716552244, + "grad_norm": 0.3440660834312439, + "learning_rate": 1.7796552499062824e-05, + "loss": 0.4059, + "step": 11602 + }, + { + "epoch": 0.21542905730294107, + "grad_norm": 0.3416938781738281, + "learning_rate": 1.779582197908802e-05, + "loss": 0.151, + "step": 11604 + }, + { + "epoch": 0.21546618744035972, + "grad_norm": 0.3433080017566681, + "learning_rate": 1.7795091353035432e-05, + "loss": 0.2142, + "step": 11606 + }, + { + "epoch": 0.21550331757777835, + "grad_norm": 0.3219144642353058, + "learning_rate": 1.7794360620914996e-05, + "loss": 0.2532, + "step": 11608 + }, + { + "epoch": 0.21554044771519698, + "grad_norm": 0.3427276611328125, + "learning_rate": 1.779362978273666e-05, + "loss": 0.3072, + "step": 11610 + }, + { + "epoch": 0.21557757785261564, + "grad_norm": 0.29735422134399414, + "learning_rate": 1.7792898838510368e-05, + "loss": 0.3904, + "step": 11612 + }, + { + "epoch": 0.21561470799003427, + "grad_norm": 0.3915838599205017, + "learning_rate": 1.7792167788246067e-05, + "loss": 0.3779, + "step": 11614 + }, + { + "epoch": 0.21565183812745292, + "grad_norm": 0.434965580701828, + "learning_rate": 1.7791436631953696e-05, + "loss": 0.4514, + "step": 11616 + }, + { + "epoch": 0.21568896826487155, + "grad_norm": 0.43974030017852783, + "learning_rate": 1.779070536964322e-05, + "loss": 0.2888, + "step": 11618 + }, + { + "epoch": 0.21572609840229018, + "grad_norm": 0.3089367747306824, + "learning_rate": 1.7789974001324576e-05, + "loss": 0.2847, + "step": 11620 + }, + { + "epoch": 0.21576322853970883, + "grad_norm": 0.43584901094436646, + "learning_rate": 1.7789242527007715e-05, + "loss": 0.3013, + "step": 11622 + }, + { + "epoch": 0.21580035867712746, + "grad_norm": 0.3898523449897766, + "learning_rate": 1.77885109467026e-05, + "loss": 0.4838, + "step": 11624 + }, + { + "epoch": 0.2158374888145461, + "grad_norm": 0.4706246852874756, + "learning_rate": 1.7787779260419177e-05, + "loss": 0.3107, + "step": 11626 + }, + { + "epoch": 0.21587461895196475, + "grad_norm": 0.2623288929462433, + "learning_rate": 1.778704746816741e-05, + "loss": 0.4537, + "step": 11628 + }, + { + "epoch": 0.21591174908938338, + "grad_norm": 0.3656718134880066, + "learning_rate": 1.7786315569957246e-05, + "loss": 0.3526, + "step": 11630 + }, + { + "epoch": 0.21594887922680203, + "grad_norm": 0.3711652457714081, + "learning_rate": 1.778558356579865e-05, + "loss": 0.2342, + "step": 11632 + }, + { + "epoch": 0.21598600936422066, + "grad_norm": 0.31929171085357666, + "learning_rate": 1.7784851455701587e-05, + "loss": 0.4484, + "step": 11634 + }, + { + "epoch": 0.2160231395016393, + "grad_norm": 0.5670583844184875, + "learning_rate": 1.7784119239676012e-05, + "loss": 0.2593, + "step": 11636 + }, + { + "epoch": 0.21606026963905794, + "grad_norm": 0.6417336463928223, + "learning_rate": 1.7783386917731892e-05, + "loss": 0.4084, + "step": 11638 + }, + { + "epoch": 0.21609739977647657, + "grad_norm": 0.34550225734710693, + "learning_rate": 1.7782654489879187e-05, + "loss": 0.1716, + "step": 11640 + }, + { + "epoch": 0.2161345299138952, + "grad_norm": 0.33153343200683594, + "learning_rate": 1.7781921956127868e-05, + "loss": 0.2753, + "step": 11642 + }, + { + "epoch": 0.21617166005131386, + "grad_norm": 0.38554567098617554, + "learning_rate": 1.77811893164879e-05, + "loss": 0.2786, + "step": 11644 + }, + { + "epoch": 0.21620879018873249, + "grad_norm": 0.45013391971588135, + "learning_rate": 1.778045657096925e-05, + "loss": 0.5178, + "step": 11646 + }, + { + "epoch": 0.21624592032615111, + "grad_norm": 0.3294655382633209, + "learning_rate": 1.7779723719581893e-05, + "loss": 0.2283, + "step": 11648 + }, + { + "epoch": 0.21628305046356977, + "grad_norm": 0.31702443957328796, + "learning_rate": 1.77789907623358e-05, + "loss": 0.4256, + "step": 11650 + }, + { + "epoch": 0.2163201806009884, + "grad_norm": 0.4021613597869873, + "learning_rate": 1.7778257699240946e-05, + "loss": 0.5154, + "step": 11652 + }, + { + "epoch": 0.21635731073840705, + "grad_norm": 0.30701741576194763, + "learning_rate": 1.77775245303073e-05, + "loss": 0.1619, + "step": 11654 + }, + { + "epoch": 0.21639444087582568, + "grad_norm": 0.27280837297439575, + "learning_rate": 1.7776791255544844e-05, + "loss": 0.3622, + "step": 11656 + }, + { + "epoch": 0.2164315710132443, + "grad_norm": 1.4822739362716675, + "learning_rate": 1.777605787496355e-05, + "loss": 0.3385, + "step": 11658 + }, + { + "epoch": 0.21646870115066297, + "grad_norm": 0.3658817410469055, + "learning_rate": 1.77753243885734e-05, + "loss": 0.3216, + "step": 11660 + }, + { + "epoch": 0.2165058312880816, + "grad_norm": 0.39698219299316406, + "learning_rate": 1.7774590796384382e-05, + "loss": 0.4169, + "step": 11662 + }, + { + "epoch": 0.21654296142550022, + "grad_norm": 0.2685529887676239, + "learning_rate": 1.7773857098406463e-05, + "loss": 0.3702, + "step": 11664 + }, + { + "epoch": 0.21658009156291888, + "grad_norm": 0.282993346452713, + "learning_rate": 1.7773123294649637e-05, + "loss": 0.3329, + "step": 11666 + }, + { + "epoch": 0.2166172217003375, + "grad_norm": 0.30182579159736633, + "learning_rate": 1.7772389385123885e-05, + "loss": 0.2155, + "step": 11668 + }, + { + "epoch": 0.21665435183775617, + "grad_norm": 0.3884281516075134, + "learning_rate": 1.7771655369839194e-05, + "loss": 0.2903, + "step": 11670 + }, + { + "epoch": 0.2166914819751748, + "grad_norm": 0.3103363513946533, + "learning_rate": 1.7770921248805554e-05, + "loss": 0.3722, + "step": 11672 + }, + { + "epoch": 0.21672861211259342, + "grad_norm": 0.6926793456077576, + "learning_rate": 1.7770187022032952e-05, + "loss": 0.2255, + "step": 11674 + }, + { + "epoch": 0.21676574225001208, + "grad_norm": 0.39647847414016724, + "learning_rate": 1.776945268953138e-05, + "loss": 0.3133, + "step": 11676 + }, + { + "epoch": 0.2168028723874307, + "grad_norm": 0.29309317469596863, + "learning_rate": 1.776871825131083e-05, + "loss": 0.394, + "step": 11678 + }, + { + "epoch": 0.21684000252484933, + "grad_norm": 0.4459497332572937, + "learning_rate": 1.776798370738129e-05, + "loss": 0.4171, + "step": 11680 + }, + { + "epoch": 0.216877132662268, + "grad_norm": 0.3338577151298523, + "learning_rate": 1.7767249057752765e-05, + "loss": 0.2979, + "step": 11682 + }, + { + "epoch": 0.21691426279968662, + "grad_norm": 0.44793543219566345, + "learning_rate": 1.7766514302435243e-05, + "loss": 0.3787, + "step": 11684 + }, + { + "epoch": 0.21695139293710525, + "grad_norm": 0.30891355872154236, + "learning_rate": 1.7765779441438726e-05, + "loss": 0.1643, + "step": 11686 + }, + { + "epoch": 0.2169885230745239, + "grad_norm": 0.5312884449958801, + "learning_rate": 1.7765044474773213e-05, + "loss": 0.2262, + "step": 11688 + }, + { + "epoch": 0.21702565321194253, + "grad_norm": 0.3190726637840271, + "learning_rate": 1.77643094024487e-05, + "loss": 0.2582, + "step": 11690 + }, + { + "epoch": 0.2170627833493612, + "grad_norm": 0.4413638114929199, + "learning_rate": 1.7763574224475192e-05, + "loss": 0.2651, + "step": 11692 + }, + { + "epoch": 0.21709991348677982, + "grad_norm": 0.32466989755630493, + "learning_rate": 1.77628389408627e-05, + "loss": 0.2543, + "step": 11694 + }, + { + "epoch": 0.21713704362419844, + "grad_norm": 0.3333269953727722, + "learning_rate": 1.7762103551621214e-05, + "loss": 0.5586, + "step": 11696 + }, + { + "epoch": 0.2171741737616171, + "grad_norm": 0.33949244022369385, + "learning_rate": 1.7761368056760753e-05, + "loss": 0.4606, + "step": 11698 + }, + { + "epoch": 0.21721130389903573, + "grad_norm": 0.3035808801651001, + "learning_rate": 1.7760632456291324e-05, + "loss": 0.4636, + "step": 11700 + }, + { + "epoch": 0.21724843403645436, + "grad_norm": 0.2670368552207947, + "learning_rate": 1.7759896750222927e-05, + "loss": 0.1996, + "step": 11702 + }, + { + "epoch": 0.217285564173873, + "grad_norm": 0.33414679765701294, + "learning_rate": 1.7759160938565586e-05, + "loss": 0.4058, + "step": 11704 + }, + { + "epoch": 0.21732269431129164, + "grad_norm": 0.32663220167160034, + "learning_rate": 1.77584250213293e-05, + "loss": 0.3148, + "step": 11706 + }, + { + "epoch": 0.2173598244487103, + "grad_norm": 0.28434255719184875, + "learning_rate": 1.775768899852409e-05, + "loss": 0.2994, + "step": 11708 + }, + { + "epoch": 0.21739695458612893, + "grad_norm": 0.43545177578926086, + "learning_rate": 1.775695287015997e-05, + "loss": 0.3458, + "step": 11710 + }, + { + "epoch": 0.21743408472354755, + "grad_norm": 0.5098344087600708, + "learning_rate": 1.775621663624696e-05, + "loss": 0.445, + "step": 11712 + }, + { + "epoch": 0.2174712148609662, + "grad_norm": 0.3793988525867462, + "learning_rate": 1.7755480296795068e-05, + "loss": 0.3367, + "step": 11714 + }, + { + "epoch": 0.21750834499838484, + "grad_norm": 0.24274712800979614, + "learning_rate": 1.7754743851814322e-05, + "loss": 0.345, + "step": 11716 + }, + { + "epoch": 0.21754547513580347, + "grad_norm": 0.28071388602256775, + "learning_rate": 1.775400730131474e-05, + "loss": 0.3141, + "step": 11718 + }, + { + "epoch": 0.21758260527322212, + "grad_norm": 0.3397105932235718, + "learning_rate": 1.7753270645306345e-05, + "loss": 0.3611, + "step": 11720 + }, + { + "epoch": 0.21761973541064075, + "grad_norm": 0.3186165988445282, + "learning_rate": 1.775253388379916e-05, + "loss": 0.1774, + "step": 11722 + }, + { + "epoch": 0.21765686554805938, + "grad_norm": 0.42826539278030396, + "learning_rate": 1.7751797016803213e-05, + "loss": 0.3379, + "step": 11724 + }, + { + "epoch": 0.21769399568547804, + "grad_norm": 0.4246998727321625, + "learning_rate": 1.7751060044328525e-05, + "loss": 0.5351, + "step": 11726 + }, + { + "epoch": 0.21773112582289667, + "grad_norm": 0.32764923572540283, + "learning_rate": 1.7750322966385126e-05, + "loss": 0.222, + "step": 11728 + }, + { + "epoch": 0.21776825596031532, + "grad_norm": 0.5281040072441101, + "learning_rate": 1.774958578298305e-05, + "loss": 0.511, + "step": 11730 + }, + { + "epoch": 0.21780538609773395, + "grad_norm": 0.3740936517715454, + "learning_rate": 1.774884849413232e-05, + "loss": 0.4096, + "step": 11732 + }, + { + "epoch": 0.21784251623515258, + "grad_norm": 0.8390377759933472, + "learning_rate": 1.7748111099842976e-05, + "loss": 0.3183, + "step": 11734 + }, + { + "epoch": 0.21787964637257123, + "grad_norm": 0.2964892089366913, + "learning_rate": 1.7747373600125044e-05, + "loss": 0.3896, + "step": 11736 + }, + { + "epoch": 0.21791677650998986, + "grad_norm": 0.3244796097278595, + "learning_rate": 1.774663599498857e-05, + "loss": 0.2406, + "step": 11738 + }, + { + "epoch": 0.2179539066474085, + "grad_norm": 0.36529427766799927, + "learning_rate": 1.7745898284443578e-05, + "loss": 0.3456, + "step": 11740 + }, + { + "epoch": 0.21799103678482715, + "grad_norm": 0.39138689637184143, + "learning_rate": 1.7745160468500115e-05, + "loss": 0.2657, + "step": 11742 + }, + { + "epoch": 0.21802816692224578, + "grad_norm": 0.361465722322464, + "learning_rate": 1.7744422547168216e-05, + "loss": 0.1946, + "step": 11744 + }, + { + "epoch": 0.21806529705966443, + "grad_norm": 0.3145333528518677, + "learning_rate": 1.774368452045792e-05, + "loss": 0.3187, + "step": 11746 + }, + { + "epoch": 0.21810242719708306, + "grad_norm": 0.3784826099872589, + "learning_rate": 1.774294638837928e-05, + "loss": 0.4318, + "step": 11748 + }, + { + "epoch": 0.2181395573345017, + "grad_norm": 0.3336193859577179, + "learning_rate": 1.7742208150942328e-05, + "loss": 0.1942, + "step": 11750 + }, + { + "epoch": 0.21817668747192034, + "grad_norm": 0.2890664041042328, + "learning_rate": 1.7741469808157114e-05, + "loss": 0.2165, + "step": 11752 + }, + { + "epoch": 0.21821381760933897, + "grad_norm": 0.5425647497177124, + "learning_rate": 1.7740731360033688e-05, + "loss": 0.2951, + "step": 11754 + }, + { + "epoch": 0.2182509477467576, + "grad_norm": 0.4762794077396393, + "learning_rate": 1.773999280658209e-05, + "loss": 0.384, + "step": 11756 + }, + { + "epoch": 0.21828807788417626, + "grad_norm": 0.298922061920166, + "learning_rate": 1.773925414781237e-05, + "loss": 0.4055, + "step": 11758 + }, + { + "epoch": 0.21832520802159489, + "grad_norm": 0.41174009442329407, + "learning_rate": 1.7738515383734593e-05, + "loss": 0.2703, + "step": 11760 + }, + { + "epoch": 0.2183623381590135, + "grad_norm": 0.3122740387916565, + "learning_rate": 1.7737776514358795e-05, + "loss": 0.2366, + "step": 11762 + }, + { + "epoch": 0.21839946829643217, + "grad_norm": 0.5369684100151062, + "learning_rate": 1.7737037539695036e-05, + "loss": 0.2402, + "step": 11764 + }, + { + "epoch": 0.2184365984338508, + "grad_norm": 0.36874696612358093, + "learning_rate": 1.7736298459753373e-05, + "loss": 0.3595, + "step": 11766 + }, + { + "epoch": 0.21847372857126945, + "grad_norm": 0.3430725634098053, + "learning_rate": 1.773555927454386e-05, + "loss": 0.4032, + "step": 11768 + }, + { + "epoch": 0.21851085870868808, + "grad_norm": 0.33281052112579346, + "learning_rate": 1.7734819984076556e-05, + "loss": 0.2263, + "step": 11770 + }, + { + "epoch": 0.2185479888461067, + "grad_norm": 0.32666975259780884, + "learning_rate": 1.7734080588361522e-05, + "loss": 0.3449, + "step": 11772 + }, + { + "epoch": 0.21858511898352537, + "grad_norm": 0.31406232714653015, + "learning_rate": 1.7733341087408817e-05, + "loss": 0.1904, + "step": 11774 + }, + { + "epoch": 0.218622249120944, + "grad_norm": 0.3264770805835724, + "learning_rate": 1.77326014812285e-05, + "loss": 0.3706, + "step": 11776 + }, + { + "epoch": 0.21865937925836262, + "grad_norm": 0.3386581838130951, + "learning_rate": 1.7731861769830643e-05, + "loss": 0.3241, + "step": 11778 + }, + { + "epoch": 0.21869650939578128, + "grad_norm": 0.44383057951927185, + "learning_rate": 1.7731121953225306e-05, + "loss": 0.2901, + "step": 11780 + }, + { + "epoch": 0.2187336395331999, + "grad_norm": 0.28619638085365295, + "learning_rate": 1.7730382031422554e-05, + "loss": 0.3034, + "step": 11782 + }, + { + "epoch": 0.21877076967061856, + "grad_norm": 0.479587197303772, + "learning_rate": 1.772964200443246e-05, + "loss": 0.1831, + "step": 11784 + }, + { + "epoch": 0.2188078998080372, + "grad_norm": 0.4370276629924774, + "learning_rate": 1.772890187226509e-05, + "loss": 0.538, + "step": 11786 + }, + { + "epoch": 0.21884502994545582, + "grad_norm": 0.3278469443321228, + "learning_rate": 1.7728161634930518e-05, + "loss": 0.3296, + "step": 11788 + }, + { + "epoch": 0.21888216008287448, + "grad_norm": 0.6768957376480103, + "learning_rate": 1.7727421292438816e-05, + "loss": 0.4059, + "step": 11790 + }, + { + "epoch": 0.2189192902202931, + "grad_norm": 0.49905920028686523, + "learning_rate": 1.7726680844800053e-05, + "loss": 0.5475, + "step": 11792 + }, + { + "epoch": 0.21895642035771173, + "grad_norm": 0.2606540024280548, + "learning_rate": 1.772594029202431e-05, + "loss": 0.2928, + "step": 11794 + }, + { + "epoch": 0.2189935504951304, + "grad_norm": 0.315047949552536, + "learning_rate": 1.7725199634121663e-05, + "loss": 0.2818, + "step": 11796 + }, + { + "epoch": 0.21903068063254902, + "grad_norm": 0.43197986483573914, + "learning_rate": 1.7724458871102186e-05, + "loss": 0.3067, + "step": 11798 + }, + { + "epoch": 0.21906781076996765, + "grad_norm": 0.38735368847846985, + "learning_rate": 1.772371800297596e-05, + "loss": 0.288, + "step": 11800 + }, + { + "epoch": 0.2191049409073863, + "grad_norm": 0.31600499153137207, + "learning_rate": 1.772297702975307e-05, + "loss": 0.3397, + "step": 11802 + }, + { + "epoch": 0.21914207104480493, + "grad_norm": 0.39824894070625305, + "learning_rate": 1.7722235951443595e-05, + "loss": 0.2036, + "step": 11804 + }, + { + "epoch": 0.2191792011822236, + "grad_norm": 0.33607062697410583, + "learning_rate": 1.7721494768057618e-05, + "loss": 0.2655, + "step": 11806 + }, + { + "epoch": 0.21921633131964222, + "grad_norm": 0.29336264729499817, + "learning_rate": 1.7720753479605226e-05, + "loss": 0.1517, + "step": 11808 + }, + { + "epoch": 0.21925346145706084, + "grad_norm": 0.28137242794036865, + "learning_rate": 1.7720012086096507e-05, + "loss": 0.309, + "step": 11810 + }, + { + "epoch": 0.2192905915944795, + "grad_norm": 0.5039309859275818, + "learning_rate": 1.7719270587541547e-05, + "loss": 0.2358, + "step": 11812 + }, + { + "epoch": 0.21932772173189813, + "grad_norm": 0.2509121596813202, + "learning_rate": 1.7718528983950432e-05, + "loss": 0.2434, + "step": 11814 + }, + { + "epoch": 0.21936485186931676, + "grad_norm": 0.544304370880127, + "learning_rate": 1.7717787275333263e-05, + "loss": 0.2158, + "step": 11816 + }, + { + "epoch": 0.2194019820067354, + "grad_norm": 0.2899669408798218, + "learning_rate": 1.7717045461700126e-05, + "loss": 0.4096, + "step": 11818 + }, + { + "epoch": 0.21943911214415404, + "grad_norm": 0.4426668584346771, + "learning_rate": 1.7716303543061113e-05, + "loss": 0.3397, + "step": 11820 + }, + { + "epoch": 0.2194762422815727, + "grad_norm": 0.36808159947395325, + "learning_rate": 1.7715561519426323e-05, + "loss": 0.3998, + "step": 11822 + }, + { + "epoch": 0.21951337241899133, + "grad_norm": 0.4388499855995178, + "learning_rate": 1.7714819390805854e-05, + "loss": 0.1492, + "step": 11824 + }, + { + "epoch": 0.21955050255640995, + "grad_norm": 0.31198567152023315, + "learning_rate": 1.77140771572098e-05, + "loss": 0.3563, + "step": 11826 + }, + { + "epoch": 0.2195876326938286, + "grad_norm": 0.3879901170730591, + "learning_rate": 1.771333481864826e-05, + "loss": 0.3429, + "step": 11828 + }, + { + "epoch": 0.21962476283124724, + "grad_norm": 0.3279518187046051, + "learning_rate": 1.7712592375131338e-05, + "loss": 0.3222, + "step": 11830 + }, + { + "epoch": 0.21966189296866587, + "grad_norm": 0.2979283332824707, + "learning_rate": 1.7711849826669136e-05, + "loss": 0.482, + "step": 11832 + }, + { + "epoch": 0.21969902310608452, + "grad_norm": 0.2699752151966095, + "learning_rate": 1.7711107173271756e-05, + "loss": 0.266, + "step": 11834 + }, + { + "epoch": 0.21973615324350315, + "grad_norm": 0.34165728092193604, + "learning_rate": 1.7710364414949312e-05, + "loss": 0.2764, + "step": 11836 + }, + { + "epoch": 0.21977328338092178, + "grad_norm": 0.4444302022457123, + "learning_rate": 1.7709621551711896e-05, + "loss": 0.3999, + "step": 11838 + }, + { + "epoch": 0.21981041351834044, + "grad_norm": 0.3308092951774597, + "learning_rate": 1.7708878583569626e-05, + "loss": 0.5882, + "step": 11840 + }, + { + "epoch": 0.21984754365575906, + "grad_norm": 0.36871659755706787, + "learning_rate": 1.770813551053261e-05, + "loss": 0.3896, + "step": 11842 + }, + { + "epoch": 0.21988467379317772, + "grad_norm": 0.3203166723251343, + "learning_rate": 1.7707392332610957e-05, + "loss": 0.3394, + "step": 11844 + }, + { + "epoch": 0.21992180393059635, + "grad_norm": 0.3662722408771515, + "learning_rate": 1.770664904981478e-05, + "loss": 0.4338, + "step": 11846 + }, + { + "epoch": 0.21995893406801498, + "grad_norm": 0.433752179145813, + "learning_rate": 1.7705905662154196e-05, + "loss": 0.2038, + "step": 11848 + }, + { + "epoch": 0.21999606420543363, + "grad_norm": 0.510762095451355, + "learning_rate": 1.7705162169639317e-05, + "loss": 0.3264, + "step": 11850 + }, + { + "epoch": 0.22003319434285226, + "grad_norm": 0.5013280510902405, + "learning_rate": 1.770441857228026e-05, + "loss": 0.2388, + "step": 11852 + }, + { + "epoch": 0.2200703244802709, + "grad_norm": 0.2385890930891037, + "learning_rate": 1.770367487008714e-05, + "loss": 0.3615, + "step": 11854 + }, + { + "epoch": 0.22010745461768955, + "grad_norm": 0.3456043303012848, + "learning_rate": 1.7702931063070084e-05, + "loss": 0.4043, + "step": 11856 + }, + { + "epoch": 0.22014458475510817, + "grad_norm": 0.3471605181694031, + "learning_rate": 1.770218715123921e-05, + "loss": 0.3438, + "step": 11858 + }, + { + "epoch": 0.22018171489252683, + "grad_norm": 0.4312245845794678, + "learning_rate": 1.770144313460464e-05, + "loss": 0.4343, + "step": 11860 + }, + { + "epoch": 0.22021884502994546, + "grad_norm": 1.200728416442871, + "learning_rate": 1.7700699013176494e-05, + "loss": 0.3087, + "step": 11862 + }, + { + "epoch": 0.2202559751673641, + "grad_norm": 0.35962337255477905, + "learning_rate": 1.7699954786964902e-05, + "loss": 0.0891, + "step": 11864 + }, + { + "epoch": 0.22029310530478274, + "grad_norm": 0.34709036350250244, + "learning_rate": 1.769921045597999e-05, + "loss": 0.2664, + "step": 11866 + }, + { + "epoch": 0.22033023544220137, + "grad_norm": 0.3900023400783539, + "learning_rate": 1.7698466020231887e-05, + "loss": 0.2083, + "step": 11868 + }, + { + "epoch": 0.22036736557962, + "grad_norm": 0.33800074458122253, + "learning_rate": 1.769772147973072e-05, + "loss": 0.5163, + "step": 11870 + }, + { + "epoch": 0.22040449571703866, + "grad_norm": 0.33367934823036194, + "learning_rate": 1.769697683448662e-05, + "loss": 0.5203, + "step": 11872 + }, + { + "epoch": 0.22044162585445728, + "grad_norm": 0.3235739767551422, + "learning_rate": 1.769623208450972e-05, + "loss": 0.2921, + "step": 11874 + }, + { + "epoch": 0.2204787559918759, + "grad_norm": 0.410702109336853, + "learning_rate": 1.7695487229810157e-05, + "loss": 0.2929, + "step": 11876 + }, + { + "epoch": 0.22051588612929457, + "grad_norm": 0.477309912443161, + "learning_rate": 1.769474227039806e-05, + "loss": 0.306, + "step": 11878 + }, + { + "epoch": 0.2205530162667132, + "grad_norm": 0.33906829357147217, + "learning_rate": 1.769399720628357e-05, + "loss": 0.3882, + "step": 11880 + }, + { + "epoch": 0.22059014640413185, + "grad_norm": 0.4200831651687622, + "learning_rate": 1.7693252037476828e-05, + "loss": 0.4112, + "step": 11882 + }, + { + "epoch": 0.22062727654155048, + "grad_norm": 0.2531754970550537, + "learning_rate": 1.7692506763987967e-05, + "loss": 0.3926, + "step": 11884 + }, + { + "epoch": 0.2206644066789691, + "grad_norm": 0.31104719638824463, + "learning_rate": 1.7691761385827128e-05, + "loss": 0.3234, + "step": 11886 + }, + { + "epoch": 0.22070153681638777, + "grad_norm": 0.3835548162460327, + "learning_rate": 1.769101590300446e-05, + "loss": 0.2685, + "step": 11888 + }, + { + "epoch": 0.2207386669538064, + "grad_norm": 0.3172190487384796, + "learning_rate": 1.76902703155301e-05, + "loss": 0.2448, + "step": 11890 + }, + { + "epoch": 0.22077579709122502, + "grad_norm": 0.5205923914909363, + "learning_rate": 1.7689524623414196e-05, + "loss": 0.5127, + "step": 11892 + }, + { + "epoch": 0.22081292722864368, + "grad_norm": 0.34658876061439514, + "learning_rate": 1.7688778826666896e-05, + "loss": 0.2965, + "step": 11894 + }, + { + "epoch": 0.2208500573660623, + "grad_norm": 0.4719263017177582, + "learning_rate": 1.768803292529835e-05, + "loss": 0.4143, + "step": 11896 + }, + { + "epoch": 0.22088718750348096, + "grad_norm": 0.31216877698898315, + "learning_rate": 1.76872869193187e-05, + "loss": 0.3368, + "step": 11898 + }, + { + "epoch": 0.2209243176408996, + "grad_norm": 0.32623419165611267, + "learning_rate": 1.7686540808738103e-05, + "loss": 0.3787, + "step": 11900 + }, + { + "epoch": 0.22096144777831822, + "grad_norm": 0.5687903165817261, + "learning_rate": 1.7685794593566706e-05, + "loss": 0.3338, + "step": 11902 + }, + { + "epoch": 0.22099857791573688, + "grad_norm": 0.34359288215637207, + "learning_rate": 1.7685048273814668e-05, + "loss": 0.3974, + "step": 11904 + }, + { + "epoch": 0.2210357080531555, + "grad_norm": 0.5070930123329163, + "learning_rate": 1.7684301849492144e-05, + "loss": 0.3566, + "step": 11906 + }, + { + "epoch": 0.22107283819057413, + "grad_norm": 0.3738914728164673, + "learning_rate": 1.7683555320609287e-05, + "loss": 0.4272, + "step": 11908 + }, + { + "epoch": 0.2211099683279928, + "grad_norm": 0.5204195976257324, + "learning_rate": 1.768280868717626e-05, + "loss": 0.3264, + "step": 11910 + }, + { + "epoch": 0.22114709846541142, + "grad_norm": 0.35910990834236145, + "learning_rate": 1.7682061949203215e-05, + "loss": 0.2156, + "step": 11912 + }, + { + "epoch": 0.22118422860283005, + "grad_norm": 0.8209647536277771, + "learning_rate": 1.768131510670032e-05, + "loss": 0.5776, + "step": 11914 + }, + { + "epoch": 0.2212213587402487, + "grad_norm": 0.32603925466537476, + "learning_rate": 1.7680568159677736e-05, + "loss": 0.1758, + "step": 11916 + }, + { + "epoch": 0.22125848887766733, + "grad_norm": 0.6392351984977722, + "learning_rate": 1.767982110814562e-05, + "loss": 0.4334, + "step": 11918 + }, + { + "epoch": 0.221295619015086, + "grad_norm": 0.4216277003288269, + "learning_rate": 1.7679073952114148e-05, + "loss": 0.2834, + "step": 11920 + }, + { + "epoch": 0.22133274915250462, + "grad_norm": 0.3311893045902252, + "learning_rate": 1.7678326691593478e-05, + "loss": 0.2945, + "step": 11922 + }, + { + "epoch": 0.22136987928992324, + "grad_norm": 0.40621522068977356, + "learning_rate": 1.7677579326593784e-05, + "loss": 0.3203, + "step": 11924 + }, + { + "epoch": 0.2214070094273419, + "grad_norm": 0.4474644958972931, + "learning_rate": 1.767683185712523e-05, + "loss": 0.2587, + "step": 11926 + }, + { + "epoch": 0.22144413956476053, + "grad_norm": 0.3213600516319275, + "learning_rate": 1.767608428319799e-05, + "loss": 0.4003, + "step": 11928 + }, + { + "epoch": 0.22148126970217916, + "grad_norm": 0.5836672186851501, + "learning_rate": 1.7675336604822235e-05, + "loss": 0.154, + "step": 11930 + }, + { + "epoch": 0.2215183998395978, + "grad_norm": 0.3745591938495636, + "learning_rate": 1.7674588822008137e-05, + "loss": 0.2864, + "step": 11932 + }, + { + "epoch": 0.22155552997701644, + "grad_norm": 0.43277275562286377, + "learning_rate": 1.767384093476588e-05, + "loss": 0.2934, + "step": 11934 + }, + { + "epoch": 0.2215926601144351, + "grad_norm": 0.38131460547447205, + "learning_rate": 1.7673092943105626e-05, + "loss": 0.2992, + "step": 11936 + }, + { + "epoch": 0.22162979025185373, + "grad_norm": 0.37792280316352844, + "learning_rate": 1.7672344847037562e-05, + "loss": 0.4105, + "step": 11938 + }, + { + "epoch": 0.22166692038927235, + "grad_norm": 0.3829842507839203, + "learning_rate": 1.7671596646571868e-05, + "loss": 0.4318, + "step": 11940 + }, + { + "epoch": 0.221704050526691, + "grad_norm": 0.465193510055542, + "learning_rate": 1.7670848341718724e-05, + "loss": 0.3345, + "step": 11942 + }, + { + "epoch": 0.22174118066410964, + "grad_norm": 0.6588819622993469, + "learning_rate": 1.767009993248831e-05, + "loss": 0.3363, + "step": 11944 + }, + { + "epoch": 0.22177831080152827, + "grad_norm": 0.5793259739875793, + "learning_rate": 1.7669351418890806e-05, + "loss": 0.2801, + "step": 11946 + }, + { + "epoch": 0.22181544093894692, + "grad_norm": 0.3190169334411621, + "learning_rate": 1.766860280093641e-05, + "loss": 0.3801, + "step": 11948 + }, + { + "epoch": 0.22185257107636555, + "grad_norm": 0.46417924761772156, + "learning_rate": 1.7667854078635295e-05, + "loss": 0.3216, + "step": 11950 + }, + { + "epoch": 0.22188970121378418, + "grad_norm": 0.3605559170246124, + "learning_rate": 1.7667105251997654e-05, + "loss": 0.473, + "step": 11952 + }, + { + "epoch": 0.22192683135120284, + "grad_norm": 0.2942885756492615, + "learning_rate": 1.7666356321033677e-05, + "loss": 0.4502, + "step": 11954 + }, + { + "epoch": 0.22196396148862146, + "grad_norm": 0.3319733142852783, + "learning_rate": 1.766560728575355e-05, + "loss": 0.2812, + "step": 11956 + }, + { + "epoch": 0.22200109162604012, + "grad_norm": 0.32812047004699707, + "learning_rate": 1.7664858146167474e-05, + "loss": 0.3876, + "step": 11958 + }, + { + "epoch": 0.22203822176345875, + "grad_norm": 0.22675660252571106, + "learning_rate": 1.7664108902285636e-05, + "loss": 0.0913, + "step": 11960 + }, + { + "epoch": 0.22207535190087738, + "grad_norm": 0.3907155990600586, + "learning_rate": 1.7663359554118233e-05, + "loss": 0.2604, + "step": 11962 + }, + { + "epoch": 0.22211248203829603, + "grad_norm": 0.321621298789978, + "learning_rate": 1.766261010167546e-05, + "loss": 0.1823, + "step": 11964 + }, + { + "epoch": 0.22214961217571466, + "grad_norm": 0.33325880765914917, + "learning_rate": 1.7661860544967515e-05, + "loss": 0.2361, + "step": 11966 + }, + { + "epoch": 0.2221867423131333, + "grad_norm": 0.285675972700119, + "learning_rate": 1.76611108840046e-05, + "loss": 0.3636, + "step": 11968 + }, + { + "epoch": 0.22222387245055195, + "grad_norm": 0.3085360527038574, + "learning_rate": 1.766036111879691e-05, + "loss": 0.2841, + "step": 11970 + }, + { + "epoch": 0.22226100258797057, + "grad_norm": 0.3567916750907898, + "learning_rate": 1.7659611249354654e-05, + "loss": 0.2461, + "step": 11972 + }, + { + "epoch": 0.22229813272538923, + "grad_norm": 0.373532235622406, + "learning_rate": 1.765886127568803e-05, + "loss": 0.3636, + "step": 11974 + }, + { + "epoch": 0.22233526286280786, + "grad_norm": 0.6448768973350525, + "learning_rate": 1.7658111197807245e-05, + "loss": 0.327, + "step": 11976 + }, + { + "epoch": 0.2223723930002265, + "grad_norm": 0.4483274221420288, + "learning_rate": 1.765736101572251e-05, + "loss": 0.3557, + "step": 11978 + }, + { + "epoch": 0.22240952313764514, + "grad_norm": 0.327824205160141, + "learning_rate": 1.7656610729444022e-05, + "loss": 0.4091, + "step": 11980 + }, + { + "epoch": 0.22244665327506377, + "grad_norm": 0.3396904170513153, + "learning_rate": 1.7655860338981998e-05, + "loss": 0.2439, + "step": 11982 + }, + { + "epoch": 0.2224837834124824, + "grad_norm": 0.30622032284736633, + "learning_rate": 1.7655109844346648e-05, + "loss": 0.4751, + "step": 11984 + }, + { + "epoch": 0.22252091354990106, + "grad_norm": 0.42210638523101807, + "learning_rate": 1.7654359245548183e-05, + "loss": 0.3772, + "step": 11986 + }, + { + "epoch": 0.22255804368731968, + "grad_norm": 0.2235272228717804, + "learning_rate": 1.7653608542596812e-05, + "loss": 0.3202, + "step": 11988 + }, + { + "epoch": 0.2225951738247383, + "grad_norm": 0.6257721781730652, + "learning_rate": 1.765285773550276e-05, + "loss": 0.4758, + "step": 11990 + }, + { + "epoch": 0.22263230396215697, + "grad_norm": 0.3404028117656708, + "learning_rate": 1.765210682427623e-05, + "loss": 0.4208, + "step": 11992 + }, + { + "epoch": 0.2226694340995756, + "grad_norm": 0.4042735695838928, + "learning_rate": 1.7651355808927454e-05, + "loss": 0.3651, + "step": 11994 + }, + { + "epoch": 0.22270656423699425, + "grad_norm": 0.39030569791793823, + "learning_rate": 1.7650604689466643e-05, + "loss": 0.3205, + "step": 11996 + }, + { + "epoch": 0.22274369437441288, + "grad_norm": 0.5382828712463379, + "learning_rate": 1.7649853465904015e-05, + "loss": 0.2219, + "step": 11998 + }, + { + "epoch": 0.2227808245118315, + "grad_norm": 0.3254512846469879, + "learning_rate": 1.7649102138249796e-05, + "loss": 0.2656, + "step": 12000 + }, + { + "epoch": 0.22281795464925017, + "grad_norm": 0.4989635646343231, + "learning_rate": 1.764835070651421e-05, + "loss": 0.1574, + "step": 12002 + }, + { + "epoch": 0.2228550847866688, + "grad_norm": 0.4164736866950989, + "learning_rate": 1.764759917070748e-05, + "loss": 0.5466, + "step": 12004 + }, + { + "epoch": 0.22289221492408742, + "grad_norm": 0.26724839210510254, + "learning_rate": 1.764684753083983e-05, + "loss": 0.3191, + "step": 12006 + }, + { + "epoch": 0.22292934506150608, + "grad_norm": 0.330612450838089, + "learning_rate": 1.7646095786921492e-05, + "loss": 0.4574, + "step": 12008 + }, + { + "epoch": 0.2229664751989247, + "grad_norm": 0.4043966829776764, + "learning_rate": 1.7645343938962693e-05, + "loss": 0.3978, + "step": 12010 + }, + { + "epoch": 0.22300360533634336, + "grad_norm": 0.3628135025501251, + "learning_rate": 1.764459198697367e-05, + "loss": 0.2679, + "step": 12012 + }, + { + "epoch": 0.223040735473762, + "grad_norm": 0.2798672020435333, + "learning_rate": 1.7643839930964638e-05, + "loss": 0.2558, + "step": 12014 + }, + { + "epoch": 0.22307786561118062, + "grad_norm": 0.34981054067611694, + "learning_rate": 1.7643087770945843e-05, + "loss": 0.3094, + "step": 12016 + }, + { + "epoch": 0.22311499574859928, + "grad_norm": 0.2877528965473175, + "learning_rate": 1.764233550692752e-05, + "loss": 0.3695, + "step": 12018 + }, + { + "epoch": 0.2231521258860179, + "grad_norm": 0.3457721173763275, + "learning_rate": 1.76415831389199e-05, + "loss": 0.4331, + "step": 12020 + }, + { + "epoch": 0.22318925602343653, + "grad_norm": 0.34324902296066284, + "learning_rate": 1.7640830666933225e-05, + "loss": 0.5258, + "step": 12022 + }, + { + "epoch": 0.2232263861608552, + "grad_norm": 0.5169813632965088, + "learning_rate": 1.764007809097773e-05, + "loss": 0.2426, + "step": 12024 + }, + { + "epoch": 0.22326351629827382, + "grad_norm": 0.33846956491470337, + "learning_rate": 1.7639325411063657e-05, + "loss": 0.4613, + "step": 12026 + }, + { + "epoch": 0.22330064643569245, + "grad_norm": 0.2937115728855133, + "learning_rate": 1.7638572627201247e-05, + "loss": 0.4964, + "step": 12028 + }, + { + "epoch": 0.2233377765731111, + "grad_norm": 0.6198121905326843, + "learning_rate": 1.7637819739400743e-05, + "loss": 0.2469, + "step": 12030 + }, + { + "epoch": 0.22337490671052973, + "grad_norm": 0.2619032859802246, + "learning_rate": 1.7637066747672392e-05, + "loss": 0.4192, + "step": 12032 + }, + { + "epoch": 0.2234120368479484, + "grad_norm": 0.39697787165641785, + "learning_rate": 1.763631365202644e-05, + "loss": 0.3014, + "step": 12034 + }, + { + "epoch": 0.22344916698536701, + "grad_norm": 0.24017538130283356, + "learning_rate": 1.7635560452473127e-05, + "loss": 0.2293, + "step": 12036 + }, + { + "epoch": 0.22348629712278564, + "grad_norm": 0.36495137214660645, + "learning_rate": 1.7634807149022713e-05, + "loss": 0.5313, + "step": 12038 + }, + { + "epoch": 0.2235234272602043, + "grad_norm": 0.3485358953475952, + "learning_rate": 1.763405374168544e-05, + "loss": 0.1923, + "step": 12040 + }, + { + "epoch": 0.22356055739762293, + "grad_norm": 0.3236474096775055, + "learning_rate": 1.7633300230471562e-05, + "loss": 0.3568, + "step": 12042 + }, + { + "epoch": 0.22359768753504156, + "grad_norm": 0.2673585116863251, + "learning_rate": 1.7632546615391334e-05, + "loss": 0.519, + "step": 12044 + }, + { + "epoch": 0.2236348176724602, + "grad_norm": 0.3688596487045288, + "learning_rate": 1.7631792896455007e-05, + "loss": 0.1387, + "step": 12046 + }, + { + "epoch": 0.22367194780987884, + "grad_norm": 0.28437888622283936, + "learning_rate": 1.7631039073672836e-05, + "loss": 0.3222, + "step": 12048 + }, + { + "epoch": 0.2237090779472975, + "grad_norm": 0.3170979619026184, + "learning_rate": 1.7630285147055083e-05, + "loss": 0.4551, + "step": 12050 + }, + { + "epoch": 0.22374620808471612, + "grad_norm": 0.3250986635684967, + "learning_rate": 1.7629531116612008e-05, + "loss": 0.3673, + "step": 12052 + }, + { + "epoch": 0.22378333822213475, + "grad_norm": 0.34939953684806824, + "learning_rate": 1.7628776982353864e-05, + "loss": 0.4111, + "step": 12054 + }, + { + "epoch": 0.2238204683595534, + "grad_norm": 0.38850826025009155, + "learning_rate": 1.762802274429092e-05, + "loss": 0.3636, + "step": 12056 + }, + { + "epoch": 0.22385759849697204, + "grad_norm": 0.22319717705249786, + "learning_rate": 1.762726840243343e-05, + "loss": 0.296, + "step": 12058 + }, + { + "epoch": 0.22389472863439067, + "grad_norm": 0.4979951083660126, + "learning_rate": 1.7626513956791663e-05, + "loss": 0.2192, + "step": 12060 + }, + { + "epoch": 0.22393185877180932, + "grad_norm": 0.2625686526298523, + "learning_rate": 1.762575940737589e-05, + "loss": 0.4437, + "step": 12062 + }, + { + "epoch": 0.22396898890922795, + "grad_norm": 0.41948339343070984, + "learning_rate": 1.762500475419637e-05, + "loss": 0.4163, + "step": 12064 + }, + { + "epoch": 0.22400611904664658, + "grad_norm": 0.3573482930660248, + "learning_rate": 1.7624249997263375e-05, + "loss": 0.2905, + "step": 12066 + }, + { + "epoch": 0.22404324918406524, + "grad_norm": 0.3806382119655609, + "learning_rate": 1.7623495136587174e-05, + "loss": 0.2995, + "step": 12068 + }, + { + "epoch": 0.22408037932148386, + "grad_norm": 0.2932985723018646, + "learning_rate": 1.7622740172178043e-05, + "loss": 0.2907, + "step": 12070 + }, + { + "epoch": 0.22411750945890252, + "grad_norm": 0.5836262702941895, + "learning_rate": 1.7621985104046245e-05, + "loss": 0.3332, + "step": 12072 + }, + { + "epoch": 0.22415463959632115, + "grad_norm": 0.3987974226474762, + "learning_rate": 1.7621229932202065e-05, + "loss": 0.3599, + "step": 12074 + }, + { + "epoch": 0.22419176973373978, + "grad_norm": 0.3861166536808014, + "learning_rate": 1.7620474656655774e-05, + "loss": 0.4077, + "step": 12076 + }, + { + "epoch": 0.22422889987115843, + "grad_norm": 0.4544990658760071, + "learning_rate": 1.7619719277417648e-05, + "loss": 0.2665, + "step": 12078 + }, + { + "epoch": 0.22426603000857706, + "grad_norm": 0.4183366596698761, + "learning_rate": 1.7618963794497966e-05, + "loss": 0.2245, + "step": 12080 + }, + { + "epoch": 0.2243031601459957, + "grad_norm": 0.4875152111053467, + "learning_rate": 1.7618208207907007e-05, + "loss": 0.4212, + "step": 12082 + }, + { + "epoch": 0.22434029028341435, + "grad_norm": 0.32507941126823425, + "learning_rate": 1.7617452517655055e-05, + "loss": 0.2196, + "step": 12084 + }, + { + "epoch": 0.22437742042083297, + "grad_norm": 0.5160993337631226, + "learning_rate": 1.761669672375239e-05, + "loss": 0.2918, + "step": 12086 + }, + { + "epoch": 0.22441455055825163, + "grad_norm": 0.30765220522880554, + "learning_rate": 1.7615940826209297e-05, + "loss": 0.3645, + "step": 12088 + }, + { + "epoch": 0.22445168069567026, + "grad_norm": 0.30236923694610596, + "learning_rate": 1.7615184825036064e-05, + "loss": 0.2412, + "step": 12090 + }, + { + "epoch": 0.2244888108330889, + "grad_norm": 0.4530021548271179, + "learning_rate": 1.7614428720242976e-05, + "loss": 0.3649, + "step": 12092 + }, + { + "epoch": 0.22452594097050754, + "grad_norm": 0.3599660098552704, + "learning_rate": 1.761367251184032e-05, + "loss": 0.3872, + "step": 12094 + }, + { + "epoch": 0.22456307110792617, + "grad_norm": 0.4225991368293762, + "learning_rate": 1.7612916199838385e-05, + "loss": 0.2793, + "step": 12096 + }, + { + "epoch": 0.2246002012453448, + "grad_norm": 0.28973588347435, + "learning_rate": 1.7612159784247462e-05, + "loss": 0.4806, + "step": 12098 + }, + { + "epoch": 0.22463733138276346, + "grad_norm": 0.47195345163345337, + "learning_rate": 1.761140326507785e-05, + "loss": 0.3048, + "step": 12100 + }, + { + "epoch": 0.22467446152018208, + "grad_norm": 0.31633633375167847, + "learning_rate": 1.7610646642339836e-05, + "loss": 0.3259, + "step": 12102 + }, + { + "epoch": 0.2247115916576007, + "grad_norm": 0.25203317403793335, + "learning_rate": 1.760988991604372e-05, + "loss": 0.3407, + "step": 12104 + }, + { + "epoch": 0.22474872179501937, + "grad_norm": 0.32096803188323975, + "learning_rate": 1.7609133086199796e-05, + "loss": 0.407, + "step": 12106 + }, + { + "epoch": 0.224785851932438, + "grad_norm": 0.3544452488422394, + "learning_rate": 1.7608376152818357e-05, + "loss": 0.3093, + "step": 12108 + }, + { + "epoch": 0.22482298206985665, + "grad_norm": 0.5119296312332153, + "learning_rate": 1.7607619115909712e-05, + "loss": 0.5716, + "step": 12110 + }, + { + "epoch": 0.22486011220727528, + "grad_norm": 0.34867793321609497, + "learning_rate": 1.760686197548416e-05, + "loss": 0.3431, + "step": 12112 + }, + { + "epoch": 0.2248972423446939, + "grad_norm": 0.3729167580604553, + "learning_rate": 1.7606104731552004e-05, + "loss": 0.1598, + "step": 12114 + }, + { + "epoch": 0.22493437248211257, + "grad_norm": 0.32883015275001526, + "learning_rate": 1.760534738412354e-05, + "loss": 0.1359, + "step": 12116 + }, + { + "epoch": 0.2249715026195312, + "grad_norm": 0.6238548755645752, + "learning_rate": 1.760458993320908e-05, + "loss": 0.4753, + "step": 12118 + }, + { + "epoch": 0.22500863275694982, + "grad_norm": 0.4662645757198334, + "learning_rate": 1.760383237881893e-05, + "loss": 0.274, + "step": 12120 + }, + { + "epoch": 0.22504576289436848, + "grad_norm": 0.3792097270488739, + "learning_rate": 1.76030747209634e-05, + "loss": 0.1781, + "step": 12122 + }, + { + "epoch": 0.2250828930317871, + "grad_norm": 0.34540051221847534, + "learning_rate": 1.7602316959652794e-05, + "loss": 0.3906, + "step": 12124 + }, + { + "epoch": 0.22512002316920576, + "grad_norm": 0.3337765634059906, + "learning_rate": 1.7601559094897427e-05, + "loss": 0.3008, + "step": 12126 + }, + { + "epoch": 0.2251571533066244, + "grad_norm": 0.2720305621623993, + "learning_rate": 1.7600801126707613e-05, + "loss": 0.2594, + "step": 12128 + }, + { + "epoch": 0.22519428344404302, + "grad_norm": 0.4514831006526947, + "learning_rate": 1.7600043055093654e-05, + "loss": 0.237, + "step": 12130 + }, + { + "epoch": 0.22523141358146168, + "grad_norm": 0.293328195810318, + "learning_rate": 1.7599284880065883e-05, + "loss": 0.3785, + "step": 12132 + }, + { + "epoch": 0.2252685437188803, + "grad_norm": 0.40314000844955444, + "learning_rate": 1.75985266016346e-05, + "loss": 0.2489, + "step": 12134 + }, + { + "epoch": 0.22530567385629893, + "grad_norm": 0.23729155957698822, + "learning_rate": 1.7597768219810134e-05, + "loss": 0.4259, + "step": 12136 + }, + { + "epoch": 0.2253428039937176, + "grad_norm": 0.41475462913513184, + "learning_rate": 1.75970097346028e-05, + "loss": 0.3213, + "step": 12138 + }, + { + "epoch": 0.22537993413113622, + "grad_norm": 0.5343440175056458, + "learning_rate": 1.759625114602292e-05, + "loss": 0.3509, + "step": 12140 + }, + { + "epoch": 0.22541706426855485, + "grad_norm": 0.3142673075199127, + "learning_rate": 1.7595492454080813e-05, + "loss": 0.1057, + "step": 12142 + }, + { + "epoch": 0.2254541944059735, + "grad_norm": 0.36299678683280945, + "learning_rate": 1.7594733658786807e-05, + "loss": 0.2515, + "step": 12144 + }, + { + "epoch": 0.22549132454339213, + "grad_norm": 0.3457544445991516, + "learning_rate": 1.7593974760151223e-05, + "loss": 0.3595, + "step": 12146 + }, + { + "epoch": 0.22552845468081079, + "grad_norm": 0.31571483612060547, + "learning_rate": 1.7593215758184392e-05, + "loss": 0.3146, + "step": 12148 + }, + { + "epoch": 0.22556558481822941, + "grad_norm": 0.38800618052482605, + "learning_rate": 1.7592456652896636e-05, + "loss": 0.25, + "step": 12150 + }, + { + "epoch": 0.22560271495564804, + "grad_norm": 0.31788524985313416, + "learning_rate": 1.759169744429829e-05, + "loss": 0.309, + "step": 12152 + }, + { + "epoch": 0.2256398450930667, + "grad_norm": 0.35473138093948364, + "learning_rate": 1.7590938132399678e-05, + "loss": 0.4165, + "step": 12154 + }, + { + "epoch": 0.22567697523048533, + "grad_norm": 0.2991270124912262, + "learning_rate": 1.7590178717211138e-05, + "loss": 0.2397, + "step": 12156 + }, + { + "epoch": 0.22571410536790396, + "grad_norm": 0.38730674982070923, + "learning_rate": 1.7589419198743e-05, + "loss": 0.4355, + "step": 12158 + }, + { + "epoch": 0.2257512355053226, + "grad_norm": 0.270922988653183, + "learning_rate": 1.75886595770056e-05, + "loss": 0.3376, + "step": 12160 + }, + { + "epoch": 0.22578836564274124, + "grad_norm": 0.4049718379974365, + "learning_rate": 1.758789985200927e-05, + "loss": 0.2612, + "step": 12162 + }, + { + "epoch": 0.2258254957801599, + "grad_norm": 0.3952605426311493, + "learning_rate": 1.7587140023764355e-05, + "loss": 0.3824, + "step": 12164 + }, + { + "epoch": 0.22586262591757852, + "grad_norm": 0.34663283824920654, + "learning_rate": 1.7586380092281194e-05, + "loss": 0.4979, + "step": 12166 + }, + { + "epoch": 0.22589975605499715, + "grad_norm": 0.3986808657646179, + "learning_rate": 1.758562005757012e-05, + "loss": 0.3647, + "step": 12168 + }, + { + "epoch": 0.2259368861924158, + "grad_norm": 0.5012375712394714, + "learning_rate": 1.758485991964148e-05, + "loss": 0.3665, + "step": 12170 + }, + { + "epoch": 0.22597401632983444, + "grad_norm": 0.27836892008781433, + "learning_rate": 1.758409967850561e-05, + "loss": 0.3261, + "step": 12172 + }, + { + "epoch": 0.22601114646725307, + "grad_norm": 0.7115713953971863, + "learning_rate": 1.7583339334172866e-05, + "loss": 0.3473, + "step": 12174 + }, + { + "epoch": 0.22604827660467172, + "grad_norm": 0.3703259527683258, + "learning_rate": 1.7582578886653587e-05, + "loss": 0.2575, + "step": 12176 + }, + { + "epoch": 0.22608540674209035, + "grad_norm": 0.46376755833625793, + "learning_rate": 1.758181833595812e-05, + "loss": 0.2019, + "step": 12178 + }, + { + "epoch": 0.22612253687950898, + "grad_norm": 0.40719014406204224, + "learning_rate": 1.7581057682096817e-05, + "loss": 0.2252, + "step": 12180 + }, + { + "epoch": 0.22615966701692763, + "grad_norm": 0.5799514055252075, + "learning_rate": 1.758029692508003e-05, + "loss": 0.3436, + "step": 12182 + }, + { + "epoch": 0.22619679715434626, + "grad_norm": 0.2772023677825928, + "learning_rate": 1.7579536064918104e-05, + "loss": 0.2202, + "step": 12184 + }, + { + "epoch": 0.22623392729176492, + "grad_norm": 0.2913564443588257, + "learning_rate": 1.7578775101621396e-05, + "loss": 0.2913, + "step": 12186 + }, + { + "epoch": 0.22627105742918355, + "grad_norm": 0.31022971868515015, + "learning_rate": 1.7578014035200262e-05, + "loss": 0.2204, + "step": 12188 + }, + { + "epoch": 0.22630818756660218, + "grad_norm": 0.4769797623157501, + "learning_rate": 1.757725286566505e-05, + "loss": 0.2824, + "step": 12190 + }, + { + "epoch": 0.22634531770402083, + "grad_norm": 0.3974776268005371, + "learning_rate": 1.757649159302613e-05, + "loss": 0.2972, + "step": 12192 + }, + { + "epoch": 0.22638244784143946, + "grad_norm": 0.31579384207725525, + "learning_rate": 1.757573021729385e-05, + "loss": 0.4782, + "step": 12194 + }, + { + "epoch": 0.2264195779788581, + "grad_norm": 0.41150814294815063, + "learning_rate": 1.7574968738478576e-05, + "loss": 0.2063, + "step": 12196 + }, + { + "epoch": 0.22645670811627674, + "grad_norm": 0.37733304500579834, + "learning_rate": 1.7574207156590667e-05, + "loss": 0.4448, + "step": 12198 + }, + { + "epoch": 0.22649383825369537, + "grad_norm": 0.3141343891620636, + "learning_rate": 1.757344547164048e-05, + "loss": 0.2598, + "step": 12200 + }, + { + "epoch": 0.22653096839111403, + "grad_norm": 0.6399528384208679, + "learning_rate": 1.7572683683638393e-05, + "loss": 0.4158, + "step": 12202 + }, + { + "epoch": 0.22656809852853266, + "grad_norm": 0.29067307710647583, + "learning_rate": 1.757192179259476e-05, + "loss": 0.1872, + "step": 12204 + }, + { + "epoch": 0.22660522866595129, + "grad_norm": 0.4155784547328949, + "learning_rate": 1.757115979851995e-05, + "loss": 0.4163, + "step": 12206 + }, + { + "epoch": 0.22664235880336994, + "grad_norm": 0.34444814920425415, + "learning_rate": 1.7570397701424337e-05, + "loss": 0.3538, + "step": 12208 + }, + { + "epoch": 0.22667948894078857, + "grad_norm": 0.3122224807739258, + "learning_rate": 1.7569635501318287e-05, + "loss": 0.2395, + "step": 12210 + }, + { + "epoch": 0.2267166190782072, + "grad_norm": 0.3670322895050049, + "learning_rate": 1.756887319821217e-05, + "loss": 0.2709, + "step": 12212 + }, + { + "epoch": 0.22675374921562585, + "grad_norm": 0.3097686469554901, + "learning_rate": 1.7568110792116362e-05, + "loss": 0.351, + "step": 12214 + }, + { + "epoch": 0.22679087935304448, + "grad_norm": 0.4320831298828125, + "learning_rate": 1.756734828304123e-05, + "loss": 0.4112, + "step": 12216 + }, + { + "epoch": 0.2268280094904631, + "grad_norm": 0.4121559262275696, + "learning_rate": 1.7566585670997164e-05, + "loss": 0.3443, + "step": 12218 + }, + { + "epoch": 0.22686513962788177, + "grad_norm": 0.40930017828941345, + "learning_rate": 1.7565822955994524e-05, + "loss": 0.3014, + "step": 12220 + }, + { + "epoch": 0.2269022697653004, + "grad_norm": 0.4146338701248169, + "learning_rate": 1.7565060138043697e-05, + "loss": 0.2149, + "step": 12222 + }, + { + "epoch": 0.22693939990271905, + "grad_norm": 0.3705211877822876, + "learning_rate": 1.7564297217155066e-05, + "loss": 0.2736, + "step": 12224 + }, + { + "epoch": 0.22697653004013768, + "grad_norm": 0.3298647403717041, + "learning_rate": 1.7563534193339002e-05, + "loss": 0.3522, + "step": 12226 + }, + { + "epoch": 0.2270136601775563, + "grad_norm": 0.3647691309452057, + "learning_rate": 1.75627710666059e-05, + "loss": 0.3509, + "step": 12228 + }, + { + "epoch": 0.22705079031497497, + "grad_norm": 0.37193411588668823, + "learning_rate": 1.7562007836966128e-05, + "loss": 0.4075, + "step": 12230 + }, + { + "epoch": 0.2270879204523936, + "grad_norm": 0.36932727694511414, + "learning_rate": 1.7561244504430083e-05, + "loss": 0.5432, + "step": 12232 + }, + { + "epoch": 0.22712505058981222, + "grad_norm": 0.3440976142883301, + "learning_rate": 1.7560481069008154e-05, + "loss": 0.3834, + "step": 12234 + }, + { + "epoch": 0.22716218072723088, + "grad_norm": 0.36136186122894287, + "learning_rate": 1.7559717530710716e-05, + "loss": 0.3166, + "step": 12236 + }, + { + "epoch": 0.2271993108646495, + "grad_norm": 0.5251002907752991, + "learning_rate": 1.755895388954817e-05, + "loss": 0.3668, + "step": 12238 + }, + { + "epoch": 0.22723644100206816, + "grad_norm": 0.4312556982040405, + "learning_rate": 1.7558190145530906e-05, + "loss": 0.3254, + "step": 12240 + }, + { + "epoch": 0.2272735711394868, + "grad_norm": 0.3840956687927246, + "learning_rate": 1.755742629866931e-05, + "loss": 0.5117, + "step": 12242 + }, + { + "epoch": 0.22731070127690542, + "grad_norm": 0.43443334102630615, + "learning_rate": 1.755666234897378e-05, + "loss": 0.2533, + "step": 12244 + }, + { + "epoch": 0.22734783141432408, + "grad_norm": 0.3611725866794586, + "learning_rate": 1.755589829645471e-05, + "loss": 0.3495, + "step": 12246 + }, + { + "epoch": 0.2273849615517427, + "grad_norm": 0.34282463788986206, + "learning_rate": 1.75551341411225e-05, + "loss": 0.3618, + "step": 12248 + }, + { + "epoch": 0.22742209168916133, + "grad_norm": 0.3647899925708771, + "learning_rate": 1.7554369882987542e-05, + "loss": 0.4186, + "step": 12250 + }, + { + "epoch": 0.22745922182658, + "grad_norm": 0.6680319309234619, + "learning_rate": 1.7553605522060237e-05, + "loss": 0.3006, + "step": 12252 + }, + { + "epoch": 0.22749635196399862, + "grad_norm": 0.3172919750213623, + "learning_rate": 1.7552841058350986e-05, + "loss": 0.3028, + "step": 12254 + }, + { + "epoch": 0.22753348210141724, + "grad_norm": 0.28211918473243713, + "learning_rate": 1.7552076491870195e-05, + "loss": 0.3686, + "step": 12256 + }, + { + "epoch": 0.2275706122388359, + "grad_norm": 0.39960405230522156, + "learning_rate": 1.7551311822628264e-05, + "loss": 0.3279, + "step": 12258 + }, + { + "epoch": 0.22760774237625453, + "grad_norm": 0.324870228767395, + "learning_rate": 1.7550547050635595e-05, + "loss": 0.4096, + "step": 12260 + }, + { + "epoch": 0.22764487251367319, + "grad_norm": 0.3878096342086792, + "learning_rate": 1.7549782175902597e-05, + "loss": 0.0775, + "step": 12262 + }, + { + "epoch": 0.2276820026510918, + "grad_norm": 0.42712169885635376, + "learning_rate": 1.754901719843968e-05, + "loss": 0.2589, + "step": 12264 + }, + { + "epoch": 0.22771913278851044, + "grad_norm": 0.3460284471511841, + "learning_rate": 1.7548252118257246e-05, + "loss": 0.219, + "step": 12266 + }, + { + "epoch": 0.2277562629259291, + "grad_norm": 0.321351021528244, + "learning_rate": 1.7547486935365716e-05, + "loss": 0.2219, + "step": 12268 + }, + { + "epoch": 0.22779339306334773, + "grad_norm": 0.443685919046402, + "learning_rate": 1.7546721649775494e-05, + "loss": 0.4945, + "step": 12270 + }, + { + "epoch": 0.22783052320076635, + "grad_norm": 0.3065042495727539, + "learning_rate": 1.7545956261496995e-05, + "loss": 0.2671, + "step": 12272 + }, + { + "epoch": 0.227867653338185, + "grad_norm": 0.3917382061481476, + "learning_rate": 1.7545190770540633e-05, + "loss": 0.3305, + "step": 12274 + }, + { + "epoch": 0.22790478347560364, + "grad_norm": 0.33366790413856506, + "learning_rate": 1.7544425176916827e-05, + "loss": 0.2396, + "step": 12276 + }, + { + "epoch": 0.2279419136130223, + "grad_norm": 0.5161924362182617, + "learning_rate": 1.7543659480635992e-05, + "loss": 0.2298, + "step": 12278 + }, + { + "epoch": 0.22797904375044092, + "grad_norm": 0.3733651041984558, + "learning_rate": 1.7542893681708547e-05, + "loss": 0.5555, + "step": 12280 + }, + { + "epoch": 0.22801617388785955, + "grad_norm": 0.20911754667758942, + "learning_rate": 1.7542127780144917e-05, + "loss": 0.2317, + "step": 12282 + }, + { + "epoch": 0.2280533040252782, + "grad_norm": 0.38140398263931274, + "learning_rate": 1.7541361775955514e-05, + "loss": 0.4778, + "step": 12284 + }, + { + "epoch": 0.22809043416269684, + "grad_norm": 0.255867600440979, + "learning_rate": 1.7540595669150766e-05, + "loss": 0.355, + "step": 12286 + }, + { + "epoch": 0.22812756430011547, + "grad_norm": 0.43670031428337097, + "learning_rate": 1.7539829459741102e-05, + "loss": 0.4306, + "step": 12288 + }, + { + "epoch": 0.22816469443753412, + "grad_norm": 0.5742508769035339, + "learning_rate": 1.753906314773694e-05, + "loss": 0.4602, + "step": 12290 + }, + { + "epoch": 0.22820182457495275, + "grad_norm": 0.44967716932296753, + "learning_rate": 1.753829673314871e-05, + "loss": 0.3507, + "step": 12292 + }, + { + "epoch": 0.22823895471237138, + "grad_norm": 0.42827701568603516, + "learning_rate": 1.7537530215986844e-05, + "loss": 0.3715, + "step": 12294 + }, + { + "epoch": 0.22827608484979003, + "grad_norm": 0.39758598804473877, + "learning_rate": 1.7536763596261765e-05, + "loss": 0.4169, + "step": 12296 + }, + { + "epoch": 0.22831321498720866, + "grad_norm": 0.2817027270793915, + "learning_rate": 1.753599687398391e-05, + "loss": 0.4116, + "step": 12298 + }, + { + "epoch": 0.22835034512462732, + "grad_norm": 0.3187933564186096, + "learning_rate": 1.7535230049163713e-05, + "loss": 0.4137, + "step": 12300 + }, + { + "epoch": 0.22838747526204595, + "grad_norm": 0.42051562666893005, + "learning_rate": 1.7534463121811603e-05, + "loss": 0.4312, + "step": 12302 + }, + { + "epoch": 0.22842460539946458, + "grad_norm": 0.3544573485851288, + "learning_rate": 1.7533696091938024e-05, + "loss": 0.3789, + "step": 12304 + }, + { + "epoch": 0.22846173553688323, + "grad_norm": 0.4475204646587372, + "learning_rate": 1.7532928959553403e-05, + "loss": 0.2393, + "step": 12306 + }, + { + "epoch": 0.22849886567430186, + "grad_norm": 0.37221747636795044, + "learning_rate": 1.753216172466818e-05, + "loss": 0.3988, + "step": 12308 + }, + { + "epoch": 0.2285359958117205, + "grad_norm": 0.32714807987213135, + "learning_rate": 1.75313943872928e-05, + "loss": 0.5212, + "step": 12310 + }, + { + "epoch": 0.22857312594913914, + "grad_norm": 0.4023895561695099, + "learning_rate": 1.7530626947437704e-05, + "loss": 0.295, + "step": 12312 + }, + { + "epoch": 0.22861025608655777, + "grad_norm": 0.39455997943878174, + "learning_rate": 1.752985940511333e-05, + "loss": 0.3178, + "step": 12314 + }, + { + "epoch": 0.22864738622397643, + "grad_norm": 0.30868273973464966, + "learning_rate": 1.7529091760330123e-05, + "loss": 0.2897, + "step": 12316 + }, + { + "epoch": 0.22868451636139506, + "grad_norm": 0.5626739263534546, + "learning_rate": 1.752832401309853e-05, + "loss": 0.537, + "step": 12318 + }, + { + "epoch": 0.22872164649881369, + "grad_norm": 0.2995493412017822, + "learning_rate": 1.7527556163428994e-05, + "loss": 0.3179, + "step": 12320 + }, + { + "epoch": 0.22875877663623234, + "grad_norm": 0.3250625729560852, + "learning_rate": 1.752678821133197e-05, + "loss": 0.2105, + "step": 12322 + }, + { + "epoch": 0.22879590677365097, + "grad_norm": 0.2906014621257782, + "learning_rate": 1.7526020156817907e-05, + "loss": 0.2625, + "step": 12324 + }, + { + "epoch": 0.2288330369110696, + "grad_norm": 0.4179259240627289, + "learning_rate": 1.7525251999897247e-05, + "loss": 0.2694, + "step": 12326 + }, + { + "epoch": 0.22887016704848825, + "grad_norm": 0.30020153522491455, + "learning_rate": 1.752448374058045e-05, + "loss": 0.3193, + "step": 12328 + }, + { + "epoch": 0.22890729718590688, + "grad_norm": 0.6141716837882996, + "learning_rate": 1.7523715378877967e-05, + "loss": 0.1568, + "step": 12330 + }, + { + "epoch": 0.2289444273233255, + "grad_norm": 0.29577603936195374, + "learning_rate": 1.7522946914800256e-05, + "loss": 0.4573, + "step": 12332 + }, + { + "epoch": 0.22898155746074417, + "grad_norm": 0.35698315501213074, + "learning_rate": 1.752217834835777e-05, + "loss": 0.4283, + "step": 12334 + }, + { + "epoch": 0.2290186875981628, + "grad_norm": 0.36258038878440857, + "learning_rate": 1.7521409679560967e-05, + "loss": 0.2888, + "step": 12336 + }, + { + "epoch": 0.22905581773558145, + "grad_norm": 0.2752467393875122, + "learning_rate": 1.7520640908420307e-05, + "loss": 0.2378, + "step": 12338 + }, + { + "epoch": 0.22909294787300008, + "grad_norm": 0.34941089153289795, + "learning_rate": 1.7519872034946253e-05, + "loss": 0.2176, + "step": 12340 + }, + { + "epoch": 0.2291300780104187, + "grad_norm": 0.41091299057006836, + "learning_rate": 1.7519103059149264e-05, + "loss": 0.2544, + "step": 12342 + }, + { + "epoch": 0.22916720814783736, + "grad_norm": 0.278972864151001, + "learning_rate": 1.751833398103981e-05, + "loss": 0.383, + "step": 12344 + }, + { + "epoch": 0.229204338285256, + "grad_norm": 0.506033182144165, + "learning_rate": 1.7517564800628343e-05, + "loss": 0.4384, + "step": 12346 + }, + { + "epoch": 0.22924146842267462, + "grad_norm": 0.48699185252189636, + "learning_rate": 1.7516795517925344e-05, + "loss": 0.3769, + "step": 12348 + }, + { + "epoch": 0.22927859856009328, + "grad_norm": 0.366421103477478, + "learning_rate": 1.7516026132941266e-05, + "loss": 0.2798, + "step": 12350 + }, + { + "epoch": 0.2293157286975119, + "grad_norm": 0.4105135202407837, + "learning_rate": 1.751525664568659e-05, + "loss": 0.3215, + "step": 12352 + }, + { + "epoch": 0.22935285883493056, + "grad_norm": 0.3275894522666931, + "learning_rate": 1.751448705617178e-05, + "loss": 0.2186, + "step": 12354 + }, + { + "epoch": 0.2293899889723492, + "grad_norm": 0.3845275938510895, + "learning_rate": 1.751371736440731e-05, + "loss": 0.2763, + "step": 12356 + }, + { + "epoch": 0.22942711910976782, + "grad_norm": 0.3953634798526764, + "learning_rate": 1.7512947570403654e-05, + "loss": 0.3233, + "step": 12358 + }, + { + "epoch": 0.22946424924718647, + "grad_norm": 0.4476756155490875, + "learning_rate": 1.7512177674171282e-05, + "loss": 0.4203, + "step": 12360 + }, + { + "epoch": 0.2295013793846051, + "grad_norm": 0.334692120552063, + "learning_rate": 1.7511407675720678e-05, + "loss": 0.2407, + "step": 12362 + }, + { + "epoch": 0.22953850952202373, + "grad_norm": 0.6168070435523987, + "learning_rate": 1.7510637575062312e-05, + "loss": 0.2661, + "step": 12364 + }, + { + "epoch": 0.2295756396594424, + "grad_norm": 0.28880152106285095, + "learning_rate": 1.7509867372206666e-05, + "loss": 0.2695, + "step": 12366 + }, + { + "epoch": 0.22961276979686102, + "grad_norm": 0.30871060490608215, + "learning_rate": 1.750909706716422e-05, + "loss": 0.143, + "step": 12368 + }, + { + "epoch": 0.22964989993427964, + "grad_norm": 0.33034607768058777, + "learning_rate": 1.7508326659945457e-05, + "loss": 0.4469, + "step": 12370 + }, + { + "epoch": 0.2296870300716983, + "grad_norm": 0.6231948137283325, + "learning_rate": 1.7507556150560856e-05, + "loss": 0.3858, + "step": 12372 + }, + { + "epoch": 0.22972416020911693, + "grad_norm": 0.4673445522785187, + "learning_rate": 1.7506785539020904e-05, + "loss": 0.3185, + "step": 12374 + }, + { + "epoch": 0.22976129034653558, + "grad_norm": 0.28963035345077515, + "learning_rate": 1.7506014825336086e-05, + "loss": 0.3862, + "step": 12376 + }, + { + "epoch": 0.2297984204839542, + "grad_norm": 0.33484646677970886, + "learning_rate": 1.750524400951689e-05, + "loss": 0.2071, + "step": 12378 + }, + { + "epoch": 0.22983555062137284, + "grad_norm": 0.33448272943496704, + "learning_rate": 1.7504473091573804e-05, + "loss": 0.3293, + "step": 12380 + }, + { + "epoch": 0.2298726807587915, + "grad_norm": 0.4470660090446472, + "learning_rate": 1.7503702071517318e-05, + "loss": 0.2937, + "step": 12382 + }, + { + "epoch": 0.22990981089621013, + "grad_norm": 0.5206077694892883, + "learning_rate": 1.7502930949357922e-05, + "loss": 0.5043, + "step": 12384 + }, + { + "epoch": 0.22994694103362875, + "grad_norm": 0.34875521063804626, + "learning_rate": 1.750215972510611e-05, + "loss": 0.467, + "step": 12386 + }, + { + "epoch": 0.2299840711710474, + "grad_norm": 0.6374222636222839, + "learning_rate": 1.7501388398772373e-05, + "loss": 0.1336, + "step": 12388 + }, + { + "epoch": 0.23002120130846604, + "grad_norm": 0.3302450478076935, + "learning_rate": 1.7500616970367216e-05, + "loss": 0.2838, + "step": 12390 + }, + { + "epoch": 0.2300583314458847, + "grad_norm": 0.3730352222919464, + "learning_rate": 1.749984543990112e-05, + "loss": 0.1782, + "step": 12392 + }, + { + "epoch": 0.23009546158330332, + "grad_norm": 0.2858547866344452, + "learning_rate": 1.7499073807384598e-05, + "loss": 0.2981, + "step": 12394 + }, + { + "epoch": 0.23013259172072195, + "grad_norm": 0.34633609652519226, + "learning_rate": 1.7498302072828144e-05, + "loss": 0.4711, + "step": 12396 + }, + { + "epoch": 0.2301697218581406, + "grad_norm": 0.34397944808006287, + "learning_rate": 1.7497530236242258e-05, + "loss": 0.3962, + "step": 12398 + }, + { + "epoch": 0.23020685199555924, + "grad_norm": 0.39055126905441284, + "learning_rate": 1.7496758297637444e-05, + "loss": 0.279, + "step": 12400 + }, + { + "epoch": 0.23024398213297786, + "grad_norm": 0.4564305245876312, + "learning_rate": 1.74959862570242e-05, + "loss": 0.38, + "step": 12402 + }, + { + "epoch": 0.23028111227039652, + "grad_norm": 0.3358359932899475, + "learning_rate": 1.7495214114413043e-05, + "loss": 0.148, + "step": 12404 + }, + { + "epoch": 0.23031824240781515, + "grad_norm": 0.3154211640357971, + "learning_rate": 1.749444186981447e-05, + "loss": 0.4049, + "step": 12406 + }, + { + "epoch": 0.23035537254523378, + "grad_norm": 0.35937538743019104, + "learning_rate": 1.749366952323899e-05, + "loss": 0.2327, + "step": 12408 + }, + { + "epoch": 0.23039250268265243, + "grad_norm": 0.3170214593410492, + "learning_rate": 1.7492897074697117e-05, + "loss": 0.2995, + "step": 12410 + }, + { + "epoch": 0.23042963282007106, + "grad_norm": 0.3615940809249878, + "learning_rate": 1.7492124524199354e-05, + "loss": 0.4211, + "step": 12412 + }, + { + "epoch": 0.23046676295748972, + "grad_norm": 0.27584195137023926, + "learning_rate": 1.7491351871756222e-05, + "loss": 0.234, + "step": 12414 + }, + { + "epoch": 0.23050389309490835, + "grad_norm": 0.3777814507484436, + "learning_rate": 1.749057911737823e-05, + "loss": 0.3478, + "step": 12416 + }, + { + "epoch": 0.23054102323232697, + "grad_norm": 0.5194113254547119, + "learning_rate": 1.748980626107589e-05, + "loss": 0.247, + "step": 12418 + }, + { + "epoch": 0.23057815336974563, + "grad_norm": 0.34694328904151917, + "learning_rate": 1.7489033302859722e-05, + "loss": 0.2752, + "step": 12420 + }, + { + "epoch": 0.23061528350716426, + "grad_norm": 0.35486072301864624, + "learning_rate": 1.7488260242740246e-05, + "loss": 0.439, + "step": 12422 + }, + { + "epoch": 0.2306524136445829, + "grad_norm": 0.3262186050415039, + "learning_rate": 1.7487487080727975e-05, + "loss": 0.2797, + "step": 12424 + }, + { + "epoch": 0.23068954378200154, + "grad_norm": 0.4208851158618927, + "learning_rate": 1.7486713816833433e-05, + "loss": 0.312, + "step": 12426 + }, + { + "epoch": 0.23072667391942017, + "grad_norm": 0.32234078645706177, + "learning_rate": 1.7485940451067143e-05, + "loss": 0.3958, + "step": 12428 + }, + { + "epoch": 0.23076380405683883, + "grad_norm": 0.36034539341926575, + "learning_rate": 1.7485166983439625e-05, + "loss": 0.3528, + "step": 12430 + }, + { + "epoch": 0.23080093419425746, + "grad_norm": 0.7856832146644592, + "learning_rate": 1.7484393413961406e-05, + "loss": 0.2883, + "step": 12432 + }, + { + "epoch": 0.23083806433167608, + "grad_norm": 0.40456244349479675, + "learning_rate": 1.748361974264301e-05, + "loss": 0.3403, + "step": 12434 + }, + { + "epoch": 0.23087519446909474, + "grad_norm": 0.42169487476348877, + "learning_rate": 1.7482845969494963e-05, + "loss": 0.372, + "step": 12436 + }, + { + "epoch": 0.23091232460651337, + "grad_norm": 0.3566226363182068, + "learning_rate": 1.74820720945278e-05, + "loss": 0.4594, + "step": 12438 + }, + { + "epoch": 0.230949454743932, + "grad_norm": 0.673967719078064, + "learning_rate": 1.7481298117752046e-05, + "loss": 0.3753, + "step": 12440 + }, + { + "epoch": 0.23098658488135065, + "grad_norm": 0.27730801701545715, + "learning_rate": 1.7480524039178234e-05, + "loss": 0.303, + "step": 12442 + }, + { + "epoch": 0.23102371501876928, + "grad_norm": 0.38060349225997925, + "learning_rate": 1.7479749858816895e-05, + "loss": 0.2556, + "step": 12444 + }, + { + "epoch": 0.2310608451561879, + "grad_norm": 0.48266640305519104, + "learning_rate": 1.7478975576678565e-05, + "loss": 0.2467, + "step": 12446 + }, + { + "epoch": 0.23109797529360657, + "grad_norm": 0.3496796786785126, + "learning_rate": 1.7478201192773783e-05, + "loss": 0.1936, + "step": 12448 + }, + { + "epoch": 0.2311351054310252, + "grad_norm": 0.4143736660480499, + "learning_rate": 1.747742670711308e-05, + "loss": 0.4739, + "step": 12450 + }, + { + "epoch": 0.23117223556844385, + "grad_norm": 0.34172284603118896, + "learning_rate": 1.7476652119706996e-05, + "loss": 0.5804, + "step": 12452 + }, + { + "epoch": 0.23120936570586248, + "grad_norm": 0.41916823387145996, + "learning_rate": 1.747587743056607e-05, + "loss": 0.408, + "step": 12454 + }, + { + "epoch": 0.2312464958432811, + "grad_norm": 0.533547580242157, + "learning_rate": 1.747510263970085e-05, + "loss": 0.1849, + "step": 12456 + }, + { + "epoch": 0.23128362598069976, + "grad_norm": 0.32442694902420044, + "learning_rate": 1.7474327747121874e-05, + "loss": 0.2626, + "step": 12458 + }, + { + "epoch": 0.2313207561181184, + "grad_norm": 0.35257571935653687, + "learning_rate": 1.7473552752839682e-05, + "loss": 0.3978, + "step": 12460 + }, + { + "epoch": 0.23135788625553702, + "grad_norm": 0.35884883999824524, + "learning_rate": 1.7472777656864825e-05, + "loss": 0.3507, + "step": 12462 + }, + { + "epoch": 0.23139501639295568, + "grad_norm": 0.381747841835022, + "learning_rate": 1.7472002459207847e-05, + "loss": 0.1963, + "step": 12464 + }, + { + "epoch": 0.2314321465303743, + "grad_norm": 0.2982572615146637, + "learning_rate": 1.7471227159879295e-05, + "loss": 0.2763, + "step": 12466 + }, + { + "epoch": 0.23146927666779293, + "grad_norm": 0.333759069442749, + "learning_rate": 1.747045175888972e-05, + "loss": 0.3619, + "step": 12468 + }, + { + "epoch": 0.2315064068052116, + "grad_norm": 0.3700467348098755, + "learning_rate": 1.7469676256249677e-05, + "loss": 0.2686, + "step": 12470 + }, + { + "epoch": 0.23154353694263022, + "grad_norm": 0.35166335105895996, + "learning_rate": 1.746890065196971e-05, + "loss": 0.2595, + "step": 12472 + }, + { + "epoch": 0.23158066708004887, + "grad_norm": 0.40320897102355957, + "learning_rate": 1.7468124946060384e-05, + "loss": 0.2864, + "step": 12474 + }, + { + "epoch": 0.2316177972174675, + "grad_norm": 0.3219544589519501, + "learning_rate": 1.746734913853224e-05, + "loss": 0.2817, + "step": 12476 + }, + { + "epoch": 0.23165492735488613, + "grad_norm": 0.39119938015937805, + "learning_rate": 1.7466573229395844e-05, + "loss": 0.2267, + "step": 12478 + }, + { + "epoch": 0.2316920574923048, + "grad_norm": 0.30432385206222534, + "learning_rate": 1.746579721866175e-05, + "loss": 0.3832, + "step": 12480 + }, + { + "epoch": 0.23172918762972342, + "grad_norm": 0.4062613844871521, + "learning_rate": 1.746502110634052e-05, + "loss": 0.3976, + "step": 12482 + }, + { + "epoch": 0.23176631776714204, + "grad_norm": 0.29385146498680115, + "learning_rate": 1.7464244892442714e-05, + "loss": 0.1382, + "step": 12484 + }, + { + "epoch": 0.2318034479045607, + "grad_norm": 0.2695964276790619, + "learning_rate": 1.7463468576978893e-05, + "loss": 0.5599, + "step": 12486 + }, + { + "epoch": 0.23184057804197933, + "grad_norm": 0.7436281442642212, + "learning_rate": 1.746269215995962e-05, + "loss": 0.4025, + "step": 12488 + }, + { + "epoch": 0.23187770817939798, + "grad_norm": 0.35808977484703064, + "learning_rate": 1.746191564139546e-05, + "loss": 0.3018, + "step": 12490 + }, + { + "epoch": 0.2319148383168166, + "grad_norm": 0.2579288184642792, + "learning_rate": 1.7461139021296974e-05, + "loss": 0.2899, + "step": 12492 + }, + { + "epoch": 0.23195196845423524, + "grad_norm": 0.31625860929489136, + "learning_rate": 1.7460362299674743e-05, + "loss": 0.1102, + "step": 12494 + }, + { + "epoch": 0.2319890985916539, + "grad_norm": 0.3438818156719208, + "learning_rate": 1.7459585476539324e-05, + "loss": 0.3074, + "step": 12496 + }, + { + "epoch": 0.23202622872907253, + "grad_norm": 0.4542659521102905, + "learning_rate": 1.7458808551901286e-05, + "loss": 0.3865, + "step": 12498 + }, + { + "epoch": 0.23206335886649115, + "grad_norm": 0.2878901958465576, + "learning_rate": 1.7458031525771212e-05, + "loss": 0.1274, + "step": 12500 + }, + { + "epoch": 0.2321004890039098, + "grad_norm": 0.3579825162887573, + "learning_rate": 1.7457254398159665e-05, + "loss": 0.2869, + "step": 12502 + }, + { + "epoch": 0.23213761914132844, + "grad_norm": 0.4130638539791107, + "learning_rate": 1.7456477169077228e-05, + "loss": 0.3883, + "step": 12504 + }, + { + "epoch": 0.23217474927874707, + "grad_norm": 0.37749090790748596, + "learning_rate": 1.7455699838534463e-05, + "loss": 0.4673, + "step": 12506 + }, + { + "epoch": 0.23221187941616572, + "grad_norm": 0.33898088335990906, + "learning_rate": 1.745492240654196e-05, + "loss": 0.2841, + "step": 12508 + }, + { + "epoch": 0.23224900955358435, + "grad_norm": 0.34774860739707947, + "learning_rate": 1.7454144873110293e-05, + "loss": 0.1943, + "step": 12510 + }, + { + "epoch": 0.232286139691003, + "grad_norm": 0.28457504510879517, + "learning_rate": 1.745336723825004e-05, + "loss": 0.3138, + "step": 12512 + }, + { + "epoch": 0.23232326982842164, + "grad_norm": 0.3134916126728058, + "learning_rate": 1.745258950197179e-05, + "loss": 0.3829, + "step": 12514 + }, + { + "epoch": 0.23236039996584026, + "grad_norm": 0.36652836203575134, + "learning_rate": 1.7451811664286115e-05, + "loss": 0.2887, + "step": 12516 + }, + { + "epoch": 0.23239753010325892, + "grad_norm": 0.45158714056015015, + "learning_rate": 1.7451033725203602e-05, + "loss": 0.3936, + "step": 12518 + }, + { + "epoch": 0.23243466024067755, + "grad_norm": 0.3174259662628174, + "learning_rate": 1.7450255684734846e-05, + "loss": 0.1917, + "step": 12520 + }, + { + "epoch": 0.23247179037809618, + "grad_norm": 0.47589996457099915, + "learning_rate": 1.744947754289042e-05, + "loss": 0.3594, + "step": 12522 + }, + { + "epoch": 0.23250892051551483, + "grad_norm": 0.4630495309829712, + "learning_rate": 1.744869929968092e-05, + "loss": 0.1076, + "step": 12524 + }, + { + "epoch": 0.23254605065293346, + "grad_norm": 0.30611947178840637, + "learning_rate": 1.7447920955116934e-05, + "loss": 0.3806, + "step": 12526 + }, + { + "epoch": 0.23258318079035212, + "grad_norm": 0.4588335156440735, + "learning_rate": 1.744714250920905e-05, + "loss": 0.2879, + "step": 12528 + }, + { + "epoch": 0.23262031092777075, + "grad_norm": 0.4138917624950409, + "learning_rate": 1.744636396196787e-05, + "loss": 0.3003, + "step": 12530 + }, + { + "epoch": 0.23265744106518937, + "grad_norm": 0.5048940777778625, + "learning_rate": 1.7445585313403976e-05, + "loss": 0.3434, + "step": 12532 + }, + { + "epoch": 0.23269457120260803, + "grad_norm": 0.22947223484516144, + "learning_rate": 1.744480656352797e-05, + "loss": 0.2576, + "step": 12534 + }, + { + "epoch": 0.23273170134002666, + "grad_norm": 0.4218013882637024, + "learning_rate": 1.7444027712350448e-05, + "loss": 0.2409, + "step": 12536 + }, + { + "epoch": 0.2327688314774453, + "grad_norm": 0.39741289615631104, + "learning_rate": 1.7443248759882002e-05, + "loss": 0.4571, + "step": 12538 + }, + { + "epoch": 0.23280596161486394, + "grad_norm": 0.33087971806526184, + "learning_rate": 1.7442469706133234e-05, + "loss": 0.4341, + "step": 12540 + }, + { + "epoch": 0.23284309175228257, + "grad_norm": 0.30327117443084717, + "learning_rate": 1.7441690551114752e-05, + "loss": 0.3528, + "step": 12542 + }, + { + "epoch": 0.2328802218897012, + "grad_norm": 0.4206446409225464, + "learning_rate": 1.744091129483715e-05, + "loss": 0.322, + "step": 12544 + }, + { + "epoch": 0.23291735202711986, + "grad_norm": 0.38756147027015686, + "learning_rate": 1.7440131937311034e-05, + "loss": 0.4019, + "step": 12546 + }, + { + "epoch": 0.23295448216453848, + "grad_norm": 0.3450041711330414, + "learning_rate": 1.7439352478547008e-05, + "loss": 0.3146, + "step": 12548 + }, + { + "epoch": 0.23299161230195714, + "grad_norm": 0.34918153285980225, + "learning_rate": 1.7438572918555677e-05, + "loss": 0.3231, + "step": 12550 + }, + { + "epoch": 0.23302874243937577, + "grad_norm": 0.29507169127464294, + "learning_rate": 1.743779325734765e-05, + "loss": 0.4331, + "step": 12552 + }, + { + "epoch": 0.2330658725767944, + "grad_norm": 0.5163176655769348, + "learning_rate": 1.7437013494933533e-05, + "loss": 0.3267, + "step": 12554 + }, + { + "epoch": 0.23310300271421305, + "grad_norm": 0.40732666850090027, + "learning_rate": 1.7436233631323943e-05, + "loss": 0.1744, + "step": 12556 + }, + { + "epoch": 0.23314013285163168, + "grad_norm": 0.3646830916404724, + "learning_rate": 1.743545366652949e-05, + "loss": 0.3587, + "step": 12558 + }, + { + "epoch": 0.2331772629890503, + "grad_norm": 0.4157034456729889, + "learning_rate": 1.7434673600560773e-05, + "loss": 0.3563, + "step": 12560 + }, + { + "epoch": 0.23321439312646897, + "grad_norm": 0.32352957129478455, + "learning_rate": 1.743389343342843e-05, + "loss": 0.1657, + "step": 12562 + }, + { + "epoch": 0.2332515232638876, + "grad_norm": 0.35385289788246155, + "learning_rate": 1.7433113165143056e-05, + "loss": 0.2354, + "step": 12564 + }, + { + "epoch": 0.23328865340130625, + "grad_norm": 0.26118889451026917, + "learning_rate": 1.743233279571528e-05, + "loss": 0.4764, + "step": 12566 + }, + { + "epoch": 0.23332578353872488, + "grad_norm": 0.34386008977890015, + "learning_rate": 1.7431552325155718e-05, + "loss": 0.3517, + "step": 12568 + }, + { + "epoch": 0.2333629136761435, + "grad_norm": 0.3405663073062897, + "learning_rate": 1.7430771753474987e-05, + "loss": 0.3109, + "step": 12570 + }, + { + "epoch": 0.23340004381356216, + "grad_norm": 0.2750672698020935, + "learning_rate": 1.742999108068371e-05, + "loss": 0.2927, + "step": 12572 + }, + { + "epoch": 0.2334371739509808, + "grad_norm": 0.4655449688434601, + "learning_rate": 1.742921030679251e-05, + "loss": 0.1777, + "step": 12574 + }, + { + "epoch": 0.23347430408839942, + "grad_norm": 0.28576651215553284, + "learning_rate": 1.7428429431812013e-05, + "loss": 0.3276, + "step": 12576 + }, + { + "epoch": 0.23351143422581808, + "grad_norm": 0.5428327918052673, + "learning_rate": 1.742764845575284e-05, + "loss": 0.2782, + "step": 12578 + }, + { + "epoch": 0.2335485643632367, + "grad_norm": 0.4101560413837433, + "learning_rate": 1.7426867378625622e-05, + "loss": 0.293, + "step": 12580 + }, + { + "epoch": 0.23358569450065533, + "grad_norm": 0.41373971104621887, + "learning_rate": 1.7426086200440983e-05, + "loss": 0.2911, + "step": 12582 + }, + { + "epoch": 0.233622824638074, + "grad_norm": 0.4218546152114868, + "learning_rate": 1.7425304921209555e-05, + "loss": 0.6767, + "step": 12584 + }, + { + "epoch": 0.23365995477549262, + "grad_norm": 0.3192867040634155, + "learning_rate": 1.7424523540941968e-05, + "loss": 0.3068, + "step": 12586 + }, + { + "epoch": 0.23369708491291127, + "grad_norm": 0.27396726608276367, + "learning_rate": 1.7423742059648855e-05, + "loss": 0.2926, + "step": 12588 + }, + { + "epoch": 0.2337342150503299, + "grad_norm": 0.30776506662368774, + "learning_rate": 1.742296047734085e-05, + "loss": 0.1969, + "step": 12590 + }, + { + "epoch": 0.23377134518774853, + "grad_norm": 1.1420944929122925, + "learning_rate": 1.7422178794028587e-05, + "loss": 0.4351, + "step": 12592 + }, + { + "epoch": 0.2338084753251672, + "grad_norm": 0.34347182512283325, + "learning_rate": 1.74213970097227e-05, + "loss": 0.2518, + "step": 12594 + }, + { + "epoch": 0.23384560546258581, + "grad_norm": 0.37608620524406433, + "learning_rate": 1.7420615124433834e-05, + "loss": 0.4885, + "step": 12596 + }, + { + "epoch": 0.23388273560000444, + "grad_norm": 0.3722425103187561, + "learning_rate": 1.7419833138172618e-05, + "loss": 0.2205, + "step": 12598 + }, + { + "epoch": 0.2339198657374231, + "grad_norm": 0.3573843836784363, + "learning_rate": 1.7419051050949703e-05, + "loss": 0.1322, + "step": 12600 + }, + { + "epoch": 0.23395699587484173, + "grad_norm": 0.3446076214313507, + "learning_rate": 1.7418268862775723e-05, + "loss": 0.3544, + "step": 12602 + }, + { + "epoch": 0.23399412601226038, + "grad_norm": 0.4843568801879883, + "learning_rate": 1.7417486573661326e-05, + "loss": 0.3412, + "step": 12604 + }, + { + "epoch": 0.234031256149679, + "grad_norm": 0.30958664417266846, + "learning_rate": 1.7416704183617153e-05, + "loss": 0.5088, + "step": 12606 + }, + { + "epoch": 0.23406838628709764, + "grad_norm": 0.34208205342292786, + "learning_rate": 1.7415921692653852e-05, + "loss": 0.1449, + "step": 12608 + }, + { + "epoch": 0.2341055164245163, + "grad_norm": 0.4737250506877899, + "learning_rate": 1.741513910078207e-05, + "loss": 0.3077, + "step": 12610 + }, + { + "epoch": 0.23414264656193493, + "grad_norm": 0.3565233647823334, + "learning_rate": 1.7414356408012457e-05, + "loss": 0.1516, + "step": 12612 + }, + { + "epoch": 0.23417977669935355, + "grad_norm": 0.5153803825378418, + "learning_rate": 1.741357361435566e-05, + "loss": 0.3821, + "step": 12614 + }, + { + "epoch": 0.2342169068367722, + "grad_norm": 0.2750087380409241, + "learning_rate": 1.7412790719822334e-05, + "loss": 0.3858, + "step": 12616 + }, + { + "epoch": 0.23425403697419084, + "grad_norm": 0.47064000368118286, + "learning_rate": 1.741200772442313e-05, + "loss": 0.2999, + "step": 12618 + }, + { + "epoch": 0.23429116711160947, + "grad_norm": 0.4685410261154175, + "learning_rate": 1.7411224628168703e-05, + "loss": 0.3678, + "step": 12620 + }, + { + "epoch": 0.23432829724902812, + "grad_norm": 0.6276955604553223, + "learning_rate": 1.7410441431069704e-05, + "loss": 0.4268, + "step": 12622 + }, + { + "epoch": 0.23436542738644675, + "grad_norm": 0.28432828187942505, + "learning_rate": 1.7409658133136797e-05, + "loss": 0.2987, + "step": 12624 + }, + { + "epoch": 0.2344025575238654, + "grad_norm": 0.31281784176826477, + "learning_rate": 1.740887473438064e-05, + "loss": 0.3165, + "step": 12626 + }, + { + "epoch": 0.23443968766128404, + "grad_norm": 0.3922826945781708, + "learning_rate": 1.7408091234811887e-05, + "loss": 0.2571, + "step": 12628 + }, + { + "epoch": 0.23447681779870266, + "grad_norm": 0.32701578736305237, + "learning_rate": 1.74073076344412e-05, + "loss": 0.2985, + "step": 12630 + }, + { + "epoch": 0.23451394793612132, + "grad_norm": 0.4044848084449768, + "learning_rate": 1.7406523933279244e-05, + "loss": 0.2234, + "step": 12632 + }, + { + "epoch": 0.23455107807353995, + "grad_norm": 0.33668404817581177, + "learning_rate": 1.7405740131336685e-05, + "loss": 0.3202, + "step": 12634 + }, + { + "epoch": 0.23458820821095858, + "grad_norm": 0.32177063822746277, + "learning_rate": 1.7404956228624187e-05, + "loss": 0.308, + "step": 12636 + }, + { + "epoch": 0.23462533834837723, + "grad_norm": 0.3719465732574463, + "learning_rate": 1.740417222515241e-05, + "loss": 0.3683, + "step": 12638 + }, + { + "epoch": 0.23466246848579586, + "grad_norm": 0.41589921712875366, + "learning_rate": 1.740338812093203e-05, + "loss": 0.3301, + "step": 12640 + }, + { + "epoch": 0.23469959862321452, + "grad_norm": 0.9213447570800781, + "learning_rate": 1.7402603915973714e-05, + "loss": 0.5306, + "step": 12642 + }, + { + "epoch": 0.23473672876063315, + "grad_norm": 0.5603246092796326, + "learning_rate": 1.740181961028813e-05, + "loss": 0.2628, + "step": 12644 + }, + { + "epoch": 0.23477385889805177, + "grad_norm": 0.42980051040649414, + "learning_rate": 1.7401035203885954e-05, + "loss": 0.4079, + "step": 12646 + }, + { + "epoch": 0.23481098903547043, + "grad_norm": 0.3186267018318176, + "learning_rate": 1.7400250696777857e-05, + "loss": 0.36, + "step": 12648 + }, + { + "epoch": 0.23484811917288906, + "grad_norm": 0.35135483741760254, + "learning_rate": 1.7399466088974515e-05, + "loss": 0.136, + "step": 12650 + }, + { + "epoch": 0.2348852493103077, + "grad_norm": 0.3205353617668152, + "learning_rate": 1.73986813804866e-05, + "loss": 0.2567, + "step": 12652 + }, + { + "epoch": 0.23492237944772634, + "grad_norm": 0.3409160077571869, + "learning_rate": 1.73978965713248e-05, + "loss": 0.2062, + "step": 12654 + }, + { + "epoch": 0.23495950958514497, + "grad_norm": 0.27931809425354004, + "learning_rate": 1.7397111661499782e-05, + "loss": 0.4996, + "step": 12656 + }, + { + "epoch": 0.2349966397225636, + "grad_norm": 0.38363218307495117, + "learning_rate": 1.739632665102223e-05, + "loss": 0.2805, + "step": 12658 + }, + { + "epoch": 0.23503376985998226, + "grad_norm": 0.3391798436641693, + "learning_rate": 1.739554153990283e-05, + "loss": 0.2609, + "step": 12660 + }, + { + "epoch": 0.23507089999740088, + "grad_norm": 0.2957044839859009, + "learning_rate": 1.739475632815226e-05, + "loss": 0.1946, + "step": 12662 + }, + { + "epoch": 0.23510803013481954, + "grad_norm": 0.4695781171321869, + "learning_rate": 1.739397101578121e-05, + "loss": 0.4279, + "step": 12664 + }, + { + "epoch": 0.23514516027223817, + "grad_norm": 0.2800712585449219, + "learning_rate": 1.739318560280036e-05, + "loss": 0.285, + "step": 12666 + }, + { + "epoch": 0.2351822904096568, + "grad_norm": 0.23228482902050018, + "learning_rate": 1.7392400089220392e-05, + "loss": 0.3475, + "step": 12668 + }, + { + "epoch": 0.23521942054707545, + "grad_norm": 0.42808568477630615, + "learning_rate": 1.739161447505201e-05, + "loss": 0.2079, + "step": 12670 + }, + { + "epoch": 0.23525655068449408, + "grad_norm": 0.48333513736724854, + "learning_rate": 1.7390828760305894e-05, + "loss": 0.4686, + "step": 12672 + }, + { + "epoch": 0.2352936808219127, + "grad_norm": 0.38416048884391785, + "learning_rate": 1.7390042944992734e-05, + "loss": 0.1864, + "step": 12674 + }, + { + "epoch": 0.23533081095933137, + "grad_norm": 0.2735489308834076, + "learning_rate": 1.7389257029123228e-05, + "loss": 0.4269, + "step": 12676 + }, + { + "epoch": 0.23536794109675, + "grad_norm": 0.4203188419342041, + "learning_rate": 1.7388471012708068e-05, + "loss": 0.3147, + "step": 12678 + }, + { + "epoch": 0.23540507123416865, + "grad_norm": 0.3212920129299164, + "learning_rate": 1.7387684895757947e-05, + "loss": 0.2707, + "step": 12680 + }, + { + "epoch": 0.23544220137158728, + "grad_norm": 0.3601970076560974, + "learning_rate": 1.7386898678283563e-05, + "loss": 0.2869, + "step": 12682 + }, + { + "epoch": 0.2354793315090059, + "grad_norm": 0.2809280753135681, + "learning_rate": 1.7386112360295614e-05, + "loss": 0.1522, + "step": 12684 + }, + { + "epoch": 0.23551646164642456, + "grad_norm": 0.2453979104757309, + "learning_rate": 1.7385325941804797e-05, + "loss": 0.3479, + "step": 12686 + }, + { + "epoch": 0.2355535917838432, + "grad_norm": 0.3738706707954407, + "learning_rate": 1.738453942282182e-05, + "loss": 0.4889, + "step": 12688 + }, + { + "epoch": 0.23559072192126182, + "grad_norm": 0.4311172664165497, + "learning_rate": 1.738375280335738e-05, + "loss": 0.3496, + "step": 12690 + }, + { + "epoch": 0.23562785205868048, + "grad_norm": 0.30339083075523376, + "learning_rate": 1.738296608342218e-05, + "loss": 0.4413, + "step": 12692 + }, + { + "epoch": 0.2356649821960991, + "grad_norm": 0.4342459738254547, + "learning_rate": 1.7382179263026926e-05, + "loss": 0.2265, + "step": 12694 + }, + { + "epoch": 0.23570211233351773, + "grad_norm": 0.4688303768634796, + "learning_rate": 1.7381392342182328e-05, + "loss": 0.3414, + "step": 12696 + }, + { + "epoch": 0.2357392424709364, + "grad_norm": 0.25577959418296814, + "learning_rate": 1.7380605320899087e-05, + "loss": 0.5067, + "step": 12698 + }, + { + "epoch": 0.23577637260835502, + "grad_norm": 0.43003860116004944, + "learning_rate": 1.7379818199187914e-05, + "loss": 0.2793, + "step": 12700 + }, + { + "epoch": 0.23581350274577367, + "grad_norm": 0.3236466944217682, + "learning_rate": 1.737903097705952e-05, + "loss": 0.2849, + "step": 12702 + }, + { + "epoch": 0.2358506328831923, + "grad_norm": 0.5448622107505798, + "learning_rate": 1.7378243654524622e-05, + "loss": 0.4428, + "step": 12704 + }, + { + "epoch": 0.23588776302061093, + "grad_norm": 0.36351892352104187, + "learning_rate": 1.7377456231593923e-05, + "loss": 0.37, + "step": 12706 + }, + { + "epoch": 0.23592489315802959, + "grad_norm": 0.34030577540397644, + "learning_rate": 1.7376668708278144e-05, + "loss": 0.2143, + "step": 12708 + }, + { + "epoch": 0.23596202329544821, + "grad_norm": 0.4404785633087158, + "learning_rate": 1.7375881084588e-05, + "loss": 0.4456, + "step": 12710 + }, + { + "epoch": 0.23599915343286684, + "grad_norm": 0.37547069787979126, + "learning_rate": 1.737509336053421e-05, + "loss": 0.2474, + "step": 12712 + }, + { + "epoch": 0.2360362835702855, + "grad_norm": 0.4028439521789551, + "learning_rate": 1.7374305536127486e-05, + "loss": 0.3614, + "step": 12714 + }, + { + "epoch": 0.23607341370770413, + "grad_norm": 0.4768480658531189, + "learning_rate": 1.7373517611378558e-05, + "loss": 0.2723, + "step": 12716 + }, + { + "epoch": 0.23611054384512278, + "grad_norm": 0.3648746609687805, + "learning_rate": 1.7372729586298137e-05, + "loss": 0.2954, + "step": 12718 + }, + { + "epoch": 0.2361476739825414, + "grad_norm": 0.5499078631401062, + "learning_rate": 1.737194146089695e-05, + "loss": 0.285, + "step": 12720 + }, + { + "epoch": 0.23618480411996004, + "grad_norm": 0.3659633696079254, + "learning_rate": 1.7371153235185724e-05, + "loss": 0.2028, + "step": 12722 + }, + { + "epoch": 0.2362219342573787, + "grad_norm": 0.42026105523109436, + "learning_rate": 1.737036490917518e-05, + "loss": 0.358, + "step": 12724 + }, + { + "epoch": 0.23625906439479732, + "grad_norm": 0.4046187400817871, + "learning_rate": 1.7369576482876046e-05, + "loss": 0.4486, + "step": 12726 + }, + { + "epoch": 0.23629619453221595, + "grad_norm": 0.4823373854160309, + "learning_rate": 1.7368787956299052e-05, + "loss": 0.1377, + "step": 12728 + }, + { + "epoch": 0.2363333246696346, + "grad_norm": 0.29686641693115234, + "learning_rate": 1.7367999329454926e-05, + "loss": 0.2842, + "step": 12730 + }, + { + "epoch": 0.23637045480705324, + "grad_norm": 0.41739293932914734, + "learning_rate": 1.7367210602354395e-05, + "loss": 0.3491, + "step": 12732 + }, + { + "epoch": 0.23640758494447187, + "grad_norm": 0.38745439052581787, + "learning_rate": 1.7366421775008202e-05, + "loss": 0.459, + "step": 12734 + }, + { + "epoch": 0.23644471508189052, + "grad_norm": 0.4241142272949219, + "learning_rate": 1.7365632847427068e-05, + "loss": 0.2465, + "step": 12736 + }, + { + "epoch": 0.23648184521930915, + "grad_norm": 0.4007609486579895, + "learning_rate": 1.7364843819621736e-05, + "loss": 0.2899, + "step": 12738 + }, + { + "epoch": 0.2365189753567278, + "grad_norm": 0.5675888657569885, + "learning_rate": 1.736405469160294e-05, + "loss": 0.3971, + "step": 12740 + }, + { + "epoch": 0.23655610549414643, + "grad_norm": 0.3920831084251404, + "learning_rate": 1.7363265463381416e-05, + "loss": 0.3328, + "step": 12742 + }, + { + "epoch": 0.23659323563156506, + "grad_norm": 0.37745344638824463, + "learning_rate": 1.7362476134967906e-05, + "loss": 0.4852, + "step": 12744 + }, + { + "epoch": 0.23663036576898372, + "grad_norm": 0.3095090389251709, + "learning_rate": 1.7361686706373147e-05, + "loss": 0.3429, + "step": 12746 + }, + { + "epoch": 0.23666749590640235, + "grad_norm": 0.696565568447113, + "learning_rate": 1.7360897177607885e-05, + "loss": 0.2833, + "step": 12748 + }, + { + "epoch": 0.23670462604382098, + "grad_norm": 0.38258102536201477, + "learning_rate": 1.736010754868286e-05, + "loss": 0.3988, + "step": 12750 + }, + { + "epoch": 0.23674175618123963, + "grad_norm": 0.4913133680820465, + "learning_rate": 1.7359317819608814e-05, + "loss": 0.283, + "step": 12752 + }, + { + "epoch": 0.23677888631865826, + "grad_norm": 0.40903839468955994, + "learning_rate": 1.73585279903965e-05, + "loss": 0.5389, + "step": 12754 + }, + { + "epoch": 0.23681601645607692, + "grad_norm": 0.36749300360679626, + "learning_rate": 1.7357738061056665e-05, + "loss": 0.5194, + "step": 12756 + }, + { + "epoch": 0.23685314659349554, + "grad_norm": 0.4181799590587616, + "learning_rate": 1.7356948031600046e-05, + "loss": 0.3729, + "step": 12758 + }, + { + "epoch": 0.23689027673091417, + "grad_norm": 0.4059695601463318, + "learning_rate": 1.7356157902037404e-05, + "loss": 0.4474, + "step": 12760 + }, + { + "epoch": 0.23692740686833283, + "grad_norm": 0.21081207692623138, + "learning_rate": 1.735536767237949e-05, + "loss": 0.1456, + "step": 12762 + }, + { + "epoch": 0.23696453700575146, + "grad_norm": 0.4715377688407898, + "learning_rate": 1.735457734263705e-05, + "loss": 0.2404, + "step": 12764 + }, + { + "epoch": 0.23700166714317009, + "grad_norm": 0.3792320489883423, + "learning_rate": 1.7353786912820846e-05, + "loss": 0.1972, + "step": 12766 + }, + { + "epoch": 0.23703879728058874, + "grad_norm": 0.333168089389801, + "learning_rate": 1.7352996382941624e-05, + "loss": 0.229, + "step": 12768 + }, + { + "epoch": 0.23707592741800737, + "grad_norm": 0.5305964350700378, + "learning_rate": 1.735220575301015e-05, + "loss": 0.5028, + "step": 12770 + }, + { + "epoch": 0.237113057555426, + "grad_norm": 0.2878890335559845, + "learning_rate": 1.7351415023037178e-05, + "loss": 0.2405, + "step": 12772 + }, + { + "epoch": 0.23715018769284466, + "grad_norm": 0.2512670159339905, + "learning_rate": 1.7350624193033464e-05, + "loss": 0.3051, + "step": 12774 + }, + { + "epoch": 0.23718731783026328, + "grad_norm": 0.35368621349334717, + "learning_rate": 1.7349833263009776e-05, + "loss": 0.2299, + "step": 12776 + }, + { + "epoch": 0.23722444796768194, + "grad_norm": 0.8200502395629883, + "learning_rate": 1.734904223297687e-05, + "loss": 0.2192, + "step": 12778 + }, + { + "epoch": 0.23726157810510057, + "grad_norm": 0.367302805185318, + "learning_rate": 1.734825110294552e-05, + "loss": 0.4873, + "step": 12780 + }, + { + "epoch": 0.2372987082425192, + "grad_norm": 0.3810739815235138, + "learning_rate": 1.734745987292647e-05, + "loss": 0.2621, + "step": 12782 + }, + { + "epoch": 0.23733583837993785, + "grad_norm": 0.37058472633361816, + "learning_rate": 1.7346668542930508e-05, + "loss": 0.2029, + "step": 12784 + }, + { + "epoch": 0.23737296851735648, + "grad_norm": 0.3579244017601013, + "learning_rate": 1.734587711296839e-05, + "loss": 0.2997, + "step": 12786 + }, + { + "epoch": 0.2374100986547751, + "grad_norm": 0.37096911668777466, + "learning_rate": 1.734508558305089e-05, + "loss": 0.3205, + "step": 12788 + }, + { + "epoch": 0.23744722879219377, + "grad_norm": 0.30552271008491516, + "learning_rate": 1.7344293953188775e-05, + "loss": 0.4221, + "step": 12790 + }, + { + "epoch": 0.2374843589296124, + "grad_norm": 0.4109688699245453, + "learning_rate": 1.734350222339282e-05, + "loss": 0.4132, + "step": 12792 + }, + { + "epoch": 0.23752148906703105, + "grad_norm": 0.6047911643981934, + "learning_rate": 1.7342710393673795e-05, + "loss": 0.4507, + "step": 12794 + }, + { + "epoch": 0.23755861920444968, + "grad_norm": 0.29877474904060364, + "learning_rate": 1.7341918464042474e-05, + "loss": 0.4751, + "step": 12796 + }, + { + "epoch": 0.2375957493418683, + "grad_norm": 0.44438010454177856, + "learning_rate": 1.7341126434509633e-05, + "loss": 0.366, + "step": 12798 + }, + { + "epoch": 0.23763287947928696, + "grad_norm": 0.43472474813461304, + "learning_rate": 1.7340334305086052e-05, + "loss": 0.353, + "step": 12800 + }, + { + "epoch": 0.2376700096167056, + "grad_norm": 0.28846633434295654, + "learning_rate": 1.733954207578251e-05, + "loss": 0.3997, + "step": 12802 + }, + { + "epoch": 0.23770713975412422, + "grad_norm": 0.3043912351131439, + "learning_rate": 1.7338749746609783e-05, + "loss": 0.4526, + "step": 12804 + }, + { + "epoch": 0.23774426989154288, + "grad_norm": 0.41464704275131226, + "learning_rate": 1.7337957317578654e-05, + "loss": 0.1918, + "step": 12806 + }, + { + "epoch": 0.2377814000289615, + "grad_norm": 0.30406907200813293, + "learning_rate": 1.7337164788699908e-05, + "loss": 0.2603, + "step": 12808 + }, + { + "epoch": 0.23781853016638013, + "grad_norm": 0.33571094274520874, + "learning_rate": 1.7336372159984323e-05, + "loss": 0.4518, + "step": 12810 + }, + { + "epoch": 0.2378556603037988, + "grad_norm": 0.3648044168949127, + "learning_rate": 1.733557943144269e-05, + "loss": 0.3453, + "step": 12812 + }, + { + "epoch": 0.23789279044121742, + "grad_norm": 0.2715219557285309, + "learning_rate": 1.7334786603085792e-05, + "loss": 0.4085, + "step": 12814 + }, + { + "epoch": 0.23792992057863607, + "grad_norm": 0.24025827646255493, + "learning_rate": 1.733399367492442e-05, + "loss": 0.2742, + "step": 12816 + }, + { + "epoch": 0.2379670507160547, + "grad_norm": 0.3469150960445404, + "learning_rate": 1.7333200646969358e-05, + "loss": 0.2603, + "step": 12818 + }, + { + "epoch": 0.23800418085347333, + "grad_norm": 0.364886611700058, + "learning_rate": 1.7332407519231406e-05, + "loss": 0.4342, + "step": 12820 + }, + { + "epoch": 0.23804131099089199, + "grad_norm": 0.3781435191631317, + "learning_rate": 1.733161429172135e-05, + "loss": 0.3451, + "step": 12822 + }, + { + "epoch": 0.23807844112831061, + "grad_norm": 0.24098388850688934, + "learning_rate": 1.7330820964449984e-05, + "loss": 0.2947, + "step": 12824 + }, + { + "epoch": 0.23811557126572924, + "grad_norm": 0.3040056526660919, + "learning_rate": 1.73300275374281e-05, + "loss": 0.4029, + "step": 12826 + }, + { + "epoch": 0.2381527014031479, + "grad_norm": 0.457302451133728, + "learning_rate": 1.7329234010666503e-05, + "loss": 0.1853, + "step": 12828 + }, + { + "epoch": 0.23818983154056653, + "grad_norm": 0.38976603746414185, + "learning_rate": 1.732844038417598e-05, + "loss": 0.324, + "step": 12830 + }, + { + "epoch": 0.23822696167798518, + "grad_norm": 0.38723477721214294, + "learning_rate": 1.7327646657967335e-05, + "loss": 0.4528, + "step": 12832 + }, + { + "epoch": 0.2382640918154038, + "grad_norm": 0.38451531529426575, + "learning_rate": 1.7326852832051368e-05, + "loss": 0.2307, + "step": 12834 + }, + { + "epoch": 0.23830122195282244, + "grad_norm": 0.41715332865715027, + "learning_rate": 1.732605890643888e-05, + "loss": 0.2744, + "step": 12836 + }, + { + "epoch": 0.2383383520902411, + "grad_norm": 0.5411593317985535, + "learning_rate": 1.7325264881140677e-05, + "loss": 0.2953, + "step": 12838 + }, + { + "epoch": 0.23837548222765972, + "grad_norm": 0.4014984369277954, + "learning_rate": 1.7324470756167557e-05, + "loss": 0.2533, + "step": 12840 + }, + { + "epoch": 0.23841261236507835, + "grad_norm": 0.42186856269836426, + "learning_rate": 1.732367653153033e-05, + "loss": 0.2571, + "step": 12842 + }, + { + "epoch": 0.238449742502497, + "grad_norm": 0.39245909452438354, + "learning_rate": 1.7322882207239808e-05, + "loss": 0.3315, + "step": 12844 + }, + { + "epoch": 0.23848687263991564, + "grad_norm": 0.3485632836818695, + "learning_rate": 1.732208778330679e-05, + "loss": 0.4277, + "step": 12846 + }, + { + "epoch": 0.23852400277733427, + "grad_norm": 0.4353030025959015, + "learning_rate": 1.7321293259742088e-05, + "loss": 0.2823, + "step": 12848 + }, + { + "epoch": 0.23856113291475292, + "grad_norm": 0.3382247984409332, + "learning_rate": 1.7320498636556514e-05, + "loss": 0.4936, + "step": 12850 + }, + { + "epoch": 0.23859826305217155, + "grad_norm": 0.36883148550987244, + "learning_rate": 1.7319703913760883e-05, + "loss": 0.1888, + "step": 12852 + }, + { + "epoch": 0.2386353931895902, + "grad_norm": 0.4192979633808136, + "learning_rate": 1.7318909091366004e-05, + "loss": 0.2758, + "step": 12854 + }, + { + "epoch": 0.23867252332700883, + "grad_norm": 0.3251652121543884, + "learning_rate": 1.7318114169382697e-05, + "loss": 0.3805, + "step": 12856 + }, + { + "epoch": 0.23870965346442746, + "grad_norm": 0.38844212889671326, + "learning_rate": 1.7317319147821777e-05, + "loss": 0.4159, + "step": 12858 + }, + { + "epoch": 0.23874678360184612, + "grad_norm": 0.30372580885887146, + "learning_rate": 1.7316524026694062e-05, + "loss": 0.2436, + "step": 12860 + }, + { + "epoch": 0.23878391373926475, + "grad_norm": 0.3229014575481415, + "learning_rate": 1.731572880601037e-05, + "loss": 0.5588, + "step": 12862 + }, + { + "epoch": 0.23882104387668338, + "grad_norm": 0.2849932312965393, + "learning_rate": 1.731493348578152e-05, + "loss": 0.193, + "step": 12864 + }, + { + "epoch": 0.23885817401410203, + "grad_norm": 0.5650273561477661, + "learning_rate": 1.731413806601834e-05, + "loss": 0.3707, + "step": 12866 + }, + { + "epoch": 0.23889530415152066, + "grad_norm": 0.3212253451347351, + "learning_rate": 1.7313342546731643e-05, + "loss": 0.3246, + "step": 12868 + }, + { + "epoch": 0.23893243428893932, + "grad_norm": 0.2947758138179779, + "learning_rate": 1.7312546927932264e-05, + "loss": 0.2349, + "step": 12870 + }, + { + "epoch": 0.23896956442635794, + "grad_norm": 0.43986088037490845, + "learning_rate": 1.7311751209631027e-05, + "loss": 0.4349, + "step": 12872 + }, + { + "epoch": 0.23900669456377657, + "grad_norm": 0.2511743903160095, + "learning_rate": 1.7310955391838754e-05, + "loss": 0.4066, + "step": 12874 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.36121439933776855, + "learning_rate": 1.7310159474566275e-05, + "loss": 0.2057, + "step": 12876 + }, + { + "epoch": 0.23908095483861386, + "grad_norm": 0.37918469309806824, + "learning_rate": 1.7309363457824428e-05, + "loss": 0.2967, + "step": 12878 + }, + { + "epoch": 0.23911808497603249, + "grad_norm": 0.48838090896606445, + "learning_rate": 1.7308567341624033e-05, + "loss": 0.3519, + "step": 12880 + }, + { + "epoch": 0.23915521511345114, + "grad_norm": 0.2849925756454468, + "learning_rate": 1.7307771125975935e-05, + "loss": 0.3813, + "step": 12882 + }, + { + "epoch": 0.23919234525086977, + "grad_norm": 0.3952077627182007, + "learning_rate": 1.7306974810890954e-05, + "loss": 0.4064, + "step": 12884 + }, + { + "epoch": 0.2392294753882884, + "grad_norm": 0.5538647174835205, + "learning_rate": 1.7306178396379937e-05, + "loss": 0.151, + "step": 12886 + }, + { + "epoch": 0.23926660552570705, + "grad_norm": 0.3408423662185669, + "learning_rate": 1.7305381882453717e-05, + "loss": 0.3177, + "step": 12888 + }, + { + "epoch": 0.23930373566312568, + "grad_norm": 0.20129266381263733, + "learning_rate": 1.7304585269123126e-05, + "loss": 0.3393, + "step": 12890 + }, + { + "epoch": 0.23934086580054434, + "grad_norm": 0.4235449433326721, + "learning_rate": 1.730378855639901e-05, + "loss": 0.4103, + "step": 12892 + }, + { + "epoch": 0.23937799593796297, + "grad_norm": 0.4500958323478699, + "learning_rate": 1.7302991744292214e-05, + "loss": 0.3064, + "step": 12894 + }, + { + "epoch": 0.2394151260753816, + "grad_norm": 0.2916750907897949, + "learning_rate": 1.7302194832813577e-05, + "loss": 0.4196, + "step": 12896 + }, + { + "epoch": 0.23945225621280025, + "grad_norm": 0.2420589029788971, + "learning_rate": 1.7301397821973937e-05, + "loss": 0.37, + "step": 12898 + }, + { + "epoch": 0.23948938635021888, + "grad_norm": 0.5080774426460266, + "learning_rate": 1.7300600711784142e-05, + "loss": 0.4509, + "step": 12900 + }, + { + "epoch": 0.2395265164876375, + "grad_norm": 0.35668542981147766, + "learning_rate": 1.7299803502255042e-05, + "loss": 0.407, + "step": 12902 + }, + { + "epoch": 0.23956364662505616, + "grad_norm": 0.2377709001302719, + "learning_rate": 1.7299006193397478e-05, + "loss": 0.0846, + "step": 12904 + }, + { + "epoch": 0.2396007767624748, + "grad_norm": 0.30553528666496277, + "learning_rate": 1.7298208785222305e-05, + "loss": 0.1847, + "step": 12906 + }, + { + "epoch": 0.23963790689989345, + "grad_norm": 2.6624503135681152, + "learning_rate": 1.7297411277740374e-05, + "loss": 0.433, + "step": 12908 + }, + { + "epoch": 0.23967503703731208, + "grad_norm": 0.5066721439361572, + "learning_rate": 1.7296613670962528e-05, + "loss": 0.235, + "step": 12910 + }, + { + "epoch": 0.2397121671747307, + "grad_norm": 0.3370498716831207, + "learning_rate": 1.729581596489963e-05, + "loss": 0.3136, + "step": 12912 + }, + { + "epoch": 0.23974929731214936, + "grad_norm": 0.3181498348712921, + "learning_rate": 1.7295018159562527e-05, + "loss": 0.4081, + "step": 12914 + }, + { + "epoch": 0.239786427449568, + "grad_norm": 0.2863244414329529, + "learning_rate": 1.7294220254962083e-05, + "loss": 0.3422, + "step": 12916 + }, + { + "epoch": 0.23982355758698662, + "grad_norm": 0.5080907940864563, + "learning_rate": 1.7293422251109144e-05, + "loss": 0.4009, + "step": 12918 + }, + { + "epoch": 0.23986068772440527, + "grad_norm": 0.2951410710811615, + "learning_rate": 1.729262414801458e-05, + "loss": 0.2905, + "step": 12920 + }, + { + "epoch": 0.2398978178618239, + "grad_norm": 0.4922964870929718, + "learning_rate": 1.7291825945689244e-05, + "loss": 0.2797, + "step": 12922 + }, + { + "epoch": 0.23993494799924253, + "grad_norm": 0.3180672824382782, + "learning_rate": 1.7291027644143997e-05, + "loss": 0.3735, + "step": 12924 + }, + { + "epoch": 0.2399720781366612, + "grad_norm": 0.3576677143573761, + "learning_rate": 1.72902292433897e-05, + "loss": 0.4923, + "step": 12926 + }, + { + "epoch": 0.24000920827407982, + "grad_norm": 0.35889574885368347, + "learning_rate": 1.7289430743437226e-05, + "loss": 0.2224, + "step": 12928 + }, + { + "epoch": 0.24004633841149847, + "grad_norm": 0.37475645542144775, + "learning_rate": 1.728863214429743e-05, + "loss": 0.4123, + "step": 12930 + }, + { + "epoch": 0.2400834685489171, + "grad_norm": 0.4415918290615082, + "learning_rate": 1.728783344598118e-05, + "loss": 0.3953, + "step": 12932 + }, + { + "epoch": 0.24012059868633573, + "grad_norm": 0.44548550248146057, + "learning_rate": 1.728703464849935e-05, + "loss": 0.4443, + "step": 12934 + }, + { + "epoch": 0.24015772882375439, + "grad_norm": 0.3238242566585541, + "learning_rate": 1.728623575186281e-05, + "loss": 0.2713, + "step": 12936 + }, + { + "epoch": 0.240194858961173, + "grad_norm": 0.40348416566848755, + "learning_rate": 1.728543675608242e-05, + "loss": 0.3249, + "step": 12938 + }, + { + "epoch": 0.24023198909859164, + "grad_norm": 0.46820053458213806, + "learning_rate": 1.7284637661169058e-05, + "loss": 0.2933, + "step": 12940 + }, + { + "epoch": 0.2402691192360103, + "grad_norm": 0.37959709763526917, + "learning_rate": 1.72838384671336e-05, + "loss": 0.4103, + "step": 12942 + }, + { + "epoch": 0.24030624937342893, + "grad_norm": 0.44922611117362976, + "learning_rate": 1.7283039173986917e-05, + "loss": 0.1696, + "step": 12944 + }, + { + "epoch": 0.24034337951084758, + "grad_norm": 0.33543509244918823, + "learning_rate": 1.7282239781739885e-05, + "loss": 0.2149, + "step": 12946 + }, + { + "epoch": 0.2403805096482662, + "grad_norm": 0.3309846520423889, + "learning_rate": 1.7281440290403386e-05, + "loss": 0.2548, + "step": 12948 + }, + { + "epoch": 0.24041763978568484, + "grad_norm": 0.41331303119659424, + "learning_rate": 1.728064069998829e-05, + "loss": 0.3792, + "step": 12950 + }, + { + "epoch": 0.2404547699231035, + "grad_norm": 0.3837539851665497, + "learning_rate": 1.7279841010505488e-05, + "loss": 0.3256, + "step": 12952 + }, + { + "epoch": 0.24049190006052212, + "grad_norm": 0.4244285821914673, + "learning_rate": 1.7279041221965852e-05, + "loss": 0.243, + "step": 12954 + }, + { + "epoch": 0.24052903019794075, + "grad_norm": 0.30095329880714417, + "learning_rate": 1.7278241334380265e-05, + "loss": 0.4204, + "step": 12956 + }, + { + "epoch": 0.2405661603353594, + "grad_norm": 0.4282054007053375, + "learning_rate": 1.727744134775962e-05, + "loss": 0.222, + "step": 12958 + }, + { + "epoch": 0.24060329047277804, + "grad_norm": 0.31342217326164246, + "learning_rate": 1.7276641262114795e-05, + "loss": 0.2665, + "step": 12960 + }, + { + "epoch": 0.24064042061019666, + "grad_norm": 0.5499395132064819, + "learning_rate": 1.7275841077456677e-05, + "loss": 0.3654, + "step": 12962 + }, + { + "epoch": 0.24067755074761532, + "grad_norm": 0.3552193343639374, + "learning_rate": 1.7275040793796157e-05, + "loss": 0.5202, + "step": 12964 + }, + { + "epoch": 0.24071468088503395, + "grad_norm": 0.3272906541824341, + "learning_rate": 1.727424041114412e-05, + "loss": 0.271, + "step": 12966 + }, + { + "epoch": 0.2407518110224526, + "grad_norm": 0.49146223068237305, + "learning_rate": 1.7273439929511466e-05, + "loss": 0.3732, + "step": 12968 + }, + { + "epoch": 0.24078894115987123, + "grad_norm": 0.5061178803443909, + "learning_rate": 1.7272639348909078e-05, + "loss": 0.4085, + "step": 12970 + }, + { + "epoch": 0.24082607129728986, + "grad_norm": 0.39546099305152893, + "learning_rate": 1.727183866934785e-05, + "loss": 0.3143, + "step": 12972 + }, + { + "epoch": 0.24086320143470852, + "grad_norm": 0.5701996088027954, + "learning_rate": 1.727103789083868e-05, + "loss": 0.341, + "step": 12974 + }, + { + "epoch": 0.24090033157212715, + "grad_norm": 0.3677946627140045, + "learning_rate": 1.727023701339247e-05, + "loss": 0.2923, + "step": 12976 + }, + { + "epoch": 0.24093746170954577, + "grad_norm": 0.3409651815891266, + "learning_rate": 1.7269436037020106e-05, + "loss": 0.3411, + "step": 12978 + }, + { + "epoch": 0.24097459184696443, + "grad_norm": 0.29803889989852905, + "learning_rate": 1.726863496173249e-05, + "loss": 0.4735, + "step": 12980 + }, + { + "epoch": 0.24101172198438306, + "grad_norm": 0.2405932992696762, + "learning_rate": 1.726783378754053e-05, + "loss": 0.3239, + "step": 12982 + }, + { + "epoch": 0.24104885212180172, + "grad_norm": 0.3031086027622223, + "learning_rate": 1.7267032514455117e-05, + "loss": 0.4129, + "step": 12984 + }, + { + "epoch": 0.24108598225922034, + "grad_norm": 0.3151809871196747, + "learning_rate": 1.726623114248716e-05, + "loss": 0.4174, + "step": 12986 + }, + { + "epoch": 0.24112311239663897, + "grad_norm": 0.48531627655029297, + "learning_rate": 1.726542967164756e-05, + "loss": 0.3294, + "step": 12988 + }, + { + "epoch": 0.24116024253405763, + "grad_norm": 0.4126521646976471, + "learning_rate": 1.7264628101947226e-05, + "loss": 0.3746, + "step": 12990 + }, + { + "epoch": 0.24119737267147626, + "grad_norm": 0.3163250684738159, + "learning_rate": 1.7263826433397066e-05, + "loss": 0.2242, + "step": 12992 + }, + { + "epoch": 0.24123450280889489, + "grad_norm": 0.3708619475364685, + "learning_rate": 1.7263024666007986e-05, + "loss": 0.225, + "step": 12994 + }, + { + "epoch": 0.24127163294631354, + "grad_norm": 0.30232563614845276, + "learning_rate": 1.7262222799790893e-05, + "loss": 0.1594, + "step": 12996 + }, + { + "epoch": 0.24130876308373217, + "grad_norm": 0.3360409438610077, + "learning_rate": 1.7261420834756704e-05, + "loss": 0.2606, + "step": 12998 + }, + { + "epoch": 0.2413458932211508, + "grad_norm": 0.4139024019241333, + "learning_rate": 1.7260618770916325e-05, + "loss": 0.3651, + "step": 13000 + }, + { + "epoch": 0.24138302335856945, + "grad_norm": 0.34493640065193176, + "learning_rate": 1.725981660828067e-05, + "loss": 0.2583, + "step": 13002 + }, + { + "epoch": 0.24142015349598808, + "grad_norm": 0.4492935240268707, + "learning_rate": 1.725901434686066e-05, + "loss": 0.2989, + "step": 13004 + }, + { + "epoch": 0.24145728363340674, + "grad_norm": 0.25691118836402893, + "learning_rate": 1.725821198666721e-05, + "loss": 0.2701, + "step": 13006 + }, + { + "epoch": 0.24149441377082537, + "grad_norm": 0.33022820949554443, + "learning_rate": 1.7257409527711233e-05, + "loss": 0.3715, + "step": 13008 + }, + { + "epoch": 0.241531543908244, + "grad_norm": 0.29889681935310364, + "learning_rate": 1.7256606970003652e-05, + "loss": 0.3527, + "step": 13010 + }, + { + "epoch": 0.24156867404566265, + "grad_norm": 0.33318424224853516, + "learning_rate": 1.7255804313555384e-05, + "loss": 0.4145, + "step": 13012 + }, + { + "epoch": 0.24160580418308128, + "grad_norm": 0.29783084988594055, + "learning_rate": 1.7255001558377356e-05, + "loss": 0.238, + "step": 13014 + }, + { + "epoch": 0.2416429343204999, + "grad_norm": 0.28089606761932373, + "learning_rate": 1.7254198704480487e-05, + "loss": 0.2178, + "step": 13016 + }, + { + "epoch": 0.24168006445791856, + "grad_norm": 0.35583245754241943, + "learning_rate": 1.72533957518757e-05, + "loss": 0.3795, + "step": 13018 + }, + { + "epoch": 0.2417171945953372, + "grad_norm": 0.40556272864341736, + "learning_rate": 1.7252592700573923e-05, + "loss": 0.3324, + "step": 13020 + }, + { + "epoch": 0.24175432473275585, + "grad_norm": 0.35644981265068054, + "learning_rate": 1.7251789550586086e-05, + "loss": 0.2773, + "step": 13022 + }, + { + "epoch": 0.24179145487017448, + "grad_norm": 0.35568147897720337, + "learning_rate": 1.7250986301923115e-05, + "loss": 0.2557, + "step": 13024 + }, + { + "epoch": 0.2418285850075931, + "grad_norm": 0.3319592773914337, + "learning_rate": 1.725018295459594e-05, + "loss": 0.2168, + "step": 13026 + }, + { + "epoch": 0.24186571514501176, + "grad_norm": 0.29799574613571167, + "learning_rate": 1.724937950861549e-05, + "loss": 0.3508, + "step": 13028 + }, + { + "epoch": 0.2419028452824304, + "grad_norm": 0.339399129152298, + "learning_rate": 1.72485759639927e-05, + "loss": 0.2623, + "step": 13030 + }, + { + "epoch": 0.24193997541984902, + "grad_norm": 0.41107305884361267, + "learning_rate": 1.7247772320738503e-05, + "loss": 0.2632, + "step": 13032 + }, + { + "epoch": 0.24197710555726767, + "grad_norm": 0.3391827344894409, + "learning_rate": 1.7246968578863836e-05, + "loss": 0.4389, + "step": 13034 + }, + { + "epoch": 0.2420142356946863, + "grad_norm": 0.45916324853897095, + "learning_rate": 1.7246164738379632e-05, + "loss": 0.3764, + "step": 13036 + }, + { + "epoch": 0.24205136583210493, + "grad_norm": 0.35283342003822327, + "learning_rate": 1.724536079929683e-05, + "loss": 0.1874, + "step": 13038 + }, + { + "epoch": 0.2420884959695236, + "grad_norm": 0.4572621285915375, + "learning_rate": 1.7244556761626372e-05, + "loss": 0.4463, + "step": 13040 + }, + { + "epoch": 0.24212562610694222, + "grad_norm": 0.4193453788757324, + "learning_rate": 1.724375262537919e-05, + "loss": 0.3986, + "step": 13042 + }, + { + "epoch": 0.24216275624436087, + "grad_norm": 0.3569111227989197, + "learning_rate": 1.7242948390566243e-05, + "loss": 0.388, + "step": 13044 + }, + { + "epoch": 0.2421998863817795, + "grad_norm": 0.2860376536846161, + "learning_rate": 1.7242144057198457e-05, + "loss": 0.2487, + "step": 13046 + }, + { + "epoch": 0.24223701651919813, + "grad_norm": 0.6685859560966492, + "learning_rate": 1.7241339625286783e-05, + "loss": 0.3617, + "step": 13048 + }, + { + "epoch": 0.24227414665661678, + "grad_norm": 0.25891509652137756, + "learning_rate": 1.7240535094842172e-05, + "loss": 0.31, + "step": 13050 + }, + { + "epoch": 0.2423112767940354, + "grad_norm": 0.2941647469997406, + "learning_rate": 1.723973046587556e-05, + "loss": 0.2279, + "step": 13052 + }, + { + "epoch": 0.24234840693145404, + "grad_norm": 0.42375341057777405, + "learning_rate": 1.7238925738397906e-05, + "loss": 0.5482, + "step": 13054 + }, + { + "epoch": 0.2423855370688727, + "grad_norm": 0.3624899983406067, + "learning_rate": 1.7238120912420155e-05, + "loss": 0.3964, + "step": 13056 + }, + { + "epoch": 0.24242266720629133, + "grad_norm": 0.3396706283092499, + "learning_rate": 1.723731598795326e-05, + "loss": 0.402, + "step": 13058 + }, + { + "epoch": 0.24245979734370998, + "grad_norm": 0.3352181017398834, + "learning_rate": 1.7236510965008168e-05, + "loss": 0.3338, + "step": 13060 + }, + { + "epoch": 0.2424969274811286, + "grad_norm": 0.2515247166156769, + "learning_rate": 1.723570584359584e-05, + "loss": 0.2738, + "step": 13062 + }, + { + "epoch": 0.24253405761854724, + "grad_norm": 0.293032705783844, + "learning_rate": 1.7234900623727232e-05, + "loss": 0.2411, + "step": 13064 + }, + { + "epoch": 0.2425711877559659, + "grad_norm": 0.3973437547683716, + "learning_rate": 1.7234095305413295e-05, + "loss": 0.4046, + "step": 13066 + }, + { + "epoch": 0.24260831789338452, + "grad_norm": 0.321879506111145, + "learning_rate": 1.723328988866499e-05, + "loss": 0.3887, + "step": 13068 + }, + { + "epoch": 0.24264544803080315, + "grad_norm": 0.4179864227771759, + "learning_rate": 1.7232484373493274e-05, + "loss": 0.3397, + "step": 13070 + }, + { + "epoch": 0.2426825781682218, + "grad_norm": 0.29522237181663513, + "learning_rate": 1.7231678759909113e-05, + "loss": 0.4788, + "step": 13072 + }, + { + "epoch": 0.24271970830564044, + "grad_norm": 0.33710727095603943, + "learning_rate": 1.723087304792346e-05, + "loss": 0.3504, + "step": 13074 + }, + { + "epoch": 0.24275683844305906, + "grad_norm": 0.251616507768631, + "learning_rate": 1.723006723754729e-05, + "loss": 0.2514, + "step": 13076 + }, + { + "epoch": 0.24279396858047772, + "grad_norm": 0.4470879137516022, + "learning_rate": 1.7229261328791557e-05, + "loss": 0.4094, + "step": 13078 + }, + { + "epoch": 0.24283109871789635, + "grad_norm": 0.29195520281791687, + "learning_rate": 1.7228455321667233e-05, + "loss": 0.506, + "step": 13080 + }, + { + "epoch": 0.242868228855315, + "grad_norm": 0.34468114376068115, + "learning_rate": 1.7227649216185283e-05, + "loss": 0.2528, + "step": 13082 + }, + { + "epoch": 0.24290535899273363, + "grad_norm": 1.06187903881073, + "learning_rate": 1.722684301235668e-05, + "loss": 0.4135, + "step": 13084 + }, + { + "epoch": 0.24294248913015226, + "grad_norm": 0.3771825432777405, + "learning_rate": 1.7226036710192385e-05, + "loss": 0.2728, + "step": 13086 + }, + { + "epoch": 0.24297961926757092, + "grad_norm": 0.3228921890258789, + "learning_rate": 1.7225230309703378e-05, + "loss": 0.5064, + "step": 13088 + }, + { + "epoch": 0.24301674940498955, + "grad_norm": 0.32422319054603577, + "learning_rate": 1.722442381090063e-05, + "loss": 0.5009, + "step": 13090 + }, + { + "epoch": 0.24305387954240817, + "grad_norm": 0.5176020264625549, + "learning_rate": 1.7223617213795113e-05, + "loss": 0.3618, + "step": 13092 + }, + { + "epoch": 0.24309100967982683, + "grad_norm": 0.47559887170791626, + "learning_rate": 1.72228105183978e-05, + "loss": 0.435, + "step": 13094 + }, + { + "epoch": 0.24312813981724546, + "grad_norm": 0.38654300570487976, + "learning_rate": 1.7222003724719672e-05, + "loss": 0.3917, + "step": 13096 + }, + { + "epoch": 0.24316526995466411, + "grad_norm": 0.2886367738246918, + "learning_rate": 1.7221196832771707e-05, + "loss": 0.2535, + "step": 13098 + }, + { + "epoch": 0.24320240009208274, + "grad_norm": 0.3581579029560089, + "learning_rate": 1.722038984256488e-05, + "loss": 0.2081, + "step": 13100 + }, + { + "epoch": 0.24323953022950137, + "grad_norm": 0.3354935050010681, + "learning_rate": 1.721958275411018e-05, + "loss": 0.2213, + "step": 13102 + }, + { + "epoch": 0.24327666036692003, + "grad_norm": 0.44140952825546265, + "learning_rate": 1.721877556741858e-05, + "loss": 0.1751, + "step": 13104 + }, + { + "epoch": 0.24331379050433866, + "grad_norm": 0.32853755354881287, + "learning_rate": 1.7217968282501066e-05, + "loss": 0.4917, + "step": 13106 + }, + { + "epoch": 0.24335092064175728, + "grad_norm": 0.355752170085907, + "learning_rate": 1.721716089936863e-05, + "loss": 0.3928, + "step": 13108 + }, + { + "epoch": 0.24338805077917594, + "grad_norm": 0.4365628957748413, + "learning_rate": 1.7216353418032245e-05, + "loss": 0.198, + "step": 13110 + }, + { + "epoch": 0.24342518091659457, + "grad_norm": 0.4583829343318939, + "learning_rate": 1.7215545838502913e-05, + "loss": 0.3316, + "step": 13112 + }, + { + "epoch": 0.2434623110540132, + "grad_norm": 0.3248903751373291, + "learning_rate": 1.7214738160791607e-05, + "loss": 0.3134, + "step": 13114 + }, + { + "epoch": 0.24349944119143185, + "grad_norm": 0.33115360140800476, + "learning_rate": 1.7213930384909332e-05, + "loss": 0.3954, + "step": 13116 + }, + { + "epoch": 0.24353657132885048, + "grad_norm": 0.2920989692211151, + "learning_rate": 1.7213122510867068e-05, + "loss": 0.1111, + "step": 13118 + }, + { + "epoch": 0.24357370146626914, + "grad_norm": 0.41305750608444214, + "learning_rate": 1.7212314538675813e-05, + "loss": 0.3176, + "step": 13120 + }, + { + "epoch": 0.24361083160368777, + "grad_norm": 0.44872161746025085, + "learning_rate": 1.7211506468346562e-05, + "loss": 0.3408, + "step": 13122 + }, + { + "epoch": 0.2436479617411064, + "grad_norm": 0.25415483117103577, + "learning_rate": 1.721069829989031e-05, + "loss": 0.1128, + "step": 13124 + }, + { + "epoch": 0.24368509187852505, + "grad_norm": 0.3859167695045471, + "learning_rate": 1.720989003331805e-05, + "loss": 0.3534, + "step": 13126 + }, + { + "epoch": 0.24372222201594368, + "grad_norm": 0.41660287976264954, + "learning_rate": 1.7209081668640787e-05, + "loss": 0.3616, + "step": 13128 + }, + { + "epoch": 0.2437593521533623, + "grad_norm": 0.322981595993042, + "learning_rate": 1.7208273205869513e-05, + "loss": 0.4666, + "step": 13130 + }, + { + "epoch": 0.24379648229078096, + "grad_norm": 0.4274020493030548, + "learning_rate": 1.7207464645015233e-05, + "loss": 0.3299, + "step": 13132 + }, + { + "epoch": 0.2438336124281996, + "grad_norm": 0.35858261585235596, + "learning_rate": 1.7206655986088946e-05, + "loss": 0.3434, + "step": 13134 + }, + { + "epoch": 0.24387074256561825, + "grad_norm": 0.3371645212173462, + "learning_rate": 1.720584722910166e-05, + "loss": 0.2476, + "step": 13136 + }, + { + "epoch": 0.24390787270303688, + "grad_norm": 0.33661943674087524, + "learning_rate": 1.7205038374064375e-05, + "loss": 0.2343, + "step": 13138 + }, + { + "epoch": 0.2439450028404555, + "grad_norm": 0.37192678451538086, + "learning_rate": 1.72042294209881e-05, + "loss": 0.5577, + "step": 13140 + }, + { + "epoch": 0.24398213297787416, + "grad_norm": 0.3558404743671417, + "learning_rate": 1.720342036988384e-05, + "loss": 0.2811, + "step": 13142 + }, + { + "epoch": 0.2440192631152928, + "grad_norm": 0.27632248401641846, + "learning_rate": 1.720261122076261e-05, + "loss": 0.3152, + "step": 13144 + }, + { + "epoch": 0.24405639325271142, + "grad_norm": 0.5439814925193787, + "learning_rate": 1.7201801973635416e-05, + "loss": 0.2343, + "step": 13146 + }, + { + "epoch": 0.24409352339013007, + "grad_norm": 0.3401106595993042, + "learning_rate": 1.7200992628513265e-05, + "loss": 0.313, + "step": 13148 + }, + { + "epoch": 0.2441306535275487, + "grad_norm": 0.4511454105377197, + "learning_rate": 1.7200183185407176e-05, + "loss": 0.2871, + "step": 13150 + }, + { + "epoch": 0.24416778366496733, + "grad_norm": 0.2866542935371399, + "learning_rate": 1.719937364432816e-05, + "loss": 0.3224, + "step": 13152 + }, + { + "epoch": 0.244204913802386, + "grad_norm": 0.38121873140335083, + "learning_rate": 1.7198564005287234e-05, + "loss": 0.4019, + "step": 13154 + }, + { + "epoch": 0.24424204393980462, + "grad_norm": 0.4640549421310425, + "learning_rate": 1.719775426829541e-05, + "loss": 0.2572, + "step": 13156 + }, + { + "epoch": 0.24427917407722327, + "grad_norm": 0.3459872603416443, + "learning_rate": 1.7196944433363714e-05, + "loss": 0.3596, + "step": 13158 + }, + { + "epoch": 0.2443163042146419, + "grad_norm": 0.3143463134765625, + "learning_rate": 1.7196134500503162e-05, + "loss": 0.3612, + "step": 13160 + }, + { + "epoch": 0.24435343435206053, + "grad_norm": 0.36931899189949036, + "learning_rate": 1.7195324469724774e-05, + "loss": 0.276, + "step": 13162 + }, + { + "epoch": 0.24439056448947918, + "grad_norm": 0.4110381603240967, + "learning_rate": 1.7194514341039572e-05, + "loss": 0.5731, + "step": 13164 + }, + { + "epoch": 0.2444276946268978, + "grad_norm": 0.3556995391845703, + "learning_rate": 1.719370411445858e-05, + "loss": 0.3167, + "step": 13166 + }, + { + "epoch": 0.24446482476431644, + "grad_norm": 0.30289989709854126, + "learning_rate": 1.719289378999282e-05, + "loss": 0.2691, + "step": 13168 + }, + { + "epoch": 0.2445019549017351, + "grad_norm": 0.3584682047367096, + "learning_rate": 1.7192083367653324e-05, + "loss": 0.2148, + "step": 13170 + }, + { + "epoch": 0.24453908503915373, + "grad_norm": 0.3848544657230377, + "learning_rate": 1.7191272847451117e-05, + "loss": 0.2792, + "step": 13172 + }, + { + "epoch": 0.24457621517657238, + "grad_norm": 0.42471927404403687, + "learning_rate": 1.7190462229397224e-05, + "loss": 0.2176, + "step": 13174 + }, + { + "epoch": 0.244613345313991, + "grad_norm": 0.3466026782989502, + "learning_rate": 1.718965151350268e-05, + "loss": 0.3751, + "step": 13176 + }, + { + "epoch": 0.24465047545140964, + "grad_norm": 0.41115647554397583, + "learning_rate": 1.7188840699778516e-05, + "loss": 0.3462, + "step": 13178 + }, + { + "epoch": 0.2446876055888283, + "grad_norm": 0.41987869143486023, + "learning_rate": 1.7188029788235757e-05, + "loss": 0.3363, + "step": 13180 + }, + { + "epoch": 0.24472473572624692, + "grad_norm": 0.3660711944103241, + "learning_rate": 1.718721877888545e-05, + "loss": 0.2393, + "step": 13182 + }, + { + "epoch": 0.24476186586366555, + "grad_norm": 0.49280649423599243, + "learning_rate": 1.718640767173862e-05, + "loss": 0.3269, + "step": 13184 + }, + { + "epoch": 0.2447989960010842, + "grad_norm": 0.3349471390247345, + "learning_rate": 1.7185596466806308e-05, + "loss": 0.2278, + "step": 13186 + }, + { + "epoch": 0.24483612613850284, + "grad_norm": 1.1708298921585083, + "learning_rate": 1.7184785164099553e-05, + "loss": 0.3443, + "step": 13188 + }, + { + "epoch": 0.24487325627592146, + "grad_norm": 0.35315534472465515, + "learning_rate": 1.718397376362939e-05, + "loss": 0.2783, + "step": 13190 + }, + { + "epoch": 0.24491038641334012, + "grad_norm": 0.34423086047172546, + "learning_rate": 1.7183162265406865e-05, + "loss": 0.2307, + "step": 13192 + }, + { + "epoch": 0.24494751655075875, + "grad_norm": 0.39267128705978394, + "learning_rate": 1.7182350669443017e-05, + "loss": 0.2213, + "step": 13194 + }, + { + "epoch": 0.2449846466881774, + "grad_norm": 0.422139972448349, + "learning_rate": 1.718153897574889e-05, + "loss": 0.3307, + "step": 13196 + }, + { + "epoch": 0.24502177682559603, + "grad_norm": 0.44690245389938354, + "learning_rate": 1.7180727184335525e-05, + "loss": 0.4274, + "step": 13198 + }, + { + "epoch": 0.24505890696301466, + "grad_norm": 0.41924849152565, + "learning_rate": 1.7179915295213976e-05, + "loss": 0.2368, + "step": 13200 + }, + { + "epoch": 0.24509603710043332, + "grad_norm": 0.3475898802280426, + "learning_rate": 1.7179103308395285e-05, + "loss": 0.1549, + "step": 13202 + }, + { + "epoch": 0.24513316723785195, + "grad_norm": 0.3980828821659088, + "learning_rate": 1.7178291223890503e-05, + "loss": 0.3345, + "step": 13204 + }, + { + "epoch": 0.24517029737527057, + "grad_norm": 0.3410588800907135, + "learning_rate": 1.717747904171068e-05, + "loss": 0.3222, + "step": 13206 + }, + { + "epoch": 0.24520742751268923, + "grad_norm": 0.30045706033706665, + "learning_rate": 1.7176666761866862e-05, + "loss": 0.4054, + "step": 13208 + }, + { + "epoch": 0.24524455765010786, + "grad_norm": 0.3879125714302063, + "learning_rate": 1.717585438437011e-05, + "loss": 0.3389, + "step": 13210 + }, + { + "epoch": 0.24528168778752651, + "grad_norm": 0.347405344247818, + "learning_rate": 1.717504190923147e-05, + "loss": 0.2948, + "step": 13212 + }, + { + "epoch": 0.24531881792494514, + "grad_norm": 0.32417306303977966, + "learning_rate": 1.7174229336462003e-05, + "loss": 0.3303, + "step": 13214 + }, + { + "epoch": 0.24535594806236377, + "grad_norm": 0.4360882043838501, + "learning_rate": 1.717341666607277e-05, + "loss": 0.3815, + "step": 13216 + }, + { + "epoch": 0.24539307819978243, + "grad_norm": 0.2860407531261444, + "learning_rate": 1.7172603898074814e-05, + "loss": 0.5392, + "step": 13218 + }, + { + "epoch": 0.24543020833720106, + "grad_norm": 0.4340052008628845, + "learning_rate": 1.7171791032479206e-05, + "loss": 0.2883, + "step": 13220 + }, + { + "epoch": 0.24546733847461968, + "grad_norm": 0.37193435430526733, + "learning_rate": 1.7170978069297007e-05, + "loss": 0.1349, + "step": 13222 + }, + { + "epoch": 0.24550446861203834, + "grad_norm": 0.40919381380081177, + "learning_rate": 1.7170165008539276e-05, + "loss": 0.3871, + "step": 13224 + }, + { + "epoch": 0.24554159874945697, + "grad_norm": 0.3199876844882965, + "learning_rate": 1.7169351850217074e-05, + "loss": 0.3669, + "step": 13226 + }, + { + "epoch": 0.2455787288868756, + "grad_norm": 0.45857229828834534, + "learning_rate": 1.7168538594341468e-05, + "loss": 0.4147, + "step": 13228 + }, + { + "epoch": 0.24561585902429425, + "grad_norm": 0.22806601226329803, + "learning_rate": 1.7167725240923526e-05, + "loss": 0.3579, + "step": 13230 + }, + { + "epoch": 0.24565298916171288, + "grad_norm": 0.45223361253738403, + "learning_rate": 1.716691178997431e-05, + "loss": 0.1652, + "step": 13232 + }, + { + "epoch": 0.24569011929913154, + "grad_norm": 0.23677855730056763, + "learning_rate": 1.7166098241504895e-05, + "loss": 0.2829, + "step": 13234 + }, + { + "epoch": 0.24572724943655017, + "grad_norm": 0.3565402626991272, + "learning_rate": 1.7165284595526347e-05, + "loss": 0.491, + "step": 13236 + }, + { + "epoch": 0.2457643795739688, + "grad_norm": 0.3340590000152588, + "learning_rate": 1.7164470852049736e-05, + "loss": 0.2391, + "step": 13238 + }, + { + "epoch": 0.24580150971138745, + "grad_norm": 0.39384719729423523, + "learning_rate": 1.716365701108614e-05, + "loss": 0.2928, + "step": 13240 + }, + { + "epoch": 0.24583863984880608, + "grad_norm": 0.436230331659317, + "learning_rate": 1.7162843072646626e-05, + "loss": 0.3541, + "step": 13242 + }, + { + "epoch": 0.2458757699862247, + "grad_norm": 0.3410209119319916, + "learning_rate": 1.7162029036742275e-05, + "loss": 0.3494, + "step": 13244 + }, + { + "epoch": 0.24591290012364336, + "grad_norm": 0.49817532300949097, + "learning_rate": 1.7161214903384165e-05, + "loss": 0.2482, + "step": 13246 + }, + { + "epoch": 0.245950030261062, + "grad_norm": 0.3439389169216156, + "learning_rate": 1.7160400672583367e-05, + "loss": 0.3116, + "step": 13248 + }, + { + "epoch": 0.24598716039848065, + "grad_norm": 0.28430119156837463, + "learning_rate": 1.715958634435096e-05, + "loss": 0.296, + "step": 13250 + }, + { + "epoch": 0.24602429053589928, + "grad_norm": 0.3383987545967102, + "learning_rate": 1.7158771918698036e-05, + "loss": 0.4191, + "step": 13252 + }, + { + "epoch": 0.2460614206733179, + "grad_norm": 0.29728689789772034, + "learning_rate": 1.7157957395635666e-05, + "loss": 0.4433, + "step": 13254 + }, + { + "epoch": 0.24609855081073656, + "grad_norm": 0.2834780216217041, + "learning_rate": 1.7157142775174935e-05, + "loss": 0.4482, + "step": 13256 + }, + { + "epoch": 0.2461356809481552, + "grad_norm": 0.36353743076324463, + "learning_rate": 1.715632805732693e-05, + "loss": 0.2665, + "step": 13258 + }, + { + "epoch": 0.24617281108557382, + "grad_norm": 0.2620483636856079, + "learning_rate": 1.7155513242102736e-05, + "loss": 0.3502, + "step": 13260 + }, + { + "epoch": 0.24620994122299247, + "grad_norm": 0.31910502910614014, + "learning_rate": 1.7154698329513443e-05, + "loss": 0.3247, + "step": 13262 + }, + { + "epoch": 0.2462470713604111, + "grad_norm": 0.29031166434288025, + "learning_rate": 1.715388331957013e-05, + "loss": 0.2791, + "step": 13264 + }, + { + "epoch": 0.24628420149782973, + "grad_norm": 0.2690592408180237, + "learning_rate": 1.7153068212283898e-05, + "loss": 0.2403, + "step": 13266 + }, + { + "epoch": 0.2463213316352484, + "grad_norm": 0.34039804339408875, + "learning_rate": 1.715225300766583e-05, + "loss": 0.2414, + "step": 13268 + }, + { + "epoch": 0.24635846177266701, + "grad_norm": 0.6392877697944641, + "learning_rate": 1.7151437705727026e-05, + "loss": 0.3618, + "step": 13270 + }, + { + "epoch": 0.24639559191008567, + "grad_norm": 0.8884836435317993, + "learning_rate": 1.7150622306478575e-05, + "loss": 0.3704, + "step": 13272 + }, + { + "epoch": 0.2464327220475043, + "grad_norm": 0.23477886617183685, + "learning_rate": 1.7149806809931572e-05, + "loss": 0.1839, + "step": 13274 + }, + { + "epoch": 0.24646985218492293, + "grad_norm": 0.38277870416641235, + "learning_rate": 1.7148991216097112e-05, + "loss": 0.2553, + "step": 13276 + }, + { + "epoch": 0.24650698232234158, + "grad_norm": 0.3473656475543976, + "learning_rate": 1.7148175524986302e-05, + "loss": 0.3704, + "step": 13278 + }, + { + "epoch": 0.2465441124597602, + "grad_norm": 0.37068596482276917, + "learning_rate": 1.7147359736610228e-05, + "loss": 0.3807, + "step": 13280 + }, + { + "epoch": 0.24658124259717884, + "grad_norm": 0.3488757610321045, + "learning_rate": 1.714654385098e-05, + "loss": 0.2192, + "step": 13282 + }, + { + "epoch": 0.2466183727345975, + "grad_norm": 0.33016151189804077, + "learning_rate": 1.7145727868106715e-05, + "loss": 0.3596, + "step": 13284 + }, + { + "epoch": 0.24665550287201612, + "grad_norm": 0.3586462438106537, + "learning_rate": 1.714491178800148e-05, + "loss": 0.2789, + "step": 13286 + }, + { + "epoch": 0.24669263300943478, + "grad_norm": 0.4576156437397003, + "learning_rate": 1.7144095610675397e-05, + "loss": 0.2144, + "step": 13288 + }, + { + "epoch": 0.2467297631468534, + "grad_norm": 0.37462636828422546, + "learning_rate": 1.7143279336139565e-05, + "loss": 0.268, + "step": 13290 + }, + { + "epoch": 0.24676689328427204, + "grad_norm": 0.3459118902683258, + "learning_rate": 1.7142462964405107e-05, + "loss": 0.2834, + "step": 13292 + }, + { + "epoch": 0.2468040234216907, + "grad_norm": 0.44189175963401794, + "learning_rate": 1.7141646495483115e-05, + "loss": 0.3729, + "step": 13294 + }, + { + "epoch": 0.24684115355910932, + "grad_norm": 0.3049304485321045, + "learning_rate": 1.714082992938471e-05, + "loss": 0.3827, + "step": 13296 + }, + { + "epoch": 0.24687828369652795, + "grad_norm": 0.260957807302475, + "learning_rate": 1.7140013266121e-05, + "loss": 0.245, + "step": 13298 + }, + { + "epoch": 0.2469154138339466, + "grad_norm": 0.4045257866382599, + "learning_rate": 1.7139196505703094e-05, + "loss": 0.3808, + "step": 13300 + }, + { + "epoch": 0.24695254397136523, + "grad_norm": 0.3291529417037964, + "learning_rate": 1.7138379648142108e-05, + "loss": 0.3517, + "step": 13302 + }, + { + "epoch": 0.24698967410878386, + "grad_norm": 0.26678377389907837, + "learning_rate": 1.7137562693449153e-05, + "loss": 0.338, + "step": 13304 + }, + { + "epoch": 0.24702680424620252, + "grad_norm": 0.5196218490600586, + "learning_rate": 1.7136745641635356e-05, + "loss": 0.282, + "step": 13306 + }, + { + "epoch": 0.24706393438362115, + "grad_norm": 0.41561537981033325, + "learning_rate": 1.7135928492711828e-05, + "loss": 0.4068, + "step": 13308 + }, + { + "epoch": 0.2471010645210398, + "grad_norm": 0.4688931107521057, + "learning_rate": 1.713511124668968e-05, + "loss": 0.2828, + "step": 13310 + }, + { + "epoch": 0.24713819465845843, + "grad_norm": 0.37110626697540283, + "learning_rate": 1.7134293903580047e-05, + "loss": 0.3732, + "step": 13312 + }, + { + "epoch": 0.24717532479587706, + "grad_norm": 0.4646414816379547, + "learning_rate": 1.713347646339404e-05, + "loss": 0.4901, + "step": 13314 + }, + { + "epoch": 0.24721245493329572, + "grad_norm": 0.2735046446323395, + "learning_rate": 1.713265892614279e-05, + "loss": 0.5157, + "step": 13316 + }, + { + "epoch": 0.24724958507071434, + "grad_norm": 0.3327465355396271, + "learning_rate": 1.7131841291837416e-05, + "loss": 0.2956, + "step": 13318 + }, + { + "epoch": 0.24728671520813297, + "grad_norm": 0.3320567011833191, + "learning_rate": 1.713102356048904e-05, + "loss": 0.2655, + "step": 13320 + }, + { + "epoch": 0.24732384534555163, + "grad_norm": 0.4400215148925781, + "learning_rate": 1.7130205732108795e-05, + "loss": 0.2801, + "step": 13322 + }, + { + "epoch": 0.24736097548297026, + "grad_norm": 0.3597651422023773, + "learning_rate": 1.712938780670781e-05, + "loss": 0.2907, + "step": 13324 + }, + { + "epoch": 0.24739810562038891, + "grad_norm": 0.3958512246608734, + "learning_rate": 1.7128569784297207e-05, + "loss": 0.4731, + "step": 13326 + }, + { + "epoch": 0.24743523575780754, + "grad_norm": 0.37480098009109497, + "learning_rate": 1.712775166488813e-05, + "loss": 0.3264, + "step": 13328 + }, + { + "epoch": 0.24747236589522617, + "grad_norm": 0.3136560916900635, + "learning_rate": 1.7126933448491695e-05, + "loss": 0.1936, + "step": 13330 + }, + { + "epoch": 0.24750949603264483, + "grad_norm": 0.37380507588386536, + "learning_rate": 1.7126115135119044e-05, + "loss": 0.2505, + "step": 13332 + }, + { + "epoch": 0.24754662617006346, + "grad_norm": 0.5548191070556641, + "learning_rate": 1.7125296724781315e-05, + "loss": 0.3475, + "step": 13334 + }, + { + "epoch": 0.24758375630748208, + "grad_norm": 0.4348328411579132, + "learning_rate": 1.7124478217489636e-05, + "loss": 0.4385, + "step": 13336 + }, + { + "epoch": 0.24762088644490074, + "grad_norm": 0.48877087235450745, + "learning_rate": 1.7123659613255155e-05, + "loss": 0.5022, + "step": 13338 + }, + { + "epoch": 0.24765801658231937, + "grad_norm": 0.409216046333313, + "learning_rate": 1.7122840912089002e-05, + "loss": 0.3165, + "step": 13340 + }, + { + "epoch": 0.247695146719738, + "grad_norm": 0.3926894962787628, + "learning_rate": 1.7122022114002317e-05, + "loss": 0.2234, + "step": 13342 + }, + { + "epoch": 0.24773227685715665, + "grad_norm": 0.5280458331108093, + "learning_rate": 1.7121203219006246e-05, + "loss": 0.2775, + "step": 13344 + }, + { + "epoch": 0.24776940699457528, + "grad_norm": 0.35759904980659485, + "learning_rate": 1.7120384227111928e-05, + "loss": 0.3918, + "step": 13346 + }, + { + "epoch": 0.24780653713199394, + "grad_norm": 0.35483217239379883, + "learning_rate": 1.711956513833051e-05, + "loss": 0.1831, + "step": 13348 + }, + { + "epoch": 0.24784366726941257, + "grad_norm": 0.2846633195877075, + "learning_rate": 1.7118745952673137e-05, + "loss": 0.3612, + "step": 13350 + }, + { + "epoch": 0.2478807974068312, + "grad_norm": 0.4046551287174225, + "learning_rate": 1.7117926670150952e-05, + "loss": 0.3278, + "step": 13352 + }, + { + "epoch": 0.24791792754424985, + "grad_norm": 0.3206670582294464, + "learning_rate": 1.7117107290775106e-05, + "loss": 0.3482, + "step": 13354 + }, + { + "epoch": 0.24795505768166848, + "grad_norm": 0.26846450567245483, + "learning_rate": 1.711628781455675e-05, + "loss": 0.2611, + "step": 13356 + }, + { + "epoch": 0.2479921878190871, + "grad_norm": 0.33748847246170044, + "learning_rate": 1.711546824150703e-05, + "loss": 0.6318, + "step": 13358 + }, + { + "epoch": 0.24802931795650576, + "grad_norm": 0.3074412941932678, + "learning_rate": 1.7114648571637103e-05, + "loss": 0.3257, + "step": 13360 + }, + { + "epoch": 0.2480664480939244, + "grad_norm": 0.4141692817211151, + "learning_rate": 1.711382880495812e-05, + "loss": 0.3644, + "step": 13362 + }, + { + "epoch": 0.24810357823134305, + "grad_norm": 0.40177881717681885, + "learning_rate": 1.7113008941481236e-05, + "loss": 0.4129, + "step": 13364 + }, + { + "epoch": 0.24814070836876168, + "grad_norm": 0.3892151415348053, + "learning_rate": 1.7112188981217602e-05, + "loss": 0.5622, + "step": 13366 + }, + { + "epoch": 0.2481778385061803, + "grad_norm": 1.0396039485931396, + "learning_rate": 1.7111368924178383e-05, + "loss": 0.3238, + "step": 13368 + }, + { + "epoch": 0.24821496864359896, + "grad_norm": 0.4415968954563141, + "learning_rate": 1.7110548770374734e-05, + "loss": 0.2282, + "step": 13370 + }, + { + "epoch": 0.2482520987810176, + "grad_norm": 0.33203616738319397, + "learning_rate": 1.710972851981781e-05, + "loss": 0.3122, + "step": 13372 + }, + { + "epoch": 0.24828922891843622, + "grad_norm": 0.2803024649620056, + "learning_rate": 1.7108908172518785e-05, + "loss": 0.2342, + "step": 13374 + }, + { + "epoch": 0.24832635905585487, + "grad_norm": 0.2605185806751251, + "learning_rate": 1.7108087728488807e-05, + "loss": 0.3215, + "step": 13376 + }, + { + "epoch": 0.2483634891932735, + "grad_norm": 0.352474182844162, + "learning_rate": 1.710726718773905e-05, + "loss": 0.2411, + "step": 13378 + }, + { + "epoch": 0.24840061933069213, + "grad_norm": 0.4577985107898712, + "learning_rate": 1.7106446550280672e-05, + "loss": 0.3917, + "step": 13380 + }, + { + "epoch": 0.24843774946811079, + "grad_norm": 0.2931303083896637, + "learning_rate": 1.7105625816124843e-05, + "loss": 0.2189, + "step": 13382 + }, + { + "epoch": 0.24847487960552941, + "grad_norm": 0.32447829842567444, + "learning_rate": 1.7104804985282734e-05, + "loss": 0.3135, + "step": 13384 + }, + { + "epoch": 0.24851200974294807, + "grad_norm": 0.4484362304210663, + "learning_rate": 1.710398405776551e-05, + "loss": 0.3342, + "step": 13386 + }, + { + "epoch": 0.2485491398803667, + "grad_norm": 0.4414674937725067, + "learning_rate": 1.710316303358434e-05, + "loss": 0.3514, + "step": 13388 + }, + { + "epoch": 0.24858627001778533, + "grad_norm": 0.2533408999443054, + "learning_rate": 1.7102341912750398e-05, + "loss": 0.2756, + "step": 13390 + }, + { + "epoch": 0.24862340015520398, + "grad_norm": 0.3433852195739746, + "learning_rate": 1.7101520695274855e-05, + "loss": 0.4985, + "step": 13392 + }, + { + "epoch": 0.2486605302926226, + "grad_norm": 0.2968577742576599, + "learning_rate": 1.710069938116889e-05, + "loss": 0.2443, + "step": 13394 + }, + { + "epoch": 0.24869766043004124, + "grad_norm": 0.3259652554988861, + "learning_rate": 1.7099877970443675e-05, + "loss": 0.387, + "step": 13396 + }, + { + "epoch": 0.2487347905674599, + "grad_norm": 0.2970534861087799, + "learning_rate": 1.709905646311038e-05, + "loss": 0.4225, + "step": 13398 + }, + { + "epoch": 0.24877192070487852, + "grad_norm": 0.41418832540512085, + "learning_rate": 1.70982348591802e-05, + "loss": 0.4134, + "step": 13400 + }, + { + "epoch": 0.24880905084229718, + "grad_norm": 0.45218539237976074, + "learning_rate": 1.7097413158664303e-05, + "loss": 0.1988, + "step": 13402 + }, + { + "epoch": 0.2488461809797158, + "grad_norm": 0.38420361280441284, + "learning_rate": 1.7096591361573873e-05, + "loss": 0.5638, + "step": 13404 + }, + { + "epoch": 0.24888331111713444, + "grad_norm": 0.29785534739494324, + "learning_rate": 1.7095769467920087e-05, + "loss": 0.2869, + "step": 13406 + }, + { + "epoch": 0.2489204412545531, + "grad_norm": 0.4749932289123535, + "learning_rate": 1.7094947477714137e-05, + "loss": 0.3811, + "step": 13408 + }, + { + "epoch": 0.24895757139197172, + "grad_norm": 0.6014745831489563, + "learning_rate": 1.70941253909672e-05, + "loss": 0.3423, + "step": 13410 + }, + { + "epoch": 0.24899470152939035, + "grad_norm": 0.31092795729637146, + "learning_rate": 1.709330320769047e-05, + "loss": 0.1398, + "step": 13412 + }, + { + "epoch": 0.249031831666809, + "grad_norm": 0.3357078433036804, + "learning_rate": 1.709248092789513e-05, + "loss": 0.1135, + "step": 13414 + }, + { + "epoch": 0.24906896180422763, + "grad_norm": 0.34562963247299194, + "learning_rate": 1.7091658551592364e-05, + "loss": 0.3422, + "step": 13416 + }, + { + "epoch": 0.24910609194164626, + "grad_norm": 0.47185638546943665, + "learning_rate": 1.709083607879337e-05, + "loss": 0.404, + "step": 13418 + }, + { + "epoch": 0.24914322207906492, + "grad_norm": 0.3098020553588867, + "learning_rate": 1.7090013509509335e-05, + "loss": 0.3513, + "step": 13420 + }, + { + "epoch": 0.24918035221648355, + "grad_norm": 0.4504270851612091, + "learning_rate": 1.7089190843751455e-05, + "loss": 0.4556, + "step": 13422 + }, + { + "epoch": 0.2492174823539022, + "grad_norm": 0.2668244540691376, + "learning_rate": 1.7088368081530922e-05, + "loss": 0.5071, + "step": 13424 + }, + { + "epoch": 0.24925461249132083, + "grad_norm": 0.39171335101127625, + "learning_rate": 1.708754522285893e-05, + "loss": 0.3375, + "step": 13426 + }, + { + "epoch": 0.24929174262873946, + "grad_norm": 0.4340653121471405, + "learning_rate": 1.708672226774668e-05, + "loss": 0.4444, + "step": 13428 + }, + { + "epoch": 0.24932887276615812, + "grad_norm": 0.3993779718875885, + "learning_rate": 1.708589921620537e-05, + "loss": 0.2589, + "step": 13430 + }, + { + "epoch": 0.24936600290357674, + "grad_norm": 0.2112313210964203, + "learning_rate": 1.7085076068246188e-05, + "loss": 0.2493, + "step": 13432 + }, + { + "epoch": 0.24940313304099537, + "grad_norm": 0.33994027972221375, + "learning_rate": 1.708425282388035e-05, + "loss": 0.2105, + "step": 13434 + }, + { + "epoch": 0.24944026317841403, + "grad_norm": 0.401529461145401, + "learning_rate": 1.7083429483119044e-05, + "loss": 0.4486, + "step": 13436 + }, + { + "epoch": 0.24947739331583266, + "grad_norm": 0.39311596751213074, + "learning_rate": 1.7082606045973487e-05, + "loss": 0.2584, + "step": 13438 + }, + { + "epoch": 0.2495145234532513, + "grad_norm": 0.2795025110244751, + "learning_rate": 1.7081782512454873e-05, + "loss": 0.4173, + "step": 13440 + }, + { + "epoch": 0.24955165359066994, + "grad_norm": 0.27866944670677185, + "learning_rate": 1.7080958882574412e-05, + "loss": 0.4654, + "step": 13442 + }, + { + "epoch": 0.24958878372808857, + "grad_norm": 0.2017209380865097, + "learning_rate": 1.708013515634331e-05, + "loss": 0.1865, + "step": 13444 + }, + { + "epoch": 0.24962591386550723, + "grad_norm": 0.4624181389808655, + "learning_rate": 1.7079311333772778e-05, + "loss": 0.4687, + "step": 13446 + }, + { + "epoch": 0.24966304400292585, + "grad_norm": 0.5237762331962585, + "learning_rate": 1.707848741487402e-05, + "loss": 0.189, + "step": 13448 + }, + { + "epoch": 0.24970017414034448, + "grad_norm": 0.4490068554878235, + "learning_rate": 1.7077663399658252e-05, + "loss": 0.5101, + "step": 13450 + }, + { + "epoch": 0.24973730427776314, + "grad_norm": 0.3285551965236664, + "learning_rate": 1.7076839288136687e-05, + "loss": 0.3384, + "step": 13452 + }, + { + "epoch": 0.24977443441518177, + "grad_norm": 0.25772300362586975, + "learning_rate": 1.7076015080320538e-05, + "loss": 0.2376, + "step": 13454 + }, + { + "epoch": 0.2498115645526004, + "grad_norm": 0.34867772459983826, + "learning_rate": 1.7075190776221018e-05, + "loss": 0.1967, + "step": 13456 + }, + { + "epoch": 0.24984869469001905, + "grad_norm": 0.5373649597167969, + "learning_rate": 1.7074366375849343e-05, + "loss": 0.313, + "step": 13458 + }, + { + "epoch": 0.24988582482743768, + "grad_norm": 0.4595335125923157, + "learning_rate": 1.707354187921673e-05, + "loss": 0.2878, + "step": 13460 + }, + { + "epoch": 0.24992295496485634, + "grad_norm": 0.4692481756210327, + "learning_rate": 1.7072717286334402e-05, + "loss": 0.2593, + "step": 13462 + }, + { + "epoch": 0.24996008510227496, + "grad_norm": 0.3877066373825073, + "learning_rate": 1.7071892597213576e-05, + "loss": 0.3189, + "step": 13464 + }, + { + "epoch": 0.2499972152396936, + "grad_norm": 0.5486915707588196, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.3509, + "step": 13466 + }, + { + "epoch": 0.2500343453771122, + "grad_norm": 0.5739935636520386, + "learning_rate": 1.7070242930301324e-05, + "loss": 0.3001, + "step": 13468 + }, + { + "epoch": 0.2500714755145309, + "grad_norm": 0.5411041378974915, + "learning_rate": 1.706941795253234e-05, + "loss": 0.3059, + "step": 13470 + }, + { + "epoch": 0.25010860565194953, + "grad_norm": 0.27244988083839417, + "learning_rate": 1.706859287856976e-05, + "loss": 0.1619, + "step": 13472 + }, + { + "epoch": 0.25014573578936816, + "grad_norm": 0.38710346817970276, + "learning_rate": 1.7067767708424797e-05, + "loss": 0.1943, + "step": 13474 + }, + { + "epoch": 0.2501828659267868, + "grad_norm": 0.4377591013908386, + "learning_rate": 1.7066942442108687e-05, + "loss": 0.3846, + "step": 13476 + }, + { + "epoch": 0.2502199960642054, + "grad_norm": 0.4194667339324951, + "learning_rate": 1.706611707963266e-05, + "loss": 0.3448, + "step": 13478 + }, + { + "epoch": 0.25025712620162405, + "grad_norm": 0.4690960943698883, + "learning_rate": 1.7065291621007944e-05, + "loss": 0.3351, + "step": 13480 + }, + { + "epoch": 0.25029425633904273, + "grad_norm": 0.39001163840293884, + "learning_rate": 1.7064466066245772e-05, + "loss": 0.3453, + "step": 13482 + }, + { + "epoch": 0.25033138647646136, + "grad_norm": 0.4255661964416504, + "learning_rate": 1.7063640415357378e-05, + "loss": 0.4786, + "step": 13484 + }, + { + "epoch": 0.25036851661388, + "grad_norm": 0.40822985768318176, + "learning_rate": 1.7062814668353997e-05, + "loss": 0.3558, + "step": 13486 + }, + { + "epoch": 0.2504056467512986, + "grad_norm": 0.3241831958293915, + "learning_rate": 1.706198882524686e-05, + "loss": 0.2258, + "step": 13488 + }, + { + "epoch": 0.25044277688871724, + "grad_norm": 0.23742733895778656, + "learning_rate": 1.706116288604721e-05, + "loss": 0.2094, + "step": 13490 + }, + { + "epoch": 0.25047990702613593, + "grad_norm": 0.4142918884754181, + "learning_rate": 1.7060336850766287e-05, + "loss": 0.2303, + "step": 13492 + }, + { + "epoch": 0.25051703716355456, + "grad_norm": 0.2777521312236786, + "learning_rate": 1.7059510719415323e-05, + "loss": 0.1075, + "step": 13494 + }, + { + "epoch": 0.2505541673009732, + "grad_norm": 0.44745078682899475, + "learning_rate": 1.7058684492005568e-05, + "loss": 0.4177, + "step": 13496 + }, + { + "epoch": 0.2505912974383918, + "grad_norm": 0.3736150562763214, + "learning_rate": 1.7057858168548257e-05, + "loss": 0.3703, + "step": 13498 + }, + { + "epoch": 0.25062842757581044, + "grad_norm": 0.3684479892253876, + "learning_rate": 1.7057031749054638e-05, + "loss": 0.3459, + "step": 13500 + }, + { + "epoch": 0.25066555771322907, + "grad_norm": 0.28411492705345154, + "learning_rate": 1.7056205233535957e-05, + "loss": 0.2371, + "step": 13502 + }, + { + "epoch": 0.25070268785064775, + "grad_norm": 0.3408103287220001, + "learning_rate": 1.7055378622003455e-05, + "loss": 0.2489, + "step": 13504 + }, + { + "epoch": 0.2507398179880664, + "grad_norm": 0.4849068522453308, + "learning_rate": 1.7054551914468388e-05, + "loss": 0.3235, + "step": 13506 + }, + { + "epoch": 0.250776948125485, + "grad_norm": 0.39432206749916077, + "learning_rate": 1.7053725110941998e-05, + "loss": 0.4644, + "step": 13508 + }, + { + "epoch": 0.25081407826290364, + "grad_norm": 0.32414188981056213, + "learning_rate": 1.7052898211435533e-05, + "loss": 0.2818, + "step": 13510 + }, + { + "epoch": 0.25085120840032227, + "grad_norm": 0.6440356373786926, + "learning_rate": 1.7052071215960255e-05, + "loss": 0.2385, + "step": 13512 + }, + { + "epoch": 0.25088833853774095, + "grad_norm": 0.28638899326324463, + "learning_rate": 1.705124412452741e-05, + "loss": 0.4424, + "step": 13514 + }, + { + "epoch": 0.2509254686751596, + "grad_norm": 0.30699872970581055, + "learning_rate": 1.7050416937148253e-05, + "loss": 0.2969, + "step": 13516 + }, + { + "epoch": 0.2509625988125782, + "grad_norm": 0.3447207510471344, + "learning_rate": 1.704958965383404e-05, + "loss": 0.3657, + "step": 13518 + }, + { + "epoch": 0.25099972894999684, + "grad_norm": 0.2970324754714966, + "learning_rate": 1.7048762274596028e-05, + "loss": 0.2656, + "step": 13520 + }, + { + "epoch": 0.25103685908741546, + "grad_norm": 0.3424597382545471, + "learning_rate": 1.7047934799445474e-05, + "loss": 0.1664, + "step": 13522 + }, + { + "epoch": 0.25107398922483415, + "grad_norm": 0.34687936305999756, + "learning_rate": 1.704710722839364e-05, + "loss": 0.3276, + "step": 13524 + }, + { + "epoch": 0.2511111193622528, + "grad_norm": 0.44370514154434204, + "learning_rate": 1.7046279561451783e-05, + "loss": 0.4512, + "step": 13526 + }, + { + "epoch": 0.2511482494996714, + "grad_norm": 0.3351060450077057, + "learning_rate": 1.704545179863117e-05, + "loss": 0.282, + "step": 13528 + }, + { + "epoch": 0.25118537963709003, + "grad_norm": 0.3558134138584137, + "learning_rate": 1.704462393994306e-05, + "loss": 0.2581, + "step": 13530 + }, + { + "epoch": 0.25122250977450866, + "grad_norm": 0.39743366837501526, + "learning_rate": 1.7043795985398717e-05, + "loss": 0.5565, + "step": 13532 + }, + { + "epoch": 0.2512596399119273, + "grad_norm": 0.3620244264602661, + "learning_rate": 1.7042967935009413e-05, + "loss": 0.4073, + "step": 13534 + }, + { + "epoch": 0.251296770049346, + "grad_norm": 0.5346766710281372, + "learning_rate": 1.7042139788786405e-05, + "loss": 0.1641, + "step": 13536 + }, + { + "epoch": 0.2513339001867646, + "grad_norm": 0.2945798933506012, + "learning_rate": 1.7041311546740976e-05, + "loss": 0.4137, + "step": 13538 + }, + { + "epoch": 0.25137103032418323, + "grad_norm": 0.5081639289855957, + "learning_rate": 1.7040483208884387e-05, + "loss": 0.2896, + "step": 13540 + }, + { + "epoch": 0.25140816046160186, + "grad_norm": 0.42044633626937866, + "learning_rate": 1.7039654775227904e-05, + "loss": 0.6324, + "step": 13542 + }, + { + "epoch": 0.2514452905990205, + "grad_norm": 0.2637290358543396, + "learning_rate": 1.7038826245782813e-05, + "loss": 0.2475, + "step": 13544 + }, + { + "epoch": 0.25148242073643917, + "grad_norm": 0.4096880555152893, + "learning_rate": 1.703799762056038e-05, + "loss": 0.3392, + "step": 13546 + }, + { + "epoch": 0.2515195508738578, + "grad_norm": 0.26376697421073914, + "learning_rate": 1.7037168899571876e-05, + "loss": 0.3018, + "step": 13548 + }, + { + "epoch": 0.25155668101127643, + "grad_norm": 0.5098007321357727, + "learning_rate": 1.703634008282858e-05, + "loss": 0.411, + "step": 13550 + }, + { + "epoch": 0.25159381114869506, + "grad_norm": 0.2604253590106964, + "learning_rate": 1.703551117034178e-05, + "loss": 0.2732, + "step": 13552 + }, + { + "epoch": 0.2516309412861137, + "grad_norm": 0.45323461294174194, + "learning_rate": 1.7034682162122745e-05, + "loss": 0.2711, + "step": 13554 + }, + { + "epoch": 0.2516680714235323, + "grad_norm": 0.23245488107204437, + "learning_rate": 1.7033853058182752e-05, + "loss": 0.2193, + "step": 13556 + }, + { + "epoch": 0.251705201560951, + "grad_norm": 0.6137582659721375, + "learning_rate": 1.703302385853309e-05, + "loss": 0.3564, + "step": 13558 + }, + { + "epoch": 0.2517423316983696, + "grad_norm": 0.42788252234458923, + "learning_rate": 1.7032194563185044e-05, + "loss": 0.6079, + "step": 13560 + }, + { + "epoch": 0.25177946183578825, + "grad_norm": 0.2858025133609772, + "learning_rate": 1.7031365172149884e-05, + "loss": 0.3965, + "step": 13562 + }, + { + "epoch": 0.2518165919732069, + "grad_norm": 0.4628661870956421, + "learning_rate": 1.7030535685438913e-05, + "loss": 0.3049, + "step": 13564 + }, + { + "epoch": 0.2518537221106255, + "grad_norm": 0.284263551235199, + "learning_rate": 1.7029706103063412e-05, + "loss": 0.323, + "step": 13566 + }, + { + "epoch": 0.2518908522480442, + "grad_norm": 0.48040521144866943, + "learning_rate": 1.702887642503466e-05, + "loss": 0.2782, + "step": 13568 + }, + { + "epoch": 0.2519279823854628, + "grad_norm": 0.45553284883499146, + "learning_rate": 1.702804665136396e-05, + "loss": 0.3985, + "step": 13570 + }, + { + "epoch": 0.25196511252288145, + "grad_norm": 0.2838556468486786, + "learning_rate": 1.7027216782062592e-05, + "loss": 0.3218, + "step": 13572 + }, + { + "epoch": 0.2520022426603001, + "grad_norm": 0.32408303022384644, + "learning_rate": 1.7026386817141854e-05, + "loss": 0.2301, + "step": 13574 + }, + { + "epoch": 0.2520393727977187, + "grad_norm": 0.751385509967804, + "learning_rate": 1.7025556756613042e-05, + "loss": 0.3233, + "step": 13576 + }, + { + "epoch": 0.25207650293513734, + "grad_norm": 0.4387381672859192, + "learning_rate": 1.702472660048744e-05, + "loss": 0.351, + "step": 13578 + }, + { + "epoch": 0.252113633072556, + "grad_norm": 0.37481051683425903, + "learning_rate": 1.7023896348776357e-05, + "loss": 0.4684, + "step": 13580 + }, + { + "epoch": 0.25215076320997465, + "grad_norm": 0.23695437610149384, + "learning_rate": 1.702306600149108e-05, + "loss": 0.3087, + "step": 13582 + }, + { + "epoch": 0.2521878933473933, + "grad_norm": 0.5026005506515503, + "learning_rate": 1.702223555864291e-05, + "loss": 0.3798, + "step": 13584 + }, + { + "epoch": 0.2522250234848119, + "grad_norm": 0.6652505397796631, + "learning_rate": 1.7021405020243152e-05, + "loss": 0.3381, + "step": 13586 + }, + { + "epoch": 0.25226215362223053, + "grad_norm": 0.378642737865448, + "learning_rate": 1.70205743863031e-05, + "loss": 0.5165, + "step": 13588 + }, + { + "epoch": 0.2522992837596492, + "grad_norm": 0.36895811557769775, + "learning_rate": 1.701974365683406e-05, + "loss": 0.4132, + "step": 13590 + }, + { + "epoch": 0.25233641389706785, + "grad_norm": 0.2597840428352356, + "learning_rate": 1.7018912831847337e-05, + "loss": 0.4467, + "step": 13592 + }, + { + "epoch": 0.2523735440344865, + "grad_norm": 0.3972187340259552, + "learning_rate": 1.701808191135423e-05, + "loss": 0.2537, + "step": 13594 + }, + { + "epoch": 0.2524106741719051, + "grad_norm": 0.4723551869392395, + "learning_rate": 1.7017250895366054e-05, + "loss": 0.2146, + "step": 13596 + }, + { + "epoch": 0.25244780430932373, + "grad_norm": 0.4637466371059418, + "learning_rate": 1.7016419783894112e-05, + "loss": 0.5138, + "step": 13598 + }, + { + "epoch": 0.2524849344467424, + "grad_norm": 0.42479127645492554, + "learning_rate": 1.7015588576949713e-05, + "loss": 0.3211, + "step": 13600 + }, + { + "epoch": 0.25252206458416104, + "grad_norm": 0.5597760677337646, + "learning_rate": 1.7014757274544165e-05, + "loss": 0.3117, + "step": 13602 + }, + { + "epoch": 0.25255919472157967, + "grad_norm": 0.39463362097740173, + "learning_rate": 1.701392587668879e-05, + "loss": 0.4728, + "step": 13604 + }, + { + "epoch": 0.2525963248589983, + "grad_norm": 0.369480162858963, + "learning_rate": 1.7013094383394886e-05, + "loss": 0.2383, + "step": 13606 + }, + { + "epoch": 0.25263345499641693, + "grad_norm": 0.4247845411300659, + "learning_rate": 1.7012262794673774e-05, + "loss": 0.3228, + "step": 13608 + }, + { + "epoch": 0.25267058513383556, + "grad_norm": 0.27499979734420776, + "learning_rate": 1.7011431110536772e-05, + "loss": 0.1294, + "step": 13610 + }, + { + "epoch": 0.25270771527125424, + "grad_norm": 0.35639554262161255, + "learning_rate": 1.7010599330995194e-05, + "loss": 0.2921, + "step": 13612 + }, + { + "epoch": 0.25274484540867287, + "grad_norm": 0.45653897523880005, + "learning_rate": 1.7009767456060354e-05, + "loss": 0.3373, + "step": 13614 + }, + { + "epoch": 0.2527819755460915, + "grad_norm": 0.42157605290412903, + "learning_rate": 1.7008935485743585e-05, + "loss": 0.4752, + "step": 13616 + }, + { + "epoch": 0.2528191056835101, + "grad_norm": 0.3991600573062897, + "learning_rate": 1.700810342005619e-05, + "loss": 0.3342, + "step": 13618 + }, + { + "epoch": 0.25285623582092875, + "grad_norm": 0.3936379849910736, + "learning_rate": 1.7007271259009502e-05, + "loss": 0.2549, + "step": 13620 + }, + { + "epoch": 0.25289336595834744, + "grad_norm": 0.3804357647895813, + "learning_rate": 1.7006439002614842e-05, + "loss": 0.3514, + "step": 13622 + }, + { + "epoch": 0.25293049609576607, + "grad_norm": 0.5249435901641846, + "learning_rate": 1.7005606650883534e-05, + "loss": 0.2186, + "step": 13624 + }, + { + "epoch": 0.2529676262331847, + "grad_norm": 0.26637545228004456, + "learning_rate": 1.70047742038269e-05, + "loss": 0.2712, + "step": 13626 + }, + { + "epoch": 0.2530047563706033, + "grad_norm": 0.31128400564193726, + "learning_rate": 1.7003941661456272e-05, + "loss": 0.3893, + "step": 13628 + }, + { + "epoch": 0.25304188650802195, + "grad_norm": 0.3280087113380432, + "learning_rate": 1.700310902378298e-05, + "loss": 0.2651, + "step": 13630 + }, + { + "epoch": 0.2530790166454406, + "grad_norm": 0.24037370085716248, + "learning_rate": 1.700227629081835e-05, + "loss": 0.2972, + "step": 13632 + }, + { + "epoch": 0.25311614678285926, + "grad_norm": 0.5142262578010559, + "learning_rate": 1.7001443462573716e-05, + "loss": 0.4333, + "step": 13634 + }, + { + "epoch": 0.2531532769202779, + "grad_norm": 0.2996436059474945, + "learning_rate": 1.7000610539060406e-05, + "loss": 0.2376, + "step": 13636 + }, + { + "epoch": 0.2531904070576965, + "grad_norm": 0.5064845085144043, + "learning_rate": 1.6999777520289754e-05, + "loss": 0.2596, + "step": 13638 + }, + { + "epoch": 0.25322753719511515, + "grad_norm": 0.31223976612091064, + "learning_rate": 1.69989444062731e-05, + "loss": 0.2104, + "step": 13640 + }, + { + "epoch": 0.2532646673325338, + "grad_norm": 0.46125105023384094, + "learning_rate": 1.699811119702177e-05, + "loss": 0.5057, + "step": 13642 + }, + { + "epoch": 0.25330179746995246, + "grad_norm": 0.38701242208480835, + "learning_rate": 1.699727789254712e-05, + "loss": 0.3401, + "step": 13644 + }, + { + "epoch": 0.2533389276073711, + "grad_norm": 0.27538976073265076, + "learning_rate": 1.6996444492860468e-05, + "loss": 0.3991, + "step": 13646 + }, + { + "epoch": 0.2533760577447897, + "grad_norm": 0.30591198801994324, + "learning_rate": 1.6995610997973166e-05, + "loss": 0.3411, + "step": 13648 + }, + { + "epoch": 0.25341318788220835, + "grad_norm": 0.39603808522224426, + "learning_rate": 1.699477740789655e-05, + "loss": 0.2415, + "step": 13650 + }, + { + "epoch": 0.253450318019627, + "grad_norm": 0.3029903173446655, + "learning_rate": 1.699394372264197e-05, + "loss": 0.2939, + "step": 13652 + }, + { + "epoch": 0.2534874481570456, + "grad_norm": 0.32516413927078247, + "learning_rate": 1.699310994222076e-05, + "loss": 0.1726, + "step": 13654 + }, + { + "epoch": 0.2535245782944643, + "grad_norm": 1.33535635471344, + "learning_rate": 1.699227606664428e-05, + "loss": 0.2552, + "step": 13656 + }, + { + "epoch": 0.2535617084318829, + "grad_norm": 0.3027133047580719, + "learning_rate": 1.6991442095923855e-05, + "loss": 0.2955, + "step": 13658 + }, + { + "epoch": 0.25359883856930154, + "grad_norm": 0.3311522603034973, + "learning_rate": 1.6990608030070854e-05, + "loss": 0.3306, + "step": 13660 + }, + { + "epoch": 0.25363596870672017, + "grad_norm": 0.3005430996417999, + "learning_rate": 1.698977386909661e-05, + "loss": 0.4855, + "step": 13662 + }, + { + "epoch": 0.2536730988441388, + "grad_norm": 0.3486959636211395, + "learning_rate": 1.6988939613012485e-05, + "loss": 0.3124, + "step": 13664 + }, + { + "epoch": 0.2537102289815575, + "grad_norm": 0.42648744583129883, + "learning_rate": 1.6988105261829825e-05, + "loss": 0.3011, + "step": 13666 + }, + { + "epoch": 0.2537473591189761, + "grad_norm": 0.46215444803237915, + "learning_rate": 1.6987270815559985e-05, + "loss": 0.349, + "step": 13668 + }, + { + "epoch": 0.25378448925639474, + "grad_norm": 0.3062231242656708, + "learning_rate": 1.698643627421432e-05, + "loss": 0.413, + "step": 13670 + }, + { + "epoch": 0.25382161939381337, + "grad_norm": 0.4943634569644928, + "learning_rate": 1.698560163780418e-05, + "loss": 0.2292, + "step": 13672 + }, + { + "epoch": 0.253858749531232, + "grad_norm": 0.4232478141784668, + "learning_rate": 1.6984766906340924e-05, + "loss": 0.4758, + "step": 13674 + }, + { + "epoch": 0.2538958796686507, + "grad_norm": 0.4092228412628174, + "learning_rate": 1.6983932079835917e-05, + "loss": 0.1964, + "step": 13676 + }, + { + "epoch": 0.2539330098060693, + "grad_norm": 0.34870395064353943, + "learning_rate": 1.6983097158300516e-05, + "loss": 0.1751, + "step": 13678 + }, + { + "epoch": 0.25397013994348794, + "grad_norm": 0.8198285698890686, + "learning_rate": 1.6982262141746076e-05, + "loss": 0.2075, + "step": 13680 + }, + { + "epoch": 0.25400727008090657, + "grad_norm": 0.4108404219150543, + "learning_rate": 1.6981427030183962e-05, + "loss": 0.2724, + "step": 13682 + }, + { + "epoch": 0.2540444002183252, + "grad_norm": 0.4889931082725525, + "learning_rate": 1.6980591823625537e-05, + "loss": 0.3884, + "step": 13684 + }, + { + "epoch": 0.2540815303557438, + "grad_norm": 0.5948621034622192, + "learning_rate": 1.697975652208217e-05, + "loss": 0.351, + "step": 13686 + }, + { + "epoch": 0.2541186604931625, + "grad_norm": 0.34527528285980225, + "learning_rate": 1.697892112556522e-05, + "loss": 0.4693, + "step": 13688 + }, + { + "epoch": 0.25415579063058114, + "grad_norm": 0.5897166132926941, + "learning_rate": 1.697808563408606e-05, + "loss": 0.2551, + "step": 13690 + }, + { + "epoch": 0.25419292076799976, + "grad_norm": 0.3086923360824585, + "learning_rate": 1.6977250047656056e-05, + "loss": 0.2136, + "step": 13692 + }, + { + "epoch": 0.2542300509054184, + "grad_norm": 0.4336811304092407, + "learning_rate": 1.6976414366286578e-05, + "loss": 0.2543, + "step": 13694 + }, + { + "epoch": 0.254267181042837, + "grad_norm": 0.3661821782588959, + "learning_rate": 1.6975578589989e-05, + "loss": 0.4478, + "step": 13696 + }, + { + "epoch": 0.2543043111802557, + "grad_norm": 0.2914643883705139, + "learning_rate": 1.6974742718774687e-05, + "loss": 0.2323, + "step": 13698 + }, + { + "epoch": 0.25434144131767433, + "grad_norm": 0.4297862946987152, + "learning_rate": 1.697390675265502e-05, + "loss": 0.2568, + "step": 13700 + }, + { + "epoch": 0.25437857145509296, + "grad_norm": 0.3211735785007477, + "learning_rate": 1.697307069164137e-05, + "loss": 0.3877, + "step": 13702 + }, + { + "epoch": 0.2544157015925116, + "grad_norm": 0.45898211002349854, + "learning_rate": 1.6972234535745117e-05, + "loss": 0.1843, + "step": 13704 + }, + { + "epoch": 0.2544528317299302, + "grad_norm": 0.6417859196662903, + "learning_rate": 1.6971398284977634e-05, + "loss": 0.3583, + "step": 13706 + }, + { + "epoch": 0.25448996186734885, + "grad_norm": 0.42082932591438293, + "learning_rate": 1.6970561939350302e-05, + "loss": 0.3105, + "step": 13708 + }, + { + "epoch": 0.25452709200476753, + "grad_norm": 0.2992493212223053, + "learning_rate": 1.69697254988745e-05, + "loss": 0.2017, + "step": 13710 + }, + { + "epoch": 0.25456422214218616, + "grad_norm": 0.32489416003227234, + "learning_rate": 1.6968888963561614e-05, + "loss": 0.1897, + "step": 13712 + }, + { + "epoch": 0.2546013522796048, + "grad_norm": 0.43848279118537903, + "learning_rate": 1.6968052333423023e-05, + "loss": 0.5144, + "step": 13714 + }, + { + "epoch": 0.2546384824170234, + "grad_norm": 0.33223775029182434, + "learning_rate": 1.696721560847011e-05, + "loss": 0.3641, + "step": 13716 + }, + { + "epoch": 0.25467561255444204, + "grad_norm": 0.2649035155773163, + "learning_rate": 1.6966378788714262e-05, + "loss": 0.2365, + "step": 13718 + }, + { + "epoch": 0.2547127426918607, + "grad_norm": 0.36433765292167664, + "learning_rate": 1.6965541874166866e-05, + "loss": 0.2299, + "step": 13720 + }, + { + "epoch": 0.25474987282927936, + "grad_norm": 0.3439432680606842, + "learning_rate": 1.696470486483931e-05, + "loss": 0.2747, + "step": 13722 + }, + { + "epoch": 0.254787002966698, + "grad_norm": 0.37267714738845825, + "learning_rate": 1.696386776074298e-05, + "loss": 0.3405, + "step": 13724 + }, + { + "epoch": 0.2548241331041166, + "grad_norm": 0.33308616280555725, + "learning_rate": 1.696303056188927e-05, + "loss": 0.3022, + "step": 13726 + }, + { + "epoch": 0.25486126324153524, + "grad_norm": 0.38547685742378235, + "learning_rate": 1.696219326828957e-05, + "loss": 0.5033, + "step": 13728 + }, + { + "epoch": 0.25489839337895387, + "grad_norm": 0.2877867817878723, + "learning_rate": 1.696135587995528e-05, + "loss": 0.1859, + "step": 13730 + }, + { + "epoch": 0.25493552351637255, + "grad_norm": 0.5539423823356628, + "learning_rate": 1.696051839689778e-05, + "loss": 0.1841, + "step": 13732 + }, + { + "epoch": 0.2549726536537912, + "grad_norm": 0.3133772611618042, + "learning_rate": 1.695968081912848e-05, + "loss": 0.3078, + "step": 13734 + }, + { + "epoch": 0.2550097837912098, + "grad_norm": 0.40221840143203735, + "learning_rate": 1.6958843146658766e-05, + "loss": 0.299, + "step": 13736 + }, + { + "epoch": 0.25504691392862844, + "grad_norm": 0.35547107458114624, + "learning_rate": 1.695800537950004e-05, + "loss": 0.2233, + "step": 13738 + }, + { + "epoch": 0.25508404406604707, + "grad_norm": 0.3636387586593628, + "learning_rate": 1.6957167517663705e-05, + "loss": 0.365, + "step": 13740 + }, + { + "epoch": 0.25512117420346575, + "grad_norm": 0.3263877034187317, + "learning_rate": 1.695632956116116e-05, + "loss": 0.4536, + "step": 13742 + }, + { + "epoch": 0.2551583043408844, + "grad_norm": 0.257674902677536, + "learning_rate": 1.6955491510003805e-05, + "loss": 0.281, + "step": 13744 + }, + { + "epoch": 0.255195434478303, + "grad_norm": 0.38836032152175903, + "learning_rate": 1.6954653364203046e-05, + "loss": 0.3071, + "step": 13746 + }, + { + "epoch": 0.25523256461572164, + "grad_norm": 0.5239485502243042, + "learning_rate": 1.6953815123770285e-05, + "loss": 0.3251, + "step": 13748 + }, + { + "epoch": 0.25526969475314026, + "grad_norm": 0.34689855575561523, + "learning_rate": 1.695297678871693e-05, + "loss": 0.5234, + "step": 13750 + }, + { + "epoch": 0.25530682489055895, + "grad_norm": 0.30790990591049194, + "learning_rate": 1.6952138359054387e-05, + "loss": 0.2019, + "step": 13752 + }, + { + "epoch": 0.2553439550279776, + "grad_norm": 0.8268873691558838, + "learning_rate": 1.6951299834794065e-05, + "loss": 0.2557, + "step": 13754 + }, + { + "epoch": 0.2553810851653962, + "grad_norm": 0.3010076880455017, + "learning_rate": 1.6950461215947377e-05, + "loss": 0.2623, + "step": 13756 + }, + { + "epoch": 0.25541821530281483, + "grad_norm": 0.27561119198799133, + "learning_rate": 1.6949622502525723e-05, + "loss": 0.286, + "step": 13758 + }, + { + "epoch": 0.25545534544023346, + "grad_norm": 0.4324747323989868, + "learning_rate": 1.6948783694540528e-05, + "loss": 0.5414, + "step": 13760 + }, + { + "epoch": 0.2554924755776521, + "grad_norm": 0.5337046980857849, + "learning_rate": 1.6947944792003203e-05, + "loss": 0.2074, + "step": 13762 + }, + { + "epoch": 0.2555296057150708, + "grad_norm": 0.30530399084091187, + "learning_rate": 1.694710579492516e-05, + "loss": 0.4649, + "step": 13764 + }, + { + "epoch": 0.2555667358524894, + "grad_norm": 0.3715148866176605, + "learning_rate": 1.6946266703317817e-05, + "loss": 0.2815, + "step": 13766 + }, + { + "epoch": 0.25560386598990803, + "grad_norm": 0.3819938898086548, + "learning_rate": 1.6945427517192588e-05, + "loss": 0.3095, + "step": 13768 + }, + { + "epoch": 0.25564099612732666, + "grad_norm": 0.38834378123283386, + "learning_rate": 1.6944588236560894e-05, + "loss": 0.1813, + "step": 13770 + }, + { + "epoch": 0.2556781262647453, + "grad_norm": 0.40062519907951355, + "learning_rate": 1.6943748861434157e-05, + "loss": 0.4839, + "step": 13772 + }, + { + "epoch": 0.25571525640216397, + "grad_norm": 0.4693989157676697, + "learning_rate": 1.6942909391823795e-05, + "loss": 0.4739, + "step": 13774 + }, + { + "epoch": 0.2557523865395826, + "grad_norm": 0.31688180565834045, + "learning_rate": 1.6942069827741233e-05, + "loss": 0.2734, + "step": 13776 + }, + { + "epoch": 0.2557895166770012, + "grad_norm": 0.32606568932533264, + "learning_rate": 1.6941230169197896e-05, + "loss": 0.4397, + "step": 13778 + }, + { + "epoch": 0.25582664681441986, + "grad_norm": 0.29052188992500305, + "learning_rate": 1.694039041620521e-05, + "loss": 0.3016, + "step": 13780 + }, + { + "epoch": 0.2558637769518385, + "grad_norm": 0.3872697949409485, + "learning_rate": 1.6939550568774597e-05, + "loss": 0.2658, + "step": 13782 + }, + { + "epoch": 0.2559009070892571, + "grad_norm": 0.2635507583618164, + "learning_rate": 1.6938710626917485e-05, + "loss": 0.2144, + "step": 13784 + }, + { + "epoch": 0.2559380372266758, + "grad_norm": 0.31077760457992554, + "learning_rate": 1.693787059064531e-05, + "loss": 0.2891, + "step": 13786 + }, + { + "epoch": 0.2559751673640944, + "grad_norm": 0.32194823026657104, + "learning_rate": 1.6937030459969495e-05, + "loss": 0.3408, + "step": 13788 + }, + { + "epoch": 0.25601229750151305, + "grad_norm": 0.36904555559158325, + "learning_rate": 1.6936190234901472e-05, + "loss": 0.4703, + "step": 13790 + }, + { + "epoch": 0.2560494276389317, + "grad_norm": 0.2540688216686249, + "learning_rate": 1.693534991545268e-05, + "loss": 0.413, + "step": 13792 + }, + { + "epoch": 0.2560865577763503, + "grad_norm": 0.35131198167800903, + "learning_rate": 1.6934509501634548e-05, + "loss": 0.2003, + "step": 13794 + }, + { + "epoch": 0.256123687913769, + "grad_norm": 0.3865675628185272, + "learning_rate": 1.6933668993458512e-05, + "loss": 0.3688, + "step": 13796 + }, + { + "epoch": 0.2561608180511876, + "grad_norm": 0.2915065586566925, + "learning_rate": 1.693282839093601e-05, + "loss": 0.4057, + "step": 13798 + }, + { + "epoch": 0.25619794818860625, + "grad_norm": 0.3600832223892212, + "learning_rate": 1.693198769407848e-05, + "loss": 0.533, + "step": 13800 + }, + { + "epoch": 0.2562350783260249, + "grad_norm": 0.42910027503967285, + "learning_rate": 1.6931146902897362e-05, + "loss": 0.1325, + "step": 13802 + }, + { + "epoch": 0.2562722084634435, + "grad_norm": 0.32095006108283997, + "learning_rate": 1.6930306017404097e-05, + "loss": 0.3628, + "step": 13804 + }, + { + "epoch": 0.25630933860086214, + "grad_norm": 0.3400716185569763, + "learning_rate": 1.692946503761012e-05, + "loss": 0.1498, + "step": 13806 + }, + { + "epoch": 0.2563464687382808, + "grad_norm": 0.37268680334091187, + "learning_rate": 1.6928623963526888e-05, + "loss": 0.3984, + "step": 13808 + }, + { + "epoch": 0.25638359887569945, + "grad_norm": 0.3930261433124542, + "learning_rate": 1.6927782795165835e-05, + "loss": 0.2939, + "step": 13810 + }, + { + "epoch": 0.2564207290131181, + "grad_norm": 0.3283863961696625, + "learning_rate": 1.6926941532538407e-05, + "loss": 0.1214, + "step": 13812 + }, + { + "epoch": 0.2564578591505367, + "grad_norm": 0.3679408133029938, + "learning_rate": 1.6926100175656053e-05, + "loss": 0.4315, + "step": 13814 + }, + { + "epoch": 0.25649498928795533, + "grad_norm": 0.4393298029899597, + "learning_rate": 1.692525872453022e-05, + "loss": 0.2682, + "step": 13816 + }, + { + "epoch": 0.256532119425374, + "grad_norm": 0.4153956174850464, + "learning_rate": 1.6924417179172364e-05, + "loss": 0.2776, + "step": 13818 + }, + { + "epoch": 0.25656924956279265, + "grad_norm": 0.3554937541484833, + "learning_rate": 1.692357553959393e-05, + "loss": 0.3864, + "step": 13820 + }, + { + "epoch": 0.2566063797002113, + "grad_norm": 0.3652535080909729, + "learning_rate": 1.692273380580637e-05, + "loss": 0.2932, + "step": 13822 + }, + { + "epoch": 0.2566435098376299, + "grad_norm": 0.4037335216999054, + "learning_rate": 1.692189197782114e-05, + "loss": 0.5484, + "step": 13824 + }, + { + "epoch": 0.25668063997504853, + "grad_norm": 0.25311097502708435, + "learning_rate": 1.692105005564969e-05, + "loss": 0.2388, + "step": 13826 + }, + { + "epoch": 0.25671777011246716, + "grad_norm": 0.2720162868499756, + "learning_rate": 1.6920208039303484e-05, + "loss": 0.1609, + "step": 13828 + }, + { + "epoch": 0.25675490024988584, + "grad_norm": 0.28366848826408386, + "learning_rate": 1.691936592879398e-05, + "loss": 0.3137, + "step": 13830 + }, + { + "epoch": 0.25679203038730447, + "grad_norm": 0.5329599976539612, + "learning_rate": 1.691852372413262e-05, + "loss": 0.3553, + "step": 13832 + }, + { + "epoch": 0.2568291605247231, + "grad_norm": 0.4084221422672272, + "learning_rate": 1.6917681425330883e-05, + "loss": 0.2966, + "step": 13834 + }, + { + "epoch": 0.2568662906621417, + "grad_norm": 0.38366132974624634, + "learning_rate": 1.6916839032400224e-05, + "loss": 0.34, + "step": 13836 + }, + { + "epoch": 0.25690342079956036, + "grad_norm": 0.25355449318885803, + "learning_rate": 1.6915996545352098e-05, + "loss": 0.4101, + "step": 13838 + }, + { + "epoch": 0.25694055093697904, + "grad_norm": 0.4541107416152954, + "learning_rate": 1.691515396419798e-05, + "loss": 0.4866, + "step": 13840 + }, + { + "epoch": 0.25697768107439767, + "grad_norm": 0.46720975637435913, + "learning_rate": 1.6914311288949327e-05, + "loss": 0.3382, + "step": 13842 + }, + { + "epoch": 0.2570148112118163, + "grad_norm": 0.36868152022361755, + "learning_rate": 1.6913468519617605e-05, + "loss": 0.405, + "step": 13844 + }, + { + "epoch": 0.2570519413492349, + "grad_norm": 0.32116058468818665, + "learning_rate": 1.691262565621429e-05, + "loss": 0.3789, + "step": 13846 + }, + { + "epoch": 0.25708907148665355, + "grad_norm": 0.3383767604827881, + "learning_rate": 1.6911782698750845e-05, + "loss": 0.2281, + "step": 13848 + }, + { + "epoch": 0.25712620162407224, + "grad_norm": 0.38744455575942993, + "learning_rate": 1.6910939647238737e-05, + "loss": 0.2009, + "step": 13850 + }, + { + "epoch": 0.25716333176149087, + "grad_norm": 0.3801097869873047, + "learning_rate": 1.6910096501689444e-05, + "loss": 0.4077, + "step": 13852 + }, + { + "epoch": 0.2572004618989095, + "grad_norm": 0.21515339612960815, + "learning_rate": 1.690925326211444e-05, + "loss": 0.3105, + "step": 13854 + }, + { + "epoch": 0.2572375920363281, + "grad_norm": 0.35749539732933044, + "learning_rate": 1.6908409928525188e-05, + "loss": 0.3635, + "step": 13856 + }, + { + "epoch": 0.25727472217374675, + "grad_norm": 0.4581398665904999, + "learning_rate": 1.690756650093317e-05, + "loss": 0.4339, + "step": 13858 + }, + { + "epoch": 0.2573118523111654, + "grad_norm": 0.6094984412193298, + "learning_rate": 1.6906722979349863e-05, + "loss": 0.2473, + "step": 13860 + }, + { + "epoch": 0.25734898244858406, + "grad_norm": 0.46877989172935486, + "learning_rate": 1.690587936378675e-05, + "loss": 0.3005, + "step": 13862 + }, + { + "epoch": 0.2573861125860027, + "grad_norm": 0.36369916796684265, + "learning_rate": 1.6905035654255296e-05, + "loss": 0.3536, + "step": 13864 + }, + { + "epoch": 0.2574232427234213, + "grad_norm": 0.40528199076652527, + "learning_rate": 1.6904191850766996e-05, + "loss": 0.2358, + "step": 13866 + }, + { + "epoch": 0.25746037286083995, + "grad_norm": 0.2735990881919861, + "learning_rate": 1.6903347953333322e-05, + "loss": 0.2734, + "step": 13868 + }, + { + "epoch": 0.2574975029982586, + "grad_norm": 0.5643244385719299, + "learning_rate": 1.6902503961965765e-05, + "loss": 0.3127, + "step": 13870 + }, + { + "epoch": 0.25753463313567726, + "grad_norm": 0.4487912356853485, + "learning_rate": 1.69016598766758e-05, + "loss": 0.2847, + "step": 13872 + }, + { + "epoch": 0.2575717632730959, + "grad_norm": 0.466102659702301, + "learning_rate": 1.690081569747492e-05, + "loss": 0.2779, + "step": 13874 + }, + { + "epoch": 0.2576088934105145, + "grad_norm": 0.28952568769454956, + "learning_rate": 1.68999714243746e-05, + "loss": 0.3667, + "step": 13876 + }, + { + "epoch": 0.25764602354793315, + "grad_norm": 0.4669014811515808, + "learning_rate": 1.6899127057386346e-05, + "loss": 0.281, + "step": 13878 + }, + { + "epoch": 0.2576831536853518, + "grad_norm": 0.45572248101234436, + "learning_rate": 1.6898282596521636e-05, + "loss": 0.248, + "step": 13880 + }, + { + "epoch": 0.2577202838227704, + "grad_norm": 0.37069517374038696, + "learning_rate": 1.689743804179196e-05, + "loss": 0.3705, + "step": 13882 + }, + { + "epoch": 0.2577574139601891, + "grad_norm": 0.4735618531703949, + "learning_rate": 1.6896593393208816e-05, + "loss": 0.3593, + "step": 13884 + }, + { + "epoch": 0.2577945440976077, + "grad_norm": 0.6046683192253113, + "learning_rate": 1.6895748650783693e-05, + "loss": 0.4249, + "step": 13886 + }, + { + "epoch": 0.25783167423502634, + "grad_norm": 0.3490101099014282, + "learning_rate": 1.6894903814528083e-05, + "loss": 0.2407, + "step": 13888 + }, + { + "epoch": 0.25786880437244497, + "grad_norm": 0.32474836707115173, + "learning_rate": 1.6894058884453487e-05, + "loss": 0.4068, + "step": 13890 + }, + { + "epoch": 0.2579059345098636, + "grad_norm": 0.27498355507850647, + "learning_rate": 1.6893213860571402e-05, + "loss": 0.3269, + "step": 13892 + }, + { + "epoch": 0.2579430646472823, + "grad_norm": 0.34867867827415466, + "learning_rate": 1.689236874289332e-05, + "loss": 0.1692, + "step": 13894 + }, + { + "epoch": 0.2579801947847009, + "grad_norm": 0.37561291456222534, + "learning_rate": 1.6891523531430743e-05, + "loss": 0.2597, + "step": 13896 + }, + { + "epoch": 0.25801732492211954, + "grad_norm": 0.293508380651474, + "learning_rate": 1.689067822619518e-05, + "loss": 0.2276, + "step": 13898 + }, + { + "epoch": 0.25805445505953817, + "grad_norm": 0.39469292759895325, + "learning_rate": 1.6889832827198122e-05, + "loss": 0.3717, + "step": 13900 + }, + { + "epoch": 0.2580915851969568, + "grad_norm": 0.33319106698036194, + "learning_rate": 1.6888987334451076e-05, + "loss": 0.3414, + "step": 13902 + }, + { + "epoch": 0.2581287153343754, + "grad_norm": 0.270370751619339, + "learning_rate": 1.6888141747965546e-05, + "loss": 0.3507, + "step": 13904 + }, + { + "epoch": 0.2581658454717941, + "grad_norm": 0.46120569109916687, + "learning_rate": 1.6887296067753043e-05, + "loss": 0.3988, + "step": 13906 + }, + { + "epoch": 0.25820297560921274, + "grad_norm": 0.26902490854263306, + "learning_rate": 1.688645029382507e-05, + "loss": 0.2644, + "step": 13908 + }, + { + "epoch": 0.25824010574663137, + "grad_norm": 0.48291024565696716, + "learning_rate": 1.6885604426193132e-05, + "loss": 0.5962, + "step": 13910 + }, + { + "epoch": 0.25827723588405, + "grad_norm": 0.44153958559036255, + "learning_rate": 1.688475846486875e-05, + "loss": 0.2469, + "step": 13912 + }, + { + "epoch": 0.2583143660214686, + "grad_norm": 0.30487239360809326, + "learning_rate": 1.688391240986342e-05, + "loss": 0.4363, + "step": 13914 + }, + { + "epoch": 0.2583514961588873, + "grad_norm": 0.29749616980552673, + "learning_rate": 1.6883066261188665e-05, + "loss": 0.4633, + "step": 13916 + }, + { + "epoch": 0.25838862629630593, + "grad_norm": 0.4365514814853668, + "learning_rate": 1.6882220018855994e-05, + "loss": 0.3762, + "step": 13918 + }, + { + "epoch": 0.25842575643372456, + "grad_norm": 0.3300270736217499, + "learning_rate": 1.6881373682876925e-05, + "loss": 0.2996, + "step": 13920 + }, + { + "epoch": 0.2584628865711432, + "grad_norm": 0.4362311065196991, + "learning_rate": 1.688052725326297e-05, + "loss": 0.3393, + "step": 13922 + }, + { + "epoch": 0.2585000167085618, + "grad_norm": 0.28232109546661377, + "learning_rate": 1.687968073002565e-05, + "loss": 0.2331, + "step": 13924 + }, + { + "epoch": 0.2585371468459805, + "grad_norm": 0.23232927918434143, + "learning_rate": 1.6878834113176485e-05, + "loss": 0.2472, + "step": 13926 + }, + { + "epoch": 0.25857427698339913, + "grad_norm": 0.35865363478660583, + "learning_rate": 1.6877987402726986e-05, + "loss": 0.2303, + "step": 13928 + }, + { + "epoch": 0.25861140712081776, + "grad_norm": 0.36527344584465027, + "learning_rate": 1.6877140598688685e-05, + "loss": 0.3992, + "step": 13930 + }, + { + "epoch": 0.2586485372582364, + "grad_norm": 0.44925445318222046, + "learning_rate": 1.68762937010731e-05, + "loss": 0.4077, + "step": 13932 + }, + { + "epoch": 0.258685667395655, + "grad_norm": 0.8289952874183655, + "learning_rate": 1.687544670989175e-05, + "loss": 0.4337, + "step": 13934 + }, + { + "epoch": 0.25872279753307365, + "grad_norm": 0.25468018651008606, + "learning_rate": 1.687459962515617e-05, + "loss": 0.5259, + "step": 13936 + }, + { + "epoch": 0.25875992767049233, + "grad_norm": 0.490289568901062, + "learning_rate": 1.687375244687788e-05, + "loss": 0.375, + "step": 13938 + }, + { + "epoch": 0.25879705780791096, + "grad_norm": 0.5866946578025818, + "learning_rate": 1.6872905175068404e-05, + "loss": 0.3447, + "step": 13940 + }, + { + "epoch": 0.2588341879453296, + "grad_norm": 0.32367733120918274, + "learning_rate": 1.6872057809739286e-05, + "loss": 0.5514, + "step": 13942 + }, + { + "epoch": 0.2588713180827482, + "grad_norm": 0.3692754805088043, + "learning_rate": 1.6871210350902036e-05, + "loss": 0.3264, + "step": 13944 + }, + { + "epoch": 0.25890844822016684, + "grad_norm": 0.38057973980903625, + "learning_rate": 1.6870362798568195e-05, + "loss": 0.3451, + "step": 13946 + }, + { + "epoch": 0.2589455783575855, + "grad_norm": 0.3000034689903259, + "learning_rate": 1.6869515152749296e-05, + "loss": 0.4575, + "step": 13948 + }, + { + "epoch": 0.25898270849500415, + "grad_norm": 0.49746036529541016, + "learning_rate": 1.6868667413456873e-05, + "loss": 0.3326, + "step": 13950 + }, + { + "epoch": 0.2590198386324228, + "grad_norm": 0.42583170533180237, + "learning_rate": 1.686781958070246e-05, + "loss": 0.3672, + "step": 13952 + }, + { + "epoch": 0.2590569687698414, + "grad_norm": 0.35490143299102783, + "learning_rate": 1.68669716544976e-05, + "loss": 0.3543, + "step": 13954 + }, + { + "epoch": 0.25909409890726004, + "grad_norm": 0.3476410508155823, + "learning_rate": 1.6866123634853817e-05, + "loss": 0.3369, + "step": 13956 + }, + { + "epoch": 0.25913122904467867, + "grad_norm": 0.2687476873397827, + "learning_rate": 1.6865275521782664e-05, + "loss": 0.3628, + "step": 13958 + }, + { + "epoch": 0.25916835918209735, + "grad_norm": 0.39296820759773254, + "learning_rate": 1.686442731529567e-05, + "loss": 0.2117, + "step": 13960 + }, + { + "epoch": 0.259205489319516, + "grad_norm": 0.41421154141426086, + "learning_rate": 1.686357901540438e-05, + "loss": 0.2982, + "step": 13962 + }, + { + "epoch": 0.2592426194569346, + "grad_norm": 0.4272485077381134, + "learning_rate": 1.6862730622120345e-05, + "loss": 0.4008, + "step": 13964 + }, + { + "epoch": 0.25927974959435324, + "grad_norm": 0.5024080276489258, + "learning_rate": 1.68618821354551e-05, + "loss": 0.3085, + "step": 13966 + }, + { + "epoch": 0.25931687973177187, + "grad_norm": 0.2880585193634033, + "learning_rate": 1.6861033555420192e-05, + "loss": 0.3054, + "step": 13968 + }, + { + "epoch": 0.25935400986919055, + "grad_norm": 0.6252123713493347, + "learning_rate": 1.686018488202717e-05, + "loss": 0.3742, + "step": 13970 + }, + { + "epoch": 0.2593911400066092, + "grad_norm": 0.2947867214679718, + "learning_rate": 1.6859336115287576e-05, + "loss": 0.2913, + "step": 13972 + }, + { + "epoch": 0.2594282701440278, + "grad_norm": 0.45847341418266296, + "learning_rate": 1.6858487255212967e-05, + "loss": 0.4885, + "step": 13974 + }, + { + "epoch": 0.25946540028144643, + "grad_norm": 0.7742215394973755, + "learning_rate": 1.6857638301814888e-05, + "loss": 0.2056, + "step": 13976 + }, + { + "epoch": 0.25950253041886506, + "grad_norm": 0.4215402901172638, + "learning_rate": 1.6856789255104895e-05, + "loss": 0.3459, + "step": 13978 + }, + { + "epoch": 0.2595396605562837, + "grad_norm": 0.40069687366485596, + "learning_rate": 1.685594011509454e-05, + "loss": 0.2383, + "step": 13980 + }, + { + "epoch": 0.2595767906937024, + "grad_norm": 0.3303038775920868, + "learning_rate": 1.6855090881795372e-05, + "loss": 0.2049, + "step": 13982 + }, + { + "epoch": 0.259613920831121, + "grad_norm": 0.37788519263267517, + "learning_rate": 1.6854241555218954e-05, + "loss": 0.3644, + "step": 13984 + }, + { + "epoch": 0.25965105096853963, + "grad_norm": 0.2660425901412964, + "learning_rate": 1.6853392135376836e-05, + "loss": 0.1964, + "step": 13986 + }, + { + "epoch": 0.25968818110595826, + "grad_norm": 0.5146507024765015, + "learning_rate": 1.685254262228058e-05, + "loss": 0.3741, + "step": 13988 + }, + { + "epoch": 0.2597253112433769, + "grad_norm": 0.43032974004745483, + "learning_rate": 1.6851693015941747e-05, + "loss": 0.297, + "step": 13990 + }, + { + "epoch": 0.2597624413807956, + "grad_norm": 0.3363281786441803, + "learning_rate": 1.6850843316371894e-05, + "loss": 0.3052, + "step": 13992 + }, + { + "epoch": 0.2597995715182142, + "grad_norm": 0.4862024188041687, + "learning_rate": 1.684999352358258e-05, + "loss": 0.5986, + "step": 13994 + }, + { + "epoch": 0.25983670165563283, + "grad_norm": 0.4416103661060333, + "learning_rate": 1.6849143637585378e-05, + "loss": 0.2835, + "step": 13996 + }, + { + "epoch": 0.25987383179305146, + "grad_norm": 0.3888454735279083, + "learning_rate": 1.6848293658391846e-05, + "loss": 0.3235, + "step": 13998 + }, + { + "epoch": 0.2599109619304701, + "grad_norm": 0.3635510802268982, + "learning_rate": 1.684744358601355e-05, + "loss": 0.3073, + "step": 14000 + }, + { + "epoch": 0.25994809206788877, + "grad_norm": 0.6778346300125122, + "learning_rate": 1.6846593420462056e-05, + "loss": 0.5485, + "step": 14002 + }, + { + "epoch": 0.2599852222053074, + "grad_norm": 0.37570440769195557, + "learning_rate": 1.6845743161748935e-05, + "loss": 0.2946, + "step": 14004 + }, + { + "epoch": 0.260022352342726, + "grad_norm": 0.21032491326332092, + "learning_rate": 1.6844892809885755e-05, + "loss": 0.1352, + "step": 14006 + }, + { + "epoch": 0.26005948248014465, + "grad_norm": 0.7216085195541382, + "learning_rate": 1.6844042364884082e-05, + "loss": 0.4938, + "step": 14008 + }, + { + "epoch": 0.2600966126175633, + "grad_norm": 0.34204918146133423, + "learning_rate": 1.6843191826755497e-05, + "loss": 0.2725, + "step": 14010 + }, + { + "epoch": 0.2601337427549819, + "grad_norm": 0.3548336625099182, + "learning_rate": 1.6842341195511567e-05, + "loss": 0.201, + "step": 14012 + }, + { + "epoch": 0.2601708728924006, + "grad_norm": 0.28081128001213074, + "learning_rate": 1.684149047116387e-05, + "loss": 0.293, + "step": 14014 + }, + { + "epoch": 0.2602080030298192, + "grad_norm": 0.3139168620109558, + "learning_rate": 1.6840639653723982e-05, + "loss": 0.3792, + "step": 14016 + }, + { + "epoch": 0.26024513316723785, + "grad_norm": 0.42086249589920044, + "learning_rate": 1.6839788743203477e-05, + "loss": 0.3359, + "step": 14018 + }, + { + "epoch": 0.2602822633046565, + "grad_norm": 0.4165078103542328, + "learning_rate": 1.6838937739613933e-05, + "loss": 0.1393, + "step": 14020 + }, + { + "epoch": 0.2603193934420751, + "grad_norm": 0.37590858340263367, + "learning_rate": 1.6838086642966934e-05, + "loss": 0.2781, + "step": 14022 + }, + { + "epoch": 0.2603565235794938, + "grad_norm": 0.3450084626674652, + "learning_rate": 1.6837235453274057e-05, + "loss": 0.3789, + "step": 14024 + }, + { + "epoch": 0.2603936537169124, + "grad_norm": 0.36544740200042725, + "learning_rate": 1.6836384170546885e-05, + "loss": 0.2555, + "step": 14026 + }, + { + "epoch": 0.26043078385433105, + "grad_norm": 0.4790702760219574, + "learning_rate": 1.6835532794797e-05, + "loss": 0.2544, + "step": 14028 + }, + { + "epoch": 0.2604679139917497, + "grad_norm": 0.4462150037288666, + "learning_rate": 1.683468132603599e-05, + "loss": 0.3495, + "step": 14030 + }, + { + "epoch": 0.2605050441291683, + "grad_norm": 0.30679255723953247, + "learning_rate": 1.6833829764275444e-05, + "loss": 0.4361, + "step": 14032 + }, + { + "epoch": 0.26054217426658693, + "grad_norm": 0.29472529888153076, + "learning_rate": 1.6832978109526936e-05, + "loss": 0.2071, + "step": 14034 + }, + { + "epoch": 0.2605793044040056, + "grad_norm": 0.3187980055809021, + "learning_rate": 1.683212636180207e-05, + "loss": 0.358, + "step": 14036 + }, + { + "epoch": 0.26061643454142425, + "grad_norm": 0.38976263999938965, + "learning_rate": 1.6831274521112428e-05, + "loss": 0.2956, + "step": 14038 + }, + { + "epoch": 0.2606535646788429, + "grad_norm": 0.40875038504600525, + "learning_rate": 1.6830422587469595e-05, + "loss": 0.3406, + "step": 14040 + }, + { + "epoch": 0.2606906948162615, + "grad_norm": 0.5932621359825134, + "learning_rate": 1.6829570560885177e-05, + "loss": 0.4226, + "step": 14042 + }, + { + "epoch": 0.26072782495368013, + "grad_norm": 0.38781848549842834, + "learning_rate": 1.682871844137076e-05, + "loss": 0.2172, + "step": 14044 + }, + { + "epoch": 0.2607649550910988, + "grad_norm": 0.35778090357780457, + "learning_rate": 1.6827866228937938e-05, + "loss": 0.2958, + "step": 14046 + }, + { + "epoch": 0.26080208522851744, + "grad_norm": 0.46684181690216064, + "learning_rate": 1.6827013923598307e-05, + "loss": 0.3022, + "step": 14048 + }, + { + "epoch": 0.2608392153659361, + "grad_norm": 0.2739340364933014, + "learning_rate": 1.6826161525363466e-05, + "loss": 0.1812, + "step": 14050 + }, + { + "epoch": 0.2608763455033547, + "grad_norm": 0.2747492790222168, + "learning_rate": 1.6825309034245018e-05, + "loss": 0.2855, + "step": 14052 + }, + { + "epoch": 0.26091347564077333, + "grad_norm": 0.38275817036628723, + "learning_rate": 1.6824456450254554e-05, + "loss": 0.3022, + "step": 14054 + }, + { + "epoch": 0.26095060577819196, + "grad_norm": 0.2889985740184784, + "learning_rate": 1.682360377340368e-05, + "loss": 0.4276, + "step": 14056 + }, + { + "epoch": 0.26098773591561064, + "grad_norm": 0.37787890434265137, + "learning_rate": 1.6822751003703997e-05, + "loss": 0.2897, + "step": 14058 + }, + { + "epoch": 0.26102486605302927, + "grad_norm": 0.2764505445957184, + "learning_rate": 1.682189814116711e-05, + "loss": 0.2386, + "step": 14060 + }, + { + "epoch": 0.2610619961904479, + "grad_norm": 0.4211701452732086, + "learning_rate": 1.6821045185804626e-05, + "loss": 0.3558, + "step": 14062 + }, + { + "epoch": 0.2610991263278665, + "grad_norm": 0.31557416915893555, + "learning_rate": 1.6820192137628143e-05, + "loss": 0.3684, + "step": 14064 + }, + { + "epoch": 0.26113625646528515, + "grad_norm": 0.4372217655181885, + "learning_rate": 1.6819338996649277e-05, + "loss": 0.4507, + "step": 14066 + }, + { + "epoch": 0.26117338660270384, + "grad_norm": 0.3574519753456116, + "learning_rate": 1.681848576287963e-05, + "loss": 0.2681, + "step": 14068 + }, + { + "epoch": 0.26121051674012247, + "grad_norm": 0.36359620094299316, + "learning_rate": 1.6817632436330826e-05, + "loss": 0.4094, + "step": 14070 + }, + { + "epoch": 0.2612476468775411, + "grad_norm": 0.37492144107818604, + "learning_rate": 1.6816779017014456e-05, + "loss": 0.1873, + "step": 14072 + }, + { + "epoch": 0.2612847770149597, + "grad_norm": 0.33945581316947937, + "learning_rate": 1.681592550494214e-05, + "loss": 0.3307, + "step": 14074 + }, + { + "epoch": 0.26132190715237835, + "grad_norm": 0.5761824250221252, + "learning_rate": 1.6815071900125503e-05, + "loss": 0.3925, + "step": 14076 + }, + { + "epoch": 0.26135903728979704, + "grad_norm": 0.3409815728664398, + "learning_rate": 1.6814218202576147e-05, + "loss": 0.3691, + "step": 14078 + }, + { + "epoch": 0.26139616742721566, + "grad_norm": 0.4060782194137573, + "learning_rate": 1.681336441230569e-05, + "loss": 0.3262, + "step": 14080 + }, + { + "epoch": 0.2614332975646343, + "grad_norm": 0.2681616246700287, + "learning_rate": 1.6812510529325755e-05, + "loss": 0.3355, + "step": 14082 + }, + { + "epoch": 0.2614704277020529, + "grad_norm": 0.39515119791030884, + "learning_rate": 1.6811656553647955e-05, + "loss": 0.3281, + "step": 14084 + }, + { + "epoch": 0.26150755783947155, + "grad_norm": 0.2709459364414215, + "learning_rate": 1.6810802485283916e-05, + "loss": 0.184, + "step": 14086 + }, + { + "epoch": 0.2615446879768902, + "grad_norm": 0.28708288073539734, + "learning_rate": 1.680994832424525e-05, + "loss": 0.3242, + "step": 14088 + }, + { + "epoch": 0.26158181811430886, + "grad_norm": 0.3695259988307953, + "learning_rate": 1.680909407054359e-05, + "loss": 0.2836, + "step": 14090 + }, + { + "epoch": 0.2616189482517275, + "grad_norm": 0.30495524406433105, + "learning_rate": 1.6808239724190554e-05, + "loss": 0.1805, + "step": 14092 + }, + { + "epoch": 0.2616560783891461, + "grad_norm": 0.3366011381149292, + "learning_rate": 1.680738528519777e-05, + "loss": 0.1888, + "step": 14094 + }, + { + "epoch": 0.26169320852656475, + "grad_norm": 0.3887014091014862, + "learning_rate": 1.6806530753576862e-05, + "loss": 0.3566, + "step": 14096 + }, + { + "epoch": 0.2617303386639834, + "grad_norm": 0.41419389843940735, + "learning_rate": 1.6805676129339456e-05, + "loss": 0.3136, + "step": 14098 + }, + { + "epoch": 0.26176746880140206, + "grad_norm": 0.48114123940467834, + "learning_rate": 1.6804821412497188e-05, + "loss": 0.36, + "step": 14100 + }, + { + "epoch": 0.2618045989388207, + "grad_norm": 0.5495288968086243, + "learning_rate": 1.680396660306168e-05, + "loss": 0.3925, + "step": 14102 + }, + { + "epoch": 0.2618417290762393, + "grad_norm": 0.3097253441810608, + "learning_rate": 1.6803111701044564e-05, + "loss": 0.2634, + "step": 14104 + }, + { + "epoch": 0.26187885921365794, + "grad_norm": 0.3248990774154663, + "learning_rate": 1.6802256706457482e-05, + "loss": 0.2875, + "step": 14106 + }, + { + "epoch": 0.2619159893510766, + "grad_norm": 0.35800638794898987, + "learning_rate": 1.6801401619312057e-05, + "loss": 0.4922, + "step": 14108 + }, + { + "epoch": 0.2619531194884952, + "grad_norm": 0.2511877417564392, + "learning_rate": 1.680054643961993e-05, + "loss": 0.2732, + "step": 14110 + }, + { + "epoch": 0.2619902496259139, + "grad_norm": 0.3973028063774109, + "learning_rate": 1.6799691167392735e-05, + "loss": 0.3757, + "step": 14112 + }, + { + "epoch": 0.2620273797633325, + "grad_norm": 0.31965377926826477, + "learning_rate": 1.679883580264211e-05, + "loss": 0.1373, + "step": 14114 + }, + { + "epoch": 0.26206450990075114, + "grad_norm": 0.37503311038017273, + "learning_rate": 1.6797980345379696e-05, + "loss": 0.1595, + "step": 14116 + }, + { + "epoch": 0.26210164003816977, + "grad_norm": 0.3078148365020752, + "learning_rate": 1.679712479561713e-05, + "loss": 0.338, + "step": 14118 + }, + { + "epoch": 0.2621387701755884, + "grad_norm": 0.4150207042694092, + "learning_rate": 1.6796269153366058e-05, + "loss": 0.2663, + "step": 14120 + }, + { + "epoch": 0.2621759003130071, + "grad_norm": 0.3178233802318573, + "learning_rate": 1.679541341863812e-05, + "loss": 0.1775, + "step": 14122 + }, + { + "epoch": 0.2622130304504257, + "grad_norm": 0.3457436263561249, + "learning_rate": 1.679455759144496e-05, + "loss": 0.2439, + "step": 14124 + }, + { + "epoch": 0.26225016058784434, + "grad_norm": 0.34838226437568665, + "learning_rate": 1.6793701671798223e-05, + "loss": 0.3207, + "step": 14126 + }, + { + "epoch": 0.26228729072526297, + "grad_norm": 0.24280937016010284, + "learning_rate": 1.679284565970955e-05, + "loss": 0.2971, + "step": 14128 + }, + { + "epoch": 0.2623244208626816, + "grad_norm": 0.23812375962734222, + "learning_rate": 1.67919895551906e-05, + "loss": 0.2827, + "step": 14130 + }, + { + "epoch": 0.2623615510001002, + "grad_norm": 0.35587090253829956, + "learning_rate": 1.6791133358253015e-05, + "loss": 0.4991, + "step": 14132 + }, + { + "epoch": 0.2623986811375189, + "grad_norm": 0.7513763904571533, + "learning_rate": 1.679027706890845e-05, + "loss": 0.3813, + "step": 14134 + }, + { + "epoch": 0.26243581127493754, + "grad_norm": 0.5282157063484192, + "learning_rate": 1.678942068716855e-05, + "loss": 0.4646, + "step": 14136 + }, + { + "epoch": 0.26247294141235616, + "grad_norm": 0.37438374757766724, + "learning_rate": 1.6788564213044973e-05, + "loss": 0.2097, + "step": 14138 + }, + { + "epoch": 0.2625100715497748, + "grad_norm": 0.4786970913410187, + "learning_rate": 1.678770764654937e-05, + "loss": 0.285, + "step": 14140 + }, + { + "epoch": 0.2625472016871934, + "grad_norm": 0.42811256647109985, + "learning_rate": 1.6786850987693397e-05, + "loss": 0.2298, + "step": 14142 + }, + { + "epoch": 0.2625843318246121, + "grad_norm": 0.3187107443809509, + "learning_rate": 1.6785994236488714e-05, + "loss": 0.4095, + "step": 14144 + }, + { + "epoch": 0.26262146196203073, + "grad_norm": 0.3378993272781372, + "learning_rate": 1.678513739294697e-05, + "loss": 0.1566, + "step": 14146 + }, + { + "epoch": 0.26265859209944936, + "grad_norm": 0.6031299829483032, + "learning_rate": 1.6784280457079836e-05, + "loss": 0.2282, + "step": 14148 + }, + { + "epoch": 0.262695722236868, + "grad_norm": 0.34305110573768616, + "learning_rate": 1.678342342889896e-05, + "loss": 0.1947, + "step": 14150 + }, + { + "epoch": 0.2627328523742866, + "grad_norm": 0.5034898519515991, + "learning_rate": 1.6782566308416014e-05, + "loss": 0.2314, + "step": 14152 + }, + { + "epoch": 0.2627699825117053, + "grad_norm": 0.30161580443382263, + "learning_rate": 1.6781709095642658e-05, + "loss": 0.276, + "step": 14154 + }, + { + "epoch": 0.26280711264912393, + "grad_norm": 0.32590973377227783, + "learning_rate": 1.678085179059055e-05, + "loss": 0.297, + "step": 14156 + }, + { + "epoch": 0.26284424278654256, + "grad_norm": 0.4782107174396515, + "learning_rate": 1.6779994393271365e-05, + "loss": 0.2587, + "step": 14158 + }, + { + "epoch": 0.2628813729239612, + "grad_norm": 0.2745949327945709, + "learning_rate": 1.6779136903696763e-05, + "loss": 0.4622, + "step": 14160 + }, + { + "epoch": 0.2629185030613798, + "grad_norm": 0.3983341157436371, + "learning_rate": 1.6778279321878416e-05, + "loss": 0.3024, + "step": 14162 + }, + { + "epoch": 0.26295563319879844, + "grad_norm": 0.5545642375946045, + "learning_rate": 1.6777421647827987e-05, + "loss": 0.2972, + "step": 14164 + }, + { + "epoch": 0.26299276333621713, + "grad_norm": 0.5028650760650635, + "learning_rate": 1.677656388155715e-05, + "loss": 0.1692, + "step": 14166 + }, + { + "epoch": 0.26302989347363576, + "grad_norm": 0.28886497020721436, + "learning_rate": 1.6775706023077578e-05, + "loss": 0.2239, + "step": 14168 + }, + { + "epoch": 0.2630670236110544, + "grad_norm": 0.48967164754867554, + "learning_rate": 1.6774848072400943e-05, + "loss": 0.2816, + "step": 14170 + }, + { + "epoch": 0.263104153748473, + "grad_norm": 0.34067201614379883, + "learning_rate": 1.6773990029538917e-05, + "loss": 0.438, + "step": 14172 + }, + { + "epoch": 0.26314128388589164, + "grad_norm": 0.4151076674461365, + "learning_rate": 1.677313189450318e-05, + "loss": 0.2797, + "step": 14174 + }, + { + "epoch": 0.2631784140233103, + "grad_norm": 0.37378132343292236, + "learning_rate": 1.6772273667305405e-05, + "loss": 0.247, + "step": 14176 + }, + { + "epoch": 0.26321554416072895, + "grad_norm": 0.3564856946468353, + "learning_rate": 1.677141534795727e-05, + "loss": 0.5565, + "step": 14178 + }, + { + "epoch": 0.2632526742981476, + "grad_norm": 0.4268989861011505, + "learning_rate": 1.6770556936470456e-05, + "loss": 0.2721, + "step": 14180 + }, + { + "epoch": 0.2632898044355662, + "grad_norm": 0.3650158643722534, + "learning_rate": 1.676969843285664e-05, + "loss": 0.4985, + "step": 14182 + }, + { + "epoch": 0.26332693457298484, + "grad_norm": 0.4041502773761749, + "learning_rate": 1.6768839837127508e-05, + "loss": 0.2943, + "step": 14184 + }, + { + "epoch": 0.26336406471040347, + "grad_norm": 0.39799535274505615, + "learning_rate": 1.6767981149294738e-05, + "loss": 0.4508, + "step": 14186 + }, + { + "epoch": 0.26340119484782215, + "grad_norm": 0.33557912707328796, + "learning_rate": 1.676712236937002e-05, + "loss": 0.245, + "step": 14188 + }, + { + "epoch": 0.2634383249852408, + "grad_norm": 0.39507460594177246, + "learning_rate": 1.6766263497365036e-05, + "loss": 0.3271, + "step": 14190 + }, + { + "epoch": 0.2634754551226594, + "grad_norm": 0.46698614954948425, + "learning_rate": 1.676540453329147e-05, + "loss": 0.37, + "step": 14192 + }, + { + "epoch": 0.26351258526007804, + "grad_norm": 0.45323115587234497, + "learning_rate": 1.6764545477161017e-05, + "loss": 0.4808, + "step": 14194 + }, + { + "epoch": 0.26354971539749666, + "grad_norm": 0.3934701383113861, + "learning_rate": 1.676368632898536e-05, + "loss": 0.2344, + "step": 14196 + }, + { + "epoch": 0.26358684553491535, + "grad_norm": 0.21577338874340057, + "learning_rate": 1.6762827088776194e-05, + "loss": 0.3857, + "step": 14198 + }, + { + "epoch": 0.263623975672334, + "grad_norm": 0.42878860235214233, + "learning_rate": 1.6761967756545206e-05, + "loss": 0.3362, + "step": 14200 + }, + { + "epoch": 0.2636611058097526, + "grad_norm": 0.334805965423584, + "learning_rate": 1.6761108332304093e-05, + "loss": 0.1876, + "step": 14202 + }, + { + "epoch": 0.26369823594717123, + "grad_norm": 0.6080650091171265, + "learning_rate": 1.6760248816064546e-05, + "loss": 0.2604, + "step": 14204 + }, + { + "epoch": 0.26373536608458986, + "grad_norm": 0.3659916818141937, + "learning_rate": 1.675938920783826e-05, + "loss": 0.2707, + "step": 14206 + }, + { + "epoch": 0.2637724962220085, + "grad_norm": 0.2813114821910858, + "learning_rate": 1.6758529507636937e-05, + "loss": 0.2437, + "step": 14208 + }, + { + "epoch": 0.2638096263594272, + "grad_norm": 0.3511956036090851, + "learning_rate": 1.675766971547227e-05, + "loss": 0.2173, + "step": 14210 + }, + { + "epoch": 0.2638467564968458, + "grad_norm": 0.35966426134109497, + "learning_rate": 1.675680983135596e-05, + "loss": 0.2128, + "step": 14212 + }, + { + "epoch": 0.26388388663426443, + "grad_norm": 0.38586369156837463, + "learning_rate": 1.6755949855299708e-05, + "loss": 0.4097, + "step": 14214 + }, + { + "epoch": 0.26392101677168306, + "grad_norm": 0.29686152935028076, + "learning_rate": 1.6755089787315215e-05, + "loss": 0.0991, + "step": 14216 + }, + { + "epoch": 0.2639581469091017, + "grad_norm": 0.4054957926273346, + "learning_rate": 1.6754229627414183e-05, + "loss": 0.3325, + "step": 14218 + }, + { + "epoch": 0.26399527704652037, + "grad_norm": 0.34105566143989563, + "learning_rate": 1.6753369375608317e-05, + "loss": 0.139, + "step": 14220 + }, + { + "epoch": 0.264032407183939, + "grad_norm": 0.3767379820346832, + "learning_rate": 1.6752509031909324e-05, + "loss": 0.3132, + "step": 14222 + }, + { + "epoch": 0.26406953732135763, + "grad_norm": 0.39031022787094116, + "learning_rate": 1.6751648596328903e-05, + "loss": 0.2391, + "step": 14224 + }, + { + "epoch": 0.26410666745877626, + "grad_norm": 0.4413573741912842, + "learning_rate": 1.6750788068878777e-05, + "loss": 0.3748, + "step": 14226 + }, + { + "epoch": 0.2641437975961949, + "grad_norm": 0.26224058866500854, + "learning_rate": 1.6749927449570642e-05, + "loss": 0.2944, + "step": 14228 + }, + { + "epoch": 0.26418092773361357, + "grad_norm": 0.2730865776538849, + "learning_rate": 1.6749066738416214e-05, + "loss": 0.29, + "step": 14230 + }, + { + "epoch": 0.2642180578710322, + "grad_norm": 0.25818562507629395, + "learning_rate": 1.6748205935427202e-05, + "loss": 0.3834, + "step": 14232 + }, + { + "epoch": 0.2642551880084508, + "grad_norm": 0.2991827130317688, + "learning_rate": 1.6747345040615322e-05, + "loss": 0.1989, + "step": 14234 + }, + { + "epoch": 0.26429231814586945, + "grad_norm": 0.4195464551448822, + "learning_rate": 1.6746484053992285e-05, + "loss": 0.4589, + "step": 14236 + }, + { + "epoch": 0.2643294482832881, + "grad_norm": 0.4545131027698517, + "learning_rate": 1.674562297556981e-05, + "loss": 0.3973, + "step": 14238 + }, + { + "epoch": 0.2643665784207067, + "grad_norm": 0.4013481140136719, + "learning_rate": 1.674476180535961e-05, + "loss": 0.2932, + "step": 14240 + }, + { + "epoch": 0.2644037085581254, + "grad_norm": 0.3643893897533417, + "learning_rate": 1.6743900543373405e-05, + "loss": 0.5315, + "step": 14242 + }, + { + "epoch": 0.264440838695544, + "grad_norm": 0.42725899815559387, + "learning_rate": 1.6743039189622913e-05, + "loss": 0.5253, + "step": 14244 + }, + { + "epoch": 0.26447796883296265, + "grad_norm": 0.22592240571975708, + "learning_rate": 1.6742177744119858e-05, + "loss": 0.2865, + "step": 14246 + }, + { + "epoch": 0.2645150989703813, + "grad_norm": 0.3828607201576233, + "learning_rate": 1.6741316206875955e-05, + "loss": 0.3774, + "step": 14248 + }, + { + "epoch": 0.2645522291077999, + "grad_norm": 0.44129350781440735, + "learning_rate": 1.6740454577902934e-05, + "loss": 0.3331, + "step": 14250 + }, + { + "epoch": 0.2645893592452186, + "grad_norm": 0.23848488926887512, + "learning_rate": 1.6739592857212518e-05, + "loss": 0.198, + "step": 14252 + }, + { + "epoch": 0.2646264893826372, + "grad_norm": 0.34743815660476685, + "learning_rate": 1.6738731044816426e-05, + "loss": 0.1552, + "step": 14254 + }, + { + "epoch": 0.26466361952005585, + "grad_norm": 0.46097663044929504, + "learning_rate": 1.673786914072639e-05, + "loss": 0.3398, + "step": 14256 + }, + { + "epoch": 0.2647007496574745, + "grad_norm": 0.4356861412525177, + "learning_rate": 1.673700714495414e-05, + "loss": 0.2615, + "step": 14258 + }, + { + "epoch": 0.2647378797948931, + "grad_norm": 0.24662169814109802, + "learning_rate": 1.6736145057511403e-05, + "loss": 0.1473, + "step": 14260 + }, + { + "epoch": 0.26477500993231173, + "grad_norm": 0.45927730202674866, + "learning_rate": 1.6735282878409903e-05, + "loss": 0.2772, + "step": 14262 + }, + { + "epoch": 0.2648121400697304, + "grad_norm": 0.40104055404663086, + "learning_rate": 1.6734420607661387e-05, + "loss": 0.3106, + "step": 14264 + }, + { + "epoch": 0.26484927020714905, + "grad_norm": 0.37815555930137634, + "learning_rate": 1.6733558245277574e-05, + "loss": 0.3442, + "step": 14266 + }, + { + "epoch": 0.2648864003445677, + "grad_norm": 0.3576014041900635, + "learning_rate": 1.6732695791270196e-05, + "loss": 0.48, + "step": 14268 + }, + { + "epoch": 0.2649235304819863, + "grad_norm": 0.3668026924133301, + "learning_rate": 1.6731833245651005e-05, + "loss": 0.3261, + "step": 14270 + }, + { + "epoch": 0.26496066061940493, + "grad_norm": 0.28072062134742737, + "learning_rate": 1.6730970608431722e-05, + "loss": 0.2143, + "step": 14272 + }, + { + "epoch": 0.2649977907568236, + "grad_norm": 0.31429967284202576, + "learning_rate": 1.6730107879624092e-05, + "loss": 0.1703, + "step": 14274 + }, + { + "epoch": 0.26503492089424224, + "grad_norm": 0.6020709872245789, + "learning_rate": 1.672924505923985e-05, + "loss": 0.1655, + "step": 14276 + }, + { + "epoch": 0.26507205103166087, + "grad_norm": 0.24876609444618225, + "learning_rate": 1.6728382147290747e-05, + "loss": 0.3435, + "step": 14278 + }, + { + "epoch": 0.2651091811690795, + "grad_norm": 0.40927767753601074, + "learning_rate": 1.6727519143788512e-05, + "loss": 0.498, + "step": 14280 + }, + { + "epoch": 0.26514631130649813, + "grad_norm": 0.3224967122077942, + "learning_rate": 1.6726656048744893e-05, + "loss": 0.2814, + "step": 14282 + }, + { + "epoch": 0.26518344144391676, + "grad_norm": 0.2693054676055908, + "learning_rate": 1.672579286217163e-05, + "loss": 0.1502, + "step": 14284 + }, + { + "epoch": 0.26522057158133544, + "grad_norm": 0.39799273014068604, + "learning_rate": 1.672492958408048e-05, + "loss": 0.499, + "step": 14286 + }, + { + "epoch": 0.26525770171875407, + "grad_norm": 0.34101539850234985, + "learning_rate": 1.6724066214483174e-05, + "loss": 0.2973, + "step": 14288 + }, + { + "epoch": 0.2652948318561727, + "grad_norm": 0.35910147428512573, + "learning_rate": 1.672320275339147e-05, + "loss": 0.3929, + "step": 14290 + }, + { + "epoch": 0.2653319619935913, + "grad_norm": 0.27279797196388245, + "learning_rate": 1.6722339200817116e-05, + "loss": 0.3782, + "step": 14292 + }, + { + "epoch": 0.26536909213100995, + "grad_norm": 0.34886181354522705, + "learning_rate": 1.6721475556771864e-05, + "loss": 0.2055, + "step": 14294 + }, + { + "epoch": 0.26540622226842864, + "grad_norm": 0.451325386762619, + "learning_rate": 1.672061182126746e-05, + "loss": 0.3413, + "step": 14296 + }, + { + "epoch": 0.26544335240584727, + "grad_norm": 0.2522677481174469, + "learning_rate": 1.671974799431566e-05, + "loss": 0.4009, + "step": 14298 + }, + { + "epoch": 0.2654804825432659, + "grad_norm": 0.3332119286060333, + "learning_rate": 1.6718884075928216e-05, + "loss": 0.371, + "step": 14300 + }, + { + "epoch": 0.2655176126806845, + "grad_norm": 0.4892105758190155, + "learning_rate": 1.6718020066116888e-05, + "loss": 0.4023, + "step": 14302 + }, + { + "epoch": 0.26555474281810315, + "grad_norm": 0.30999088287353516, + "learning_rate": 1.6717155964893428e-05, + "loss": 0.3294, + "step": 14304 + }, + { + "epoch": 0.26559187295552183, + "grad_norm": 0.5551329255104065, + "learning_rate": 1.6716291772269598e-05, + "loss": 0.3277, + "step": 14306 + }, + { + "epoch": 0.26562900309294046, + "grad_norm": 0.3143518269062042, + "learning_rate": 1.6715427488257153e-05, + "loss": 0.3248, + "step": 14308 + }, + { + "epoch": 0.2656661332303591, + "grad_norm": 0.3433229327201843, + "learning_rate": 1.6714563112867855e-05, + "loss": 0.2351, + "step": 14310 + }, + { + "epoch": 0.2657032633677777, + "grad_norm": 0.3209666907787323, + "learning_rate": 1.6713698646113463e-05, + "loss": 0.3536, + "step": 14312 + }, + { + "epoch": 0.26574039350519635, + "grad_norm": 0.3675312101840973, + "learning_rate": 1.6712834088005744e-05, + "loss": 0.3228, + "step": 14314 + }, + { + "epoch": 0.265777523642615, + "grad_norm": 0.6456232070922852, + "learning_rate": 1.671196943855646e-05, + "loss": 0.1725, + "step": 14316 + }, + { + "epoch": 0.26581465378003366, + "grad_norm": 0.5283076167106628, + "learning_rate": 1.6711104697777378e-05, + "loss": 0.3528, + "step": 14318 + }, + { + "epoch": 0.2658517839174523, + "grad_norm": 0.5134596228599548, + "learning_rate": 1.671023986568026e-05, + "loss": 0.3215, + "step": 14320 + }, + { + "epoch": 0.2658889140548709, + "grad_norm": 0.4296994209289551, + "learning_rate": 1.670937494227688e-05, + "loss": 0.4656, + "step": 14322 + }, + { + "epoch": 0.26592604419228955, + "grad_norm": 0.45185139775276184, + "learning_rate": 1.6708509927579003e-05, + "loss": 0.1717, + "step": 14324 + }, + { + "epoch": 0.2659631743297082, + "grad_norm": 0.4098852872848511, + "learning_rate": 1.67076448215984e-05, + "loss": 0.4624, + "step": 14326 + }, + { + "epoch": 0.26600030446712686, + "grad_norm": 0.3968254327774048, + "learning_rate": 1.6706779624346844e-05, + "loss": 0.1497, + "step": 14328 + }, + { + "epoch": 0.2660374346045455, + "grad_norm": 0.5006260871887207, + "learning_rate": 1.6705914335836106e-05, + "loss": 0.3648, + "step": 14330 + }, + { + "epoch": 0.2660745647419641, + "grad_norm": 0.33969223499298096, + "learning_rate": 1.670504895607796e-05, + "loss": 0.2362, + "step": 14332 + }, + { + "epoch": 0.26611169487938274, + "grad_norm": 0.32858091592788696, + "learning_rate": 1.6704183485084183e-05, + "loss": 0.4018, + "step": 14334 + }, + { + "epoch": 0.26614882501680137, + "grad_norm": 0.6745291352272034, + "learning_rate": 1.6703317922866546e-05, + "loss": 0.4107, + "step": 14336 + }, + { + "epoch": 0.26618595515422, + "grad_norm": 0.5537766218185425, + "learning_rate": 1.6702452269436834e-05, + "loss": 0.2483, + "step": 14338 + }, + { + "epoch": 0.2662230852916387, + "grad_norm": 0.3518688976764679, + "learning_rate": 1.6701586524806826e-05, + "loss": 0.2292, + "step": 14340 + }, + { + "epoch": 0.2662602154290573, + "grad_norm": 0.41889575123786926, + "learning_rate": 1.670072068898829e-05, + "loss": 0.5489, + "step": 14342 + }, + { + "epoch": 0.26629734556647594, + "grad_norm": 0.3126862943172455, + "learning_rate": 1.6699854761993025e-05, + "loss": 0.2945, + "step": 14344 + }, + { + "epoch": 0.26633447570389457, + "grad_norm": 0.3870278596878052, + "learning_rate": 1.6698988743832802e-05, + "loss": 0.2315, + "step": 14346 + }, + { + "epoch": 0.2663716058413132, + "grad_norm": 0.3100411295890808, + "learning_rate": 1.6698122634519407e-05, + "loss": 0.4566, + "step": 14348 + }, + { + "epoch": 0.2664087359787319, + "grad_norm": 0.3697691261768341, + "learning_rate": 1.669725643406463e-05, + "loss": 0.2797, + "step": 14350 + }, + { + "epoch": 0.2664458661161505, + "grad_norm": 0.2827511429786682, + "learning_rate": 1.6696390142480246e-05, + "loss": 0.2803, + "step": 14352 + }, + { + "epoch": 0.26648299625356914, + "grad_norm": 0.3630894422531128, + "learning_rate": 1.669552375977806e-05, + "loss": 0.2501, + "step": 14354 + }, + { + "epoch": 0.26652012639098777, + "grad_norm": 0.30100512504577637, + "learning_rate": 1.6694657285969843e-05, + "loss": 0.1759, + "step": 14356 + }, + { + "epoch": 0.2665572565284064, + "grad_norm": 0.44136178493499756, + "learning_rate": 1.6693790721067397e-05, + "loss": 0.1432, + "step": 14358 + }, + { + "epoch": 0.266594386665825, + "grad_norm": 0.48275068402290344, + "learning_rate": 1.6692924065082508e-05, + "loss": 0.3125, + "step": 14360 + }, + { + "epoch": 0.2666315168032437, + "grad_norm": 0.32505711913108826, + "learning_rate": 1.669205731802697e-05, + "loss": 0.1992, + "step": 14362 + }, + { + "epoch": 0.26666864694066233, + "grad_norm": 0.28460440039634705, + "learning_rate": 1.6691190479912574e-05, + "loss": 0.3663, + "step": 14364 + }, + { + "epoch": 0.26670577707808096, + "grad_norm": 0.4201560318470001, + "learning_rate": 1.6690323550751127e-05, + "loss": 0.4463, + "step": 14366 + }, + { + "epoch": 0.2667429072154996, + "grad_norm": 0.3393763601779938, + "learning_rate": 1.6689456530554407e-05, + "loss": 0.3195, + "step": 14368 + }, + { + "epoch": 0.2667800373529182, + "grad_norm": 0.3699619472026825, + "learning_rate": 1.6688589419334226e-05, + "loss": 0.2872, + "step": 14370 + }, + { + "epoch": 0.2668171674903369, + "grad_norm": 0.283236563205719, + "learning_rate": 1.6687722217102374e-05, + "loss": 0.3006, + "step": 14372 + }, + { + "epoch": 0.26685429762775553, + "grad_norm": 0.4910120964050293, + "learning_rate": 1.668685492387066e-05, + "loss": 0.4428, + "step": 14374 + }, + { + "epoch": 0.26689142776517416, + "grad_norm": 0.3207995295524597, + "learning_rate": 1.6685987539650878e-05, + "loss": 0.3421, + "step": 14376 + }, + { + "epoch": 0.2669285579025928, + "grad_norm": 0.41593748331069946, + "learning_rate": 1.668512006445483e-05, + "loss": 0.6078, + "step": 14378 + }, + { + "epoch": 0.2669656880400114, + "grad_norm": 0.2827526330947876, + "learning_rate": 1.6684252498294322e-05, + "loss": 0.2529, + "step": 14380 + }, + { + "epoch": 0.2670028181774301, + "grad_norm": 0.35570284724235535, + "learning_rate": 1.6683384841181162e-05, + "loss": 0.1784, + "step": 14382 + }, + { + "epoch": 0.26703994831484873, + "grad_norm": 0.3963766396045685, + "learning_rate": 1.668251709312715e-05, + "loss": 0.2537, + "step": 14384 + }, + { + "epoch": 0.26707707845226736, + "grad_norm": 0.3778403401374817, + "learning_rate": 1.66816492541441e-05, + "loss": 0.1463, + "step": 14386 + }, + { + "epoch": 0.267114208589686, + "grad_norm": 0.28598928451538086, + "learning_rate": 1.668078132424382e-05, + "loss": 0.2729, + "step": 14388 + }, + { + "epoch": 0.2671513387271046, + "grad_norm": 0.35990801453590393, + "learning_rate": 1.667991330343811e-05, + "loss": 0.2098, + "step": 14390 + }, + { + "epoch": 0.26718846886452324, + "grad_norm": 0.31900423765182495, + "learning_rate": 1.6679045191738793e-05, + "loss": 0.4023, + "step": 14392 + }, + { + "epoch": 0.2672255990019419, + "grad_norm": 0.35485732555389404, + "learning_rate": 1.6678176989157676e-05, + "loss": 0.304, + "step": 14394 + }, + { + "epoch": 0.26726272913936056, + "grad_norm": 0.37659889459609985, + "learning_rate": 1.6677308695706577e-05, + "loss": 0.4337, + "step": 14396 + }, + { + "epoch": 0.2672998592767792, + "grad_norm": 0.41286689043045044, + "learning_rate": 1.6676440311397302e-05, + "loss": 0.4646, + "step": 14398 + }, + { + "epoch": 0.2673369894141978, + "grad_norm": 0.28072118759155273, + "learning_rate": 1.6675571836241674e-05, + "loss": 0.2176, + "step": 14400 + }, + { + "epoch": 0.26737411955161644, + "grad_norm": 0.31832537055015564, + "learning_rate": 1.667470327025151e-05, + "loss": 0.3886, + "step": 14402 + }, + { + "epoch": 0.2674112496890351, + "grad_norm": 0.29668357968330383, + "learning_rate": 1.6673834613438627e-05, + "loss": 0.5288, + "step": 14404 + }, + { + "epoch": 0.26744837982645375, + "grad_norm": 0.19775919616222382, + "learning_rate": 1.6672965865814848e-05, + "loss": 0.325, + "step": 14406 + }, + { + "epoch": 0.2674855099638724, + "grad_norm": 0.41689276695251465, + "learning_rate": 1.6672097027391986e-05, + "loss": 0.2146, + "step": 14408 + }, + { + "epoch": 0.267522640101291, + "grad_norm": 0.44212502241134644, + "learning_rate": 1.6671228098181876e-05, + "loss": 0.4554, + "step": 14410 + }, + { + "epoch": 0.26755977023870964, + "grad_norm": 0.3626474440097809, + "learning_rate": 1.6670359078196327e-05, + "loss": 0.3896, + "step": 14412 + }, + { + "epoch": 0.26759690037612827, + "grad_norm": 0.6020227670669556, + "learning_rate": 1.6669489967447178e-05, + "loss": 0.3662, + "step": 14414 + }, + { + "epoch": 0.26763403051354695, + "grad_norm": 0.3484719693660736, + "learning_rate": 1.6668620765946242e-05, + "loss": 0.2158, + "step": 14416 + }, + { + "epoch": 0.2676711606509656, + "grad_norm": 0.24232150614261627, + "learning_rate": 1.6667751473705357e-05, + "loss": 0.255, + "step": 14418 + }, + { + "epoch": 0.2677082907883842, + "grad_norm": 0.6532554030418396, + "learning_rate": 1.6666882090736343e-05, + "loss": 0.3411, + "step": 14420 + }, + { + "epoch": 0.26774542092580284, + "grad_norm": 0.494770884513855, + "learning_rate": 1.6666012617051036e-05, + "loss": 0.3573, + "step": 14422 + }, + { + "epoch": 0.26778255106322146, + "grad_norm": 0.474211722612381, + "learning_rate": 1.666514305266126e-05, + "loss": 0.3398, + "step": 14424 + }, + { + "epoch": 0.26781968120064015, + "grad_norm": 0.3734486699104309, + "learning_rate": 1.6664273397578853e-05, + "loss": 0.3873, + "step": 14426 + }, + { + "epoch": 0.2678568113380588, + "grad_norm": 0.3601103127002716, + "learning_rate": 1.666340365181565e-05, + "loss": 0.3415, + "step": 14428 + }, + { + "epoch": 0.2678939414754774, + "grad_norm": 0.4700402319431305, + "learning_rate": 1.6662533815383482e-05, + "loss": 0.2047, + "step": 14430 + }, + { + "epoch": 0.26793107161289603, + "grad_norm": 0.3722496032714844, + "learning_rate": 1.666166388829418e-05, + "loss": 0.3805, + "step": 14432 + }, + { + "epoch": 0.26796820175031466, + "grad_norm": 0.16663134098052979, + "learning_rate": 1.666079387055959e-05, + "loss": 0.2927, + "step": 14434 + }, + { + "epoch": 0.2680053318877333, + "grad_norm": 0.36628684401512146, + "learning_rate": 1.665992376219155e-05, + "loss": 0.1648, + "step": 14436 + }, + { + "epoch": 0.268042462025152, + "grad_norm": 0.26776444911956787, + "learning_rate": 1.6659053563201896e-05, + "loss": 0.2641, + "step": 14438 + }, + { + "epoch": 0.2680795921625706, + "grad_norm": 0.31087130308151245, + "learning_rate": 1.6658183273602463e-05, + "loss": 0.398, + "step": 14440 + }, + { + "epoch": 0.26811672229998923, + "grad_norm": 0.3097818195819855, + "learning_rate": 1.6657312893405104e-05, + "loss": 0.3958, + "step": 14442 + }, + { + "epoch": 0.26815385243740786, + "grad_norm": 0.37057098746299744, + "learning_rate": 1.6656442422621658e-05, + "loss": 0.343, + "step": 14444 + }, + { + "epoch": 0.2681909825748265, + "grad_norm": 0.3174958825111389, + "learning_rate": 1.665557186126397e-05, + "loss": 0.3892, + "step": 14446 + }, + { + "epoch": 0.26822811271224517, + "grad_norm": 0.3206169903278351, + "learning_rate": 1.665470120934388e-05, + "loss": 0.2514, + "step": 14448 + }, + { + "epoch": 0.2682652428496638, + "grad_norm": 0.3004591166973114, + "learning_rate": 1.6653830466873243e-05, + "loss": 0.3424, + "step": 14450 + }, + { + "epoch": 0.2683023729870824, + "grad_norm": 0.36004695296287537, + "learning_rate": 1.66529596338639e-05, + "loss": 0.3814, + "step": 14452 + }, + { + "epoch": 0.26833950312450106, + "grad_norm": 0.3942979872226715, + "learning_rate": 1.6652088710327708e-05, + "loss": 0.3992, + "step": 14454 + }, + { + "epoch": 0.2683766332619197, + "grad_norm": 0.29094555974006653, + "learning_rate": 1.6651217696276513e-05, + "loss": 0.3038, + "step": 14456 + }, + { + "epoch": 0.26841376339933837, + "grad_norm": 0.34084823727607727, + "learning_rate": 1.6650346591722168e-05, + "loss": 0.393, + "step": 14458 + }, + { + "epoch": 0.268450893536757, + "grad_norm": 0.3628236651420593, + "learning_rate": 1.6649475396676526e-05, + "loss": 0.2301, + "step": 14460 + }, + { + "epoch": 0.2684880236741756, + "grad_norm": 0.4235619008541107, + "learning_rate": 1.6648604111151444e-05, + "loss": 0.5134, + "step": 14462 + }, + { + "epoch": 0.26852515381159425, + "grad_norm": 0.4813389778137207, + "learning_rate": 1.664773273515877e-05, + "loss": 0.2371, + "step": 14464 + }, + { + "epoch": 0.2685622839490129, + "grad_norm": 0.3011683523654938, + "learning_rate": 1.664686126871037e-05, + "loss": 0.2172, + "step": 14466 + }, + { + "epoch": 0.2685994140864315, + "grad_norm": 0.5195376873016357, + "learning_rate": 1.6645989711818092e-05, + "loss": 0.3891, + "step": 14468 + }, + { + "epoch": 0.2686365442238502, + "grad_norm": 0.3900006115436554, + "learning_rate": 1.6645118064493807e-05, + "loss": 0.4188, + "step": 14470 + }, + { + "epoch": 0.2686736743612688, + "grad_norm": 0.39062610268592834, + "learning_rate": 1.6644246326749368e-05, + "loss": 0.1722, + "step": 14472 + }, + { + "epoch": 0.26871080449868745, + "grad_norm": 0.40507540106773376, + "learning_rate": 1.6643374498596637e-05, + "loss": 0.3746, + "step": 14474 + }, + { + "epoch": 0.2687479346361061, + "grad_norm": 0.359060674905777, + "learning_rate": 1.664250258004748e-05, + "loss": 0.2503, + "step": 14476 + }, + { + "epoch": 0.2687850647735247, + "grad_norm": 0.2542099952697754, + "learning_rate": 1.664163057111376e-05, + "loss": 0.3822, + "step": 14478 + }, + { + "epoch": 0.2688221949109434, + "grad_norm": 0.38323724269866943, + "learning_rate": 1.6640758471807337e-05, + "loss": 0.4657, + "step": 14480 + }, + { + "epoch": 0.268859325048362, + "grad_norm": 0.2707110643386841, + "learning_rate": 1.6639886282140086e-05, + "loss": 0.4027, + "step": 14482 + }, + { + "epoch": 0.26889645518578065, + "grad_norm": 0.31322354078292847, + "learning_rate": 1.663901400212387e-05, + "loss": 0.2866, + "step": 14484 + }, + { + "epoch": 0.2689335853231993, + "grad_norm": 0.4387029707431793, + "learning_rate": 1.6638141631770564e-05, + "loss": 0.2521, + "step": 14486 + }, + { + "epoch": 0.2689707154606179, + "grad_norm": 0.33834144473075867, + "learning_rate": 1.663726917109203e-05, + "loss": 0.3537, + "step": 14488 + }, + { + "epoch": 0.26900784559803653, + "grad_norm": 0.432029664516449, + "learning_rate": 1.6636396620100144e-05, + "loss": 0.2071, + "step": 14490 + }, + { + "epoch": 0.2690449757354552, + "grad_norm": 0.33301714062690735, + "learning_rate": 1.663552397880678e-05, + "loss": 0.1605, + "step": 14492 + }, + { + "epoch": 0.26908210587287384, + "grad_norm": 0.3962661325931549, + "learning_rate": 1.663465124722381e-05, + "loss": 0.3266, + "step": 14494 + }, + { + "epoch": 0.2691192360102925, + "grad_norm": 0.41214829683303833, + "learning_rate": 1.6633778425363107e-05, + "loss": 0.3467, + "step": 14496 + }, + { + "epoch": 0.2691563661477111, + "grad_norm": 1.6523618698120117, + "learning_rate": 1.6632905513236553e-05, + "loss": 0.2335, + "step": 14498 + }, + { + "epoch": 0.26919349628512973, + "grad_norm": 0.41188904643058777, + "learning_rate": 1.663203251085602e-05, + "loss": 0.4452, + "step": 14500 + }, + { + "epoch": 0.2692306264225484, + "grad_norm": 0.39941343665122986, + "learning_rate": 1.6631159418233392e-05, + "loss": 0.2934, + "step": 14502 + }, + { + "epoch": 0.26926775655996704, + "grad_norm": 0.3252164423465729, + "learning_rate": 1.6630286235380546e-05, + "loss": 0.4299, + "step": 14504 + }, + { + "epoch": 0.26930488669738567, + "grad_norm": 0.38180050253868103, + "learning_rate": 1.6629412962309364e-05, + "loss": 0.322, + "step": 14506 + }, + { + "epoch": 0.2693420168348043, + "grad_norm": 0.4627339243888855, + "learning_rate": 1.662853959903173e-05, + "loss": 0.3768, + "step": 14508 + }, + { + "epoch": 0.2693791469722229, + "grad_norm": 0.2700619101524353, + "learning_rate": 1.6627666145559526e-05, + "loss": 0.3165, + "step": 14510 + }, + { + "epoch": 0.26941627710964156, + "grad_norm": 0.3088364005088806, + "learning_rate": 1.6626792601904637e-05, + "loss": 0.2917, + "step": 14512 + }, + { + "epoch": 0.26945340724706024, + "grad_norm": 0.45101645588874817, + "learning_rate": 1.662591896807895e-05, + "loss": 0.4708, + "step": 14514 + }, + { + "epoch": 0.26949053738447887, + "grad_norm": 0.28268539905548096, + "learning_rate": 1.6625045244094357e-05, + "loss": 0.2777, + "step": 14516 + }, + { + "epoch": 0.2695276675218975, + "grad_norm": 0.2866145074367523, + "learning_rate": 1.662417142996274e-05, + "loss": 0.1991, + "step": 14518 + }, + { + "epoch": 0.2695647976593161, + "grad_norm": 0.28510549664497375, + "learning_rate": 1.662329752569599e-05, + "loss": 0.3238, + "step": 14520 + }, + { + "epoch": 0.26960192779673475, + "grad_norm": 0.35713887214660645, + "learning_rate": 1.6622423531306005e-05, + "loss": 0.3528, + "step": 14522 + }, + { + "epoch": 0.26963905793415344, + "grad_norm": 0.43582355976104736, + "learning_rate": 1.6621549446804666e-05, + "loss": 0.3047, + "step": 14524 + }, + { + "epoch": 0.26967618807157206, + "grad_norm": 0.36458003520965576, + "learning_rate": 1.6620675272203873e-05, + "loss": 0.3094, + "step": 14526 + }, + { + "epoch": 0.2697133182089907, + "grad_norm": 0.6691232323646545, + "learning_rate": 1.6619801007515523e-05, + "loss": 0.4771, + "step": 14528 + }, + { + "epoch": 0.2697504483464093, + "grad_norm": 0.36070874333381653, + "learning_rate": 1.6618926652751513e-05, + "loss": 0.3995, + "step": 14530 + }, + { + "epoch": 0.26978757848382795, + "grad_norm": 0.41977420449256897, + "learning_rate": 1.661805220792373e-05, + "loss": 0.3016, + "step": 14532 + }, + { + "epoch": 0.26982470862124663, + "grad_norm": 0.42717525362968445, + "learning_rate": 1.6617177673044087e-05, + "loss": 0.1057, + "step": 14534 + }, + { + "epoch": 0.26986183875866526, + "grad_norm": 0.3802667260169983, + "learning_rate": 1.6616303048124475e-05, + "loss": 0.2439, + "step": 14536 + }, + { + "epoch": 0.2698989688960839, + "grad_norm": 0.2649378180503845, + "learning_rate": 1.661542833317679e-05, + "loss": 0.3944, + "step": 14538 + }, + { + "epoch": 0.2699360990335025, + "grad_norm": 0.3610313832759857, + "learning_rate": 1.661455352821295e-05, + "loss": 0.2261, + "step": 14540 + }, + { + "epoch": 0.26997322917092115, + "grad_norm": 0.4174010753631592, + "learning_rate": 1.6613678633244846e-05, + "loss": 0.291, + "step": 14542 + }, + { + "epoch": 0.2700103593083398, + "grad_norm": 0.3797670900821686, + "learning_rate": 1.6612803648284384e-05, + "loss": 0.2449, + "step": 14544 + }, + { + "epoch": 0.27004748944575846, + "grad_norm": 0.28193598985671997, + "learning_rate": 1.6611928573343476e-05, + "loss": 0.3682, + "step": 14546 + }, + { + "epoch": 0.2700846195831771, + "grad_norm": 0.3032357692718506, + "learning_rate": 1.661105340843402e-05, + "loss": 0.3452, + "step": 14548 + }, + { + "epoch": 0.2701217497205957, + "grad_norm": 0.38184359669685364, + "learning_rate": 1.6610178153567933e-05, + "loss": 0.4803, + "step": 14550 + }, + { + "epoch": 0.27015887985801434, + "grad_norm": 0.44204840064048767, + "learning_rate": 1.660930280875712e-05, + "loss": 0.3449, + "step": 14552 + }, + { + "epoch": 0.270196009995433, + "grad_norm": 0.3766881227493286, + "learning_rate": 1.6608427374013495e-05, + "loss": 0.321, + "step": 14554 + }, + { + "epoch": 0.27023314013285166, + "grad_norm": 0.3881608843803406, + "learning_rate": 1.6607551849348965e-05, + "loss": 0.3521, + "step": 14556 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.27986642718315125, + "learning_rate": 1.6606676234775445e-05, + "loss": 0.3576, + "step": 14558 + }, + { + "epoch": 0.2703074004076889, + "grad_norm": 0.3293085992336273, + "learning_rate": 1.6605800530304854e-05, + "loss": 0.3608, + "step": 14560 + }, + { + "epoch": 0.27034453054510754, + "grad_norm": 0.36550790071487427, + "learning_rate": 1.6604924735949106e-05, + "loss": 0.2943, + "step": 14562 + }, + { + "epoch": 0.27038166068252617, + "grad_norm": 0.39974626898765564, + "learning_rate": 1.6604048851720113e-05, + "loss": 0.4125, + "step": 14564 + }, + { + "epoch": 0.2704187908199448, + "grad_norm": 0.6848732233047485, + "learning_rate": 1.6603172877629797e-05, + "loss": 0.3885, + "step": 14566 + }, + { + "epoch": 0.2704559209573635, + "grad_norm": 0.26436659693717957, + "learning_rate": 1.660229681369008e-05, + "loss": 0.2739, + "step": 14568 + }, + { + "epoch": 0.2704930510947821, + "grad_norm": 0.6374115347862244, + "learning_rate": 1.660142065991287e-05, + "loss": 0.3012, + "step": 14570 + }, + { + "epoch": 0.27053018123220074, + "grad_norm": 0.32974353432655334, + "learning_rate": 1.660054441631011e-05, + "loss": 0.3084, + "step": 14572 + }, + { + "epoch": 0.27056731136961937, + "grad_norm": 1.8442188501358032, + "learning_rate": 1.6599668082893706e-05, + "loss": 0.3309, + "step": 14574 + }, + { + "epoch": 0.270604441507038, + "grad_norm": 0.41182005405426025, + "learning_rate": 1.6598791659675588e-05, + "loss": 0.3407, + "step": 14576 + }, + { + "epoch": 0.2706415716444567, + "grad_norm": 0.31686636805534363, + "learning_rate": 1.6597915146667682e-05, + "loss": 0.3996, + "step": 14578 + }, + { + "epoch": 0.2706787017818753, + "grad_norm": 0.38003748655319214, + "learning_rate": 1.6597038543881913e-05, + "loss": 0.2377, + "step": 14580 + }, + { + "epoch": 0.27071583191929394, + "grad_norm": 0.45389607548713684, + "learning_rate": 1.6596161851330212e-05, + "loss": 0.4114, + "step": 14582 + }, + { + "epoch": 0.27075296205671256, + "grad_norm": 0.32193097472190857, + "learning_rate": 1.6595285069024504e-05, + "loss": 0.2405, + "step": 14584 + }, + { + "epoch": 0.2707900921941312, + "grad_norm": 0.18964552879333496, + "learning_rate": 1.6594408196976723e-05, + "loss": 0.3091, + "step": 14586 + }, + { + "epoch": 0.2708272223315498, + "grad_norm": 0.40375423431396484, + "learning_rate": 1.6593531235198797e-05, + "loss": 0.34, + "step": 14588 + }, + { + "epoch": 0.2708643524689685, + "grad_norm": 0.5508294105529785, + "learning_rate": 1.659265418370266e-05, + "loss": 0.2781, + "step": 14590 + }, + { + "epoch": 0.27090148260638713, + "grad_norm": 0.39986830949783325, + "learning_rate": 1.6591777042500247e-05, + "loss": 0.4121, + "step": 14592 + }, + { + "epoch": 0.27093861274380576, + "grad_norm": 0.3179416060447693, + "learning_rate": 1.6590899811603495e-05, + "loss": 0.2055, + "step": 14594 + }, + { + "epoch": 0.2709757428812244, + "grad_norm": 0.3477843105792999, + "learning_rate": 1.6590022491024338e-05, + "loss": 0.314, + "step": 14596 + }, + { + "epoch": 0.271012873018643, + "grad_norm": 0.2407577633857727, + "learning_rate": 1.6589145080774715e-05, + "loss": 0.0664, + "step": 14598 + }, + { + "epoch": 0.2710500031560617, + "grad_norm": 0.2912905514240265, + "learning_rate": 1.6588267580866564e-05, + "loss": 0.3466, + "step": 14600 + }, + { + "epoch": 0.27108713329348033, + "grad_norm": 0.3874660134315491, + "learning_rate": 1.6587389991311823e-05, + "loss": 0.2334, + "step": 14602 + }, + { + "epoch": 0.27112426343089896, + "grad_norm": 0.28356364369392395, + "learning_rate": 1.658651231212244e-05, + "loss": 0.2841, + "step": 14604 + }, + { + "epoch": 0.2711613935683176, + "grad_norm": 0.3955436050891876, + "learning_rate": 1.658563454331035e-05, + "loss": 0.3338, + "step": 14606 + }, + { + "epoch": 0.2711985237057362, + "grad_norm": 0.5678128600120544, + "learning_rate": 1.65847566848875e-05, + "loss": 0.2635, + "step": 14608 + }, + { + "epoch": 0.2712356538431549, + "grad_norm": 0.36759284138679504, + "learning_rate": 1.658387873686584e-05, + "loss": 0.4514, + "step": 14610 + }, + { + "epoch": 0.27127278398057353, + "grad_norm": 0.19377611577510834, + "learning_rate": 1.6583000699257306e-05, + "loss": 0.3425, + "step": 14612 + }, + { + "epoch": 0.27130991411799216, + "grad_norm": 0.23808307945728302, + "learning_rate": 1.658212257207385e-05, + "loss": 0.2833, + "step": 14614 + }, + { + "epoch": 0.2713470442554108, + "grad_norm": 0.4442470669746399, + "learning_rate": 1.6581244355327425e-05, + "loss": 0.2863, + "step": 14616 + }, + { + "epoch": 0.2713841743928294, + "grad_norm": 0.7035424709320068, + "learning_rate": 1.6580366049029975e-05, + "loss": 0.4233, + "step": 14618 + }, + { + "epoch": 0.27142130453024804, + "grad_norm": 0.794154703617096, + "learning_rate": 1.6579487653193456e-05, + "loss": 0.3395, + "step": 14620 + }, + { + "epoch": 0.2714584346676667, + "grad_norm": 0.23039385676383972, + "learning_rate": 1.657860916782982e-05, + "loss": 0.1868, + "step": 14622 + }, + { + "epoch": 0.27149556480508535, + "grad_norm": 0.4515477418899536, + "learning_rate": 1.6577730592951013e-05, + "loss": 0.189, + "step": 14624 + }, + { + "epoch": 0.271532694942504, + "grad_norm": 0.37406814098358154, + "learning_rate": 1.6576851928569e-05, + "loss": 0.1874, + "step": 14626 + }, + { + "epoch": 0.2715698250799226, + "grad_norm": 0.3430345058441162, + "learning_rate": 1.6575973174695725e-05, + "loss": 0.4586, + "step": 14628 + }, + { + "epoch": 0.27160695521734124, + "grad_norm": 0.4775097966194153, + "learning_rate": 1.657509433134316e-05, + "loss": 0.4951, + "step": 14630 + }, + { + "epoch": 0.2716440853547599, + "grad_norm": 0.31621044874191284, + "learning_rate": 1.657421539852325e-05, + "loss": 0.3502, + "step": 14632 + }, + { + "epoch": 0.27168121549217855, + "grad_norm": 0.4372848570346832, + "learning_rate": 1.6573336376247967e-05, + "loss": 0.2372, + "step": 14634 + }, + { + "epoch": 0.2717183456295972, + "grad_norm": 0.5204339623451233, + "learning_rate": 1.657245726452926e-05, + "loss": 0.3773, + "step": 14636 + }, + { + "epoch": 0.2717554757670158, + "grad_norm": 0.5138462781906128, + "learning_rate": 1.65715780633791e-05, + "loss": 0.3449, + "step": 14638 + }, + { + "epoch": 0.27179260590443444, + "grad_norm": 0.38314974308013916, + "learning_rate": 1.6570698772809444e-05, + "loss": 0.2474, + "step": 14640 + }, + { + "epoch": 0.27182973604185307, + "grad_norm": 0.47644051909446716, + "learning_rate": 1.6569819392832262e-05, + "loss": 0.1593, + "step": 14642 + }, + { + "epoch": 0.27186686617927175, + "grad_norm": 0.4106592535972595, + "learning_rate": 1.6568939923459514e-05, + "loss": 0.3589, + "step": 14644 + }, + { + "epoch": 0.2719039963166904, + "grad_norm": 0.416670560836792, + "learning_rate": 1.656806036470317e-05, + "loss": 0.1903, + "step": 14646 + }, + { + "epoch": 0.271941126454109, + "grad_norm": 0.5414636135101318, + "learning_rate": 1.6567180716575198e-05, + "loss": 0.3285, + "step": 14648 + }, + { + "epoch": 0.27197825659152763, + "grad_norm": 0.29910486936569214, + "learning_rate": 1.656630097908757e-05, + "loss": 0.5219, + "step": 14650 + }, + { + "epoch": 0.27201538672894626, + "grad_norm": 0.36363092064857483, + "learning_rate": 1.656542115225225e-05, + "loss": 0.3364, + "step": 14652 + }, + { + "epoch": 0.27205251686636495, + "grad_norm": 0.6942903399467468, + "learning_rate": 1.6564541236081217e-05, + "loss": 0.2863, + "step": 14654 + }, + { + "epoch": 0.2720896470037836, + "grad_norm": 0.5779421329498291, + "learning_rate": 1.656366123058644e-05, + "loss": 0.3508, + "step": 14656 + }, + { + "epoch": 0.2721267771412022, + "grad_norm": 0.45575886964797974, + "learning_rate": 1.6562781135779898e-05, + "loss": 0.2758, + "step": 14658 + }, + { + "epoch": 0.27216390727862083, + "grad_norm": 0.3178647756576538, + "learning_rate": 1.6561900951673556e-05, + "loss": 0.1936, + "step": 14660 + }, + { + "epoch": 0.27220103741603946, + "grad_norm": 0.3453634977340698, + "learning_rate": 1.65610206782794e-05, + "loss": 0.306, + "step": 14662 + }, + { + "epoch": 0.2722381675534581, + "grad_norm": 0.3277454078197479, + "learning_rate": 1.6560140315609406e-05, + "loss": 0.3499, + "step": 14664 + }, + { + "epoch": 0.27227529769087677, + "grad_norm": 0.3271265923976898, + "learning_rate": 1.6559259863675553e-05, + "loss": 0.1561, + "step": 14666 + }, + { + "epoch": 0.2723124278282954, + "grad_norm": 0.3544183671474457, + "learning_rate": 1.6558379322489817e-05, + "loss": 0.1541, + "step": 14668 + }, + { + "epoch": 0.27234955796571403, + "grad_norm": 0.260212779045105, + "learning_rate": 1.6557498692064187e-05, + "loss": 0.501, + "step": 14670 + }, + { + "epoch": 0.27238668810313266, + "grad_norm": 0.3945387005805969, + "learning_rate": 1.655661797241064e-05, + "loss": 0.4064, + "step": 14672 + }, + { + "epoch": 0.2724238182405513, + "grad_norm": 0.3963828980922699, + "learning_rate": 1.655573716354116e-05, + "loss": 0.2909, + "step": 14674 + }, + { + "epoch": 0.27246094837796997, + "grad_norm": 0.3975825309753418, + "learning_rate": 1.6554856265467735e-05, + "loss": 0.4676, + "step": 14676 + }, + { + "epoch": 0.2724980785153886, + "grad_norm": 0.37554121017456055, + "learning_rate": 1.655397527820235e-05, + "loss": 0.2851, + "step": 14678 + }, + { + "epoch": 0.2725352086528072, + "grad_norm": 0.42874693870544434, + "learning_rate": 1.6553094201756996e-05, + "loss": 0.4541, + "step": 14680 + }, + { + "epoch": 0.27257233879022585, + "grad_norm": 0.3809939920902252, + "learning_rate": 1.6552213036143654e-05, + "loss": 0.3199, + "step": 14682 + }, + { + "epoch": 0.2726094689276445, + "grad_norm": 0.3553813695907593, + "learning_rate": 1.655133178137432e-05, + "loss": 0.4257, + "step": 14684 + }, + { + "epoch": 0.27264659906506317, + "grad_norm": 0.2711872160434723, + "learning_rate": 1.6550450437460987e-05, + "loss": 0.3231, + "step": 14686 + }, + { + "epoch": 0.2726837292024818, + "grad_norm": 0.27073177695274353, + "learning_rate": 1.654956900441564e-05, + "loss": 0.2255, + "step": 14688 + }, + { + "epoch": 0.2727208593399004, + "grad_norm": 0.35050705075263977, + "learning_rate": 1.654868748225028e-05, + "loss": 0.4098, + "step": 14690 + }, + { + "epoch": 0.27275798947731905, + "grad_norm": 0.33652275800704956, + "learning_rate": 1.65478058709769e-05, + "loss": 0.4278, + "step": 14692 + }, + { + "epoch": 0.2727951196147377, + "grad_norm": 0.4140445291996002, + "learning_rate": 1.6546924170607494e-05, + "loss": 0.3922, + "step": 14694 + }, + { + "epoch": 0.2728322497521563, + "grad_norm": 0.3835141360759735, + "learning_rate": 1.6546042381154057e-05, + "loss": 0.3385, + "step": 14696 + }, + { + "epoch": 0.272869379889575, + "grad_norm": 0.46845513582229614, + "learning_rate": 1.6545160502628595e-05, + "loss": 0.3181, + "step": 14698 + }, + { + "epoch": 0.2729065100269936, + "grad_norm": 0.38605761528015137, + "learning_rate": 1.6544278535043103e-05, + "loss": 0.2867, + "step": 14700 + }, + { + "epoch": 0.27294364016441225, + "grad_norm": 0.5140795707702637, + "learning_rate": 1.6543396478409583e-05, + "loss": 0.1996, + "step": 14702 + }, + { + "epoch": 0.2729807703018309, + "grad_norm": 0.2288786917924881, + "learning_rate": 1.6542514332740035e-05, + "loss": 0.0983, + "step": 14704 + }, + { + "epoch": 0.2730179004392495, + "grad_norm": 0.3231593370437622, + "learning_rate": 1.6541632098046464e-05, + "loss": 0.4028, + "step": 14706 + }, + { + "epoch": 0.2730550305766682, + "grad_norm": 0.24441130459308624, + "learning_rate": 1.6540749774340874e-05, + "loss": 0.3805, + "step": 14708 + }, + { + "epoch": 0.2730921607140868, + "grad_norm": 0.27992236614227295, + "learning_rate": 1.6539867361635272e-05, + "loss": 0.3422, + "step": 14710 + }, + { + "epoch": 0.27312929085150545, + "grad_norm": 0.45079299807548523, + "learning_rate": 1.6538984859941667e-05, + "loss": 0.3394, + "step": 14712 + }, + { + "epoch": 0.2731664209889241, + "grad_norm": 0.3355009853839874, + "learning_rate": 1.653810226927206e-05, + "loss": 0.1901, + "step": 14714 + }, + { + "epoch": 0.2732035511263427, + "grad_norm": 0.3348577618598938, + "learning_rate": 1.6537219589638466e-05, + "loss": 0.2437, + "step": 14716 + }, + { + "epoch": 0.27324068126376133, + "grad_norm": 0.5790876150131226, + "learning_rate": 1.65363368210529e-05, + "loss": 0.3334, + "step": 14718 + }, + { + "epoch": 0.27327781140118, + "grad_norm": 0.3333645761013031, + "learning_rate": 1.6535453963527363e-05, + "loss": 0.2668, + "step": 14720 + }, + { + "epoch": 0.27331494153859864, + "grad_norm": 0.34725311398506165, + "learning_rate": 1.6534571017073873e-05, + "loss": 0.3187, + "step": 14722 + }, + { + "epoch": 0.27335207167601727, + "grad_norm": 0.2609018385410309, + "learning_rate": 1.653368798170445e-05, + "loss": 0.3864, + "step": 14724 + }, + { + "epoch": 0.2733892018134359, + "grad_norm": 0.8517078161239624, + "learning_rate": 1.65328048574311e-05, + "loss": 0.4865, + "step": 14726 + }, + { + "epoch": 0.27342633195085453, + "grad_norm": 0.29631832242012024, + "learning_rate": 1.6531921644265844e-05, + "loss": 0.319, + "step": 14728 + }, + { + "epoch": 0.2734634620882732, + "grad_norm": 0.45585134625434875, + "learning_rate": 1.65310383422207e-05, + "loss": 0.3505, + "step": 14730 + }, + { + "epoch": 0.27350059222569184, + "grad_norm": 0.2846459150314331, + "learning_rate": 1.6530154951307688e-05, + "loss": 0.2223, + "step": 14732 + }, + { + "epoch": 0.27353772236311047, + "grad_norm": 0.2941604256629944, + "learning_rate": 1.6529271471538825e-05, + "loss": 0.2407, + "step": 14734 + }, + { + "epoch": 0.2735748525005291, + "grad_norm": 0.4237828254699707, + "learning_rate": 1.6528387902926138e-05, + "loss": 0.4774, + "step": 14736 + }, + { + "epoch": 0.2736119826379477, + "grad_norm": 0.34062060713768005, + "learning_rate": 1.6527504245481646e-05, + "loss": 0.3844, + "step": 14738 + }, + { + "epoch": 0.27364911277536635, + "grad_norm": 0.33982667326927185, + "learning_rate": 1.6526620499217373e-05, + "loss": 0.3503, + "step": 14740 + }, + { + "epoch": 0.27368624291278504, + "grad_norm": 0.34856364130973816, + "learning_rate": 1.6525736664145342e-05, + "loss": 0.2474, + "step": 14742 + }, + { + "epoch": 0.27372337305020367, + "grad_norm": 0.34855902194976807, + "learning_rate": 1.6524852740277584e-05, + "loss": 0.2024, + "step": 14744 + }, + { + "epoch": 0.2737605031876223, + "grad_norm": 0.35187119245529175, + "learning_rate": 1.6523968727626125e-05, + "loss": 0.2365, + "step": 14746 + }, + { + "epoch": 0.2737976333250409, + "grad_norm": 0.39861640334129333, + "learning_rate": 1.652308462620299e-05, + "loss": 0.3052, + "step": 14748 + }, + { + "epoch": 0.27383476346245955, + "grad_norm": 0.31246644258499146, + "learning_rate": 1.6522200436020214e-05, + "loss": 0.308, + "step": 14750 + }, + { + "epoch": 0.27387189359987824, + "grad_norm": 0.29504647850990295, + "learning_rate": 1.6521316157089827e-05, + "loss": 0.3676, + "step": 14752 + }, + { + "epoch": 0.27390902373729686, + "grad_norm": 0.4543309807777405, + "learning_rate": 1.652043178942386e-05, + "loss": 0.2578, + "step": 14754 + }, + { + "epoch": 0.2739461538747155, + "grad_norm": 0.6466224193572998, + "learning_rate": 1.651954733303435e-05, + "loss": 0.1504, + "step": 14756 + }, + { + "epoch": 0.2739832840121341, + "grad_norm": 0.30616524815559387, + "learning_rate": 1.6518662787933325e-05, + "loss": 0.5037, + "step": 14758 + }, + { + "epoch": 0.27402041414955275, + "grad_norm": 0.27346140146255493, + "learning_rate": 1.651777815413283e-05, + "loss": 0.4006, + "step": 14760 + }, + { + "epoch": 0.27405754428697143, + "grad_norm": 0.3785805106163025, + "learning_rate": 1.6516893431644892e-05, + "loss": 0.2291, + "step": 14762 + }, + { + "epoch": 0.27409467442439006, + "grad_norm": 0.2849050462245941, + "learning_rate": 1.651600862048156e-05, + "loss": 0.2911, + "step": 14764 + }, + { + "epoch": 0.2741318045618087, + "grad_norm": 0.326878160238266, + "learning_rate": 1.6515123720654862e-05, + "loss": 0.2718, + "step": 14766 + }, + { + "epoch": 0.2741689346992273, + "grad_norm": 0.334862619638443, + "learning_rate": 1.6514238732176847e-05, + "loss": 0.1525, + "step": 14768 + }, + { + "epoch": 0.27420606483664595, + "grad_norm": 0.3678429126739502, + "learning_rate": 1.651335365505956e-05, + "loss": 0.3416, + "step": 14770 + }, + { + "epoch": 0.2742431949740646, + "grad_norm": 0.5865926742553711, + "learning_rate": 1.6512468489315034e-05, + "loss": 0.2622, + "step": 14772 + }, + { + "epoch": 0.27428032511148326, + "grad_norm": 0.4681185483932495, + "learning_rate": 1.651158323495532e-05, + "loss": 0.3453, + "step": 14774 + }, + { + "epoch": 0.2743174552489019, + "grad_norm": 0.38823240995407104, + "learning_rate": 1.6510697891992466e-05, + "loss": 0.4259, + "step": 14776 + }, + { + "epoch": 0.2743545853863205, + "grad_norm": 0.23671135306358337, + "learning_rate": 1.6509812460438513e-05, + "loss": 0.3245, + "step": 14778 + }, + { + "epoch": 0.27439171552373914, + "grad_norm": 0.36960306763648987, + "learning_rate": 1.6508926940305513e-05, + "loss": 0.2836, + "step": 14780 + }, + { + "epoch": 0.27442884566115777, + "grad_norm": 0.349090576171875, + "learning_rate": 1.6508041331605512e-05, + "loss": 0.336, + "step": 14782 + }, + { + "epoch": 0.27446597579857646, + "grad_norm": 0.28353485465049744, + "learning_rate": 1.6507155634350564e-05, + "loss": 0.2208, + "step": 14784 + }, + { + "epoch": 0.2745031059359951, + "grad_norm": 0.5452004075050354, + "learning_rate": 1.6506269848552718e-05, + "loss": 0.3043, + "step": 14786 + }, + { + "epoch": 0.2745402360734137, + "grad_norm": 0.2384907752275467, + "learning_rate": 1.6505383974224028e-05, + "loss": 0.3658, + "step": 14788 + }, + { + "epoch": 0.27457736621083234, + "grad_norm": 0.31997108459472656, + "learning_rate": 1.6504498011376546e-05, + "loss": 0.3844, + "step": 14790 + }, + { + "epoch": 0.27461449634825097, + "grad_norm": 0.32168155908584595, + "learning_rate": 1.6503611960022334e-05, + "loss": 0.4847, + "step": 14792 + }, + { + "epoch": 0.2746516264856696, + "grad_norm": 0.5161004066467285, + "learning_rate": 1.650272582017344e-05, + "loss": 0.359, + "step": 14794 + }, + { + "epoch": 0.2746887566230883, + "grad_norm": 0.33702290058135986, + "learning_rate": 1.6501839591841926e-05, + "loss": 0.3595, + "step": 14796 + }, + { + "epoch": 0.2747258867605069, + "grad_norm": 0.3972940742969513, + "learning_rate": 1.650095327503985e-05, + "loss": 0.3269, + "step": 14798 + }, + { + "epoch": 0.27476301689792554, + "grad_norm": 0.2500225603580475, + "learning_rate": 1.6500066869779273e-05, + "loss": 0.2834, + "step": 14800 + }, + { + "epoch": 0.27480014703534417, + "grad_norm": 0.3220600485801697, + "learning_rate": 1.649918037607226e-05, + "loss": 0.2903, + "step": 14802 + }, + { + "epoch": 0.2748372771727628, + "grad_norm": 0.3312821090221405, + "learning_rate": 1.6498293793930865e-05, + "loss": 0.173, + "step": 14804 + }, + { + "epoch": 0.2748744073101815, + "grad_norm": 0.43873366713523865, + "learning_rate": 1.6497407123367155e-05, + "loss": 0.2093, + "step": 14806 + }, + { + "epoch": 0.2749115374476001, + "grad_norm": 0.4325542151927948, + "learning_rate": 1.64965203643932e-05, + "loss": 0.3874, + "step": 14808 + }, + { + "epoch": 0.27494866758501874, + "grad_norm": 0.2475588470697403, + "learning_rate": 1.649563351702106e-05, + "loss": 0.4355, + "step": 14810 + }, + { + "epoch": 0.27498579772243736, + "grad_norm": 0.48887360095977783, + "learning_rate": 1.6494746581262803e-05, + "loss": 0.2994, + "step": 14812 + }, + { + "epoch": 0.275022927859856, + "grad_norm": 0.36854198575019836, + "learning_rate": 1.64938595571305e-05, + "loss": 0.4927, + "step": 14814 + }, + { + "epoch": 0.2750600579972746, + "grad_norm": 0.3896780014038086, + "learning_rate": 1.6492972444636223e-05, + "loss": 0.2577, + "step": 14816 + }, + { + "epoch": 0.2750971881346933, + "grad_norm": 0.2508016526699066, + "learning_rate": 1.6492085243792037e-05, + "loss": 0.1243, + "step": 14818 + }, + { + "epoch": 0.27513431827211193, + "grad_norm": 0.4492914378643036, + "learning_rate": 1.6491197954610015e-05, + "loss": 0.1922, + "step": 14820 + }, + { + "epoch": 0.27517144840953056, + "grad_norm": 0.2753238379955292, + "learning_rate": 1.6490310577102234e-05, + "loss": 0.3277, + "step": 14822 + }, + { + "epoch": 0.2752085785469492, + "grad_norm": 0.20805779099464417, + "learning_rate": 1.6489423111280767e-05, + "loss": 0.267, + "step": 14824 + }, + { + "epoch": 0.2752457086843678, + "grad_norm": 0.4545464813709259, + "learning_rate": 1.648853555715769e-05, + "loss": 0.2475, + "step": 14826 + }, + { + "epoch": 0.2752828388217865, + "grad_norm": 0.4415128231048584, + "learning_rate": 1.648764791474508e-05, + "loss": 0.4062, + "step": 14828 + }, + { + "epoch": 0.27531996895920513, + "grad_norm": 0.4637845754623413, + "learning_rate": 1.648676018405501e-05, + "loss": 0.2359, + "step": 14830 + }, + { + "epoch": 0.27535709909662376, + "grad_norm": 0.37481415271759033, + "learning_rate": 1.648587236509957e-05, + "loss": 0.2189, + "step": 14832 + }, + { + "epoch": 0.2753942292340424, + "grad_norm": 0.5689775347709656, + "learning_rate": 1.648498445789083e-05, + "loss": 0.3498, + "step": 14834 + }, + { + "epoch": 0.275431359371461, + "grad_norm": 0.45561549067497253, + "learning_rate": 1.6484096462440883e-05, + "loss": 0.1395, + "step": 14836 + }, + { + "epoch": 0.2754684895088797, + "grad_norm": 0.23857736587524414, + "learning_rate": 1.64832083787618e-05, + "loss": 0.2851, + "step": 14838 + }, + { + "epoch": 0.2755056196462983, + "grad_norm": 0.3643311858177185, + "learning_rate": 1.648232020686567e-05, + "loss": 0.4135, + "step": 14840 + }, + { + "epoch": 0.27554274978371696, + "grad_norm": 0.28163841366767883, + "learning_rate": 1.6481431946764578e-05, + "loss": 0.2765, + "step": 14842 + }, + { + "epoch": 0.2755798799211356, + "grad_norm": 0.4603663682937622, + "learning_rate": 1.6480543598470617e-05, + "loss": 0.3075, + "step": 14844 + }, + { + "epoch": 0.2756170100585542, + "grad_norm": 0.32451507449150085, + "learning_rate": 1.6479655161995865e-05, + "loss": 0.2501, + "step": 14846 + }, + { + "epoch": 0.27565414019597284, + "grad_norm": 0.39576223492622375, + "learning_rate": 1.6478766637352414e-05, + "loss": 0.3298, + "step": 14848 + }, + { + "epoch": 0.2756912703333915, + "grad_norm": 0.32359379529953003, + "learning_rate": 1.6477878024552358e-05, + "loss": 0.3423, + "step": 14850 + }, + { + "epoch": 0.27572840047081015, + "grad_norm": 0.4463805854320526, + "learning_rate": 1.6476989323607784e-05, + "loss": 0.3104, + "step": 14852 + }, + { + "epoch": 0.2757655306082288, + "grad_norm": 0.25558438897132874, + "learning_rate": 1.6476100534530787e-05, + "loss": 0.346, + "step": 14854 + }, + { + "epoch": 0.2758026607456474, + "grad_norm": 0.3127457797527313, + "learning_rate": 1.6475211657333455e-05, + "loss": 0.2887, + "step": 14856 + }, + { + "epoch": 0.27583979088306604, + "grad_norm": 0.2876504063606262, + "learning_rate": 1.6474322692027896e-05, + "loss": 0.3465, + "step": 14858 + }, + { + "epoch": 0.2758769210204847, + "grad_norm": 0.3852679431438446, + "learning_rate": 1.6473433638626194e-05, + "loss": 0.4883, + "step": 14860 + }, + { + "epoch": 0.27591405115790335, + "grad_norm": 0.4314744174480438, + "learning_rate": 1.647254449714045e-05, + "loss": 0.335, + "step": 14862 + }, + { + "epoch": 0.275951181295322, + "grad_norm": 0.25188130140304565, + "learning_rate": 1.6471655267582764e-05, + "loss": 0.2775, + "step": 14864 + }, + { + "epoch": 0.2759883114327406, + "grad_norm": 0.4336928129196167, + "learning_rate": 1.6470765949965234e-05, + "loss": 0.3647, + "step": 14866 + }, + { + "epoch": 0.27602544157015924, + "grad_norm": 0.34858429431915283, + "learning_rate": 1.646987654429996e-05, + "loss": 0.4555, + "step": 14868 + }, + { + "epoch": 0.27606257170757786, + "grad_norm": 0.6125998497009277, + "learning_rate": 1.646898705059905e-05, + "loss": 0.1871, + "step": 14870 + }, + { + "epoch": 0.27609970184499655, + "grad_norm": 0.5572607517242432, + "learning_rate": 1.6468097468874598e-05, + "loss": 0.414, + "step": 14872 + }, + { + "epoch": 0.2761368319824152, + "grad_norm": 0.38874924182891846, + "learning_rate": 1.6467207799138716e-05, + "loss": 0.4656, + "step": 14874 + }, + { + "epoch": 0.2761739621198338, + "grad_norm": 0.48564234375953674, + "learning_rate": 1.646631804140351e-05, + "loss": 0.1954, + "step": 14876 + }, + { + "epoch": 0.27621109225725243, + "grad_norm": 0.39803749322891235, + "learning_rate": 1.646542819568108e-05, + "loss": 0.2265, + "step": 14878 + }, + { + "epoch": 0.27624822239467106, + "grad_norm": 0.35922297835350037, + "learning_rate": 1.6464538261983542e-05, + "loss": 0.3295, + "step": 14880 + }, + { + "epoch": 0.27628535253208975, + "grad_norm": 0.2193242311477661, + "learning_rate": 1.6463648240323e-05, + "loss": 0.4425, + "step": 14882 + }, + { + "epoch": 0.2763224826695084, + "grad_norm": 0.39045900106430054, + "learning_rate": 1.6462758130711566e-05, + "loss": 0.2133, + "step": 14884 + }, + { + "epoch": 0.276359612806927, + "grad_norm": 0.4693489968776703, + "learning_rate": 1.6461867933161353e-05, + "loss": 0.3328, + "step": 14886 + }, + { + "epoch": 0.27639674294434563, + "grad_norm": 0.48045212030410767, + "learning_rate": 1.646097764768447e-05, + "loss": 0.4166, + "step": 14888 + }, + { + "epoch": 0.27643387308176426, + "grad_norm": 0.46798980236053467, + "learning_rate": 1.6460087274293036e-05, + "loss": 0.3705, + "step": 14890 + }, + { + "epoch": 0.2764710032191829, + "grad_norm": 0.35942989587783813, + "learning_rate": 1.645919681299917e-05, + "loss": 0.2909, + "step": 14892 + }, + { + "epoch": 0.27650813335660157, + "grad_norm": 0.4933192729949951, + "learning_rate": 1.6458306263814975e-05, + "loss": 0.3218, + "step": 14894 + }, + { + "epoch": 0.2765452634940202, + "grad_norm": 0.2826973497867584, + "learning_rate": 1.645741562675258e-05, + "loss": 0.1868, + "step": 14896 + }, + { + "epoch": 0.2765823936314388, + "grad_norm": 0.4001150131225586, + "learning_rate": 1.64565249018241e-05, + "loss": 0.3571, + "step": 14898 + }, + { + "epoch": 0.27661952376885746, + "grad_norm": 0.35059475898742676, + "learning_rate": 1.6455634089041654e-05, + "loss": 0.3113, + "step": 14900 + }, + { + "epoch": 0.2766566539062761, + "grad_norm": 0.4711218774318695, + "learning_rate": 1.6454743188417368e-05, + "loss": 0.271, + "step": 14902 + }, + { + "epoch": 0.27669378404369477, + "grad_norm": 0.4374128580093384, + "learning_rate": 1.645385219996336e-05, + "loss": 0.3295, + "step": 14904 + }, + { + "epoch": 0.2767309141811134, + "grad_norm": 0.3825603723526001, + "learning_rate": 1.6452961123691754e-05, + "loss": 0.3543, + "step": 14906 + }, + { + "epoch": 0.276768044318532, + "grad_norm": 0.3439706861972809, + "learning_rate": 1.6452069959614678e-05, + "loss": 0.2064, + "step": 14908 + }, + { + "epoch": 0.27680517445595065, + "grad_norm": 0.31731534004211426, + "learning_rate": 1.6451178707744257e-05, + "loss": 0.2999, + "step": 14910 + }, + { + "epoch": 0.2768423045933693, + "grad_norm": 0.2681587338447571, + "learning_rate": 1.6450287368092613e-05, + "loss": 0.3907, + "step": 14912 + }, + { + "epoch": 0.27687943473078797, + "grad_norm": 0.32862699031829834, + "learning_rate": 1.6449395940671882e-05, + "loss": 0.3119, + "step": 14914 + }, + { + "epoch": 0.2769165648682066, + "grad_norm": 0.3737541139125824, + "learning_rate": 1.6448504425494188e-05, + "loss": 0.331, + "step": 14916 + }, + { + "epoch": 0.2769536950056252, + "grad_norm": 0.364833801984787, + "learning_rate": 1.6447612822571667e-05, + "loss": 0.3499, + "step": 14918 + }, + { + "epoch": 0.27699082514304385, + "grad_norm": 0.4749397039413452, + "learning_rate": 1.6446721131916444e-05, + "loss": 0.2455, + "step": 14920 + }, + { + "epoch": 0.2770279552804625, + "grad_norm": 0.47684356570243835, + "learning_rate": 1.644582935354066e-05, + "loss": 0.2661, + "step": 14922 + }, + { + "epoch": 0.2770650854178811, + "grad_norm": 0.2792404890060425, + "learning_rate": 1.6444937487456447e-05, + "loss": 0.4977, + "step": 14924 + }, + { + "epoch": 0.2771022155552998, + "grad_norm": 0.27260011434555054, + "learning_rate": 1.6444045533675938e-05, + "loss": 0.1396, + "step": 14926 + }, + { + "epoch": 0.2771393456927184, + "grad_norm": 0.3562160134315491, + "learning_rate": 1.6443153492211276e-05, + "loss": 0.186, + "step": 14928 + }, + { + "epoch": 0.27717647583013705, + "grad_norm": 0.38558229804039, + "learning_rate": 1.6442261363074584e-05, + "loss": 0.5876, + "step": 14930 + }, + { + "epoch": 0.2772136059675557, + "grad_norm": 0.4793417751789093, + "learning_rate": 1.6441369146278023e-05, + "loss": 0.356, + "step": 14932 + }, + { + "epoch": 0.2772507361049743, + "grad_norm": 0.43960198760032654, + "learning_rate": 1.6440476841833714e-05, + "loss": 0.1908, + "step": 14934 + }, + { + "epoch": 0.277287866242393, + "grad_norm": 0.4275606870651245, + "learning_rate": 1.643958444975381e-05, + "loss": 0.2949, + "step": 14936 + }, + { + "epoch": 0.2773249963798116, + "grad_norm": 0.24383433163166046, + "learning_rate": 1.643869197005045e-05, + "loss": 0.3725, + "step": 14938 + }, + { + "epoch": 0.27736212651723025, + "grad_norm": 0.3487373888492584, + "learning_rate": 1.6437799402735778e-05, + "loss": 0.3811, + "step": 14940 + }, + { + "epoch": 0.2773992566546489, + "grad_norm": 0.41456958651542664, + "learning_rate": 1.6436906747821938e-05, + "loss": 0.365, + "step": 14942 + }, + { + "epoch": 0.2774363867920675, + "grad_norm": 0.30235451459884644, + "learning_rate": 1.6436014005321083e-05, + "loss": 0.3012, + "step": 14944 + }, + { + "epoch": 0.27747351692948613, + "grad_norm": 0.34595608711242676, + "learning_rate": 1.643512117524535e-05, + "loss": 0.295, + "step": 14946 + }, + { + "epoch": 0.2775106470669048, + "grad_norm": 0.272754043340683, + "learning_rate": 1.6434228257606896e-05, + "loss": 0.2225, + "step": 14948 + }, + { + "epoch": 0.27754777720432344, + "grad_norm": 0.43997758626937866, + "learning_rate": 1.6433335252417868e-05, + "loss": 0.3592, + "step": 14950 + }, + { + "epoch": 0.27758490734174207, + "grad_norm": 0.25300106406211853, + "learning_rate": 1.6432442159690417e-05, + "loss": 0.333, + "step": 14952 + }, + { + "epoch": 0.2776220374791607, + "grad_norm": 0.4052315056324005, + "learning_rate": 1.6431548979436697e-05, + "loss": 0.1619, + "step": 14954 + }, + { + "epoch": 0.2776591676165793, + "grad_norm": 0.7130093574523926, + "learning_rate": 1.6430655711668858e-05, + "loss": 0.2339, + "step": 14956 + }, + { + "epoch": 0.277696297753998, + "grad_norm": 0.46284353733062744, + "learning_rate": 1.6429762356399056e-05, + "loss": 0.3146, + "step": 14958 + }, + { + "epoch": 0.27773342789141664, + "grad_norm": 0.2876720428466797, + "learning_rate": 1.642886891363945e-05, + "loss": 0.4323, + "step": 14960 + }, + { + "epoch": 0.27777055802883527, + "grad_norm": 0.4155559241771698, + "learning_rate": 1.6427975383402194e-05, + "loss": 0.0991, + "step": 14962 + }, + { + "epoch": 0.2778076881662539, + "grad_norm": 0.41126206517219543, + "learning_rate": 1.642708176569945e-05, + "loss": 0.2999, + "step": 14964 + }, + { + "epoch": 0.2778448183036725, + "grad_norm": 0.4702550172805786, + "learning_rate": 1.642618806054337e-05, + "loss": 0.2222, + "step": 14966 + }, + { + "epoch": 0.27788194844109115, + "grad_norm": 0.47663938999176025, + "learning_rate": 1.642529426794612e-05, + "loss": 0.3905, + "step": 14968 + }, + { + "epoch": 0.27791907857850984, + "grad_norm": 0.2766834795475006, + "learning_rate": 1.6424400387919865e-05, + "loss": 0.1392, + "step": 14970 + }, + { + "epoch": 0.27795620871592847, + "grad_norm": 0.39705637097358704, + "learning_rate": 1.6423506420476756e-05, + "loss": 0.2438, + "step": 14972 + }, + { + "epoch": 0.2779933388533471, + "grad_norm": 0.4040052592754364, + "learning_rate": 1.6422612365628972e-05, + "loss": 0.3141, + "step": 14974 + }, + { + "epoch": 0.2780304689907657, + "grad_norm": 0.4049037992954254, + "learning_rate": 1.6421718223388668e-05, + "loss": 0.2965, + "step": 14976 + }, + { + "epoch": 0.27806759912818435, + "grad_norm": 0.3851534426212311, + "learning_rate": 1.642082399376802e-05, + "loss": 0.1841, + "step": 14978 + }, + { + "epoch": 0.27810472926560303, + "grad_norm": 0.37929391860961914, + "learning_rate": 1.6419929676779184e-05, + "loss": 0.3155, + "step": 14980 + }, + { + "epoch": 0.27814185940302166, + "grad_norm": 0.4493348300457001, + "learning_rate": 1.6419035272434336e-05, + "loss": 0.4621, + "step": 14982 + }, + { + "epoch": 0.2781789895404403, + "grad_norm": 0.47922950983047485, + "learning_rate": 1.6418140780745647e-05, + "loss": 0.5089, + "step": 14984 + }, + { + "epoch": 0.2782161196778589, + "grad_norm": 0.22817960381507874, + "learning_rate": 1.6417246201725286e-05, + "loss": 0.2825, + "step": 14986 + }, + { + "epoch": 0.27825324981527755, + "grad_norm": 0.2597392201423645, + "learning_rate": 1.6416351535385423e-05, + "loss": 0.3174, + "step": 14988 + }, + { + "epoch": 0.27829037995269623, + "grad_norm": 0.3169132173061371, + "learning_rate": 1.6415456781738235e-05, + "loss": 0.3405, + "step": 14990 + }, + { + "epoch": 0.27832751009011486, + "grad_norm": 0.29082944989204407, + "learning_rate": 1.64145619407959e-05, + "loss": 0.2831, + "step": 14992 + }, + { + "epoch": 0.2783646402275335, + "grad_norm": 0.30459529161453247, + "learning_rate": 1.6413667012570594e-05, + "loss": 0.2131, + "step": 14994 + }, + { + "epoch": 0.2784017703649521, + "grad_norm": 0.7897620797157288, + "learning_rate": 1.6412771997074487e-05, + "loss": 0.3588, + "step": 14996 + }, + { + "epoch": 0.27843890050237075, + "grad_norm": 0.38560277223587036, + "learning_rate": 1.641187689431976e-05, + "loss": 0.2154, + "step": 14998 + }, + { + "epoch": 0.2784760306397894, + "grad_norm": 0.3687324523925781, + "learning_rate": 1.6410981704318596e-05, + "loss": 0.2144, + "step": 15000 + }, + { + "epoch": 0.27851316077720806, + "grad_norm": 0.31290608644485474, + "learning_rate": 1.6410086427083176e-05, + "loss": 0.3951, + "step": 15002 + }, + { + "epoch": 0.2785502909146267, + "grad_norm": 0.3430293798446655, + "learning_rate": 1.6409191062625676e-05, + "loss": 0.3614, + "step": 15004 + }, + { + "epoch": 0.2785874210520453, + "grad_norm": 0.4427013695240021, + "learning_rate": 1.6408295610958288e-05, + "loss": 0.3289, + "step": 15006 + }, + { + "epoch": 0.27862455118946394, + "grad_norm": 0.26438021659851074, + "learning_rate": 1.6407400072093188e-05, + "loss": 0.2451, + "step": 15008 + }, + { + "epoch": 0.27866168132688257, + "grad_norm": 0.28633636236190796, + "learning_rate": 1.6406504446042567e-05, + "loss": 0.166, + "step": 15010 + }, + { + "epoch": 0.27869881146430125, + "grad_norm": 0.33371731638908386, + "learning_rate": 1.640560873281861e-05, + "loss": 0.4639, + "step": 15012 + }, + { + "epoch": 0.2787359416017199, + "grad_norm": 0.32669776678085327, + "learning_rate": 1.6404712932433508e-05, + "loss": 0.1534, + "step": 15014 + }, + { + "epoch": 0.2787730717391385, + "grad_norm": 0.47225221991539, + "learning_rate": 1.6403817044899445e-05, + "loss": 0.34, + "step": 15016 + }, + { + "epoch": 0.27881020187655714, + "grad_norm": 0.4173768162727356, + "learning_rate": 1.640292107022861e-05, + "loss": 0.4621, + "step": 15018 + }, + { + "epoch": 0.27884733201397577, + "grad_norm": 0.5146902203559875, + "learning_rate": 1.6402025008433204e-05, + "loss": 0.5664, + "step": 15020 + }, + { + "epoch": 0.2788844621513944, + "grad_norm": 0.5303643941879272, + "learning_rate": 1.640112885952541e-05, + "loss": 0.2577, + "step": 15022 + }, + { + "epoch": 0.2789215922888131, + "grad_norm": 0.26057448983192444, + "learning_rate": 1.6400232623517426e-05, + "loss": 0.2277, + "step": 15024 + }, + { + "epoch": 0.2789587224262317, + "grad_norm": 0.4882340431213379, + "learning_rate": 1.639933630042145e-05, + "loss": 0.3506, + "step": 15026 + }, + { + "epoch": 0.27899585256365034, + "grad_norm": 0.261616975069046, + "learning_rate": 1.6398439890249675e-05, + "loss": 0.4672, + "step": 15028 + }, + { + "epoch": 0.27903298270106897, + "grad_norm": 0.5336483716964722, + "learning_rate": 1.6397543393014293e-05, + "loss": 0.2873, + "step": 15030 + }, + { + "epoch": 0.2790701128384876, + "grad_norm": 0.3441188335418701, + "learning_rate": 1.6396646808727516e-05, + "loss": 0.1481, + "step": 15032 + }, + { + "epoch": 0.2791072429759063, + "grad_norm": 0.4678948223590851, + "learning_rate": 1.639575013740153e-05, + "loss": 0.418, + "step": 15034 + }, + { + "epoch": 0.2791443731133249, + "grad_norm": 0.5374354720115662, + "learning_rate": 1.6394853379048544e-05, + "loss": 0.2509, + "step": 15036 + }, + { + "epoch": 0.27918150325074353, + "grad_norm": 0.3136776387691498, + "learning_rate": 1.6393956533680758e-05, + "loss": 0.3832, + "step": 15038 + }, + { + "epoch": 0.27921863338816216, + "grad_norm": 0.3334224820137024, + "learning_rate": 1.639305960131038e-05, + "loss": 0.2516, + "step": 15040 + }, + { + "epoch": 0.2792557635255808, + "grad_norm": 0.47994497418403625, + "learning_rate": 1.6392162581949604e-05, + "loss": 0.2265, + "step": 15042 + }, + { + "epoch": 0.2792928936629994, + "grad_norm": 0.31730639934539795, + "learning_rate": 1.6391265475610644e-05, + "loss": 0.2638, + "step": 15044 + }, + { + "epoch": 0.2793300238004181, + "grad_norm": 0.3441244661808014, + "learning_rate": 1.6390368282305707e-05, + "loss": 0.2086, + "step": 15046 + }, + { + "epoch": 0.27936715393783673, + "grad_norm": 0.38913920521736145, + "learning_rate": 1.6389471002046998e-05, + "loss": 0.3959, + "step": 15048 + }, + { + "epoch": 0.27940428407525536, + "grad_norm": 0.43224868178367615, + "learning_rate": 1.6388573634846725e-05, + "loss": 0.3332, + "step": 15050 + }, + { + "epoch": 0.279441414212674, + "grad_norm": 0.47961071133613586, + "learning_rate": 1.6387676180717105e-05, + "loss": 0.3112, + "step": 15052 + }, + { + "epoch": 0.2794785443500926, + "grad_norm": 0.45057591795921326, + "learning_rate": 1.638677863967034e-05, + "loss": 0.4359, + "step": 15054 + }, + { + "epoch": 0.2795156744875113, + "grad_norm": 0.3402251601219177, + "learning_rate": 1.6385881011718653e-05, + "loss": 0.2692, + "step": 15056 + }, + { + "epoch": 0.27955280462492993, + "grad_norm": 0.3727535307407379, + "learning_rate": 1.6384983296874253e-05, + "loss": 0.2467, + "step": 15058 + }, + { + "epoch": 0.27958993476234856, + "grad_norm": 0.48922502994537354, + "learning_rate": 1.6384085495149356e-05, + "loss": 0.2591, + "step": 15060 + }, + { + "epoch": 0.2796270648997672, + "grad_norm": 0.4106374979019165, + "learning_rate": 1.6383187606556174e-05, + "loss": 0.4223, + "step": 15062 + }, + { + "epoch": 0.2796641950371858, + "grad_norm": 0.39098769426345825, + "learning_rate": 1.6382289631106933e-05, + "loss": 0.2028, + "step": 15064 + }, + { + "epoch": 0.2797013251746045, + "grad_norm": 0.3745861053466797, + "learning_rate": 1.6381391568813843e-05, + "loss": 0.3108, + "step": 15066 + }, + { + "epoch": 0.2797384553120231, + "grad_norm": 0.3746451139450073, + "learning_rate": 1.6380493419689133e-05, + "loss": 0.148, + "step": 15068 + }, + { + "epoch": 0.27977558544944175, + "grad_norm": 0.3354095220565796, + "learning_rate": 1.6379595183745015e-05, + "loss": 0.3559, + "step": 15070 + }, + { + "epoch": 0.2798127155868604, + "grad_norm": 0.35680198669433594, + "learning_rate": 1.637869686099372e-05, + "loss": 0.2749, + "step": 15072 + }, + { + "epoch": 0.279849845724279, + "grad_norm": 0.46964308619499207, + "learning_rate": 1.6377798451447465e-05, + "loss": 0.4228, + "step": 15074 + }, + { + "epoch": 0.27988697586169764, + "grad_norm": 0.5090184807777405, + "learning_rate": 1.6376899955118474e-05, + "loss": 0.2076, + "step": 15076 + }, + { + "epoch": 0.2799241059991163, + "grad_norm": 0.23075906932353973, + "learning_rate": 1.6376001372018978e-05, + "loss": 0.247, + "step": 15078 + }, + { + "epoch": 0.27996123613653495, + "grad_norm": 0.424236923456192, + "learning_rate": 1.6375102702161203e-05, + "loss": 0.2249, + "step": 15080 + }, + { + "epoch": 0.2799983662739536, + "grad_norm": 0.38155269622802734, + "learning_rate": 1.6374203945557375e-05, + "loss": 0.3799, + "step": 15082 + }, + { + "epoch": 0.2800354964113722, + "grad_norm": 0.36049073934555054, + "learning_rate": 1.6373305102219724e-05, + "loss": 0.3232, + "step": 15084 + }, + { + "epoch": 0.28007262654879084, + "grad_norm": 0.32451942563056946, + "learning_rate": 1.637240617216048e-05, + "loss": 0.3627, + "step": 15086 + }, + { + "epoch": 0.2801097566862095, + "grad_norm": 0.4158051609992981, + "learning_rate": 1.6371507155391877e-05, + "loss": 0.385, + "step": 15088 + }, + { + "epoch": 0.28014688682362815, + "grad_norm": 0.43560266494750977, + "learning_rate": 1.6370608051926146e-05, + "loss": 0.386, + "step": 15090 + }, + { + "epoch": 0.2801840169610468, + "grad_norm": 0.35539305210113525, + "learning_rate": 1.636970886177552e-05, + "loss": 0.356, + "step": 15092 + }, + { + "epoch": 0.2802211470984654, + "grad_norm": 0.26674985885620117, + "learning_rate": 1.636880958495224e-05, + "loss": 0.3461, + "step": 15094 + }, + { + "epoch": 0.28025827723588403, + "grad_norm": 0.23910953104496002, + "learning_rate": 1.6367910221468535e-05, + "loss": 0.3154, + "step": 15096 + }, + { + "epoch": 0.28029540737330266, + "grad_norm": 0.29694047570228577, + "learning_rate": 1.6367010771336647e-05, + "loss": 0.3483, + "step": 15098 + }, + { + "epoch": 0.28033253751072135, + "grad_norm": 0.3674033284187317, + "learning_rate": 1.636611123456881e-05, + "loss": 0.4217, + "step": 15100 + }, + { + "epoch": 0.28036966764814, + "grad_norm": 0.4674500524997711, + "learning_rate": 1.6365211611177274e-05, + "loss": 0.3562, + "step": 15102 + }, + { + "epoch": 0.2804067977855586, + "grad_norm": 0.3588138520717621, + "learning_rate": 1.6364311901174272e-05, + "loss": 0.352, + "step": 15104 + }, + { + "epoch": 0.28044392792297723, + "grad_norm": 0.4785209596157074, + "learning_rate": 1.6363412104572044e-05, + "loss": 0.2462, + "step": 15106 + }, + { + "epoch": 0.28048105806039586, + "grad_norm": 0.36932075023651123, + "learning_rate": 1.6362512221382846e-05, + "loss": 0.3516, + "step": 15108 + }, + { + "epoch": 0.28051818819781454, + "grad_norm": 0.2706802785396576, + "learning_rate": 1.6361612251618908e-05, + "loss": 0.3183, + "step": 15110 + }, + { + "epoch": 0.2805553183352332, + "grad_norm": 0.3177291750907898, + "learning_rate": 1.6360712195292486e-05, + "loss": 0.2849, + "step": 15112 + }, + { + "epoch": 0.2805924484726518, + "grad_norm": 0.5077996253967285, + "learning_rate": 1.635981205241582e-05, + "loss": 0.2259, + "step": 15114 + }, + { + "epoch": 0.28062957861007043, + "grad_norm": 0.4048846662044525, + "learning_rate": 1.6358911823001166e-05, + "loss": 0.248, + "step": 15116 + }, + { + "epoch": 0.28066670874748906, + "grad_norm": 0.538139820098877, + "learning_rate": 1.6358011507060765e-05, + "loss": 0.314, + "step": 15118 + }, + { + "epoch": 0.2807038388849077, + "grad_norm": 0.3161731958389282, + "learning_rate": 1.6357111104606875e-05, + "loss": 0.5129, + "step": 15120 + }, + { + "epoch": 0.28074096902232637, + "grad_norm": 0.4685649573802948, + "learning_rate": 1.6356210615651744e-05, + "loss": 0.1855, + "step": 15122 + }, + { + "epoch": 0.280778099159745, + "grad_norm": 0.3559631109237671, + "learning_rate": 1.6355310040207624e-05, + "loss": 0.2189, + "step": 15124 + }, + { + "epoch": 0.2808152292971636, + "grad_norm": 0.3861045241355896, + "learning_rate": 1.635440937828677e-05, + "loss": 0.3423, + "step": 15126 + }, + { + "epoch": 0.28085235943458225, + "grad_norm": 0.44704627990722656, + "learning_rate": 1.6353508629901442e-05, + "loss": 0.3762, + "step": 15128 + }, + { + "epoch": 0.2808894895720009, + "grad_norm": 0.35397785902023315, + "learning_rate": 1.635260779506389e-05, + "loss": 0.2989, + "step": 15130 + }, + { + "epoch": 0.28092661970941957, + "grad_norm": 0.3193988800048828, + "learning_rate": 1.6351706873786376e-05, + "loss": 0.2216, + "step": 15132 + }, + { + "epoch": 0.2809637498468382, + "grad_norm": 0.4408256411552429, + "learning_rate": 1.6350805866081153e-05, + "loss": 0.3688, + "step": 15134 + }, + { + "epoch": 0.2810008799842568, + "grad_norm": 0.33592477440834045, + "learning_rate": 1.634990477196049e-05, + "loss": 0.1755, + "step": 15136 + }, + { + "epoch": 0.28103801012167545, + "grad_norm": 0.5282504558563232, + "learning_rate": 1.634900359143664e-05, + "loss": 0.2946, + "step": 15138 + }, + { + "epoch": 0.2810751402590941, + "grad_norm": 0.3795854449272156, + "learning_rate": 1.6348102324521872e-05, + "loss": 0.3686, + "step": 15140 + }, + { + "epoch": 0.28111227039651276, + "grad_norm": 0.38313087821006775, + "learning_rate": 1.6347200971228443e-05, + "loss": 0.2266, + "step": 15142 + }, + { + "epoch": 0.2811494005339314, + "grad_norm": 0.4143899381160736, + "learning_rate": 1.6346299531568618e-05, + "loss": 0.243, + "step": 15144 + }, + { + "epoch": 0.28118653067135, + "grad_norm": 0.25574710965156555, + "learning_rate": 1.634539800555467e-05, + "loss": 0.1704, + "step": 15146 + }, + { + "epoch": 0.28122366080876865, + "grad_norm": 0.23985464870929718, + "learning_rate": 1.6344496393198862e-05, + "loss": 0.3421, + "step": 15148 + }, + { + "epoch": 0.2812607909461873, + "grad_norm": 0.5032542943954468, + "learning_rate": 1.634359469451346e-05, + "loss": 0.3088, + "step": 15150 + }, + { + "epoch": 0.2812979210836059, + "grad_norm": 0.34067878127098083, + "learning_rate": 1.6342692909510738e-05, + "loss": 0.2392, + "step": 15152 + }, + { + "epoch": 0.2813350512210246, + "grad_norm": 0.44177791476249695, + "learning_rate": 1.634179103820296e-05, + "loss": 0.4731, + "step": 15154 + }, + { + "epoch": 0.2813721813584432, + "grad_norm": 0.48239821195602417, + "learning_rate": 1.6340889080602406e-05, + "loss": 0.2198, + "step": 15156 + }, + { + "epoch": 0.28140931149586185, + "grad_norm": 0.2975512146949768, + "learning_rate": 1.6339987036721342e-05, + "loss": 0.2864, + "step": 15158 + }, + { + "epoch": 0.2814464416332805, + "grad_norm": 0.27439776062965393, + "learning_rate": 1.6339084906572045e-05, + "loss": 0.3156, + "step": 15160 + }, + { + "epoch": 0.2814835717706991, + "grad_norm": 0.28306058049201965, + "learning_rate": 1.6338182690166795e-05, + "loss": 0.3097, + "step": 15162 + }, + { + "epoch": 0.2815207019081178, + "grad_norm": 0.5881744027137756, + "learning_rate": 1.6337280387517857e-05, + "loss": 0.1904, + "step": 15164 + }, + { + "epoch": 0.2815578320455364, + "grad_norm": 0.33809372782707214, + "learning_rate": 1.6336377998637522e-05, + "loss": 0.312, + "step": 15166 + }, + { + "epoch": 0.28159496218295504, + "grad_norm": 0.392665296792984, + "learning_rate": 1.6335475523538056e-05, + "loss": 0.2473, + "step": 15168 + }, + { + "epoch": 0.2816320923203737, + "grad_norm": 0.27196863293647766, + "learning_rate": 1.6334572962231747e-05, + "loss": 0.2773, + "step": 15170 + }, + { + "epoch": 0.2816692224577923, + "grad_norm": 0.4898926317691803, + "learning_rate": 1.6333670314730877e-05, + "loss": 0.3732, + "step": 15172 + }, + { + "epoch": 0.28170635259521093, + "grad_norm": 0.28694918751716614, + "learning_rate": 1.6332767581047722e-05, + "loss": 0.2583, + "step": 15174 + }, + { + "epoch": 0.2817434827326296, + "grad_norm": 0.35229045152664185, + "learning_rate": 1.6331864761194568e-05, + "loss": 0.271, + "step": 15176 + }, + { + "epoch": 0.28178061287004824, + "grad_norm": 0.47089067101478577, + "learning_rate": 1.6330961855183708e-05, + "loss": 0.4246, + "step": 15178 + }, + { + "epoch": 0.28181774300746687, + "grad_norm": 0.4692615270614624, + "learning_rate": 1.6330058863027413e-05, + "loss": 0.255, + "step": 15180 + }, + { + "epoch": 0.2818548731448855, + "grad_norm": 0.27859732508659363, + "learning_rate": 1.632915578473798e-05, + "loss": 0.1934, + "step": 15182 + }, + { + "epoch": 0.2818920032823041, + "grad_norm": 0.5272289514541626, + "learning_rate": 1.6328252620327692e-05, + "loss": 0.397, + "step": 15184 + }, + { + "epoch": 0.2819291334197228, + "grad_norm": 0.4672413468360901, + "learning_rate": 1.6327349369808848e-05, + "loss": 0.1945, + "step": 15186 + }, + { + "epoch": 0.28196626355714144, + "grad_norm": 0.5078940987586975, + "learning_rate": 1.6326446033193726e-05, + "loss": 0.4013, + "step": 15188 + }, + { + "epoch": 0.28200339369456007, + "grad_norm": 0.2850618064403534, + "learning_rate": 1.6325542610494626e-05, + "loss": 0.2901, + "step": 15190 + }, + { + "epoch": 0.2820405238319787, + "grad_norm": 0.29544752836227417, + "learning_rate": 1.6324639101723835e-05, + "loss": 0.5343, + "step": 15192 + }, + { + "epoch": 0.2820776539693973, + "grad_norm": 0.32594382762908936, + "learning_rate": 1.6323735506893654e-05, + "loss": 0.3613, + "step": 15194 + }, + { + "epoch": 0.28211478410681595, + "grad_norm": 0.39919862151145935, + "learning_rate": 1.6322831826016373e-05, + "loss": 0.2641, + "step": 15196 + }, + { + "epoch": 0.28215191424423464, + "grad_norm": 0.38633760809898376, + "learning_rate": 1.632192805910429e-05, + "loss": 0.2317, + "step": 15198 + }, + { + "epoch": 0.28218904438165326, + "grad_norm": 0.29906320571899414, + "learning_rate": 1.63210242061697e-05, + "loss": 0.3474, + "step": 15200 + }, + { + "epoch": 0.2822261745190719, + "grad_norm": 0.6725844144821167, + "learning_rate": 1.632012026722491e-05, + "loss": 0.3482, + "step": 15202 + }, + { + "epoch": 0.2822633046564905, + "grad_norm": 0.42531195282936096, + "learning_rate": 1.631921624228221e-05, + "loss": 0.2521, + "step": 15204 + }, + { + "epoch": 0.28230043479390915, + "grad_norm": 0.3144271671772003, + "learning_rate": 1.6318312131353907e-05, + "loss": 0.2728, + "step": 15206 + }, + { + "epoch": 0.28233756493132783, + "grad_norm": 1.1425817012786865, + "learning_rate": 1.6317407934452297e-05, + "loss": 0.4547, + "step": 15208 + }, + { + "epoch": 0.28237469506874646, + "grad_norm": 0.43557244539260864, + "learning_rate": 1.6316503651589694e-05, + "loss": 0.2263, + "step": 15210 + }, + { + "epoch": 0.2824118252061651, + "grad_norm": 0.43249669671058655, + "learning_rate": 1.6315599282778393e-05, + "loss": 0.3461, + "step": 15212 + }, + { + "epoch": 0.2824489553435837, + "grad_norm": 0.6911650896072388, + "learning_rate": 1.6314694828030707e-05, + "loss": 0.5044, + "step": 15214 + }, + { + "epoch": 0.28248608548100235, + "grad_norm": 0.356902152299881, + "learning_rate": 1.6313790287358935e-05, + "loss": 0.2319, + "step": 15216 + }, + { + "epoch": 0.28252321561842103, + "grad_norm": 0.3941543996334076, + "learning_rate": 1.631288566077539e-05, + "loss": 0.3122, + "step": 15218 + }, + { + "epoch": 0.28256034575583966, + "grad_norm": 0.25098317861557007, + "learning_rate": 1.631198094829238e-05, + "loss": 0.3425, + "step": 15220 + }, + { + "epoch": 0.2825974758932583, + "grad_norm": 0.3847087323665619, + "learning_rate": 1.6311076149922217e-05, + "loss": 0.2571, + "step": 15222 + }, + { + "epoch": 0.2826346060306769, + "grad_norm": 0.38837409019470215, + "learning_rate": 1.6310171265677213e-05, + "loss": 0.2483, + "step": 15224 + }, + { + "epoch": 0.28267173616809554, + "grad_norm": 0.27800941467285156, + "learning_rate": 1.6309266295569674e-05, + "loss": 0.409, + "step": 15226 + }, + { + "epoch": 0.2827088663055142, + "grad_norm": 0.3290328085422516, + "learning_rate": 1.6308361239611924e-05, + "loss": 0.1897, + "step": 15228 + }, + { + "epoch": 0.28274599644293286, + "grad_norm": 0.4303019940853119, + "learning_rate": 1.6307456097816272e-05, + "loss": 0.1812, + "step": 15230 + }, + { + "epoch": 0.2827831265803515, + "grad_norm": 0.3664807677268982, + "learning_rate": 1.6306550870195033e-05, + "loss": 0.3847, + "step": 15232 + }, + { + "epoch": 0.2828202567177701, + "grad_norm": 0.3614775538444519, + "learning_rate": 1.630564555676053e-05, + "loss": 0.2171, + "step": 15234 + }, + { + "epoch": 0.28285738685518874, + "grad_norm": 0.4108017385005951, + "learning_rate": 1.6304740157525078e-05, + "loss": 0.235, + "step": 15236 + }, + { + "epoch": 0.28289451699260737, + "grad_norm": 0.373841255903244, + "learning_rate": 1.6303834672500996e-05, + "loss": 0.4997, + "step": 15238 + }, + { + "epoch": 0.28293164713002605, + "grad_norm": 0.2432202696800232, + "learning_rate": 1.630292910170061e-05, + "loss": 0.2127, + "step": 15240 + }, + { + "epoch": 0.2829687772674447, + "grad_norm": 0.2322385460138321, + "learning_rate": 1.6302023445136234e-05, + "loss": 0.1921, + "step": 15242 + }, + { + "epoch": 0.2830059074048633, + "grad_norm": 0.5921317934989929, + "learning_rate": 1.6301117702820196e-05, + "loss": 0.4181, + "step": 15244 + }, + { + "epoch": 0.28304303754228194, + "grad_norm": 0.42867523431777954, + "learning_rate": 1.6300211874764823e-05, + "loss": 0.4547, + "step": 15246 + }, + { + "epoch": 0.28308016767970057, + "grad_norm": 0.4820225238800049, + "learning_rate": 1.629930596098244e-05, + "loss": 0.4175, + "step": 15248 + }, + { + "epoch": 0.2831172978171192, + "grad_norm": 0.39568570256233215, + "learning_rate": 1.6298399961485364e-05, + "loss": 0.3123, + "step": 15250 + }, + { + "epoch": 0.2831544279545379, + "grad_norm": 0.3059687316417694, + "learning_rate": 1.6297493876285933e-05, + "loss": 0.2979, + "step": 15252 + }, + { + "epoch": 0.2831915580919565, + "grad_norm": 0.4182986617088318, + "learning_rate": 1.6296587705396476e-05, + "loss": 0.1913, + "step": 15254 + }, + { + "epoch": 0.28322868822937514, + "grad_norm": 0.2948521077632904, + "learning_rate": 1.629568144882932e-05, + "loss": 0.5376, + "step": 15256 + }, + { + "epoch": 0.28326581836679376, + "grad_norm": 0.32156965136528015, + "learning_rate": 1.6294775106596796e-05, + "loss": 0.5023, + "step": 15258 + }, + { + "epoch": 0.2833029485042124, + "grad_norm": 0.2610750198364258, + "learning_rate": 1.6293868678711237e-05, + "loss": 0.3343, + "step": 15260 + }, + { + "epoch": 0.2833400786416311, + "grad_norm": 0.6007379293441772, + "learning_rate": 1.629296216518498e-05, + "loss": 0.3914, + "step": 15262 + }, + { + "epoch": 0.2833772087790497, + "grad_norm": 0.30240124464035034, + "learning_rate": 1.6292055566030356e-05, + "loss": 0.2935, + "step": 15264 + }, + { + "epoch": 0.28341433891646833, + "grad_norm": 0.38106393814086914, + "learning_rate": 1.6291148881259704e-05, + "loss": 0.366, + "step": 15266 + }, + { + "epoch": 0.28345146905388696, + "grad_norm": 0.4438855051994324, + "learning_rate": 1.6290242110885358e-05, + "loss": 0.3173, + "step": 15268 + }, + { + "epoch": 0.2834885991913056, + "grad_norm": 0.46781694889068604, + "learning_rate": 1.628933525491966e-05, + "loss": 0.3647, + "step": 15270 + }, + { + "epoch": 0.2835257293287242, + "grad_norm": 0.4572279155254364, + "learning_rate": 1.6288428313374948e-05, + "loss": 0.4107, + "step": 15272 + }, + { + "epoch": 0.2835628594661429, + "grad_norm": 0.366137832403183, + "learning_rate": 1.6287521286263563e-05, + "loss": 0.3605, + "step": 15274 + }, + { + "epoch": 0.28359998960356153, + "grad_norm": 0.2983122766017914, + "learning_rate": 1.6286614173597843e-05, + "loss": 0.5297, + "step": 15276 + }, + { + "epoch": 0.28363711974098016, + "grad_norm": 0.2641507387161255, + "learning_rate": 1.628570697539014e-05, + "loss": 0.2767, + "step": 15278 + }, + { + "epoch": 0.2836742498783988, + "grad_norm": 0.5122215151786804, + "learning_rate": 1.6284799691652787e-05, + "loss": 0.2294, + "step": 15280 + }, + { + "epoch": 0.2837113800158174, + "grad_norm": 0.3430140018463135, + "learning_rate": 1.628389232239814e-05, + "loss": 0.1308, + "step": 15282 + }, + { + "epoch": 0.2837485101532361, + "grad_norm": 0.3650031089782715, + "learning_rate": 1.6282984867638538e-05, + "loss": 0.2422, + "step": 15284 + }, + { + "epoch": 0.28378564029065473, + "grad_norm": 0.45217829942703247, + "learning_rate": 1.6282077327386337e-05, + "loss": 0.3221, + "step": 15286 + }, + { + "epoch": 0.28382277042807336, + "grad_norm": 0.32803067564964294, + "learning_rate": 1.6281169701653875e-05, + "loss": 0.2795, + "step": 15288 + }, + { + "epoch": 0.283859900565492, + "grad_norm": 0.47347021102905273, + "learning_rate": 1.628026199045351e-05, + "loss": 0.3021, + "step": 15290 + }, + { + "epoch": 0.2838970307029106, + "grad_norm": 0.46110835671424866, + "learning_rate": 1.627935419379759e-05, + "loss": 0.3758, + "step": 15292 + }, + { + "epoch": 0.2839341608403293, + "grad_norm": 0.24924182891845703, + "learning_rate": 1.6278446311698467e-05, + "loss": 0.3331, + "step": 15294 + }, + { + "epoch": 0.2839712909777479, + "grad_norm": 0.40695130825042725, + "learning_rate": 1.62775383441685e-05, + "loss": 0.4265, + "step": 15296 + }, + { + "epoch": 0.28400842111516655, + "grad_norm": 0.3572111129760742, + "learning_rate": 1.6276630291220035e-05, + "loss": 0.3569, + "step": 15298 + }, + { + "epoch": 0.2840455512525852, + "grad_norm": 0.44562914967536926, + "learning_rate": 1.6275722152865436e-05, + "loss": 0.4146, + "step": 15300 + }, + { + "epoch": 0.2840826813900038, + "grad_norm": 0.5651805996894836, + "learning_rate": 1.6274813929117054e-05, + "loss": 0.3408, + "step": 15302 + }, + { + "epoch": 0.28411981152742244, + "grad_norm": 0.38320788741111755, + "learning_rate": 1.627390561998725e-05, + "loss": 0.4149, + "step": 15304 + }, + { + "epoch": 0.2841569416648411, + "grad_norm": 0.4090460538864136, + "learning_rate": 1.6272997225488383e-05, + "loss": 0.2655, + "step": 15306 + }, + { + "epoch": 0.28419407180225975, + "grad_norm": 0.4915003776550293, + "learning_rate": 1.6272088745632814e-05, + "loss": 0.282, + "step": 15308 + }, + { + "epoch": 0.2842312019396784, + "grad_norm": 0.3459673225879669, + "learning_rate": 1.6271180180432905e-05, + "loss": 0.2553, + "step": 15310 + }, + { + "epoch": 0.284268332077097, + "grad_norm": 0.36616796255111694, + "learning_rate": 1.627027152990102e-05, + "loss": 0.4398, + "step": 15312 + }, + { + "epoch": 0.28430546221451564, + "grad_norm": 0.43059292435646057, + "learning_rate": 1.626936279404952e-05, + "loss": 0.2593, + "step": 15314 + }, + { + "epoch": 0.2843425923519343, + "grad_norm": 0.40349170565605164, + "learning_rate": 1.6268453972890764e-05, + "loss": 0.4947, + "step": 15316 + }, + { + "epoch": 0.28437972248935295, + "grad_norm": 0.3066299259662628, + "learning_rate": 1.6267545066437134e-05, + "loss": 0.2513, + "step": 15318 + }, + { + "epoch": 0.2844168526267716, + "grad_norm": 0.5004720091819763, + "learning_rate": 1.6266636074700987e-05, + "loss": 0.143, + "step": 15320 + }, + { + "epoch": 0.2844539827641902, + "grad_norm": 0.26795145869255066, + "learning_rate": 1.626572699769469e-05, + "loss": 0.3567, + "step": 15322 + }, + { + "epoch": 0.28449111290160883, + "grad_norm": 0.35458195209503174, + "learning_rate": 1.6264817835430622e-05, + "loss": 0.2723, + "step": 15324 + }, + { + "epoch": 0.28452824303902746, + "grad_norm": 0.33366137742996216, + "learning_rate": 1.6263908587921148e-05, + "loss": 0.2204, + "step": 15326 + }, + { + "epoch": 0.28456537317644615, + "grad_norm": 0.33586347103118896, + "learning_rate": 1.6262999255178637e-05, + "loss": 0.295, + "step": 15328 + }, + { + "epoch": 0.2846025033138648, + "grad_norm": 0.2718474864959717, + "learning_rate": 1.626208983721547e-05, + "loss": 0.2439, + "step": 15330 + }, + { + "epoch": 0.2846396334512834, + "grad_norm": 0.33899179100990295, + "learning_rate": 1.6261180334044016e-05, + "loss": 0.2297, + "step": 15332 + }, + { + "epoch": 0.28467676358870203, + "grad_norm": 0.24895575642585754, + "learning_rate": 1.626027074567665e-05, + "loss": 0.3484, + "step": 15334 + }, + { + "epoch": 0.28471389372612066, + "grad_norm": 0.3588293194770813, + "learning_rate": 1.6259361072125756e-05, + "loss": 0.343, + "step": 15336 + }, + { + "epoch": 0.28475102386353934, + "grad_norm": 0.2956729233264923, + "learning_rate": 1.62584513134037e-05, + "loss": 0.3685, + "step": 15338 + }, + { + "epoch": 0.28478815400095797, + "grad_norm": 0.44835031032562256, + "learning_rate": 1.6257541469522872e-05, + "loss": 0.4142, + "step": 15340 + }, + { + "epoch": 0.2848252841383766, + "grad_norm": 0.4319595992565155, + "learning_rate": 1.6256631540495648e-05, + "loss": 0.3616, + "step": 15342 + }, + { + "epoch": 0.28486241427579523, + "grad_norm": 0.4029219150543213, + "learning_rate": 1.6255721526334407e-05, + "loss": 0.4293, + "step": 15344 + }, + { + "epoch": 0.28489954441321386, + "grad_norm": 0.38367873430252075, + "learning_rate": 1.6254811427051538e-05, + "loss": 0.3847, + "step": 15346 + }, + { + "epoch": 0.2849366745506325, + "grad_norm": 0.38078564405441284, + "learning_rate": 1.625390124265942e-05, + "loss": 0.2318, + "step": 15348 + }, + { + "epoch": 0.28497380468805117, + "grad_norm": 0.4418644607067108, + "learning_rate": 1.6252990973170435e-05, + "loss": 0.2504, + "step": 15350 + }, + { + "epoch": 0.2850109348254698, + "grad_norm": 0.3804689943790436, + "learning_rate": 1.6252080618596976e-05, + "loss": 0.3188, + "step": 15352 + }, + { + "epoch": 0.2850480649628884, + "grad_norm": 0.37130653858184814, + "learning_rate": 1.6251170178951423e-05, + "loss": 0.321, + "step": 15354 + }, + { + "epoch": 0.28508519510030705, + "grad_norm": 0.41873249411582947, + "learning_rate": 1.6250259654246172e-05, + "loss": 0.4018, + "step": 15356 + }, + { + "epoch": 0.2851223252377257, + "grad_norm": 0.33758434653282166, + "learning_rate": 1.624934904449361e-05, + "loss": 0.1856, + "step": 15358 + }, + { + "epoch": 0.28515945537514437, + "grad_norm": 0.517618715763092, + "learning_rate": 1.624843834970612e-05, + "loss": 0.2499, + "step": 15360 + }, + { + "epoch": 0.285196585512563, + "grad_norm": 0.4106998145580292, + "learning_rate": 1.624752756989611e-05, + "loss": 0.3723, + "step": 15362 + }, + { + "epoch": 0.2852337156499816, + "grad_norm": 0.5599865317344666, + "learning_rate": 1.6246616705075956e-05, + "loss": 0.3468, + "step": 15364 + }, + { + "epoch": 0.28527084578740025, + "grad_norm": 0.49578964710235596, + "learning_rate": 1.6245705755258062e-05, + "loss": 0.164, + "step": 15366 + }, + { + "epoch": 0.2853079759248189, + "grad_norm": 0.6233541369438171, + "learning_rate": 1.624479472045482e-05, + "loss": 0.314, + "step": 15368 + }, + { + "epoch": 0.28534510606223756, + "grad_norm": 0.544994592666626, + "learning_rate": 1.6243883600678628e-05, + "loss": 0.3274, + "step": 15370 + }, + { + "epoch": 0.2853822361996562, + "grad_norm": 0.4158276319503784, + "learning_rate": 1.6242972395941882e-05, + "loss": 0.2367, + "step": 15372 + }, + { + "epoch": 0.2854193663370748, + "grad_norm": 0.4828920066356659, + "learning_rate": 1.6242061106256985e-05, + "loss": 0.2873, + "step": 15374 + }, + { + "epoch": 0.28545649647449345, + "grad_norm": 0.43755224347114563, + "learning_rate": 1.6241149731636326e-05, + "loss": 0.2263, + "step": 15376 + }, + { + "epoch": 0.2854936266119121, + "grad_norm": 0.4212903380393982, + "learning_rate": 1.624023827209232e-05, + "loss": 0.4343, + "step": 15378 + }, + { + "epoch": 0.2855307567493307, + "grad_norm": 0.4520227313041687, + "learning_rate": 1.623932672763736e-05, + "loss": 0.2744, + "step": 15380 + }, + { + "epoch": 0.2855678868867494, + "grad_norm": 0.35532331466674805, + "learning_rate": 1.6238415098283853e-05, + "loss": 0.1519, + "step": 15382 + }, + { + "epoch": 0.285605017024168, + "grad_norm": 0.337658554315567, + "learning_rate": 1.6237503384044203e-05, + "loss": 0.3772, + "step": 15384 + }, + { + "epoch": 0.28564214716158665, + "grad_norm": 0.3435303866863251, + "learning_rate": 1.6236591584930814e-05, + "loss": 0.2648, + "step": 15386 + }, + { + "epoch": 0.2856792772990053, + "grad_norm": 0.30965641140937805, + "learning_rate": 1.62356797009561e-05, + "loss": 0.1044, + "step": 15388 + }, + { + "epoch": 0.2857164074364239, + "grad_norm": 0.3630635738372803, + "learning_rate": 1.6234767732132456e-05, + "loss": 0.2765, + "step": 15390 + }, + { + "epoch": 0.2857535375738426, + "grad_norm": 0.3090052902698517, + "learning_rate": 1.62338556784723e-05, + "loss": 0.184, + "step": 15392 + }, + { + "epoch": 0.2857906677112612, + "grad_norm": 0.3754655420780182, + "learning_rate": 1.6232943539988043e-05, + "loss": 0.2981, + "step": 15394 + }, + { + "epoch": 0.28582779784867984, + "grad_norm": 0.30027732253074646, + "learning_rate": 1.623203131669209e-05, + "loss": 0.219, + "step": 15396 + }, + { + "epoch": 0.28586492798609847, + "grad_norm": 0.3603162467479706, + "learning_rate": 1.6231119008596862e-05, + "loss": 0.3673, + "step": 15398 + }, + { + "epoch": 0.2859020581235171, + "grad_norm": 0.35720306634902954, + "learning_rate": 1.6230206615714767e-05, + "loss": 0.2093, + "step": 15400 + }, + { + "epoch": 0.28593918826093573, + "grad_norm": 0.26947757601737976, + "learning_rate": 1.6229294138058223e-05, + "loss": 0.2292, + "step": 15402 + }, + { + "epoch": 0.2859763183983544, + "grad_norm": 0.5669321417808533, + "learning_rate": 1.6228381575639644e-05, + "loss": 0.2901, + "step": 15404 + }, + { + "epoch": 0.28601344853577304, + "grad_norm": 0.5515179634094238, + "learning_rate": 1.622746892847145e-05, + "loss": 0.2241, + "step": 15406 + }, + { + "epoch": 0.28605057867319167, + "grad_norm": 0.3312181830406189, + "learning_rate": 1.622655619656605e-05, + "loss": 0.4145, + "step": 15408 + }, + { + "epoch": 0.2860877088106103, + "grad_norm": 0.3909747898578644, + "learning_rate": 1.622564337993588e-05, + "loss": 0.4107, + "step": 15410 + }, + { + "epoch": 0.2861248389480289, + "grad_norm": 0.4423218369483948, + "learning_rate": 1.6224730478593347e-05, + "loss": 0.4942, + "step": 15412 + }, + { + "epoch": 0.2861619690854476, + "grad_norm": 0.4754064381122589, + "learning_rate": 1.6223817492550877e-05, + "loss": 0.3382, + "step": 15414 + }, + { + "epoch": 0.28619909922286624, + "grad_norm": 0.37255749106407166, + "learning_rate": 1.6222904421820894e-05, + "loss": 0.4644, + "step": 15416 + }, + { + "epoch": 0.28623622936028487, + "grad_norm": 0.3786499798297882, + "learning_rate": 1.6221991266415823e-05, + "loss": 0.4033, + "step": 15418 + }, + { + "epoch": 0.2862733594977035, + "grad_norm": 0.36177828907966614, + "learning_rate": 1.6221078026348087e-05, + "loss": 0.3115, + "step": 15420 + }, + { + "epoch": 0.2863104896351221, + "grad_norm": 0.4462338089942932, + "learning_rate": 1.6220164701630112e-05, + "loss": 0.1892, + "step": 15422 + }, + { + "epoch": 0.28634761977254075, + "grad_norm": 0.3418540060520172, + "learning_rate": 1.621925129227433e-05, + "loss": 0.1989, + "step": 15424 + }, + { + "epoch": 0.28638474990995944, + "grad_norm": 0.5730937123298645, + "learning_rate": 1.6218337798293165e-05, + "loss": 0.3572, + "step": 15426 + }, + { + "epoch": 0.28642188004737806, + "grad_norm": 0.44200706481933594, + "learning_rate": 1.6217424219699046e-05, + "loss": 0.2889, + "step": 15428 + }, + { + "epoch": 0.2864590101847967, + "grad_norm": 0.46696531772613525, + "learning_rate": 1.621651055650441e-05, + "loss": 0.3177, + "step": 15430 + }, + { + "epoch": 0.2864961403222153, + "grad_norm": 0.4467834234237671, + "learning_rate": 1.6215596808721684e-05, + "loss": 0.3039, + "step": 15432 + }, + { + "epoch": 0.28653327045963395, + "grad_norm": 0.2603677809238434, + "learning_rate": 1.6214682976363304e-05, + "loss": 0.3223, + "step": 15434 + }, + { + "epoch": 0.28657040059705263, + "grad_norm": 0.3614133596420288, + "learning_rate": 1.6213769059441705e-05, + "loss": 0.1018, + "step": 15436 + }, + { + "epoch": 0.28660753073447126, + "grad_norm": 0.3147931396961212, + "learning_rate": 1.6212855057969324e-05, + "loss": 0.4178, + "step": 15438 + }, + { + "epoch": 0.2866446608718899, + "grad_norm": 0.422879695892334, + "learning_rate": 1.621194097195859e-05, + "loss": 0.2894, + "step": 15440 + }, + { + "epoch": 0.2866817910093085, + "grad_norm": 0.4944377839565277, + "learning_rate": 1.6211026801421947e-05, + "loss": 0.2128, + "step": 15442 + }, + { + "epoch": 0.28671892114672715, + "grad_norm": 0.34172323346138, + "learning_rate": 1.6210112546371835e-05, + "loss": 0.2334, + "step": 15444 + }, + { + "epoch": 0.28675605128414583, + "grad_norm": 0.3031104803085327, + "learning_rate": 1.6209198206820694e-05, + "loss": 0.1632, + "step": 15446 + }, + { + "epoch": 0.28679318142156446, + "grad_norm": 0.42268919944763184, + "learning_rate": 1.6208283782780964e-05, + "loss": 0.2931, + "step": 15448 + }, + { + "epoch": 0.2868303115589831, + "grad_norm": 0.5924049019813538, + "learning_rate": 1.6207369274265086e-05, + "loss": 0.3113, + "step": 15450 + }, + { + "epoch": 0.2868674416964017, + "grad_norm": 0.35454505681991577, + "learning_rate": 1.6206454681285505e-05, + "loss": 0.3643, + "step": 15452 + }, + { + "epoch": 0.28690457183382034, + "grad_norm": 0.37803372740745544, + "learning_rate": 1.6205540003854666e-05, + "loss": 0.23, + "step": 15454 + }, + { + "epoch": 0.28694170197123897, + "grad_norm": 0.44532960653305054, + "learning_rate": 1.6204625241985017e-05, + "loss": 0.3562, + "step": 15456 + }, + { + "epoch": 0.28697883210865766, + "grad_norm": 0.4188586473464966, + "learning_rate": 1.6203710395689005e-05, + "loss": 0.3087, + "step": 15458 + }, + { + "epoch": 0.2870159622460763, + "grad_norm": 0.3082582652568817, + "learning_rate": 1.620279546497907e-05, + "loss": 0.2921, + "step": 15460 + }, + { + "epoch": 0.2870530923834949, + "grad_norm": 0.200359508395195, + "learning_rate": 1.6201880449867676e-05, + "loss": 0.2381, + "step": 15462 + }, + { + "epoch": 0.28709022252091354, + "grad_norm": 0.2680836319923401, + "learning_rate": 1.620096535036726e-05, + "loss": 0.2656, + "step": 15464 + }, + { + "epoch": 0.28712735265833217, + "grad_norm": 0.43087753653526306, + "learning_rate": 1.6200050166490283e-05, + "loss": 0.197, + "step": 15466 + }, + { + "epoch": 0.28716448279575085, + "grad_norm": 0.49592265486717224, + "learning_rate": 1.6199134898249193e-05, + "loss": 0.1827, + "step": 15468 + }, + { + "epoch": 0.2872016129331695, + "grad_norm": 0.4730733036994934, + "learning_rate": 1.6198219545656448e-05, + "loss": 0.3691, + "step": 15470 + }, + { + "epoch": 0.2872387430705881, + "grad_norm": 0.37694552540779114, + "learning_rate": 1.6197304108724498e-05, + "loss": 0.3427, + "step": 15472 + }, + { + "epoch": 0.28727587320800674, + "grad_norm": 0.4769977033138275, + "learning_rate": 1.6196388587465803e-05, + "loss": 0.2563, + "step": 15474 + }, + { + "epoch": 0.28731300334542537, + "grad_norm": 0.3011767566204071, + "learning_rate": 1.619547298189282e-05, + "loss": 0.3026, + "step": 15476 + }, + { + "epoch": 0.287350133482844, + "grad_norm": 0.31554466485977173, + "learning_rate": 1.619455729201801e-05, + "loss": 0.3056, + "step": 15478 + }, + { + "epoch": 0.2873872636202627, + "grad_norm": 0.2843112647533417, + "learning_rate": 1.6193641517853825e-05, + "loss": 0.2372, + "step": 15480 + }, + { + "epoch": 0.2874243937576813, + "grad_norm": 0.35560521483421326, + "learning_rate": 1.6192725659412734e-05, + "loss": 0.6194, + "step": 15482 + }, + { + "epoch": 0.28746152389509994, + "grad_norm": 0.306471586227417, + "learning_rate": 1.6191809716707195e-05, + "loss": 0.2463, + "step": 15484 + }, + { + "epoch": 0.28749865403251856, + "grad_norm": 0.24490579962730408, + "learning_rate": 1.6190893689749675e-05, + "loss": 0.3032, + "step": 15486 + }, + { + "epoch": 0.2875357841699372, + "grad_norm": 0.3124704658985138, + "learning_rate": 1.618997757855263e-05, + "loss": 0.3571, + "step": 15488 + }, + { + "epoch": 0.2875729143073559, + "grad_norm": 0.34375661611557007, + "learning_rate": 1.6189061383128537e-05, + "loss": 0.2745, + "step": 15490 + }, + { + "epoch": 0.2876100444447745, + "grad_norm": 0.34555792808532715, + "learning_rate": 1.6188145103489853e-05, + "loss": 0.2492, + "step": 15492 + }, + { + "epoch": 0.28764717458219313, + "grad_norm": 0.36836835741996765, + "learning_rate": 1.6187228739649054e-05, + "loss": 0.3939, + "step": 15494 + }, + { + "epoch": 0.28768430471961176, + "grad_norm": 0.36780625581741333, + "learning_rate": 1.61863122916186e-05, + "loss": 0.1897, + "step": 15496 + }, + { + "epoch": 0.2877214348570304, + "grad_norm": 0.41419675946235657, + "learning_rate": 1.618539575941097e-05, + "loss": 0.2899, + "step": 15498 + }, + { + "epoch": 0.287758564994449, + "grad_norm": 0.3153773248195648, + "learning_rate": 1.6184479143038633e-05, + "loss": 0.3534, + "step": 15500 + }, + { + "epoch": 0.2877956951318677, + "grad_norm": 0.3480892777442932, + "learning_rate": 1.6183562442514055e-05, + "loss": 0.2988, + "step": 15502 + }, + { + "epoch": 0.28783282526928633, + "grad_norm": 0.31886228919029236, + "learning_rate": 1.6182645657849714e-05, + "loss": 0.3502, + "step": 15504 + }, + { + "epoch": 0.28786995540670496, + "grad_norm": 0.4016492962837219, + "learning_rate": 1.6181728789058087e-05, + "loss": 0.3568, + "step": 15506 + }, + { + "epoch": 0.2879070855441236, + "grad_norm": 0.49086523056030273, + "learning_rate": 1.6180811836151643e-05, + "loss": 0.1567, + "step": 15508 + }, + { + "epoch": 0.2879442156815422, + "grad_norm": 0.41445788741111755, + "learning_rate": 1.6179894799142868e-05, + "loss": 0.3235, + "step": 15510 + }, + { + "epoch": 0.2879813458189609, + "grad_norm": 0.25982508063316345, + "learning_rate": 1.6178977678044236e-05, + "loss": 0.2999, + "step": 15512 + }, + { + "epoch": 0.2880184759563795, + "grad_norm": 0.2904912829399109, + "learning_rate": 1.6178060472868222e-05, + "loss": 0.2873, + "step": 15514 + }, + { + "epoch": 0.28805560609379816, + "grad_norm": 0.2471446692943573, + "learning_rate": 1.6177143183627313e-05, + "loss": 0.3689, + "step": 15516 + }, + { + "epoch": 0.2880927362312168, + "grad_norm": 0.23567642271518707, + "learning_rate": 1.617622581033399e-05, + "loss": 0.4435, + "step": 15518 + }, + { + "epoch": 0.2881298663686354, + "grad_norm": 0.285057008266449, + "learning_rate": 1.6175308353000733e-05, + "loss": 0.1573, + "step": 15520 + }, + { + "epoch": 0.2881669965060541, + "grad_norm": 0.43118301033973694, + "learning_rate": 1.6174390811640023e-05, + "loss": 0.4376, + "step": 15522 + }, + { + "epoch": 0.2882041266434727, + "grad_norm": 0.3691777288913727, + "learning_rate": 1.6173473186264347e-05, + "loss": 0.372, + "step": 15524 + }, + { + "epoch": 0.28824125678089135, + "grad_norm": 0.4879799485206604, + "learning_rate": 1.61725554768862e-05, + "loss": 0.4235, + "step": 15526 + }, + { + "epoch": 0.28827838691831, + "grad_norm": 0.21953199803829193, + "learning_rate": 1.617163768351806e-05, + "loss": 0.3363, + "step": 15528 + }, + { + "epoch": 0.2883155170557286, + "grad_norm": 0.3726438879966736, + "learning_rate": 1.6170719806172413e-05, + "loss": 0.562, + "step": 15530 + }, + { + "epoch": 0.28835264719314724, + "grad_norm": 0.39426764845848083, + "learning_rate": 1.6169801844861756e-05, + "loss": 0.2275, + "step": 15532 + }, + { + "epoch": 0.2883897773305659, + "grad_norm": 0.5270881652832031, + "learning_rate": 1.6168883799598578e-05, + "loss": 0.4719, + "step": 15534 + }, + { + "epoch": 0.28842690746798455, + "grad_norm": 0.5495846271514893, + "learning_rate": 1.6167965670395366e-05, + "loss": 0.177, + "step": 15536 + }, + { + "epoch": 0.2884640376054032, + "grad_norm": 0.3154410719871521, + "learning_rate": 1.616704745726462e-05, + "loss": 0.2957, + "step": 15538 + }, + { + "epoch": 0.2885011677428218, + "grad_norm": 0.36117202043533325, + "learning_rate": 1.6166129160218823e-05, + "loss": 0.2326, + "step": 15540 + }, + { + "epoch": 0.28853829788024044, + "grad_norm": 0.7620652914047241, + "learning_rate": 1.6165210779270487e-05, + "loss": 0.3344, + "step": 15542 + }, + { + "epoch": 0.2885754280176591, + "grad_norm": 0.23021607100963593, + "learning_rate": 1.6164292314432096e-05, + "loss": 0.0988, + "step": 15544 + }, + { + "epoch": 0.28861255815507775, + "grad_norm": 0.40819084644317627, + "learning_rate": 1.616337376571615e-05, + "loss": 0.1714, + "step": 15546 + }, + { + "epoch": 0.2886496882924964, + "grad_norm": 0.3554391860961914, + "learning_rate": 1.6162455133135148e-05, + "loss": 0.3991, + "step": 15548 + }, + { + "epoch": 0.288686818429915, + "grad_norm": 0.24647042155265808, + "learning_rate": 1.616153641670159e-05, + "loss": 0.3449, + "step": 15550 + }, + { + "epoch": 0.28872394856733363, + "grad_norm": 0.797376811504364, + "learning_rate": 1.616061761642798e-05, + "loss": 0.3413, + "step": 15552 + }, + { + "epoch": 0.28876107870475226, + "grad_norm": 0.4246409237384796, + "learning_rate": 1.6159698732326815e-05, + "loss": 0.3481, + "step": 15554 + }, + { + "epoch": 0.28879820884217094, + "grad_norm": 0.3307946026325226, + "learning_rate": 1.61587797644106e-05, + "loss": 0.3762, + "step": 15556 + }, + { + "epoch": 0.2888353389795896, + "grad_norm": 0.32959240674972534, + "learning_rate": 1.6157860712691845e-05, + "loss": 0.4313, + "step": 15558 + }, + { + "epoch": 0.2888724691170082, + "grad_norm": 0.45044249296188354, + "learning_rate": 1.6156941577183043e-05, + "loss": 0.2394, + "step": 15560 + }, + { + "epoch": 0.28890959925442683, + "grad_norm": 0.3602319061756134, + "learning_rate": 1.6156022357896715e-05, + "loss": 0.2119, + "step": 15562 + }, + { + "epoch": 0.28894672939184546, + "grad_norm": 0.30920371413230896, + "learning_rate": 1.6155103054845362e-05, + "loss": 0.1847, + "step": 15564 + }, + { + "epoch": 0.28898385952926414, + "grad_norm": 0.39376068115234375, + "learning_rate": 1.6154183668041484e-05, + "loss": 0.2678, + "step": 15566 + }, + { + "epoch": 0.28902098966668277, + "grad_norm": 0.3980129361152649, + "learning_rate": 1.615326419749761e-05, + "loss": 0.2494, + "step": 15568 + }, + { + "epoch": 0.2890581198041014, + "grad_norm": 0.3452844023704529, + "learning_rate": 1.6152344643226237e-05, + "loss": 0.0795, + "step": 15570 + }, + { + "epoch": 0.28909524994152, + "grad_norm": 0.45482882857322693, + "learning_rate": 1.6151425005239886e-05, + "loss": 0.2772, + "step": 15572 + }, + { + "epoch": 0.28913238007893866, + "grad_norm": 0.336314857006073, + "learning_rate": 1.615050528355106e-05, + "loss": 0.2172, + "step": 15574 + }, + { + "epoch": 0.2891695102163573, + "grad_norm": 0.3411087691783905, + "learning_rate": 1.614958547817229e-05, + "loss": 0.3079, + "step": 15576 + }, + { + "epoch": 0.28920664035377597, + "grad_norm": 0.2088017612695694, + "learning_rate": 1.614866558911607e-05, + "loss": 0.4252, + "step": 15578 + }, + { + "epoch": 0.2892437704911946, + "grad_norm": 0.4121551215648651, + "learning_rate": 1.6147745616394933e-05, + "loss": 0.2619, + "step": 15580 + }, + { + "epoch": 0.2892809006286132, + "grad_norm": 0.42888790369033813, + "learning_rate": 1.6146825560021397e-05, + "loss": 0.2323, + "step": 15582 + }, + { + "epoch": 0.28931803076603185, + "grad_norm": 0.4210417866706848, + "learning_rate": 1.6145905420007973e-05, + "loss": 0.3908, + "step": 15584 + }, + { + "epoch": 0.2893551609034505, + "grad_norm": 0.34588950872421265, + "learning_rate": 1.6144985196367185e-05, + "loss": 0.3081, + "step": 15586 + }, + { + "epoch": 0.28939229104086917, + "grad_norm": 0.37002184987068176, + "learning_rate": 1.6144064889111557e-05, + "loss": 0.2208, + "step": 15588 + }, + { + "epoch": 0.2894294211782878, + "grad_norm": 0.33495160937309265, + "learning_rate": 1.6143144498253608e-05, + "loss": 0.0921, + "step": 15590 + }, + { + "epoch": 0.2894665513157064, + "grad_norm": 0.400707870721817, + "learning_rate": 1.6142224023805863e-05, + "loss": 0.3751, + "step": 15592 + }, + { + "epoch": 0.28950368145312505, + "grad_norm": 0.43633151054382324, + "learning_rate": 1.614130346578085e-05, + "loss": 0.3835, + "step": 15594 + }, + { + "epoch": 0.2895408115905437, + "grad_norm": 0.5015726089477539, + "learning_rate": 1.614038282419109e-05, + "loss": 0.4262, + "step": 15596 + }, + { + "epoch": 0.28957794172796236, + "grad_norm": 0.45344147086143494, + "learning_rate": 1.6139462099049112e-05, + "loss": 0.3551, + "step": 15598 + }, + { + "epoch": 0.289615071865381, + "grad_norm": 0.3031162619590759, + "learning_rate": 1.6138541290367445e-05, + "loss": 0.4266, + "step": 15600 + }, + { + "epoch": 0.2896522020027996, + "grad_norm": 0.3173650801181793, + "learning_rate": 1.613762039815862e-05, + "loss": 0.3103, + "step": 15602 + }, + { + "epoch": 0.28968933214021825, + "grad_norm": 0.2842457592487335, + "learning_rate": 1.613669942243516e-05, + "loss": 0.1897, + "step": 15604 + }, + { + "epoch": 0.2897264622776369, + "grad_norm": 2.010040760040283, + "learning_rate": 1.6135778363209607e-05, + "loss": 0.2242, + "step": 15606 + }, + { + "epoch": 0.2897635924150555, + "grad_norm": 0.4785624146461487, + "learning_rate": 1.613485722049449e-05, + "loss": 0.2185, + "step": 15608 + }, + { + "epoch": 0.2898007225524742, + "grad_norm": 0.36233460903167725, + "learning_rate": 1.6133935994302338e-05, + "loss": 0.2856, + "step": 15610 + }, + { + "epoch": 0.2898378526898928, + "grad_norm": 0.385520339012146, + "learning_rate": 1.6133014684645694e-05, + "loss": 0.4, + "step": 15612 + }, + { + "epoch": 0.28987498282731144, + "grad_norm": 0.25901177525520325, + "learning_rate": 1.613209329153709e-05, + "loss": 0.2829, + "step": 15614 + }, + { + "epoch": 0.2899121129647301, + "grad_norm": 0.4623892307281494, + "learning_rate": 1.6131171814989064e-05, + "loss": 0.4186, + "step": 15616 + }, + { + "epoch": 0.2899492431021487, + "grad_norm": 0.2792411148548126, + "learning_rate": 1.6130250255014153e-05, + "loss": 0.3546, + "step": 15618 + }, + { + "epoch": 0.2899863732395674, + "grad_norm": 0.3469361662864685, + "learning_rate": 1.61293286116249e-05, + "loss": 0.194, + "step": 15620 + }, + { + "epoch": 0.290023503376986, + "grad_norm": 0.4245806932449341, + "learning_rate": 1.6128406884833845e-05, + "loss": 0.332, + "step": 15622 + }, + { + "epoch": 0.29006063351440464, + "grad_norm": 0.29889777302742004, + "learning_rate": 1.6127485074653527e-05, + "loss": 0.3547, + "step": 15624 + }, + { + "epoch": 0.29009776365182327, + "grad_norm": 0.36889442801475525, + "learning_rate": 1.612656318109649e-05, + "loss": 0.2949, + "step": 15626 + }, + { + "epoch": 0.2901348937892419, + "grad_norm": 0.49445563554763794, + "learning_rate": 1.6125641204175282e-05, + "loss": 0.3758, + "step": 15628 + }, + { + "epoch": 0.2901720239266605, + "grad_norm": 0.38085517287254333, + "learning_rate": 1.6124719143902445e-05, + "loss": 0.5218, + "step": 15630 + }, + { + "epoch": 0.2902091540640792, + "grad_norm": 0.34607717394828796, + "learning_rate": 1.6123797000290524e-05, + "loss": 0.3529, + "step": 15632 + }, + { + "epoch": 0.29024628420149784, + "grad_norm": 0.36897605657577515, + "learning_rate": 1.6122874773352072e-05, + "loss": 0.3134, + "step": 15634 + }, + { + "epoch": 0.29028341433891647, + "grad_norm": 0.2585378885269165, + "learning_rate": 1.612195246309963e-05, + "loss": 0.3037, + "step": 15636 + }, + { + "epoch": 0.2903205444763351, + "grad_norm": 0.45234742760658264, + "learning_rate": 1.6121030069545758e-05, + "loss": 0.2496, + "step": 15638 + }, + { + "epoch": 0.2903576746137537, + "grad_norm": 0.37002167105674744, + "learning_rate": 1.6120107592702998e-05, + "loss": 0.3083, + "step": 15640 + }, + { + "epoch": 0.2903948047511724, + "grad_norm": 0.6520326733589172, + "learning_rate": 1.611918503258391e-05, + "loss": 0.2581, + "step": 15642 + }, + { + "epoch": 0.29043193488859104, + "grad_norm": 0.3902058005332947, + "learning_rate": 1.6118262389201035e-05, + "loss": 0.0838, + "step": 15644 + }, + { + "epoch": 0.29046906502600967, + "grad_norm": 0.48861244320869446, + "learning_rate": 1.611733966256694e-05, + "loss": 0.3132, + "step": 15646 + }, + { + "epoch": 0.2905061951634283, + "grad_norm": 0.37170565128326416, + "learning_rate": 1.6116416852694174e-05, + "loss": 0.187, + "step": 15648 + }, + { + "epoch": 0.2905433253008469, + "grad_norm": 0.5379403829574585, + "learning_rate": 1.6115493959595297e-05, + "loss": 0.4199, + "step": 15650 + }, + { + "epoch": 0.29058045543826555, + "grad_norm": 0.4162154197692871, + "learning_rate": 1.6114570983282863e-05, + "loss": 0.25, + "step": 15652 + }, + { + "epoch": 0.29061758557568423, + "grad_norm": 0.3742355406284332, + "learning_rate": 1.611364792376944e-05, + "loss": 0.3921, + "step": 15654 + }, + { + "epoch": 0.29065471571310286, + "grad_norm": 0.46915262937545776, + "learning_rate": 1.611272478106757e-05, + "loss": 0.3619, + "step": 15656 + }, + { + "epoch": 0.2906918458505215, + "grad_norm": 0.5767585039138794, + "learning_rate": 1.6111801555189832e-05, + "loss": 0.3113, + "step": 15658 + }, + { + "epoch": 0.2907289759879401, + "grad_norm": 0.3501065969467163, + "learning_rate": 1.6110878246148782e-05, + "loss": 0.2739, + "step": 15660 + }, + { + "epoch": 0.29076610612535875, + "grad_norm": 0.4263308644294739, + "learning_rate": 1.6109954853956985e-05, + "loss": 0.3078, + "step": 15662 + }, + { + "epoch": 0.29080323626277743, + "grad_norm": 0.5071778893470764, + "learning_rate": 1.6109031378627e-05, + "loss": 0.2069, + "step": 15664 + }, + { + "epoch": 0.29084036640019606, + "grad_norm": 0.4627779424190521, + "learning_rate": 1.6108107820171396e-05, + "loss": 0.459, + "step": 15666 + }, + { + "epoch": 0.2908774965376147, + "grad_norm": 0.4224827289581299, + "learning_rate": 1.6107184178602745e-05, + "loss": 0.1784, + "step": 15668 + }, + { + "epoch": 0.2909146266750333, + "grad_norm": 0.3111225664615631, + "learning_rate": 1.6106260453933606e-05, + "loss": 0.4632, + "step": 15670 + }, + { + "epoch": 0.29095175681245194, + "grad_norm": 0.33872827887535095, + "learning_rate": 1.6105336646176555e-05, + "loss": 0.2122, + "step": 15672 + }, + { + "epoch": 0.29098888694987063, + "grad_norm": 0.34843072295188904, + "learning_rate": 1.6104412755344162e-05, + "loss": 0.2098, + "step": 15674 + }, + { + "epoch": 0.29102601708728926, + "grad_norm": 0.8168874382972717, + "learning_rate": 1.6103488781448992e-05, + "loss": 0.2519, + "step": 15676 + }, + { + "epoch": 0.2910631472247079, + "grad_norm": 0.3109864592552185, + "learning_rate": 1.6102564724503623e-05, + "loss": 0.3273, + "step": 15678 + }, + { + "epoch": 0.2911002773621265, + "grad_norm": 0.4328574240207672, + "learning_rate": 1.6101640584520626e-05, + "loss": 0.2864, + "step": 15680 + }, + { + "epoch": 0.29113740749954514, + "grad_norm": 0.352992445230484, + "learning_rate": 1.610071636151258e-05, + "loss": 0.2614, + "step": 15682 + }, + { + "epoch": 0.29117453763696377, + "grad_norm": 0.7581146359443665, + "learning_rate": 1.609979205549206e-05, + "loss": 0.3957, + "step": 15684 + }, + { + "epoch": 0.29121166777438245, + "grad_norm": 0.36366379261016846, + "learning_rate": 1.6098867666471634e-05, + "loss": 0.3672, + "step": 15686 + }, + { + "epoch": 0.2912487979118011, + "grad_norm": 0.4659481942653656, + "learning_rate": 1.6097943194463892e-05, + "loss": 0.3018, + "step": 15688 + }, + { + "epoch": 0.2912859280492197, + "grad_norm": 0.4258236885070801, + "learning_rate": 1.609701863948141e-05, + "loss": 0.439, + "step": 15690 + }, + { + "epoch": 0.29132305818663834, + "grad_norm": 0.41472896933555603, + "learning_rate": 1.6096094001536765e-05, + "loss": 0.2013, + "step": 15692 + }, + { + "epoch": 0.29136018832405697, + "grad_norm": 0.33130306005477905, + "learning_rate": 1.6095169280642537e-05, + "loss": 0.2435, + "step": 15694 + }, + { + "epoch": 0.29139731846147565, + "grad_norm": 0.39732077717781067, + "learning_rate": 1.609424447681132e-05, + "loss": 0.2237, + "step": 15696 + }, + { + "epoch": 0.2914344485988943, + "grad_norm": 0.3854469954967499, + "learning_rate": 1.6093319590055686e-05, + "loss": 0.2369, + "step": 15698 + }, + { + "epoch": 0.2914715787363129, + "grad_norm": 0.24874833226203918, + "learning_rate": 1.609239462038822e-05, + "loss": 0.2831, + "step": 15700 + }, + { + "epoch": 0.29150870887373154, + "grad_norm": 0.2649223208427429, + "learning_rate": 1.6091469567821518e-05, + "loss": 0.2141, + "step": 15702 + }, + { + "epoch": 0.29154583901115017, + "grad_norm": 0.39283373951911926, + "learning_rate": 1.609054443236816e-05, + "loss": 0.4789, + "step": 15704 + }, + { + "epoch": 0.2915829691485688, + "grad_norm": 0.38399532437324524, + "learning_rate": 1.608961921404073e-05, + "loss": 0.2576, + "step": 15706 + }, + { + "epoch": 0.2916200992859875, + "grad_norm": 0.3801678717136383, + "learning_rate": 1.608869391285183e-05, + "loss": 0.3431, + "step": 15708 + }, + { + "epoch": 0.2916572294234061, + "grad_norm": 0.5046018958091736, + "learning_rate": 1.608776852881404e-05, + "loss": 0.2838, + "step": 15710 + }, + { + "epoch": 0.29169435956082473, + "grad_norm": 0.36326077580451965, + "learning_rate": 1.608684306193996e-05, + "loss": 0.3495, + "step": 15712 + }, + { + "epoch": 0.29173148969824336, + "grad_norm": 0.4390855133533478, + "learning_rate": 1.6085917512242173e-05, + "loss": 0.3133, + "step": 15714 + }, + { + "epoch": 0.291768619835662, + "grad_norm": 0.23382356762886047, + "learning_rate": 1.608499187973328e-05, + "loss": 0.3046, + "step": 15716 + }, + { + "epoch": 0.2918057499730807, + "grad_norm": 0.3334776759147644, + "learning_rate": 1.6084066164425874e-05, + "loss": 0.2719, + "step": 15718 + }, + { + "epoch": 0.2918428801104993, + "grad_norm": 0.2548871338367462, + "learning_rate": 1.6083140366332552e-05, + "loss": 0.2798, + "step": 15720 + }, + { + "epoch": 0.29188001024791793, + "grad_norm": 0.42713379859924316, + "learning_rate": 1.608221448546591e-05, + "loss": 0.2003, + "step": 15722 + }, + { + "epoch": 0.29191714038533656, + "grad_norm": 0.3050050139427185, + "learning_rate": 1.6081288521838546e-05, + "loss": 0.2153, + "step": 15724 + }, + { + "epoch": 0.2919542705227552, + "grad_norm": 0.4136345088481903, + "learning_rate": 1.608036247546306e-05, + "loss": 0.3352, + "step": 15726 + }, + { + "epoch": 0.2919914006601738, + "grad_norm": 0.3212653398513794, + "learning_rate": 1.607943634635206e-05, + "loss": 0.4001, + "step": 15728 + }, + { + "epoch": 0.2920285307975925, + "grad_norm": 0.5238571166992188, + "learning_rate": 1.6078510134518132e-05, + "loss": 0.3439, + "step": 15730 + }, + { + "epoch": 0.29206566093501113, + "grad_norm": 0.2564865052700043, + "learning_rate": 1.6077583839973897e-05, + "loss": 0.1781, + "step": 15732 + }, + { + "epoch": 0.29210279107242976, + "grad_norm": 0.41845089197158813, + "learning_rate": 1.6076657462731945e-05, + "loss": 0.2731, + "step": 15734 + }, + { + "epoch": 0.2921399212098484, + "grad_norm": 0.42702433466911316, + "learning_rate": 1.607573100280489e-05, + "loss": 0.4761, + "step": 15736 + }, + { + "epoch": 0.292177051347267, + "grad_norm": 0.25510188937187195, + "learning_rate": 1.6074804460205335e-05, + "loss": 0.2431, + "step": 15738 + }, + { + "epoch": 0.2922141814846857, + "grad_norm": 0.3392500877380371, + "learning_rate": 1.6073877834945884e-05, + "loss": 0.172, + "step": 15740 + }, + { + "epoch": 0.2922513116221043, + "grad_norm": 0.30395883321762085, + "learning_rate": 1.607295112703915e-05, + "loss": 0.1531, + "step": 15742 + }, + { + "epoch": 0.29228844175952295, + "grad_norm": 0.3760819435119629, + "learning_rate": 1.607202433649774e-05, + "loss": 0.3712, + "step": 15744 + }, + { + "epoch": 0.2923255718969416, + "grad_norm": 0.3539535701274872, + "learning_rate": 1.607109746333427e-05, + "loss": 0.4087, + "step": 15746 + }, + { + "epoch": 0.2923627020343602, + "grad_norm": 0.5151153206825256, + "learning_rate": 1.6070170507561348e-05, + "loss": 0.3473, + "step": 15748 + }, + { + "epoch": 0.2923998321717789, + "grad_norm": 0.40087592601776123, + "learning_rate": 1.6069243469191586e-05, + "loss": 0.2614, + "step": 15750 + }, + { + "epoch": 0.2924369623091975, + "grad_norm": 0.45849931240081787, + "learning_rate": 1.6068316348237603e-05, + "loss": 0.3205, + "step": 15752 + }, + { + "epoch": 0.29247409244661615, + "grad_norm": 0.33599889278411865, + "learning_rate": 1.6067389144712013e-05, + "loss": 0.248, + "step": 15754 + }, + { + "epoch": 0.2925112225840348, + "grad_norm": 0.25124552845954895, + "learning_rate": 1.606646185862742e-05, + "loss": 0.129, + "step": 15756 + }, + { + "epoch": 0.2925483527214534, + "grad_norm": 0.3443413972854614, + "learning_rate": 1.6065534489996465e-05, + "loss": 0.2186, + "step": 15758 + }, + { + "epoch": 0.29258548285887204, + "grad_norm": 0.4380885362625122, + "learning_rate": 1.606460703883175e-05, + "loss": 0.4973, + "step": 15760 + }, + { + "epoch": 0.2926226129962907, + "grad_norm": 0.3385966122150421, + "learning_rate": 1.6063679505145892e-05, + "loss": 0.1657, + "step": 15762 + }, + { + "epoch": 0.29265974313370935, + "grad_norm": 0.4208071529865265, + "learning_rate": 1.6062751888951528e-05, + "loss": 0.2736, + "step": 15764 + }, + { + "epoch": 0.292696873271128, + "grad_norm": 0.38226985931396484, + "learning_rate": 1.6061824190261264e-05, + "loss": 0.4101, + "step": 15766 + }, + { + "epoch": 0.2927340034085466, + "grad_norm": 0.45783621072769165, + "learning_rate": 1.606089640908773e-05, + "loss": 0.3603, + "step": 15768 + }, + { + "epoch": 0.29277113354596523, + "grad_norm": 0.5467211008071899, + "learning_rate": 1.6059968545443557e-05, + "loss": 0.3058, + "step": 15770 + }, + { + "epoch": 0.2928082636833839, + "grad_norm": 0.2641834616661072, + "learning_rate": 1.6059040599341356e-05, + "loss": 0.3768, + "step": 15772 + }, + { + "epoch": 0.29284539382080255, + "grad_norm": 0.47348564863204956, + "learning_rate": 1.6058112570793766e-05, + "loss": 0.4562, + "step": 15774 + }, + { + "epoch": 0.2928825239582212, + "grad_norm": 0.315066933631897, + "learning_rate": 1.605718445981341e-05, + "loss": 0.2777, + "step": 15776 + }, + { + "epoch": 0.2929196540956398, + "grad_norm": 0.47561630606651306, + "learning_rate": 1.6056256266412916e-05, + "loss": 0.1049, + "step": 15778 + }, + { + "epoch": 0.29295678423305843, + "grad_norm": 0.5746839642524719, + "learning_rate": 1.6055327990604916e-05, + "loss": 0.3523, + "step": 15780 + }, + { + "epoch": 0.29299391437047706, + "grad_norm": 0.42559921741485596, + "learning_rate": 1.605439963240204e-05, + "loss": 0.2788, + "step": 15782 + }, + { + "epoch": 0.29303104450789574, + "grad_norm": 0.4412107765674591, + "learning_rate": 1.6053471191816918e-05, + "loss": 0.4538, + "step": 15784 + }, + { + "epoch": 0.29306817464531437, + "grad_norm": 1.3458364009857178, + "learning_rate": 1.6052542668862187e-05, + "loss": 0.2097, + "step": 15786 + }, + { + "epoch": 0.293105304782733, + "grad_norm": 0.5330139994621277, + "learning_rate": 1.605161406355048e-05, + "loss": 0.2756, + "step": 15788 + }, + { + "epoch": 0.29314243492015163, + "grad_norm": 0.322184294462204, + "learning_rate": 1.605068537589443e-05, + "loss": 0.3241, + "step": 15790 + }, + { + "epoch": 0.29317956505757026, + "grad_norm": 0.23872196674346924, + "learning_rate": 1.6049756605906675e-05, + "loss": 0.339, + "step": 15792 + }, + { + "epoch": 0.29321669519498894, + "grad_norm": 0.4767598509788513, + "learning_rate": 1.6048827753599855e-05, + "loss": 0.2008, + "step": 15794 + }, + { + "epoch": 0.29325382533240757, + "grad_norm": 0.45141828060150146, + "learning_rate": 1.6047898818986613e-05, + "loss": 0.3625, + "step": 15796 + }, + { + "epoch": 0.2932909554698262, + "grad_norm": 0.3824867904186249, + "learning_rate": 1.6046969802079576e-05, + "loss": 0.3142, + "step": 15798 + }, + { + "epoch": 0.2933280856072448, + "grad_norm": 0.4334510266780853, + "learning_rate": 1.6046040702891397e-05, + "loss": 0.4441, + "step": 15800 + }, + { + "epoch": 0.29336521574466345, + "grad_norm": 0.27167803049087524, + "learning_rate": 1.6045111521434712e-05, + "loss": 0.4491, + "step": 15802 + }, + { + "epoch": 0.2934023458820821, + "grad_norm": 0.28596147894859314, + "learning_rate": 1.6044182257722167e-05, + "loss": 0.4787, + "step": 15804 + }, + { + "epoch": 0.29343947601950077, + "grad_norm": 0.5326327681541443, + "learning_rate": 1.6043252911766405e-05, + "loss": 0.3881, + "step": 15806 + }, + { + "epoch": 0.2934766061569194, + "grad_norm": 0.4218844473361969, + "learning_rate": 1.6042323483580075e-05, + "loss": 0.1486, + "step": 15808 + }, + { + "epoch": 0.293513736294338, + "grad_norm": 0.6821032762527466, + "learning_rate": 1.604139397317582e-05, + "loss": 0.1409, + "step": 15810 + }, + { + "epoch": 0.29355086643175665, + "grad_norm": 0.28542548418045044, + "learning_rate": 1.604046438056629e-05, + "loss": 0.4346, + "step": 15812 + }, + { + "epoch": 0.2935879965691753, + "grad_norm": 0.43990278244018555, + "learning_rate": 1.603953470576413e-05, + "loss": 0.3321, + "step": 15814 + }, + { + "epoch": 0.29362512670659396, + "grad_norm": 0.38376545906066895, + "learning_rate": 1.6038604948781994e-05, + "loss": 0.409, + "step": 15816 + }, + { + "epoch": 0.2936622568440126, + "grad_norm": 0.4855653643608093, + "learning_rate": 1.6037675109632537e-05, + "loss": 0.2908, + "step": 15818 + }, + { + "epoch": 0.2936993869814312, + "grad_norm": 0.3898667097091675, + "learning_rate": 1.6036745188328402e-05, + "loss": 0.4082, + "step": 15820 + }, + { + "epoch": 0.29373651711884985, + "grad_norm": 0.2707900106906891, + "learning_rate": 1.6035815184882248e-05, + "loss": 0.3521, + "step": 15822 + }, + { + "epoch": 0.2937736472562685, + "grad_norm": 0.3762018084526062, + "learning_rate": 1.603488509930673e-05, + "loss": 0.2577, + "step": 15824 + }, + { + "epoch": 0.29381077739368716, + "grad_norm": 0.2909925878047943, + "learning_rate": 1.60339549316145e-05, + "loss": 0.2367, + "step": 15826 + }, + { + "epoch": 0.2938479075311058, + "grad_norm": 0.41317829489707947, + "learning_rate": 1.603302468181822e-05, + "loss": 0.2127, + "step": 15828 + }, + { + "epoch": 0.2938850376685244, + "grad_norm": 0.26684844493865967, + "learning_rate": 1.6032094349930543e-05, + "loss": 0.3748, + "step": 15830 + }, + { + "epoch": 0.29392216780594305, + "grad_norm": 0.6406076550483704, + "learning_rate": 1.6031163935964128e-05, + "loss": 0.262, + "step": 15832 + }, + { + "epoch": 0.2939592979433617, + "grad_norm": 0.4238293170928955, + "learning_rate": 1.6030233439931645e-05, + "loss": 0.353, + "step": 15834 + }, + { + "epoch": 0.2939964280807803, + "grad_norm": 0.3279445171356201, + "learning_rate": 1.6029302861845742e-05, + "loss": 0.1309, + "step": 15836 + }, + { + "epoch": 0.294033558218199, + "grad_norm": 0.46401703357696533, + "learning_rate": 1.6028372201719088e-05, + "loss": 0.3902, + "step": 15838 + }, + { + "epoch": 0.2940706883556176, + "grad_norm": 0.5426220893859863, + "learning_rate": 1.6027441459564347e-05, + "loss": 0.4946, + "step": 15840 + }, + { + "epoch": 0.29410781849303624, + "grad_norm": 0.40311840176582336, + "learning_rate": 1.6026510635394176e-05, + "loss": 0.3499, + "step": 15842 + }, + { + "epoch": 0.29414494863045487, + "grad_norm": 0.4529612362384796, + "learning_rate": 1.6025579729221253e-05, + "loss": 0.2847, + "step": 15844 + }, + { + "epoch": 0.2941820787678735, + "grad_norm": 0.43816739320755005, + "learning_rate": 1.6024648741058236e-05, + "loss": 0.238, + "step": 15846 + }, + { + "epoch": 0.2942192089052922, + "grad_norm": 0.33676931262016296, + "learning_rate": 1.6023717670917793e-05, + "loss": 0.305, + "step": 15848 + }, + { + "epoch": 0.2942563390427108, + "grad_norm": 0.40976738929748535, + "learning_rate": 1.60227865188126e-05, + "loss": 0.371, + "step": 15850 + }, + { + "epoch": 0.29429346918012944, + "grad_norm": 0.22584185004234314, + "learning_rate": 1.602185528475532e-05, + "loss": 0.3181, + "step": 15852 + }, + { + "epoch": 0.29433059931754807, + "grad_norm": 0.5257530212402344, + "learning_rate": 1.602092396875863e-05, + "loss": 0.3127, + "step": 15854 + }, + { + "epoch": 0.2943677294549667, + "grad_norm": 0.32537636160850525, + "learning_rate": 1.6019992570835196e-05, + "loss": 0.3968, + "step": 15856 + }, + { + "epoch": 0.2944048595923853, + "grad_norm": 0.30383968353271484, + "learning_rate": 1.60190610909977e-05, + "loss": 0.2705, + "step": 15858 + }, + { + "epoch": 0.294441989729804, + "grad_norm": 0.29775476455688477, + "learning_rate": 1.6018129529258807e-05, + "loss": 0.2262, + "step": 15860 + }, + { + "epoch": 0.29447911986722264, + "grad_norm": 0.48876088857650757, + "learning_rate": 1.60171978856312e-05, + "loss": 0.1981, + "step": 15862 + }, + { + "epoch": 0.29451625000464127, + "grad_norm": 0.30494779348373413, + "learning_rate": 1.601626616012755e-05, + "loss": 0.1458, + "step": 15864 + }, + { + "epoch": 0.2945533801420599, + "grad_norm": 0.34384116530418396, + "learning_rate": 1.6015334352760544e-05, + "loss": 0.4137, + "step": 15866 + }, + { + "epoch": 0.2945905102794785, + "grad_norm": 0.33940520882606506, + "learning_rate": 1.6014402463542852e-05, + "loss": 0.3839, + "step": 15868 + }, + { + "epoch": 0.2946276404168972, + "grad_norm": 0.36387044191360474, + "learning_rate": 1.6013470492487158e-05, + "loss": 0.4268, + "step": 15870 + }, + { + "epoch": 0.29466477055431584, + "grad_norm": 0.3652462065219879, + "learning_rate": 1.6012538439606142e-05, + "loss": 0.2652, + "step": 15872 + }, + { + "epoch": 0.29470190069173446, + "grad_norm": 0.25367286801338196, + "learning_rate": 1.601160630491249e-05, + "loss": 0.0947, + "step": 15874 + }, + { + "epoch": 0.2947390308291531, + "grad_norm": 0.275566965341568, + "learning_rate": 1.601067408841888e-05, + "loss": 0.5275, + "step": 15876 + }, + { + "epoch": 0.2947761609665717, + "grad_norm": 0.3932128846645355, + "learning_rate": 1.6009741790138004e-05, + "loss": 0.3335, + "step": 15878 + }, + { + "epoch": 0.29481329110399035, + "grad_norm": 0.19389225542545319, + "learning_rate": 1.6008809410082537e-05, + "loss": 0.2799, + "step": 15880 + }, + { + "epoch": 0.29485042124140903, + "grad_norm": 0.7034175395965576, + "learning_rate": 1.6007876948265176e-05, + "loss": 0.385, + "step": 15882 + }, + { + "epoch": 0.29488755137882766, + "grad_norm": 0.39741891622543335, + "learning_rate": 1.600694440469861e-05, + "loss": 0.4252, + "step": 15884 + }, + { + "epoch": 0.2949246815162463, + "grad_norm": 0.25038355588912964, + "learning_rate": 1.6006011779395515e-05, + "loss": 0.2614, + "step": 15886 + }, + { + "epoch": 0.2949618116536649, + "grad_norm": 0.41598060727119446, + "learning_rate": 1.6005079072368594e-05, + "loss": 0.1831, + "step": 15888 + }, + { + "epoch": 0.29499894179108355, + "grad_norm": 0.5460042953491211, + "learning_rate": 1.6004146283630535e-05, + "loss": 0.4931, + "step": 15890 + }, + { + "epoch": 0.29503607192850223, + "grad_norm": 0.5921924114227295, + "learning_rate": 1.600321341319403e-05, + "loss": 0.2979, + "step": 15892 + }, + { + "epoch": 0.29507320206592086, + "grad_norm": 0.30093103647232056, + "learning_rate": 1.600228046107177e-05, + "loss": 0.3029, + "step": 15894 + }, + { + "epoch": 0.2951103322033395, + "grad_norm": 0.3172679543495178, + "learning_rate": 1.6001347427276454e-05, + "loss": 0.3465, + "step": 15896 + }, + { + "epoch": 0.2951474623407581, + "grad_norm": 0.49164605140686035, + "learning_rate": 1.6000414311820775e-05, + "loss": 0.3912, + "step": 15898 + }, + { + "epoch": 0.29518459247817674, + "grad_norm": 0.5129261612892151, + "learning_rate": 1.599948111471743e-05, + "loss": 0.2846, + "step": 15900 + }, + { + "epoch": 0.2952217226155954, + "grad_norm": 0.40990951657295227, + "learning_rate": 1.5998547835979114e-05, + "loss": 0.4367, + "step": 15902 + }, + { + "epoch": 0.29525885275301406, + "grad_norm": 0.3677123188972473, + "learning_rate": 1.5997614475618533e-05, + "loss": 0.3194, + "step": 15904 + }, + { + "epoch": 0.2952959828904327, + "grad_norm": 0.3080374002456665, + "learning_rate": 1.5996681033648384e-05, + "loss": 0.2761, + "step": 15906 + }, + { + "epoch": 0.2953331130278513, + "grad_norm": 0.32889053225517273, + "learning_rate": 1.599574751008137e-05, + "loss": 0.137, + "step": 15908 + }, + { + "epoch": 0.29537024316526994, + "grad_norm": 0.3005254864692688, + "learning_rate": 1.5994813904930187e-05, + "loss": 0.3357, + "step": 15910 + }, + { + "epoch": 0.29540737330268857, + "grad_norm": 0.39450713992118835, + "learning_rate": 1.599388021820755e-05, + "loss": 0.1212, + "step": 15912 + }, + { + "epoch": 0.29544450344010725, + "grad_norm": 0.391381174325943, + "learning_rate": 1.599294644992615e-05, + "loss": 0.1835, + "step": 15914 + }, + { + "epoch": 0.2954816335775259, + "grad_norm": 0.41406121850013733, + "learning_rate": 1.5992012600098704e-05, + "loss": 0.265, + "step": 15916 + }, + { + "epoch": 0.2955187637149445, + "grad_norm": 0.4220198690891266, + "learning_rate": 1.5991078668737916e-05, + "loss": 0.26, + "step": 15918 + }, + { + "epoch": 0.29555589385236314, + "grad_norm": 0.3121607303619385, + "learning_rate": 1.5990144655856488e-05, + "loss": 0.3302, + "step": 15920 + }, + { + "epoch": 0.29559302398978177, + "grad_norm": 0.35688114166259766, + "learning_rate": 1.598921056146714e-05, + "loss": 0.2773, + "step": 15922 + }, + { + "epoch": 0.29563015412720045, + "grad_norm": 0.5068998336791992, + "learning_rate": 1.598827638558257e-05, + "loss": 0.3802, + "step": 15924 + }, + { + "epoch": 0.2956672842646191, + "grad_norm": 0.24365411698818207, + "learning_rate": 1.5987342128215496e-05, + "loss": 0.385, + "step": 15926 + }, + { + "epoch": 0.2957044144020377, + "grad_norm": 0.33499500155448914, + "learning_rate": 1.5986407789378632e-05, + "loss": 0.308, + "step": 15928 + }, + { + "epoch": 0.29574154453945634, + "grad_norm": 0.3394600749015808, + "learning_rate": 1.598547336908469e-05, + "loss": 0.3867, + "step": 15930 + }, + { + "epoch": 0.29577867467687496, + "grad_norm": 0.32194146513938904, + "learning_rate": 1.5984538867346383e-05, + "loss": 0.3396, + "step": 15932 + }, + { + "epoch": 0.2958158048142936, + "grad_norm": 0.6228675246238708, + "learning_rate": 1.5983604284176425e-05, + "loss": 0.3322, + "step": 15934 + }, + { + "epoch": 0.2958529349517123, + "grad_norm": 0.37650227546691895, + "learning_rate": 1.598266961958754e-05, + "loss": 0.2935, + "step": 15936 + }, + { + "epoch": 0.2958900650891309, + "grad_norm": 0.2601744532585144, + "learning_rate": 1.5981734873592437e-05, + "loss": 0.3826, + "step": 15938 + }, + { + "epoch": 0.29592719522654953, + "grad_norm": 0.5472261905670166, + "learning_rate": 1.5980800046203846e-05, + "loss": 0.4356, + "step": 15940 + }, + { + "epoch": 0.29596432536396816, + "grad_norm": 0.2757434546947479, + "learning_rate": 1.5979865137434477e-05, + "loss": 0.2561, + "step": 15942 + }, + { + "epoch": 0.2960014555013868, + "grad_norm": 0.3484655022621155, + "learning_rate": 1.5978930147297057e-05, + "loss": 0.3614, + "step": 15944 + }, + { + "epoch": 0.2960385856388055, + "grad_norm": 0.2160046100616455, + "learning_rate": 1.5977995075804304e-05, + "loss": 0.1595, + "step": 15946 + }, + { + "epoch": 0.2960757157762241, + "grad_norm": 0.34649547934532166, + "learning_rate": 1.5977059922968944e-05, + "loss": 0.1787, + "step": 15948 + }, + { + "epoch": 0.29611284591364273, + "grad_norm": 0.3408724367618561, + "learning_rate": 1.5976124688803706e-05, + "loss": 0.4858, + "step": 15950 + }, + { + "epoch": 0.29614997605106136, + "grad_norm": 0.2644693851470947, + "learning_rate": 1.5975189373321307e-05, + "loss": 0.1441, + "step": 15952 + }, + { + "epoch": 0.29618710618848, + "grad_norm": 0.30196136236190796, + "learning_rate": 1.597425397653448e-05, + "loss": 0.2903, + "step": 15954 + }, + { + "epoch": 0.2962242363258986, + "grad_norm": 0.362359881401062, + "learning_rate": 1.5973318498455953e-05, + "loss": 0.3668, + "step": 15956 + }, + { + "epoch": 0.2962613664633173, + "grad_norm": 0.4532642662525177, + "learning_rate": 1.597238293909845e-05, + "loss": 0.3152, + "step": 15958 + }, + { + "epoch": 0.29629849660073593, + "grad_norm": 0.5208582878112793, + "learning_rate": 1.5971447298474708e-05, + "loss": 0.3827, + "step": 15960 + }, + { + "epoch": 0.29633562673815456, + "grad_norm": 0.3784242868423462, + "learning_rate": 1.5970511576597454e-05, + "loss": 0.2326, + "step": 15962 + }, + { + "epoch": 0.2963727568755732, + "grad_norm": 0.3939746022224426, + "learning_rate": 1.596957577347942e-05, + "loss": 0.2906, + "step": 15964 + }, + { + "epoch": 0.2964098870129918, + "grad_norm": 0.2783624827861786, + "learning_rate": 1.5968639889133343e-05, + "loss": 0.4099, + "step": 15966 + }, + { + "epoch": 0.2964470171504105, + "grad_norm": 0.32852616906166077, + "learning_rate": 1.596770392357195e-05, + "loss": 0.365, + "step": 15968 + }, + { + "epoch": 0.2964841472878291, + "grad_norm": 0.5242838859558105, + "learning_rate": 1.5966767876807986e-05, + "loss": 0.3891, + "step": 15970 + }, + { + "epoch": 0.29652127742524775, + "grad_norm": 0.3619416356086731, + "learning_rate": 1.5965831748854185e-05, + "loss": 0.2109, + "step": 15972 + }, + { + "epoch": 0.2965584075626664, + "grad_norm": 0.39686986804008484, + "learning_rate": 1.596489553972328e-05, + "loss": 0.3167, + "step": 15974 + }, + { + "epoch": 0.296595537700085, + "grad_norm": 0.4223601520061493, + "learning_rate": 1.5963959249428017e-05, + "loss": 0.2763, + "step": 15976 + }, + { + "epoch": 0.2966326678375037, + "grad_norm": 0.5149276852607727, + "learning_rate": 1.5963022877981132e-05, + "loss": 0.5316, + "step": 15978 + }, + { + "epoch": 0.2966697979749223, + "grad_norm": 0.40934059023857117, + "learning_rate": 1.5962086425395368e-05, + "loss": 0.3323, + "step": 15980 + }, + { + "epoch": 0.29670692811234095, + "grad_norm": 0.5370131134986877, + "learning_rate": 1.5961149891683462e-05, + "loss": 0.2522, + "step": 15982 + }, + { + "epoch": 0.2967440582497596, + "grad_norm": 0.3608935475349426, + "learning_rate": 1.5960213276858163e-05, + "loss": 0.4717, + "step": 15984 + }, + { + "epoch": 0.2967811883871782, + "grad_norm": 1.8925145864486694, + "learning_rate": 1.5959276580932216e-05, + "loss": 0.3949, + "step": 15986 + }, + { + "epoch": 0.29681831852459684, + "grad_norm": 0.3202454149723053, + "learning_rate": 1.5958339803918366e-05, + "loss": 0.4012, + "step": 15988 + }, + { + "epoch": 0.2968554486620155, + "grad_norm": 0.3580626845359802, + "learning_rate": 1.5957402945829355e-05, + "loss": 0.2624, + "step": 15990 + }, + { + "epoch": 0.29689257879943415, + "grad_norm": 0.3558816909790039, + "learning_rate": 1.595646600667794e-05, + "loss": 0.2249, + "step": 15992 + }, + { + "epoch": 0.2969297089368528, + "grad_norm": 0.38526681065559387, + "learning_rate": 1.5955528986476857e-05, + "loss": 0.3184, + "step": 15994 + }, + { + "epoch": 0.2969668390742714, + "grad_norm": 0.37881582975387573, + "learning_rate": 1.5954591885238866e-05, + "loss": 0.3373, + "step": 15996 + }, + { + "epoch": 0.29700396921169003, + "grad_norm": 0.27788904309272766, + "learning_rate": 1.5953654702976716e-05, + "loss": 0.2519, + "step": 15998 + }, + { + "epoch": 0.2970410993491087, + "grad_norm": 0.39048248529434204, + "learning_rate": 1.5952717439703162e-05, + "loss": 0.3716, + "step": 16000 + }, + { + "epoch": 0.29707822948652735, + "grad_norm": 0.3973650336265564, + "learning_rate": 1.595178009543095e-05, + "loss": 0.2755, + "step": 16002 + }, + { + "epoch": 0.297115359623946, + "grad_norm": 0.281314879655838, + "learning_rate": 1.5950842670172842e-05, + "loss": 0.3085, + "step": 16004 + }, + { + "epoch": 0.2971524897613646, + "grad_norm": 0.6539525985717773, + "learning_rate": 1.5949905163941585e-05, + "loss": 0.179, + "step": 16006 + }, + { + "epoch": 0.29718961989878323, + "grad_norm": 0.4613947570323944, + "learning_rate": 1.5948967576749944e-05, + "loss": 0.4279, + "step": 16008 + }, + { + "epoch": 0.29722675003620186, + "grad_norm": 0.2610434889793396, + "learning_rate": 1.5948029908610677e-05, + "loss": 0.3779, + "step": 16010 + }, + { + "epoch": 0.29726388017362054, + "grad_norm": 0.3161751329898834, + "learning_rate": 1.594709215953653e-05, + "loss": 0.2509, + "step": 16012 + }, + { + "epoch": 0.29730101031103917, + "grad_norm": 0.3024757504463196, + "learning_rate": 1.594615432954028e-05, + "loss": 0.2228, + "step": 16014 + }, + { + "epoch": 0.2973381404484578, + "grad_norm": 0.37941187620162964, + "learning_rate": 1.5945216418634675e-05, + "loss": 0.4287, + "step": 16016 + }, + { + "epoch": 0.29737527058587643, + "grad_norm": 0.3800159692764282, + "learning_rate": 1.5944278426832488e-05, + "loss": 0.1879, + "step": 16018 + }, + { + "epoch": 0.29741240072329506, + "grad_norm": 0.3999338448047638, + "learning_rate": 1.5943340354146477e-05, + "loss": 0.2752, + "step": 16020 + }, + { + "epoch": 0.29744953086071374, + "grad_norm": 0.5095757842063904, + "learning_rate": 1.5942402200589403e-05, + "loss": 0.4635, + "step": 16022 + }, + { + "epoch": 0.29748666099813237, + "grad_norm": 0.31285855174064636, + "learning_rate": 1.5941463966174032e-05, + "loss": 0.1266, + "step": 16024 + }, + { + "epoch": 0.297523791135551, + "grad_norm": 0.359053373336792, + "learning_rate": 1.594052565091314e-05, + "loss": 0.4553, + "step": 16026 + }, + { + "epoch": 0.2975609212729696, + "grad_norm": 1.1174899339675903, + "learning_rate": 1.593958725481948e-05, + "loss": 0.3598, + "step": 16028 + }, + { + "epoch": 0.29759805141038825, + "grad_norm": 0.33334434032440186, + "learning_rate": 1.5938648777905838e-05, + "loss": 0.4043, + "step": 16030 + }, + { + "epoch": 0.2976351815478069, + "grad_norm": 0.22132715582847595, + "learning_rate": 1.5937710220184967e-05, + "loss": 0.4288, + "step": 16032 + }, + { + "epoch": 0.29767231168522557, + "grad_norm": 0.47149693965911865, + "learning_rate": 1.5936771581669648e-05, + "loss": 0.6128, + "step": 16034 + }, + { + "epoch": 0.2977094418226442, + "grad_norm": 0.34123945236206055, + "learning_rate": 1.593583286237265e-05, + "loss": 0.3032, + "step": 16036 + }, + { + "epoch": 0.2977465719600628, + "grad_norm": 0.2863142490386963, + "learning_rate": 1.5934894062306747e-05, + "loss": 0.4077, + "step": 16038 + }, + { + "epoch": 0.29778370209748145, + "grad_norm": 0.46670910716056824, + "learning_rate": 1.5933955181484714e-05, + "loss": 0.4023, + "step": 16040 + }, + { + "epoch": 0.2978208322349001, + "grad_norm": 0.3590395748615265, + "learning_rate": 1.5933016219919323e-05, + "loss": 0.4234, + "step": 16042 + }, + { + "epoch": 0.29785796237231876, + "grad_norm": 0.3236614167690277, + "learning_rate": 1.5932077177623354e-05, + "loss": 0.3969, + "step": 16044 + }, + { + "epoch": 0.2978950925097374, + "grad_norm": 0.27161961793899536, + "learning_rate": 1.5931138054609584e-05, + "loss": 0.3299, + "step": 16046 + }, + { + "epoch": 0.297932222647156, + "grad_norm": 0.32223424315452576, + "learning_rate": 1.593019885089079e-05, + "loss": 0.4028, + "step": 16048 + }, + { + "epoch": 0.29796935278457465, + "grad_norm": 0.23701806366443634, + "learning_rate": 1.592925956647975e-05, + "loss": 0.2521, + "step": 16050 + }, + { + "epoch": 0.2980064829219933, + "grad_norm": 0.3584916591644287, + "learning_rate": 1.592832020138925e-05, + "loss": 0.2643, + "step": 16052 + }, + { + "epoch": 0.29804361305941196, + "grad_norm": 0.3109648823738098, + "learning_rate": 1.592738075563207e-05, + "loss": 0.207, + "step": 16054 + }, + { + "epoch": 0.2980807431968306, + "grad_norm": 0.5483985543251038, + "learning_rate": 1.592644122922099e-05, + "loss": 0.3383, + "step": 16056 + }, + { + "epoch": 0.2981178733342492, + "grad_norm": 0.3502201735973358, + "learning_rate": 1.5925501622168798e-05, + "loss": 0.4377, + "step": 16058 + }, + { + "epoch": 0.29815500347166785, + "grad_norm": 0.39416614174842834, + "learning_rate": 1.5924561934488276e-05, + "loss": 0.4753, + "step": 16060 + }, + { + "epoch": 0.2981921336090865, + "grad_norm": 0.3235785961151123, + "learning_rate": 1.592362216619222e-05, + "loss": 0.492, + "step": 16062 + }, + { + "epoch": 0.2982292637465051, + "grad_norm": 0.30711907148361206, + "learning_rate": 1.5922682317293402e-05, + "loss": 0.5145, + "step": 16064 + }, + { + "epoch": 0.2982663938839238, + "grad_norm": 0.2883948087692261, + "learning_rate": 1.592174238780462e-05, + "loss": 0.4891, + "step": 16066 + }, + { + "epoch": 0.2983035240213424, + "grad_norm": 0.2432112842798233, + "learning_rate": 1.5920802377738663e-05, + "loss": 0.3993, + "step": 16068 + }, + { + "epoch": 0.29834065415876104, + "grad_norm": 0.4724024832248688, + "learning_rate": 1.5919862287108315e-05, + "loss": 0.4229, + "step": 16070 + }, + { + "epoch": 0.29837778429617967, + "grad_norm": 0.2420969307422638, + "learning_rate": 1.591892211592638e-05, + "loss": 0.4508, + "step": 16072 + }, + { + "epoch": 0.2984149144335983, + "grad_norm": 0.3737691044807434, + "learning_rate": 1.5917981864205643e-05, + "loss": 0.1867, + "step": 16074 + }, + { + "epoch": 0.298452044571017, + "grad_norm": 0.849558413028717, + "learning_rate": 1.59170415319589e-05, + "loss": 0.2895, + "step": 16076 + }, + { + "epoch": 0.2984891747084356, + "grad_norm": 0.7339929342269897, + "learning_rate": 1.591610111919894e-05, + "loss": 0.3812, + "step": 16078 + }, + { + "epoch": 0.29852630484585424, + "grad_norm": 0.37790122628211975, + "learning_rate": 1.5915160625938568e-05, + "loss": 0.3223, + "step": 16080 + }, + { + "epoch": 0.29856343498327287, + "grad_norm": 0.25142228603363037, + "learning_rate": 1.5914220052190578e-05, + "loss": 0.1882, + "step": 16082 + }, + { + "epoch": 0.2986005651206915, + "grad_norm": 0.4242027699947357, + "learning_rate": 1.591327939796777e-05, + "loss": 0.2595, + "step": 16084 + }, + { + "epoch": 0.2986376952581101, + "grad_norm": 0.5102601051330566, + "learning_rate": 1.591233866328294e-05, + "loss": 0.4069, + "step": 16086 + }, + { + "epoch": 0.2986748253955288, + "grad_norm": 0.27951785922050476, + "learning_rate": 1.591139784814889e-05, + "loss": 0.3433, + "step": 16088 + }, + { + "epoch": 0.29871195553294744, + "grad_norm": 0.2540474236011505, + "learning_rate": 1.5910456952578422e-05, + "loss": 0.2422, + "step": 16090 + }, + { + "epoch": 0.29874908567036607, + "grad_norm": 0.4229940176010132, + "learning_rate": 1.590951597658434e-05, + "loss": 0.3142, + "step": 16092 + }, + { + "epoch": 0.2987862158077847, + "grad_norm": 0.28358596563339233, + "learning_rate": 1.5908574920179444e-05, + "loss": 0.2704, + "step": 16094 + }, + { + "epoch": 0.2988233459452033, + "grad_norm": 0.21484948694705963, + "learning_rate": 1.590763378337654e-05, + "loss": 0.2397, + "step": 16096 + }, + { + "epoch": 0.298860476082622, + "grad_norm": 0.8725574612617493, + "learning_rate": 1.5906692566188443e-05, + "loss": 0.403, + "step": 16098 + }, + { + "epoch": 0.29889760622004063, + "grad_norm": 0.25622326135635376, + "learning_rate": 1.590575126862795e-05, + "loss": 0.2329, + "step": 16100 + }, + { + "epoch": 0.29893473635745926, + "grad_norm": 0.38767457008361816, + "learning_rate": 1.5904809890707865e-05, + "loss": 0.3103, + "step": 16102 + }, + { + "epoch": 0.2989718664948779, + "grad_norm": 0.2989526391029358, + "learning_rate": 1.590386843244101e-05, + "loss": 0.4354, + "step": 16104 + }, + { + "epoch": 0.2990089966322965, + "grad_norm": 0.2956234812736511, + "learning_rate": 1.5902926893840192e-05, + "loss": 0.4601, + "step": 16106 + }, + { + "epoch": 0.29904612676971515, + "grad_norm": 0.3971045911312103, + "learning_rate": 1.5901985274918218e-05, + "loss": 0.4183, + "step": 16108 + }, + { + "epoch": 0.29908325690713383, + "grad_norm": 0.381245493888855, + "learning_rate": 1.59010435756879e-05, + "loss": 0.3589, + "step": 16110 + }, + { + "epoch": 0.29912038704455246, + "grad_norm": 1.8292866945266724, + "learning_rate": 1.590010179616206e-05, + "loss": 0.2834, + "step": 16112 + }, + { + "epoch": 0.2991575171819711, + "grad_norm": 0.19837979972362518, + "learning_rate": 1.5899159936353506e-05, + "loss": 0.2525, + "step": 16114 + }, + { + "epoch": 0.2991946473193897, + "grad_norm": 0.4586329758167267, + "learning_rate": 1.5898217996275053e-05, + "loss": 0.2402, + "step": 16116 + }, + { + "epoch": 0.29923177745680835, + "grad_norm": 0.42977920174598694, + "learning_rate": 1.589727597593952e-05, + "loss": 0.2989, + "step": 16118 + }, + { + "epoch": 0.29926890759422703, + "grad_norm": 0.4592747092247009, + "learning_rate": 1.5896333875359727e-05, + "loss": 0.2732, + "step": 16120 + }, + { + "epoch": 0.29930603773164566, + "grad_norm": 0.48158320784568787, + "learning_rate": 1.5895391694548493e-05, + "loss": 0.3802, + "step": 16122 + }, + { + "epoch": 0.2993431678690643, + "grad_norm": 0.3117944896221161, + "learning_rate": 1.5894449433518634e-05, + "loss": 0.2182, + "step": 16124 + }, + { + "epoch": 0.2993802980064829, + "grad_norm": 0.2854127287864685, + "learning_rate": 1.5893507092282974e-05, + "loss": 0.4905, + "step": 16126 + }, + { + "epoch": 0.29941742814390154, + "grad_norm": 0.38067081570625305, + "learning_rate": 1.589256467085434e-05, + "loss": 0.2131, + "step": 16128 + }, + { + "epoch": 0.2994545582813202, + "grad_norm": 0.30732300877571106, + "learning_rate": 1.5891622169245544e-05, + "loss": 0.4093, + "step": 16130 + }, + { + "epoch": 0.29949168841873886, + "grad_norm": 0.3696860074996948, + "learning_rate": 1.5890679587469424e-05, + "loss": 0.3965, + "step": 16132 + }, + { + "epoch": 0.2995288185561575, + "grad_norm": 0.5029286742210388, + "learning_rate": 1.5889736925538793e-05, + "loss": 0.2333, + "step": 16134 + }, + { + "epoch": 0.2995659486935761, + "grad_norm": 0.4822166860103607, + "learning_rate": 1.5888794183466492e-05, + "loss": 0.453, + "step": 16136 + }, + { + "epoch": 0.29960307883099474, + "grad_norm": 0.5186971426010132, + "learning_rate": 1.5887851361265336e-05, + "loss": 0.2685, + "step": 16138 + }, + { + "epoch": 0.29964020896841337, + "grad_norm": 0.5340501666069031, + "learning_rate": 1.588690845894816e-05, + "loss": 0.2185, + "step": 16140 + }, + { + "epoch": 0.29967733910583205, + "grad_norm": 0.3281538188457489, + "learning_rate": 1.5885965476527794e-05, + "loss": 0.3392, + "step": 16142 + }, + { + "epoch": 0.2997144692432507, + "grad_norm": 0.3609340786933899, + "learning_rate": 1.5885022414017068e-05, + "loss": 0.3292, + "step": 16144 + }, + { + "epoch": 0.2997515993806693, + "grad_norm": 0.3774430751800537, + "learning_rate": 1.5884079271428816e-05, + "loss": 0.2892, + "step": 16146 + }, + { + "epoch": 0.29978872951808794, + "grad_norm": 0.2638816237449646, + "learning_rate": 1.5883136048775866e-05, + "loss": 0.2368, + "step": 16148 + }, + { + "epoch": 0.29982585965550657, + "grad_norm": 0.4817085862159729, + "learning_rate": 1.5882192746071062e-05, + "loss": 0.2575, + "step": 16150 + }, + { + "epoch": 0.29986298979292525, + "grad_norm": 0.4615142047405243, + "learning_rate": 1.588124936332723e-05, + "loss": 0.1181, + "step": 16152 + }, + { + "epoch": 0.2999001199303439, + "grad_norm": 0.3382371664047241, + "learning_rate": 1.588030590055721e-05, + "loss": 0.1331, + "step": 16154 + }, + { + "epoch": 0.2999372500677625, + "grad_norm": 0.6113383173942566, + "learning_rate": 1.5879362357773844e-05, + "loss": 0.2208, + "step": 16156 + }, + { + "epoch": 0.29997438020518113, + "grad_norm": 0.5485912561416626, + "learning_rate": 1.5878418734989963e-05, + "loss": 0.417, + "step": 16158 + }, + { + "epoch": 0.30001151034259976, + "grad_norm": 0.6068719029426575, + "learning_rate": 1.5877475032218416e-05, + "loss": 0.2609, + "step": 16160 + }, + { + "epoch": 0.3000486404800184, + "grad_norm": 0.5113223195075989, + "learning_rate": 1.5876531249472034e-05, + "loss": 0.2562, + "step": 16162 + }, + { + "epoch": 0.3000857706174371, + "grad_norm": 0.28108909726142883, + "learning_rate": 1.5875587386763665e-05, + "loss": 0.2683, + "step": 16164 + }, + { + "epoch": 0.3001229007548557, + "grad_norm": 0.35743606090545654, + "learning_rate": 1.587464344410615e-05, + "loss": 0.1832, + "step": 16166 + }, + { + "epoch": 0.30016003089227433, + "grad_norm": 0.37228259444236755, + "learning_rate": 1.587369942151234e-05, + "loss": 0.3436, + "step": 16168 + }, + { + "epoch": 0.30019716102969296, + "grad_norm": 0.4015525281429291, + "learning_rate": 1.5872755318995066e-05, + "loss": 0.2902, + "step": 16170 + }, + { + "epoch": 0.3002342911671116, + "grad_norm": 0.43857961893081665, + "learning_rate": 1.587181113656719e-05, + "loss": 0.2085, + "step": 16172 + }, + { + "epoch": 0.3002714213045303, + "grad_norm": 0.3534403443336487, + "learning_rate": 1.5870866874241554e-05, + "loss": 0.3567, + "step": 16174 + }, + { + "epoch": 0.3003085514419489, + "grad_norm": 0.4879935085773468, + "learning_rate": 1.5869922532031e-05, + "loss": 0.2511, + "step": 16176 + }, + { + "epoch": 0.30034568157936753, + "grad_norm": 0.3121826946735382, + "learning_rate": 1.586897810994838e-05, + "loss": 0.2975, + "step": 16178 + }, + { + "epoch": 0.30038281171678616, + "grad_norm": 0.3071901798248291, + "learning_rate": 1.5868033608006553e-05, + "loss": 0.3155, + "step": 16180 + }, + { + "epoch": 0.3004199418542048, + "grad_norm": 0.3504277169704437, + "learning_rate": 1.5867089026218366e-05, + "loss": 0.2508, + "step": 16182 + }, + { + "epoch": 0.3004570719916234, + "grad_norm": 0.45316821336746216, + "learning_rate": 1.5866144364596666e-05, + "loss": 0.1407, + "step": 16184 + }, + { + "epoch": 0.3004942021290421, + "grad_norm": 0.5187512040138245, + "learning_rate": 1.586519962315432e-05, + "loss": 0.5181, + "step": 16186 + }, + { + "epoch": 0.3005313322664607, + "grad_norm": 1.0387310981750488, + "learning_rate": 1.5864254801904168e-05, + "loss": 0.2628, + "step": 16188 + }, + { + "epoch": 0.30056846240387936, + "grad_norm": 0.43464142084121704, + "learning_rate": 1.5863309900859078e-05, + "loss": 0.3263, + "step": 16190 + }, + { + "epoch": 0.300605592541298, + "grad_norm": 0.6859362125396729, + "learning_rate": 1.58623649200319e-05, + "loss": 0.3203, + "step": 16192 + }, + { + "epoch": 0.3006427226787166, + "grad_norm": 0.46165481209754944, + "learning_rate": 1.5861419859435497e-05, + "loss": 0.3308, + "step": 16194 + }, + { + "epoch": 0.3006798528161353, + "grad_norm": 0.446772038936615, + "learning_rate": 1.586047471908273e-05, + "loss": 0.362, + "step": 16196 + }, + { + "epoch": 0.3007169829535539, + "grad_norm": 0.2324133962392807, + "learning_rate": 1.585952949898645e-05, + "loss": 0.2184, + "step": 16198 + }, + { + "epoch": 0.30075411309097255, + "grad_norm": 0.9389732480049133, + "learning_rate": 1.585858419915953e-05, + "loss": 0.3131, + "step": 16200 + }, + { + "epoch": 0.3007912432283912, + "grad_norm": 0.48875007033348083, + "learning_rate": 1.5857638819614826e-05, + "loss": 0.318, + "step": 16202 + }, + { + "epoch": 0.3008283733658098, + "grad_norm": 0.47570088505744934, + "learning_rate": 1.58566933603652e-05, + "loss": 0.2984, + "step": 16204 + }, + { + "epoch": 0.3008655035032285, + "grad_norm": 0.2943958342075348, + "learning_rate": 1.5855747821423523e-05, + "loss": 0.3832, + "step": 16206 + }, + { + "epoch": 0.3009026336406471, + "grad_norm": 0.25589779019355774, + "learning_rate": 1.5854802202802657e-05, + "loss": 0.151, + "step": 16208 + }, + { + "epoch": 0.30093976377806575, + "grad_norm": 0.4360898733139038, + "learning_rate": 1.5853856504515473e-05, + "loss": 0.165, + "step": 16210 + }, + { + "epoch": 0.3009768939154844, + "grad_norm": 0.4053947627544403, + "learning_rate": 1.5852910726574836e-05, + "loss": 0.3924, + "step": 16212 + }, + { + "epoch": 0.301014024052903, + "grad_norm": 1.1675968170166016, + "learning_rate": 1.5851964868993613e-05, + "loss": 0.3263, + "step": 16214 + }, + { + "epoch": 0.30105115419032163, + "grad_norm": 0.21326427161693573, + "learning_rate": 1.5851018931784676e-05, + "loss": 0.2199, + "step": 16216 + }, + { + "epoch": 0.3010882843277403, + "grad_norm": 0.482308566570282, + "learning_rate": 1.5850072914960903e-05, + "loss": 0.261, + "step": 16218 + }, + { + "epoch": 0.30112541446515895, + "grad_norm": 0.3268890082836151, + "learning_rate": 1.5849126818535152e-05, + "loss": 0.142, + "step": 16220 + }, + { + "epoch": 0.3011625446025776, + "grad_norm": 0.4247482120990753, + "learning_rate": 1.584818064252031e-05, + "loss": 0.2335, + "step": 16222 + }, + { + "epoch": 0.3011996747399962, + "grad_norm": 0.417898952960968, + "learning_rate": 1.5847234386929247e-05, + "loss": 0.3243, + "step": 16224 + }, + { + "epoch": 0.30123680487741483, + "grad_norm": 0.3287702202796936, + "learning_rate": 1.584628805177484e-05, + "loss": 0.1683, + "step": 16226 + }, + { + "epoch": 0.3012739350148335, + "grad_norm": 0.38879936933517456, + "learning_rate": 1.584534163706996e-05, + "loss": 0.231, + "step": 16228 + }, + { + "epoch": 0.30131106515225214, + "grad_norm": 0.4904578626155853, + "learning_rate": 1.5844395142827492e-05, + "loss": 0.3104, + "step": 16230 + }, + { + "epoch": 0.3013481952896708, + "grad_norm": 0.30873847007751465, + "learning_rate": 1.584344856906031e-05, + "loss": 0.343, + "step": 16232 + }, + { + "epoch": 0.3013853254270894, + "grad_norm": 0.5864978432655334, + "learning_rate": 1.5842501915781298e-05, + "loss": 0.4186, + "step": 16234 + }, + { + "epoch": 0.30142245556450803, + "grad_norm": 0.34228113293647766, + "learning_rate": 1.5841555183003333e-05, + "loss": 0.2788, + "step": 16236 + }, + { + "epoch": 0.30145958570192666, + "grad_norm": 0.7428646683692932, + "learning_rate": 1.5840608370739305e-05, + "loss": 0.2834, + "step": 16238 + }, + { + "epoch": 0.30149671583934534, + "grad_norm": 0.3483157753944397, + "learning_rate": 1.5839661479002085e-05, + "loss": 0.1976, + "step": 16240 + }, + { + "epoch": 0.30153384597676397, + "grad_norm": 0.373030424118042, + "learning_rate": 1.583871450780457e-05, + "loss": 0.3706, + "step": 16242 + }, + { + "epoch": 0.3015709761141826, + "grad_norm": 0.3886907696723938, + "learning_rate": 1.5837767457159635e-05, + "loss": 0.1152, + "step": 16244 + }, + { + "epoch": 0.3016081062516012, + "grad_norm": 0.48107388615608215, + "learning_rate": 1.5836820327080172e-05, + "loss": 0.2688, + "step": 16246 + }, + { + "epoch": 0.30164523638901986, + "grad_norm": 0.4616096317768097, + "learning_rate": 1.5835873117579067e-05, + "loss": 0.1599, + "step": 16248 + }, + { + "epoch": 0.30168236652643854, + "grad_norm": 0.4929538071155548, + "learning_rate": 1.5834925828669214e-05, + "loss": 0.1403, + "step": 16250 + }, + { + "epoch": 0.30171949666385717, + "grad_norm": 0.4717556834220886, + "learning_rate": 1.5833978460363492e-05, + "loss": 0.2582, + "step": 16252 + }, + { + "epoch": 0.3017566268012758, + "grad_norm": 0.33128252625465393, + "learning_rate": 1.58330310126748e-05, + "loss": 0.184, + "step": 16254 + }, + { + "epoch": 0.3017937569386944, + "grad_norm": 0.6636273264884949, + "learning_rate": 1.583208348561603e-05, + "loss": 0.2302, + "step": 16256 + }, + { + "epoch": 0.30183088707611305, + "grad_norm": 0.34337711334228516, + "learning_rate": 1.583113587920007e-05, + "loss": 0.4143, + "step": 16258 + }, + { + "epoch": 0.3018680172135317, + "grad_norm": 0.3692743182182312, + "learning_rate": 1.583018819343982e-05, + "loss": 0.099, + "step": 16260 + }, + { + "epoch": 0.30190514735095036, + "grad_norm": 0.2798841893672943, + "learning_rate": 1.582924042834817e-05, + "loss": 0.2229, + "step": 16262 + }, + { + "epoch": 0.301942277488369, + "grad_norm": 0.39461827278137207, + "learning_rate": 1.582829258393802e-05, + "loss": 0.4144, + "step": 16264 + }, + { + "epoch": 0.3019794076257876, + "grad_norm": 0.38346654176712036, + "learning_rate": 1.5827344660222264e-05, + "loss": 0.4, + "step": 16266 + }, + { + "epoch": 0.30201653776320625, + "grad_norm": 0.4233570098876953, + "learning_rate": 1.5826396657213803e-05, + "loss": 0.2624, + "step": 16268 + }, + { + "epoch": 0.3020536679006249, + "grad_norm": 0.37801480293273926, + "learning_rate": 1.5825448574925535e-05, + "loss": 0.4319, + "step": 16270 + }, + { + "epoch": 0.30209079803804356, + "grad_norm": 0.5160408020019531, + "learning_rate": 1.582450041337036e-05, + "loss": 0.3099, + "step": 16272 + }, + { + "epoch": 0.3021279281754622, + "grad_norm": 0.4123377799987793, + "learning_rate": 1.582355217256118e-05, + "loss": 0.3165, + "step": 16274 + }, + { + "epoch": 0.3021650583128808, + "grad_norm": 0.31174302101135254, + "learning_rate": 1.5822603852510904e-05, + "loss": 0.3122, + "step": 16276 + }, + { + "epoch": 0.30220218845029945, + "grad_norm": 0.3060372769832611, + "learning_rate": 1.5821655453232427e-05, + "loss": 0.3226, + "step": 16278 + }, + { + "epoch": 0.3022393185877181, + "grad_norm": 0.5194147825241089, + "learning_rate": 1.5820706974738657e-05, + "loss": 0.4578, + "step": 16280 + }, + { + "epoch": 0.30227644872513676, + "grad_norm": 0.33621835708618164, + "learning_rate": 1.58197584170425e-05, + "loss": 0.4504, + "step": 16282 + }, + { + "epoch": 0.3023135788625554, + "grad_norm": 0.35131993889808655, + "learning_rate": 1.5818809780156863e-05, + "loss": 0.3386, + "step": 16284 + }, + { + "epoch": 0.302350708999974, + "grad_norm": 0.27675920724868774, + "learning_rate": 1.5817861064094654e-05, + "loss": 0.3684, + "step": 16286 + }, + { + "epoch": 0.30238783913739264, + "grad_norm": 0.42793887853622437, + "learning_rate": 1.5816912268868783e-05, + "loss": 0.2511, + "step": 16288 + }, + { + "epoch": 0.3024249692748113, + "grad_norm": 0.3586810231208801, + "learning_rate": 1.5815963394492162e-05, + "loss": 0.2102, + "step": 16290 + }, + { + "epoch": 0.3024620994122299, + "grad_norm": 0.26683351397514343, + "learning_rate": 1.5815014440977697e-05, + "loss": 0.3612, + "step": 16292 + }, + { + "epoch": 0.3024992295496486, + "grad_norm": 0.36184874176979065, + "learning_rate": 1.5814065408338303e-05, + "loss": 0.4777, + "step": 16294 + }, + { + "epoch": 0.3025363596870672, + "grad_norm": 0.39173153042793274, + "learning_rate": 1.5813116296586896e-05, + "loss": 0.2786, + "step": 16296 + }, + { + "epoch": 0.30257348982448584, + "grad_norm": 0.46049219369888306, + "learning_rate": 1.5812167105736387e-05, + "loss": 0.3843, + "step": 16298 + }, + { + "epoch": 0.30261061996190447, + "grad_norm": 0.3502812385559082, + "learning_rate": 1.5811217835799696e-05, + "loss": 0.2393, + "step": 16300 + }, + { + "epoch": 0.3026477500993231, + "grad_norm": 0.5710545778274536, + "learning_rate": 1.5810268486789732e-05, + "loss": 0.1627, + "step": 16302 + }, + { + "epoch": 0.3026848802367418, + "grad_norm": 0.5966254472732544, + "learning_rate": 1.580931905871942e-05, + "loss": 0.3489, + "step": 16304 + }, + { + "epoch": 0.3027220103741604, + "grad_norm": 0.2895124554634094, + "learning_rate": 1.5808369551601677e-05, + "loss": 0.3314, + "step": 16306 + }, + { + "epoch": 0.30275914051157904, + "grad_norm": 0.3414720892906189, + "learning_rate": 1.5807419965449423e-05, + "loss": 0.2939, + "step": 16308 + }, + { + "epoch": 0.30279627064899767, + "grad_norm": 0.2914572060108185, + "learning_rate": 1.580647030027558e-05, + "loss": 0.522, + "step": 16310 + }, + { + "epoch": 0.3028334007864163, + "grad_norm": 0.2794126272201538, + "learning_rate": 1.5805520556093062e-05, + "loss": 0.3364, + "step": 16312 + }, + { + "epoch": 0.3028705309238349, + "grad_norm": 0.5639039874076843, + "learning_rate": 1.5804570732914806e-05, + "loss": 0.2648, + "step": 16314 + }, + { + "epoch": 0.3029076610612536, + "grad_norm": 0.5402952432632446, + "learning_rate": 1.5803620830753726e-05, + "loss": 0.416, + "step": 16316 + }, + { + "epoch": 0.30294479119867224, + "grad_norm": 0.3365645110607147, + "learning_rate": 1.580267084962275e-05, + "loss": 0.2107, + "step": 16318 + }, + { + "epoch": 0.30298192133609086, + "grad_norm": 0.44152238965034485, + "learning_rate": 1.5801720789534805e-05, + "loss": 0.3615, + "step": 16320 + }, + { + "epoch": 0.3030190514735095, + "grad_norm": 0.48954296112060547, + "learning_rate": 1.580077065050282e-05, + "loss": 0.3276, + "step": 16322 + }, + { + "epoch": 0.3030561816109281, + "grad_norm": 0.30221888422966003, + "learning_rate": 1.5799820432539718e-05, + "loss": 0.2612, + "step": 16324 + }, + { + "epoch": 0.3030933117483468, + "grad_norm": 0.2957903742790222, + "learning_rate": 1.5798870135658435e-05, + "loss": 0.3312, + "step": 16326 + }, + { + "epoch": 0.30313044188576543, + "grad_norm": 0.3304097354412079, + "learning_rate": 1.57979197598719e-05, + "loss": 0.369, + "step": 16328 + }, + { + "epoch": 0.30316757202318406, + "grad_norm": 0.4203992784023285, + "learning_rate": 1.5796969305193042e-05, + "loss": 0.2827, + "step": 16330 + }, + { + "epoch": 0.3032047021606027, + "grad_norm": 0.5688360333442688, + "learning_rate": 1.5796018771634796e-05, + "loss": 0.2526, + "step": 16332 + }, + { + "epoch": 0.3032418322980213, + "grad_norm": 0.38786038756370544, + "learning_rate": 1.5795068159210095e-05, + "loss": 0.3511, + "step": 16334 + }, + { + "epoch": 0.30327896243543995, + "grad_norm": 0.3857662081718445, + "learning_rate": 1.5794117467931878e-05, + "loss": 0.1532, + "step": 16336 + }, + { + "epoch": 0.30331609257285863, + "grad_norm": 0.2677517235279083, + "learning_rate": 1.5793166697813074e-05, + "loss": 0.2753, + "step": 16338 + }, + { + "epoch": 0.30335322271027726, + "grad_norm": 0.4150560200214386, + "learning_rate": 1.5792215848866627e-05, + "loss": 0.4178, + "step": 16340 + }, + { + "epoch": 0.3033903528476959, + "grad_norm": 0.35351672768592834, + "learning_rate": 1.5791264921105467e-05, + "loss": 0.2274, + "step": 16342 + }, + { + "epoch": 0.3034274829851145, + "grad_norm": 0.5662781000137329, + "learning_rate": 1.5790313914542543e-05, + "loss": 0.3439, + "step": 16344 + }, + { + "epoch": 0.30346461312253314, + "grad_norm": 0.33523303270339966, + "learning_rate": 1.578936282919079e-05, + "loss": 0.2497, + "step": 16346 + }, + { + "epoch": 0.30350174325995183, + "grad_norm": 0.42839181423187256, + "learning_rate": 1.5788411665063148e-05, + "loss": 0.3044, + "step": 16348 + }, + { + "epoch": 0.30353887339737046, + "grad_norm": 0.44385963678359985, + "learning_rate": 1.578746042217256e-05, + "loss": 0.4338, + "step": 16350 + }, + { + "epoch": 0.3035760035347891, + "grad_norm": 0.41629549860954285, + "learning_rate": 1.5786509100531976e-05, + "loss": 0.1757, + "step": 16352 + }, + { + "epoch": 0.3036131336722077, + "grad_norm": 0.4542321562767029, + "learning_rate": 1.5785557700154333e-05, + "loss": 0.2535, + "step": 16354 + }, + { + "epoch": 0.30365026380962634, + "grad_norm": 0.3555143177509308, + "learning_rate": 1.5784606221052578e-05, + "loss": 0.3635, + "step": 16356 + }, + { + "epoch": 0.303687393947045, + "grad_norm": 0.5755445957183838, + "learning_rate": 1.578365466323966e-05, + "loss": 0.2618, + "step": 16358 + }, + { + "epoch": 0.30372452408446365, + "grad_norm": 0.4481756091117859, + "learning_rate": 1.578270302672853e-05, + "loss": 0.3444, + "step": 16360 + }, + { + "epoch": 0.3037616542218823, + "grad_norm": 0.3632296025753021, + "learning_rate": 1.5781751311532126e-05, + "loss": 0.2418, + "step": 16362 + }, + { + "epoch": 0.3037987843593009, + "grad_norm": 0.29582175612449646, + "learning_rate": 1.5780799517663414e-05, + "loss": 0.2556, + "step": 16364 + }, + { + "epoch": 0.30383591449671954, + "grad_norm": 0.5160143971443176, + "learning_rate": 1.577984764513533e-05, + "loss": 0.3634, + "step": 16366 + }, + { + "epoch": 0.30387304463413817, + "grad_norm": 0.6076065301895142, + "learning_rate": 1.577889569396083e-05, + "loss": 0.4017, + "step": 16368 + }, + { + "epoch": 0.30391017477155685, + "grad_norm": 0.4002518951892853, + "learning_rate": 1.5777943664152874e-05, + "loss": 0.3762, + "step": 16370 + }, + { + "epoch": 0.3039473049089755, + "grad_norm": 0.2705652415752411, + "learning_rate": 1.5776991555724412e-05, + "loss": 0.3525, + "step": 16372 + }, + { + "epoch": 0.3039844350463941, + "grad_norm": 0.29727306962013245, + "learning_rate": 1.5776039368688396e-05, + "loss": 0.4308, + "step": 16374 + }, + { + "epoch": 0.30402156518381274, + "grad_norm": 0.2842528820037842, + "learning_rate": 1.5775087103057786e-05, + "loss": 0.3998, + "step": 16376 + }, + { + "epoch": 0.30405869532123136, + "grad_norm": 0.3011460602283478, + "learning_rate": 1.577413475884554e-05, + "loss": 0.434, + "step": 16378 + }, + { + "epoch": 0.30409582545865005, + "grad_norm": 0.28696584701538086, + "learning_rate": 1.5773182336064615e-05, + "loss": 0.3244, + "step": 16380 + }, + { + "epoch": 0.3041329555960687, + "grad_norm": 0.35604965686798096, + "learning_rate": 1.5772229834727975e-05, + "loss": 0.3521, + "step": 16382 + }, + { + "epoch": 0.3041700857334873, + "grad_norm": 0.45947733521461487, + "learning_rate": 1.577127725484857e-05, + "loss": 0.2479, + "step": 16384 + }, + { + "epoch": 0.30420721587090593, + "grad_norm": 0.5401590466499329, + "learning_rate": 1.5770324596439374e-05, + "loss": 0.4565, + "step": 16386 + }, + { + "epoch": 0.30424434600832456, + "grad_norm": 0.45186594128608704, + "learning_rate": 1.576937185951334e-05, + "loss": 0.2811, + "step": 16388 + }, + { + "epoch": 0.3042814761457432, + "grad_norm": 0.3366377651691437, + "learning_rate": 1.576841904408344e-05, + "loss": 0.385, + "step": 16390 + }, + { + "epoch": 0.3043186062831619, + "grad_norm": 0.3845144212245941, + "learning_rate": 1.5767466150162632e-05, + "loss": 0.3989, + "step": 16392 + }, + { + "epoch": 0.3043557364205805, + "grad_norm": 0.2760485112667084, + "learning_rate": 1.5766513177763885e-05, + "loss": 0.3178, + "step": 16394 + }, + { + "epoch": 0.30439286655799913, + "grad_norm": 0.19351300597190857, + "learning_rate": 1.576556012690017e-05, + "loss": 0.2492, + "step": 16396 + }, + { + "epoch": 0.30442999669541776, + "grad_norm": 0.30799242854118347, + "learning_rate": 1.576460699758445e-05, + "loss": 0.3962, + "step": 16398 + }, + { + "epoch": 0.3044671268328364, + "grad_norm": 0.30644455552101135, + "learning_rate": 1.5763653789829693e-05, + "loss": 0.3345, + "step": 16400 + }, + { + "epoch": 0.30450425697025507, + "grad_norm": 0.34072408080101013, + "learning_rate": 1.5762700503648874e-05, + "loss": 0.3826, + "step": 16402 + }, + { + "epoch": 0.3045413871076737, + "grad_norm": 0.24619989097118378, + "learning_rate": 1.5761747139054967e-05, + "loss": 0.3262, + "step": 16404 + }, + { + "epoch": 0.30457851724509233, + "grad_norm": 0.38269028067588806, + "learning_rate": 1.5760793696060933e-05, + "loss": 0.3002, + "step": 16406 + }, + { + "epoch": 0.30461564738251096, + "grad_norm": 0.3844844102859497, + "learning_rate": 1.5759840174679756e-05, + "loss": 0.1433, + "step": 16408 + }, + { + "epoch": 0.3046527775199296, + "grad_norm": 0.48180678486824036, + "learning_rate": 1.5758886574924407e-05, + "loss": 0.218, + "step": 16410 + }, + { + "epoch": 0.3046899076573482, + "grad_norm": 0.37486982345581055, + "learning_rate": 1.575793289680786e-05, + "loss": 0.3982, + "step": 16412 + }, + { + "epoch": 0.3047270377947669, + "grad_norm": 0.580539345741272, + "learning_rate": 1.5756979140343094e-05, + "loss": 0.3191, + "step": 16414 + }, + { + "epoch": 0.3047641679321855, + "grad_norm": 0.37441331148147583, + "learning_rate": 1.5756025305543085e-05, + "loss": 0.3382, + "step": 16416 + }, + { + "epoch": 0.30480129806960415, + "grad_norm": 0.308307409286499, + "learning_rate": 1.575507139242081e-05, + "loss": 0.5857, + "step": 16418 + }, + { + "epoch": 0.3048384282070228, + "grad_norm": 0.27833014726638794, + "learning_rate": 1.575411740098926e-05, + "loss": 0.1347, + "step": 16420 + }, + { + "epoch": 0.3048755583444414, + "grad_norm": 0.25873076915740967, + "learning_rate": 1.57531633312614e-05, + "loss": 0.2473, + "step": 16422 + }, + { + "epoch": 0.3049126884818601, + "grad_norm": 0.3725907802581787, + "learning_rate": 1.5752209183250228e-05, + "loss": 0.4214, + "step": 16424 + }, + { + "epoch": 0.3049498186192787, + "grad_norm": 0.47309261560440063, + "learning_rate": 1.575125495696871e-05, + "loss": 0.2003, + "step": 16426 + }, + { + "epoch": 0.30498694875669735, + "grad_norm": 0.2646389603614807, + "learning_rate": 1.5750300652429846e-05, + "loss": 0.4743, + "step": 16428 + }, + { + "epoch": 0.305024078894116, + "grad_norm": 0.5863294005393982, + "learning_rate": 1.574934626964661e-05, + "loss": 0.4601, + "step": 16430 + }, + { + "epoch": 0.3050612090315346, + "grad_norm": 0.29395779967308044, + "learning_rate": 1.5748391808631992e-05, + "loss": 0.1264, + "step": 16432 + }, + { + "epoch": 0.3050983391689533, + "grad_norm": 0.2814014256000519, + "learning_rate": 1.5747437269398984e-05, + "loss": 0.548, + "step": 16434 + }, + { + "epoch": 0.3051354693063719, + "grad_norm": 0.2762012183666229, + "learning_rate": 1.5746482651960565e-05, + "loss": 0.2469, + "step": 16436 + }, + { + "epoch": 0.30517259944379055, + "grad_norm": 0.3044281601905823, + "learning_rate": 1.5745527956329736e-05, + "loss": 0.3498, + "step": 16438 + }, + { + "epoch": 0.3052097295812092, + "grad_norm": 0.209794819355011, + "learning_rate": 1.5744573182519477e-05, + "loss": 0.3118, + "step": 16440 + }, + { + "epoch": 0.3052468597186278, + "grad_norm": 0.5362929701805115, + "learning_rate": 1.574361833054279e-05, + "loss": 0.3381, + "step": 16442 + }, + { + "epoch": 0.30528398985604643, + "grad_norm": 0.6461273431777954, + "learning_rate": 1.5742663400412654e-05, + "loss": 0.3473, + "step": 16444 + }, + { + "epoch": 0.3053211199934651, + "grad_norm": 0.32658568024635315, + "learning_rate": 1.5741708392142072e-05, + "loss": 0.407, + "step": 16446 + }, + { + "epoch": 0.30535825013088375, + "grad_norm": 0.2680683732032776, + "learning_rate": 1.574075330574404e-05, + "loss": 0.1845, + "step": 16448 + }, + { + "epoch": 0.3053953802683024, + "grad_norm": 0.3414680063724518, + "learning_rate": 1.573979814123155e-05, + "loss": 0.3635, + "step": 16450 + }, + { + "epoch": 0.305432510405721, + "grad_norm": 0.3205528259277344, + "learning_rate": 1.57388428986176e-05, + "loss": 0.5048, + "step": 16452 + }, + { + "epoch": 0.30546964054313963, + "grad_norm": 0.42299753427505493, + "learning_rate": 1.5737887577915186e-05, + "loss": 0.2803, + "step": 16454 + }, + { + "epoch": 0.3055067706805583, + "grad_norm": 0.4396992027759552, + "learning_rate": 1.573693217913731e-05, + "loss": 0.3947, + "step": 16456 + }, + { + "epoch": 0.30554390081797694, + "grad_norm": 0.2916158437728882, + "learning_rate": 1.5735976702296972e-05, + "loss": 0.1199, + "step": 16458 + }, + { + "epoch": 0.30558103095539557, + "grad_norm": 0.36378878355026245, + "learning_rate": 1.5735021147407174e-05, + "loss": 0.3627, + "step": 16460 + }, + { + "epoch": 0.3056181610928142, + "grad_norm": 0.2865287661552429, + "learning_rate": 1.573406551448091e-05, + "loss": 0.2512, + "step": 16462 + }, + { + "epoch": 0.30565529123023283, + "grad_norm": 0.3946283161640167, + "learning_rate": 1.5733109803531197e-05, + "loss": 0.5993, + "step": 16464 + }, + { + "epoch": 0.30569242136765146, + "grad_norm": 0.38869160413742065, + "learning_rate": 1.573215401457103e-05, + "loss": 0.4061, + "step": 16466 + }, + { + "epoch": 0.30572955150507014, + "grad_norm": 0.4151996672153473, + "learning_rate": 1.5731198147613415e-05, + "loss": 0.3125, + "step": 16468 + }, + { + "epoch": 0.30576668164248877, + "grad_norm": 0.3945901393890381, + "learning_rate": 1.573024220267136e-05, + "loss": 0.4078, + "step": 16470 + }, + { + "epoch": 0.3058038117799074, + "grad_norm": 0.39663851261138916, + "learning_rate": 1.5729286179757878e-05, + "loss": 0.2258, + "step": 16472 + }, + { + "epoch": 0.305840941917326, + "grad_norm": 0.33139923214912415, + "learning_rate": 1.5728330078885966e-05, + "loss": 0.3279, + "step": 16474 + }, + { + "epoch": 0.30587807205474465, + "grad_norm": 0.43833813071250916, + "learning_rate": 1.5727373900068642e-05, + "loss": 0.364, + "step": 16476 + }, + { + "epoch": 0.30591520219216334, + "grad_norm": 0.4080651104450226, + "learning_rate": 1.5726417643318917e-05, + "loss": 0.1984, + "step": 16478 + }, + { + "epoch": 0.30595233232958197, + "grad_norm": 0.19043704867362976, + "learning_rate": 1.5725461308649797e-05, + "loss": 0.2146, + "step": 16480 + }, + { + "epoch": 0.3059894624670006, + "grad_norm": 0.2585740089416504, + "learning_rate": 1.57245048960743e-05, + "loss": 0.2808, + "step": 16482 + }, + { + "epoch": 0.3060265926044192, + "grad_norm": 0.3631007969379425, + "learning_rate": 1.5723548405605438e-05, + "loss": 0.3558, + "step": 16484 + }, + { + "epoch": 0.30606372274183785, + "grad_norm": 0.24084387719631195, + "learning_rate": 1.5722591837256228e-05, + "loss": 0.2598, + "step": 16486 + }, + { + "epoch": 0.3061008528792565, + "grad_norm": 0.36494797468185425, + "learning_rate": 1.572163519103968e-05, + "loss": 0.3286, + "step": 16488 + }, + { + "epoch": 0.30613798301667516, + "grad_norm": 0.3660631775856018, + "learning_rate": 1.572067846696882e-05, + "loss": 0.3426, + "step": 16490 + }, + { + "epoch": 0.3061751131540938, + "grad_norm": 0.6746957302093506, + "learning_rate": 1.571972166505666e-05, + "loss": 0.3402, + "step": 16492 + }, + { + "epoch": 0.3062122432915124, + "grad_norm": 0.34617069363594055, + "learning_rate": 1.571876478531622e-05, + "loss": 0.3434, + "step": 16494 + }, + { + "epoch": 0.30624937342893105, + "grad_norm": 0.2727055847644806, + "learning_rate": 1.571780782776052e-05, + "loss": 0.1097, + "step": 16496 + }, + { + "epoch": 0.3062865035663497, + "grad_norm": 0.4270043671131134, + "learning_rate": 1.5716850792402583e-05, + "loss": 0.2976, + "step": 16498 + }, + { + "epoch": 0.30632363370376836, + "grad_norm": 0.3759697675704956, + "learning_rate": 1.571589367925543e-05, + "loss": 0.3867, + "step": 16500 + }, + { + "epoch": 0.306360763841187, + "grad_norm": 0.2748478651046753, + "learning_rate": 1.5714936488332088e-05, + "loss": 0.4845, + "step": 16502 + }, + { + "epoch": 0.3063978939786056, + "grad_norm": 0.3093646466732025, + "learning_rate": 1.5713979219645575e-05, + "loss": 0.3446, + "step": 16504 + }, + { + "epoch": 0.30643502411602425, + "grad_norm": 0.7534693479537964, + "learning_rate": 1.5713021873208926e-05, + "loss": 0.2728, + "step": 16506 + }, + { + "epoch": 0.3064721542534429, + "grad_norm": 0.31008830666542053, + "learning_rate": 1.5712064449035157e-05, + "loss": 0.3914, + "step": 16508 + }, + { + "epoch": 0.30650928439086156, + "grad_norm": 0.49009695649147034, + "learning_rate": 1.57111069471373e-05, + "loss": 0.3029, + "step": 16510 + }, + { + "epoch": 0.3065464145282802, + "grad_norm": 0.29887720942497253, + "learning_rate": 1.5710149367528383e-05, + "loss": 0.3723, + "step": 16512 + }, + { + "epoch": 0.3065835446656988, + "grad_norm": 0.26778683066368103, + "learning_rate": 1.570919171022144e-05, + "loss": 0.2706, + "step": 16514 + }, + { + "epoch": 0.30662067480311744, + "grad_norm": 0.31673118472099304, + "learning_rate": 1.57082339752295e-05, + "loss": 0.2003, + "step": 16516 + }, + { + "epoch": 0.30665780494053607, + "grad_norm": 0.36437028646469116, + "learning_rate": 1.570727616256559e-05, + "loss": 0.2945, + "step": 16518 + }, + { + "epoch": 0.3066949350779547, + "grad_norm": 0.47071221470832825, + "learning_rate": 1.5706318272242747e-05, + "loss": 0.2912, + "step": 16520 + }, + { + "epoch": 0.3067320652153734, + "grad_norm": 0.30297818779945374, + "learning_rate": 1.570536030427401e-05, + "loss": 0.2929, + "step": 16522 + }, + { + "epoch": 0.306769195352792, + "grad_norm": 0.3967669606208801, + "learning_rate": 1.57044022586724e-05, + "loss": 0.3168, + "step": 16524 + }, + { + "epoch": 0.30680632549021064, + "grad_norm": 0.2511221766471863, + "learning_rate": 1.570344413545097e-05, + "loss": 0.3666, + "step": 16526 + }, + { + "epoch": 0.30684345562762927, + "grad_norm": 0.27303099632263184, + "learning_rate": 1.5702485934622747e-05, + "loss": 0.4436, + "step": 16528 + }, + { + "epoch": 0.3068805857650479, + "grad_norm": 0.5541063547134399, + "learning_rate": 1.570152765620077e-05, + "loss": 0.191, + "step": 16530 + }, + { + "epoch": 0.3069177159024666, + "grad_norm": 0.27020296454429626, + "learning_rate": 1.5700569300198077e-05, + "loss": 0.3549, + "step": 16532 + }, + { + "epoch": 0.3069548460398852, + "grad_norm": 0.44569888710975647, + "learning_rate": 1.5699610866627718e-05, + "loss": 0.2356, + "step": 16534 + }, + { + "epoch": 0.30699197617730384, + "grad_norm": 0.3283940553665161, + "learning_rate": 1.5698652355502724e-05, + "loss": 0.3459, + "step": 16536 + }, + { + "epoch": 0.30702910631472247, + "grad_norm": 0.44204750657081604, + "learning_rate": 1.5697693766836137e-05, + "loss": 0.2603, + "step": 16538 + }, + { + "epoch": 0.3070662364521411, + "grad_norm": 0.37685173749923706, + "learning_rate": 1.5696735100641013e-05, + "loss": 0.4798, + "step": 16540 + }, + { + "epoch": 0.3071033665895597, + "grad_norm": 0.4294675886631012, + "learning_rate": 1.5695776356930378e-05, + "loss": 0.2816, + "step": 16542 + }, + { + "epoch": 0.3071404967269784, + "grad_norm": 0.3429599106311798, + "learning_rate": 1.569481753571729e-05, + "loss": 0.3775, + "step": 16544 + }, + { + "epoch": 0.30717762686439704, + "grad_norm": 0.36742743849754333, + "learning_rate": 1.56938586370148e-05, + "loss": 0.3747, + "step": 16546 + }, + { + "epoch": 0.30721475700181566, + "grad_norm": 0.44516023993492126, + "learning_rate": 1.5692899660835946e-05, + "loss": 0.3403, + "step": 16548 + }, + { + "epoch": 0.3072518871392343, + "grad_norm": 0.4365513026714325, + "learning_rate": 1.5691940607193777e-05, + "loss": 0.157, + "step": 16550 + }, + { + "epoch": 0.3072890172766529, + "grad_norm": 0.4185921251773834, + "learning_rate": 1.5690981476101347e-05, + "loss": 0.329, + "step": 16552 + }, + { + "epoch": 0.3073261474140716, + "grad_norm": 0.22240328788757324, + "learning_rate": 1.569002226757171e-05, + "loss": 0.242, + "step": 16554 + }, + { + "epoch": 0.30736327755149023, + "grad_norm": 0.2623108923435211, + "learning_rate": 1.5689062981617905e-05, + "loss": 0.3067, + "step": 16556 + }, + { + "epoch": 0.30740040768890886, + "grad_norm": 0.33413684368133545, + "learning_rate": 1.5688103618252995e-05, + "loss": 0.4104, + "step": 16558 + }, + { + "epoch": 0.3074375378263275, + "grad_norm": 0.3714750111103058, + "learning_rate": 1.568714417749004e-05, + "loss": 0.2817, + "step": 16560 + }, + { + "epoch": 0.3074746679637461, + "grad_norm": 0.38130223751068115, + "learning_rate": 1.568618465934208e-05, + "loss": 0.2496, + "step": 16562 + }, + { + "epoch": 0.30751179810116475, + "grad_norm": 0.3975653648376465, + "learning_rate": 1.5685225063822183e-05, + "loss": 0.3612, + "step": 16564 + }, + { + "epoch": 0.30754892823858343, + "grad_norm": 0.37250664830207825, + "learning_rate": 1.5684265390943404e-05, + "loss": 0.3116, + "step": 16566 + }, + { + "epoch": 0.30758605837600206, + "grad_norm": 0.5297329425811768, + "learning_rate": 1.5683305640718794e-05, + "loss": 0.1213, + "step": 16568 + }, + { + "epoch": 0.3076231885134207, + "grad_norm": 0.3852524161338806, + "learning_rate": 1.5682345813161418e-05, + "loss": 0.325, + "step": 16570 + }, + { + "epoch": 0.3076603186508393, + "grad_norm": 0.39035290479660034, + "learning_rate": 1.568138590828434e-05, + "loss": 0.3929, + "step": 16572 + }, + { + "epoch": 0.30769744878825794, + "grad_norm": 0.6014788150787354, + "learning_rate": 1.5680425926100618e-05, + "loss": 0.2191, + "step": 16574 + }, + { + "epoch": 0.3077345789256766, + "grad_norm": 0.2949322760105133, + "learning_rate": 1.567946586662331e-05, + "loss": 0.2513, + "step": 16576 + }, + { + "epoch": 0.30777170906309526, + "grad_norm": 0.45096710324287415, + "learning_rate": 1.5678505729865485e-05, + "loss": 0.3135, + "step": 16578 + }, + { + "epoch": 0.3078088392005139, + "grad_norm": 0.40488123893737793, + "learning_rate": 1.5677545515840205e-05, + "loss": 0.3203, + "step": 16580 + }, + { + "epoch": 0.3078459693379325, + "grad_norm": 0.4338582158088684, + "learning_rate": 1.5676585224560537e-05, + "loss": 0.1121, + "step": 16582 + }, + { + "epoch": 0.30788309947535114, + "grad_norm": 0.28338631987571716, + "learning_rate": 1.5675624856039552e-05, + "loss": 0.3647, + "step": 16584 + }, + { + "epoch": 0.30792022961276977, + "grad_norm": 0.3651292026042938, + "learning_rate": 1.567466441029031e-05, + "loss": 0.2897, + "step": 16586 + }, + { + "epoch": 0.30795735975018845, + "grad_norm": 0.43862706422805786, + "learning_rate": 1.5673703887325885e-05, + "loss": 0.3559, + "step": 16588 + }, + { + "epoch": 0.3079944898876071, + "grad_norm": 0.899421215057373, + "learning_rate": 1.567274328715934e-05, + "loss": 0.3258, + "step": 16590 + }, + { + "epoch": 0.3080316200250257, + "grad_norm": 0.2669694125652313, + "learning_rate": 1.5671782609803754e-05, + "loss": 0.1478, + "step": 16592 + }, + { + "epoch": 0.30806875016244434, + "grad_norm": 0.35610005259513855, + "learning_rate": 1.5670821855272192e-05, + "loss": 0.2901, + "step": 16594 + }, + { + "epoch": 0.30810588029986297, + "grad_norm": 0.5791921615600586, + "learning_rate": 1.5669861023577735e-05, + "loss": 0.3681, + "step": 16596 + }, + { + "epoch": 0.30814301043728165, + "grad_norm": 0.3730636239051819, + "learning_rate": 1.5668900114733454e-05, + "loss": 0.1845, + "step": 16598 + }, + { + "epoch": 0.3081801405747003, + "grad_norm": 0.41708168387413025, + "learning_rate": 1.566793912875242e-05, + "loss": 0.3099, + "step": 16600 + }, + { + "epoch": 0.3082172707121189, + "grad_norm": 0.29422029852867126, + "learning_rate": 1.566697806564771e-05, + "loss": 0.3004, + "step": 16602 + }, + { + "epoch": 0.30825440084953754, + "grad_norm": 0.250308632850647, + "learning_rate": 1.5666016925432405e-05, + "loss": 0.158, + "step": 16604 + }, + { + "epoch": 0.30829153098695616, + "grad_norm": 0.39087826013565063, + "learning_rate": 1.566505570811958e-05, + "loss": 0.2636, + "step": 16606 + }, + { + "epoch": 0.30832866112437485, + "grad_norm": 0.32174766063690186, + "learning_rate": 1.5664094413722318e-05, + "loss": 0.3822, + "step": 16608 + }, + { + "epoch": 0.3083657912617935, + "grad_norm": 0.5919029712677002, + "learning_rate": 1.5663133042253696e-05, + "loss": 0.1749, + "step": 16610 + }, + { + "epoch": 0.3084029213992121, + "grad_norm": 0.6407782435417175, + "learning_rate": 1.5662171593726797e-05, + "loss": 0.4165, + "step": 16612 + }, + { + "epoch": 0.30844005153663073, + "grad_norm": 0.30908292531967163, + "learning_rate": 1.56612100681547e-05, + "loss": 0.3558, + "step": 16614 + }, + { + "epoch": 0.30847718167404936, + "grad_norm": 0.3452662229537964, + "learning_rate": 1.5660248465550495e-05, + "loss": 0.4225, + "step": 16616 + }, + { + "epoch": 0.308514311811468, + "grad_norm": 0.2847616374492645, + "learning_rate": 1.565928678592726e-05, + "loss": 0.2013, + "step": 16618 + }, + { + "epoch": 0.3085514419488867, + "grad_norm": 0.205544114112854, + "learning_rate": 1.5658325029298084e-05, + "loss": 0.3006, + "step": 16620 + }, + { + "epoch": 0.3085885720863053, + "grad_norm": 0.40073904395103455, + "learning_rate": 1.5657363195676055e-05, + "loss": 0.3951, + "step": 16622 + }, + { + "epoch": 0.30862570222372393, + "grad_norm": 0.37328073382377625, + "learning_rate": 1.5656401285074255e-05, + "loss": 0.3626, + "step": 16624 + }, + { + "epoch": 0.30866283236114256, + "grad_norm": 0.38561737537384033, + "learning_rate": 1.565543929750578e-05, + "loss": 0.4373, + "step": 16626 + }, + { + "epoch": 0.3086999624985612, + "grad_norm": 0.37617290019989014, + "learning_rate": 1.5654477232983715e-05, + "loss": 0.2897, + "step": 16628 + }, + { + "epoch": 0.30873709263597987, + "grad_norm": 0.32271674275398254, + "learning_rate": 1.565351509152115e-05, + "loss": 0.2295, + "step": 16630 + }, + { + "epoch": 0.3087742227733985, + "grad_norm": 0.2987178862094879, + "learning_rate": 1.565255287313118e-05, + "loss": 0.3438, + "step": 16632 + }, + { + "epoch": 0.3088113529108171, + "grad_norm": 0.3464885652065277, + "learning_rate": 1.56515905778269e-05, + "loss": 0.2258, + "step": 16634 + }, + { + "epoch": 0.30884848304823576, + "grad_norm": 0.24199305474758148, + "learning_rate": 1.5650628205621397e-05, + "loss": 0.0808, + "step": 16636 + }, + { + "epoch": 0.3088856131856544, + "grad_norm": 0.44331419467926025, + "learning_rate": 1.564966575652777e-05, + "loss": 0.3605, + "step": 16638 + }, + { + "epoch": 0.308922743323073, + "grad_norm": 0.33524084091186523, + "learning_rate": 1.5648703230559115e-05, + "loss": 0.1736, + "step": 16640 + }, + { + "epoch": 0.3089598734604917, + "grad_norm": 0.19516101479530334, + "learning_rate": 1.564774062772853e-05, + "loss": 0.3516, + "step": 16642 + }, + { + "epoch": 0.3089970035979103, + "grad_norm": 0.34480419754981995, + "learning_rate": 1.564677794804911e-05, + "loss": 0.3492, + "step": 16644 + }, + { + "epoch": 0.30903413373532895, + "grad_norm": 1.7767935991287231, + "learning_rate": 1.5645815191533956e-05, + "loss": 0.3228, + "step": 16646 + }, + { + "epoch": 0.3090712638727476, + "grad_norm": 0.4164237678050995, + "learning_rate": 1.564485235819617e-05, + "loss": 0.3073, + "step": 16648 + }, + { + "epoch": 0.3091083940101662, + "grad_norm": 0.49734604358673096, + "learning_rate": 1.5643889448048853e-05, + "loss": 0.3002, + "step": 16650 + }, + { + "epoch": 0.3091455241475849, + "grad_norm": 0.380039244890213, + "learning_rate": 1.5642926461105106e-05, + "loss": 0.2654, + "step": 16652 + }, + { + "epoch": 0.3091826542850035, + "grad_norm": 0.44405633211135864, + "learning_rate": 1.5641963397378032e-05, + "loss": 0.2832, + "step": 16654 + }, + { + "epoch": 0.30921978442242215, + "grad_norm": 0.33024150133132935, + "learning_rate": 1.5641000256880734e-05, + "loss": 0.4254, + "step": 16656 + }, + { + "epoch": 0.3092569145598408, + "grad_norm": 0.30719926953315735, + "learning_rate": 1.564003703962632e-05, + "loss": 0.3623, + "step": 16658 + }, + { + "epoch": 0.3092940446972594, + "grad_norm": 0.4838407635688782, + "learning_rate": 1.5639073745627894e-05, + "loss": 0.316, + "step": 16660 + }, + { + "epoch": 0.30933117483467804, + "grad_norm": 0.3967163562774658, + "learning_rate": 1.5638110374898567e-05, + "loss": 0.3132, + "step": 16662 + }, + { + "epoch": 0.3093683049720967, + "grad_norm": 0.23284919559955597, + "learning_rate": 1.5637146927451447e-05, + "loss": 0.2195, + "step": 16664 + }, + { + "epoch": 0.30940543510951535, + "grad_norm": 0.3349030315876007, + "learning_rate": 1.563618340329964e-05, + "loss": 0.2879, + "step": 16666 + }, + { + "epoch": 0.309442565246934, + "grad_norm": 0.4190990626811981, + "learning_rate": 1.5635219802456264e-05, + "loss": 0.366, + "step": 16668 + }, + { + "epoch": 0.3094796953843526, + "grad_norm": 0.3902192711830139, + "learning_rate": 1.563425612493442e-05, + "loss": 0.2041, + "step": 16670 + }, + { + "epoch": 0.30951682552177123, + "grad_norm": 0.3882218897342682, + "learning_rate": 1.5633292370747232e-05, + "loss": 0.2131, + "step": 16672 + }, + { + "epoch": 0.3095539556591899, + "grad_norm": 0.6261725425720215, + "learning_rate": 1.5632328539907806e-05, + "loss": 0.1961, + "step": 16674 + }, + { + "epoch": 0.30959108579660854, + "grad_norm": 0.3968117833137512, + "learning_rate": 1.5631364632429255e-05, + "loss": 0.2987, + "step": 16676 + }, + { + "epoch": 0.3096282159340272, + "grad_norm": 0.32582974433898926, + "learning_rate": 1.5630400648324705e-05, + "loss": 0.303, + "step": 16678 + }, + { + "epoch": 0.3096653460714458, + "grad_norm": 0.34196314215660095, + "learning_rate": 1.5629436587607266e-05, + "loss": 0.3422, + "step": 16680 + }, + { + "epoch": 0.30970247620886443, + "grad_norm": 0.3452033996582031, + "learning_rate": 1.5628472450290058e-05, + "loss": 0.2331, + "step": 16682 + }, + { + "epoch": 0.3097396063462831, + "grad_norm": 0.3539777994155884, + "learning_rate": 1.5627508236386195e-05, + "loss": 0.2744, + "step": 16684 + }, + { + "epoch": 0.30977673648370174, + "grad_norm": 0.3950103521347046, + "learning_rate": 1.5626543945908805e-05, + "loss": 0.3383, + "step": 16686 + }, + { + "epoch": 0.30981386662112037, + "grad_norm": 0.3352287709712982, + "learning_rate": 1.5625579578871004e-05, + "loss": 0.3235, + "step": 16688 + }, + { + "epoch": 0.309850996758539, + "grad_norm": 0.6298438906669617, + "learning_rate": 1.5624615135285917e-05, + "loss": 0.2235, + "step": 16690 + }, + { + "epoch": 0.3098881268959576, + "grad_norm": 0.45068278908729553, + "learning_rate": 1.5623650615166664e-05, + "loss": 0.4382, + "step": 16692 + }, + { + "epoch": 0.30992525703337626, + "grad_norm": 0.3961672782897949, + "learning_rate": 1.5622686018526373e-05, + "loss": 0.1926, + "step": 16694 + }, + { + "epoch": 0.30996238717079494, + "grad_norm": 0.5014447569847107, + "learning_rate": 1.5621721345378166e-05, + "loss": 0.3701, + "step": 16696 + }, + { + "epoch": 0.30999951730821357, + "grad_norm": 0.42759397625923157, + "learning_rate": 1.562075659573517e-05, + "loss": 0.3959, + "step": 16698 + }, + { + "epoch": 0.3100366474456322, + "grad_norm": 0.43737176060676575, + "learning_rate": 1.5619791769610512e-05, + "loss": 0.3423, + "step": 16700 + }, + { + "epoch": 0.3100737775830508, + "grad_norm": 0.29817995429039, + "learning_rate": 1.5618826867017322e-05, + "loss": 0.3211, + "step": 16702 + }, + { + "epoch": 0.31011090772046945, + "grad_norm": 0.3803866505622864, + "learning_rate": 1.561786188796873e-05, + "loss": 0.2431, + "step": 16704 + }, + { + "epoch": 0.31014803785788814, + "grad_norm": 0.551832914352417, + "learning_rate": 1.5616896832477864e-05, + "loss": 0.4096, + "step": 16706 + }, + { + "epoch": 0.31018516799530677, + "grad_norm": 0.388785719871521, + "learning_rate": 1.5615931700557856e-05, + "loss": 0.295, + "step": 16708 + }, + { + "epoch": 0.3102222981327254, + "grad_norm": 0.4111138582229614, + "learning_rate": 1.5614966492221843e-05, + "loss": 0.375, + "step": 16710 + }, + { + "epoch": 0.310259428270144, + "grad_norm": 0.384615421295166, + "learning_rate": 1.561400120748295e-05, + "loss": 0.2436, + "step": 16712 + }, + { + "epoch": 0.31029655840756265, + "grad_norm": 0.5951463580131531, + "learning_rate": 1.5613035846354318e-05, + "loss": 0.3952, + "step": 16714 + }, + { + "epoch": 0.3103336885449813, + "grad_norm": 0.3531409502029419, + "learning_rate": 1.561207040884908e-05, + "loss": 0.2586, + "step": 16716 + }, + { + "epoch": 0.31037081868239996, + "grad_norm": 0.664386510848999, + "learning_rate": 1.5611104894980377e-05, + "loss": 0.2643, + "step": 16718 + }, + { + "epoch": 0.3104079488198186, + "grad_norm": 0.20239783823490143, + "learning_rate": 1.561013930476134e-05, + "loss": 0.1795, + "step": 16720 + }, + { + "epoch": 0.3104450789572372, + "grad_norm": 0.4996491074562073, + "learning_rate": 1.560917363820511e-05, + "loss": 0.1277, + "step": 16722 + }, + { + "epoch": 0.31048220909465585, + "grad_norm": 0.3859277069568634, + "learning_rate": 1.5608207895324837e-05, + "loss": 0.1541, + "step": 16724 + }, + { + "epoch": 0.3105193392320745, + "grad_norm": 0.4379408359527588, + "learning_rate": 1.5607242076133646e-05, + "loss": 0.4843, + "step": 16726 + }, + { + "epoch": 0.31055646936949316, + "grad_norm": 0.5015046000480652, + "learning_rate": 1.5606276180644685e-05, + "loss": 0.2584, + "step": 16728 + }, + { + "epoch": 0.3105935995069118, + "grad_norm": 0.2655382454395294, + "learning_rate": 1.5605310208871107e-05, + "loss": 0.4608, + "step": 16730 + }, + { + "epoch": 0.3106307296443304, + "grad_norm": 0.3892948031425476, + "learning_rate": 1.5604344160826033e-05, + "loss": 0.1443, + "step": 16732 + }, + { + "epoch": 0.31066785978174905, + "grad_norm": 0.38978904485702515, + "learning_rate": 1.560337803652263e-05, + "loss": 0.3282, + "step": 16734 + }, + { + "epoch": 0.3107049899191677, + "grad_norm": 0.3597666323184967, + "learning_rate": 1.560241183597404e-05, + "loss": 0.2989, + "step": 16736 + }, + { + "epoch": 0.3107421200565863, + "grad_norm": 0.315565288066864, + "learning_rate": 1.5601445559193397e-05, + "loss": 0.2377, + "step": 16738 + }, + { + "epoch": 0.310779250194005, + "grad_norm": 0.30008500814437866, + "learning_rate": 1.5600479206193867e-05, + "loss": 0.3912, + "step": 16740 + }, + { + "epoch": 0.3108163803314236, + "grad_norm": 0.533191978931427, + "learning_rate": 1.5599512776988586e-05, + "loss": 0.3127, + "step": 16742 + }, + { + "epoch": 0.31085351046884224, + "grad_norm": 0.3123820424079895, + "learning_rate": 1.5598546271590705e-05, + "loss": 0.1398, + "step": 16744 + }, + { + "epoch": 0.31089064060626087, + "grad_norm": 0.3105533719062805, + "learning_rate": 1.559757969001338e-05, + "loss": 0.1833, + "step": 16746 + }, + { + "epoch": 0.3109277707436795, + "grad_norm": 0.36196911334991455, + "learning_rate": 1.5596613032269768e-05, + "loss": 0.1458, + "step": 16748 + }, + { + "epoch": 0.3109649008810982, + "grad_norm": 0.40094897150993347, + "learning_rate": 1.559564629837301e-05, + "loss": 0.2116, + "step": 16750 + }, + { + "epoch": 0.3110020310185168, + "grad_norm": 0.3503999710083008, + "learning_rate": 1.559467948833627e-05, + "loss": 0.4452, + "step": 16752 + }, + { + "epoch": 0.31103916115593544, + "grad_norm": 0.29853150248527527, + "learning_rate": 1.5593712602172702e-05, + "loss": 0.2372, + "step": 16754 + }, + { + "epoch": 0.31107629129335407, + "grad_norm": 0.38461723923683167, + "learning_rate": 1.559274563989546e-05, + "loss": 0.2793, + "step": 16756 + }, + { + "epoch": 0.3111134214307727, + "grad_norm": 0.46388331055641174, + "learning_rate": 1.5591778601517698e-05, + "loss": 0.2289, + "step": 16758 + }, + { + "epoch": 0.3111505515681914, + "grad_norm": 0.3509833514690399, + "learning_rate": 1.559081148705258e-05, + "loss": 0.3896, + "step": 16760 + }, + { + "epoch": 0.31118768170561, + "grad_norm": 0.4400383234024048, + "learning_rate": 1.5589844296513266e-05, + "loss": 0.3793, + "step": 16762 + }, + { + "epoch": 0.31122481184302864, + "grad_norm": 0.3094305694103241, + "learning_rate": 1.5588877029912914e-05, + "loss": 0.3506, + "step": 16764 + }, + { + "epoch": 0.31126194198044727, + "grad_norm": 0.41437506675720215, + "learning_rate": 1.5587909687264686e-05, + "loss": 0.3179, + "step": 16766 + }, + { + "epoch": 0.3112990721178659, + "grad_norm": 0.45820292830467224, + "learning_rate": 1.5586942268581747e-05, + "loss": 0.3012, + "step": 16768 + }, + { + "epoch": 0.3113362022552845, + "grad_norm": 0.38028040528297424, + "learning_rate": 1.558597477387725e-05, + "loss": 0.3794, + "step": 16770 + }, + { + "epoch": 0.3113733323927032, + "grad_norm": 0.48959627747535706, + "learning_rate": 1.5585007203164376e-05, + "loss": 0.2376, + "step": 16772 + }, + { + "epoch": 0.31141046253012183, + "grad_norm": 0.43235957622528076, + "learning_rate": 1.5584039556456284e-05, + "loss": 0.2732, + "step": 16774 + }, + { + "epoch": 0.31144759266754046, + "grad_norm": 0.33892425894737244, + "learning_rate": 1.558307183376614e-05, + "loss": 0.4299, + "step": 16776 + }, + { + "epoch": 0.3114847228049591, + "grad_norm": 0.3762683570384979, + "learning_rate": 1.5582104035107105e-05, + "loss": 0.2682, + "step": 16778 + }, + { + "epoch": 0.3115218529423777, + "grad_norm": 0.43319469690322876, + "learning_rate": 1.558113616049236e-05, + "loss": 0.293, + "step": 16780 + }, + { + "epoch": 0.3115589830797964, + "grad_norm": 0.429830938577652, + "learning_rate": 1.558016820993507e-05, + "loss": 0.2874, + "step": 16782 + }, + { + "epoch": 0.31159611321721503, + "grad_norm": 0.4593583941459656, + "learning_rate": 1.55792001834484e-05, + "loss": 0.3934, + "step": 16784 + }, + { + "epoch": 0.31163324335463366, + "grad_norm": 0.3336837887763977, + "learning_rate": 1.5578232081045532e-05, + "loss": 0.2438, + "step": 16786 + }, + { + "epoch": 0.3116703734920523, + "grad_norm": 0.28354746103286743, + "learning_rate": 1.557726390273963e-05, + "loss": 0.2287, + "step": 16788 + }, + { + "epoch": 0.3117075036294709, + "grad_norm": 0.33228740096092224, + "learning_rate": 1.5576295648543876e-05, + "loss": 0.2207, + "step": 16790 + }, + { + "epoch": 0.31174463376688955, + "grad_norm": 0.32588300108909607, + "learning_rate": 1.557532731847144e-05, + "loss": 0.2904, + "step": 16792 + }, + { + "epoch": 0.31178176390430823, + "grad_norm": 0.3487551212310791, + "learning_rate": 1.55743589125355e-05, + "loss": 0.1579, + "step": 16794 + }, + { + "epoch": 0.31181889404172686, + "grad_norm": 0.2647467255592346, + "learning_rate": 1.557339043074923e-05, + "loss": 0.2663, + "step": 16796 + }, + { + "epoch": 0.3118560241791455, + "grad_norm": 0.34950193762779236, + "learning_rate": 1.5572421873125812e-05, + "loss": 0.1523, + "step": 16798 + }, + { + "epoch": 0.3118931543165641, + "grad_norm": 0.33119437098503113, + "learning_rate": 1.557145323967842e-05, + "loss": 0.2687, + "step": 16800 + }, + { + "epoch": 0.31193028445398274, + "grad_norm": 0.643551230430603, + "learning_rate": 1.5570484530420243e-05, + "loss": 0.2939, + "step": 16802 + }, + { + "epoch": 0.3119674145914014, + "grad_norm": 0.3724636435508728, + "learning_rate": 1.5569515745364455e-05, + "loss": 0.3898, + "step": 16804 + }, + { + "epoch": 0.31200454472882005, + "grad_norm": 0.4132455289363861, + "learning_rate": 1.556854688452424e-05, + "loss": 0.3013, + "step": 16806 + }, + { + "epoch": 0.3120416748662387, + "grad_norm": 0.37189170718193054, + "learning_rate": 1.5567577947912783e-05, + "loss": 0.2683, + "step": 16808 + }, + { + "epoch": 0.3120788050036573, + "grad_norm": 0.6936743259429932, + "learning_rate": 1.5566608935543265e-05, + "loss": 0.313, + "step": 16810 + }, + { + "epoch": 0.31211593514107594, + "grad_norm": 0.3180742561817169, + "learning_rate": 1.5565639847428876e-05, + "loss": 0.4826, + "step": 16812 + }, + { + "epoch": 0.31215306527849457, + "grad_norm": 0.38176193833351135, + "learning_rate": 1.5564670683582797e-05, + "loss": 0.0578, + "step": 16814 + }, + { + "epoch": 0.31219019541591325, + "grad_norm": 0.3374461531639099, + "learning_rate": 1.556370144401822e-05, + "loss": 0.2773, + "step": 16816 + }, + { + "epoch": 0.3122273255533319, + "grad_norm": 0.5728803277015686, + "learning_rate": 1.556273212874833e-05, + "loss": 0.0859, + "step": 16818 + }, + { + "epoch": 0.3122644556907505, + "grad_norm": 0.3416641354560852, + "learning_rate": 1.5561762737786317e-05, + "loss": 0.4054, + "step": 16820 + }, + { + "epoch": 0.31230158582816914, + "grad_norm": 0.5831412076950073, + "learning_rate": 1.5560793271145373e-05, + "loss": 0.1905, + "step": 16822 + }, + { + "epoch": 0.31233871596558777, + "grad_norm": 0.414559006690979, + "learning_rate": 1.5559823728838694e-05, + "loss": 0.5655, + "step": 16824 + }, + { + "epoch": 0.31237584610300645, + "grad_norm": 0.42011094093322754, + "learning_rate": 1.5558854110879464e-05, + "loss": 0.2772, + "step": 16826 + }, + { + "epoch": 0.3124129762404251, + "grad_norm": 0.4071503281593323, + "learning_rate": 1.5557884417280876e-05, + "loss": 0.3225, + "step": 16828 + }, + { + "epoch": 0.3124501063778437, + "grad_norm": 0.22644047439098358, + "learning_rate": 1.5556914648056138e-05, + "loss": 0.1915, + "step": 16830 + }, + { + "epoch": 0.31248723651526233, + "grad_norm": 0.3112654387950897, + "learning_rate": 1.555594480321843e-05, + "loss": 0.2623, + "step": 16832 + }, + { + "epoch": 0.31252436665268096, + "grad_norm": 0.4038960337638855, + "learning_rate": 1.5554974882780955e-05, + "loss": 0.2931, + "step": 16834 + }, + { + "epoch": 0.31256149679009965, + "grad_norm": 0.2340363711118698, + "learning_rate": 1.5554004886756914e-05, + "loss": 0.3117, + "step": 16836 + }, + { + "epoch": 0.3125986269275183, + "grad_norm": 0.19384273886680603, + "learning_rate": 1.5553034815159502e-05, + "loss": 0.1922, + "step": 16838 + }, + { + "epoch": 0.3126357570649369, + "grad_norm": 0.3948344886302948, + "learning_rate": 1.5552064668001917e-05, + "loss": 0.324, + "step": 16840 + }, + { + "epoch": 0.31267288720235553, + "grad_norm": 0.35939788818359375, + "learning_rate": 1.5551094445297365e-05, + "loss": 0.3553, + "step": 16842 + }, + { + "epoch": 0.31271001733977416, + "grad_norm": 0.3781391680240631, + "learning_rate": 1.5550124147059046e-05, + "loss": 0.1575, + "step": 16844 + }, + { + "epoch": 0.3127471474771928, + "grad_norm": 0.2912297248840332, + "learning_rate": 1.5549153773300157e-05, + "loss": 0.2537, + "step": 16846 + }, + { + "epoch": 0.3127842776146115, + "grad_norm": 0.3970768451690674, + "learning_rate": 1.5548183324033912e-05, + "loss": 0.4899, + "step": 16848 + }, + { + "epoch": 0.3128214077520301, + "grad_norm": 0.5380943417549133, + "learning_rate": 1.5547212799273508e-05, + "loss": 0.2991, + "step": 16850 + }, + { + "epoch": 0.31285853788944873, + "grad_norm": 0.41640540957450867, + "learning_rate": 1.5546242199032155e-05, + "loss": 0.5422, + "step": 16852 + }, + { + "epoch": 0.31289566802686736, + "grad_norm": 0.4586479365825653, + "learning_rate": 1.554527152332306e-05, + "loss": 0.3226, + "step": 16854 + }, + { + "epoch": 0.312932798164286, + "grad_norm": 0.5198866724967957, + "learning_rate": 1.5544300772159425e-05, + "loss": 0.4153, + "step": 16856 + }, + { + "epoch": 0.31296992830170467, + "grad_norm": 0.34491318464279175, + "learning_rate": 1.5543329945554467e-05, + "loss": 0.2755, + "step": 16858 + }, + { + "epoch": 0.3130070584391233, + "grad_norm": 0.33182820677757263, + "learning_rate": 1.554235904352139e-05, + "loss": 0.3102, + "step": 16860 + }, + { + "epoch": 0.3130441885765419, + "grad_norm": 0.32501810789108276, + "learning_rate": 1.5541388066073412e-05, + "loss": 0.2889, + "step": 16862 + }, + { + "epoch": 0.31308131871396055, + "grad_norm": 0.3546353876590729, + "learning_rate": 1.5540417013223734e-05, + "loss": 0.425, + "step": 16864 + }, + { + "epoch": 0.3131184488513792, + "grad_norm": 0.3657384514808655, + "learning_rate": 1.553944588498558e-05, + "loss": 0.2205, + "step": 16866 + }, + { + "epoch": 0.3131555789887978, + "grad_norm": 0.39293238520622253, + "learning_rate": 1.5538474681372164e-05, + "loss": 0.3824, + "step": 16868 + }, + { + "epoch": 0.3131927091262165, + "grad_norm": 0.303480863571167, + "learning_rate": 1.553750340239669e-05, + "loss": 0.2991, + "step": 16870 + }, + { + "epoch": 0.3132298392636351, + "grad_norm": 0.3296826481819153, + "learning_rate": 1.5536532048072386e-05, + "loss": 0.3682, + "step": 16872 + }, + { + "epoch": 0.31326696940105375, + "grad_norm": 0.34540224075317383, + "learning_rate": 1.5535560618412464e-05, + "loss": 0.4344, + "step": 16874 + }, + { + "epoch": 0.3133040995384724, + "grad_norm": 0.33837801218032837, + "learning_rate": 1.553458911343014e-05, + "loss": 0.2586, + "step": 16876 + }, + { + "epoch": 0.313341229675891, + "grad_norm": 0.3564090132713318, + "learning_rate": 1.553361753313864e-05, + "loss": 0.3548, + "step": 16878 + }, + { + "epoch": 0.3133783598133097, + "grad_norm": 0.5724231600761414, + "learning_rate": 1.553264587755118e-05, + "loss": 0.2972, + "step": 16880 + }, + { + "epoch": 0.3134154899507283, + "grad_norm": 0.3740960955619812, + "learning_rate": 1.5531674146680978e-05, + "loss": 0.2756, + "step": 16882 + }, + { + "epoch": 0.31345262008814695, + "grad_norm": 0.2462249994277954, + "learning_rate": 1.5530702340541264e-05, + "loss": 0.261, + "step": 16884 + }, + { + "epoch": 0.3134897502255656, + "grad_norm": 0.503528892993927, + "learning_rate": 1.5529730459145255e-05, + "loss": 0.2253, + "step": 16886 + }, + { + "epoch": 0.3135268803629842, + "grad_norm": 0.408740758895874, + "learning_rate": 1.552875850250618e-05, + "loss": 0.2096, + "step": 16888 + }, + { + "epoch": 0.31356401050040283, + "grad_norm": 0.34150925278663635, + "learning_rate": 1.552778647063726e-05, + "loss": 0.4244, + "step": 16890 + }, + { + "epoch": 0.3136011406378215, + "grad_norm": 0.5568698644638062, + "learning_rate": 1.5526814363551728e-05, + "loss": 0.3533, + "step": 16892 + }, + { + "epoch": 0.31363827077524015, + "grad_norm": 0.32314637303352356, + "learning_rate": 1.5525842181262806e-05, + "loss": 0.38, + "step": 16894 + }, + { + "epoch": 0.3136754009126588, + "grad_norm": 0.5012269020080566, + "learning_rate": 1.552486992378372e-05, + "loss": 0.3402, + "step": 16896 + }, + { + "epoch": 0.3137125310500774, + "grad_norm": 0.37399527430534363, + "learning_rate": 1.5523897591127706e-05, + "loss": 0.2211, + "step": 16898 + }, + { + "epoch": 0.31374966118749603, + "grad_norm": 0.41073545813560486, + "learning_rate": 1.5522925183307994e-05, + "loss": 0.2781, + "step": 16900 + }, + { + "epoch": 0.3137867913249147, + "grad_norm": 0.426268994808197, + "learning_rate": 1.552195270033781e-05, + "loss": 0.3287, + "step": 16902 + }, + { + "epoch": 0.31382392146233334, + "grad_norm": 0.3344692587852478, + "learning_rate": 1.552098014223039e-05, + "loss": 0.3983, + "step": 16904 + }, + { + "epoch": 0.313861051599752, + "grad_norm": 0.4050685167312622, + "learning_rate": 1.5520007508998967e-05, + "loss": 0.222, + "step": 16906 + }, + { + "epoch": 0.3138981817371706, + "grad_norm": 0.4483424425125122, + "learning_rate": 1.5519034800656777e-05, + "loss": 0.2489, + "step": 16908 + }, + { + "epoch": 0.31393531187458923, + "grad_norm": 0.28396525979042053, + "learning_rate": 1.551806201721706e-05, + "loss": 0.3253, + "step": 16910 + }, + { + "epoch": 0.3139724420120079, + "grad_norm": 0.3871673047542572, + "learning_rate": 1.5517089158693036e-05, + "loss": 0.361, + "step": 16912 + }, + { + "epoch": 0.31400957214942654, + "grad_norm": 0.42115357518196106, + "learning_rate": 1.551611622509796e-05, + "loss": 0.3653, + "step": 16914 + }, + { + "epoch": 0.31404670228684517, + "grad_norm": 0.37125369906425476, + "learning_rate": 1.5515143216445068e-05, + "loss": 0.2533, + "step": 16916 + }, + { + "epoch": 0.3140838324242638, + "grad_norm": 0.58906489610672, + "learning_rate": 1.5514170132747596e-05, + "loss": 0.1792, + "step": 16918 + }, + { + "epoch": 0.3141209625616824, + "grad_norm": 0.41949182748794556, + "learning_rate": 1.551319697401878e-05, + "loss": 0.2152, + "step": 16920 + }, + { + "epoch": 0.31415809269910105, + "grad_norm": 0.46473515033721924, + "learning_rate": 1.551222374027187e-05, + "loss": 0.2991, + "step": 16922 + }, + { + "epoch": 0.31419522283651974, + "grad_norm": 0.4395226538181305, + "learning_rate": 1.551125043152011e-05, + "loss": 0.4531, + "step": 16924 + }, + { + "epoch": 0.31423235297393837, + "grad_norm": 0.41832226514816284, + "learning_rate": 1.5510277047776738e-05, + "loss": 0.224, + "step": 16926 + }, + { + "epoch": 0.314269483111357, + "grad_norm": 0.3494199514389038, + "learning_rate": 1.5509303589054995e-05, + "loss": 0.6301, + "step": 16928 + }, + { + "epoch": 0.3143066132487756, + "grad_norm": 0.3696920573711395, + "learning_rate": 1.5508330055368135e-05, + "loss": 0.3947, + "step": 16930 + }, + { + "epoch": 0.31434374338619425, + "grad_norm": 0.3260973393917084, + "learning_rate": 1.5507356446729408e-05, + "loss": 0.373, + "step": 16932 + }, + { + "epoch": 0.31438087352361294, + "grad_norm": 0.3837592303752899, + "learning_rate": 1.550638276315205e-05, + "loss": 0.4387, + "step": 16934 + }, + { + "epoch": 0.31441800366103156, + "grad_norm": 0.42772239446640015, + "learning_rate": 1.550540900464932e-05, + "loss": 0.1604, + "step": 16936 + }, + { + "epoch": 0.3144551337984502, + "grad_norm": 0.3555832505226135, + "learning_rate": 1.550443517123446e-05, + "loss": 0.2962, + "step": 16938 + }, + { + "epoch": 0.3144922639358688, + "grad_norm": 0.4194395840167999, + "learning_rate": 1.5503461262920726e-05, + "loss": 0.2955, + "step": 16940 + }, + { + "epoch": 0.31452939407328745, + "grad_norm": 0.36514461040496826, + "learning_rate": 1.550248727972137e-05, + "loss": 0.1946, + "step": 16942 + }, + { + "epoch": 0.3145665242107061, + "grad_norm": 0.2874706983566284, + "learning_rate": 1.5501513221649643e-05, + "loss": 0.2431, + "step": 16944 + }, + { + "epoch": 0.31460365434812476, + "grad_norm": 0.33884042501449585, + "learning_rate": 1.5500539088718802e-05, + "loss": 0.3121, + "step": 16946 + }, + { + "epoch": 0.3146407844855434, + "grad_norm": 0.3478817641735077, + "learning_rate": 1.54995648809421e-05, + "loss": 0.3009, + "step": 16948 + }, + { + "epoch": 0.314677914622962, + "grad_norm": 0.3149224817752838, + "learning_rate": 1.549859059833279e-05, + "loss": 0.1553, + "step": 16950 + }, + { + "epoch": 0.31471504476038065, + "grad_norm": 0.3089819848537445, + "learning_rate": 1.5497616240904135e-05, + "loss": 0.3595, + "step": 16952 + }, + { + "epoch": 0.3147521748977993, + "grad_norm": 0.323050320148468, + "learning_rate": 1.5496641808669387e-05, + "loss": 0.3407, + "step": 16954 + }, + { + "epoch": 0.31478930503521796, + "grad_norm": 0.3516741394996643, + "learning_rate": 1.5495667301641813e-05, + "loss": 0.6208, + "step": 16956 + }, + { + "epoch": 0.3148264351726366, + "grad_norm": 0.312967449426651, + "learning_rate": 1.5494692719834668e-05, + "loss": 0.2443, + "step": 16958 + }, + { + "epoch": 0.3148635653100552, + "grad_norm": 0.3278021216392517, + "learning_rate": 1.549371806326121e-05, + "loss": 0.3215, + "step": 16960 + }, + { + "epoch": 0.31490069544747384, + "grad_norm": 0.3084736764431, + "learning_rate": 1.5492743331934708e-05, + "loss": 0.4914, + "step": 16962 + }, + { + "epoch": 0.3149378255848925, + "grad_norm": 0.4344116151332855, + "learning_rate": 1.5491768525868417e-05, + "loss": 0.4472, + "step": 16964 + }, + { + "epoch": 0.3149749557223111, + "grad_norm": 0.2994072139263153, + "learning_rate": 1.5490793645075612e-05, + "loss": 0.2869, + "step": 16966 + }, + { + "epoch": 0.3150120858597298, + "grad_norm": 0.4782739579677582, + "learning_rate": 1.548981868956955e-05, + "loss": 0.1901, + "step": 16968 + }, + { + "epoch": 0.3150492159971484, + "grad_norm": 0.26447317004203796, + "learning_rate": 1.5488843659363498e-05, + "loss": 0.3519, + "step": 16970 + }, + { + "epoch": 0.31508634613456704, + "grad_norm": 0.41366568207740784, + "learning_rate": 1.5487868554470725e-05, + "loss": 0.3481, + "step": 16972 + }, + { + "epoch": 0.31512347627198567, + "grad_norm": 0.35360974073410034, + "learning_rate": 1.54868933749045e-05, + "loss": 0.369, + "step": 16974 + }, + { + "epoch": 0.3151606064094043, + "grad_norm": 0.2950894832611084, + "learning_rate": 1.5485918120678095e-05, + "loss": 0.1567, + "step": 16976 + }, + { + "epoch": 0.315197736546823, + "grad_norm": 0.48664331436157227, + "learning_rate": 1.5484942791804772e-05, + "loss": 0.2836, + "step": 16978 + }, + { + "epoch": 0.3152348666842416, + "grad_norm": 0.3096611797809601, + "learning_rate": 1.548396738829781e-05, + "loss": 0.3211, + "step": 16980 + }, + { + "epoch": 0.31527199682166024, + "grad_norm": 0.287286639213562, + "learning_rate": 1.5482991910170474e-05, + "loss": 0.2503, + "step": 16982 + }, + { + "epoch": 0.31530912695907887, + "grad_norm": 0.36283108592033386, + "learning_rate": 1.5482016357436044e-05, + "loss": 0.2339, + "step": 16984 + }, + { + "epoch": 0.3153462570964975, + "grad_norm": 0.4548643231391907, + "learning_rate": 1.5481040730107793e-05, + "loss": 0.2347, + "step": 16986 + }, + { + "epoch": 0.3153833872339162, + "grad_norm": 0.23737511038780212, + "learning_rate": 1.5480065028198993e-05, + "loss": 0.3196, + "step": 16988 + }, + { + "epoch": 0.3154205173713348, + "grad_norm": 0.39252200722694397, + "learning_rate": 1.5479089251722924e-05, + "loss": 0.1878, + "step": 16990 + }, + { + "epoch": 0.31545764750875344, + "grad_norm": 0.5081819295883179, + "learning_rate": 1.547811340069286e-05, + "loss": 0.3945, + "step": 16992 + }, + { + "epoch": 0.31549477764617206, + "grad_norm": 0.47546496987342834, + "learning_rate": 1.5477137475122087e-05, + "loss": 0.2928, + "step": 16994 + }, + { + "epoch": 0.3155319077835907, + "grad_norm": 0.38005730509757996, + "learning_rate": 1.5476161475023876e-05, + "loss": 0.2861, + "step": 16996 + }, + { + "epoch": 0.3155690379210093, + "grad_norm": 0.4586920142173767, + "learning_rate": 1.5475185400411505e-05, + "loss": 0.2932, + "step": 16998 + }, + { + "epoch": 0.315606168058428, + "grad_norm": 0.3637450635433197, + "learning_rate": 1.5474209251298266e-05, + "loss": 0.2526, + "step": 17000 + }, + { + "epoch": 0.31564329819584663, + "grad_norm": 0.44537630677223206, + "learning_rate": 1.5473233027697435e-05, + "loss": 0.1664, + "step": 17002 + }, + { + "epoch": 0.31568042833326526, + "grad_norm": 0.5144463777542114, + "learning_rate": 1.5472256729622296e-05, + "loss": 0.3907, + "step": 17004 + }, + { + "epoch": 0.3157175584706839, + "grad_norm": 0.3204866051673889, + "learning_rate": 1.5471280357086137e-05, + "loss": 0.1882, + "step": 17006 + }, + { + "epoch": 0.3157546886081025, + "grad_norm": 0.6256248950958252, + "learning_rate": 1.547030391010224e-05, + "loss": 0.488, + "step": 17008 + }, + { + "epoch": 0.3157918187455212, + "grad_norm": 0.44798097014427185, + "learning_rate": 1.5469327388683892e-05, + "loss": 0.254, + "step": 17010 + }, + { + "epoch": 0.31582894888293983, + "grad_norm": 0.45150575041770935, + "learning_rate": 1.546835079284438e-05, + "loss": 0.2003, + "step": 17012 + }, + { + "epoch": 0.31586607902035846, + "grad_norm": 0.2642674744129181, + "learning_rate": 1.5467374122596993e-05, + "loss": 0.2592, + "step": 17014 + }, + { + "epoch": 0.3159032091577771, + "grad_norm": 0.45391687750816345, + "learning_rate": 1.546639737795502e-05, + "loss": 0.3936, + "step": 17016 + }, + { + "epoch": 0.3159403392951957, + "grad_norm": 0.40966805815696716, + "learning_rate": 1.5465420558931754e-05, + "loss": 0.2343, + "step": 17018 + }, + { + "epoch": 0.31597746943261434, + "grad_norm": 0.38056161999702454, + "learning_rate": 1.5464443665540487e-05, + "loss": 0.2579, + "step": 17020 + }, + { + "epoch": 0.31601459957003303, + "grad_norm": 0.24162164330482483, + "learning_rate": 1.5463466697794506e-05, + "loss": 0.1599, + "step": 17022 + }, + { + "epoch": 0.31605172970745166, + "grad_norm": 0.2503904104232788, + "learning_rate": 1.546248965570711e-05, + "loss": 0.3272, + "step": 17024 + }, + { + "epoch": 0.3160888598448703, + "grad_norm": 0.2907984256744385, + "learning_rate": 1.5461512539291594e-05, + "loss": 0.3525, + "step": 17026 + }, + { + "epoch": 0.3161259899822889, + "grad_norm": 0.24068839848041534, + "learning_rate": 1.5460535348561245e-05, + "loss": 0.4553, + "step": 17028 + }, + { + "epoch": 0.31616312011970754, + "grad_norm": 0.3070763349533081, + "learning_rate": 1.545955808352937e-05, + "loss": 0.1747, + "step": 17030 + }, + { + "epoch": 0.3162002502571262, + "grad_norm": 0.6431100368499756, + "learning_rate": 1.5458580744209263e-05, + "loss": 0.253, + "step": 17032 + }, + { + "epoch": 0.31623738039454485, + "grad_norm": 0.3573244512081146, + "learning_rate": 1.5457603330614223e-05, + "loss": 0.378, + "step": 17034 + }, + { + "epoch": 0.3162745105319635, + "grad_norm": 0.3873637914657593, + "learning_rate": 1.5456625842757547e-05, + "loss": 0.171, + "step": 17036 + }, + { + "epoch": 0.3163116406693821, + "grad_norm": 0.4575580954551697, + "learning_rate": 1.5455648280652542e-05, + "loss": 0.3128, + "step": 17038 + }, + { + "epoch": 0.31634877080680074, + "grad_norm": 0.3653605282306671, + "learning_rate": 1.5454670644312502e-05, + "loss": 0.3308, + "step": 17040 + }, + { + "epoch": 0.31638590094421937, + "grad_norm": 0.29542601108551025, + "learning_rate": 1.5453692933750733e-05, + "loss": 0.0634, + "step": 17042 + }, + { + "epoch": 0.31642303108163805, + "grad_norm": 0.29997730255126953, + "learning_rate": 1.5452715148980542e-05, + "loss": 0.2642, + "step": 17044 + }, + { + "epoch": 0.3164601612190567, + "grad_norm": 0.6037067770957947, + "learning_rate": 1.5451737290015226e-05, + "loss": 0.1598, + "step": 17046 + }, + { + "epoch": 0.3164972913564753, + "grad_norm": 0.3124706745147705, + "learning_rate": 1.54507593568681e-05, + "loss": 0.5156, + "step": 17048 + }, + { + "epoch": 0.31653442149389394, + "grad_norm": 0.30188944935798645, + "learning_rate": 1.5449781349552463e-05, + "loss": 0.196, + "step": 17050 + }, + { + "epoch": 0.31657155163131256, + "grad_norm": 0.2679090201854706, + "learning_rate": 1.5448803268081623e-05, + "loss": 0.174, + "step": 17052 + }, + { + "epoch": 0.31660868176873125, + "grad_norm": 0.2522358000278473, + "learning_rate": 1.5447825112468896e-05, + "loss": 0.2339, + "step": 17054 + }, + { + "epoch": 0.3166458119061499, + "grad_norm": 0.3371419608592987, + "learning_rate": 1.5446846882727585e-05, + "loss": 0.3042, + "step": 17056 + }, + { + "epoch": 0.3166829420435685, + "grad_norm": 0.3254573941230774, + "learning_rate": 1.5445868578871003e-05, + "loss": 0.3273, + "step": 17058 + }, + { + "epoch": 0.31672007218098713, + "grad_norm": 0.4566475450992584, + "learning_rate": 1.5444890200912466e-05, + "loss": 0.406, + "step": 17060 + }, + { + "epoch": 0.31675720231840576, + "grad_norm": 0.32688191533088684, + "learning_rate": 1.5443911748865278e-05, + "loss": 0.3158, + "step": 17062 + }, + { + "epoch": 0.31679433245582445, + "grad_norm": 0.3352530300617218, + "learning_rate": 1.544293322274276e-05, + "loss": 0.3865, + "step": 17064 + }, + { + "epoch": 0.3168314625932431, + "grad_norm": 0.2378959059715271, + "learning_rate": 1.5441954622558225e-05, + "loss": 0.1334, + "step": 17066 + }, + { + "epoch": 0.3168685927306617, + "grad_norm": 0.5573373436927795, + "learning_rate": 1.5440975948324988e-05, + "loss": 0.4126, + "step": 17068 + }, + { + "epoch": 0.31690572286808033, + "grad_norm": 0.3591885268688202, + "learning_rate": 1.5439997200056365e-05, + "loss": 0.4827, + "step": 17070 + }, + { + "epoch": 0.31694285300549896, + "grad_norm": 0.40282365679740906, + "learning_rate": 1.5439018377765675e-05, + "loss": 0.1898, + "step": 17072 + }, + { + "epoch": 0.3169799831429176, + "grad_norm": 0.47648149728775024, + "learning_rate": 1.5438039481466236e-05, + "loss": 0.2402, + "step": 17074 + }, + { + "epoch": 0.31701711328033627, + "grad_norm": 0.44104674458503723, + "learning_rate": 1.543706051117137e-05, + "loss": 0.2882, + "step": 17076 + }, + { + "epoch": 0.3170542434177549, + "grad_norm": 0.341251015663147, + "learning_rate": 1.5436081466894394e-05, + "loss": 0.4102, + "step": 17078 + }, + { + "epoch": 0.31709137355517353, + "grad_norm": 0.41686397790908813, + "learning_rate": 1.543510234864863e-05, + "loss": 0.3669, + "step": 17080 + }, + { + "epoch": 0.31712850369259216, + "grad_norm": 0.4234611392021179, + "learning_rate": 1.543412315644741e-05, + "loss": 0.4593, + "step": 17082 + }, + { + "epoch": 0.3171656338300108, + "grad_norm": 0.40406325459480286, + "learning_rate": 1.5433143890304046e-05, + "loss": 0.3915, + "step": 17084 + }, + { + "epoch": 0.31720276396742947, + "grad_norm": 0.29293304681777954, + "learning_rate": 1.543216455023187e-05, + "loss": 0.2941, + "step": 17086 + }, + { + "epoch": 0.3172398941048481, + "grad_norm": 0.3940310776233673, + "learning_rate": 1.5431185136244204e-05, + "loss": 0.4023, + "step": 17088 + }, + { + "epoch": 0.3172770242422667, + "grad_norm": 0.42459636926651, + "learning_rate": 1.543020564835438e-05, + "loss": 0.3453, + "step": 17090 + }, + { + "epoch": 0.31731415437968535, + "grad_norm": 0.3892497420310974, + "learning_rate": 1.5429226086575718e-05, + "loss": 0.3783, + "step": 17092 + }, + { + "epoch": 0.317351284517104, + "grad_norm": 0.7685954570770264, + "learning_rate": 1.5428246450921552e-05, + "loss": 0.2637, + "step": 17094 + }, + { + "epoch": 0.3173884146545226, + "grad_norm": 0.3846810758113861, + "learning_rate": 1.5427266741405212e-05, + "loss": 0.4204, + "step": 17096 + }, + { + "epoch": 0.3174255447919413, + "grad_norm": 0.3611105680465698, + "learning_rate": 1.542628695804003e-05, + "loss": 0.3729, + "step": 17098 + }, + { + "epoch": 0.3174626749293599, + "grad_norm": 0.42518800497055054, + "learning_rate": 1.5425307100839332e-05, + "loss": 0.2504, + "step": 17100 + }, + { + "epoch": 0.31749980506677855, + "grad_norm": 0.27884796261787415, + "learning_rate": 1.542432716981646e-05, + "loss": 0.2942, + "step": 17102 + }, + { + "epoch": 0.3175369352041972, + "grad_norm": 0.3047977387905121, + "learning_rate": 1.542334716498474e-05, + "loss": 0.3466, + "step": 17104 + }, + { + "epoch": 0.3175740653416158, + "grad_norm": 0.21829275786876678, + "learning_rate": 1.542236708635751e-05, + "loss": 0.2413, + "step": 17106 + }, + { + "epoch": 0.3176111954790345, + "grad_norm": 0.6674472689628601, + "learning_rate": 1.542138693394811e-05, + "loss": 0.1465, + "step": 17108 + }, + { + "epoch": 0.3176483256164531, + "grad_norm": 0.40542134642601013, + "learning_rate": 1.5420406707769867e-05, + "loss": 0.4684, + "step": 17110 + }, + { + "epoch": 0.31768545575387175, + "grad_norm": 0.4353960156440735, + "learning_rate": 1.5419426407836126e-05, + "loss": 0.207, + "step": 17112 + }, + { + "epoch": 0.3177225858912904, + "grad_norm": 0.4035026431083679, + "learning_rate": 1.541844603416023e-05, + "loss": 0.2406, + "step": 17114 + }, + { + "epoch": 0.317759716028709, + "grad_norm": 0.522284984588623, + "learning_rate": 1.5417465586755508e-05, + "loss": 0.2644, + "step": 17116 + }, + { + "epoch": 0.31779684616612763, + "grad_norm": 0.46294915676116943, + "learning_rate": 1.5416485065635305e-05, + "loss": 0.2445, + "step": 17118 + }, + { + "epoch": 0.3178339763035463, + "grad_norm": 0.3212882876396179, + "learning_rate": 1.541550447081297e-05, + "loss": 0.4095, + "step": 17120 + }, + { + "epoch": 0.31787110644096495, + "grad_norm": 0.27929168939590454, + "learning_rate": 1.5414523802301834e-05, + "loss": 0.4618, + "step": 17122 + }, + { + "epoch": 0.3179082365783836, + "grad_norm": 0.4637104570865631, + "learning_rate": 1.541354306011525e-05, + "loss": 0.1734, + "step": 17124 + }, + { + "epoch": 0.3179453667158022, + "grad_norm": 0.2908938527107239, + "learning_rate": 1.5412562244266563e-05, + "loss": 0.3919, + "step": 17126 + }, + { + "epoch": 0.31798249685322083, + "grad_norm": 0.49456197023391724, + "learning_rate": 1.541158135476912e-05, + "loss": 0.353, + "step": 17128 + }, + { + "epoch": 0.3180196269906395, + "grad_norm": 0.4210249185562134, + "learning_rate": 1.5410600391636255e-05, + "loss": 0.5233, + "step": 17130 + }, + { + "epoch": 0.31805675712805814, + "grad_norm": 0.3941677212715149, + "learning_rate": 1.540961935488133e-05, + "loss": 0.4472, + "step": 17132 + }, + { + "epoch": 0.31809388726547677, + "grad_norm": 0.42962098121643066, + "learning_rate": 1.540863824451769e-05, + "loss": 0.317, + "step": 17134 + }, + { + "epoch": 0.3181310174028954, + "grad_norm": 0.2861193120479584, + "learning_rate": 1.5407657060558682e-05, + "loss": 0.159, + "step": 17136 + }, + { + "epoch": 0.31816814754031403, + "grad_norm": 0.39070311188697815, + "learning_rate": 1.540667580301766e-05, + "loss": 0.3073, + "step": 17138 + }, + { + "epoch": 0.3182052776777327, + "grad_norm": 0.35487428307533264, + "learning_rate": 1.5405694471907975e-05, + "loss": 0.3436, + "step": 17140 + }, + { + "epoch": 0.31824240781515134, + "grad_norm": 0.4509940445423126, + "learning_rate": 1.5404713067242982e-05, + "loss": 0.4102, + "step": 17142 + }, + { + "epoch": 0.31827953795256997, + "grad_norm": 0.31767669320106506, + "learning_rate": 1.540373158903603e-05, + "loss": 0.1784, + "step": 17144 + }, + { + "epoch": 0.3183166680899886, + "grad_norm": 0.6901447772979736, + "learning_rate": 1.540275003730048e-05, + "loss": 0.334, + "step": 17146 + }, + { + "epoch": 0.3183537982274072, + "grad_norm": 0.3304341733455658, + "learning_rate": 1.5401768412049682e-05, + "loss": 0.3113, + "step": 17148 + }, + { + "epoch": 0.31839092836482585, + "grad_norm": 0.46414175629615784, + "learning_rate": 1.5400786713297e-05, + "loss": 0.4127, + "step": 17150 + }, + { + "epoch": 0.31842805850224454, + "grad_norm": 0.3294484615325928, + "learning_rate": 1.539980494105579e-05, + "loss": 0.2479, + "step": 17152 + }, + { + "epoch": 0.31846518863966317, + "grad_norm": 0.7244925498962402, + "learning_rate": 1.53988230953394e-05, + "loss": 0.281, + "step": 17154 + }, + { + "epoch": 0.3185023187770818, + "grad_norm": 0.4349120855331421, + "learning_rate": 1.5397841176161205e-05, + "loss": 0.388, + "step": 17156 + }, + { + "epoch": 0.3185394489145004, + "grad_norm": 0.46940499544143677, + "learning_rate": 1.539685918353456e-05, + "loss": 0.3085, + "step": 17158 + }, + { + "epoch": 0.31857657905191905, + "grad_norm": 0.33632388710975647, + "learning_rate": 1.5395877117472828e-05, + "loss": 0.3303, + "step": 17160 + }, + { + "epoch": 0.31861370918933773, + "grad_norm": 0.5118480920791626, + "learning_rate": 1.5394894977989368e-05, + "loss": 0.2932, + "step": 17162 + }, + { + "epoch": 0.31865083932675636, + "grad_norm": 0.35144031047821045, + "learning_rate": 1.539391276509755e-05, + "loss": 0.3017, + "step": 17164 + }, + { + "epoch": 0.318687969464175, + "grad_norm": 0.29943007230758667, + "learning_rate": 1.5392930478810732e-05, + "loss": 0.4069, + "step": 17166 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.738653838634491, + "learning_rate": 1.5391948119142284e-05, + "loss": 0.3162, + "step": 17168 + }, + { + "epoch": 0.31876222973901225, + "grad_norm": 0.33713826537132263, + "learning_rate": 1.5390965686105578e-05, + "loss": 0.2667, + "step": 17170 + }, + { + "epoch": 0.3187993598764309, + "grad_norm": 0.3678927421569824, + "learning_rate": 1.538998317971397e-05, + "loss": 0.4504, + "step": 17172 + }, + { + "epoch": 0.31883649001384956, + "grad_norm": 0.24662251770496368, + "learning_rate": 1.5389000599980838e-05, + "loss": 0.2912, + "step": 17174 + }, + { + "epoch": 0.3188736201512682, + "grad_norm": 0.4291885495185852, + "learning_rate": 1.5388017946919554e-05, + "loss": 0.3903, + "step": 17176 + }, + { + "epoch": 0.3189107502886868, + "grad_norm": 0.286139577627182, + "learning_rate": 1.5387035220543482e-05, + "loss": 0.4978, + "step": 17178 + }, + { + "epoch": 0.31894788042610545, + "grad_norm": 0.3912820518016815, + "learning_rate": 1.5386052420865993e-05, + "loss": 0.3459, + "step": 17180 + }, + { + "epoch": 0.3189850105635241, + "grad_norm": 0.36913177371025085, + "learning_rate": 1.538506954790047e-05, + "loss": 0.2298, + "step": 17182 + }, + { + "epoch": 0.31902214070094276, + "grad_norm": 0.4968512952327728, + "learning_rate": 1.5384086601660278e-05, + "loss": 0.2571, + "step": 17184 + }, + { + "epoch": 0.3190592708383614, + "grad_norm": 0.37922948598861694, + "learning_rate": 1.5383103582158792e-05, + "loss": 0.3074, + "step": 17186 + }, + { + "epoch": 0.31909640097578, + "grad_norm": 0.42133092880249023, + "learning_rate": 1.5382120489409394e-05, + "loss": 0.3144, + "step": 17188 + }, + { + "epoch": 0.31913353111319864, + "grad_norm": 0.28329595923423767, + "learning_rate": 1.5381137323425455e-05, + "loss": 0.3173, + "step": 17190 + }, + { + "epoch": 0.31917066125061727, + "grad_norm": 0.3245623707771301, + "learning_rate": 1.5380154084220357e-05, + "loss": 0.2919, + "step": 17192 + }, + { + "epoch": 0.3192077913880359, + "grad_norm": 0.4003051519393921, + "learning_rate": 1.5379170771807478e-05, + "loss": 0.4647, + "step": 17194 + }, + { + "epoch": 0.3192449215254546, + "grad_norm": 0.3576517701148987, + "learning_rate": 1.5378187386200196e-05, + "loss": 0.1311, + "step": 17196 + }, + { + "epoch": 0.3192820516628732, + "grad_norm": 0.32612344622612, + "learning_rate": 1.5377203927411895e-05, + "loss": 0.3704, + "step": 17198 + }, + { + "epoch": 0.31931918180029184, + "grad_norm": 0.21062368154525757, + "learning_rate": 1.537622039545595e-05, + "loss": 0.3815, + "step": 17200 + }, + { + "epoch": 0.31935631193771047, + "grad_norm": 0.4796724021434784, + "learning_rate": 1.5375236790345755e-05, + "loss": 0.6317, + "step": 17202 + }, + { + "epoch": 0.3193934420751291, + "grad_norm": 0.3687177896499634, + "learning_rate": 1.5374253112094688e-05, + "loss": 0.1888, + "step": 17204 + }, + { + "epoch": 0.3194305722125478, + "grad_norm": 0.3645276129245758, + "learning_rate": 1.537326936071613e-05, + "loss": 0.1396, + "step": 17206 + }, + { + "epoch": 0.3194677023499664, + "grad_norm": 0.34646156430244446, + "learning_rate": 1.537228553622347e-05, + "loss": 0.3101, + "step": 17208 + }, + { + "epoch": 0.31950483248738504, + "grad_norm": 0.3313872218132019, + "learning_rate": 1.53713016386301e-05, + "loss": 0.1897, + "step": 17210 + }, + { + "epoch": 0.31954196262480367, + "grad_norm": 0.5552208423614502, + "learning_rate": 1.53703176679494e-05, + "loss": 0.4732, + "step": 17212 + }, + { + "epoch": 0.3195790927622223, + "grad_norm": 0.3241727948188782, + "learning_rate": 1.5369333624194766e-05, + "loss": 0.1679, + "step": 17214 + }, + { + "epoch": 0.319616222899641, + "grad_norm": 0.3199171721935272, + "learning_rate": 1.5368349507379583e-05, + "loss": 0.2744, + "step": 17216 + }, + { + "epoch": 0.3196533530370596, + "grad_norm": 0.28099092841148376, + "learning_rate": 1.536736531751724e-05, + "loss": 0.258, + "step": 17218 + }, + { + "epoch": 0.31969048317447823, + "grad_norm": 0.2466171681880951, + "learning_rate": 1.5366381054621137e-05, + "loss": 0.3925, + "step": 17220 + }, + { + "epoch": 0.31972761331189686, + "grad_norm": 0.33610379695892334, + "learning_rate": 1.536539671870466e-05, + "loss": 0.3924, + "step": 17222 + }, + { + "epoch": 0.3197647434493155, + "grad_norm": 0.47139471769332886, + "learning_rate": 1.5364412309781204e-05, + "loss": 0.2039, + "step": 17224 + }, + { + "epoch": 0.3198018735867341, + "grad_norm": 0.3382294476032257, + "learning_rate": 1.5363427827864165e-05, + "loss": 0.3758, + "step": 17226 + }, + { + "epoch": 0.3198390037241528, + "grad_norm": 0.3508336544036865, + "learning_rate": 1.5362443272966943e-05, + "loss": 0.3285, + "step": 17228 + }, + { + "epoch": 0.31987613386157143, + "grad_norm": 0.5476653575897217, + "learning_rate": 1.5361458645102926e-05, + "loss": 0.2605, + "step": 17230 + }, + { + "epoch": 0.31991326399899006, + "grad_norm": 0.31968954205513, + "learning_rate": 1.5360473944285515e-05, + "loss": 0.2254, + "step": 17232 + }, + { + "epoch": 0.3199503941364087, + "grad_norm": 0.8002970814704895, + "learning_rate": 1.5359489170528113e-05, + "loss": 0.3504, + "step": 17234 + }, + { + "epoch": 0.3199875242738273, + "grad_norm": 0.4641616642475128, + "learning_rate": 1.5358504323844118e-05, + "loss": 0.4218, + "step": 17236 + }, + { + "epoch": 0.320024654411246, + "grad_norm": 0.20135943591594696, + "learning_rate": 1.5357519404246923e-05, + "loss": 0.1754, + "step": 17238 + }, + { + "epoch": 0.32006178454866463, + "grad_norm": 0.41504454612731934, + "learning_rate": 1.5356534411749943e-05, + "loss": 0.2351, + "step": 17240 + }, + { + "epoch": 0.32009891468608326, + "grad_norm": 0.41960692405700684, + "learning_rate": 1.5355549346366575e-05, + "loss": 0.1294, + "step": 17242 + }, + { + "epoch": 0.3201360448235019, + "grad_norm": 0.26389211416244507, + "learning_rate": 1.5354564208110217e-05, + "loss": 0.2593, + "step": 17244 + }, + { + "epoch": 0.3201731749609205, + "grad_norm": 0.4203268587589264, + "learning_rate": 1.5353578996994284e-05, + "loss": 0.3053, + "step": 17246 + }, + { + "epoch": 0.32021030509833914, + "grad_norm": 0.36045747995376587, + "learning_rate": 1.5352593713032173e-05, + "loss": 0.3535, + "step": 17248 + }, + { + "epoch": 0.3202474352357578, + "grad_norm": 0.3000342845916748, + "learning_rate": 1.5351608356237296e-05, + "loss": 0.2659, + "step": 17250 + }, + { + "epoch": 0.32028456537317646, + "grad_norm": 0.36360421776771545, + "learning_rate": 1.535062292662306e-05, + "loss": 0.2136, + "step": 17252 + }, + { + "epoch": 0.3203216955105951, + "grad_norm": 0.33014458417892456, + "learning_rate": 1.534963742420287e-05, + "loss": 0.3287, + "step": 17254 + }, + { + "epoch": 0.3203588256480137, + "grad_norm": 0.39336514472961426, + "learning_rate": 1.534865184899014e-05, + "loss": 0.3898, + "step": 17256 + }, + { + "epoch": 0.32039595578543234, + "grad_norm": 0.46903207898139954, + "learning_rate": 1.5347666200998278e-05, + "loss": 0.4361, + "step": 17258 + }, + { + "epoch": 0.320433085922851, + "grad_norm": 0.300927996635437, + "learning_rate": 1.53466804802407e-05, + "loss": 0.304, + "step": 17260 + }, + { + "epoch": 0.32047021606026965, + "grad_norm": 0.4261765778064728, + "learning_rate": 1.5345694686730818e-05, + "loss": 0.2999, + "step": 17262 + }, + { + "epoch": 0.3205073461976883, + "grad_norm": 0.3545638620853424, + "learning_rate": 1.5344708820482036e-05, + "loss": 0.2746, + "step": 17264 + }, + { + "epoch": 0.3205444763351069, + "grad_norm": 0.2779943346977234, + "learning_rate": 1.5343722881507785e-05, + "loss": 0.1993, + "step": 17266 + }, + { + "epoch": 0.32058160647252554, + "grad_norm": 0.375210702419281, + "learning_rate": 1.5342736869821467e-05, + "loss": 0.2526, + "step": 17268 + }, + { + "epoch": 0.32061873660994417, + "grad_norm": 0.434173047542572, + "learning_rate": 1.53417507854365e-05, + "loss": 0.313, + "step": 17270 + }, + { + "epoch": 0.32065586674736285, + "grad_norm": 0.46127668023109436, + "learning_rate": 1.5340764628366312e-05, + "loss": 0.4468, + "step": 17272 + }, + { + "epoch": 0.3206929968847815, + "grad_norm": 0.49569782614707947, + "learning_rate": 1.533977839862431e-05, + "loss": 0.4376, + "step": 17274 + }, + { + "epoch": 0.3207301270222001, + "grad_norm": 0.4674670696258545, + "learning_rate": 1.5338792096223923e-05, + "loss": 0.2679, + "step": 17276 + }, + { + "epoch": 0.32076725715961873, + "grad_norm": 0.3406663239002228, + "learning_rate": 1.5337805721178564e-05, + "loss": 0.3752, + "step": 17278 + }, + { + "epoch": 0.32080438729703736, + "grad_norm": 0.35662025213241577, + "learning_rate": 1.5336819273501657e-05, + "loss": 0.3691, + "step": 17280 + }, + { + "epoch": 0.32084151743445605, + "grad_norm": 0.28006112575531006, + "learning_rate": 1.5335832753206627e-05, + "loss": 0.3284, + "step": 17282 + }, + { + "epoch": 0.3208786475718747, + "grad_norm": 0.2908593714237213, + "learning_rate": 1.5334846160306898e-05, + "loss": 0.4365, + "step": 17284 + }, + { + "epoch": 0.3209157777092933, + "grad_norm": 0.3357962369918823, + "learning_rate": 1.533385949481589e-05, + "loss": 0.3913, + "step": 17286 + }, + { + "epoch": 0.32095290784671193, + "grad_norm": 0.4044245183467865, + "learning_rate": 1.533287275674703e-05, + "loss": 0.2279, + "step": 17288 + }, + { + "epoch": 0.32099003798413056, + "grad_norm": 0.30236268043518066, + "learning_rate": 1.533188594611375e-05, + "loss": 0.3203, + "step": 17290 + }, + { + "epoch": 0.32102716812154924, + "grad_norm": 0.31960272789001465, + "learning_rate": 1.5330899062929466e-05, + "loss": 0.5171, + "step": 17292 + }, + { + "epoch": 0.3210642982589679, + "grad_norm": 0.41443485021591187, + "learning_rate": 1.532991210720762e-05, + "loss": 0.317, + "step": 17294 + }, + { + "epoch": 0.3211014283963865, + "grad_norm": 0.35447803139686584, + "learning_rate": 1.532892507896163e-05, + "loss": 0.2812, + "step": 17296 + }, + { + "epoch": 0.32113855853380513, + "grad_norm": 0.25095534324645996, + "learning_rate": 1.5327937978204934e-05, + "loss": 0.2661, + "step": 17298 + }, + { + "epoch": 0.32117568867122376, + "grad_norm": 0.4252188801765442, + "learning_rate": 1.5326950804950962e-05, + "loss": 0.3085, + "step": 17300 + }, + { + "epoch": 0.3212128188086424, + "grad_norm": 0.44707366824150085, + "learning_rate": 1.5325963559213145e-05, + "loss": 0.3979, + "step": 17302 + }, + { + "epoch": 0.32124994894606107, + "grad_norm": 0.48115578293800354, + "learning_rate": 1.5324976241004917e-05, + "loss": 0.0781, + "step": 17304 + }, + { + "epoch": 0.3212870790834797, + "grad_norm": 0.37249040603637695, + "learning_rate": 1.532398885033971e-05, + "loss": 0.2739, + "step": 17306 + }, + { + "epoch": 0.3213242092208983, + "grad_norm": 0.3208727240562439, + "learning_rate": 1.5323001387230966e-05, + "loss": 0.4704, + "step": 17308 + }, + { + "epoch": 0.32136133935831696, + "grad_norm": 0.5011731386184692, + "learning_rate": 1.5322013851692115e-05, + "loss": 0.1942, + "step": 17310 + }, + { + "epoch": 0.3213984694957356, + "grad_norm": 0.3763359785079956, + "learning_rate": 1.53210262437366e-05, + "loss": 0.2026, + "step": 17312 + }, + { + "epoch": 0.32143559963315427, + "grad_norm": 0.22363798320293427, + "learning_rate": 1.5320038563377852e-05, + "loss": 0.2756, + "step": 17314 + }, + { + "epoch": 0.3214727297705729, + "grad_norm": 0.48202216625213623, + "learning_rate": 1.5319050810629318e-05, + "loss": 0.3317, + "step": 17316 + }, + { + "epoch": 0.3215098599079915, + "grad_norm": 0.6128024458885193, + "learning_rate": 1.531806298550443e-05, + "loss": 0.474, + "step": 17318 + }, + { + "epoch": 0.32154699004541015, + "grad_norm": 0.42750561237335205, + "learning_rate": 1.531707508801664e-05, + "loss": 0.1523, + "step": 17320 + }, + { + "epoch": 0.3215841201828288, + "grad_norm": 0.2953563928604126, + "learning_rate": 1.5316087118179384e-05, + "loss": 0.2984, + "step": 17322 + }, + { + "epoch": 0.3216212503202474, + "grad_norm": 0.252926766872406, + "learning_rate": 1.5315099076006102e-05, + "loss": 0.2459, + "step": 17324 + }, + { + "epoch": 0.3216583804576661, + "grad_norm": 0.45459315180778503, + "learning_rate": 1.5314110961510243e-05, + "loss": 0.367, + "step": 17326 + }, + { + "epoch": 0.3216955105950847, + "grad_norm": 0.42746007442474365, + "learning_rate": 1.5313122774705252e-05, + "loss": 0.4985, + "step": 17328 + }, + { + "epoch": 0.32173264073250335, + "grad_norm": 0.46953216195106506, + "learning_rate": 1.5312134515604576e-05, + "loss": 0.3558, + "step": 17330 + }, + { + "epoch": 0.321769770869922, + "grad_norm": 0.5032084584236145, + "learning_rate": 1.5311146184221662e-05, + "loss": 0.4311, + "step": 17332 + }, + { + "epoch": 0.3218069010073406, + "grad_norm": 0.23475800454616547, + "learning_rate": 1.5310157780569955e-05, + "loss": 0.227, + "step": 17334 + }, + { + "epoch": 0.3218440311447593, + "grad_norm": 0.39614832401275635, + "learning_rate": 1.5309169304662902e-05, + "loss": 0.3198, + "step": 17336 + }, + { + "epoch": 0.3218811612821779, + "grad_norm": 0.5124629139900208, + "learning_rate": 1.5308180756513964e-05, + "loss": 0.4336, + "step": 17338 + }, + { + "epoch": 0.32191829141959655, + "grad_norm": 0.44774746894836426, + "learning_rate": 1.5307192136136585e-05, + "loss": 0.2818, + "step": 17340 + }, + { + "epoch": 0.3219554215570152, + "grad_norm": 0.3130606412887573, + "learning_rate": 1.5306203443544216e-05, + "loss": 0.1607, + "step": 17342 + }, + { + "epoch": 0.3219925516944338, + "grad_norm": 0.29858627915382385, + "learning_rate": 1.530521467875031e-05, + "loss": 0.4292, + "step": 17344 + }, + { + "epoch": 0.32202968183185243, + "grad_norm": 0.4775102734565735, + "learning_rate": 1.530422584176833e-05, + "loss": 0.3906, + "step": 17346 + }, + { + "epoch": 0.3220668119692711, + "grad_norm": 0.3680359125137329, + "learning_rate": 1.530323693261172e-05, + "loss": 0.3493, + "step": 17348 + }, + { + "epoch": 0.32210394210668974, + "grad_norm": 0.3718049228191376, + "learning_rate": 1.5302247951293937e-05, + "loss": 0.3705, + "step": 17350 + }, + { + "epoch": 0.3221410722441084, + "grad_norm": 0.4301467835903168, + "learning_rate": 1.530125889782845e-05, + "loss": 0.3691, + "step": 17352 + }, + { + "epoch": 0.322178202381527, + "grad_norm": 0.4566783905029297, + "learning_rate": 1.53002697722287e-05, + "loss": 0.2242, + "step": 17354 + }, + { + "epoch": 0.32221533251894563, + "grad_norm": 0.40902405977249146, + "learning_rate": 1.5299280574508156e-05, + "loss": 0.3981, + "step": 17356 + }, + { + "epoch": 0.3222524626563643, + "grad_norm": 0.30622631311416626, + "learning_rate": 1.529829130468028e-05, + "loss": 0.3508, + "step": 17358 + }, + { + "epoch": 0.32228959279378294, + "grad_norm": 0.4049510955810547, + "learning_rate": 1.529730196275853e-05, + "loss": 0.3525, + "step": 17360 + }, + { + "epoch": 0.32232672293120157, + "grad_norm": 0.3212869465351105, + "learning_rate": 1.5296312548756364e-05, + "loss": 0.317, + "step": 17362 + }, + { + "epoch": 0.3223638530686202, + "grad_norm": 0.41256698966026306, + "learning_rate": 1.5295323062687254e-05, + "loss": 0.4926, + "step": 17364 + }, + { + "epoch": 0.3224009832060388, + "grad_norm": 0.38584163784980774, + "learning_rate": 1.5294333504564652e-05, + "loss": 0.4525, + "step": 17366 + }, + { + "epoch": 0.3224381133434575, + "grad_norm": 0.4642098844051361, + "learning_rate": 1.529334387440203e-05, + "loss": 0.2633, + "step": 17368 + }, + { + "epoch": 0.32247524348087614, + "grad_norm": 0.5035676956176758, + "learning_rate": 1.5292354172212854e-05, + "loss": 0.215, + "step": 17370 + }, + { + "epoch": 0.32251237361829477, + "grad_norm": 0.36337435245513916, + "learning_rate": 1.5291364398010596e-05, + "loss": 0.2318, + "step": 17372 + }, + { + "epoch": 0.3225495037557134, + "grad_norm": 0.3551802635192871, + "learning_rate": 1.5290374551808715e-05, + "loss": 0.2171, + "step": 17374 + }, + { + "epoch": 0.322586633893132, + "grad_norm": 0.30073145031929016, + "learning_rate": 1.5289384633620678e-05, + "loss": 0.2128, + "step": 17376 + }, + { + "epoch": 0.32262376403055065, + "grad_norm": 0.23783640563488007, + "learning_rate": 1.5288394643459964e-05, + "loss": 0.3694, + "step": 17378 + }, + { + "epoch": 0.32266089416796934, + "grad_norm": 0.27272284030914307, + "learning_rate": 1.5287404581340036e-05, + "loss": 0.5207, + "step": 17380 + }, + { + "epoch": 0.32269802430538796, + "grad_norm": 0.4207136034965515, + "learning_rate": 1.528641444727437e-05, + "loss": 0.4435, + "step": 17382 + }, + { + "epoch": 0.3227351544428066, + "grad_norm": 0.37411409616470337, + "learning_rate": 1.5285424241276444e-05, + "loss": 0.2533, + "step": 17384 + }, + { + "epoch": 0.3227722845802252, + "grad_norm": 0.41730114817619324, + "learning_rate": 1.528443396335972e-05, + "loss": 0.1833, + "step": 17386 + }, + { + "epoch": 0.32280941471764385, + "grad_norm": 0.2243090122938156, + "learning_rate": 1.528344361353768e-05, + "loss": 0.2127, + "step": 17388 + }, + { + "epoch": 0.32284654485506253, + "grad_norm": 0.3316304385662079, + "learning_rate": 1.5282453191823797e-05, + "loss": 0.3668, + "step": 17390 + }, + { + "epoch": 0.32288367499248116, + "grad_norm": 0.29832443594932556, + "learning_rate": 1.528146269823155e-05, + "loss": 0.372, + "step": 17392 + }, + { + "epoch": 0.3229208051298998, + "grad_norm": 0.44492384791374207, + "learning_rate": 1.5280472132774414e-05, + "loss": 0.3083, + "step": 17394 + }, + { + "epoch": 0.3229579352673184, + "grad_norm": 0.4695076048374176, + "learning_rate": 1.5279481495465874e-05, + "loss": 0.275, + "step": 17396 + }, + { + "epoch": 0.32299506540473705, + "grad_norm": 0.5234975814819336, + "learning_rate": 1.5278490786319403e-05, + "loss": 0.2488, + "step": 17398 + }, + { + "epoch": 0.3230321955421557, + "grad_norm": 0.2769046127796173, + "learning_rate": 1.527750000534848e-05, + "loss": 0.3021, + "step": 17400 + }, + { + "epoch": 0.32306932567957436, + "grad_norm": 0.3211325407028198, + "learning_rate": 1.527650915256659e-05, + "loss": 0.2848, + "step": 17402 + }, + { + "epoch": 0.323106455816993, + "grad_norm": 0.3746379315853119, + "learning_rate": 1.5275518227987218e-05, + "loss": 0.4673, + "step": 17404 + }, + { + "epoch": 0.3231435859544116, + "grad_norm": 0.5678956508636475, + "learning_rate": 1.5274527231623844e-05, + "loss": 0.2224, + "step": 17406 + }, + { + "epoch": 0.32318071609183024, + "grad_norm": 0.3615468740463257, + "learning_rate": 1.5273536163489953e-05, + "loss": 0.4327, + "step": 17408 + }, + { + "epoch": 0.3232178462292489, + "grad_norm": 0.37712574005126953, + "learning_rate": 1.5272545023599032e-05, + "loss": 0.3222, + "step": 17410 + }, + { + "epoch": 0.32325497636666756, + "grad_norm": 0.24174006283283234, + "learning_rate": 1.5271553811964566e-05, + "loss": 0.2971, + "step": 17412 + }, + { + "epoch": 0.3232921065040862, + "grad_norm": 0.38790497183799744, + "learning_rate": 1.527056252860004e-05, + "loss": 0.253, + "step": 17414 + }, + { + "epoch": 0.3233292366415048, + "grad_norm": 0.28069284558296204, + "learning_rate": 1.526957117351895e-05, + "loss": 0.4008, + "step": 17416 + }, + { + "epoch": 0.32336636677892344, + "grad_norm": 0.37146157026290894, + "learning_rate": 1.5268579746734777e-05, + "loss": 0.373, + "step": 17418 + }, + { + "epoch": 0.32340349691634207, + "grad_norm": 0.4824257493019104, + "learning_rate": 1.5267588248261016e-05, + "loss": 0.3018, + "step": 17420 + }, + { + "epoch": 0.3234406270537607, + "grad_norm": 0.47460636496543884, + "learning_rate": 1.526659667811116e-05, + "loss": 0.303, + "step": 17422 + }, + { + "epoch": 0.3234777571911794, + "grad_norm": 0.30944719910621643, + "learning_rate": 1.526560503629869e-05, + "loss": 0.2781, + "step": 17424 + }, + { + "epoch": 0.323514887328598, + "grad_norm": 0.29679232835769653, + "learning_rate": 1.5264613322837116e-05, + "loss": 0.2733, + "step": 17426 + }, + { + "epoch": 0.32355201746601664, + "grad_norm": 0.3553576171398163, + "learning_rate": 1.5263621537739922e-05, + "loss": 0.4171, + "step": 17428 + }, + { + "epoch": 0.32358914760343527, + "grad_norm": 0.5936649441719055, + "learning_rate": 1.5262629681020603e-05, + "loss": 0.4375, + "step": 17430 + }, + { + "epoch": 0.3236262777408539, + "grad_norm": 0.28599485754966736, + "learning_rate": 1.5261637752692658e-05, + "loss": 0.3791, + "step": 17432 + }, + { + "epoch": 0.3236634078782726, + "grad_norm": 0.3888452351093292, + "learning_rate": 1.5260645752769585e-05, + "loss": 0.4027, + "step": 17434 + }, + { + "epoch": 0.3237005380156912, + "grad_norm": 0.4386485815048218, + "learning_rate": 1.525965368126488e-05, + "loss": 0.4206, + "step": 17436 + }, + { + "epoch": 0.32373766815310984, + "grad_norm": 0.5147672295570374, + "learning_rate": 1.5258661538192043e-05, + "loss": 0.2118, + "step": 17438 + }, + { + "epoch": 0.32377479829052846, + "grad_norm": 0.3790130019187927, + "learning_rate": 1.5257669323564575e-05, + "loss": 0.3404, + "step": 17440 + }, + { + "epoch": 0.3238119284279471, + "grad_norm": 0.3347530961036682, + "learning_rate": 1.5256677037395977e-05, + "loss": 0.35, + "step": 17442 + }, + { + "epoch": 0.3238490585653658, + "grad_norm": 0.5212411284446716, + "learning_rate": 1.5255684679699747e-05, + "loss": 0.4494, + "step": 17444 + }, + { + "epoch": 0.3238861887027844, + "grad_norm": 0.3576100468635559, + "learning_rate": 1.525469225048939e-05, + "loss": 0.2731, + "step": 17446 + }, + { + "epoch": 0.32392331884020303, + "grad_norm": 0.33504900336265564, + "learning_rate": 1.5253699749778417e-05, + "loss": 0.2819, + "step": 17448 + }, + { + "epoch": 0.32396044897762166, + "grad_norm": 0.3056875467300415, + "learning_rate": 1.5252707177580322e-05, + "loss": 0.4526, + "step": 17450 + }, + { + "epoch": 0.3239975791150403, + "grad_norm": 0.5367453694343567, + "learning_rate": 1.5251714533908617e-05, + "loss": 0.1851, + "step": 17452 + }, + { + "epoch": 0.3240347092524589, + "grad_norm": 0.41884738206863403, + "learning_rate": 1.5250721818776811e-05, + "loss": 0.3973, + "step": 17454 + }, + { + "epoch": 0.3240718393898776, + "grad_norm": 0.3681398034095764, + "learning_rate": 1.5249729032198404e-05, + "loss": 0.2082, + "step": 17456 + }, + { + "epoch": 0.32410896952729623, + "grad_norm": 0.41263511776924133, + "learning_rate": 1.5248736174186913e-05, + "loss": 0.294, + "step": 17458 + }, + { + "epoch": 0.32414609966471486, + "grad_norm": 0.366334468126297, + "learning_rate": 1.5247743244755846e-05, + "loss": 0.5154, + "step": 17460 + }, + { + "epoch": 0.3241832298021335, + "grad_norm": 0.5292330384254456, + "learning_rate": 1.5246750243918707e-05, + "loss": 0.2717, + "step": 17462 + }, + { + "epoch": 0.3242203599395521, + "grad_norm": 0.35583099722862244, + "learning_rate": 1.5245757171689016e-05, + "loss": 0.4378, + "step": 17464 + }, + { + "epoch": 0.3242574900769708, + "grad_norm": 0.28098538517951965, + "learning_rate": 1.5244764028080279e-05, + "loss": 0.3424, + "step": 17466 + }, + { + "epoch": 0.32429462021438943, + "grad_norm": 0.49299970269203186, + "learning_rate": 1.5243770813106018e-05, + "loss": 0.4128, + "step": 17468 + }, + { + "epoch": 0.32433175035180806, + "grad_norm": 0.3476695418357849, + "learning_rate": 1.5242777526779744e-05, + "loss": 0.2381, + "step": 17470 + }, + { + "epoch": 0.3243688804892267, + "grad_norm": 0.4195510745048523, + "learning_rate": 1.5241784169114968e-05, + "loss": 0.3695, + "step": 17472 + }, + { + "epoch": 0.3244060106266453, + "grad_norm": 0.26670563220977783, + "learning_rate": 1.5240790740125215e-05, + "loss": 0.3813, + "step": 17474 + }, + { + "epoch": 0.32444314076406394, + "grad_norm": 0.2688024938106537, + "learning_rate": 1.5239797239823992e-05, + "loss": 0.2917, + "step": 17476 + }, + { + "epoch": 0.3244802709014826, + "grad_norm": 0.29855844378471375, + "learning_rate": 1.5238803668224826e-05, + "loss": 0.1792, + "step": 17478 + }, + { + "epoch": 0.32451740103890125, + "grad_norm": 0.4138330817222595, + "learning_rate": 1.5237810025341238e-05, + "loss": 0.3354, + "step": 17480 + }, + { + "epoch": 0.3245545311763199, + "grad_norm": 0.3219321072101593, + "learning_rate": 1.5236816311186742e-05, + "loss": 0.6215, + "step": 17482 + }, + { + "epoch": 0.3245916613137385, + "grad_norm": 0.31449997425079346, + "learning_rate": 1.5235822525774859e-05, + "loss": 0.264, + "step": 17484 + }, + { + "epoch": 0.32462879145115714, + "grad_norm": 0.29745444655418396, + "learning_rate": 1.5234828669119116e-05, + "loss": 0.3373, + "step": 17486 + }, + { + "epoch": 0.3246659215885758, + "grad_norm": 0.2264559417963028, + "learning_rate": 1.5233834741233038e-05, + "loss": 0.2489, + "step": 17488 + }, + { + "epoch": 0.32470305172599445, + "grad_norm": 0.43536579608917236, + "learning_rate": 1.5232840742130143e-05, + "loss": 0.1644, + "step": 17490 + }, + { + "epoch": 0.3247401818634131, + "grad_norm": 0.18491986393928528, + "learning_rate": 1.5231846671823965e-05, + "loss": 0.3477, + "step": 17492 + }, + { + "epoch": 0.3247773120008317, + "grad_norm": 0.39093300700187683, + "learning_rate": 1.523085253032802e-05, + "loss": 0.4341, + "step": 17494 + }, + { + "epoch": 0.32481444213825034, + "grad_norm": 0.37062758207321167, + "learning_rate": 1.5229858317655839e-05, + "loss": 0.2965, + "step": 17496 + }, + { + "epoch": 0.32485157227566896, + "grad_norm": 0.2674814760684967, + "learning_rate": 1.5228864033820956e-05, + "loss": 0.2219, + "step": 17498 + }, + { + "epoch": 0.32488870241308765, + "grad_norm": 0.3341875970363617, + "learning_rate": 1.5227869678836892e-05, + "loss": 0.2747, + "step": 17500 + }, + { + "epoch": 0.3249258325505063, + "grad_norm": 0.2975541651248932, + "learning_rate": 1.5226875252717184e-05, + "loss": 0.4577, + "step": 17502 + }, + { + "epoch": 0.3249629626879249, + "grad_norm": 0.9835726618766785, + "learning_rate": 1.522588075547536e-05, + "loss": 0.2427, + "step": 17504 + }, + { + "epoch": 0.32500009282534353, + "grad_norm": 0.39639636874198914, + "learning_rate": 1.5224886187124947e-05, + "loss": 0.2435, + "step": 17506 + }, + { + "epoch": 0.32503722296276216, + "grad_norm": 0.2899585962295532, + "learning_rate": 1.5223891547679488e-05, + "loss": 0.2769, + "step": 17508 + }, + { + "epoch": 0.32507435310018085, + "grad_norm": 0.25025486946105957, + "learning_rate": 1.522289683715251e-05, + "loss": 0.2264, + "step": 17510 + }, + { + "epoch": 0.3251114832375995, + "grad_norm": 0.662032425403595, + "learning_rate": 1.5221902055557556e-05, + "loss": 0.1958, + "step": 17512 + }, + { + "epoch": 0.3251486133750181, + "grad_norm": 0.5503290295600891, + "learning_rate": 1.5220907202908151e-05, + "loss": 0.1985, + "step": 17514 + }, + { + "epoch": 0.32518574351243673, + "grad_norm": 0.38150089979171753, + "learning_rate": 1.5219912279217839e-05, + "loss": 0.4144, + "step": 17516 + }, + { + "epoch": 0.32522287364985536, + "grad_norm": 0.4446289837360382, + "learning_rate": 1.5218917284500157e-05, + "loss": 0.2535, + "step": 17518 + }, + { + "epoch": 0.32526000378727404, + "grad_norm": 0.5262390375137329, + "learning_rate": 1.5217922218768641e-05, + "loss": 0.3208, + "step": 17520 + }, + { + "epoch": 0.32529713392469267, + "grad_norm": 0.4500311613082886, + "learning_rate": 1.5216927082036837e-05, + "loss": 0.2856, + "step": 17522 + }, + { + "epoch": 0.3253342640621113, + "grad_norm": 0.213739812374115, + "learning_rate": 1.521593187431828e-05, + "loss": 0.2688, + "step": 17524 + }, + { + "epoch": 0.32537139419952993, + "grad_norm": 0.38909661769866943, + "learning_rate": 1.5214936595626514e-05, + "loss": 0.429, + "step": 17526 + }, + { + "epoch": 0.32540852433694856, + "grad_norm": 0.2122373729944229, + "learning_rate": 1.5213941245975077e-05, + "loss": 0.2699, + "step": 17528 + }, + { + "epoch": 0.3254456544743672, + "grad_norm": 0.2740071713924408, + "learning_rate": 1.5212945825377521e-05, + "loss": 0.3057, + "step": 17530 + }, + { + "epoch": 0.32548278461178587, + "grad_norm": 0.2417953759431839, + "learning_rate": 1.5211950333847387e-05, + "loss": 0.1686, + "step": 17532 + }, + { + "epoch": 0.3255199147492045, + "grad_norm": 0.46877309679985046, + "learning_rate": 1.521095477139822e-05, + "loss": 0.3598, + "step": 17534 + }, + { + "epoch": 0.3255570448866231, + "grad_norm": 0.3852144777774811, + "learning_rate": 1.5209959138043568e-05, + "loss": 0.3551, + "step": 17536 + }, + { + "epoch": 0.32559417502404175, + "grad_norm": 0.27003175020217896, + "learning_rate": 1.5208963433796977e-05, + "loss": 0.171, + "step": 17538 + }, + { + "epoch": 0.3256313051614604, + "grad_norm": 0.373787522315979, + "learning_rate": 1.5207967658671995e-05, + "loss": 0.3101, + "step": 17540 + }, + { + "epoch": 0.32566843529887907, + "grad_norm": 0.48493140935897827, + "learning_rate": 1.5206971812682176e-05, + "loss": 0.2358, + "step": 17542 + }, + { + "epoch": 0.3257055654362977, + "grad_norm": 0.3519165515899658, + "learning_rate": 1.5205975895841067e-05, + "loss": 0.2814, + "step": 17544 + }, + { + "epoch": 0.3257426955737163, + "grad_norm": 0.3446064889431, + "learning_rate": 1.520497990816222e-05, + "loss": 0.1971, + "step": 17546 + }, + { + "epoch": 0.32577982571113495, + "grad_norm": 0.43002253770828247, + "learning_rate": 1.5203983849659184e-05, + "loss": 0.2282, + "step": 17548 + }, + { + "epoch": 0.3258169558485536, + "grad_norm": 0.3817003667354584, + "learning_rate": 1.5202987720345517e-05, + "loss": 0.2598, + "step": 17550 + }, + { + "epoch": 0.3258540859859722, + "grad_norm": 0.7365832924842834, + "learning_rate": 1.5201991520234771e-05, + "loss": 0.2996, + "step": 17552 + }, + { + "epoch": 0.3258912161233909, + "grad_norm": 0.3756067454814911, + "learning_rate": 1.5200995249340505e-05, + "loss": 0.4609, + "step": 17554 + }, + { + "epoch": 0.3259283462608095, + "grad_norm": 0.21131837368011475, + "learning_rate": 1.5199998907676272e-05, + "loss": 0.3541, + "step": 17556 + }, + { + "epoch": 0.32596547639822815, + "grad_norm": 0.36641743779182434, + "learning_rate": 1.5199002495255626e-05, + "loss": 0.4212, + "step": 17558 + }, + { + "epoch": 0.3260026065356468, + "grad_norm": 0.2675943076610565, + "learning_rate": 1.519800601209213e-05, + "loss": 0.4573, + "step": 17560 + }, + { + "epoch": 0.3260397366730654, + "grad_norm": 0.36896276473999023, + "learning_rate": 1.5197009458199344e-05, + "loss": 0.3257, + "step": 17562 + }, + { + "epoch": 0.3260768668104841, + "grad_norm": 0.3580615818500519, + "learning_rate": 1.5196012833590825e-05, + "loss": 0.3963, + "step": 17564 + }, + { + "epoch": 0.3261139969479027, + "grad_norm": 0.45092225074768066, + "learning_rate": 1.5195016138280136e-05, + "loss": 0.3073, + "step": 17566 + }, + { + "epoch": 0.32615112708532135, + "grad_norm": 0.3870398700237274, + "learning_rate": 1.5194019372280839e-05, + "loss": 0.4073, + "step": 17568 + }, + { + "epoch": 0.32618825722274, + "grad_norm": 0.32575640082359314, + "learning_rate": 1.5193022535606494e-05, + "loss": 0.1979, + "step": 17570 + }, + { + "epoch": 0.3262253873601586, + "grad_norm": 0.33010345697402954, + "learning_rate": 1.5192025628270667e-05, + "loss": 0.3164, + "step": 17572 + }, + { + "epoch": 0.32626251749757723, + "grad_norm": 0.43834686279296875, + "learning_rate": 1.5191028650286925e-05, + "loss": 0.4459, + "step": 17574 + }, + { + "epoch": 0.3262996476349959, + "grad_norm": 0.347760945558548, + "learning_rate": 1.5190031601668832e-05, + "loss": 0.2396, + "step": 17576 + }, + { + "epoch": 0.32633677777241454, + "grad_norm": 0.41803792119026184, + "learning_rate": 1.5189034482429954e-05, + "loss": 0.2408, + "step": 17578 + }, + { + "epoch": 0.32637390790983317, + "grad_norm": 0.27051258087158203, + "learning_rate": 1.5188037292583861e-05, + "loss": 0.2564, + "step": 17580 + }, + { + "epoch": 0.3264110380472518, + "grad_norm": 0.3627765476703644, + "learning_rate": 1.5187040032144121e-05, + "loss": 0.4411, + "step": 17582 + }, + { + "epoch": 0.32644816818467043, + "grad_norm": 0.36485153436660767, + "learning_rate": 1.5186042701124297e-05, + "loss": 0.1505, + "step": 17584 + }, + { + "epoch": 0.3264852983220891, + "grad_norm": 0.3833666741847992, + "learning_rate": 1.5185045299537973e-05, + "loss": 0.5058, + "step": 17586 + }, + { + "epoch": 0.32652242845950774, + "grad_norm": 0.32322031259536743, + "learning_rate": 1.5184047827398711e-05, + "loss": 0.3719, + "step": 17588 + }, + { + "epoch": 0.32655955859692637, + "grad_norm": 0.4045676589012146, + "learning_rate": 1.5183050284720085e-05, + "loss": 0.3108, + "step": 17590 + }, + { + "epoch": 0.326596688734345, + "grad_norm": 0.3353482186794281, + "learning_rate": 1.5182052671515668e-05, + "loss": 0.3088, + "step": 17592 + }, + { + "epoch": 0.3266338188717636, + "grad_norm": 0.36045610904693604, + "learning_rate": 1.518105498779904e-05, + "loss": 0.3742, + "step": 17594 + }, + { + "epoch": 0.3266709490091823, + "grad_norm": 0.32726573944091797, + "learning_rate": 1.5180057233583773e-05, + "loss": 0.2729, + "step": 17596 + }, + { + "epoch": 0.32670807914660094, + "grad_norm": 0.3097217381000519, + "learning_rate": 1.517905940888344e-05, + "loss": 0.2189, + "step": 17598 + }, + { + "epoch": 0.32674520928401957, + "grad_norm": 0.2941550314426422, + "learning_rate": 1.5178061513711626e-05, + "loss": 0.2108, + "step": 17600 + }, + { + "epoch": 0.3267823394214382, + "grad_norm": 0.2941129803657532, + "learning_rate": 1.51770635480819e-05, + "loss": 0.2544, + "step": 17602 + }, + { + "epoch": 0.3268194695588568, + "grad_norm": 0.3898106813430786, + "learning_rate": 1.5176065512007845e-05, + "loss": 0.3461, + "step": 17604 + }, + { + "epoch": 0.32685659969627545, + "grad_norm": 0.44670993089675903, + "learning_rate": 1.5175067405503049e-05, + "loss": 0.3665, + "step": 17606 + }, + { + "epoch": 0.32689372983369414, + "grad_norm": 0.4735059142112732, + "learning_rate": 1.5174069228581083e-05, + "loss": 0.1969, + "step": 17608 + }, + { + "epoch": 0.32693085997111276, + "grad_norm": 0.4374541938304901, + "learning_rate": 1.5173070981255533e-05, + "loss": 0.244, + "step": 17610 + }, + { + "epoch": 0.3269679901085314, + "grad_norm": 0.615386426448822, + "learning_rate": 1.5172072663539983e-05, + "loss": 0.2606, + "step": 17612 + }, + { + "epoch": 0.32700512024595, + "grad_norm": 0.5306576490402222, + "learning_rate": 1.5171074275448015e-05, + "loss": 0.2445, + "step": 17614 + }, + { + "epoch": 0.32704225038336865, + "grad_norm": 0.31964847445487976, + "learning_rate": 1.5170075816993212e-05, + "loss": 0.2725, + "step": 17616 + }, + { + "epoch": 0.32707938052078733, + "grad_norm": 0.3527660667896271, + "learning_rate": 1.5169077288189171e-05, + "loss": 0.3411, + "step": 17618 + }, + { + "epoch": 0.32711651065820596, + "grad_norm": 0.32894060015678406, + "learning_rate": 1.5168078689049467e-05, + "loss": 0.3256, + "step": 17620 + }, + { + "epoch": 0.3271536407956246, + "grad_norm": 0.4134563207626343, + "learning_rate": 1.5167080019587692e-05, + "loss": 0.2392, + "step": 17622 + }, + { + "epoch": 0.3271907709330432, + "grad_norm": 0.2852741479873657, + "learning_rate": 1.5166081279817438e-05, + "loss": 0.1862, + "step": 17624 + }, + { + "epoch": 0.32722790107046185, + "grad_norm": 0.35628023743629456, + "learning_rate": 1.5165082469752288e-05, + "loss": 0.4758, + "step": 17626 + }, + { + "epoch": 0.3272650312078805, + "grad_norm": 0.3674044609069824, + "learning_rate": 1.5164083589405838e-05, + "loss": 0.3559, + "step": 17628 + }, + { + "epoch": 0.32730216134529916, + "grad_norm": 0.30254948139190674, + "learning_rate": 1.5163084638791681e-05, + "loss": 0.1966, + "step": 17630 + }, + { + "epoch": 0.3273392914827178, + "grad_norm": 0.3422847092151642, + "learning_rate": 1.5162085617923406e-05, + "loss": 0.3276, + "step": 17632 + }, + { + "epoch": 0.3273764216201364, + "grad_norm": 0.29455673694610596, + "learning_rate": 1.5161086526814606e-05, + "loss": 0.2337, + "step": 17634 + }, + { + "epoch": 0.32741355175755504, + "grad_norm": 0.3275715708732605, + "learning_rate": 1.5160087365478877e-05, + "loss": 0.3023, + "step": 17636 + }, + { + "epoch": 0.32745068189497367, + "grad_norm": 0.3403056263923645, + "learning_rate": 1.5159088133929818e-05, + "loss": 0.2749, + "step": 17638 + }, + { + "epoch": 0.32748781203239236, + "grad_norm": 0.8026832342147827, + "learning_rate": 1.5158088832181021e-05, + "loss": 0.3454, + "step": 17640 + }, + { + "epoch": 0.327524942169811, + "grad_norm": 0.41045746207237244, + "learning_rate": 1.5157089460246086e-05, + "loss": 0.2972, + "step": 17642 + }, + { + "epoch": 0.3275620723072296, + "grad_norm": 0.373545378446579, + "learning_rate": 1.5156090018138612e-05, + "loss": 0.2673, + "step": 17644 + }, + { + "epoch": 0.32759920244464824, + "grad_norm": 0.42723414301872253, + "learning_rate": 1.5155090505872197e-05, + "loss": 0.2114, + "step": 17646 + }, + { + "epoch": 0.32763633258206687, + "grad_norm": 0.3335084617137909, + "learning_rate": 1.5154090923460437e-05, + "loss": 0.1781, + "step": 17648 + }, + { + "epoch": 0.3276734627194855, + "grad_norm": 0.3104463815689087, + "learning_rate": 1.5153091270916945e-05, + "loss": 0.4006, + "step": 17650 + }, + { + "epoch": 0.3277105928569042, + "grad_norm": 0.3768613338470459, + "learning_rate": 1.5152091548255311e-05, + "loss": 0.4186, + "step": 17652 + }, + { + "epoch": 0.3277477229943228, + "grad_norm": 0.312747061252594, + "learning_rate": 1.5151091755489145e-05, + "loss": 0.2124, + "step": 17654 + }, + { + "epoch": 0.32778485313174144, + "grad_norm": 0.33740243315696716, + "learning_rate": 1.515009189263205e-05, + "loss": 0.3759, + "step": 17656 + }, + { + "epoch": 0.32782198326916007, + "grad_norm": 0.33628979325294495, + "learning_rate": 1.514909195969763e-05, + "loss": 0.3303, + "step": 17658 + }, + { + "epoch": 0.3278591134065787, + "grad_norm": 0.30011647939682007, + "learning_rate": 1.5148091956699494e-05, + "loss": 0.1345, + "step": 17660 + }, + { + "epoch": 0.3278962435439974, + "grad_norm": 0.29070523381233215, + "learning_rate": 1.5147091883651243e-05, + "loss": 0.4343, + "step": 17662 + }, + { + "epoch": 0.327933373681416, + "grad_norm": 0.26595136523246765, + "learning_rate": 1.5146091740566489e-05, + "loss": 0.2523, + "step": 17664 + }, + { + "epoch": 0.32797050381883464, + "grad_norm": 0.42118337750434875, + "learning_rate": 1.5145091527458841e-05, + "loss": 0.3005, + "step": 17666 + }, + { + "epoch": 0.32800763395625326, + "grad_norm": 0.3476470112800598, + "learning_rate": 1.5144091244341912e-05, + "loss": 0.2996, + "step": 17668 + }, + { + "epoch": 0.3280447640936719, + "grad_norm": 0.35275891423225403, + "learning_rate": 1.5143090891229304e-05, + "loss": 0.3488, + "step": 17670 + }, + { + "epoch": 0.3280818942310906, + "grad_norm": 0.3063492476940155, + "learning_rate": 1.514209046813464e-05, + "loss": 0.1288, + "step": 17672 + }, + { + "epoch": 0.3281190243685092, + "grad_norm": 0.3863551914691925, + "learning_rate": 1.5141089975071523e-05, + "loss": 0.4002, + "step": 17674 + }, + { + "epoch": 0.32815615450592783, + "grad_norm": 0.320879727602005, + "learning_rate": 1.5140089412053574e-05, + "loss": 0.2943, + "step": 17676 + }, + { + "epoch": 0.32819328464334646, + "grad_norm": 0.2639245092868805, + "learning_rate": 1.5139088779094402e-05, + "loss": 0.3584, + "step": 17678 + }, + { + "epoch": 0.3282304147807651, + "grad_norm": 0.309308260679245, + "learning_rate": 1.5138088076207626e-05, + "loss": 0.1263, + "step": 17680 + }, + { + "epoch": 0.3282675449181837, + "grad_norm": 0.31055134534835815, + "learning_rate": 1.5137087303406862e-05, + "loss": 0.2806, + "step": 17682 + }, + { + "epoch": 0.3283046750556024, + "grad_norm": 0.36841925978660583, + "learning_rate": 1.5136086460705729e-05, + "loss": 0.3882, + "step": 17684 + }, + { + "epoch": 0.32834180519302103, + "grad_norm": 0.5002008080482483, + "learning_rate": 1.5135085548117837e-05, + "loss": 0.1106, + "step": 17686 + }, + { + "epoch": 0.32837893533043966, + "grad_norm": 0.8487111330032349, + "learning_rate": 1.513408456565682e-05, + "loss": 0.401, + "step": 17688 + }, + { + "epoch": 0.3284160654678583, + "grad_norm": 0.5213286876678467, + "learning_rate": 1.5133083513336284e-05, + "loss": 0.195, + "step": 17690 + }, + { + "epoch": 0.3284531956052769, + "grad_norm": 0.4582628011703491, + "learning_rate": 1.513208239116986e-05, + "loss": 0.3187, + "step": 17692 + }, + { + "epoch": 0.3284903257426956, + "grad_norm": 0.3811066746711731, + "learning_rate": 1.5131081199171168e-05, + "loss": 0.2224, + "step": 17694 + }, + { + "epoch": 0.3285274558801142, + "grad_norm": 0.3852074444293976, + "learning_rate": 1.5130079937353827e-05, + "loss": 0.3576, + "step": 17696 + }, + { + "epoch": 0.32856458601753286, + "grad_norm": 0.4354002773761749, + "learning_rate": 1.5129078605731464e-05, + "loss": 0.3119, + "step": 17698 + }, + { + "epoch": 0.3286017161549515, + "grad_norm": 0.38786378502845764, + "learning_rate": 1.5128077204317708e-05, + "loss": 0.3141, + "step": 17700 + }, + { + "epoch": 0.3286388462923701, + "grad_norm": 0.39947086572647095, + "learning_rate": 1.5127075733126182e-05, + "loss": 0.3908, + "step": 17702 + }, + { + "epoch": 0.32867597642978874, + "grad_norm": 0.682485044002533, + "learning_rate": 1.512607419217051e-05, + "loss": 0.392, + "step": 17704 + }, + { + "epoch": 0.3287131065672074, + "grad_norm": 0.4146837890148163, + "learning_rate": 1.5125072581464325e-05, + "loss": 0.1713, + "step": 17706 + }, + { + "epoch": 0.32875023670462605, + "grad_norm": 0.3243124186992645, + "learning_rate": 1.5124070901021251e-05, + "loss": 0.379, + "step": 17708 + }, + { + "epoch": 0.3287873668420447, + "grad_norm": 0.5397246479988098, + "learning_rate": 1.512306915085492e-05, + "loss": 0.3793, + "step": 17710 + }, + { + "epoch": 0.3288244969794633, + "grad_norm": 0.8581190705299377, + "learning_rate": 1.5122067330978966e-05, + "loss": 0.2368, + "step": 17712 + }, + { + "epoch": 0.32886162711688194, + "grad_norm": 0.5463131070137024, + "learning_rate": 1.5121065441407017e-05, + "loss": 0.1634, + "step": 17714 + }, + { + "epoch": 0.3288987572543006, + "grad_norm": 0.313841313123703, + "learning_rate": 1.5120063482152707e-05, + "loss": 0.3584, + "step": 17716 + }, + { + "epoch": 0.32893588739171925, + "grad_norm": 0.3885699510574341, + "learning_rate": 1.5119061453229668e-05, + "loss": 0.407, + "step": 17718 + }, + { + "epoch": 0.3289730175291379, + "grad_norm": 0.38246700167655945, + "learning_rate": 1.5118059354651538e-05, + "loss": 0.2082, + "step": 17720 + }, + { + "epoch": 0.3290101476665565, + "grad_norm": 0.318854421377182, + "learning_rate": 1.5117057186431949e-05, + "loss": 0.3448, + "step": 17722 + }, + { + "epoch": 0.32904727780397514, + "grad_norm": 0.33163121342658997, + "learning_rate": 1.511605494858454e-05, + "loss": 0.4712, + "step": 17724 + }, + { + "epoch": 0.32908440794139376, + "grad_norm": 0.5115985870361328, + "learning_rate": 1.5115052641122947e-05, + "loss": 0.3477, + "step": 17726 + }, + { + "epoch": 0.32912153807881245, + "grad_norm": 0.3364209830760956, + "learning_rate": 1.5114050264060808e-05, + "loss": 0.1625, + "step": 17728 + }, + { + "epoch": 0.3291586682162311, + "grad_norm": 0.4749687612056732, + "learning_rate": 1.5113047817411764e-05, + "loss": 0.2879, + "step": 17730 + }, + { + "epoch": 0.3291957983536497, + "grad_norm": 0.449405699968338, + "learning_rate": 1.5112045301189454e-05, + "loss": 0.1957, + "step": 17732 + }, + { + "epoch": 0.32923292849106833, + "grad_norm": 0.3973706364631653, + "learning_rate": 1.5111042715407522e-05, + "loss": 0.2427, + "step": 17734 + }, + { + "epoch": 0.32927005862848696, + "grad_norm": 0.42291906476020813, + "learning_rate": 1.5110040060079607e-05, + "loss": 0.4602, + "step": 17736 + }, + { + "epoch": 0.32930718876590565, + "grad_norm": 0.31715869903564453, + "learning_rate": 1.5109037335219352e-05, + "loss": 0.2928, + "step": 17738 + }, + { + "epoch": 0.3293443189033243, + "grad_norm": 0.39417240023612976, + "learning_rate": 1.5108034540840404e-05, + "loss": 0.5245, + "step": 17740 + }, + { + "epoch": 0.3293814490407429, + "grad_norm": 0.22487516701221466, + "learning_rate": 1.5107031676956403e-05, + "loss": 0.1961, + "step": 17742 + }, + { + "epoch": 0.32941857917816153, + "grad_norm": 0.24678479135036469, + "learning_rate": 1.5106028743581e-05, + "loss": 0.2691, + "step": 17744 + }, + { + "epoch": 0.32945570931558016, + "grad_norm": 0.35239556431770325, + "learning_rate": 1.5105025740727843e-05, + "loss": 0.2633, + "step": 17746 + }, + { + "epoch": 0.32949283945299884, + "grad_norm": 0.5946488976478577, + "learning_rate": 1.5104022668410571e-05, + "loss": 0.4743, + "step": 17748 + }, + { + "epoch": 0.32952996959041747, + "grad_norm": 0.33725619316101074, + "learning_rate": 1.5103019526642846e-05, + "loss": 0.2039, + "step": 17750 + }, + { + "epoch": 0.3295670997278361, + "grad_norm": 0.3687255382537842, + "learning_rate": 1.5102016315438304e-05, + "loss": 0.2938, + "step": 17752 + }, + { + "epoch": 0.3296042298652547, + "grad_norm": 0.547210693359375, + "learning_rate": 1.5101013034810605e-05, + "loss": 0.2901, + "step": 17754 + }, + { + "epoch": 0.32964136000267336, + "grad_norm": 0.7228071093559265, + "learning_rate": 1.5100009684773397e-05, + "loss": 0.2829, + "step": 17756 + }, + { + "epoch": 0.329678490140092, + "grad_norm": 0.38952481746673584, + "learning_rate": 1.5099006265340337e-05, + "loss": 0.1534, + "step": 17758 + }, + { + "epoch": 0.32971562027751067, + "grad_norm": 0.6288354992866516, + "learning_rate": 1.5098002776525072e-05, + "loss": 0.2562, + "step": 17760 + }, + { + "epoch": 0.3297527504149293, + "grad_norm": 0.3054794669151306, + "learning_rate": 1.5096999218341257e-05, + "loss": 0.3383, + "step": 17762 + }, + { + "epoch": 0.3297898805523479, + "grad_norm": 0.582605242729187, + "learning_rate": 1.5095995590802554e-05, + "loss": 0.252, + "step": 17764 + }, + { + "epoch": 0.32982701068976655, + "grad_norm": 0.3173079788684845, + "learning_rate": 1.5094991893922615e-05, + "loss": 0.4706, + "step": 17766 + }, + { + "epoch": 0.3298641408271852, + "grad_norm": 0.40634965896606445, + "learning_rate": 1.5093988127715096e-05, + "loss": 0.2742, + "step": 17768 + }, + { + "epoch": 0.32990127096460387, + "grad_norm": 4.459052562713623, + "learning_rate": 1.5092984292193658e-05, + "loss": 0.2365, + "step": 17770 + }, + { + "epoch": 0.3299384011020225, + "grad_norm": 0.276470422744751, + "learning_rate": 1.5091980387371959e-05, + "loss": 0.259, + "step": 17772 + }, + { + "epoch": 0.3299755312394411, + "grad_norm": 0.3766598403453827, + "learning_rate": 1.5090976413263656e-05, + "loss": 0.1958, + "step": 17774 + }, + { + "epoch": 0.33001266137685975, + "grad_norm": 0.44429340958595276, + "learning_rate": 1.5089972369882418e-05, + "loss": 0.2262, + "step": 17776 + }, + { + "epoch": 0.3300497915142784, + "grad_norm": 0.44059309363365173, + "learning_rate": 1.50889682572419e-05, + "loss": 0.1919, + "step": 17778 + }, + { + "epoch": 0.330086921651697, + "grad_norm": 0.28233402967453003, + "learning_rate": 1.5087964075355768e-05, + "loss": 0.3583, + "step": 17780 + }, + { + "epoch": 0.3301240517891157, + "grad_norm": 0.36759957671165466, + "learning_rate": 1.5086959824237686e-05, + "loss": 0.2375, + "step": 17782 + }, + { + "epoch": 0.3301611819265343, + "grad_norm": 0.4023893475532532, + "learning_rate": 1.5085955503901314e-05, + "loss": 0.3377, + "step": 17784 + }, + { + "epoch": 0.33019831206395295, + "grad_norm": 0.37473300099372864, + "learning_rate": 1.5084951114360325e-05, + "loss": 0.3836, + "step": 17786 + }, + { + "epoch": 0.3302354422013716, + "grad_norm": 0.37250426411628723, + "learning_rate": 1.5083946655628382e-05, + "loss": 0.413, + "step": 17788 + }, + { + "epoch": 0.3302725723387902, + "grad_norm": 0.41275903582572937, + "learning_rate": 1.5082942127719156e-05, + "loss": 0.3508, + "step": 17790 + }, + { + "epoch": 0.3303097024762089, + "grad_norm": 0.43645885586738586, + "learning_rate": 1.5081937530646307e-05, + "loss": 0.2888, + "step": 17792 + }, + { + "epoch": 0.3303468326136275, + "grad_norm": 0.44883567094802856, + "learning_rate": 1.5080932864423514e-05, + "loss": 0.3066, + "step": 17794 + }, + { + "epoch": 0.33038396275104615, + "grad_norm": 0.44382384419441223, + "learning_rate": 1.5079928129064442e-05, + "loss": 0.5077, + "step": 17796 + }, + { + "epoch": 0.3304210928884648, + "grad_norm": 0.5066351294517517, + "learning_rate": 1.5078923324582766e-05, + "loss": 0.2077, + "step": 17798 + }, + { + "epoch": 0.3304582230258834, + "grad_norm": 0.4591412842273712, + "learning_rate": 1.5077918450992157e-05, + "loss": 0.2826, + "step": 17800 + }, + { + "epoch": 0.33049535316330203, + "grad_norm": 0.3614860773086548, + "learning_rate": 1.5076913508306288e-05, + "loss": 0.2676, + "step": 17802 + }, + { + "epoch": 0.3305324833007207, + "grad_norm": 0.34059938788414, + "learning_rate": 1.5075908496538831e-05, + "loss": 0.4131, + "step": 17804 + }, + { + "epoch": 0.33056961343813934, + "grad_norm": 0.4687926769256592, + "learning_rate": 1.5074903415703466e-05, + "loss": 0.1256, + "step": 17806 + }, + { + "epoch": 0.33060674357555797, + "grad_norm": 0.47998255491256714, + "learning_rate": 1.5073898265813865e-05, + "loss": 0.3518, + "step": 17808 + }, + { + "epoch": 0.3306438737129766, + "grad_norm": 0.5340453386306763, + "learning_rate": 1.5072893046883707e-05, + "loss": 0.2348, + "step": 17810 + }, + { + "epoch": 0.3306810038503952, + "grad_norm": 0.48077383637428284, + "learning_rate": 1.5071887758926669e-05, + "loss": 0.3362, + "step": 17812 + }, + { + "epoch": 0.3307181339878139, + "grad_norm": 0.37552666664123535, + "learning_rate": 1.5070882401956435e-05, + "loss": 0.2883, + "step": 17814 + }, + { + "epoch": 0.33075526412523254, + "grad_norm": 0.6973111629486084, + "learning_rate": 1.5069876975986675e-05, + "loss": 0.383, + "step": 17816 + }, + { + "epoch": 0.33079239426265117, + "grad_norm": 0.27123117446899414, + "learning_rate": 1.506887148103108e-05, + "loss": 0.3437, + "step": 17818 + }, + { + "epoch": 0.3308295244000698, + "grad_norm": 0.6203113794326782, + "learning_rate": 1.5067865917103324e-05, + "loss": 0.3698, + "step": 17820 + }, + { + "epoch": 0.3308666545374884, + "grad_norm": 0.3681880831718445, + "learning_rate": 1.5066860284217093e-05, + "loss": 0.377, + "step": 17822 + }, + { + "epoch": 0.3309037846749071, + "grad_norm": 0.35213109850883484, + "learning_rate": 1.506585458238607e-05, + "loss": 0.3372, + "step": 17824 + }, + { + "epoch": 0.33094091481232574, + "grad_norm": 0.5624686479568481, + "learning_rate": 1.5064848811623943e-05, + "loss": 0.3828, + "step": 17826 + }, + { + "epoch": 0.33097804494974437, + "grad_norm": 0.4062316119670868, + "learning_rate": 1.506384297194439e-05, + "loss": 0.2378, + "step": 17828 + }, + { + "epoch": 0.331015175087163, + "grad_norm": 0.4410496652126312, + "learning_rate": 1.5062837063361109e-05, + "loss": 0.4173, + "step": 17830 + }, + { + "epoch": 0.3310523052245816, + "grad_norm": 0.32789239287376404, + "learning_rate": 1.5061831085887777e-05, + "loss": 0.1051, + "step": 17832 + }, + { + "epoch": 0.33108943536200025, + "grad_norm": 0.32628872990608215, + "learning_rate": 1.5060825039538083e-05, + "loss": 0.4203, + "step": 17834 + }, + { + "epoch": 0.33112656549941893, + "grad_norm": 0.5331152677536011, + "learning_rate": 1.5059818924325722e-05, + "loss": 0.2779, + "step": 17836 + }, + { + "epoch": 0.33116369563683756, + "grad_norm": 0.30990102887153625, + "learning_rate": 1.505881274026438e-05, + "loss": 0.2681, + "step": 17838 + }, + { + "epoch": 0.3312008257742562, + "grad_norm": 0.3304871618747711, + "learning_rate": 1.505780648736775e-05, + "loss": 0.4464, + "step": 17840 + }, + { + "epoch": 0.3312379559116748, + "grad_norm": 0.4052425026893616, + "learning_rate": 1.5056800165649525e-05, + "loss": 0.4468, + "step": 17842 + }, + { + "epoch": 0.33127508604909345, + "grad_norm": 0.6457363963127136, + "learning_rate": 1.5055793775123396e-05, + "loss": 0.3015, + "step": 17844 + }, + { + "epoch": 0.33131221618651213, + "grad_norm": 0.330112487077713, + "learning_rate": 1.505478731580306e-05, + "loss": 0.215, + "step": 17846 + }, + { + "epoch": 0.33134934632393076, + "grad_norm": 0.25201767683029175, + "learning_rate": 1.5053780787702204e-05, + "loss": 0.3117, + "step": 17848 + }, + { + "epoch": 0.3313864764613494, + "grad_norm": 0.3351127505302429, + "learning_rate": 1.5052774190834532e-05, + "loss": 0.1682, + "step": 17850 + }, + { + "epoch": 0.331423606598768, + "grad_norm": 0.7798597812652588, + "learning_rate": 1.5051767525213738e-05, + "loss": 0.34, + "step": 17852 + }, + { + "epoch": 0.33146073673618665, + "grad_norm": 0.3377073407173157, + "learning_rate": 1.505076079085352e-05, + "loss": 0.3356, + "step": 17854 + }, + { + "epoch": 0.3314978668736053, + "grad_norm": 0.4009382426738739, + "learning_rate": 1.5049753987767574e-05, + "loss": 0.2366, + "step": 17856 + }, + { + "epoch": 0.33153499701102396, + "grad_norm": 0.5234353542327881, + "learning_rate": 1.5048747115969606e-05, + "loss": 0.33, + "step": 17858 + }, + { + "epoch": 0.3315721271484426, + "grad_norm": 0.5827322602272034, + "learning_rate": 1.5047740175473312e-05, + "loss": 0.435, + "step": 17860 + }, + { + "epoch": 0.3316092572858612, + "grad_norm": 0.4959309697151184, + "learning_rate": 1.5046733166292393e-05, + "loss": 0.3188, + "step": 17862 + }, + { + "epoch": 0.33164638742327984, + "grad_norm": 0.38424575328826904, + "learning_rate": 1.5045726088440553e-05, + "loss": 0.2, + "step": 17864 + }, + { + "epoch": 0.33168351756069847, + "grad_norm": 0.48506999015808105, + "learning_rate": 1.504471894193149e-05, + "loss": 0.4822, + "step": 17866 + }, + { + "epoch": 0.33172064769811715, + "grad_norm": 0.3128223717212677, + "learning_rate": 1.5043711726778915e-05, + "loss": 0.3039, + "step": 17868 + }, + { + "epoch": 0.3317577778355358, + "grad_norm": 0.3348535895347595, + "learning_rate": 1.5042704442996536e-05, + "loss": 0.3208, + "step": 17870 + }, + { + "epoch": 0.3317949079729544, + "grad_norm": 0.3610891103744507, + "learning_rate": 1.5041697090598053e-05, + "loss": 0.4105, + "step": 17872 + }, + { + "epoch": 0.33183203811037304, + "grad_norm": 0.3912738859653473, + "learning_rate": 1.5040689669597171e-05, + "loss": 0.2652, + "step": 17874 + }, + { + "epoch": 0.33186916824779167, + "grad_norm": 0.25603529810905457, + "learning_rate": 1.5039682180007602e-05, + "loss": 0.4298, + "step": 17876 + }, + { + "epoch": 0.3319062983852103, + "grad_norm": 0.5491587519645691, + "learning_rate": 1.5038674621843053e-05, + "loss": 0.3159, + "step": 17878 + }, + { + "epoch": 0.331943428522629, + "grad_norm": 0.35179197788238525, + "learning_rate": 1.5037666995117234e-05, + "loss": 0.4204, + "step": 17880 + }, + { + "epoch": 0.3319805586600476, + "grad_norm": 0.42932820320129395, + "learning_rate": 1.5036659299843864e-05, + "loss": 0.3302, + "step": 17882 + }, + { + "epoch": 0.33201768879746624, + "grad_norm": 0.42902302742004395, + "learning_rate": 1.5035651536036642e-05, + "loss": 0.3491, + "step": 17884 + }, + { + "epoch": 0.33205481893488487, + "grad_norm": 0.30484434962272644, + "learning_rate": 1.5034643703709285e-05, + "loss": 0.1665, + "step": 17886 + }, + { + "epoch": 0.3320919490723035, + "grad_norm": 0.4538746774196625, + "learning_rate": 1.5033635802875507e-05, + "loss": 0.2712, + "step": 17888 + }, + { + "epoch": 0.3321290792097222, + "grad_norm": 0.33844348788261414, + "learning_rate": 1.5032627833549028e-05, + "loss": 0.4158, + "step": 17890 + }, + { + "epoch": 0.3321662093471408, + "grad_norm": 0.5058414340019226, + "learning_rate": 1.5031619795743554e-05, + "loss": 0.2046, + "step": 17892 + }, + { + "epoch": 0.33220333948455943, + "grad_norm": 0.5693158507347107, + "learning_rate": 1.503061168947281e-05, + "loss": 0.228, + "step": 17894 + }, + { + "epoch": 0.33224046962197806, + "grad_norm": 0.4657512605190277, + "learning_rate": 1.5029603514750508e-05, + "loss": 0.4079, + "step": 17896 + }, + { + "epoch": 0.3322775997593967, + "grad_norm": 0.4146985709667206, + "learning_rate": 1.5028595271590367e-05, + "loss": 0.3487, + "step": 17898 + }, + { + "epoch": 0.3323147298968154, + "grad_norm": 0.2547996938228607, + "learning_rate": 1.5027586960006107e-05, + "loss": 0.0888, + "step": 17900 + }, + { + "epoch": 0.332351860034234, + "grad_norm": 0.28766632080078125, + "learning_rate": 1.5026578580011446e-05, + "loss": 0.4945, + "step": 17902 + }, + { + "epoch": 0.33238899017165263, + "grad_norm": 0.5285801291465759, + "learning_rate": 1.5025570131620111e-05, + "loss": 0.1636, + "step": 17904 + }, + { + "epoch": 0.33242612030907126, + "grad_norm": 0.3439742624759674, + "learning_rate": 1.5024561614845819e-05, + "loss": 0.2901, + "step": 17906 + }, + { + "epoch": 0.3324632504464899, + "grad_norm": 0.33359089493751526, + "learning_rate": 1.5023553029702292e-05, + "loss": 0.2764, + "step": 17908 + }, + { + "epoch": 0.3325003805839085, + "grad_norm": 0.4356013238430023, + "learning_rate": 1.5022544376203254e-05, + "loss": 0.1527, + "step": 17910 + }, + { + "epoch": 0.3325375107213272, + "grad_norm": 0.6118255853652954, + "learning_rate": 1.5021535654362431e-05, + "loss": 0.1973, + "step": 17912 + }, + { + "epoch": 0.33257464085874583, + "grad_norm": 0.3409269452095032, + "learning_rate": 1.5020526864193553e-05, + "loss": 0.3395, + "step": 17914 + }, + { + "epoch": 0.33261177099616446, + "grad_norm": 0.3647063970565796, + "learning_rate": 1.501951800571034e-05, + "loss": 0.2978, + "step": 17916 + }, + { + "epoch": 0.3326489011335831, + "grad_norm": 0.4006160795688629, + "learning_rate": 1.5018509078926525e-05, + "loss": 0.2305, + "step": 17918 + }, + { + "epoch": 0.3326860312710017, + "grad_norm": 0.43046560883522034, + "learning_rate": 1.501750008385583e-05, + "loss": 0.4847, + "step": 17920 + }, + { + "epoch": 0.3327231614084204, + "grad_norm": 0.26167067885398865, + "learning_rate": 1.5016491020511987e-05, + "loss": 0.2669, + "step": 17922 + }, + { + "epoch": 0.332760291545839, + "grad_norm": 0.32392409443855286, + "learning_rate": 1.501548188890873e-05, + "loss": 0.3135, + "step": 17924 + }, + { + "epoch": 0.33279742168325765, + "grad_norm": 0.49963998794555664, + "learning_rate": 1.501447268905979e-05, + "loss": 0.2329, + "step": 17926 + }, + { + "epoch": 0.3328345518206763, + "grad_norm": 0.3100620210170746, + "learning_rate": 1.5013463420978895e-05, + "loss": 0.2812, + "step": 17928 + }, + { + "epoch": 0.3328716819580949, + "grad_norm": 0.4499354660511017, + "learning_rate": 1.5012454084679778e-05, + "loss": 0.5106, + "step": 17930 + }, + { + "epoch": 0.33290881209551354, + "grad_norm": 0.5090258717536926, + "learning_rate": 1.5011444680176173e-05, + "loss": 0.3133, + "step": 17932 + }, + { + "epoch": 0.3329459422329322, + "grad_norm": 0.3382207751274109, + "learning_rate": 1.5010435207481822e-05, + "loss": 0.3882, + "step": 17934 + }, + { + "epoch": 0.33298307237035085, + "grad_norm": 0.45492812991142273, + "learning_rate": 1.5009425666610456e-05, + "loss": 0.4252, + "step": 17936 + }, + { + "epoch": 0.3330202025077695, + "grad_norm": 0.5794858932495117, + "learning_rate": 1.5008416057575805e-05, + "loss": 0.2892, + "step": 17938 + }, + { + "epoch": 0.3330573326451881, + "grad_norm": 0.3883581757545471, + "learning_rate": 1.500740638039162e-05, + "loss": 0.2041, + "step": 17940 + }, + { + "epoch": 0.33309446278260674, + "grad_norm": 0.4709897041320801, + "learning_rate": 1.5006396635071633e-05, + "loss": 0.2788, + "step": 17942 + }, + { + "epoch": 0.3331315929200254, + "grad_norm": 0.25976231694221497, + "learning_rate": 1.5005386821629577e-05, + "loss": 0.2902, + "step": 17944 + }, + { + "epoch": 0.33316872305744405, + "grad_norm": 0.4804779291152954, + "learning_rate": 1.5004376940079204e-05, + "loss": 0.3874, + "step": 17946 + }, + { + "epoch": 0.3332058531948627, + "grad_norm": 0.3953636586666107, + "learning_rate": 1.5003366990434254e-05, + "loss": 0.306, + "step": 17948 + }, + { + "epoch": 0.3332429833322813, + "grad_norm": 0.48406878113746643, + "learning_rate": 1.500235697270846e-05, + "loss": 0.3845, + "step": 17950 + }, + { + "epoch": 0.33328011346969993, + "grad_norm": 0.4482232928276062, + "learning_rate": 1.5001346886915576e-05, + "loss": 0.3902, + "step": 17952 + }, + { + "epoch": 0.33331724360711856, + "grad_norm": 0.3405599594116211, + "learning_rate": 1.5000336733069338e-05, + "loss": 0.28, + "step": 17954 + }, + { + "epoch": 0.33335437374453725, + "grad_norm": 0.4166962504386902, + "learning_rate": 1.4999326511183498e-05, + "loss": 0.2619, + "step": 17956 + }, + { + "epoch": 0.3333915038819559, + "grad_norm": 0.36426955461502075, + "learning_rate": 1.4998316221271798e-05, + "loss": 0.4352, + "step": 17958 + }, + { + "epoch": 0.3334286340193745, + "grad_norm": 0.4869938790798187, + "learning_rate": 1.4997305863347984e-05, + "loss": 0.159, + "step": 17960 + }, + { + "epoch": 0.33346576415679313, + "grad_norm": 0.3511381447315216, + "learning_rate": 1.4996295437425805e-05, + "loss": 0.4285, + "step": 17962 + }, + { + "epoch": 0.33350289429421176, + "grad_norm": 0.9059442281723022, + "learning_rate": 1.4995284943519012e-05, + "loss": 0.2069, + "step": 17964 + }, + { + "epoch": 0.33354002443163044, + "grad_norm": 0.40236422419548035, + "learning_rate": 1.4994274381641355e-05, + "loss": 0.3119, + "step": 17966 + }, + { + "epoch": 0.3335771545690491, + "grad_norm": 0.3907240927219391, + "learning_rate": 1.4993263751806584e-05, + "loss": 0.3197, + "step": 17968 + }, + { + "epoch": 0.3336142847064677, + "grad_norm": 0.4376314878463745, + "learning_rate": 1.4992253054028447e-05, + "loss": 0.2833, + "step": 17970 + }, + { + "epoch": 0.33365141484388633, + "grad_norm": 0.3018713891506195, + "learning_rate": 1.4991242288320703e-05, + "loss": 0.2281, + "step": 17972 + }, + { + "epoch": 0.33368854498130496, + "grad_norm": 1.0719242095947266, + "learning_rate": 1.4990231454697099e-05, + "loss": 0.2917, + "step": 17974 + }, + { + "epoch": 0.33372567511872364, + "grad_norm": 0.379462867975235, + "learning_rate": 1.498922055317139e-05, + "loss": 0.6311, + "step": 17976 + }, + { + "epoch": 0.33376280525614227, + "grad_norm": 0.3152543902397156, + "learning_rate": 1.4988209583757338e-05, + "loss": 0.324, + "step": 17978 + }, + { + "epoch": 0.3337999353935609, + "grad_norm": 0.42757052183151245, + "learning_rate": 1.4987198546468697e-05, + "loss": 0.5221, + "step": 17980 + }, + { + "epoch": 0.3338370655309795, + "grad_norm": 0.3611413538455963, + "learning_rate": 1.4986187441319217e-05, + "loss": 0.5084, + "step": 17982 + }, + { + "epoch": 0.33387419566839815, + "grad_norm": 0.5011786222457886, + "learning_rate": 1.4985176268322666e-05, + "loss": 0.4575, + "step": 17984 + }, + { + "epoch": 0.3339113258058168, + "grad_norm": 0.4002304673194885, + "learning_rate": 1.4984165027492794e-05, + "loss": 0.2608, + "step": 17986 + }, + { + "epoch": 0.33394845594323547, + "grad_norm": 0.3544389307498932, + "learning_rate": 1.4983153718843366e-05, + "loss": 0.4668, + "step": 17988 + }, + { + "epoch": 0.3339855860806541, + "grad_norm": 0.3432004153728485, + "learning_rate": 1.4982142342388146e-05, + "loss": 0.2618, + "step": 17990 + }, + { + "epoch": 0.3340227162180727, + "grad_norm": 0.4127478003501892, + "learning_rate": 1.4981130898140887e-05, + "loss": 0.2288, + "step": 17992 + }, + { + "epoch": 0.33405984635549135, + "grad_norm": 0.3847413957118988, + "learning_rate": 1.4980119386115357e-05, + "loss": 0.3089, + "step": 17994 + }, + { + "epoch": 0.33409697649291, + "grad_norm": 0.3328947424888611, + "learning_rate": 1.4979107806325324e-05, + "loss": 0.3101, + "step": 17996 + }, + { + "epoch": 0.33413410663032866, + "grad_norm": 0.34489721059799194, + "learning_rate": 1.4978096158784543e-05, + "loss": 0.5336, + "step": 17998 + }, + { + "epoch": 0.3341712367677473, + "grad_norm": 0.3683588206768036, + "learning_rate": 1.4977084443506789e-05, + "loss": 0.2934, + "step": 18000 + }, + { + "epoch": 0.3342083669051659, + "grad_norm": 0.4330499768257141, + "learning_rate": 1.497607266050582e-05, + "loss": 0.1523, + "step": 18002 + }, + { + "epoch": 0.33424549704258455, + "grad_norm": 0.4272777736186981, + "learning_rate": 1.497506080979541e-05, + "loss": 0.4294, + "step": 18004 + }, + { + "epoch": 0.3342826271800032, + "grad_norm": 0.639563798904419, + "learning_rate": 1.497404889138932e-05, + "loss": 0.4541, + "step": 18006 + }, + { + "epoch": 0.3343197573174218, + "grad_norm": 0.4317440092563629, + "learning_rate": 1.4973036905301325e-05, + "loss": 0.3376, + "step": 18008 + }, + { + "epoch": 0.3343568874548405, + "grad_norm": 0.46777164936065674, + "learning_rate": 1.4972024851545199e-05, + "loss": 0.3037, + "step": 18010 + }, + { + "epoch": 0.3343940175922591, + "grad_norm": 0.5522424578666687, + "learning_rate": 1.4971012730134703e-05, + "loss": 0.382, + "step": 18012 + }, + { + "epoch": 0.33443114772967775, + "grad_norm": 0.5065568089485168, + "learning_rate": 1.4970000541083614e-05, + "loss": 0.2006, + "step": 18014 + }, + { + "epoch": 0.3344682778670964, + "grad_norm": 0.5036908388137817, + "learning_rate": 1.4968988284405706e-05, + "loss": 0.3243, + "step": 18016 + }, + { + "epoch": 0.334505408004515, + "grad_norm": 0.28333693742752075, + "learning_rate": 1.4967975960114749e-05, + "loss": 0.1869, + "step": 18018 + }, + { + "epoch": 0.3345425381419337, + "grad_norm": 0.5294538736343384, + "learning_rate": 1.4966963568224522e-05, + "loss": 0.2584, + "step": 18020 + }, + { + "epoch": 0.3345796682793523, + "grad_norm": 0.4187543988227844, + "learning_rate": 1.4965951108748798e-05, + "loss": 0.2598, + "step": 18022 + }, + { + "epoch": 0.33461679841677094, + "grad_norm": 0.36463862657546997, + "learning_rate": 1.4964938581701354e-05, + "loss": 0.3729, + "step": 18024 + }, + { + "epoch": 0.3346539285541896, + "grad_norm": 0.3892824649810791, + "learning_rate": 1.4963925987095967e-05, + "loss": 0.2548, + "step": 18026 + }, + { + "epoch": 0.3346910586916082, + "grad_norm": 0.2688679099082947, + "learning_rate": 1.4962913324946417e-05, + "loss": 0.3552, + "step": 18028 + }, + { + "epoch": 0.33472818882902683, + "grad_norm": 0.35725778341293335, + "learning_rate": 1.4961900595266481e-05, + "loss": 0.3145, + "step": 18030 + }, + { + "epoch": 0.3347653189664455, + "grad_norm": 0.2737744450569153, + "learning_rate": 1.4960887798069942e-05, + "loss": 0.4351, + "step": 18032 + }, + { + "epoch": 0.33480244910386414, + "grad_norm": 0.47076961398124695, + "learning_rate": 1.495987493337058e-05, + "loss": 0.4078, + "step": 18034 + }, + { + "epoch": 0.33483957924128277, + "grad_norm": 0.5594877004623413, + "learning_rate": 1.4958862001182175e-05, + "loss": 0.3629, + "step": 18036 + }, + { + "epoch": 0.3348767093787014, + "grad_norm": 0.44012850522994995, + "learning_rate": 1.4957849001518512e-05, + "loss": 0.3209, + "step": 18038 + }, + { + "epoch": 0.33491383951612, + "grad_norm": 0.7808220386505127, + "learning_rate": 1.4956835934393374e-05, + "loss": 0.4849, + "step": 18040 + }, + { + "epoch": 0.3349509696535387, + "grad_norm": 0.33568307757377625, + "learning_rate": 1.4955822799820549e-05, + "loss": 0.2029, + "step": 18042 + }, + { + "epoch": 0.33498809979095734, + "grad_norm": 0.34144726395606995, + "learning_rate": 1.4954809597813817e-05, + "loss": 0.2223, + "step": 18044 + }, + { + "epoch": 0.33502522992837597, + "grad_norm": 0.3721770644187927, + "learning_rate": 1.4953796328386969e-05, + "loss": 0.3967, + "step": 18046 + }, + { + "epoch": 0.3350623600657946, + "grad_norm": 0.3758309781551361, + "learning_rate": 1.4952782991553794e-05, + "loss": 0.3822, + "step": 18048 + }, + { + "epoch": 0.3350994902032132, + "grad_norm": 0.3911252021789551, + "learning_rate": 1.4951769587328073e-05, + "loss": 0.3925, + "step": 18050 + }, + { + "epoch": 0.3351366203406319, + "grad_norm": 0.34705039858818054, + "learning_rate": 1.4950756115723604e-05, + "loss": 0.3055, + "step": 18052 + }, + { + "epoch": 0.33517375047805054, + "grad_norm": 0.21749524772167206, + "learning_rate": 1.4949742576754173e-05, + "loss": 0.1382, + "step": 18054 + }, + { + "epoch": 0.33521088061546916, + "grad_norm": 0.4963466227054596, + "learning_rate": 1.494872897043357e-05, + "loss": 0.2788, + "step": 18056 + }, + { + "epoch": 0.3352480107528878, + "grad_norm": 0.4473222494125366, + "learning_rate": 1.494771529677559e-05, + "loss": 0.3865, + "step": 18058 + }, + { + "epoch": 0.3352851408903064, + "grad_norm": 0.4760928452014923, + "learning_rate": 1.4946701555794026e-05, + "loss": 0.3801, + "step": 18060 + }, + { + "epoch": 0.33532227102772505, + "grad_norm": 0.47154590487480164, + "learning_rate": 1.4945687747502667e-05, + "loss": 0.2973, + "step": 18062 + }, + { + "epoch": 0.33535940116514373, + "grad_norm": 0.27352288365364075, + "learning_rate": 1.4944673871915317e-05, + "loss": 0.3828, + "step": 18064 + }, + { + "epoch": 0.33539653130256236, + "grad_norm": 0.3778846561908722, + "learning_rate": 1.4943659929045762e-05, + "loss": 0.2882, + "step": 18066 + }, + { + "epoch": 0.335433661439981, + "grad_norm": 0.20338238775730133, + "learning_rate": 1.4942645918907806e-05, + "loss": 0.1716, + "step": 18068 + }, + { + "epoch": 0.3354707915773996, + "grad_norm": 0.5180934071540833, + "learning_rate": 1.4941631841515243e-05, + "loss": 0.3009, + "step": 18070 + }, + { + "epoch": 0.33550792171481825, + "grad_norm": 0.2743087708950043, + "learning_rate": 1.4940617696881872e-05, + "loss": 0.4032, + "step": 18072 + }, + { + "epoch": 0.33554505185223693, + "grad_norm": 0.43645066022872925, + "learning_rate": 1.4939603485021494e-05, + "loss": 0.1106, + "step": 18074 + }, + { + "epoch": 0.33558218198965556, + "grad_norm": 0.4472203850746155, + "learning_rate": 1.4938589205947909e-05, + "loss": 0.4987, + "step": 18076 + }, + { + "epoch": 0.3356193121270742, + "grad_norm": 0.3432043492794037, + "learning_rate": 1.4937574859674917e-05, + "loss": 0.3349, + "step": 18078 + }, + { + "epoch": 0.3356564422644928, + "grad_norm": 0.21766094863414764, + "learning_rate": 1.4936560446216319e-05, + "loss": 0.2139, + "step": 18080 + }, + { + "epoch": 0.33569357240191144, + "grad_norm": 0.33876872062683105, + "learning_rate": 1.493554596558592e-05, + "loss": 0.1748, + "step": 18082 + }, + { + "epoch": 0.3357307025393301, + "grad_norm": 0.26665404438972473, + "learning_rate": 1.4934531417797528e-05, + "loss": 0.4438, + "step": 18084 + }, + { + "epoch": 0.33576783267674876, + "grad_norm": 0.48850172758102417, + "learning_rate": 1.4933516802864945e-05, + "loss": 0.272, + "step": 18086 + }, + { + "epoch": 0.3358049628141674, + "grad_norm": 0.5862286686897278, + "learning_rate": 1.493250212080197e-05, + "loss": 0.3842, + "step": 18088 + }, + { + "epoch": 0.335842092951586, + "grad_norm": 0.5566452145576477, + "learning_rate": 1.4931487371622418e-05, + "loss": 0.4226, + "step": 18090 + }, + { + "epoch": 0.33587922308900464, + "grad_norm": 0.4099358916282654, + "learning_rate": 1.4930472555340097e-05, + "loss": 0.4184, + "step": 18092 + }, + { + "epoch": 0.33591635322642327, + "grad_norm": 0.39967721700668335, + "learning_rate": 1.4929457671968809e-05, + "loss": 0.387, + "step": 18094 + }, + { + "epoch": 0.33595348336384195, + "grad_norm": 0.46006521582603455, + "learning_rate": 1.4928442721522369e-05, + "loss": 0.4534, + "step": 18096 + }, + { + "epoch": 0.3359906135012606, + "grad_norm": 0.3420569896697998, + "learning_rate": 1.4927427704014588e-05, + "loss": 0.3575, + "step": 18098 + }, + { + "epoch": 0.3360277436386792, + "grad_norm": 0.2239494025707245, + "learning_rate": 1.4926412619459272e-05, + "loss": 0.3339, + "step": 18100 + }, + { + "epoch": 0.33606487377609784, + "grad_norm": 0.31252405047416687, + "learning_rate": 1.4925397467870237e-05, + "loss": 0.1748, + "step": 18102 + }, + { + "epoch": 0.33610200391351647, + "grad_norm": 0.30184435844421387, + "learning_rate": 1.49243822492613e-05, + "loss": 0.2698, + "step": 18104 + }, + { + "epoch": 0.3361391340509351, + "grad_norm": 0.5092756152153015, + "learning_rate": 1.4923366963646269e-05, + "loss": 0.1324, + "step": 18106 + }, + { + "epoch": 0.3361762641883538, + "grad_norm": 0.324421763420105, + "learning_rate": 1.4922351611038957e-05, + "loss": 0.4726, + "step": 18108 + }, + { + "epoch": 0.3362133943257724, + "grad_norm": 0.5144684910774231, + "learning_rate": 1.492133619145319e-05, + "loss": 0.2619, + "step": 18110 + }, + { + "epoch": 0.33625052446319104, + "grad_norm": 0.31315067410469055, + "learning_rate": 1.4920320704902773e-05, + "loss": 0.2529, + "step": 18112 + }, + { + "epoch": 0.33628765460060966, + "grad_norm": 0.4374421536922455, + "learning_rate": 1.4919305151401529e-05, + "loss": 0.2697, + "step": 18114 + }, + { + "epoch": 0.3363247847380283, + "grad_norm": 0.3299446403980255, + "learning_rate": 1.4918289530963281e-05, + "loss": 0.3127, + "step": 18116 + }, + { + "epoch": 0.336361914875447, + "grad_norm": 0.446545273065567, + "learning_rate": 1.4917273843601844e-05, + "loss": 0.2267, + "step": 18118 + }, + { + "epoch": 0.3363990450128656, + "grad_norm": 0.4188675582408905, + "learning_rate": 1.4916258089331035e-05, + "loss": 0.2963, + "step": 18120 + }, + { + "epoch": 0.33643617515028423, + "grad_norm": 0.33802530169487, + "learning_rate": 1.4915242268164682e-05, + "loss": 0.5353, + "step": 18122 + }, + { + "epoch": 0.33647330528770286, + "grad_norm": 0.2887125015258789, + "learning_rate": 1.4914226380116606e-05, + "loss": 0.2303, + "step": 18124 + }, + { + "epoch": 0.3365104354251215, + "grad_norm": 0.2958166301250458, + "learning_rate": 1.4913210425200626e-05, + "loss": 0.2891, + "step": 18126 + }, + { + "epoch": 0.3365475655625402, + "grad_norm": 0.23940473794937134, + "learning_rate": 1.4912194403430569e-05, + "loss": 0.2643, + "step": 18128 + }, + { + "epoch": 0.3365846956999588, + "grad_norm": 0.3542059063911438, + "learning_rate": 1.4911178314820263e-05, + "loss": 0.279, + "step": 18130 + }, + { + "epoch": 0.33662182583737743, + "grad_norm": 0.4075571894645691, + "learning_rate": 1.4910162159383528e-05, + "loss": 0.3025, + "step": 18132 + }, + { + "epoch": 0.33665895597479606, + "grad_norm": 0.2779483497142792, + "learning_rate": 1.4909145937134192e-05, + "loss": 0.2808, + "step": 18134 + }, + { + "epoch": 0.3366960861122147, + "grad_norm": 0.26593664288520813, + "learning_rate": 1.4908129648086087e-05, + "loss": 0.4225, + "step": 18136 + }, + { + "epoch": 0.3367332162496333, + "grad_norm": 0.27906349301338196, + "learning_rate": 1.4907113292253042e-05, + "loss": 0.1161, + "step": 18138 + }, + { + "epoch": 0.336770346387052, + "grad_norm": 0.23779112100601196, + "learning_rate": 1.4906096869648878e-05, + "loss": 0.294, + "step": 18140 + }, + { + "epoch": 0.33680747652447063, + "grad_norm": 0.37250158190727234, + "learning_rate": 1.4905080380287435e-05, + "loss": 0.2972, + "step": 18142 + }, + { + "epoch": 0.33684460666188926, + "grad_norm": 0.42775219678878784, + "learning_rate": 1.4904063824182537e-05, + "loss": 0.3813, + "step": 18144 + }, + { + "epoch": 0.3368817367993079, + "grad_norm": 0.2107010930776596, + "learning_rate": 1.4903047201348022e-05, + "loss": 0.2266, + "step": 18146 + }, + { + "epoch": 0.3369188669367265, + "grad_norm": 0.3748937249183655, + "learning_rate": 1.490203051179772e-05, + "loss": 0.3494, + "step": 18148 + }, + { + "epoch": 0.3369559970741452, + "grad_norm": 0.5349082946777344, + "learning_rate": 1.4901013755545468e-05, + "loss": 0.4264, + "step": 18150 + }, + { + "epoch": 0.3369931272115638, + "grad_norm": 0.34124860167503357, + "learning_rate": 1.4899996932605097e-05, + "loss": 0.1921, + "step": 18152 + }, + { + "epoch": 0.33703025734898245, + "grad_norm": 0.3762378990650177, + "learning_rate": 1.4898980042990447e-05, + "loss": 0.2237, + "step": 18154 + }, + { + "epoch": 0.3370673874864011, + "grad_norm": 0.22387202084064484, + "learning_rate": 1.489796308671535e-05, + "loss": 0.2175, + "step": 18156 + }, + { + "epoch": 0.3371045176238197, + "grad_norm": 0.37569281458854675, + "learning_rate": 1.489694606379365e-05, + "loss": 0.1186, + "step": 18158 + }, + { + "epoch": 0.33714164776123834, + "grad_norm": 0.35135531425476074, + "learning_rate": 1.489592897423918e-05, + "loss": 0.3264, + "step": 18160 + }, + { + "epoch": 0.337178777898657, + "grad_norm": 0.4404272735118866, + "learning_rate": 1.489491181806578e-05, + "loss": 0.4489, + "step": 18162 + }, + { + "epoch": 0.33721590803607565, + "grad_norm": 0.38416457176208496, + "learning_rate": 1.4893894595287293e-05, + "loss": 0.3701, + "step": 18164 + }, + { + "epoch": 0.3372530381734943, + "grad_norm": 0.336603581905365, + "learning_rate": 1.4892877305917561e-05, + "loss": 0.391, + "step": 18166 + }, + { + "epoch": 0.3372901683109129, + "grad_norm": 0.47559013962745667, + "learning_rate": 1.4891859949970422e-05, + "loss": 0.3299, + "step": 18168 + }, + { + "epoch": 0.33732729844833154, + "grad_norm": 0.4884147644042969, + "learning_rate": 1.4890842527459725e-05, + "loss": 0.4522, + "step": 18170 + }, + { + "epoch": 0.3373644285857502, + "grad_norm": 1.8078135251998901, + "learning_rate": 1.4889825038399308e-05, + "loss": 0.5159, + "step": 18172 + }, + { + "epoch": 0.33740155872316885, + "grad_norm": 0.3391738533973694, + "learning_rate": 1.4888807482803023e-05, + "loss": 0.4232, + "step": 18174 + }, + { + "epoch": 0.3374386888605875, + "grad_norm": 0.27409452199935913, + "learning_rate": 1.488778986068471e-05, + "loss": 0.3525, + "step": 18176 + }, + { + "epoch": 0.3374758189980061, + "grad_norm": 0.5105887055397034, + "learning_rate": 1.4886772172058216e-05, + "loss": 0.3551, + "step": 18178 + }, + { + "epoch": 0.33751294913542473, + "grad_norm": 0.4102279841899872, + "learning_rate": 1.4885754416937392e-05, + "loss": 0.3401, + "step": 18180 + }, + { + "epoch": 0.33755007927284336, + "grad_norm": 0.5282325148582458, + "learning_rate": 1.4884736595336084e-05, + "loss": 0.1652, + "step": 18182 + }, + { + "epoch": 0.33758720941026205, + "grad_norm": 0.3346143066883087, + "learning_rate": 1.4883718707268142e-05, + "loss": 0.4516, + "step": 18184 + }, + { + "epoch": 0.3376243395476807, + "grad_norm": 0.4011649191379547, + "learning_rate": 1.4882700752747418e-05, + "loss": 0.4263, + "step": 18186 + }, + { + "epoch": 0.3376614696850993, + "grad_norm": 0.6434336304664612, + "learning_rate": 1.4881682731787761e-05, + "loss": 0.1938, + "step": 18188 + }, + { + "epoch": 0.33769859982251793, + "grad_norm": 0.4085772931575775, + "learning_rate": 1.4880664644403026e-05, + "loss": 0.291, + "step": 18190 + }, + { + "epoch": 0.33773572995993656, + "grad_norm": 0.3318369388580322, + "learning_rate": 1.4879646490607065e-05, + "loss": 0.1883, + "step": 18192 + }, + { + "epoch": 0.33777286009735524, + "grad_norm": 0.2531749904155731, + "learning_rate": 1.487862827041373e-05, + "loss": 0.3559, + "step": 18194 + }, + { + "epoch": 0.33780999023477387, + "grad_norm": 0.2857612669467926, + "learning_rate": 1.4877609983836875e-05, + "loss": 0.4578, + "step": 18196 + }, + { + "epoch": 0.3378471203721925, + "grad_norm": 0.41911283135414124, + "learning_rate": 1.4876591630890362e-05, + "loss": 0.3251, + "step": 18198 + }, + { + "epoch": 0.33788425050961113, + "grad_norm": 0.39310166239738464, + "learning_rate": 1.4875573211588044e-05, + "loss": 0.2964, + "step": 18200 + }, + { + "epoch": 0.33792138064702976, + "grad_norm": 0.3363457918167114, + "learning_rate": 1.4874554725943777e-05, + "loss": 0.5485, + "step": 18202 + }, + { + "epoch": 0.33795851078444844, + "grad_norm": 0.516950249671936, + "learning_rate": 1.4873536173971422e-05, + "loss": 0.2587, + "step": 18204 + }, + { + "epoch": 0.33799564092186707, + "grad_norm": 0.4866335690021515, + "learning_rate": 1.4872517555684835e-05, + "loss": 0.2183, + "step": 18206 + }, + { + "epoch": 0.3380327710592857, + "grad_norm": 0.568638265132904, + "learning_rate": 1.4871498871097883e-05, + "loss": 0.3719, + "step": 18208 + }, + { + "epoch": 0.3380699011967043, + "grad_norm": 0.5424169301986694, + "learning_rate": 1.487048012022442e-05, + "loss": 0.1711, + "step": 18210 + }, + { + "epoch": 0.33810703133412295, + "grad_norm": 0.9764118790626526, + "learning_rate": 1.4869461303078315e-05, + "loss": 0.3511, + "step": 18212 + }, + { + "epoch": 0.3381441614715416, + "grad_norm": 0.3772079646587372, + "learning_rate": 1.4868442419673424e-05, + "loss": 0.0629, + "step": 18214 + }, + { + "epoch": 0.33818129160896027, + "grad_norm": 0.3446721136569977, + "learning_rate": 1.4867423470023614e-05, + "loss": 0.4339, + "step": 18216 + }, + { + "epoch": 0.3382184217463789, + "grad_norm": 0.4032352566719055, + "learning_rate": 1.4866404454142754e-05, + "loss": 0.2481, + "step": 18218 + }, + { + "epoch": 0.3382555518837975, + "grad_norm": 0.2930324673652649, + "learning_rate": 1.4865385372044701e-05, + "loss": 0.394, + "step": 18220 + }, + { + "epoch": 0.33829268202121615, + "grad_norm": 0.383143812417984, + "learning_rate": 1.486436622374333e-05, + "loss": 0.3317, + "step": 18222 + }, + { + "epoch": 0.3383298121586348, + "grad_norm": 0.3990165889263153, + "learning_rate": 1.4863347009252504e-05, + "loss": 0.137, + "step": 18224 + }, + { + "epoch": 0.33836694229605346, + "grad_norm": 0.2508726418018341, + "learning_rate": 1.4862327728586095e-05, + "loss": 0.2739, + "step": 18226 + }, + { + "epoch": 0.3384040724334721, + "grad_norm": 0.39910128712654114, + "learning_rate": 1.4861308381757966e-05, + "loss": 0.3002, + "step": 18228 + }, + { + "epoch": 0.3384412025708907, + "grad_norm": 0.5039410591125488, + "learning_rate": 1.4860288968781993e-05, + "loss": 0.3888, + "step": 18230 + }, + { + "epoch": 0.33847833270830935, + "grad_norm": 0.37737831473350525, + "learning_rate": 1.4859269489672046e-05, + "loss": 0.3185, + "step": 18232 + }, + { + "epoch": 0.338515462845728, + "grad_norm": 0.483288049697876, + "learning_rate": 1.4858249944441996e-05, + "loss": 0.4087, + "step": 18234 + }, + { + "epoch": 0.3385525929831466, + "grad_norm": 0.32364943623542786, + "learning_rate": 1.4857230333105716e-05, + "loss": 0.262, + "step": 18236 + }, + { + "epoch": 0.3385897231205653, + "grad_norm": 0.33189359307289124, + "learning_rate": 1.4856210655677079e-05, + "loss": 0.2402, + "step": 18238 + }, + { + "epoch": 0.3386268532579839, + "grad_norm": 0.3368898630142212, + "learning_rate": 1.4855190912169963e-05, + "loss": 0.3653, + "step": 18240 + }, + { + "epoch": 0.33866398339540255, + "grad_norm": 0.44581544399261475, + "learning_rate": 1.4854171102598243e-05, + "loss": 0.3465, + "step": 18242 + }, + { + "epoch": 0.3387011135328212, + "grad_norm": 0.32312047481536865, + "learning_rate": 1.4853151226975793e-05, + "loss": 0.2754, + "step": 18244 + }, + { + "epoch": 0.3387382436702398, + "grad_norm": 0.41122275590896606, + "learning_rate": 1.4852131285316488e-05, + "loss": 0.289, + "step": 18246 + }, + { + "epoch": 0.3387753738076585, + "grad_norm": 0.42758405208587646, + "learning_rate": 1.4851111277634216e-05, + "loss": 0.3875, + "step": 18248 + }, + { + "epoch": 0.3388125039450771, + "grad_norm": 0.5334575772285461, + "learning_rate": 1.4850091203942847e-05, + "loss": 0.2982, + "step": 18250 + }, + { + "epoch": 0.33884963408249574, + "grad_norm": 0.41673538088798523, + "learning_rate": 1.4849071064256264e-05, + "loss": 0.3638, + "step": 18252 + }, + { + "epoch": 0.33888676421991437, + "grad_norm": 0.3653479218482971, + "learning_rate": 1.484805085858835e-05, + "loss": 0.4835, + "step": 18254 + }, + { + "epoch": 0.338923894357333, + "grad_norm": 0.3188316822052002, + "learning_rate": 1.4847030586952986e-05, + "loss": 0.2774, + "step": 18256 + }, + { + "epoch": 0.33896102449475163, + "grad_norm": 0.5281440019607544, + "learning_rate": 1.484601024936405e-05, + "loss": 0.261, + "step": 18258 + }, + { + "epoch": 0.3389981546321703, + "grad_norm": 0.35306859016418457, + "learning_rate": 1.4844989845835433e-05, + "loss": 0.2948, + "step": 18260 + }, + { + "epoch": 0.33903528476958894, + "grad_norm": 0.25086507201194763, + "learning_rate": 1.4843969376381017e-05, + "loss": 0.4044, + "step": 18262 + }, + { + "epoch": 0.33907241490700757, + "grad_norm": 0.3014141917228699, + "learning_rate": 1.4842948841014688e-05, + "loss": 0.2049, + "step": 18264 + }, + { + "epoch": 0.3391095450444262, + "grad_norm": 0.37135809659957886, + "learning_rate": 1.4841928239750329e-05, + "loss": 0.1638, + "step": 18266 + }, + { + "epoch": 0.3391466751818448, + "grad_norm": 0.35058286786079407, + "learning_rate": 1.4840907572601833e-05, + "loss": 0.2769, + "step": 18268 + }, + { + "epoch": 0.3391838053192635, + "grad_norm": 0.38817042112350464, + "learning_rate": 1.4839886839583082e-05, + "loss": 0.2606, + "step": 18270 + }, + { + "epoch": 0.33922093545668214, + "grad_norm": 0.29744139313697815, + "learning_rate": 1.483886604070797e-05, + "loss": 0.274, + "step": 18272 + }, + { + "epoch": 0.33925806559410077, + "grad_norm": 0.6697937846183777, + "learning_rate": 1.4837845175990384e-05, + "loss": 0.4953, + "step": 18274 + }, + { + "epoch": 0.3392951957315194, + "grad_norm": 0.3933962285518646, + "learning_rate": 1.483682424544422e-05, + "loss": 0.338, + "step": 18276 + }, + { + "epoch": 0.339332325868938, + "grad_norm": 0.2833634614944458, + "learning_rate": 1.483580324908336e-05, + "loss": 0.3862, + "step": 18278 + }, + { + "epoch": 0.3393694560063567, + "grad_norm": 0.44437509775161743, + "learning_rate": 1.4834782186921708e-05, + "loss": 0.3438, + "step": 18280 + }, + { + "epoch": 0.33940658614377534, + "grad_norm": 0.38834190368652344, + "learning_rate": 1.4833761058973148e-05, + "loss": 0.1916, + "step": 18282 + }, + { + "epoch": 0.33944371628119396, + "grad_norm": 0.3366439640522003, + "learning_rate": 1.483273986525158e-05, + "loss": 0.178, + "step": 18284 + }, + { + "epoch": 0.3394808464186126, + "grad_norm": 0.3393609821796417, + "learning_rate": 1.48317186057709e-05, + "loss": 0.3019, + "step": 18286 + }, + { + "epoch": 0.3395179765560312, + "grad_norm": 0.2888309061527252, + "learning_rate": 1.4830697280545003e-05, + "loss": 0.3157, + "step": 18288 + }, + { + "epoch": 0.33955510669344985, + "grad_norm": 0.3677518963813782, + "learning_rate": 1.4829675889587782e-05, + "loss": 0.247, + "step": 18290 + }, + { + "epoch": 0.33959223683086853, + "grad_norm": 0.5376136302947998, + "learning_rate": 1.4828654432913144e-05, + "loss": 0.3058, + "step": 18292 + }, + { + "epoch": 0.33962936696828716, + "grad_norm": 0.391912579536438, + "learning_rate": 1.4827632910534978e-05, + "loss": 0.4533, + "step": 18294 + }, + { + "epoch": 0.3396664971057058, + "grad_norm": 0.38033872842788696, + "learning_rate": 1.4826611322467188e-05, + "loss": 0.3379, + "step": 18296 + }, + { + "epoch": 0.3397036272431244, + "grad_norm": 0.48980510234832764, + "learning_rate": 1.4825589668723676e-05, + "loss": 0.293, + "step": 18298 + }, + { + "epoch": 0.33974075738054305, + "grad_norm": 0.5520845651626587, + "learning_rate": 1.4824567949318347e-05, + "loss": 0.3664, + "step": 18300 + }, + { + "epoch": 0.33977788751796173, + "grad_norm": 0.5400775074958801, + "learning_rate": 1.4823546164265095e-05, + "loss": 0.192, + "step": 18302 + }, + { + "epoch": 0.33981501765538036, + "grad_norm": 0.6307773590087891, + "learning_rate": 1.482252431357783e-05, + "loss": 0.5288, + "step": 18304 + }, + { + "epoch": 0.339852147792799, + "grad_norm": 0.35385769605636597, + "learning_rate": 1.4821502397270454e-05, + "loss": 0.2402, + "step": 18306 + }, + { + "epoch": 0.3398892779302176, + "grad_norm": 0.32030171155929565, + "learning_rate": 1.4820480415356873e-05, + "loss": 0.2521, + "step": 18308 + }, + { + "epoch": 0.33992640806763624, + "grad_norm": 0.4245765209197998, + "learning_rate": 1.4819458367850989e-05, + "loss": 0.3722, + "step": 18310 + }, + { + "epoch": 0.33996353820505487, + "grad_norm": 0.6182373762130737, + "learning_rate": 1.4818436254766716e-05, + "loss": 0.2469, + "step": 18312 + }, + { + "epoch": 0.34000066834247356, + "grad_norm": 0.3729632794857025, + "learning_rate": 1.481741407611796e-05, + "loss": 0.3412, + "step": 18314 + }, + { + "epoch": 0.3400377984798922, + "grad_norm": 0.3928276300430298, + "learning_rate": 1.4816391831918623e-05, + "loss": 0.3475, + "step": 18316 + }, + { + "epoch": 0.3400749286173108, + "grad_norm": 0.37950918078422546, + "learning_rate": 1.4815369522182626e-05, + "loss": 0.2787, + "step": 18318 + }, + { + "epoch": 0.34011205875472944, + "grad_norm": 0.33199527859687805, + "learning_rate": 1.4814347146923868e-05, + "loss": 0.2408, + "step": 18320 + }, + { + "epoch": 0.34014918889214807, + "grad_norm": 0.3416058123111725, + "learning_rate": 1.4813324706156269e-05, + "loss": 0.3192, + "step": 18322 + }, + { + "epoch": 0.34018631902956675, + "grad_norm": 0.4705767333507538, + "learning_rate": 1.481230219989374e-05, + "loss": 0.3766, + "step": 18324 + }, + { + "epoch": 0.3402234491669854, + "grad_norm": 0.3312683701515198, + "learning_rate": 1.4811279628150188e-05, + "loss": 0.2684, + "step": 18326 + }, + { + "epoch": 0.340260579304404, + "grad_norm": 0.41331741213798523, + "learning_rate": 1.4810256990939536e-05, + "loss": 0.1745, + "step": 18328 + }, + { + "epoch": 0.34029770944182264, + "grad_norm": 0.5942723155021667, + "learning_rate": 1.4809234288275693e-05, + "loss": 0.375, + "step": 18330 + }, + { + "epoch": 0.34033483957924127, + "grad_norm": 0.24649269878864288, + "learning_rate": 1.4808211520172576e-05, + "loss": 0.3339, + "step": 18332 + }, + { + "epoch": 0.3403719697166599, + "grad_norm": 0.44161251187324524, + "learning_rate": 1.4807188686644101e-05, + "loss": 0.1539, + "step": 18334 + }, + { + "epoch": 0.3404090998540786, + "grad_norm": 0.3539154827594757, + "learning_rate": 1.4806165787704188e-05, + "loss": 0.1946, + "step": 18336 + }, + { + "epoch": 0.3404462299914972, + "grad_norm": 0.7322278618812561, + "learning_rate": 1.4805142823366757e-05, + "loss": 0.3892, + "step": 18338 + }, + { + "epoch": 0.34048336012891584, + "grad_norm": 0.35283616185188293, + "learning_rate": 1.4804119793645724e-05, + "loss": 0.266, + "step": 18340 + }, + { + "epoch": 0.34052049026633446, + "grad_norm": 0.2875336706638336, + "learning_rate": 1.480309669855501e-05, + "loss": 0.1382, + "step": 18342 + }, + { + "epoch": 0.3405576204037531, + "grad_norm": 0.4004838466644287, + "learning_rate": 1.4802073538108538e-05, + "loss": 0.2007, + "step": 18344 + }, + { + "epoch": 0.3405947505411718, + "grad_norm": 0.36307767033576965, + "learning_rate": 1.4801050312320229e-05, + "loss": 0.2224, + "step": 18346 + }, + { + "epoch": 0.3406318806785904, + "grad_norm": 0.4122692048549652, + "learning_rate": 1.4800027021204007e-05, + "loss": 0.4446, + "step": 18348 + }, + { + "epoch": 0.34066901081600903, + "grad_norm": 0.27654966711997986, + "learning_rate": 1.4799003664773795e-05, + "loss": 0.3937, + "step": 18350 + }, + { + "epoch": 0.34070614095342766, + "grad_norm": 0.573334813117981, + "learning_rate": 1.4797980243043512e-05, + "loss": 0.5729, + "step": 18352 + }, + { + "epoch": 0.3407432710908463, + "grad_norm": 0.5605304837226868, + "learning_rate": 1.4796956756027094e-05, + "loss": 0.302, + "step": 18354 + }, + { + "epoch": 0.340780401228265, + "grad_norm": 0.27380573749542236, + "learning_rate": 1.4795933203738467e-05, + "loss": 0.2565, + "step": 18356 + }, + { + "epoch": 0.3408175313656836, + "grad_norm": 0.40755441784858704, + "learning_rate": 1.4794909586191547e-05, + "loss": 0.278, + "step": 18358 + }, + { + "epoch": 0.34085466150310223, + "grad_norm": 0.40119150280952454, + "learning_rate": 1.4793885903400276e-05, + "loss": 0.1833, + "step": 18360 + }, + { + "epoch": 0.34089179164052086, + "grad_norm": 0.36560434103012085, + "learning_rate": 1.4792862155378575e-05, + "loss": 0.38, + "step": 18362 + }, + { + "epoch": 0.3409289217779395, + "grad_norm": 0.6416745781898499, + "learning_rate": 1.4791838342140374e-05, + "loss": 0.3211, + "step": 18364 + }, + { + "epoch": 0.3409660519153581, + "grad_norm": 0.46165522933006287, + "learning_rate": 1.4790814463699607e-05, + "loss": 0.4203, + "step": 18366 + }, + { + "epoch": 0.3410031820527768, + "grad_norm": 0.28949829936027527, + "learning_rate": 1.4789790520070208e-05, + "loss": 0.3578, + "step": 18368 + }, + { + "epoch": 0.3410403121901954, + "grad_norm": 0.4012550711631775, + "learning_rate": 1.4788766511266105e-05, + "loss": 0.3998, + "step": 18370 + }, + { + "epoch": 0.34107744232761406, + "grad_norm": 0.5021101832389832, + "learning_rate": 1.4787742437301238e-05, + "loss": 0.2776, + "step": 18372 + }, + { + "epoch": 0.3411145724650327, + "grad_norm": 0.31587058305740356, + "learning_rate": 1.4786718298189532e-05, + "loss": 0.3816, + "step": 18374 + }, + { + "epoch": 0.3411517026024513, + "grad_norm": 0.4440610408782959, + "learning_rate": 1.478569409394493e-05, + "loss": 0.4557, + "step": 18376 + }, + { + "epoch": 0.34118883273987, + "grad_norm": 0.3337494730949402, + "learning_rate": 1.4784669824581366e-05, + "loss": 0.3086, + "step": 18378 + }, + { + "epoch": 0.3412259628772886, + "grad_norm": 0.38442182540893555, + "learning_rate": 1.4783645490112779e-05, + "loss": 0.3325, + "step": 18380 + }, + { + "epoch": 0.34126309301470725, + "grad_norm": 0.2954692542552948, + "learning_rate": 1.4782621090553105e-05, + "loss": 0.2121, + "step": 18382 + }, + { + "epoch": 0.3413002231521259, + "grad_norm": 0.3860663175582886, + "learning_rate": 1.4781596625916282e-05, + "loss": 0.2644, + "step": 18384 + }, + { + "epoch": 0.3413373532895445, + "grad_norm": 0.3253702223300934, + "learning_rate": 1.478057209621625e-05, + "loss": 0.2706, + "step": 18386 + }, + { + "epoch": 0.34137448342696314, + "grad_norm": 1.248823642730713, + "learning_rate": 1.4779547501466956e-05, + "loss": 0.2269, + "step": 18388 + }, + { + "epoch": 0.3414116135643818, + "grad_norm": 0.6219111680984497, + "learning_rate": 1.4778522841682333e-05, + "loss": 0.3911, + "step": 18390 + }, + { + "epoch": 0.34144874370180045, + "grad_norm": 0.47504132986068726, + "learning_rate": 1.4777498116876328e-05, + "loss": 0.513, + "step": 18392 + }, + { + "epoch": 0.3414858738392191, + "grad_norm": 0.3708641529083252, + "learning_rate": 1.4776473327062886e-05, + "loss": 0.2627, + "step": 18394 + }, + { + "epoch": 0.3415230039766377, + "grad_norm": 0.44879040122032166, + "learning_rate": 1.4775448472255946e-05, + "loss": 0.3754, + "step": 18396 + }, + { + "epoch": 0.34156013411405634, + "grad_norm": 0.46180999279022217, + "learning_rate": 1.4774423552469457e-05, + "loss": 0.3226, + "step": 18398 + }, + { + "epoch": 0.341597264251475, + "grad_norm": 0.46589720249176025, + "learning_rate": 1.4773398567717368e-05, + "loss": 0.411, + "step": 18400 + }, + { + "epoch": 0.34163439438889365, + "grad_norm": 0.6094220876693726, + "learning_rate": 1.4772373518013621e-05, + "loss": 0.1175, + "step": 18402 + }, + { + "epoch": 0.3416715245263123, + "grad_norm": 0.39103585481643677, + "learning_rate": 1.4771348403372163e-05, + "loss": 0.214, + "step": 18404 + }, + { + "epoch": 0.3417086546637309, + "grad_norm": 0.6076875925064087, + "learning_rate": 1.4770323223806948e-05, + "loss": 0.5578, + "step": 18406 + }, + { + "epoch": 0.34174578480114953, + "grad_norm": 0.4555501639842987, + "learning_rate": 1.476929797933192e-05, + "loss": 0.3345, + "step": 18408 + }, + { + "epoch": 0.34178291493856816, + "grad_norm": 0.3708654046058655, + "learning_rate": 1.4768272669961032e-05, + "loss": 0.265, + "step": 18410 + }, + { + "epoch": 0.34182004507598684, + "grad_norm": 0.3246888816356659, + "learning_rate": 1.4767247295708238e-05, + "loss": 0.1496, + "step": 18412 + }, + { + "epoch": 0.3418571752134055, + "grad_norm": 0.4212571382522583, + "learning_rate": 1.4766221856587487e-05, + "loss": 0.2489, + "step": 18414 + }, + { + "epoch": 0.3418943053508241, + "grad_norm": 0.2987867593765259, + "learning_rate": 1.4765196352612734e-05, + "loss": 0.3318, + "step": 18416 + }, + { + "epoch": 0.34193143548824273, + "grad_norm": 0.30457404255867004, + "learning_rate": 1.4764170783797931e-05, + "loss": 0.2203, + "step": 18418 + }, + { + "epoch": 0.34196856562566136, + "grad_norm": 0.35973840951919556, + "learning_rate": 1.4763145150157035e-05, + "loss": 0.3911, + "step": 18420 + }, + { + "epoch": 0.34200569576308004, + "grad_norm": 0.27560552954673767, + "learning_rate": 1.4762119451703998e-05, + "loss": 0.2638, + "step": 18422 + }, + { + "epoch": 0.34204282590049867, + "grad_norm": 0.3037315309047699, + "learning_rate": 1.4761093688452781e-05, + "loss": 0.2871, + "step": 18424 + }, + { + "epoch": 0.3420799560379173, + "grad_norm": 0.3087960183620453, + "learning_rate": 1.4760067860417343e-05, + "loss": 0.3604, + "step": 18426 + }, + { + "epoch": 0.3421170861753359, + "grad_norm": 0.4646759331226349, + "learning_rate": 1.4759041967611636e-05, + "loss": 0.25, + "step": 18428 + }, + { + "epoch": 0.34215421631275456, + "grad_norm": 0.4942202866077423, + "learning_rate": 1.4758016010049621e-05, + "loss": 0.2276, + "step": 18430 + }, + { + "epoch": 0.34219134645017324, + "grad_norm": 0.541326105594635, + "learning_rate": 1.4756989987745265e-05, + "loss": 0.3129, + "step": 18432 + }, + { + "epoch": 0.34222847658759187, + "grad_norm": 0.2703700363636017, + "learning_rate": 1.4755963900712521e-05, + "loss": 0.145, + "step": 18434 + }, + { + "epoch": 0.3422656067250105, + "grad_norm": 0.315107136964798, + "learning_rate": 1.4754937748965354e-05, + "loss": 0.4845, + "step": 18436 + }, + { + "epoch": 0.3423027368624291, + "grad_norm": 0.3075566291809082, + "learning_rate": 1.4753911532517728e-05, + "loss": 0.307, + "step": 18438 + }, + { + "epoch": 0.34233986699984775, + "grad_norm": 0.42130833864212036, + "learning_rate": 1.4752885251383607e-05, + "loss": 0.2895, + "step": 18440 + }, + { + "epoch": 0.3423769971372664, + "grad_norm": 0.40836820006370544, + "learning_rate": 1.475185890557695e-05, + "loss": 0.3714, + "step": 18442 + }, + { + "epoch": 0.34241412727468507, + "grad_norm": 0.4758872389793396, + "learning_rate": 1.475083249511173e-05, + "loss": 0.5032, + "step": 18444 + }, + { + "epoch": 0.3424512574121037, + "grad_norm": 0.4957320988178253, + "learning_rate": 1.4749806020001911e-05, + "loss": 0.2063, + "step": 18446 + }, + { + "epoch": 0.3424883875495223, + "grad_norm": 0.4166910946369171, + "learning_rate": 1.4748779480261456e-05, + "loss": 0.4267, + "step": 18448 + }, + { + "epoch": 0.34252551768694095, + "grad_norm": 0.44949257373809814, + "learning_rate": 1.474775287590434e-05, + "loss": 0.2754, + "step": 18450 + }, + { + "epoch": 0.3425626478243596, + "grad_norm": 0.5518895983695984, + "learning_rate": 1.4746726206944527e-05, + "loss": 0.4205, + "step": 18452 + }, + { + "epoch": 0.34259977796177826, + "grad_norm": 0.37484726309776306, + "learning_rate": 1.4745699473395986e-05, + "loss": 0.2487, + "step": 18454 + }, + { + "epoch": 0.3426369080991969, + "grad_norm": 0.39550405740737915, + "learning_rate": 1.4744672675272695e-05, + "loss": 0.4005, + "step": 18456 + }, + { + "epoch": 0.3426740382366155, + "grad_norm": 0.3381112813949585, + "learning_rate": 1.4743645812588619e-05, + "loss": 0.2921, + "step": 18458 + }, + { + "epoch": 0.34271116837403415, + "grad_norm": 0.38555794954299927, + "learning_rate": 1.4742618885357728e-05, + "loss": 0.2701, + "step": 18460 + }, + { + "epoch": 0.3427482985114528, + "grad_norm": 0.4074064791202545, + "learning_rate": 1.4741591893594004e-05, + "loss": 0.2891, + "step": 18462 + }, + { + "epoch": 0.3427854286488714, + "grad_norm": 0.3551812469959259, + "learning_rate": 1.4740564837311417e-05, + "loss": 0.3131, + "step": 18464 + }, + { + "epoch": 0.3428225587862901, + "grad_norm": 0.4106914699077606, + "learning_rate": 1.4739537716523943e-05, + "loss": 0.2862, + "step": 18466 + }, + { + "epoch": 0.3428596889237087, + "grad_norm": 0.25194311141967773, + "learning_rate": 1.4738510531245556e-05, + "loss": 0.1716, + "step": 18468 + }, + { + "epoch": 0.34289681906112734, + "grad_norm": 0.3113246262073517, + "learning_rate": 1.4737483281490235e-05, + "loss": 0.3441, + "step": 18470 + }, + { + "epoch": 0.342933949198546, + "grad_norm": 0.5459404587745667, + "learning_rate": 1.4736455967271955e-05, + "loss": 0.3171, + "step": 18472 + }, + { + "epoch": 0.3429710793359646, + "grad_norm": 0.3636152446269989, + "learning_rate": 1.4735428588604697e-05, + "loss": 0.4642, + "step": 18474 + }, + { + "epoch": 0.3430082094733833, + "grad_norm": 0.5082035660743713, + "learning_rate": 1.4734401145502442e-05, + "loss": 0.2514, + "step": 18476 + }, + { + "epoch": 0.3430453396108019, + "grad_norm": 0.38087666034698486, + "learning_rate": 1.4733373637979168e-05, + "loss": 0.3558, + "step": 18478 + }, + { + "epoch": 0.34308246974822054, + "grad_norm": 0.4863491356372833, + "learning_rate": 1.4732346066048855e-05, + "loss": 0.4232, + "step": 18480 + }, + { + "epoch": 0.34311959988563917, + "grad_norm": 0.30016979575157166, + "learning_rate": 1.473131842972549e-05, + "loss": 0.4608, + "step": 18482 + }, + { + "epoch": 0.3431567300230578, + "grad_norm": 0.35095369815826416, + "learning_rate": 1.473029072902305e-05, + "loss": 0.4489, + "step": 18484 + }, + { + "epoch": 0.3431938601604764, + "grad_norm": 0.3252183496952057, + "learning_rate": 1.4729262963955524e-05, + "loss": 0.2471, + "step": 18486 + }, + { + "epoch": 0.3432309902978951, + "grad_norm": 0.34476903080940247, + "learning_rate": 1.4728235134536894e-05, + "loss": 0.439, + "step": 18488 + }, + { + "epoch": 0.34326812043531374, + "grad_norm": 0.3564518392086029, + "learning_rate": 1.4727207240781148e-05, + "loss": 0.4229, + "step": 18490 + }, + { + "epoch": 0.34330525057273237, + "grad_norm": 0.365873247385025, + "learning_rate": 1.4726179282702268e-05, + "loss": 0.2776, + "step": 18492 + }, + { + "epoch": 0.343342380710151, + "grad_norm": 4.668469429016113, + "learning_rate": 1.4725151260314248e-05, + "loss": 0.3878, + "step": 18494 + }, + { + "epoch": 0.3433795108475696, + "grad_norm": 0.5209546685218811, + "learning_rate": 1.4724123173631071e-05, + "loss": 0.3809, + "step": 18496 + }, + { + "epoch": 0.3434166409849883, + "grad_norm": 0.3985597491264343, + "learning_rate": 1.472309502266673e-05, + "loss": 0.3819, + "step": 18498 + }, + { + "epoch": 0.34345377112240694, + "grad_norm": 0.41030529141426086, + "learning_rate": 1.4722066807435212e-05, + "loss": 0.2816, + "step": 18500 + }, + { + "epoch": 0.34349090125982557, + "grad_norm": 0.41002827882766724, + "learning_rate": 1.4721038527950507e-05, + "loss": 0.3486, + "step": 18502 + }, + { + "epoch": 0.3435280313972442, + "grad_norm": 0.4368060529232025, + "learning_rate": 1.4720010184226607e-05, + "loss": 0.4878, + "step": 18504 + }, + { + "epoch": 0.3435651615346628, + "grad_norm": 0.33190497756004333, + "learning_rate": 1.471898177627751e-05, + "loss": 0.441, + "step": 18506 + }, + { + "epoch": 0.3436022916720815, + "grad_norm": 0.37431544065475464, + "learning_rate": 1.4717953304117205e-05, + "loss": 0.3051, + "step": 18508 + }, + { + "epoch": 0.34363942180950013, + "grad_norm": 0.24775822460651398, + "learning_rate": 1.471692476775969e-05, + "loss": 0.2774, + "step": 18510 + }, + { + "epoch": 0.34367655194691876, + "grad_norm": 0.3449932038784027, + "learning_rate": 1.4715896167218956e-05, + "loss": 0.3105, + "step": 18512 + }, + { + "epoch": 0.3437136820843374, + "grad_norm": 0.32338500022888184, + "learning_rate": 1.4714867502509001e-05, + "loss": 0.3576, + "step": 18514 + }, + { + "epoch": 0.343750812221756, + "grad_norm": 0.41777700185775757, + "learning_rate": 1.471383877364382e-05, + "loss": 0.2771, + "step": 18516 + }, + { + "epoch": 0.34378794235917465, + "grad_norm": 0.9013303518295288, + "learning_rate": 1.4712809980637414e-05, + "loss": 0.2851, + "step": 18518 + }, + { + "epoch": 0.34382507249659333, + "grad_norm": 0.5362027883529663, + "learning_rate": 1.4711781123503784e-05, + "loss": 0.3007, + "step": 18520 + }, + { + "epoch": 0.34386220263401196, + "grad_norm": 0.32192492485046387, + "learning_rate": 1.4710752202256921e-05, + "loss": 0.3793, + "step": 18522 + }, + { + "epoch": 0.3438993327714306, + "grad_norm": 0.377270370721817, + "learning_rate": 1.4709723216910833e-05, + "loss": 0.1943, + "step": 18524 + }, + { + "epoch": 0.3439364629088492, + "grad_norm": 0.3293372392654419, + "learning_rate": 1.470869416747952e-05, + "loss": 0.3127, + "step": 18526 + }, + { + "epoch": 0.34397359304626784, + "grad_norm": 0.42359501123428345, + "learning_rate": 1.4707665053976982e-05, + "loss": 0.34, + "step": 18528 + }, + { + "epoch": 0.34401072318368653, + "grad_norm": 0.38239607214927673, + "learning_rate": 1.4706635876417226e-05, + "loss": 0.2964, + "step": 18530 + }, + { + "epoch": 0.34404785332110516, + "grad_norm": 0.3367011547088623, + "learning_rate": 1.4705606634814253e-05, + "loss": 0.3975, + "step": 18532 + }, + { + "epoch": 0.3440849834585238, + "grad_norm": 0.2904767394065857, + "learning_rate": 1.4704577329182069e-05, + "loss": 0.1761, + "step": 18534 + }, + { + "epoch": 0.3441221135959424, + "grad_norm": 0.38558465242385864, + "learning_rate": 1.4703547959534677e-05, + "loss": 0.2191, + "step": 18536 + }, + { + "epoch": 0.34415924373336104, + "grad_norm": 0.34732577204704285, + "learning_rate": 1.4702518525886088e-05, + "loss": 0.3606, + "step": 18538 + }, + { + "epoch": 0.34419637387077967, + "grad_norm": 0.45775535702705383, + "learning_rate": 1.4701489028250309e-05, + "loss": 0.2958, + "step": 18540 + }, + { + "epoch": 0.34423350400819835, + "grad_norm": 0.48883628845214844, + "learning_rate": 1.4700459466641344e-05, + "loss": 0.2339, + "step": 18542 + }, + { + "epoch": 0.344270634145617, + "grad_norm": 0.19950418174266815, + "learning_rate": 1.4699429841073204e-05, + "loss": 0.2887, + "step": 18544 + }, + { + "epoch": 0.3443077642830356, + "grad_norm": 0.45074546337127686, + "learning_rate": 1.4698400151559903e-05, + "loss": 0.3845, + "step": 18546 + }, + { + "epoch": 0.34434489442045424, + "grad_norm": 0.41176578402519226, + "learning_rate": 1.4697370398115449e-05, + "loss": 0.2371, + "step": 18548 + }, + { + "epoch": 0.34438202455787287, + "grad_norm": 0.29774734377861023, + "learning_rate": 1.4696340580753854e-05, + "loss": 0.2491, + "step": 18550 + }, + { + "epoch": 0.34441915469529155, + "grad_norm": 0.37725627422332764, + "learning_rate": 1.4695310699489134e-05, + "loss": 0.338, + "step": 18552 + }, + { + "epoch": 0.3444562848327102, + "grad_norm": 0.41958385705947876, + "learning_rate": 1.4694280754335296e-05, + "loss": 0.2931, + "step": 18554 + }, + { + "epoch": 0.3444934149701288, + "grad_norm": 0.315873920917511, + "learning_rate": 1.4693250745306356e-05, + "loss": 0.2888, + "step": 18556 + }, + { + "epoch": 0.34453054510754744, + "grad_norm": 0.42969194054603577, + "learning_rate": 1.4692220672416334e-05, + "loss": 0.3944, + "step": 18558 + }, + { + "epoch": 0.34456767524496607, + "grad_norm": 0.22775757312774658, + "learning_rate": 1.469119053567924e-05, + "loss": 0.1546, + "step": 18560 + }, + { + "epoch": 0.3446048053823847, + "grad_norm": 0.37798747420310974, + "learning_rate": 1.46901603351091e-05, + "loss": 0.2355, + "step": 18562 + }, + { + "epoch": 0.3446419355198034, + "grad_norm": 0.4860764443874359, + "learning_rate": 1.4689130070719926e-05, + "loss": 0.358, + "step": 18564 + }, + { + "epoch": 0.344679065657222, + "grad_norm": 0.3808879852294922, + "learning_rate": 1.4688099742525732e-05, + "loss": 0.3541, + "step": 18566 + }, + { + "epoch": 0.34471619579464063, + "grad_norm": 0.2747688889503479, + "learning_rate": 1.4687069350540545e-05, + "loss": 0.2521, + "step": 18568 + }, + { + "epoch": 0.34475332593205926, + "grad_norm": 0.4016200602054596, + "learning_rate": 1.4686038894778387e-05, + "loss": 0.2727, + "step": 18570 + }, + { + "epoch": 0.3447904560694779, + "grad_norm": 0.691413402557373, + "learning_rate": 1.4685008375253276e-05, + "loss": 0.2286, + "step": 18572 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 0.5206535458564758, + "learning_rate": 1.4683977791979228e-05, + "loss": 0.3718, + "step": 18574 + }, + { + "epoch": 0.3448647163443152, + "grad_norm": 0.23580624163150787, + "learning_rate": 1.4682947144970278e-05, + "loss": 0.3449, + "step": 18576 + }, + { + "epoch": 0.34490184648173383, + "grad_norm": 0.5057315230369568, + "learning_rate": 1.4681916434240442e-05, + "loss": 0.2601, + "step": 18578 + }, + { + "epoch": 0.34493897661915246, + "grad_norm": 0.4236955940723419, + "learning_rate": 1.4680885659803747e-05, + "loss": 0.4124, + "step": 18580 + }, + { + "epoch": 0.3449761067565711, + "grad_norm": 0.578579306602478, + "learning_rate": 1.4679854821674221e-05, + "loss": 0.27, + "step": 18582 + }, + { + "epoch": 0.34501323689398977, + "grad_norm": 0.5346922874450684, + "learning_rate": 1.467882391986589e-05, + "loss": 0.3457, + "step": 18584 + }, + { + "epoch": 0.3450503670314084, + "grad_norm": 0.5471673607826233, + "learning_rate": 1.4677792954392778e-05, + "loss": 0.4815, + "step": 18586 + }, + { + "epoch": 0.34508749716882703, + "grad_norm": 0.3296218514442444, + "learning_rate": 1.4676761925268915e-05, + "loss": 0.2711, + "step": 18588 + }, + { + "epoch": 0.34512462730624566, + "grad_norm": 0.4686400592327118, + "learning_rate": 1.467573083250833e-05, + "loss": 0.3736, + "step": 18590 + }, + { + "epoch": 0.3451617574436643, + "grad_norm": 0.38642826676368713, + "learning_rate": 1.4674699676125055e-05, + "loss": 0.2918, + "step": 18592 + }, + { + "epoch": 0.3451988875810829, + "grad_norm": 0.40722349286079407, + "learning_rate": 1.4673668456133121e-05, + "loss": 0.4454, + "step": 18594 + }, + { + "epoch": 0.3452360177185016, + "grad_norm": 0.20134998857975006, + "learning_rate": 1.4672637172546558e-05, + "loss": 0.3673, + "step": 18596 + }, + { + "epoch": 0.3452731478559202, + "grad_norm": 0.3522738516330719, + "learning_rate": 1.4671605825379396e-05, + "loss": 0.3988, + "step": 18598 + }, + { + "epoch": 0.34531027799333885, + "grad_norm": 0.3568783104419708, + "learning_rate": 1.4670574414645672e-05, + "loss": 0.2853, + "step": 18600 + }, + { + "epoch": 0.3453474081307575, + "grad_norm": 0.46944695711135864, + "learning_rate": 1.4669542940359424e-05, + "loss": 0.285, + "step": 18602 + }, + { + "epoch": 0.3453845382681761, + "grad_norm": 0.5164563655853271, + "learning_rate": 1.4668511402534684e-05, + "loss": 0.2112, + "step": 18604 + }, + { + "epoch": 0.3454216684055948, + "grad_norm": 0.408833771944046, + "learning_rate": 1.4667479801185485e-05, + "loss": 0.3871, + "step": 18606 + }, + { + "epoch": 0.3454587985430134, + "grad_norm": 3.7545087337493896, + "learning_rate": 1.4666448136325869e-05, + "loss": 0.2925, + "step": 18608 + }, + { + "epoch": 0.34549592868043205, + "grad_norm": 0.3766627907752991, + "learning_rate": 1.4665416407969867e-05, + "loss": 0.2477, + "step": 18610 + }, + { + "epoch": 0.3455330588178507, + "grad_norm": 0.3098503649234772, + "learning_rate": 1.4664384616131525e-05, + "loss": 0.3116, + "step": 18612 + }, + { + "epoch": 0.3455701889552693, + "grad_norm": 0.36140838265419006, + "learning_rate": 1.4663352760824881e-05, + "loss": 0.0721, + "step": 18614 + }, + { + "epoch": 0.34560731909268794, + "grad_norm": 0.3446345925331116, + "learning_rate": 1.4662320842063972e-05, + "loss": 0.4233, + "step": 18616 + }, + { + "epoch": 0.3456444492301066, + "grad_norm": 0.4399997293949127, + "learning_rate": 1.4661288859862844e-05, + "loss": 0.3954, + "step": 18618 + }, + { + "epoch": 0.34568157936752525, + "grad_norm": 0.33791542053222656, + "learning_rate": 1.4660256814235536e-05, + "loss": 0.3203, + "step": 18620 + }, + { + "epoch": 0.3457187095049439, + "grad_norm": 0.44381722807884216, + "learning_rate": 1.4659224705196092e-05, + "loss": 0.3669, + "step": 18622 + }, + { + "epoch": 0.3457558396423625, + "grad_norm": 0.4761871099472046, + "learning_rate": 1.4658192532758553e-05, + "loss": 0.1948, + "step": 18624 + }, + { + "epoch": 0.34579296977978113, + "grad_norm": 0.617220401763916, + "learning_rate": 1.465716029693697e-05, + "loss": 0.293, + "step": 18626 + }, + { + "epoch": 0.3458300999171998, + "grad_norm": 0.4558509886264801, + "learning_rate": 1.4656127997745385e-05, + "loss": 0.3223, + "step": 18628 + }, + { + "epoch": 0.34586723005461845, + "grad_norm": 0.2962754964828491, + "learning_rate": 1.4655095635197843e-05, + "loss": 0.1827, + "step": 18630 + }, + { + "epoch": 0.3459043601920371, + "grad_norm": 0.3096504807472229, + "learning_rate": 1.4654063209308392e-05, + "loss": 0.3243, + "step": 18632 + }, + { + "epoch": 0.3459414903294557, + "grad_norm": 0.42045292258262634, + "learning_rate": 1.4653030720091081e-05, + "loss": 0.2591, + "step": 18634 + }, + { + "epoch": 0.34597862046687433, + "grad_norm": 0.4099344313144684, + "learning_rate": 1.4651998167559964e-05, + "loss": 0.2409, + "step": 18636 + }, + { + "epoch": 0.34601575060429296, + "grad_norm": 0.4277016520500183, + "learning_rate": 1.4650965551729081e-05, + "loss": 0.2968, + "step": 18638 + }, + { + "epoch": 0.34605288074171164, + "grad_norm": 0.43297651410102844, + "learning_rate": 1.4649932872612493e-05, + "loss": 0.35, + "step": 18640 + }, + { + "epoch": 0.34609001087913027, + "grad_norm": 0.38874542713165283, + "learning_rate": 1.4648900130224243e-05, + "loss": 0.3985, + "step": 18642 + }, + { + "epoch": 0.3461271410165489, + "grad_norm": 0.43265971541404724, + "learning_rate": 1.4647867324578386e-05, + "loss": 0.195, + "step": 18644 + }, + { + "epoch": 0.34616427115396753, + "grad_norm": 0.331837922334671, + "learning_rate": 1.4646834455688984e-05, + "loss": 0.2876, + "step": 18646 + }, + { + "epoch": 0.34620140129138616, + "grad_norm": 0.5791233777999878, + "learning_rate": 1.4645801523570078e-05, + "loss": 0.3325, + "step": 18648 + }, + { + "epoch": 0.34623853142880484, + "grad_norm": 0.3405318558216095, + "learning_rate": 1.464476852823573e-05, + "loss": 0.2062, + "step": 18650 + }, + { + "epoch": 0.34627566156622347, + "grad_norm": 0.26828306913375854, + "learning_rate": 1.4643735469699996e-05, + "loss": 0.3651, + "step": 18652 + }, + { + "epoch": 0.3463127917036421, + "grad_norm": 0.24720120429992676, + "learning_rate": 1.4642702347976929e-05, + "loss": 0.1551, + "step": 18654 + }, + { + "epoch": 0.3463499218410607, + "grad_norm": 0.4631330072879791, + "learning_rate": 1.4641669163080594e-05, + "loss": 0.1814, + "step": 18656 + }, + { + "epoch": 0.34638705197847935, + "grad_norm": 0.2955108880996704, + "learning_rate": 1.4640635915025043e-05, + "loss": 0.367, + "step": 18658 + }, + { + "epoch": 0.34642418211589804, + "grad_norm": 0.3483012020587921, + "learning_rate": 1.4639602603824336e-05, + "loss": 0.1411, + "step": 18660 + }, + { + "epoch": 0.34646131225331667, + "grad_norm": 0.31474199891090393, + "learning_rate": 1.4638569229492534e-05, + "loss": 0.2085, + "step": 18662 + }, + { + "epoch": 0.3464984423907353, + "grad_norm": 0.29732224345207214, + "learning_rate": 1.4637535792043703e-05, + "loss": 0.202, + "step": 18664 + }, + { + "epoch": 0.3465355725281539, + "grad_norm": 0.6565135717391968, + "learning_rate": 1.4636502291491898e-05, + "loss": 0.4816, + "step": 18666 + }, + { + "epoch": 0.34657270266557255, + "grad_norm": 0.2887673079967499, + "learning_rate": 1.4635468727851185e-05, + "loss": 0.2504, + "step": 18668 + }, + { + "epoch": 0.3466098328029912, + "grad_norm": 0.4279642403125763, + "learning_rate": 1.463443510113563e-05, + "loss": 0.2958, + "step": 18670 + }, + { + "epoch": 0.34664696294040986, + "grad_norm": 0.3540893793106079, + "learning_rate": 1.4633401411359293e-05, + "loss": 0.1684, + "step": 18672 + }, + { + "epoch": 0.3466840930778285, + "grad_norm": 0.46122291684150696, + "learning_rate": 1.463236765853624e-05, + "loss": 0.1376, + "step": 18674 + }, + { + "epoch": 0.3467212232152471, + "grad_norm": 0.4325233995914459, + "learning_rate": 1.463133384268054e-05, + "loss": 0.2256, + "step": 18676 + }, + { + "epoch": 0.34675835335266575, + "grad_norm": 0.48237791657447815, + "learning_rate": 1.4630299963806258e-05, + "loss": 0.3086, + "step": 18678 + }, + { + "epoch": 0.3467954834900844, + "grad_norm": 0.3149224519729614, + "learning_rate": 1.4629266021927466e-05, + "loss": 0.343, + "step": 18680 + }, + { + "epoch": 0.34683261362750306, + "grad_norm": 0.35750582814216614, + "learning_rate": 1.4628232017058226e-05, + "loss": 0.3399, + "step": 18682 + }, + { + "epoch": 0.3468697437649217, + "grad_norm": 0.670259952545166, + "learning_rate": 1.4627197949212615e-05, + "loss": 0.2727, + "step": 18684 + }, + { + "epoch": 0.3469068739023403, + "grad_norm": 0.39398065209388733, + "learning_rate": 1.4626163818404697e-05, + "loss": 0.2913, + "step": 18686 + }, + { + "epoch": 0.34694400403975895, + "grad_norm": 0.5637697577476501, + "learning_rate": 1.462512962464855e-05, + "loss": 0.1917, + "step": 18688 + }, + { + "epoch": 0.3469811341771776, + "grad_norm": 0.3628010153770447, + "learning_rate": 1.462409536795824e-05, + "loss": 0.2279, + "step": 18690 + }, + { + "epoch": 0.3470182643145962, + "grad_norm": 0.41548043489456177, + "learning_rate": 1.462306104834784e-05, + "loss": 0.5091, + "step": 18692 + }, + { + "epoch": 0.3470553944520149, + "grad_norm": 0.32653701305389404, + "learning_rate": 1.462202666583143e-05, + "loss": 0.2071, + "step": 18694 + }, + { + "epoch": 0.3470925245894335, + "grad_norm": 0.39029526710510254, + "learning_rate": 1.4620992220423083e-05, + "loss": 0.2316, + "step": 18696 + }, + { + "epoch": 0.34712965472685214, + "grad_norm": 0.40262460708618164, + "learning_rate": 1.461995771213687e-05, + "loss": 0.4467, + "step": 18698 + }, + { + "epoch": 0.34716678486427077, + "grad_norm": 0.6048769354820251, + "learning_rate": 1.4618923140986873e-05, + "loss": 0.2478, + "step": 18700 + }, + { + "epoch": 0.3472039150016894, + "grad_norm": 0.29295116662979126, + "learning_rate": 1.4617888506987168e-05, + "loss": 0.3061, + "step": 18702 + }, + { + "epoch": 0.3472410451391081, + "grad_norm": 0.35957178473472595, + "learning_rate": 1.461685381015183e-05, + "loss": 0.1717, + "step": 18704 + }, + { + "epoch": 0.3472781752765267, + "grad_norm": 0.34165510535240173, + "learning_rate": 1.4615819050494941e-05, + "loss": 0.3767, + "step": 18706 + }, + { + "epoch": 0.34731530541394534, + "grad_norm": 0.29587599635124207, + "learning_rate": 1.461478422803058e-05, + "loss": 0.2332, + "step": 18708 + }, + { + "epoch": 0.34735243555136397, + "grad_norm": 0.5329194664955139, + "learning_rate": 1.4613749342772834e-05, + "loss": 0.4454, + "step": 18710 + }, + { + "epoch": 0.3473895656887826, + "grad_norm": 0.46350687742233276, + "learning_rate": 1.4612714394735776e-05, + "loss": 0.3278, + "step": 18712 + }, + { + "epoch": 0.3474266958262012, + "grad_norm": 0.36020445823669434, + "learning_rate": 1.4611679383933491e-05, + "loss": 0.1883, + "step": 18714 + }, + { + "epoch": 0.3474638259636199, + "grad_norm": 0.40545451641082764, + "learning_rate": 1.4610644310380064e-05, + "loss": 0.1758, + "step": 18716 + }, + { + "epoch": 0.34750095610103854, + "grad_norm": 0.5108841061592102, + "learning_rate": 1.4609609174089577e-05, + "loss": 0.3112, + "step": 18718 + }, + { + "epoch": 0.34753808623845717, + "grad_norm": 0.3191016614437103, + "learning_rate": 1.460857397507612e-05, + "loss": 0.2535, + "step": 18720 + }, + { + "epoch": 0.3475752163758758, + "grad_norm": 0.3361862897872925, + "learning_rate": 1.4607538713353773e-05, + "loss": 0.5364, + "step": 18722 + }, + { + "epoch": 0.3476123465132944, + "grad_norm": 0.4213963747024536, + "learning_rate": 1.4606503388936624e-05, + "loss": 0.3165, + "step": 18724 + }, + { + "epoch": 0.3476494766507131, + "grad_norm": 0.46971482038497925, + "learning_rate": 1.4605468001838761e-05, + "loss": 0.2962, + "step": 18726 + }, + { + "epoch": 0.34768660678813174, + "grad_norm": 0.20069344341754913, + "learning_rate": 1.4604432552074279e-05, + "loss": 0.2469, + "step": 18728 + }, + { + "epoch": 0.34772373692555036, + "grad_norm": 0.3534761965274811, + "learning_rate": 1.4603397039657256e-05, + "loss": 0.3857, + "step": 18730 + }, + { + "epoch": 0.347760867062969, + "grad_norm": 0.3173171579837799, + "learning_rate": 1.4602361464601792e-05, + "loss": 0.3651, + "step": 18732 + }, + { + "epoch": 0.3477979972003876, + "grad_norm": 0.2487792819738388, + "learning_rate": 1.4601325826921975e-05, + "loss": 0.3731, + "step": 18734 + }, + { + "epoch": 0.3478351273378063, + "grad_norm": 0.28780779242515564, + "learning_rate": 1.460029012663189e-05, + "loss": 0.5058, + "step": 18736 + }, + { + "epoch": 0.34787225747522493, + "grad_norm": 0.44666510820388794, + "learning_rate": 1.4599254363745641e-05, + "loss": 0.3549, + "step": 18738 + }, + { + "epoch": 0.34790938761264356, + "grad_norm": 0.3653266429901123, + "learning_rate": 1.4598218538277316e-05, + "loss": 0.1752, + "step": 18740 + }, + { + "epoch": 0.3479465177500622, + "grad_norm": 0.3699641227722168, + "learning_rate": 1.459718265024101e-05, + "loss": 0.1784, + "step": 18742 + }, + { + "epoch": 0.3479836478874808, + "grad_norm": 0.3862158954143524, + "learning_rate": 1.4596146699650815e-05, + "loss": 0.5148, + "step": 18744 + }, + { + "epoch": 0.34802077802489945, + "grad_norm": 0.3720279037952423, + "learning_rate": 1.4595110686520835e-05, + "loss": 0.2753, + "step": 18746 + }, + { + "epoch": 0.34805790816231813, + "grad_norm": 0.5562987327575684, + "learning_rate": 1.459407461086516e-05, + "loss": 0.3424, + "step": 18748 + }, + { + "epoch": 0.34809503829973676, + "grad_norm": 0.3503073751926422, + "learning_rate": 1.459303847269789e-05, + "loss": 0.3055, + "step": 18750 + }, + { + "epoch": 0.3481321684371554, + "grad_norm": 0.3184279501438141, + "learning_rate": 1.4592002272033124e-05, + "loss": 0.3122, + "step": 18752 + }, + { + "epoch": 0.348169298574574, + "grad_norm": 0.31101447343826294, + "learning_rate": 1.4590966008884964e-05, + "loss": 0.2606, + "step": 18754 + }, + { + "epoch": 0.34820642871199264, + "grad_norm": 0.27976590394973755, + "learning_rate": 1.4589929683267506e-05, + "loss": 0.2799, + "step": 18756 + }, + { + "epoch": 0.3482435588494113, + "grad_norm": 0.28993380069732666, + "learning_rate": 1.4588893295194852e-05, + "loss": 0.084, + "step": 18758 + }, + { + "epoch": 0.34828068898682996, + "grad_norm": 0.30710121989250183, + "learning_rate": 1.458785684468111e-05, + "loss": 0.1164, + "step": 18760 + }, + { + "epoch": 0.3483178191242486, + "grad_norm": 0.36882343888282776, + "learning_rate": 1.4586820331740375e-05, + "loss": 0.2058, + "step": 18762 + }, + { + "epoch": 0.3483549492616672, + "grad_norm": 0.3641347587108612, + "learning_rate": 1.4585783756386758e-05, + "loss": 0.3212, + "step": 18764 + }, + { + "epoch": 0.34839207939908584, + "grad_norm": 0.20711177587509155, + "learning_rate": 1.4584747118634357e-05, + "loss": 0.2762, + "step": 18766 + }, + { + "epoch": 0.34842920953650447, + "grad_norm": 0.2762656509876251, + "learning_rate": 1.4583710418497281e-05, + "loss": 0.2832, + "step": 18768 + }, + { + "epoch": 0.34846633967392315, + "grad_norm": 0.38318726420402527, + "learning_rate": 1.4582673655989636e-05, + "loss": 0.1701, + "step": 18770 + }, + { + "epoch": 0.3485034698113418, + "grad_norm": 0.24246035516262054, + "learning_rate": 1.4581636831125533e-05, + "loss": 0.349, + "step": 18772 + }, + { + "epoch": 0.3485405999487604, + "grad_norm": 0.47852542996406555, + "learning_rate": 1.4580599943919072e-05, + "loss": 0.2527, + "step": 18774 + }, + { + "epoch": 0.34857773008617904, + "grad_norm": 0.410382479429245, + "learning_rate": 1.4579562994384367e-05, + "loss": 0.3348, + "step": 18776 + }, + { + "epoch": 0.34861486022359767, + "grad_norm": 0.35311540961265564, + "learning_rate": 1.4578525982535528e-05, + "loss": 0.2678, + "step": 18778 + }, + { + "epoch": 0.34865199036101635, + "grad_norm": 0.39281854033470154, + "learning_rate": 1.4577488908386662e-05, + "loss": 0.2425, + "step": 18780 + }, + { + "epoch": 0.348689120498435, + "grad_norm": 0.35671600699424744, + "learning_rate": 1.4576451771951884e-05, + "loss": 0.3336, + "step": 18782 + }, + { + "epoch": 0.3487262506358536, + "grad_norm": 0.4285375773906708, + "learning_rate": 1.4575414573245308e-05, + "loss": 0.2162, + "step": 18784 + }, + { + "epoch": 0.34876338077327224, + "grad_norm": 0.31513622403144836, + "learning_rate": 1.4574377312281045e-05, + "loss": 0.2992, + "step": 18786 + }, + { + "epoch": 0.34880051091069086, + "grad_norm": 0.3937511444091797, + "learning_rate": 1.4573339989073205e-05, + "loss": 0.3872, + "step": 18788 + }, + { + "epoch": 0.3488376410481095, + "grad_norm": 0.3312351703643799, + "learning_rate": 1.4572302603635909e-05, + "loss": 0.3135, + "step": 18790 + }, + { + "epoch": 0.3488747711855282, + "grad_norm": 0.26028090715408325, + "learning_rate": 1.4571265155983267e-05, + "loss": 0.3707, + "step": 18792 + }, + { + "epoch": 0.3489119013229468, + "grad_norm": 2.04455304145813, + "learning_rate": 1.4570227646129404e-05, + "loss": 0.4414, + "step": 18794 + }, + { + "epoch": 0.34894903146036543, + "grad_norm": 0.44057872891426086, + "learning_rate": 1.4569190074088429e-05, + "loss": 0.302, + "step": 18796 + }, + { + "epoch": 0.34898616159778406, + "grad_norm": 0.5087441802024841, + "learning_rate": 1.4568152439874463e-05, + "loss": 0.2661, + "step": 18798 + }, + { + "epoch": 0.3490232917352027, + "grad_norm": 0.35400882363319397, + "learning_rate": 1.4567114743501626e-05, + "loss": 0.3644, + "step": 18800 + }, + { + "epoch": 0.3490604218726214, + "grad_norm": 0.3413498103618622, + "learning_rate": 1.4566076984984037e-05, + "loss": 0.3869, + "step": 18802 + }, + { + "epoch": 0.34909755201004, + "grad_norm": 0.46016883850097656, + "learning_rate": 1.4565039164335817e-05, + "loss": 0.5021, + "step": 18804 + }, + { + "epoch": 0.34913468214745863, + "grad_norm": 0.3496893346309662, + "learning_rate": 1.456400128157109e-05, + "loss": 0.624, + "step": 18806 + }, + { + "epoch": 0.34917181228487726, + "grad_norm": 0.30721890926361084, + "learning_rate": 1.4562963336703975e-05, + "loss": 0.3317, + "step": 18808 + }, + { + "epoch": 0.3492089424222959, + "grad_norm": 0.3969864547252655, + "learning_rate": 1.4561925329748599e-05, + "loss": 0.3714, + "step": 18810 + }, + { + "epoch": 0.34924607255971457, + "grad_norm": 0.3184185028076172, + "learning_rate": 1.4560887260719078e-05, + "loss": 0.2846, + "step": 18812 + }, + { + "epoch": 0.3492832026971332, + "grad_norm": 0.3169567286968231, + "learning_rate": 1.4559849129629546e-05, + "loss": 0.3262, + "step": 18814 + }, + { + "epoch": 0.3493203328345518, + "grad_norm": 0.3786376714706421, + "learning_rate": 1.4558810936494127e-05, + "loss": 0.4997, + "step": 18816 + }, + { + "epoch": 0.34935746297197046, + "grad_norm": 0.41073447465896606, + "learning_rate": 1.4557772681326946e-05, + "loss": 0.3819, + "step": 18818 + }, + { + "epoch": 0.3493945931093891, + "grad_norm": 0.26496219635009766, + "learning_rate": 1.4556734364142131e-05, + "loss": 0.1437, + "step": 18820 + }, + { + "epoch": 0.3494317232468077, + "grad_norm": 0.4841495752334595, + "learning_rate": 1.455569598495381e-05, + "loss": 0.2011, + "step": 18822 + }, + { + "epoch": 0.3494688533842264, + "grad_norm": 0.26623275876045227, + "learning_rate": 1.4554657543776111e-05, + "loss": 0.1131, + "step": 18824 + }, + { + "epoch": 0.349505983521645, + "grad_norm": 0.47117680311203003, + "learning_rate": 1.4553619040623168e-05, + "loss": 0.1915, + "step": 18826 + }, + { + "epoch": 0.34954311365906365, + "grad_norm": 0.4823889136314392, + "learning_rate": 1.4552580475509108e-05, + "loss": 0.3308, + "step": 18828 + }, + { + "epoch": 0.3495802437964823, + "grad_norm": 0.35075661540031433, + "learning_rate": 1.4551541848448065e-05, + "loss": 0.304, + "step": 18830 + }, + { + "epoch": 0.3496173739339009, + "grad_norm": 0.35729700326919556, + "learning_rate": 1.455050315945417e-05, + "loss": 0.3433, + "step": 18832 + }, + { + "epoch": 0.3496545040713196, + "grad_norm": 0.4022490382194519, + "learning_rate": 1.4549464408541556e-05, + "loss": 0.1271, + "step": 18834 + }, + { + "epoch": 0.3496916342087382, + "grad_norm": 0.29640525579452515, + "learning_rate": 1.4548425595724363e-05, + "loss": 0.304, + "step": 18836 + }, + { + "epoch": 0.34972876434615685, + "grad_norm": 0.3635883927345276, + "learning_rate": 1.4547386721016719e-05, + "loss": 0.4537, + "step": 18838 + }, + { + "epoch": 0.3497658944835755, + "grad_norm": 0.35302531719207764, + "learning_rate": 1.4546347784432762e-05, + "loss": 0.2853, + "step": 18840 + }, + { + "epoch": 0.3498030246209941, + "grad_norm": 0.4265187382698059, + "learning_rate": 1.4545308785986629e-05, + "loss": 0.2285, + "step": 18842 + }, + { + "epoch": 0.34984015475841274, + "grad_norm": 0.27550208568573, + "learning_rate": 1.454426972569246e-05, + "loss": 0.5115, + "step": 18844 + }, + { + "epoch": 0.3498772848958314, + "grad_norm": 0.37839362025260925, + "learning_rate": 1.4543230603564391e-05, + "loss": 0.3212, + "step": 18846 + }, + { + "epoch": 0.34991441503325005, + "grad_norm": 0.334229439496994, + "learning_rate": 1.4542191419616564e-05, + "loss": 0.2941, + "step": 18848 + }, + { + "epoch": 0.3499515451706687, + "grad_norm": 0.40643075108528137, + "learning_rate": 1.4541152173863115e-05, + "loss": 0.0503, + "step": 18850 + }, + { + "epoch": 0.3499886753080873, + "grad_norm": 0.428376168012619, + "learning_rate": 1.4540112866318188e-05, + "loss": 0.533, + "step": 18852 + }, + { + "epoch": 0.35002580544550593, + "grad_norm": 0.3517612814903259, + "learning_rate": 1.4539073496995923e-05, + "loss": 0.3046, + "step": 18854 + }, + { + "epoch": 0.3500629355829246, + "grad_norm": 0.36351194977760315, + "learning_rate": 1.4538034065910464e-05, + "loss": 0.2042, + "step": 18856 + }, + { + "epoch": 0.35010006572034325, + "grad_norm": 0.30827611684799194, + "learning_rate": 1.4536994573075958e-05, + "loss": 0.2718, + "step": 18858 + }, + { + "epoch": 0.3501371958577619, + "grad_norm": 0.5288692116737366, + "learning_rate": 1.4535955018506543e-05, + "loss": 0.2539, + "step": 18860 + }, + { + "epoch": 0.3501743259951805, + "grad_norm": 0.25314849615097046, + "learning_rate": 1.4534915402216365e-05, + "loss": 0.2002, + "step": 18862 + }, + { + "epoch": 0.35021145613259913, + "grad_norm": 0.29567253589630127, + "learning_rate": 1.4533875724219573e-05, + "loss": 0.2256, + "step": 18864 + }, + { + "epoch": 0.35024858627001776, + "grad_norm": 0.43860554695129395, + "learning_rate": 1.4532835984530316e-05, + "loss": 0.2226, + "step": 18866 + }, + { + "epoch": 0.35028571640743644, + "grad_norm": 0.2843737006187439, + "learning_rate": 1.4531796183162734e-05, + "loss": 0.2969, + "step": 18868 + }, + { + "epoch": 0.35032284654485507, + "grad_norm": 0.34850290417671204, + "learning_rate": 1.4530756320130986e-05, + "loss": 0.0991, + "step": 18870 + }, + { + "epoch": 0.3503599766822737, + "grad_norm": 0.5070458650588989, + "learning_rate": 1.4529716395449211e-05, + "loss": 0.3592, + "step": 18872 + }, + { + "epoch": 0.3503971068196923, + "grad_norm": 0.32836413383483887, + "learning_rate": 1.4528676409131563e-05, + "loss": 0.2995, + "step": 18874 + }, + { + "epoch": 0.35043423695711096, + "grad_norm": 0.5014967322349548, + "learning_rate": 1.4527636361192197e-05, + "loss": 0.194, + "step": 18876 + }, + { + "epoch": 0.35047136709452964, + "grad_norm": 0.2500004172325134, + "learning_rate": 1.4526596251645262e-05, + "loss": 0.3754, + "step": 18878 + }, + { + "epoch": 0.35050849723194827, + "grad_norm": 0.48153412342071533, + "learning_rate": 1.4525556080504911e-05, + "loss": 0.3056, + "step": 18880 + }, + { + "epoch": 0.3505456273693669, + "grad_norm": 0.39096927642822266, + "learning_rate": 1.4524515847785296e-05, + "loss": 0.4848, + "step": 18882 + }, + { + "epoch": 0.3505827575067855, + "grad_norm": 0.27435189485549927, + "learning_rate": 1.4523475553500572e-05, + "loss": 0.1736, + "step": 18884 + }, + { + "epoch": 0.35061988764420415, + "grad_norm": 0.4454135000705719, + "learning_rate": 1.4522435197664897e-05, + "loss": 0.387, + "step": 18886 + }, + { + "epoch": 0.35065701778162284, + "grad_norm": 0.25808340311050415, + "learning_rate": 1.4521394780292424e-05, + "loss": 0.2177, + "step": 18888 + }, + { + "epoch": 0.35069414791904147, + "grad_norm": 0.30163413286209106, + "learning_rate": 1.4520354301397311e-05, + "loss": 0.4891, + "step": 18890 + }, + { + "epoch": 0.3507312780564601, + "grad_norm": 0.3731898367404938, + "learning_rate": 1.4519313760993718e-05, + "loss": 0.2501, + "step": 18892 + }, + { + "epoch": 0.3507684081938787, + "grad_norm": 0.33895304799079895, + "learning_rate": 1.4518273159095799e-05, + "loss": 0.3835, + "step": 18894 + }, + { + "epoch": 0.35080553833129735, + "grad_norm": 0.4123552143573761, + "learning_rate": 1.4517232495717718e-05, + "loss": 0.2735, + "step": 18896 + }, + { + "epoch": 0.350842668468716, + "grad_norm": 0.3857586681842804, + "learning_rate": 1.4516191770873634e-05, + "loss": 0.3535, + "step": 18898 + }, + { + "epoch": 0.35087979860613466, + "grad_norm": 1.3258209228515625, + "learning_rate": 1.4515150984577704e-05, + "loss": 0.1833, + "step": 18900 + }, + { + "epoch": 0.3509169287435533, + "grad_norm": 0.4324641823768616, + "learning_rate": 1.4514110136844098e-05, + "loss": 0.5614, + "step": 18902 + }, + { + "epoch": 0.3509540588809719, + "grad_norm": 0.57988440990448, + "learning_rate": 1.4513069227686971e-05, + "loss": 0.2968, + "step": 18904 + }, + { + "epoch": 0.35099118901839055, + "grad_norm": 0.3326569199562073, + "learning_rate": 1.4512028257120493e-05, + "loss": 0.4654, + "step": 18906 + }, + { + "epoch": 0.3510283191558092, + "grad_norm": 0.5209187865257263, + "learning_rate": 1.4510987225158821e-05, + "loss": 0.4054, + "step": 18908 + }, + { + "epoch": 0.35106544929322786, + "grad_norm": 0.4271082282066345, + "learning_rate": 1.4509946131816128e-05, + "loss": 0.2713, + "step": 18910 + }, + { + "epoch": 0.3511025794306465, + "grad_norm": 0.4181838035583496, + "learning_rate": 1.4508904977106578e-05, + "loss": 0.3248, + "step": 18912 + }, + { + "epoch": 0.3511397095680651, + "grad_norm": 0.37019485235214233, + "learning_rate": 1.4507863761044333e-05, + "loss": 0.2776, + "step": 18914 + }, + { + "epoch": 0.35117683970548375, + "grad_norm": 0.6449816823005676, + "learning_rate": 1.4506822483643568e-05, + "loss": 0.2889, + "step": 18916 + }, + { + "epoch": 0.3512139698429024, + "grad_norm": 0.40885910391807556, + "learning_rate": 1.4505781144918449e-05, + "loss": 0.4017, + "step": 18918 + }, + { + "epoch": 0.351251099980321, + "grad_norm": 0.4887562394142151, + "learning_rate": 1.4504739744883142e-05, + "loss": 0.3366, + "step": 18920 + }, + { + "epoch": 0.3512882301177397, + "grad_norm": 0.2767049968242645, + "learning_rate": 1.4503698283551824e-05, + "loss": 0.1118, + "step": 18922 + }, + { + "epoch": 0.3513253602551583, + "grad_norm": 0.40932509303092957, + "learning_rate": 1.4502656760938662e-05, + "loss": 0.3261, + "step": 18924 + }, + { + "epoch": 0.35136249039257694, + "grad_norm": 0.3227868974208832, + "learning_rate": 1.4501615177057826e-05, + "loss": 0.2119, + "step": 18926 + }, + { + "epoch": 0.35139962052999557, + "grad_norm": 0.5095918774604797, + "learning_rate": 1.4500573531923492e-05, + "loss": 0.3011, + "step": 18928 + }, + { + "epoch": 0.3514367506674142, + "grad_norm": 0.5764751434326172, + "learning_rate": 1.4499531825549832e-05, + "loss": 0.3115, + "step": 18930 + }, + { + "epoch": 0.3514738808048329, + "grad_norm": 0.4265660345554352, + "learning_rate": 1.4498490057951027e-05, + "loss": 0.3725, + "step": 18932 + }, + { + "epoch": 0.3515110109422515, + "grad_norm": 0.39393746852874756, + "learning_rate": 1.449744822914124e-05, + "loss": 0.3337, + "step": 18934 + }, + { + "epoch": 0.35154814107967014, + "grad_norm": 0.3743571639060974, + "learning_rate": 1.4496406339134659e-05, + "loss": 0.3893, + "step": 18936 + }, + { + "epoch": 0.35158527121708877, + "grad_norm": 0.3582865297794342, + "learning_rate": 1.4495364387945452e-05, + "loss": 0.443, + "step": 18938 + }, + { + "epoch": 0.3516224013545074, + "grad_norm": 0.6780370473861694, + "learning_rate": 1.4494322375587802e-05, + "loss": 0.2762, + "step": 18940 + }, + { + "epoch": 0.351659531491926, + "grad_norm": 0.23048317432403564, + "learning_rate": 1.4493280302075889e-05, + "loss": 0.3221, + "step": 18942 + }, + { + "epoch": 0.3516966616293447, + "grad_norm": 0.23651336133480072, + "learning_rate": 1.449223816742389e-05, + "loss": 0.2192, + "step": 18944 + }, + { + "epoch": 0.35173379176676334, + "grad_norm": 0.22723370790481567, + "learning_rate": 1.4491195971645982e-05, + "loss": 0.1957, + "step": 18946 + }, + { + "epoch": 0.35177092190418197, + "grad_norm": 0.3477269411087036, + "learning_rate": 1.4490153714756352e-05, + "loss": 0.1932, + "step": 18948 + }, + { + "epoch": 0.3518080520416006, + "grad_norm": 0.3378618657588959, + "learning_rate": 1.4489111396769177e-05, + "loss": 0.2347, + "step": 18950 + }, + { + "epoch": 0.3518451821790192, + "grad_norm": 0.28808775544166565, + "learning_rate": 1.4488069017698645e-05, + "loss": 0.3441, + "step": 18952 + }, + { + "epoch": 0.3518823123164379, + "grad_norm": 0.3887220323085785, + "learning_rate": 1.4487026577558936e-05, + "loss": 0.3022, + "step": 18954 + }, + { + "epoch": 0.35191944245385653, + "grad_norm": 0.31523799896240234, + "learning_rate": 1.4485984076364236e-05, + "loss": 0.3654, + "step": 18956 + }, + { + "epoch": 0.35195657259127516, + "grad_norm": 0.47469210624694824, + "learning_rate": 1.4484941514128727e-05, + "loss": 0.411, + "step": 18958 + }, + { + "epoch": 0.3519937027286938, + "grad_norm": 0.3684457838535309, + "learning_rate": 1.44838988908666e-05, + "loss": 0.2563, + "step": 18960 + }, + { + "epoch": 0.3520308328661124, + "grad_norm": 0.464172899723053, + "learning_rate": 1.4482856206592042e-05, + "loss": 0.3372, + "step": 18962 + }, + { + "epoch": 0.3520679630035311, + "grad_norm": 0.4620152711868286, + "learning_rate": 1.4481813461319239e-05, + "loss": 0.5468, + "step": 18964 + }, + { + "epoch": 0.35210509314094973, + "grad_norm": 0.37961575388908386, + "learning_rate": 1.4480770655062376e-05, + "loss": 0.2963, + "step": 18966 + }, + { + "epoch": 0.35214222327836836, + "grad_norm": 0.31157928705215454, + "learning_rate": 1.447972778783565e-05, + "loss": 0.2724, + "step": 18968 + }, + { + "epoch": 0.352179353415787, + "grad_norm": 0.2580147385597229, + "learning_rate": 1.447868485965324e-05, + "loss": 0.5055, + "step": 18970 + }, + { + "epoch": 0.3522164835532056, + "grad_norm": 0.294188916683197, + "learning_rate": 1.4477641870529349e-05, + "loss": 0.2431, + "step": 18972 + }, + { + "epoch": 0.35225361369062425, + "grad_norm": 0.3997357487678528, + "learning_rate": 1.4476598820478165e-05, + "loss": 0.3866, + "step": 18974 + }, + { + "epoch": 0.35229074382804293, + "grad_norm": 0.3046589493751526, + "learning_rate": 1.447555570951388e-05, + "loss": 0.3742, + "step": 18976 + }, + { + "epoch": 0.35232787396546156, + "grad_norm": 0.2687056064605713, + "learning_rate": 1.4474512537650685e-05, + "loss": 0.195, + "step": 18978 + }, + { + "epoch": 0.3523650041028802, + "grad_norm": 0.25810807943344116, + "learning_rate": 1.4473469304902779e-05, + "loss": 0.0964, + "step": 18980 + }, + { + "epoch": 0.3524021342402988, + "grad_norm": 0.3868236541748047, + "learning_rate": 1.4472426011284352e-05, + "loss": 0.4079, + "step": 18982 + }, + { + "epoch": 0.35243926437771744, + "grad_norm": 0.3207060694694519, + "learning_rate": 1.4471382656809607e-05, + "loss": 0.1489, + "step": 18984 + }, + { + "epoch": 0.3524763945151361, + "grad_norm": 0.6569737195968628, + "learning_rate": 1.4470339241492738e-05, + "loss": 0.3572, + "step": 18986 + }, + { + "epoch": 0.35251352465255475, + "grad_norm": 0.29283884167671204, + "learning_rate": 1.4469295765347941e-05, + "loss": 0.364, + "step": 18988 + }, + { + "epoch": 0.3525506547899734, + "grad_norm": 0.37402501702308655, + "learning_rate": 1.4468252228389417e-05, + "loss": 0.1169, + "step": 18990 + }, + { + "epoch": 0.352587784927392, + "grad_norm": 0.4235284626483917, + "learning_rate": 1.4467208630631362e-05, + "loss": 0.3852, + "step": 18992 + }, + { + "epoch": 0.35262491506481064, + "grad_norm": 0.39247334003448486, + "learning_rate": 1.4466164972087979e-05, + "loss": 0.2522, + "step": 18994 + }, + { + "epoch": 0.35266204520222927, + "grad_norm": 0.3064703643321991, + "learning_rate": 1.4465121252773468e-05, + "loss": 0.2032, + "step": 18996 + }, + { + "epoch": 0.35269917533964795, + "grad_norm": 0.3349137604236603, + "learning_rate": 1.4464077472702032e-05, + "loss": 0.3044, + "step": 18998 + }, + { + "epoch": 0.3527363054770666, + "grad_norm": 0.40613552927970886, + "learning_rate": 1.4463033631887875e-05, + "loss": 0.3493, + "step": 19000 + }, + { + "epoch": 0.3527734356144852, + "grad_norm": 0.4191027879714966, + "learning_rate": 1.4461989730345193e-05, + "loss": 0.2418, + "step": 19002 + }, + { + "epoch": 0.35281056575190384, + "grad_norm": 0.4051223397254944, + "learning_rate": 1.44609457680882e-05, + "loss": 0.2732, + "step": 19004 + }, + { + "epoch": 0.35284769588932247, + "grad_norm": 0.5641425251960754, + "learning_rate": 1.4459901745131097e-05, + "loss": 0.266, + "step": 19006 + }, + { + "epoch": 0.35288482602674115, + "grad_norm": 0.3093421161174774, + "learning_rate": 1.4458857661488092e-05, + "loss": 0.2755, + "step": 19008 + }, + { + "epoch": 0.3529219561641598, + "grad_norm": 0.4044405519962311, + "learning_rate": 1.4457813517173387e-05, + "loss": 0.2002, + "step": 19010 + }, + { + "epoch": 0.3529590863015784, + "grad_norm": 0.3390043079853058, + "learning_rate": 1.4456769312201195e-05, + "loss": 0.2729, + "step": 19012 + }, + { + "epoch": 0.35299621643899703, + "grad_norm": 0.47962307929992676, + "learning_rate": 1.445572504658572e-05, + "loss": 0.4253, + "step": 19014 + }, + { + "epoch": 0.35303334657641566, + "grad_norm": 0.3191542625427246, + "learning_rate": 1.4454680720341175e-05, + "loss": 0.3936, + "step": 19016 + }, + { + "epoch": 0.3530704767138343, + "grad_norm": 0.39377671480178833, + "learning_rate": 1.4453636333481769e-05, + "loss": 0.2508, + "step": 19018 + }, + { + "epoch": 0.353107606851253, + "grad_norm": 0.3223792016506195, + "learning_rate": 1.4452591886021711e-05, + "loss": 0.3646, + "step": 19020 + }, + { + "epoch": 0.3531447369886716, + "grad_norm": 0.32481256127357483, + "learning_rate": 1.4451547377975214e-05, + "loss": 0.4295, + "step": 19022 + }, + { + "epoch": 0.35318186712609023, + "grad_norm": 0.44496139883995056, + "learning_rate": 1.4450502809356492e-05, + "loss": 0.3073, + "step": 19024 + }, + { + "epoch": 0.35321899726350886, + "grad_norm": 0.5025999546051025, + "learning_rate": 1.4449458180179759e-05, + "loss": 0.4304, + "step": 19026 + }, + { + "epoch": 0.3532561274009275, + "grad_norm": 0.39571231603622437, + "learning_rate": 1.4448413490459228e-05, + "loss": 0.4454, + "step": 19028 + }, + { + "epoch": 0.3532932575383462, + "grad_norm": 0.3054034113883972, + "learning_rate": 1.4447368740209113e-05, + "loss": 0.2294, + "step": 19030 + }, + { + "epoch": 0.3533303876757648, + "grad_norm": 0.3918306529521942, + "learning_rate": 1.4446323929443631e-05, + "loss": 0.46, + "step": 19032 + }, + { + "epoch": 0.35336751781318343, + "grad_norm": 0.46022093296051025, + "learning_rate": 1.4445279058176996e-05, + "loss": 0.3588, + "step": 19034 + }, + { + "epoch": 0.35340464795060206, + "grad_norm": 0.2909444570541382, + "learning_rate": 1.4444234126423431e-05, + "loss": 0.361, + "step": 19036 + }, + { + "epoch": 0.3534417780880207, + "grad_norm": 0.29414787888526917, + "learning_rate": 1.444318913419715e-05, + "loss": 0.3962, + "step": 19038 + }, + { + "epoch": 0.35347890822543937, + "grad_norm": 0.5349095463752747, + "learning_rate": 1.4442144081512376e-05, + "loss": 0.4621, + "step": 19040 + }, + { + "epoch": 0.353516038362858, + "grad_norm": 0.5301125049591064, + "learning_rate": 1.4441098968383325e-05, + "loss": 0.4063, + "step": 19042 + }, + { + "epoch": 0.3535531685002766, + "grad_norm": 0.37551823258399963, + "learning_rate": 1.4440053794824222e-05, + "loss": 0.468, + "step": 19044 + }, + { + "epoch": 0.35359029863769526, + "grad_norm": 0.4179868996143341, + "learning_rate": 1.4439008560849283e-05, + "loss": 0.1533, + "step": 19046 + }, + { + "epoch": 0.3536274287751139, + "grad_norm": 0.3283635675907135, + "learning_rate": 1.4437963266472737e-05, + "loss": 0.196, + "step": 19048 + }, + { + "epoch": 0.3536645589125325, + "grad_norm": 0.28628742694854736, + "learning_rate": 1.4436917911708804e-05, + "loss": 0.3198, + "step": 19050 + }, + { + "epoch": 0.3537016890499512, + "grad_norm": 0.38309386372566223, + "learning_rate": 1.4435872496571705e-05, + "loss": 0.3602, + "step": 19052 + }, + { + "epoch": 0.3537388191873698, + "grad_norm": 0.34142830967903137, + "learning_rate": 1.443482702107567e-05, + "loss": 0.1889, + "step": 19054 + }, + { + "epoch": 0.35377594932478845, + "grad_norm": 0.5507894158363342, + "learning_rate": 1.4433781485234923e-05, + "loss": 0.2195, + "step": 19056 + }, + { + "epoch": 0.3538130794622071, + "grad_norm": 0.32069459557533264, + "learning_rate": 1.4432735889063692e-05, + "loss": 0.366, + "step": 19058 + }, + { + "epoch": 0.3538502095996257, + "grad_norm": 0.30436572432518005, + "learning_rate": 1.4431690232576203e-05, + "loss": 0.314, + "step": 19060 + }, + { + "epoch": 0.3538873397370444, + "grad_norm": 0.5098092555999756, + "learning_rate": 1.4430644515786685e-05, + "loss": 0.2964, + "step": 19062 + }, + { + "epoch": 0.353924469874463, + "grad_norm": 0.20647500455379486, + "learning_rate": 1.4429598738709362e-05, + "loss": 0.2752, + "step": 19064 + }, + { + "epoch": 0.35396160001188165, + "grad_norm": 0.5044084787368774, + "learning_rate": 1.4428552901358472e-05, + "loss": 0.2656, + "step": 19066 + }, + { + "epoch": 0.3539987301493003, + "grad_norm": 0.3877022862434387, + "learning_rate": 1.4427507003748242e-05, + "loss": 0.0961, + "step": 19068 + }, + { + "epoch": 0.3540358602867189, + "grad_norm": 0.40637773275375366, + "learning_rate": 1.4426461045892904e-05, + "loss": 0.2014, + "step": 19070 + }, + { + "epoch": 0.35407299042413753, + "grad_norm": 0.34401699900627136, + "learning_rate": 1.442541502780669e-05, + "loss": 0.1265, + "step": 19072 + }, + { + "epoch": 0.3541101205615562, + "grad_norm": 0.4048682451248169, + "learning_rate": 1.4424368949503832e-05, + "loss": 0.0969, + "step": 19074 + }, + { + "epoch": 0.35414725069897485, + "grad_norm": 0.3697626292705536, + "learning_rate": 1.4423322810998563e-05, + "loss": 0.2951, + "step": 19076 + }, + { + "epoch": 0.3541843808363935, + "grad_norm": 0.363372266292572, + "learning_rate": 1.4422276612305122e-05, + "loss": 0.1346, + "step": 19078 + }, + { + "epoch": 0.3542215109738121, + "grad_norm": 0.31801411509513855, + "learning_rate": 1.4421230353437744e-05, + "loss": 0.4545, + "step": 19080 + }, + { + "epoch": 0.35425864111123073, + "grad_norm": 0.4249822497367859, + "learning_rate": 1.4420184034410662e-05, + "loss": 0.2492, + "step": 19082 + }, + { + "epoch": 0.3542957712486494, + "grad_norm": 0.30824634432792664, + "learning_rate": 1.4419137655238115e-05, + "loss": 0.1225, + "step": 19084 + }, + { + "epoch": 0.35433290138606804, + "grad_norm": 0.4208660423755646, + "learning_rate": 1.4418091215934342e-05, + "loss": 0.4945, + "step": 19086 + }, + { + "epoch": 0.3543700315234867, + "grad_norm": 0.6723758578300476, + "learning_rate": 1.4417044716513583e-05, + "loss": 0.4782, + "step": 19088 + }, + { + "epoch": 0.3544071616609053, + "grad_norm": 0.44197720289230347, + "learning_rate": 1.4415998156990074e-05, + "loss": 0.3774, + "step": 19090 + }, + { + "epoch": 0.35444429179832393, + "grad_norm": 0.28031468391418457, + "learning_rate": 1.4414951537378059e-05, + "loss": 0.2776, + "step": 19092 + }, + { + "epoch": 0.35448142193574256, + "grad_norm": 0.3647652268409729, + "learning_rate": 1.4413904857691778e-05, + "loss": 0.1988, + "step": 19094 + }, + { + "epoch": 0.35451855207316124, + "grad_norm": 0.29621008038520813, + "learning_rate": 1.4412858117945473e-05, + "loss": 0.4334, + "step": 19096 + }, + { + "epoch": 0.35455568221057987, + "grad_norm": 0.29490378499031067, + "learning_rate": 1.4411811318153383e-05, + "loss": 0.4856, + "step": 19098 + }, + { + "epoch": 0.3545928123479985, + "grad_norm": 0.3255443871021271, + "learning_rate": 1.441076445832976e-05, + "loss": 0.2549, + "step": 19100 + }, + { + "epoch": 0.3546299424854171, + "grad_norm": 0.4909319579601288, + "learning_rate": 1.4409717538488845e-05, + "loss": 0.256, + "step": 19102 + }, + { + "epoch": 0.35466707262283576, + "grad_norm": 0.40481331944465637, + "learning_rate": 1.440867055864488e-05, + "loss": 0.2274, + "step": 19104 + }, + { + "epoch": 0.35470420276025444, + "grad_norm": 0.46618103981018066, + "learning_rate": 1.4407623518812117e-05, + "loss": 0.1934, + "step": 19106 + }, + { + "epoch": 0.35474133289767307, + "grad_norm": 0.38732388615608215, + "learning_rate": 1.4406576419004798e-05, + "loss": 0.3747, + "step": 19108 + }, + { + "epoch": 0.3547784630350917, + "grad_norm": 0.4707793593406677, + "learning_rate": 1.4405529259237172e-05, + "loss": 0.3053, + "step": 19110 + }, + { + "epoch": 0.3548155931725103, + "grad_norm": 0.4835131764411926, + "learning_rate": 1.4404482039523492e-05, + "loss": 0.239, + "step": 19112 + }, + { + "epoch": 0.35485272330992895, + "grad_norm": 0.41828829050064087, + "learning_rate": 1.4403434759878005e-05, + "loss": 0.2935, + "step": 19114 + }, + { + "epoch": 0.35488985344734764, + "grad_norm": 0.44106265902519226, + "learning_rate": 1.4402387420314959e-05, + "loss": 0.2116, + "step": 19116 + }, + { + "epoch": 0.35492698358476626, + "grad_norm": 0.4473205804824829, + "learning_rate": 1.4401340020848608e-05, + "loss": 0.3158, + "step": 19118 + }, + { + "epoch": 0.3549641137221849, + "grad_norm": 0.4244312345981598, + "learning_rate": 1.44002925614932e-05, + "loss": 0.6686, + "step": 19120 + }, + { + "epoch": 0.3550012438596035, + "grad_norm": 0.4721492528915405, + "learning_rate": 1.4399245042262991e-05, + "loss": 0.405, + "step": 19122 + }, + { + "epoch": 0.35503837399702215, + "grad_norm": 0.4620789885520935, + "learning_rate": 1.4398197463172236e-05, + "loss": 0.3029, + "step": 19124 + }, + { + "epoch": 0.3550755041344408, + "grad_norm": 0.30846601724624634, + "learning_rate": 1.439714982423519e-05, + "loss": 0.3605, + "step": 19126 + }, + { + "epoch": 0.35511263427185946, + "grad_norm": 0.34849846363067627, + "learning_rate": 1.4396102125466103e-05, + "loss": 0.2225, + "step": 19128 + }, + { + "epoch": 0.3551497644092781, + "grad_norm": 0.38298845291137695, + "learning_rate": 1.4395054366879232e-05, + "loss": 0.3043, + "step": 19130 + }, + { + "epoch": 0.3551868945466967, + "grad_norm": 0.360188752412796, + "learning_rate": 1.4394006548488839e-05, + "loss": 0.6272, + "step": 19132 + }, + { + "epoch": 0.35522402468411535, + "grad_norm": 0.504703164100647, + "learning_rate": 1.4392958670309178e-05, + "loss": 0.2917, + "step": 19134 + }, + { + "epoch": 0.355261154821534, + "grad_norm": 0.5333461165428162, + "learning_rate": 1.4391910732354508e-05, + "loss": 0.4409, + "step": 19136 + }, + { + "epoch": 0.35529828495895266, + "grad_norm": 0.33476823568344116, + "learning_rate": 1.4390862734639086e-05, + "loss": 0.3322, + "step": 19138 + }, + { + "epoch": 0.3553354150963713, + "grad_norm": 0.4086160957813263, + "learning_rate": 1.4389814677177177e-05, + "loss": 0.4502, + "step": 19140 + }, + { + "epoch": 0.3553725452337899, + "grad_norm": 0.30259355902671814, + "learning_rate": 1.4388766559983036e-05, + "loss": 0.2849, + "step": 19142 + }, + { + "epoch": 0.35540967537120854, + "grad_norm": 0.3576435148715973, + "learning_rate": 1.4387718383070932e-05, + "loss": 0.3169, + "step": 19144 + }, + { + "epoch": 0.3554468055086272, + "grad_norm": 0.40742558240890503, + "learning_rate": 1.4386670146455122e-05, + "loss": 0.2034, + "step": 19146 + }, + { + "epoch": 0.3554839356460458, + "grad_norm": 0.3722027540206909, + "learning_rate": 1.4385621850149872e-05, + "loss": 0.1926, + "step": 19148 + }, + { + "epoch": 0.3555210657834645, + "grad_norm": 0.2623305916786194, + "learning_rate": 1.4384573494169442e-05, + "loss": 0.4584, + "step": 19150 + }, + { + "epoch": 0.3555581959208831, + "grad_norm": 0.4020449221134186, + "learning_rate": 1.4383525078528101e-05, + "loss": 0.3635, + "step": 19152 + }, + { + "epoch": 0.35559532605830174, + "grad_norm": 0.36772334575653076, + "learning_rate": 1.4382476603240113e-05, + "loss": 0.3094, + "step": 19154 + }, + { + "epoch": 0.35563245619572037, + "grad_norm": 0.8524356484413147, + "learning_rate": 1.4381428068319752e-05, + "loss": 0.2937, + "step": 19156 + }, + { + "epoch": 0.355669586333139, + "grad_norm": 0.3645341992378235, + "learning_rate": 1.4380379473781272e-05, + "loss": 0.3304, + "step": 19158 + }, + { + "epoch": 0.3557067164705577, + "grad_norm": 0.5001469850540161, + "learning_rate": 1.437933081963895e-05, + "loss": 0.2521, + "step": 19160 + }, + { + "epoch": 0.3557438466079763, + "grad_norm": 0.3354986011981964, + "learning_rate": 1.4378282105907054e-05, + "loss": 0.3293, + "step": 19162 + }, + { + "epoch": 0.35578097674539494, + "grad_norm": 0.4423581659793854, + "learning_rate": 1.4377233332599852e-05, + "loss": 0.2792, + "step": 19164 + }, + { + "epoch": 0.35581810688281357, + "grad_norm": 0.2927844822406769, + "learning_rate": 1.4376184499731617e-05, + "loss": 0.3956, + "step": 19166 + }, + { + "epoch": 0.3558552370202322, + "grad_norm": 0.4370005130767822, + "learning_rate": 1.4375135607316614e-05, + "loss": 0.2585, + "step": 19168 + }, + { + "epoch": 0.3558923671576508, + "grad_norm": 0.7163337469100952, + "learning_rate": 1.4374086655369129e-05, + "loss": 0.349, + "step": 19170 + }, + { + "epoch": 0.3559294972950695, + "grad_norm": 0.457614928483963, + "learning_rate": 1.4373037643903419e-05, + "loss": 0.2962, + "step": 19172 + }, + { + "epoch": 0.35596662743248814, + "grad_norm": 0.3752543032169342, + "learning_rate": 1.4371988572933768e-05, + "loss": 0.3875, + "step": 19174 + }, + { + "epoch": 0.35600375756990676, + "grad_norm": 0.4762692451477051, + "learning_rate": 1.437093944247445e-05, + "loss": 0.3106, + "step": 19176 + }, + { + "epoch": 0.3560408877073254, + "grad_norm": 0.3628615438938141, + "learning_rate": 1.4369890252539738e-05, + "loss": 0.4215, + "step": 19178 + }, + { + "epoch": 0.356078017844744, + "grad_norm": 0.5029466152191162, + "learning_rate": 1.4368841003143908e-05, + "loss": 0.1494, + "step": 19180 + }, + { + "epoch": 0.3561151479821627, + "grad_norm": 0.37303322553634644, + "learning_rate": 1.4367791694301238e-05, + "loss": 0.3318, + "step": 19182 + }, + { + "epoch": 0.35615227811958133, + "grad_norm": 0.38239043951034546, + "learning_rate": 1.4366742326026004e-05, + "loss": 0.2978, + "step": 19184 + }, + { + "epoch": 0.35618940825699996, + "grad_norm": 0.3466437757015228, + "learning_rate": 1.436569289833249e-05, + "loss": 0.4416, + "step": 19186 + }, + { + "epoch": 0.3562265383944186, + "grad_norm": 0.39457646012306213, + "learning_rate": 1.4364643411234976e-05, + "loss": 0.3742, + "step": 19188 + }, + { + "epoch": 0.3562636685318372, + "grad_norm": 0.449174702167511, + "learning_rate": 1.4363593864747732e-05, + "loss": 0.3012, + "step": 19190 + }, + { + "epoch": 0.3563007986692559, + "grad_norm": 0.4582138955593109, + "learning_rate": 1.4362544258885046e-05, + "loss": 0.2818, + "step": 19192 + }, + { + "epoch": 0.35633792880667453, + "grad_norm": 0.348893404006958, + "learning_rate": 1.4361494593661205e-05, + "loss": 0.3953, + "step": 19194 + }, + { + "epoch": 0.35637505894409316, + "grad_norm": 0.2620885372161865, + "learning_rate": 1.4360444869090484e-05, + "loss": 0.177, + "step": 19196 + }, + { + "epoch": 0.3564121890815118, + "grad_norm": 0.3366757035255432, + "learning_rate": 1.4359395085187169e-05, + "loss": 0.4193, + "step": 19198 + }, + { + "epoch": 0.3564493192189304, + "grad_norm": 0.46537330746650696, + "learning_rate": 1.4358345241965548e-05, + "loss": 0.5712, + "step": 19200 + }, + { + "epoch": 0.35648644935634904, + "grad_norm": 0.42223677039146423, + "learning_rate": 1.4357295339439899e-05, + "loss": 0.3058, + "step": 19202 + }, + { + "epoch": 0.35652357949376773, + "grad_norm": 0.4203982949256897, + "learning_rate": 1.4356245377624514e-05, + "loss": 0.1256, + "step": 19204 + }, + { + "epoch": 0.35656070963118636, + "grad_norm": 0.30981892347335815, + "learning_rate": 1.4355195356533675e-05, + "loss": 0.2917, + "step": 19206 + }, + { + "epoch": 0.356597839768605, + "grad_norm": 0.29343441128730774, + "learning_rate": 1.4354145276181677e-05, + "loss": 0.2114, + "step": 19208 + }, + { + "epoch": 0.3566349699060236, + "grad_norm": 0.6155351400375366, + "learning_rate": 1.4353095136582802e-05, + "loss": 0.2353, + "step": 19210 + }, + { + "epoch": 0.35667210004344224, + "grad_norm": 0.3946570158004761, + "learning_rate": 1.435204493775134e-05, + "loss": 0.1984, + "step": 19212 + }, + { + "epoch": 0.3567092301808609, + "grad_norm": 0.5555511713027954, + "learning_rate": 1.4350994679701584e-05, + "loss": 0.4287, + "step": 19214 + }, + { + "epoch": 0.35674636031827955, + "grad_norm": 1.17799711227417, + "learning_rate": 1.434994436244782e-05, + "loss": 0.2128, + "step": 19216 + }, + { + "epoch": 0.3567834904556982, + "grad_norm": 0.29123303294181824, + "learning_rate": 1.4348893986004349e-05, + "loss": 0.399, + "step": 19218 + }, + { + "epoch": 0.3568206205931168, + "grad_norm": 0.5263556241989136, + "learning_rate": 1.4347843550385454e-05, + "loss": 0.1962, + "step": 19220 + }, + { + "epoch": 0.35685775073053544, + "grad_norm": 0.23480884730815887, + "learning_rate": 1.434679305560543e-05, + "loss": 0.327, + "step": 19222 + }, + { + "epoch": 0.35689488086795407, + "grad_norm": 0.32617440819740295, + "learning_rate": 1.4345742501678572e-05, + "loss": 0.3877, + "step": 19224 + }, + { + "epoch": 0.35693201100537275, + "grad_norm": 0.35131189227104187, + "learning_rate": 1.4344691888619177e-05, + "loss": 0.2187, + "step": 19226 + }, + { + "epoch": 0.3569691411427914, + "grad_norm": 0.5032466650009155, + "learning_rate": 1.434364121644154e-05, + "loss": 0.3571, + "step": 19228 + }, + { + "epoch": 0.35700627128021, + "grad_norm": 0.2917448878288269, + "learning_rate": 1.4342590485159957e-05, + "loss": 0.2341, + "step": 19230 + }, + { + "epoch": 0.35704340141762864, + "grad_norm": 0.44347652792930603, + "learning_rate": 1.4341539694788726e-05, + "loss": 0.3934, + "step": 19232 + }, + { + "epoch": 0.35708053155504726, + "grad_norm": 0.35745173692703247, + "learning_rate": 1.4340488845342142e-05, + "loss": 0.3543, + "step": 19234 + }, + { + "epoch": 0.35711766169246595, + "grad_norm": 0.4164411425590515, + "learning_rate": 1.4339437936834506e-05, + "loss": 0.5095, + "step": 19236 + }, + { + "epoch": 0.3571547918298846, + "grad_norm": 0.3867277204990387, + "learning_rate": 1.433838696928012e-05, + "loss": 0.3299, + "step": 19238 + }, + { + "epoch": 0.3571919219673032, + "grad_norm": 0.9293546676635742, + "learning_rate": 1.4337335942693282e-05, + "loss": 0.3052, + "step": 19240 + }, + { + "epoch": 0.35722905210472183, + "grad_norm": 0.3814060389995575, + "learning_rate": 1.433628485708829e-05, + "loss": 0.32, + "step": 19242 + }, + { + "epoch": 0.35726618224214046, + "grad_norm": 0.34118515253067017, + "learning_rate": 1.4335233712479454e-05, + "loss": 0.3605, + "step": 19244 + }, + { + "epoch": 0.3573033123795591, + "grad_norm": 0.42078039050102234, + "learning_rate": 1.4334182508881069e-05, + "loss": 0.2718, + "step": 19246 + }, + { + "epoch": 0.3573404425169778, + "grad_norm": 0.31590035557746887, + "learning_rate": 1.4333131246307445e-05, + "loss": 0.1826, + "step": 19248 + }, + { + "epoch": 0.3573775726543964, + "grad_norm": 0.3172628879547119, + "learning_rate": 1.4332079924772885e-05, + "loss": 0.1794, + "step": 19250 + }, + { + "epoch": 0.35741470279181503, + "grad_norm": 0.49243366718292236, + "learning_rate": 1.4331028544291692e-05, + "loss": 0.2191, + "step": 19252 + }, + { + "epoch": 0.35745183292923366, + "grad_norm": 0.42085498571395874, + "learning_rate": 1.4329977104878172e-05, + "loss": 0.1693, + "step": 19254 + }, + { + "epoch": 0.3574889630666523, + "grad_norm": 0.3401266038417816, + "learning_rate": 1.4328925606546634e-05, + "loss": 0.1716, + "step": 19256 + }, + { + "epoch": 0.35752609320407097, + "grad_norm": 0.2822279930114746, + "learning_rate": 1.4327874049311386e-05, + "loss": 0.2452, + "step": 19258 + }, + { + "epoch": 0.3575632233414896, + "grad_norm": 0.28478866815567017, + "learning_rate": 1.4326822433186735e-05, + "loss": 0.2706, + "step": 19260 + }, + { + "epoch": 0.35760035347890823, + "grad_norm": 0.3893972635269165, + "learning_rate": 1.4325770758186993e-05, + "loss": 0.3115, + "step": 19262 + }, + { + "epoch": 0.35763748361632686, + "grad_norm": 0.34105637669563293, + "learning_rate": 1.4324719024326465e-05, + "loss": 0.2059, + "step": 19264 + }, + { + "epoch": 0.3576746137537455, + "grad_norm": 0.598537802696228, + "learning_rate": 1.4323667231619467e-05, + "loss": 0.3632, + "step": 19266 + }, + { + "epoch": 0.35771174389116417, + "grad_norm": 0.4352877140045166, + "learning_rate": 1.4322615380080307e-05, + "loss": 0.247, + "step": 19268 + }, + { + "epoch": 0.3577488740285828, + "grad_norm": 0.5066704750061035, + "learning_rate": 1.4321563469723302e-05, + "loss": 0.323, + "step": 19270 + }, + { + "epoch": 0.3577860041660014, + "grad_norm": 0.3929022252559662, + "learning_rate": 1.4320511500562763e-05, + "loss": 0.3433, + "step": 19272 + }, + { + "epoch": 0.35782313430342005, + "grad_norm": 0.3574424088001251, + "learning_rate": 1.4319459472613002e-05, + "loss": 0.2943, + "step": 19274 + }, + { + "epoch": 0.3578602644408387, + "grad_norm": 0.3119262456893921, + "learning_rate": 1.4318407385888339e-05, + "loss": 0.3586, + "step": 19276 + }, + { + "epoch": 0.3578973945782573, + "grad_norm": 0.2621961236000061, + "learning_rate": 1.4317355240403082e-05, + "loss": 0.2365, + "step": 19278 + }, + { + "epoch": 0.357934524715676, + "grad_norm": 0.45818644762039185, + "learning_rate": 1.4316303036171553e-05, + "loss": 0.299, + "step": 19280 + }, + { + "epoch": 0.3579716548530946, + "grad_norm": 0.4423239529132843, + "learning_rate": 1.431525077320807e-05, + "loss": 0.3084, + "step": 19282 + }, + { + "epoch": 0.35800878499051325, + "grad_norm": 0.38348856568336487, + "learning_rate": 1.431419845152695e-05, + "loss": 0.3641, + "step": 19284 + }, + { + "epoch": 0.3580459151279319, + "grad_norm": 0.21954844892024994, + "learning_rate": 1.431314607114251e-05, + "loss": 0.3943, + "step": 19286 + }, + { + "epoch": 0.3580830452653505, + "grad_norm": 0.3987937569618225, + "learning_rate": 1.431209363206907e-05, + "loss": 0.2648, + "step": 19288 + }, + { + "epoch": 0.3581201754027692, + "grad_norm": 0.2781100571155548, + "learning_rate": 1.4311041134320958e-05, + "loss": 0.3875, + "step": 19290 + }, + { + "epoch": 0.3581573055401878, + "grad_norm": 0.3760882019996643, + "learning_rate": 1.4309988577912481e-05, + "loss": 0.3152, + "step": 19292 + }, + { + "epoch": 0.35819443567760645, + "grad_norm": 0.25897979736328125, + "learning_rate": 1.4308935962857978e-05, + "loss": 0.2846, + "step": 19294 + }, + { + "epoch": 0.3582315658150251, + "grad_norm": 0.47104737162590027, + "learning_rate": 1.4307883289171758e-05, + "loss": 0.3588, + "step": 19296 + }, + { + "epoch": 0.3582686959524437, + "grad_norm": 0.3214458227157593, + "learning_rate": 1.430683055686815e-05, + "loss": 0.3748, + "step": 19298 + }, + { + "epoch": 0.35830582608986233, + "grad_norm": 0.5017143487930298, + "learning_rate": 1.4305777765961479e-05, + "loss": 0.4127, + "step": 19300 + }, + { + "epoch": 0.358342956227281, + "grad_norm": 0.2939670979976654, + "learning_rate": 1.430472491646607e-05, + "loss": 0.2129, + "step": 19302 + }, + { + "epoch": 0.35838008636469965, + "grad_norm": 0.41886237263679504, + "learning_rate": 1.4303672008396251e-05, + "loss": 0.3259, + "step": 19304 + }, + { + "epoch": 0.3584172165021183, + "grad_norm": 0.41379138827323914, + "learning_rate": 1.4302619041766345e-05, + "loss": 0.285, + "step": 19306 + }, + { + "epoch": 0.3584543466395369, + "grad_norm": 0.29987138509750366, + "learning_rate": 1.4301566016590684e-05, + "loss": 0.3014, + "step": 19308 + }, + { + "epoch": 0.35849147677695553, + "grad_norm": 0.5723175406455994, + "learning_rate": 1.4300512932883593e-05, + "loss": 0.3322, + "step": 19310 + }, + { + "epoch": 0.3585286069143742, + "grad_norm": 0.36991435289382935, + "learning_rate": 1.42994597906594e-05, + "loss": 0.3981, + "step": 19312 + }, + { + "epoch": 0.35856573705179284, + "grad_norm": 0.4879956841468811, + "learning_rate": 1.429840658993244e-05, + "loss": 0.2317, + "step": 19314 + }, + { + "epoch": 0.35860286718921147, + "grad_norm": 0.2696889638900757, + "learning_rate": 1.4297353330717041e-05, + "loss": 0.2703, + "step": 19316 + }, + { + "epoch": 0.3586399973266301, + "grad_norm": 0.4427299499511719, + "learning_rate": 1.4296300013027535e-05, + "loss": 0.2024, + "step": 19318 + }, + { + "epoch": 0.35867712746404873, + "grad_norm": 0.35004928708076477, + "learning_rate": 1.4295246636878256e-05, + "loss": 0.4206, + "step": 19320 + }, + { + "epoch": 0.35871425760146736, + "grad_norm": 0.310921311378479, + "learning_rate": 1.4294193202283535e-05, + "loss": 0.2024, + "step": 19322 + }, + { + "epoch": 0.35875138773888604, + "grad_norm": 0.4394582509994507, + "learning_rate": 1.4293139709257708e-05, + "loss": 0.3806, + "step": 19324 + }, + { + "epoch": 0.35878851787630467, + "grad_norm": 0.31655433773994446, + "learning_rate": 1.4292086157815108e-05, + "loss": 0.3274, + "step": 19326 + }, + { + "epoch": 0.3588256480137233, + "grad_norm": 0.427811861038208, + "learning_rate": 1.4291032547970073e-05, + "loss": 0.3271, + "step": 19328 + }, + { + "epoch": 0.3588627781511419, + "grad_norm": 0.383688747882843, + "learning_rate": 1.4289978879736934e-05, + "loss": 0.1742, + "step": 19330 + }, + { + "epoch": 0.35889990828856055, + "grad_norm": 0.3519056737422943, + "learning_rate": 1.4288925153130036e-05, + "loss": 0.401, + "step": 19332 + }, + { + "epoch": 0.35893703842597924, + "grad_norm": 0.4023473560810089, + "learning_rate": 1.4287871368163712e-05, + "loss": 0.2115, + "step": 19334 + }, + { + "epoch": 0.35897416856339787, + "grad_norm": 0.4246709644794464, + "learning_rate": 1.4286817524852306e-05, + "loss": 0.4124, + "step": 19336 + }, + { + "epoch": 0.3590112987008165, + "grad_norm": 0.42302870750427246, + "learning_rate": 1.428576362321015e-05, + "loss": 0.3036, + "step": 19338 + }, + { + "epoch": 0.3590484288382351, + "grad_norm": 0.44748467206954956, + "learning_rate": 1.428470966325159e-05, + "loss": 0.4582, + "step": 19340 + }, + { + "epoch": 0.35908555897565375, + "grad_norm": 0.3671167492866516, + "learning_rate": 1.4283655644990963e-05, + "loss": 0.1495, + "step": 19342 + }, + { + "epoch": 0.3591226891130724, + "grad_norm": 0.55576092004776, + "learning_rate": 1.4282601568442615e-05, + "loss": 0.2972, + "step": 19344 + }, + { + "epoch": 0.35915981925049106, + "grad_norm": 0.3427821695804596, + "learning_rate": 1.428154743362089e-05, + "loss": 0.2152, + "step": 19346 + }, + { + "epoch": 0.3591969493879097, + "grad_norm": 0.3411884307861328, + "learning_rate": 1.4280493240540129e-05, + "loss": 0.3299, + "step": 19348 + }, + { + "epoch": 0.3592340795253283, + "grad_norm": 0.5980251431465149, + "learning_rate": 1.4279438989214676e-05, + "loss": 0.2164, + "step": 19350 + }, + { + "epoch": 0.35927120966274695, + "grad_norm": 0.7066493034362793, + "learning_rate": 1.4278384679658876e-05, + "loss": 0.4806, + "step": 19352 + }, + { + "epoch": 0.3593083398001656, + "grad_norm": 0.49866965413093567, + "learning_rate": 1.4277330311887077e-05, + "loss": 0.3272, + "step": 19354 + }, + { + "epoch": 0.35934546993758426, + "grad_norm": 0.2560674846172333, + "learning_rate": 1.4276275885913622e-05, + "loss": 0.3135, + "step": 19356 + }, + { + "epoch": 0.3593826000750029, + "grad_norm": 0.43878498673439026, + "learning_rate": 1.4275221401752864e-05, + "loss": 0.1221, + "step": 19358 + }, + { + "epoch": 0.3594197302124215, + "grad_norm": 0.3382638990879059, + "learning_rate": 1.4274166859419146e-05, + "loss": 0.3064, + "step": 19360 + }, + { + "epoch": 0.35945686034984015, + "grad_norm": 0.29951754212379456, + "learning_rate": 1.427311225892682e-05, + "loss": 0.2599, + "step": 19362 + }, + { + "epoch": 0.3594939904872588, + "grad_norm": 0.37848109006881714, + "learning_rate": 1.4272057600290236e-05, + "loss": 0.3596, + "step": 19364 + }, + { + "epoch": 0.35953112062467746, + "grad_norm": 0.527595043182373, + "learning_rate": 1.4271002883523743e-05, + "loss": 0.5082, + "step": 19366 + }, + { + "epoch": 0.3595682507620961, + "grad_norm": 0.47584301233291626, + "learning_rate": 1.4269948108641698e-05, + "loss": 0.2329, + "step": 19368 + }, + { + "epoch": 0.3596053808995147, + "grad_norm": 0.27808305621147156, + "learning_rate": 1.4268893275658449e-05, + "loss": 0.29, + "step": 19370 + }, + { + "epoch": 0.35964251103693334, + "grad_norm": 0.41396379470825195, + "learning_rate": 1.4267838384588343e-05, + "loss": 0.14, + "step": 19372 + }, + { + "epoch": 0.35967964117435197, + "grad_norm": 0.3349596858024597, + "learning_rate": 1.4266783435445746e-05, + "loss": 0.4614, + "step": 19374 + }, + { + "epoch": 0.3597167713117706, + "grad_norm": 0.4447016417980194, + "learning_rate": 1.4265728428245002e-05, + "loss": 0.3802, + "step": 19376 + }, + { + "epoch": 0.3597539014491893, + "grad_norm": 0.485930860042572, + "learning_rate": 1.4264673363000478e-05, + "loss": 0.3623, + "step": 19378 + }, + { + "epoch": 0.3597910315866079, + "grad_norm": 0.6662027835845947, + "learning_rate": 1.426361823972652e-05, + "loss": 0.211, + "step": 19380 + }, + { + "epoch": 0.35982816172402654, + "grad_norm": 0.49511489272117615, + "learning_rate": 1.426256305843749e-05, + "loss": 0.2623, + "step": 19382 + }, + { + "epoch": 0.35986529186144517, + "grad_norm": 0.29400357604026794, + "learning_rate": 1.4261507819147741e-05, + "loss": 0.3062, + "step": 19384 + }, + { + "epoch": 0.3599024219988638, + "grad_norm": 0.38039326667785645, + "learning_rate": 1.426045252187164e-05, + "loss": 0.3109, + "step": 19386 + }, + { + "epoch": 0.3599395521362825, + "grad_norm": 0.2881231904029846, + "learning_rate": 1.425939716662354e-05, + "loss": 0.4163, + "step": 19388 + }, + { + "epoch": 0.3599766822737011, + "grad_norm": 0.4104432761669159, + "learning_rate": 1.4258341753417803e-05, + "loss": 0.4175, + "step": 19390 + }, + { + "epoch": 0.36001381241111974, + "grad_norm": 0.25922685861587524, + "learning_rate": 1.425728628226879e-05, + "loss": 0.3576, + "step": 19392 + }, + { + "epoch": 0.36005094254853837, + "grad_norm": 0.27954208850860596, + "learning_rate": 1.4256230753190861e-05, + "loss": 0.3717, + "step": 19394 + }, + { + "epoch": 0.360088072685957, + "grad_norm": 0.4536297619342804, + "learning_rate": 1.4255175166198383e-05, + "loss": 0.2959, + "step": 19396 + }, + { + "epoch": 0.3601252028233756, + "grad_norm": 0.3709540367126465, + "learning_rate": 1.4254119521305714e-05, + "loss": 0.272, + "step": 19398 + }, + { + "epoch": 0.3601623329607943, + "grad_norm": 0.3171406686306, + "learning_rate": 1.4253063818527224e-05, + "loss": 0.2997, + "step": 19400 + }, + { + "epoch": 0.36019946309821294, + "grad_norm": 0.2949376404285431, + "learning_rate": 1.4252008057877274e-05, + "loss": 0.4016, + "step": 19402 + }, + { + "epoch": 0.36023659323563156, + "grad_norm": 1.0760380029678345, + "learning_rate": 1.4250952239370228e-05, + "loss": 0.2733, + "step": 19404 + }, + { + "epoch": 0.3602737233730502, + "grad_norm": 0.350651353597641, + "learning_rate": 1.4249896363020455e-05, + "loss": 0.3678, + "step": 19406 + }, + { + "epoch": 0.3603108535104688, + "grad_norm": 0.32489749789237976, + "learning_rate": 1.4248840428842325e-05, + "loss": 0.3478, + "step": 19408 + }, + { + "epoch": 0.3603479836478875, + "grad_norm": 0.415242999792099, + "learning_rate": 1.4247784436850204e-05, + "loss": 0.3702, + "step": 19410 + }, + { + "epoch": 0.36038511378530613, + "grad_norm": 0.3564092218875885, + "learning_rate": 1.424672838705846e-05, + "loss": 0.2973, + "step": 19412 + }, + { + "epoch": 0.36042224392272476, + "grad_norm": 0.33056971430778503, + "learning_rate": 1.424567227948146e-05, + "loss": 0.3879, + "step": 19414 + }, + { + "epoch": 0.3604593740601434, + "grad_norm": 0.37101346254348755, + "learning_rate": 1.4244616114133581e-05, + "loss": 0.2339, + "step": 19416 + }, + { + "epoch": 0.360496504197562, + "grad_norm": 0.21719877421855927, + "learning_rate": 1.4243559891029189e-05, + "loss": 0.2379, + "step": 19418 + }, + { + "epoch": 0.36053363433498065, + "grad_norm": 0.25728940963745117, + "learning_rate": 1.4242503610182658e-05, + "loss": 0.3683, + "step": 19420 + }, + { + "epoch": 0.36057076447239933, + "grad_norm": 0.5536287426948547, + "learning_rate": 1.4241447271608363e-05, + "loss": 0.4299, + "step": 19422 + }, + { + "epoch": 0.36060789460981796, + "grad_norm": 0.28365758061408997, + "learning_rate": 1.424039087532067e-05, + "loss": 0.3239, + "step": 19424 + }, + { + "epoch": 0.3606450247472366, + "grad_norm": 0.39073842763900757, + "learning_rate": 1.4239334421333962e-05, + "loss": 0.6209, + "step": 19426 + }, + { + "epoch": 0.3606821548846552, + "grad_norm": 0.3459383249282837, + "learning_rate": 1.4238277909662612e-05, + "loss": 0.3424, + "step": 19428 + }, + { + "epoch": 0.36071928502207384, + "grad_norm": 0.8320008516311646, + "learning_rate": 1.4237221340320993e-05, + "loss": 0.1513, + "step": 19430 + }, + { + "epoch": 0.3607564151594925, + "grad_norm": 0.3445095717906952, + "learning_rate": 1.4236164713323483e-05, + "loss": 0.3787, + "step": 19432 + }, + { + "epoch": 0.36079354529691116, + "grad_norm": 0.4179249703884125, + "learning_rate": 1.4235108028684463e-05, + "loss": 0.3335, + "step": 19434 + }, + { + "epoch": 0.3608306754343298, + "grad_norm": 0.3133543133735657, + "learning_rate": 1.4234051286418305e-05, + "loss": 0.2513, + "step": 19436 + }, + { + "epoch": 0.3608678055717484, + "grad_norm": 0.2690754234790802, + "learning_rate": 1.423299448653939e-05, + "loss": 0.2418, + "step": 19438 + }, + { + "epoch": 0.36090493570916704, + "grad_norm": 0.464995801448822, + "learning_rate": 1.4231937629062105e-05, + "loss": 0.252, + "step": 19440 + }, + { + "epoch": 0.3609420658465857, + "grad_norm": 0.3129501938819885, + "learning_rate": 1.4230880714000824e-05, + "loss": 0.2725, + "step": 19442 + }, + { + "epoch": 0.36097919598400435, + "grad_norm": 0.3478209972381592, + "learning_rate": 1.4229823741369926e-05, + "loss": 0.1372, + "step": 19444 + }, + { + "epoch": 0.361016326121423, + "grad_norm": 0.40127572417259216, + "learning_rate": 1.42287667111838e-05, + "loss": 0.3705, + "step": 19446 + }, + { + "epoch": 0.3610534562588416, + "grad_norm": 0.3084023594856262, + "learning_rate": 1.4227709623456823e-05, + "loss": 0.4258, + "step": 19448 + }, + { + "epoch": 0.36109058639626024, + "grad_norm": 0.5917215347290039, + "learning_rate": 1.422665247820338e-05, + "loss": 0.2639, + "step": 19450 + }, + { + "epoch": 0.36112771653367887, + "grad_norm": 0.32714250683784485, + "learning_rate": 1.422559527543786e-05, + "loss": 0.3106, + "step": 19452 + }, + { + "epoch": 0.36116484667109755, + "grad_norm": 0.19050537049770355, + "learning_rate": 1.4224538015174647e-05, + "loss": 0.1674, + "step": 19454 + }, + { + "epoch": 0.3612019768085162, + "grad_norm": 0.4742777347564697, + "learning_rate": 1.422348069742812e-05, + "loss": 0.383, + "step": 19456 + }, + { + "epoch": 0.3612391069459348, + "grad_norm": 0.3823901116847992, + "learning_rate": 1.4222423322212676e-05, + "loss": 0.2073, + "step": 19458 + }, + { + "epoch": 0.36127623708335344, + "grad_norm": 0.4636363387107849, + "learning_rate": 1.4221365889542698e-05, + "loss": 0.4066, + "step": 19460 + }, + { + "epoch": 0.36131336722077206, + "grad_norm": 0.3310929536819458, + "learning_rate": 1.4220308399432574e-05, + "loss": 0.2447, + "step": 19462 + }, + { + "epoch": 0.36135049735819075, + "grad_norm": 0.34844252467155457, + "learning_rate": 1.4219250851896693e-05, + "loss": 0.2753, + "step": 19464 + }, + { + "epoch": 0.3613876274956094, + "grad_norm": 0.3810291290283203, + "learning_rate": 1.4218193246949448e-05, + "loss": 0.2264, + "step": 19466 + }, + { + "epoch": 0.361424757633028, + "grad_norm": 0.3767564594745636, + "learning_rate": 1.4217135584605225e-05, + "loss": 0.1662, + "step": 19468 + }, + { + "epoch": 0.36146188777044663, + "grad_norm": 0.4006989300251007, + "learning_rate": 1.421607786487842e-05, + "loss": 0.2757, + "step": 19470 + }, + { + "epoch": 0.36149901790786526, + "grad_norm": 0.340705543756485, + "learning_rate": 1.4215020087783425e-05, + "loss": 0.209, + "step": 19472 + }, + { + "epoch": 0.3615361480452839, + "grad_norm": 0.4438702166080475, + "learning_rate": 1.4213962253334633e-05, + "loss": 0.2898, + "step": 19474 + }, + { + "epoch": 0.3615732781827026, + "grad_norm": 0.2826271653175354, + "learning_rate": 1.4212904361546432e-05, + "loss": 0.1184, + "step": 19476 + }, + { + "epoch": 0.3616104083201212, + "grad_norm": 0.316656231880188, + "learning_rate": 1.4211846412433226e-05, + "loss": 0.2498, + "step": 19478 + }, + { + "epoch": 0.36164753845753983, + "grad_norm": 0.530942976474762, + "learning_rate": 1.4210788406009404e-05, + "loss": 0.3692, + "step": 19480 + }, + { + "epoch": 0.36168466859495846, + "grad_norm": 0.404958039522171, + "learning_rate": 1.4209730342289366e-05, + "loss": 0.3864, + "step": 19482 + }, + { + "epoch": 0.3617217987323771, + "grad_norm": 0.3390571177005768, + "learning_rate": 1.4208672221287507e-05, + "loss": 0.2055, + "step": 19484 + }, + { + "epoch": 0.36175892886979577, + "grad_norm": 0.4394838213920593, + "learning_rate": 1.4207614043018228e-05, + "loss": 0.4327, + "step": 19486 + }, + { + "epoch": 0.3617960590072144, + "grad_norm": 0.351423978805542, + "learning_rate": 1.4206555807495922e-05, + "loss": 0.3163, + "step": 19488 + }, + { + "epoch": 0.361833189144633, + "grad_norm": 0.457510769367218, + "learning_rate": 1.4205497514734993e-05, + "loss": 0.2466, + "step": 19490 + }, + { + "epoch": 0.36187031928205166, + "grad_norm": 0.35586559772491455, + "learning_rate": 1.4204439164749837e-05, + "loss": 0.2381, + "step": 19492 + }, + { + "epoch": 0.3619074494194703, + "grad_norm": 0.5308236479759216, + "learning_rate": 1.420338075755486e-05, + "loss": 0.3012, + "step": 19494 + }, + { + "epoch": 0.3619445795568889, + "grad_norm": 0.6988393068313599, + "learning_rate": 1.4202322293164464e-05, + "loss": 0.3331, + "step": 19496 + }, + { + "epoch": 0.3619817096943076, + "grad_norm": 1.3446675539016724, + "learning_rate": 1.4201263771593045e-05, + "loss": 0.2747, + "step": 19498 + }, + { + "epoch": 0.3620188398317262, + "grad_norm": 0.38556936383247375, + "learning_rate": 1.420020519285501e-05, + "loss": 0.3301, + "step": 19500 + }, + { + "epoch": 0.36205596996914485, + "grad_norm": 0.35705500841140747, + "learning_rate": 1.4199146556964765e-05, + "loss": 0.325, + "step": 19502 + }, + { + "epoch": 0.3620931001065635, + "grad_norm": 0.46387165784835815, + "learning_rate": 1.4198087863936714e-05, + "loss": 0.1574, + "step": 19504 + }, + { + "epoch": 0.3621302302439821, + "grad_norm": 0.4282967746257782, + "learning_rate": 1.4197029113785264e-05, + "loss": 0.196, + "step": 19506 + }, + { + "epoch": 0.3621673603814008, + "grad_norm": 0.3932381272315979, + "learning_rate": 1.4195970306524815e-05, + "loss": 0.3732, + "step": 19508 + }, + { + "epoch": 0.3622044905188194, + "grad_norm": 0.36688801646232605, + "learning_rate": 1.4194911442169781e-05, + "loss": 0.3949, + "step": 19510 + }, + { + "epoch": 0.36224162065623805, + "grad_norm": 0.5592859387397766, + "learning_rate": 1.4193852520734565e-05, + "loss": 0.2494, + "step": 19512 + }, + { + "epoch": 0.3622787507936567, + "grad_norm": 0.45428985357284546, + "learning_rate": 1.4192793542233579e-05, + "loss": 0.421, + "step": 19514 + }, + { + "epoch": 0.3623158809310753, + "grad_norm": 0.4370211362838745, + "learning_rate": 1.4191734506681236e-05, + "loss": 0.2709, + "step": 19516 + }, + { + "epoch": 0.362353011068494, + "grad_norm": 0.4268166124820709, + "learning_rate": 1.4190675414091937e-05, + "loss": 0.1489, + "step": 19518 + }, + { + "epoch": 0.3623901412059126, + "grad_norm": 0.4415932893753052, + "learning_rate": 1.41896162644801e-05, + "loss": 0.4014, + "step": 19520 + }, + { + "epoch": 0.36242727134333125, + "grad_norm": 0.2947825789451599, + "learning_rate": 1.4188557057860135e-05, + "loss": 0.3449, + "step": 19522 + }, + { + "epoch": 0.3624644014807499, + "grad_norm": 0.453798770904541, + "learning_rate": 1.4187497794246455e-05, + "loss": 0.4646, + "step": 19524 + }, + { + "epoch": 0.3625015316181685, + "grad_norm": 3.982621669769287, + "learning_rate": 1.4186438473653473e-05, + "loss": 0.3369, + "step": 19526 + }, + { + "epoch": 0.36253866175558713, + "grad_norm": 0.5682688355445862, + "learning_rate": 1.4185379096095602e-05, + "loss": 0.1894, + "step": 19528 + }, + { + "epoch": 0.3625757918930058, + "grad_norm": 0.28124478459358215, + "learning_rate": 1.4184319661587258e-05, + "loss": 0.4234, + "step": 19530 + }, + { + "epoch": 0.36261292203042444, + "grad_norm": 0.6364871859550476, + "learning_rate": 1.4183260170142857e-05, + "loss": 0.2529, + "step": 19532 + }, + { + "epoch": 0.3626500521678431, + "grad_norm": 0.35344138741493225, + "learning_rate": 1.4182200621776816e-05, + "loss": 0.4488, + "step": 19534 + }, + { + "epoch": 0.3626871823052617, + "grad_norm": 0.7011620998382568, + "learning_rate": 1.4181141016503552e-05, + "loss": 0.435, + "step": 19536 + }, + { + "epoch": 0.36272431244268033, + "grad_norm": 0.4352754056453705, + "learning_rate": 1.4180081354337483e-05, + "loss": 0.4362, + "step": 19538 + }, + { + "epoch": 0.362761442580099, + "grad_norm": 0.570061206817627, + "learning_rate": 1.4179021635293027e-05, + "loss": 0.4287, + "step": 19540 + }, + { + "epoch": 0.36279857271751764, + "grad_norm": 0.46416133642196655, + "learning_rate": 1.4177961859384604e-05, + "loss": 0.2883, + "step": 19542 + }, + { + "epoch": 0.36283570285493627, + "grad_norm": 0.4613078236579895, + "learning_rate": 1.4176902026626635e-05, + "loss": 0.2745, + "step": 19544 + }, + { + "epoch": 0.3628728329923549, + "grad_norm": 0.31681764125823975, + "learning_rate": 1.417584213703354e-05, + "loss": 0.2747, + "step": 19546 + }, + { + "epoch": 0.3629099631297735, + "grad_norm": 0.4063815474510193, + "learning_rate": 1.4174782190619745e-05, + "loss": 0.4011, + "step": 19548 + }, + { + "epoch": 0.36294709326719216, + "grad_norm": 0.4370083808898926, + "learning_rate": 1.4173722187399666e-05, + "loss": 0.2299, + "step": 19550 + }, + { + "epoch": 0.36298422340461084, + "grad_norm": 0.26459628343582153, + "learning_rate": 1.4172662127387726e-05, + "loss": 0.2999, + "step": 19552 + }, + { + "epoch": 0.36302135354202947, + "grad_norm": 0.24206113815307617, + "learning_rate": 1.417160201059836e-05, + "loss": 0.31, + "step": 19554 + }, + { + "epoch": 0.3630584836794481, + "grad_norm": 0.40039780735969543, + "learning_rate": 1.4170541837045981e-05, + "loss": 0.4261, + "step": 19556 + }, + { + "epoch": 0.3630956138168667, + "grad_norm": 0.4162488877773285, + "learning_rate": 1.4169481606745023e-05, + "loss": 0.3216, + "step": 19558 + }, + { + "epoch": 0.36313274395428535, + "grad_norm": 0.3435562551021576, + "learning_rate": 1.4168421319709912e-05, + "loss": 0.3266, + "step": 19560 + }, + { + "epoch": 0.36316987409170404, + "grad_norm": 0.29202988743782043, + "learning_rate": 1.4167360975955067e-05, + "loss": 0.1948, + "step": 19562 + }, + { + "epoch": 0.36320700422912267, + "grad_norm": 0.37673017382621765, + "learning_rate": 1.4166300575494922e-05, + "loss": 0.4237, + "step": 19564 + }, + { + "epoch": 0.3632441343665413, + "grad_norm": 0.4063500165939331, + "learning_rate": 1.4165240118343908e-05, + "loss": 0.252, + "step": 19566 + }, + { + "epoch": 0.3632812645039599, + "grad_norm": 0.4879564046859741, + "learning_rate": 1.416417960451645e-05, + "loss": 0.4091, + "step": 19568 + }, + { + "epoch": 0.36331839464137855, + "grad_norm": 0.4541158974170685, + "learning_rate": 1.4163119034026984e-05, + "loss": 0.3571, + "step": 19570 + }, + { + "epoch": 0.3633555247787972, + "grad_norm": 0.3003189265727997, + "learning_rate": 1.4162058406889938e-05, + "loss": 0.5226, + "step": 19572 + }, + { + "epoch": 0.36339265491621586, + "grad_norm": 0.3884391784667969, + "learning_rate": 1.4160997723119742e-05, + "loss": 0.3981, + "step": 19574 + }, + { + "epoch": 0.3634297850536345, + "grad_norm": 0.37985676527023315, + "learning_rate": 1.415993698273083e-05, + "loss": 0.2039, + "step": 19576 + }, + { + "epoch": 0.3634669151910531, + "grad_norm": 0.37474411725997925, + "learning_rate": 1.415887618573764e-05, + "loss": 0.1386, + "step": 19578 + }, + { + "epoch": 0.36350404532847175, + "grad_norm": 0.3575778901576996, + "learning_rate": 1.4157815332154598e-05, + "loss": 0.2139, + "step": 19580 + }, + { + "epoch": 0.3635411754658904, + "grad_norm": 0.35885781049728394, + "learning_rate": 1.4156754421996147e-05, + "loss": 0.4081, + "step": 19582 + }, + { + "epoch": 0.36357830560330906, + "grad_norm": 0.3839130699634552, + "learning_rate": 1.4155693455276716e-05, + "loss": 0.4175, + "step": 19584 + }, + { + "epoch": 0.3636154357407277, + "grad_norm": 0.30613404512405396, + "learning_rate": 1.4154632432010746e-05, + "loss": 0.3266, + "step": 19586 + }, + { + "epoch": 0.3636525658781463, + "grad_norm": 0.3216108977794647, + "learning_rate": 1.4153571352212671e-05, + "loss": 0.3034, + "step": 19588 + }, + { + "epoch": 0.36368969601556494, + "grad_norm": 0.2818622589111328, + "learning_rate": 1.4152510215896936e-05, + "loss": 0.1936, + "step": 19590 + }, + { + "epoch": 0.3637268261529836, + "grad_norm": 0.2734121084213257, + "learning_rate": 1.4151449023077972e-05, + "loss": 0.1365, + "step": 19592 + }, + { + "epoch": 0.36376395629040226, + "grad_norm": 0.39637458324432373, + "learning_rate": 1.4150387773770222e-05, + "loss": 0.2345, + "step": 19594 + }, + { + "epoch": 0.3638010864278209, + "grad_norm": 0.32374125719070435, + "learning_rate": 1.4149326467988127e-05, + "loss": 0.381, + "step": 19596 + }, + { + "epoch": 0.3638382165652395, + "grad_norm": 0.3764549791812897, + "learning_rate": 1.4148265105746128e-05, + "loss": 0.3107, + "step": 19598 + }, + { + "epoch": 0.36387534670265814, + "grad_norm": 0.4097733199596405, + "learning_rate": 1.4147203687058667e-05, + "loss": 0.2284, + "step": 19600 + }, + { + "epoch": 0.36391247684007677, + "grad_norm": 0.3480445146560669, + "learning_rate": 1.4146142211940184e-05, + "loss": 0.3732, + "step": 19602 + }, + { + "epoch": 0.3639496069774954, + "grad_norm": 0.32401493191719055, + "learning_rate": 1.4145080680405125e-05, + "loss": 0.1009, + "step": 19604 + }, + { + "epoch": 0.3639867371149141, + "grad_norm": 0.3910563886165619, + "learning_rate": 1.4144019092467933e-05, + "loss": 0.2834, + "step": 19606 + }, + { + "epoch": 0.3640238672523327, + "grad_norm": 0.3129458725452423, + "learning_rate": 1.4142957448143056e-05, + "loss": 0.3255, + "step": 19608 + }, + { + "epoch": 0.36406099738975134, + "grad_norm": 0.4297815263271332, + "learning_rate": 1.4141895747444938e-05, + "loss": 0.1376, + "step": 19610 + }, + { + "epoch": 0.36409812752716997, + "grad_norm": 0.38689860701560974, + "learning_rate": 1.4140833990388022e-05, + "loss": 0.4161, + "step": 19612 + }, + { + "epoch": 0.3641352576645886, + "grad_norm": 0.4048779010772705, + "learning_rate": 1.4139772176986762e-05, + "loss": 0.3856, + "step": 19614 + }, + { + "epoch": 0.3641723878020073, + "grad_norm": 0.28877386450767517, + "learning_rate": 1.4138710307255602e-05, + "loss": 0.4035, + "step": 19616 + }, + { + "epoch": 0.3642095179394259, + "grad_norm": 0.31135332584381104, + "learning_rate": 1.413764838120899e-05, + "loss": 0.4439, + "step": 19618 + }, + { + "epoch": 0.36424664807684454, + "grad_norm": 0.48642754554748535, + "learning_rate": 1.4136586398861379e-05, + "loss": 0.2254, + "step": 19620 + }, + { + "epoch": 0.36428377821426317, + "grad_norm": 0.48098546266555786, + "learning_rate": 1.4135524360227218e-05, + "loss": 0.2268, + "step": 19622 + }, + { + "epoch": 0.3643209083516818, + "grad_norm": 0.338257759809494, + "learning_rate": 1.413446226532096e-05, + "loss": 0.4359, + "step": 19624 + }, + { + "epoch": 0.3643580384891004, + "grad_norm": 0.4198701083660126, + "learning_rate": 1.4133400114157047e-05, + "loss": 0.2899, + "step": 19626 + }, + { + "epoch": 0.3643951686265191, + "grad_norm": 0.32552823424339294, + "learning_rate": 1.4132337906749944e-05, + "loss": 0.3084, + "step": 19628 + }, + { + "epoch": 0.36443229876393773, + "grad_norm": 0.41945046186447144, + "learning_rate": 1.41312756431141e-05, + "loss": 0.2714, + "step": 19630 + }, + { + "epoch": 0.36446942890135636, + "grad_norm": 0.3319258689880371, + "learning_rate": 1.4130213323263971e-05, + "loss": 0.3977, + "step": 19632 + }, + { + "epoch": 0.364506559038775, + "grad_norm": 0.29167088866233826, + "learning_rate": 1.4129150947214006e-05, + "loss": 0.3782, + "step": 19634 + }, + { + "epoch": 0.3645436891761936, + "grad_norm": 0.4393881857395172, + "learning_rate": 1.4128088514978668e-05, + "loss": 0.3559, + "step": 19636 + }, + { + "epoch": 0.3645808193136123, + "grad_norm": 0.6081888675689697, + "learning_rate": 1.4127026026572408e-05, + "loss": 0.385, + "step": 19638 + }, + { + "epoch": 0.36461794945103093, + "grad_norm": 0.5613787174224854, + "learning_rate": 1.4125963482009686e-05, + "loss": 0.2547, + "step": 19640 + }, + { + "epoch": 0.36465507958844956, + "grad_norm": 0.37745073437690735, + "learning_rate": 1.4124900881304962e-05, + "loss": 0.2323, + "step": 19642 + }, + { + "epoch": 0.3646922097258682, + "grad_norm": 0.3574422597885132, + "learning_rate": 1.4123838224472692e-05, + "loss": 0.2231, + "step": 19644 + }, + { + "epoch": 0.3647293398632868, + "grad_norm": 0.3520275950431824, + "learning_rate": 1.4122775511527333e-05, + "loss": 0.3635, + "step": 19646 + }, + { + "epoch": 0.36476647000070545, + "grad_norm": 0.2459086775779724, + "learning_rate": 1.4121712742483354e-05, + "loss": 0.321, + "step": 19648 + }, + { + "epoch": 0.36480360013812413, + "grad_norm": 0.43133100867271423, + "learning_rate": 1.4120649917355205e-05, + "loss": 0.2612, + "step": 19650 + }, + { + "epoch": 0.36484073027554276, + "grad_norm": 0.7307223677635193, + "learning_rate": 1.4119587036157354e-05, + "loss": 0.3203, + "step": 19652 + }, + { + "epoch": 0.3648778604129614, + "grad_norm": 0.29405370354652405, + "learning_rate": 1.4118524098904267e-05, + "loss": 0.28, + "step": 19654 + }, + { + "epoch": 0.36491499055038, + "grad_norm": 0.3256421387195587, + "learning_rate": 1.4117461105610402e-05, + "loss": 0.2807, + "step": 19656 + }, + { + "epoch": 0.36495212068779864, + "grad_norm": 0.35624784231185913, + "learning_rate": 1.4116398056290222e-05, + "loss": 0.1957, + "step": 19658 + }, + { + "epoch": 0.3649892508252173, + "grad_norm": 0.2632369101047516, + "learning_rate": 1.4115334950958198e-05, + "loss": 0.2977, + "step": 19660 + }, + { + "epoch": 0.36502638096263595, + "grad_norm": 0.24811019003391266, + "learning_rate": 1.4114271789628788e-05, + "loss": 0.2749, + "step": 19662 + }, + { + "epoch": 0.3650635111000546, + "grad_norm": 1.0779023170471191, + "learning_rate": 1.4113208572316465e-05, + "loss": 0.2522, + "step": 19664 + }, + { + "epoch": 0.3651006412374732, + "grad_norm": 0.32825222611427307, + "learning_rate": 1.4112145299035693e-05, + "loss": 0.4234, + "step": 19666 + }, + { + "epoch": 0.36513777137489184, + "grad_norm": 0.38045060634613037, + "learning_rate": 1.4111081969800941e-05, + "loss": 0.3656, + "step": 19668 + }, + { + "epoch": 0.3651749015123105, + "grad_norm": 0.36870869994163513, + "learning_rate": 1.4110018584626678e-05, + "loss": 0.354, + "step": 19670 + }, + { + "epoch": 0.36521203164972915, + "grad_norm": 0.4029601812362671, + "learning_rate": 1.4108955143527372e-05, + "loss": 0.3685, + "step": 19672 + }, + { + "epoch": 0.3652491617871478, + "grad_norm": 0.34298762679100037, + "learning_rate": 1.4107891646517497e-05, + "loss": 0.1645, + "step": 19674 + }, + { + "epoch": 0.3652862919245664, + "grad_norm": 0.35761430859565735, + "learning_rate": 1.410682809361152e-05, + "loss": 0.2785, + "step": 19676 + }, + { + "epoch": 0.36532342206198504, + "grad_norm": 0.21414943039417267, + "learning_rate": 1.4105764484823912e-05, + "loss": 0.2213, + "step": 19678 + }, + { + "epoch": 0.36536055219940367, + "grad_norm": 0.2507987916469574, + "learning_rate": 1.410470082016915e-05, + "loss": 0.2802, + "step": 19680 + }, + { + "epoch": 0.36539768233682235, + "grad_norm": 0.3661417067050934, + "learning_rate": 1.4103637099661703e-05, + "loss": 0.0958, + "step": 19682 + }, + { + "epoch": 0.365434812474241, + "grad_norm": 0.24103400111198425, + "learning_rate": 1.4102573323316046e-05, + "loss": 0.2302, + "step": 19684 + }, + { + "epoch": 0.3654719426116596, + "grad_norm": 0.47021281719207764, + "learning_rate": 1.410150949114666e-05, + "loss": 0.4059, + "step": 19686 + }, + { + "epoch": 0.36550907274907823, + "grad_norm": 3.089322805404663, + "learning_rate": 1.410044560316801e-05, + "loss": 0.4785, + "step": 19688 + }, + { + "epoch": 0.36554620288649686, + "grad_norm": 0.2955191731452942, + "learning_rate": 1.4099381659394579e-05, + "loss": 0.3642, + "step": 19690 + }, + { + "epoch": 0.36558333302391555, + "grad_norm": 0.26204001903533936, + "learning_rate": 1.4098317659840846e-05, + "loss": 0.249, + "step": 19692 + }, + { + "epoch": 0.3656204631613342, + "grad_norm": 0.44068658351898193, + "learning_rate": 1.4097253604521281e-05, + "loss": 0.3962, + "step": 19694 + }, + { + "epoch": 0.3656575932987528, + "grad_norm": 0.34314167499542236, + "learning_rate": 1.4096189493450369e-05, + "loss": 0.2091, + "step": 19696 + }, + { + "epoch": 0.36569472343617143, + "grad_norm": 0.40914028882980347, + "learning_rate": 1.409512532664259e-05, + "loss": 0.3216, + "step": 19698 + }, + { + "epoch": 0.36573185357359006, + "grad_norm": 0.3474060297012329, + "learning_rate": 1.4094061104112415e-05, + "loss": 0.3361, + "step": 19700 + }, + { + "epoch": 0.3657689837110087, + "grad_norm": 0.4032563865184784, + "learning_rate": 1.4092996825874335e-05, + "loss": 0.1635, + "step": 19702 + }, + { + "epoch": 0.36580611384842737, + "grad_norm": 0.40958371758461, + "learning_rate": 1.4091932491942828e-05, + "loss": 0.1466, + "step": 19704 + }, + { + "epoch": 0.365843243985846, + "grad_norm": 0.32558050751686096, + "learning_rate": 1.4090868102332378e-05, + "loss": 0.2585, + "step": 19706 + }, + { + "epoch": 0.36588037412326463, + "grad_norm": 0.3151330053806305, + "learning_rate": 1.4089803657057465e-05, + "loss": 0.2442, + "step": 19708 + }, + { + "epoch": 0.36591750426068326, + "grad_norm": 0.3123070299625397, + "learning_rate": 1.4088739156132576e-05, + "loss": 0.2547, + "step": 19710 + }, + { + "epoch": 0.3659546343981019, + "grad_norm": 0.4219985604286194, + "learning_rate": 1.4087674599572195e-05, + "loss": 0.3546, + "step": 19712 + }, + { + "epoch": 0.36599176453552057, + "grad_norm": 0.3112752437591553, + "learning_rate": 1.4086609987390802e-05, + "loss": 0.2208, + "step": 19714 + }, + { + "epoch": 0.3660288946729392, + "grad_norm": 0.38033539056777954, + "learning_rate": 1.4085545319602894e-05, + "loss": 0.2005, + "step": 19716 + }, + { + "epoch": 0.3660660248103578, + "grad_norm": 0.5819393992424011, + "learning_rate": 1.408448059622295e-05, + "loss": 0.2716, + "step": 19718 + }, + { + "epoch": 0.36610315494777645, + "grad_norm": 0.41848811507225037, + "learning_rate": 1.4083415817265457e-05, + "loss": 0.2395, + "step": 19720 + }, + { + "epoch": 0.3661402850851951, + "grad_norm": 0.3015039563179016, + "learning_rate": 1.4082350982744906e-05, + "loss": 0.1003, + "step": 19722 + }, + { + "epoch": 0.3661774152226137, + "grad_norm": 0.3398449420928955, + "learning_rate": 1.4081286092675788e-05, + "loss": 0.2636, + "step": 19724 + }, + { + "epoch": 0.3662145453600324, + "grad_norm": 0.49297478795051575, + "learning_rate": 1.408022114707259e-05, + "loss": 0.426, + "step": 19726 + }, + { + "epoch": 0.366251675497451, + "grad_norm": 0.4017919898033142, + "learning_rate": 1.4079156145949806e-05, + "loss": 0.2537, + "step": 19728 + }, + { + "epoch": 0.36628880563486965, + "grad_norm": 0.5466989874839783, + "learning_rate": 1.4078091089321925e-05, + "loss": 0.1877, + "step": 19730 + }, + { + "epoch": 0.3663259357722883, + "grad_norm": 0.275598406791687, + "learning_rate": 1.4077025977203438e-05, + "loss": 0.2078, + "step": 19732 + }, + { + "epoch": 0.3663630659097069, + "grad_norm": 0.24837088584899902, + "learning_rate": 1.4075960809608839e-05, + "loss": 0.2299, + "step": 19734 + }, + { + "epoch": 0.3664001960471256, + "grad_norm": 0.36476150155067444, + "learning_rate": 1.4074895586552626e-05, + "loss": 0.3551, + "step": 19736 + }, + { + "epoch": 0.3664373261845442, + "grad_norm": 0.3732600510120392, + "learning_rate": 1.4073830308049286e-05, + "loss": 0.2971, + "step": 19738 + }, + { + "epoch": 0.36647445632196285, + "grad_norm": 0.49440932273864746, + "learning_rate": 1.407276497411332e-05, + "loss": 0.2615, + "step": 19740 + }, + { + "epoch": 0.3665115864593815, + "grad_norm": 0.4376753866672516, + "learning_rate": 1.4071699584759222e-05, + "loss": 0.169, + "step": 19742 + }, + { + "epoch": 0.3665487165968001, + "grad_norm": 0.4948379695415497, + "learning_rate": 1.4070634140001487e-05, + "loss": 0.217, + "step": 19744 + }, + { + "epoch": 0.3665858467342188, + "grad_norm": 0.4794681668281555, + "learning_rate": 1.4069568639854613e-05, + "loss": 0.4288, + "step": 19746 + }, + { + "epoch": 0.3666229768716374, + "grad_norm": 0.37141862511634827, + "learning_rate": 1.4068503084333105e-05, + "loss": 0.2167, + "step": 19748 + }, + { + "epoch": 0.36666010700905605, + "grad_norm": 0.35490235686302185, + "learning_rate": 1.4067437473451453e-05, + "loss": 0.3415, + "step": 19750 + }, + { + "epoch": 0.3666972371464747, + "grad_norm": 4.577044486999512, + "learning_rate": 1.4066371807224161e-05, + "loss": 0.4908, + "step": 19752 + }, + { + "epoch": 0.3667343672838933, + "grad_norm": 0.6670210361480713, + "learning_rate": 1.4065306085665728e-05, + "loss": 0.2843, + "step": 19754 + }, + { + "epoch": 0.36677149742131193, + "grad_norm": 0.5045584440231323, + "learning_rate": 1.4064240308790658e-05, + "loss": 0.1779, + "step": 19756 + }, + { + "epoch": 0.3668086275587306, + "grad_norm": 0.5253371000289917, + "learning_rate": 1.4063174476613449e-05, + "loss": 0.3026, + "step": 19758 + }, + { + "epoch": 0.36684575769614924, + "grad_norm": 0.3712799549102783, + "learning_rate": 1.4062108589148609e-05, + "loss": 0.3911, + "step": 19760 + }, + { + "epoch": 0.3668828878335679, + "grad_norm": 0.49503302574157715, + "learning_rate": 1.4061042646410637e-05, + "loss": 0.2465, + "step": 19762 + }, + { + "epoch": 0.3669200179709865, + "grad_norm": 0.4100792706012726, + "learning_rate": 1.4059976648414038e-05, + "loss": 0.499, + "step": 19764 + }, + { + "epoch": 0.36695714810840513, + "grad_norm": 0.566878080368042, + "learning_rate": 1.4058910595173315e-05, + "loss": 0.2255, + "step": 19766 + }, + { + "epoch": 0.3669942782458238, + "grad_norm": 0.3531090021133423, + "learning_rate": 1.4057844486702983e-05, + "loss": 0.4532, + "step": 19768 + }, + { + "epoch": 0.36703140838324244, + "grad_norm": 0.3961486518383026, + "learning_rate": 1.4056778323017541e-05, + "loss": 0.3222, + "step": 19770 + }, + { + "epoch": 0.36706853852066107, + "grad_norm": 0.30226781964302063, + "learning_rate": 1.4055712104131494e-05, + "loss": 0.3407, + "step": 19772 + }, + { + "epoch": 0.3671056686580797, + "grad_norm": 0.40176841616630554, + "learning_rate": 1.4054645830059356e-05, + "loss": 0.2552, + "step": 19774 + }, + { + "epoch": 0.3671427987954983, + "grad_norm": 0.3255210816860199, + "learning_rate": 1.4053579500815633e-05, + "loss": 0.38, + "step": 19776 + }, + { + "epoch": 0.36717992893291695, + "grad_norm": 0.3860858082771301, + "learning_rate": 1.4052513116414831e-05, + "loss": 0.2448, + "step": 19778 + }, + { + "epoch": 0.36721705907033564, + "grad_norm": 0.5881112813949585, + "learning_rate": 1.405144667687147e-05, + "loss": 0.2097, + "step": 19780 + }, + { + "epoch": 0.36725418920775427, + "grad_norm": 0.5123052000999451, + "learning_rate": 1.4050380182200054e-05, + "loss": 0.2412, + "step": 19782 + }, + { + "epoch": 0.3672913193451729, + "grad_norm": 0.41227757930755615, + "learning_rate": 1.4049313632415093e-05, + "loss": 0.3831, + "step": 19784 + }, + { + "epoch": 0.3673284494825915, + "grad_norm": 0.39411109685897827, + "learning_rate": 1.4048247027531104e-05, + "loss": 0.2493, + "step": 19786 + }, + { + "epoch": 0.36736557962001015, + "grad_norm": 0.3688342273235321, + "learning_rate": 1.40471803675626e-05, + "loss": 0.3052, + "step": 19788 + }, + { + "epoch": 0.36740270975742884, + "grad_norm": 0.5016171932220459, + "learning_rate": 1.4046113652524094e-05, + "loss": 0.3513, + "step": 19790 + }, + { + "epoch": 0.36743983989484746, + "grad_norm": 0.5210905075073242, + "learning_rate": 1.40450468824301e-05, + "loss": 0.3102, + "step": 19792 + }, + { + "epoch": 0.3674769700322661, + "grad_norm": 0.27672454714775085, + "learning_rate": 1.4043980057295135e-05, + "loss": 0.3127, + "step": 19794 + }, + { + "epoch": 0.3675141001696847, + "grad_norm": 0.4603653848171234, + "learning_rate": 1.4042913177133713e-05, + "loss": 0.3356, + "step": 19796 + }, + { + "epoch": 0.36755123030710335, + "grad_norm": 0.45041635632514954, + "learning_rate": 1.4041846241960353e-05, + "loss": 0.2722, + "step": 19798 + }, + { + "epoch": 0.367588360444522, + "grad_norm": 0.2742089629173279, + "learning_rate": 1.4040779251789572e-05, + "loss": 0.2325, + "step": 19800 + }, + { + "epoch": 0.36762549058194066, + "grad_norm": 0.4382975697517395, + "learning_rate": 1.4039712206635891e-05, + "loss": 0.2816, + "step": 19802 + }, + { + "epoch": 0.3676626207193593, + "grad_norm": 0.530083954334259, + "learning_rate": 1.4038645106513823e-05, + "loss": 0.2455, + "step": 19804 + }, + { + "epoch": 0.3676997508567779, + "grad_norm": 0.29423898458480835, + "learning_rate": 1.4037577951437898e-05, + "loss": 0.2766, + "step": 19806 + }, + { + "epoch": 0.36773688099419655, + "grad_norm": 0.49707090854644775, + "learning_rate": 1.4036510741422627e-05, + "loss": 0.2937, + "step": 19808 + }, + { + "epoch": 0.3677740111316152, + "grad_norm": 0.42107880115509033, + "learning_rate": 1.4035443476482535e-05, + "loss": 0.3112, + "step": 19810 + }, + { + "epoch": 0.36781114126903386, + "grad_norm": 0.28626957535743713, + "learning_rate": 1.4034376156632148e-05, + "loss": 0.2299, + "step": 19812 + }, + { + "epoch": 0.3678482714064525, + "grad_norm": 0.3611034154891968, + "learning_rate": 1.4033308781885987e-05, + "loss": 0.2723, + "step": 19814 + }, + { + "epoch": 0.3678854015438711, + "grad_norm": 0.37940099835395813, + "learning_rate": 1.4032241352258568e-05, + "loss": 0.2344, + "step": 19816 + }, + { + "epoch": 0.36792253168128974, + "grad_norm": 0.3809797167778015, + "learning_rate": 1.403117386776443e-05, + "loss": 0.4395, + "step": 19818 + }, + { + "epoch": 0.3679596618187084, + "grad_norm": 0.30382221937179565, + "learning_rate": 1.4030106328418085e-05, + "loss": 0.1861, + "step": 19820 + }, + { + "epoch": 0.36799679195612706, + "grad_norm": 0.3881332576274872, + "learning_rate": 1.4029038734234064e-05, + "loss": 0.2843, + "step": 19822 + }, + { + "epoch": 0.3680339220935457, + "grad_norm": 0.4450070559978485, + "learning_rate": 1.40279710852269e-05, + "loss": 0.3987, + "step": 19824 + }, + { + "epoch": 0.3680710522309643, + "grad_norm": 0.3255630433559418, + "learning_rate": 1.402690338141111e-05, + "loss": 0.3101, + "step": 19826 + }, + { + "epoch": 0.36810818236838294, + "grad_norm": 0.31240347027778625, + "learning_rate": 1.4025835622801225e-05, + "loss": 0.2112, + "step": 19828 + }, + { + "epoch": 0.36814531250580157, + "grad_norm": 0.46936532855033875, + "learning_rate": 1.4024767809411779e-05, + "loss": 0.2729, + "step": 19830 + }, + { + "epoch": 0.3681824426432202, + "grad_norm": 0.4618358314037323, + "learning_rate": 1.4023699941257299e-05, + "loss": 0.3405, + "step": 19832 + }, + { + "epoch": 0.3682195727806389, + "grad_norm": 0.3514604866504669, + "learning_rate": 1.4022632018352315e-05, + "loss": 0.1945, + "step": 19834 + }, + { + "epoch": 0.3682567029180575, + "grad_norm": 0.44015613198280334, + "learning_rate": 1.4021564040711357e-05, + "loss": 0.275, + "step": 19836 + }, + { + "epoch": 0.36829383305547614, + "grad_norm": 0.5452552437782288, + "learning_rate": 1.4020496008348961e-05, + "loss": 0.3467, + "step": 19838 + }, + { + "epoch": 0.36833096319289477, + "grad_norm": 0.5117089748382568, + "learning_rate": 1.4019427921279653e-05, + "loss": 0.4378, + "step": 19840 + }, + { + "epoch": 0.3683680933303134, + "grad_norm": 0.3432047963142395, + "learning_rate": 1.4018359779517974e-05, + "loss": 0.4241, + "step": 19842 + }, + { + "epoch": 0.3684052234677321, + "grad_norm": 0.3727293312549591, + "learning_rate": 1.4017291583078454e-05, + "loss": 0.352, + "step": 19844 + }, + { + "epoch": 0.3684423536051507, + "grad_norm": 0.3713934123516083, + "learning_rate": 1.4016223331975628e-05, + "loss": 0.1653, + "step": 19846 + }, + { + "epoch": 0.36847948374256934, + "grad_norm": 0.3766219913959503, + "learning_rate": 1.4015155026224034e-05, + "loss": 0.2882, + "step": 19848 + }, + { + "epoch": 0.36851661387998796, + "grad_norm": 0.35271942615509033, + "learning_rate": 1.4014086665838207e-05, + "loss": 0.4556, + "step": 19850 + }, + { + "epoch": 0.3685537440174066, + "grad_norm": 0.2975536286830902, + "learning_rate": 1.4013018250832682e-05, + "loss": 0.1583, + "step": 19852 + }, + { + "epoch": 0.3685908741548252, + "grad_norm": 0.26816362142562866, + "learning_rate": 1.4011949781222e-05, + "loss": 0.3755, + "step": 19854 + }, + { + "epoch": 0.3686280042922439, + "grad_norm": 0.5107616186141968, + "learning_rate": 1.40108812570207e-05, + "loss": 0.2508, + "step": 19856 + }, + { + "epoch": 0.36866513442966253, + "grad_norm": 0.3093423843383789, + "learning_rate": 1.4009812678243317e-05, + "loss": 0.3939, + "step": 19858 + }, + { + "epoch": 0.36870226456708116, + "grad_norm": 0.3917011618614197, + "learning_rate": 1.4008744044904397e-05, + "loss": 0.4131, + "step": 19860 + }, + { + "epoch": 0.3687393947044998, + "grad_norm": 0.4304105341434479, + "learning_rate": 1.4007675357018478e-05, + "loss": 0.3959, + "step": 19862 + }, + { + "epoch": 0.3687765248419184, + "grad_norm": 0.2791980803012848, + "learning_rate": 1.4006606614600099e-05, + "loss": 0.361, + "step": 19864 + }, + { + "epoch": 0.3688136549793371, + "grad_norm": 0.362958699464798, + "learning_rate": 1.4005537817663807e-05, + "loss": 0.1953, + "step": 19866 + }, + { + "epoch": 0.36885078511675573, + "grad_norm": 0.29998910427093506, + "learning_rate": 1.4004468966224142e-05, + "loss": 0.268, + "step": 19868 + }, + { + "epoch": 0.36888791525417436, + "grad_norm": 0.4440038502216339, + "learning_rate": 1.4003400060295653e-05, + "loss": 0.3163, + "step": 19870 + }, + { + "epoch": 0.368925045391593, + "grad_norm": 0.583439826965332, + "learning_rate": 1.4002331099892877e-05, + "loss": 0.2807, + "step": 19872 + }, + { + "epoch": 0.3689621755290116, + "grad_norm": 0.29887261986732483, + "learning_rate": 1.4001262085030363e-05, + "loss": 0.1539, + "step": 19874 + }, + { + "epoch": 0.36899930566643024, + "grad_norm": 0.333649605512619, + "learning_rate": 1.4000193015722659e-05, + "loss": 0.2771, + "step": 19876 + }, + { + "epoch": 0.36903643580384893, + "grad_norm": 0.375797301530838, + "learning_rate": 1.3999123891984309e-05, + "loss": 0.2906, + "step": 19878 + }, + { + "epoch": 0.36907356594126756, + "grad_norm": 0.30055153369903564, + "learning_rate": 1.3998054713829862e-05, + "loss": 0.421, + "step": 19880 + }, + { + "epoch": 0.3691106960786862, + "grad_norm": 0.24014157056808472, + "learning_rate": 1.3996985481273869e-05, + "loss": 0.431, + "step": 19882 + }, + { + "epoch": 0.3691478262161048, + "grad_norm": 0.3399409353733063, + "learning_rate": 1.3995916194330872e-05, + "loss": 0.1793, + "step": 19884 + }, + { + "epoch": 0.36918495635352344, + "grad_norm": 0.4142495095729828, + "learning_rate": 1.3994846853015425e-05, + "loss": 0.3761, + "step": 19886 + }, + { + "epoch": 0.3692220864909421, + "grad_norm": 0.4311040937900543, + "learning_rate": 1.399377745734208e-05, + "loss": 0.3731, + "step": 19888 + }, + { + "epoch": 0.36925921662836075, + "grad_norm": 0.4256608486175537, + "learning_rate": 1.3992708007325384e-05, + "loss": 0.3338, + "step": 19890 + }, + { + "epoch": 0.3692963467657794, + "grad_norm": 0.28841111063957214, + "learning_rate": 1.3991638502979891e-05, + "loss": 0.1511, + "step": 19892 + }, + { + "epoch": 0.369333476903198, + "grad_norm": 0.4195404648780823, + "learning_rate": 1.399056894432016e-05, + "loss": 0.2248, + "step": 19894 + }, + { + "epoch": 0.36937060704061664, + "grad_norm": 0.5865377187728882, + "learning_rate": 1.3989499331360733e-05, + "loss": 0.396, + "step": 19896 + }, + { + "epoch": 0.3694077371780353, + "grad_norm": 0.7562296390533447, + "learning_rate": 1.3988429664116175e-05, + "loss": 0.2078, + "step": 19898 + }, + { + "epoch": 0.36944486731545395, + "grad_norm": 0.3007420301437378, + "learning_rate": 1.3987359942601032e-05, + "loss": 0.3334, + "step": 19900 + }, + { + "epoch": 0.3694819974528726, + "grad_norm": 0.5938670635223389, + "learning_rate": 1.3986290166829866e-05, + "loss": 0.3465, + "step": 19902 + }, + { + "epoch": 0.3695191275902912, + "grad_norm": 0.31614843010902405, + "learning_rate": 1.3985220336817226e-05, + "loss": 0.495, + "step": 19904 + }, + { + "epoch": 0.36955625772770984, + "grad_norm": 0.3564547896385193, + "learning_rate": 1.398415045257768e-05, + "loss": 0.2107, + "step": 19906 + }, + { + "epoch": 0.36959338786512846, + "grad_norm": 0.42998653650283813, + "learning_rate": 1.398308051412578e-05, + "loss": 0.3464, + "step": 19908 + }, + { + "epoch": 0.36963051800254715, + "grad_norm": 0.5240596532821655, + "learning_rate": 1.3982010521476083e-05, + "loss": 0.5494, + "step": 19910 + }, + { + "epoch": 0.3696676481399658, + "grad_norm": 0.35242682695388794, + "learning_rate": 1.398094047464315e-05, + "loss": 0.2738, + "step": 19912 + }, + { + "epoch": 0.3697047782773844, + "grad_norm": 0.3632034957408905, + "learning_rate": 1.3979870373641543e-05, + "loss": 0.3553, + "step": 19914 + }, + { + "epoch": 0.36974190841480303, + "grad_norm": 0.29925113916397095, + "learning_rate": 1.3978800218485819e-05, + "loss": 0.2721, + "step": 19916 + }, + { + "epoch": 0.36977903855222166, + "grad_norm": 0.42727649211883545, + "learning_rate": 1.3977730009190547e-05, + "loss": 0.2008, + "step": 19918 + }, + { + "epoch": 0.36981616868964035, + "grad_norm": 0.38272160291671753, + "learning_rate": 1.397665974577028e-05, + "loss": 0.4324, + "step": 19920 + }, + { + "epoch": 0.369853298827059, + "grad_norm": 0.37463170289993286, + "learning_rate": 1.3975589428239587e-05, + "loss": 0.3564, + "step": 19922 + }, + { + "epoch": 0.3698904289644776, + "grad_norm": 0.34132835268974304, + "learning_rate": 1.3974519056613028e-05, + "loss": 0.5391, + "step": 19924 + }, + { + "epoch": 0.36992755910189623, + "grad_norm": 0.49981725215911865, + "learning_rate": 1.3973448630905171e-05, + "loss": 0.4044, + "step": 19926 + }, + { + "epoch": 0.36996468923931486, + "grad_norm": 0.32303696870803833, + "learning_rate": 1.397237815113058e-05, + "loss": 0.1147, + "step": 19928 + }, + { + "epoch": 0.3700018193767335, + "grad_norm": 0.37099358439445496, + "learning_rate": 1.3971307617303823e-05, + "loss": 0.3839, + "step": 19930 + }, + { + "epoch": 0.37003894951415217, + "grad_norm": 0.4158245921134949, + "learning_rate": 1.3970237029439462e-05, + "loss": 0.4518, + "step": 19932 + }, + { + "epoch": 0.3700760796515708, + "grad_norm": 0.3985375761985779, + "learning_rate": 1.3969166387552067e-05, + "loss": 0.1662, + "step": 19934 + }, + { + "epoch": 0.37011320978898943, + "grad_norm": 0.38583165407180786, + "learning_rate": 1.3968095691656207e-05, + "loss": 0.2297, + "step": 19936 + }, + { + "epoch": 0.37015033992640806, + "grad_norm": 0.46508556604385376, + "learning_rate": 1.3967024941766451e-05, + "loss": 0.3123, + "step": 19938 + }, + { + "epoch": 0.3701874700638267, + "grad_norm": 0.34358328580856323, + "learning_rate": 1.3965954137897368e-05, + "loss": 0.4212, + "step": 19940 + }, + { + "epoch": 0.37022460020124537, + "grad_norm": 0.40049877762794495, + "learning_rate": 1.396488328006353e-05, + "loss": 0.2406, + "step": 19942 + }, + { + "epoch": 0.370261730338664, + "grad_norm": 0.6300816535949707, + "learning_rate": 1.3963812368279506e-05, + "loss": 0.393, + "step": 19944 + }, + { + "epoch": 0.3702988604760826, + "grad_norm": 0.3964472711086273, + "learning_rate": 1.3962741402559867e-05, + "loss": 0.2183, + "step": 19946 + }, + { + "epoch": 0.37033599061350125, + "grad_norm": 0.3769698143005371, + "learning_rate": 1.3961670382919187e-05, + "loss": 0.1766, + "step": 19948 + }, + { + "epoch": 0.3703731207509199, + "grad_norm": 0.4179772436618805, + "learning_rate": 1.3960599309372042e-05, + "loss": 0.3964, + "step": 19950 + }, + { + "epoch": 0.3704102508883385, + "grad_norm": 0.23776397109031677, + "learning_rate": 1.3959528181933005e-05, + "loss": 0.1912, + "step": 19952 + }, + { + "epoch": 0.3704473810257572, + "grad_norm": 0.39710840582847595, + "learning_rate": 1.3958457000616646e-05, + "loss": 0.2873, + "step": 19954 + }, + { + "epoch": 0.3704845111631758, + "grad_norm": 0.3445592522621155, + "learning_rate": 1.3957385765437544e-05, + "loss": 0.3484, + "step": 19956 + }, + { + "epoch": 0.37052164130059445, + "grad_norm": 0.3431375026702881, + "learning_rate": 1.3956314476410278e-05, + "loss": 0.403, + "step": 19958 + }, + { + "epoch": 0.3705587714380131, + "grad_norm": 0.3629648983478546, + "learning_rate": 1.395524313354942e-05, + "loss": 0.5092, + "step": 19960 + }, + { + "epoch": 0.3705959015754317, + "grad_norm": 0.4141274094581604, + "learning_rate": 1.3954171736869553e-05, + "loss": 0.1268, + "step": 19962 + }, + { + "epoch": 0.3706330317128504, + "grad_norm": 0.22249390184879303, + "learning_rate": 1.3953100286385253e-05, + "loss": 0.2202, + "step": 19964 + }, + { + "epoch": 0.370670161850269, + "grad_norm": 0.3651902377605438, + "learning_rate": 1.3952028782111099e-05, + "loss": 0.2827, + "step": 19966 + }, + { + "epoch": 0.37070729198768765, + "grad_norm": 0.2630471885204315, + "learning_rate": 1.3950957224061668e-05, + "loss": 0.2429, + "step": 19968 + }, + { + "epoch": 0.3707444221251063, + "grad_norm": 0.43547508120536804, + "learning_rate": 1.3949885612251546e-05, + "loss": 0.1774, + "step": 19970 + }, + { + "epoch": 0.3707815522625249, + "grad_norm": 0.44554635882377625, + "learning_rate": 1.3948813946695311e-05, + "loss": 0.2937, + "step": 19972 + }, + { + "epoch": 0.3708186823999436, + "grad_norm": 0.49214935302734375, + "learning_rate": 1.3947742227407545e-05, + "loss": 0.2204, + "step": 19974 + }, + { + "epoch": 0.3708558125373622, + "grad_norm": 0.6007041335105896, + "learning_rate": 1.3946670454402838e-05, + "loss": 0.2867, + "step": 19976 + }, + { + "epoch": 0.37089294267478085, + "grad_norm": 0.31248021125793457, + "learning_rate": 1.394559862769576e-05, + "loss": 0.3555, + "step": 19978 + }, + { + "epoch": 0.3709300728121995, + "grad_norm": 0.5294929146766663, + "learning_rate": 1.3944526747300906e-05, + "loss": 0.3467, + "step": 19980 + }, + { + "epoch": 0.3709672029496181, + "grad_norm": 0.27503687143325806, + "learning_rate": 1.3943454813232862e-05, + "loss": 0.204, + "step": 19982 + }, + { + "epoch": 0.37100433308703673, + "grad_norm": 0.2617643475532532, + "learning_rate": 1.3942382825506206e-05, + "loss": 0.187, + "step": 19984 + }, + { + "epoch": 0.3710414632244554, + "grad_norm": 0.4061446189880371, + "learning_rate": 1.3941310784135529e-05, + "loss": 0.2201, + "step": 19986 + }, + { + "epoch": 0.37107859336187404, + "grad_norm": 0.5333889126777649, + "learning_rate": 1.394023868913542e-05, + "loss": 0.3019, + "step": 19988 + }, + { + "epoch": 0.37111572349929267, + "grad_norm": 0.3337761461734772, + "learning_rate": 1.393916654052046e-05, + "loss": 0.158, + "step": 19990 + }, + { + "epoch": 0.3711528536367113, + "grad_norm": 0.45581570267677307, + "learning_rate": 1.3938094338305249e-05, + "loss": 0.2493, + "step": 19992 + }, + { + "epoch": 0.37118998377412993, + "grad_norm": 0.3279041051864624, + "learning_rate": 1.3937022082504364e-05, + "loss": 0.4578, + "step": 19994 + }, + { + "epoch": 0.3712271139115486, + "grad_norm": 0.3062148094177246, + "learning_rate": 1.3935949773132405e-05, + "loss": 0.3448, + "step": 19996 + }, + { + "epoch": 0.37126424404896724, + "grad_norm": 0.39469510316848755, + "learning_rate": 1.3934877410203958e-05, + "loss": 0.357, + "step": 19998 + }, + { + "epoch": 0.37130137418638587, + "grad_norm": 0.321353018283844, + "learning_rate": 1.3933804993733615e-05, + "loss": 0.3805, + "step": 20000 + }, + { + "epoch": 0.3713385043238045, + "grad_norm": 0.35121700167655945, + "learning_rate": 1.393273252373597e-05, + "loss": 0.2556, + "step": 20002 + }, + { + "epoch": 0.3713756344612231, + "grad_norm": 0.43175145983695984, + "learning_rate": 1.3931660000225615e-05, + "loss": 0.2147, + "step": 20004 + }, + { + "epoch": 0.37141276459864175, + "grad_norm": 0.35854315757751465, + "learning_rate": 1.3930587423217144e-05, + "loss": 0.2244, + "step": 20006 + }, + { + "epoch": 0.37144989473606044, + "grad_norm": 0.3761281967163086, + "learning_rate": 1.3929514792725155e-05, + "loss": 0.2454, + "step": 20008 + }, + { + "epoch": 0.37148702487347907, + "grad_norm": 0.2423001378774643, + "learning_rate": 1.3928442108764233e-05, + "loss": 0.4407, + "step": 20010 + }, + { + "epoch": 0.3715241550108977, + "grad_norm": 0.43786919116973877, + "learning_rate": 1.3927369371348985e-05, + "loss": 0.3069, + "step": 20012 + }, + { + "epoch": 0.3715612851483163, + "grad_norm": 0.28782787919044495, + "learning_rate": 1.3926296580494003e-05, + "loss": 0.2496, + "step": 20014 + }, + { + "epoch": 0.37159841528573495, + "grad_norm": 0.46189549565315247, + "learning_rate": 1.3925223736213888e-05, + "loss": 0.2131, + "step": 20016 + }, + { + "epoch": 0.37163554542315363, + "grad_norm": 0.6770983338356018, + "learning_rate": 1.3924150838523232e-05, + "loss": 0.4071, + "step": 20018 + }, + { + "epoch": 0.37167267556057226, + "grad_norm": 0.3633163571357727, + "learning_rate": 1.392307788743664e-05, + "loss": 0.3694, + "step": 20020 + }, + { + "epoch": 0.3717098056979909, + "grad_norm": 0.3335234522819519, + "learning_rate": 1.3922004882968705e-05, + "loss": 0.2469, + "step": 20022 + }, + { + "epoch": 0.3717469358354095, + "grad_norm": 0.3177517056465149, + "learning_rate": 1.3920931825134034e-05, + "loss": 0.3049, + "step": 20024 + }, + { + "epoch": 0.37178406597282815, + "grad_norm": 0.3574046492576599, + "learning_rate": 1.3919858713947228e-05, + "loss": 0.235, + "step": 20026 + }, + { + "epoch": 0.3718211961102468, + "grad_norm": 0.3220933973789215, + "learning_rate": 1.391878554942288e-05, + "loss": 0.2809, + "step": 20028 + }, + { + "epoch": 0.37185832624766546, + "grad_norm": 0.5346874594688416, + "learning_rate": 1.3917712331575601e-05, + "loss": 0.2479, + "step": 20030 + }, + { + "epoch": 0.3718954563850841, + "grad_norm": 0.37695932388305664, + "learning_rate": 1.3916639060419993e-05, + "loss": 0.3018, + "step": 20032 + }, + { + "epoch": 0.3719325865225027, + "grad_norm": 0.5514830946922302, + "learning_rate": 1.391556573597066e-05, + "loss": 0.4107, + "step": 20034 + }, + { + "epoch": 0.37196971665992135, + "grad_norm": 0.3860941231250763, + "learning_rate": 1.3914492358242206e-05, + "loss": 0.2678, + "step": 20036 + }, + { + "epoch": 0.37200684679734, + "grad_norm": 0.40493252873420715, + "learning_rate": 1.3913418927249233e-05, + "loss": 0.2871, + "step": 20038 + }, + { + "epoch": 0.37204397693475866, + "grad_norm": 0.2914874255657196, + "learning_rate": 1.3912345443006355e-05, + "loss": 0.2694, + "step": 20040 + }, + { + "epoch": 0.3720811070721773, + "grad_norm": 0.40273770689964294, + "learning_rate": 1.391127190552817e-05, + "loss": 0.1674, + "step": 20042 + }, + { + "epoch": 0.3721182372095959, + "grad_norm": 0.2518525719642639, + "learning_rate": 1.391019831482929e-05, + "loss": 0.2432, + "step": 20044 + }, + { + "epoch": 0.37215536734701454, + "grad_norm": 0.35817602276802063, + "learning_rate": 1.390912467092433e-05, + "loss": 0.3608, + "step": 20046 + }, + { + "epoch": 0.37219249748443317, + "grad_norm": 0.682751476764679, + "learning_rate": 1.3908050973827887e-05, + "loss": 0.182, + "step": 20048 + }, + { + "epoch": 0.37222962762185186, + "grad_norm": 0.526157557964325, + "learning_rate": 1.3906977223554576e-05, + "loss": 0.2158, + "step": 20050 + }, + { + "epoch": 0.3722667577592705, + "grad_norm": 0.29240548610687256, + "learning_rate": 1.390590342011901e-05, + "loss": 0.666, + "step": 20052 + }, + { + "epoch": 0.3723038878966891, + "grad_norm": 0.5400998592376709, + "learning_rate": 1.3904829563535796e-05, + "loss": 0.2926, + "step": 20054 + }, + { + "epoch": 0.37234101803410774, + "grad_norm": 0.4679791033267975, + "learning_rate": 1.3903755653819549e-05, + "loss": 0.2421, + "step": 20056 + }, + { + "epoch": 0.37237814817152637, + "grad_norm": 0.32286354899406433, + "learning_rate": 1.390268169098488e-05, + "loss": 0.2445, + "step": 20058 + }, + { + "epoch": 0.372415278308945, + "grad_norm": 0.3165193498134613, + "learning_rate": 1.3901607675046402e-05, + "loss": 0.1307, + "step": 20060 + }, + { + "epoch": 0.3724524084463637, + "grad_norm": 0.6171788573265076, + "learning_rate": 1.3900533606018732e-05, + "loss": 0.3297, + "step": 20062 + }, + { + "epoch": 0.3724895385837823, + "grad_norm": 0.4425636827945709, + "learning_rate": 1.3899459483916481e-05, + "loss": 0.3031, + "step": 20064 + }, + { + "epoch": 0.37252666872120094, + "grad_norm": 0.4312133193016052, + "learning_rate": 1.389838530875427e-05, + "loss": 0.2747, + "step": 20066 + }, + { + "epoch": 0.37256379885861957, + "grad_norm": 0.3306850790977478, + "learning_rate": 1.389731108054671e-05, + "loss": 0.1685, + "step": 20068 + }, + { + "epoch": 0.3726009289960382, + "grad_norm": 0.3475620448589325, + "learning_rate": 1.3896236799308422e-05, + "loss": 0.3834, + "step": 20070 + }, + { + "epoch": 0.3726380591334569, + "grad_norm": 0.30715104937553406, + "learning_rate": 1.3895162465054019e-05, + "loss": 0.1464, + "step": 20072 + }, + { + "epoch": 0.3726751892708755, + "grad_norm": 0.40902286767959595, + "learning_rate": 1.389408807779812e-05, + "loss": 0.1452, + "step": 20074 + }, + { + "epoch": 0.37271231940829413, + "grad_norm": 0.5330752730369568, + "learning_rate": 1.3893013637555353e-05, + "loss": 0.5111, + "step": 20076 + }, + { + "epoch": 0.37274944954571276, + "grad_norm": 0.2899560332298279, + "learning_rate": 1.3891939144340328e-05, + "loss": 0.1988, + "step": 20078 + }, + { + "epoch": 0.3727865796831314, + "grad_norm": 0.7345959544181824, + "learning_rate": 1.3890864598167668e-05, + "loss": 0.4382, + "step": 20080 + }, + { + "epoch": 0.37282370982055, + "grad_norm": 0.31923699378967285, + "learning_rate": 1.3889789999051995e-05, + "loss": 0.1323, + "step": 20082 + }, + { + "epoch": 0.3728608399579687, + "grad_norm": 0.3731330633163452, + "learning_rate": 1.3888715347007932e-05, + "loss": 0.2733, + "step": 20084 + }, + { + "epoch": 0.37289797009538733, + "grad_norm": 0.2001037448644638, + "learning_rate": 1.3887640642050102e-05, + "loss": 0.189, + "step": 20086 + }, + { + "epoch": 0.37293510023280596, + "grad_norm": 0.3716852366924286, + "learning_rate": 1.3886565884193129e-05, + "loss": 0.4738, + "step": 20088 + }, + { + "epoch": 0.3729722303702246, + "grad_norm": 0.3844122886657715, + "learning_rate": 1.3885491073451634e-05, + "loss": 0.3485, + "step": 20090 + }, + { + "epoch": 0.3730093605076432, + "grad_norm": 0.29949280619621277, + "learning_rate": 1.3884416209840243e-05, + "loss": 0.215, + "step": 20092 + }, + { + "epoch": 0.3730464906450619, + "grad_norm": 0.35048598051071167, + "learning_rate": 1.3883341293373582e-05, + "loss": 0.3716, + "step": 20094 + }, + { + "epoch": 0.37308362078248053, + "grad_norm": 0.4095684289932251, + "learning_rate": 1.3882266324066282e-05, + "loss": 0.2034, + "step": 20096 + }, + { + "epoch": 0.37312075091989916, + "grad_norm": 0.3948003351688385, + "learning_rate": 1.3881191301932963e-05, + "loss": 0.3831, + "step": 20098 + }, + { + "epoch": 0.3731578810573178, + "grad_norm": 0.38309329748153687, + "learning_rate": 1.3880116226988258e-05, + "loss": 0.5074, + "step": 20100 + }, + { + "epoch": 0.3731950111947364, + "grad_norm": 0.3737712502479553, + "learning_rate": 1.3879041099246792e-05, + "loss": 0.4418, + "step": 20102 + }, + { + "epoch": 0.37323214133215504, + "grad_norm": 0.5642034411430359, + "learning_rate": 1.3877965918723194e-05, + "loss": 0.3944, + "step": 20104 + }, + { + "epoch": 0.3732692714695737, + "grad_norm": 0.3231869637966156, + "learning_rate": 1.3876890685432099e-05, + "loss": 0.3836, + "step": 20106 + }, + { + "epoch": 0.37330640160699236, + "grad_norm": 0.39459705352783203, + "learning_rate": 1.3875815399388133e-05, + "loss": 0.2053, + "step": 20108 + }, + { + "epoch": 0.373343531744411, + "grad_norm": 0.322160929441452, + "learning_rate": 1.3874740060605931e-05, + "loss": 0.2356, + "step": 20110 + }, + { + "epoch": 0.3733806618818296, + "grad_norm": 0.3258999288082123, + "learning_rate": 1.3873664669100118e-05, + "loss": 0.2407, + "step": 20112 + }, + { + "epoch": 0.37341779201924824, + "grad_norm": 0.5043696165084839, + "learning_rate": 1.3872589224885335e-05, + "loss": 0.4399, + "step": 20114 + }, + { + "epoch": 0.3734549221566669, + "grad_norm": 0.4689836800098419, + "learning_rate": 1.3871513727976212e-05, + "loss": 0.3366, + "step": 20116 + }, + { + "epoch": 0.37349205229408555, + "grad_norm": 0.4764638841152191, + "learning_rate": 1.3870438178387382e-05, + "loss": 0.1361, + "step": 20118 + }, + { + "epoch": 0.3735291824315042, + "grad_norm": 0.305999219417572, + "learning_rate": 1.3869362576133485e-05, + "loss": 0.1727, + "step": 20120 + }, + { + "epoch": 0.3735663125689228, + "grad_norm": 0.3126106262207031, + "learning_rate": 1.3868286921229153e-05, + "loss": 0.3255, + "step": 20122 + }, + { + "epoch": 0.37360344270634144, + "grad_norm": 0.3261148929595947, + "learning_rate": 1.386721121368902e-05, + "loss": 0.4469, + "step": 20124 + }, + { + "epoch": 0.3736405728437601, + "grad_norm": 0.30270540714263916, + "learning_rate": 1.3866135453527727e-05, + "loss": 0.3831, + "step": 20126 + }, + { + "epoch": 0.37367770298117875, + "grad_norm": 0.42626285552978516, + "learning_rate": 1.3865059640759913e-05, + "loss": 0.324, + "step": 20128 + }, + { + "epoch": 0.3737148331185974, + "grad_norm": 0.4067237377166748, + "learning_rate": 1.3863983775400214e-05, + "loss": 0.1985, + "step": 20130 + }, + { + "epoch": 0.373751963256016, + "grad_norm": 0.39006122946739197, + "learning_rate": 1.3862907857463268e-05, + "loss": 0.4717, + "step": 20132 + }, + { + "epoch": 0.37378909339343463, + "grad_norm": 0.5019491314888, + "learning_rate": 1.3861831886963718e-05, + "loss": 0.4423, + "step": 20134 + }, + { + "epoch": 0.37382622353085326, + "grad_norm": 0.4504082500934601, + "learning_rate": 1.3860755863916203e-05, + "loss": 0.1617, + "step": 20136 + }, + { + "epoch": 0.37386335366827195, + "grad_norm": 0.4010758697986603, + "learning_rate": 1.3859679788335363e-05, + "loss": 0.3667, + "step": 20138 + }, + { + "epoch": 0.3739004838056906, + "grad_norm": 0.4239271879196167, + "learning_rate": 1.3858603660235846e-05, + "loss": 0.2497, + "step": 20140 + }, + { + "epoch": 0.3739376139431092, + "grad_norm": 0.3606035113334656, + "learning_rate": 1.3857527479632289e-05, + "loss": 0.4436, + "step": 20142 + }, + { + "epoch": 0.37397474408052783, + "grad_norm": 0.5415440797805786, + "learning_rate": 1.3856451246539337e-05, + "loss": 0.2846, + "step": 20144 + }, + { + "epoch": 0.37401187421794646, + "grad_norm": 0.5108171701431274, + "learning_rate": 1.3855374960971637e-05, + "loss": 0.3918, + "step": 20146 + }, + { + "epoch": 0.37404900435536514, + "grad_norm": 0.387134850025177, + "learning_rate": 1.3854298622943832e-05, + "loss": 0.3604, + "step": 20148 + }, + { + "epoch": 0.3740861344927838, + "grad_norm": 0.3296907842159271, + "learning_rate": 1.3853222232470564e-05, + "loss": 0.3141, + "step": 20150 + }, + { + "epoch": 0.3741232646302024, + "grad_norm": 0.3946489691734314, + "learning_rate": 1.3852145789566487e-05, + "loss": 0.3296, + "step": 20152 + }, + { + "epoch": 0.37416039476762103, + "grad_norm": 0.269846647977829, + "learning_rate": 1.3851069294246244e-05, + "loss": 0.1606, + "step": 20154 + }, + { + "epoch": 0.37419752490503966, + "grad_norm": 0.3285703957080841, + "learning_rate": 1.3849992746524481e-05, + "loss": 0.201, + "step": 20156 + }, + { + "epoch": 0.3742346550424583, + "grad_norm": 0.40791597962379456, + "learning_rate": 1.384891614641585e-05, + "loss": 0.3791, + "step": 20158 + }, + { + "epoch": 0.37427178517987697, + "grad_norm": 0.38663923740386963, + "learning_rate": 1.3847839493935e-05, + "loss": 0.1598, + "step": 20160 + }, + { + "epoch": 0.3743089153172956, + "grad_norm": 0.2794301211833954, + "learning_rate": 1.384676278909658e-05, + "loss": 0.4217, + "step": 20162 + }, + { + "epoch": 0.3743460454547142, + "grad_norm": 0.40352746844291687, + "learning_rate": 1.3845686031915238e-05, + "loss": 0.2545, + "step": 20164 + }, + { + "epoch": 0.37438317559213286, + "grad_norm": 0.514489471912384, + "learning_rate": 1.3844609222405632e-05, + "loss": 0.1664, + "step": 20166 + }, + { + "epoch": 0.3744203057295515, + "grad_norm": 0.6553933620452881, + "learning_rate": 1.3843532360582408e-05, + "loss": 0.2509, + "step": 20168 + }, + { + "epoch": 0.37445743586697017, + "grad_norm": 0.40633782744407654, + "learning_rate": 1.3842455446460221e-05, + "loss": 0.5374, + "step": 20170 + }, + { + "epoch": 0.3744945660043888, + "grad_norm": 0.49291539192199707, + "learning_rate": 1.3841378480053726e-05, + "loss": 0.2023, + "step": 20172 + }, + { + "epoch": 0.3745316961418074, + "grad_norm": 0.3370891511440277, + "learning_rate": 1.3840301461377577e-05, + "loss": 0.3819, + "step": 20174 + }, + { + "epoch": 0.37456882627922605, + "grad_norm": 0.4726948142051697, + "learning_rate": 1.3839224390446425e-05, + "loss": 0.3378, + "step": 20176 + }, + { + "epoch": 0.3746059564166447, + "grad_norm": 0.33347535133361816, + "learning_rate": 1.3838147267274935e-05, + "loss": 0.2956, + "step": 20178 + }, + { + "epoch": 0.3746430865540633, + "grad_norm": 0.4049592912197113, + "learning_rate": 1.3837070091877752e-05, + "loss": 0.3076, + "step": 20180 + }, + { + "epoch": 0.374680216691482, + "grad_norm": 0.23513221740722656, + "learning_rate": 1.3835992864269538e-05, + "loss": 0.4452, + "step": 20182 + }, + { + "epoch": 0.3747173468289006, + "grad_norm": 0.49878957867622375, + "learning_rate": 1.3834915584464956e-05, + "loss": 0.3168, + "step": 20184 + }, + { + "epoch": 0.37475447696631925, + "grad_norm": 0.27951204776763916, + "learning_rate": 1.3833838252478655e-05, + "loss": 0.2515, + "step": 20186 + }, + { + "epoch": 0.3747916071037379, + "grad_norm": 0.4702208638191223, + "learning_rate": 1.3832760868325302e-05, + "loss": 0.2061, + "step": 20188 + }, + { + "epoch": 0.3748287372411565, + "grad_norm": 0.27690836787223816, + "learning_rate": 1.3831683432019553e-05, + "loss": 0.108, + "step": 20190 + }, + { + "epoch": 0.3748658673785752, + "grad_norm": 0.3577194809913635, + "learning_rate": 1.3830605943576068e-05, + "loss": 0.2822, + "step": 20192 + }, + { + "epoch": 0.3749029975159938, + "grad_norm": 0.46531084179878235, + "learning_rate": 1.3829528403009513e-05, + "loss": 0.301, + "step": 20194 + }, + { + "epoch": 0.37494012765341245, + "grad_norm": 0.31811726093292236, + "learning_rate": 1.3828450810334547e-05, + "loss": 0.3618, + "step": 20196 + }, + { + "epoch": 0.3749772577908311, + "grad_norm": 0.34373581409454346, + "learning_rate": 1.3827373165565829e-05, + "loss": 0.388, + "step": 20198 + }, + { + "epoch": 0.3750143879282497, + "grad_norm": 0.399286687374115, + "learning_rate": 1.3826295468718029e-05, + "loss": 0.1814, + "step": 20200 + }, + { + "epoch": 0.3750515180656684, + "grad_norm": 0.42931726574897766, + "learning_rate": 1.382521771980581e-05, + "loss": 0.2449, + "step": 20202 + }, + { + "epoch": 0.375088648203087, + "grad_norm": 0.2964993417263031, + "learning_rate": 1.3824139918843835e-05, + "loss": 0.2523, + "step": 20204 + }, + { + "epoch": 0.37512577834050564, + "grad_norm": 0.3761836290359497, + "learning_rate": 1.382306206584677e-05, + "loss": 0.4636, + "step": 20206 + }, + { + "epoch": 0.3751629084779243, + "grad_norm": 0.47489526867866516, + "learning_rate": 1.3821984160829282e-05, + "loss": 0.3403, + "step": 20208 + }, + { + "epoch": 0.3752000386153429, + "grad_norm": 0.42662665247917175, + "learning_rate": 1.3820906203806039e-05, + "loss": 0.1792, + "step": 20210 + }, + { + "epoch": 0.37523716875276153, + "grad_norm": 0.5387255549430847, + "learning_rate": 1.3819828194791705e-05, + "loss": 0.3561, + "step": 20212 + }, + { + "epoch": 0.3752742988901802, + "grad_norm": 0.36297503113746643, + "learning_rate": 1.3818750133800952e-05, + "loss": 0.2866, + "step": 20214 + }, + { + "epoch": 0.37531142902759884, + "grad_norm": 0.26776930689811707, + "learning_rate": 1.3817672020848449e-05, + "loss": 0.5039, + "step": 20216 + }, + { + "epoch": 0.37534855916501747, + "grad_norm": 0.3598172664642334, + "learning_rate": 1.3816593855948863e-05, + "loss": 0.2748, + "step": 20218 + }, + { + "epoch": 0.3753856893024361, + "grad_norm": 0.40361225605010986, + "learning_rate": 1.3815515639116868e-05, + "loss": 0.1978, + "step": 20220 + }, + { + "epoch": 0.3754228194398547, + "grad_norm": 0.49906542897224426, + "learning_rate": 1.3814437370367135e-05, + "loss": 0.2688, + "step": 20222 + }, + { + "epoch": 0.3754599495772734, + "grad_norm": 0.19722506403923035, + "learning_rate": 1.3813359049714332e-05, + "loss": 0.3045, + "step": 20224 + }, + { + "epoch": 0.37549707971469204, + "grad_norm": 0.21416231989860535, + "learning_rate": 1.3812280677173138e-05, + "loss": 0.1203, + "step": 20226 + }, + { + "epoch": 0.37553420985211067, + "grad_norm": 0.29404380917549133, + "learning_rate": 1.3811202252758223e-05, + "loss": 0.382, + "step": 20228 + }, + { + "epoch": 0.3755713399895293, + "grad_norm": 0.4771401286125183, + "learning_rate": 1.3810123776484259e-05, + "loss": 0.3273, + "step": 20230 + }, + { + "epoch": 0.3756084701269479, + "grad_norm": 0.43984588980674744, + "learning_rate": 1.3809045248365923e-05, + "loss": 0.1622, + "step": 20232 + }, + { + "epoch": 0.37564560026436655, + "grad_norm": 0.404715359210968, + "learning_rate": 1.3807966668417891e-05, + "loss": 0.2863, + "step": 20234 + }, + { + "epoch": 0.37568273040178524, + "grad_norm": 0.4546372592449188, + "learning_rate": 1.380688803665484e-05, + "loss": 0.2462, + "step": 20236 + }, + { + "epoch": 0.37571986053920386, + "grad_norm": 0.36051028966903687, + "learning_rate": 1.3805809353091446e-05, + "loss": 0.2272, + "step": 20238 + }, + { + "epoch": 0.3757569906766225, + "grad_norm": 0.3537476658821106, + "learning_rate": 1.3804730617742386e-05, + "loss": 0.1723, + "step": 20240 + }, + { + "epoch": 0.3757941208140411, + "grad_norm": 0.36529824137687683, + "learning_rate": 1.3803651830622338e-05, + "loss": 0.5659, + "step": 20242 + }, + { + "epoch": 0.37583125095145975, + "grad_norm": 0.40430253744125366, + "learning_rate": 1.3802572991745979e-05, + "loss": 0.3682, + "step": 20244 + }, + { + "epoch": 0.37586838108887843, + "grad_norm": 0.39774322509765625, + "learning_rate": 1.3801494101127997e-05, + "loss": 0.2397, + "step": 20246 + }, + { + "epoch": 0.37590551122629706, + "grad_norm": 0.3177061378955841, + "learning_rate": 1.3800415158783065e-05, + "loss": 0.3003, + "step": 20248 + }, + { + "epoch": 0.3759426413637157, + "grad_norm": 0.35512226819992065, + "learning_rate": 1.3799336164725865e-05, + "loss": 0.3467, + "step": 20250 + }, + { + "epoch": 0.3759797715011343, + "grad_norm": 0.470333069562912, + "learning_rate": 1.3798257118971079e-05, + "loss": 0.2541, + "step": 20252 + }, + { + "epoch": 0.37601690163855295, + "grad_norm": 0.433196485042572, + "learning_rate": 1.3797178021533396e-05, + "loss": 0.3809, + "step": 20254 + }, + { + "epoch": 0.3760540317759716, + "grad_norm": 0.3638652563095093, + "learning_rate": 1.3796098872427487e-05, + "loss": 0.3122, + "step": 20256 + }, + { + "epoch": 0.37609116191339026, + "grad_norm": 0.2668880522251129, + "learning_rate": 1.3795019671668048e-05, + "loss": 0.1882, + "step": 20258 + }, + { + "epoch": 0.3761282920508089, + "grad_norm": 0.29094910621643066, + "learning_rate": 1.379394041926976e-05, + "loss": 0.2471, + "step": 20260 + }, + { + "epoch": 0.3761654221882275, + "grad_norm": 0.3126886785030365, + "learning_rate": 1.3792861115247304e-05, + "loss": 0.2921, + "step": 20262 + }, + { + "epoch": 0.37620255232564614, + "grad_norm": 0.4343261122703552, + "learning_rate": 1.3791781759615368e-05, + "loss": 0.4161, + "step": 20264 + }, + { + "epoch": 0.3762396824630648, + "grad_norm": 0.3110836446285248, + "learning_rate": 1.3790702352388642e-05, + "loss": 0.3998, + "step": 20266 + }, + { + "epoch": 0.37627681260048346, + "grad_norm": 0.38669630885124207, + "learning_rate": 1.3789622893581812e-05, + "loss": 0.2528, + "step": 20268 + }, + { + "epoch": 0.3763139427379021, + "grad_norm": 0.25040751695632935, + "learning_rate": 1.3788543383209565e-05, + "loss": 0.3041, + "step": 20270 + }, + { + "epoch": 0.3763510728753207, + "grad_norm": 0.43754398822784424, + "learning_rate": 1.3787463821286592e-05, + "loss": 0.2961, + "step": 20272 + }, + { + "epoch": 0.37638820301273934, + "grad_norm": 0.3090064525604248, + "learning_rate": 1.3786384207827577e-05, + "loss": 0.3027, + "step": 20274 + }, + { + "epoch": 0.37642533315015797, + "grad_norm": 0.4928251802921295, + "learning_rate": 1.3785304542847215e-05, + "loss": 0.3496, + "step": 20276 + }, + { + "epoch": 0.37646246328757665, + "grad_norm": 0.5487385988235474, + "learning_rate": 1.3784224826360202e-05, + "loss": 0.3784, + "step": 20278 + }, + { + "epoch": 0.3764995934249953, + "grad_norm": 0.49109455943107605, + "learning_rate": 1.3783145058381219e-05, + "loss": 0.2521, + "step": 20280 + }, + { + "epoch": 0.3765367235624139, + "grad_norm": 0.37386277318000793, + "learning_rate": 1.3782065238924966e-05, + "loss": 0.2755, + "step": 20282 + }, + { + "epoch": 0.37657385369983254, + "grad_norm": 0.4417116641998291, + "learning_rate": 1.378098536800613e-05, + "loss": 0.2784, + "step": 20284 + }, + { + "epoch": 0.37661098383725117, + "grad_norm": 0.3929435610771179, + "learning_rate": 1.3779905445639414e-05, + "loss": 0.1653, + "step": 20286 + }, + { + "epoch": 0.3766481139746698, + "grad_norm": 0.44289231300354004, + "learning_rate": 1.3778825471839503e-05, + "loss": 0.4669, + "step": 20288 + }, + { + "epoch": 0.3766852441120885, + "grad_norm": 0.4003215730190277, + "learning_rate": 1.3777745446621098e-05, + "loss": 0.3566, + "step": 20290 + }, + { + "epoch": 0.3767223742495071, + "grad_norm": 0.31513309478759766, + "learning_rate": 1.3776665369998895e-05, + "loss": 0.3176, + "step": 20292 + }, + { + "epoch": 0.37675950438692574, + "grad_norm": 0.3404492437839508, + "learning_rate": 1.3775585241987584e-05, + "loss": 0.2161, + "step": 20294 + }, + { + "epoch": 0.37679663452434436, + "grad_norm": 0.31017041206359863, + "learning_rate": 1.3774505062601868e-05, + "loss": 0.3265, + "step": 20296 + }, + { + "epoch": 0.376833764661763, + "grad_norm": 0.39869487285614014, + "learning_rate": 1.3773424831856448e-05, + "loss": 0.329, + "step": 20298 + }, + { + "epoch": 0.3768708947991817, + "grad_norm": 0.3715631663799286, + "learning_rate": 1.3772344549766017e-05, + "loss": 0.3852, + "step": 20300 + }, + { + "epoch": 0.3769080249366003, + "grad_norm": 0.3697710931301117, + "learning_rate": 1.3771264216345276e-05, + "loss": 0.2902, + "step": 20302 + }, + { + "epoch": 0.37694515507401893, + "grad_norm": 0.4216092526912689, + "learning_rate": 1.3770183831608925e-05, + "loss": 0.1894, + "step": 20304 + }, + { + "epoch": 0.37698228521143756, + "grad_norm": 0.5225966572761536, + "learning_rate": 1.3769103395571664e-05, + "loss": 0.2304, + "step": 20306 + }, + { + "epoch": 0.3770194153488562, + "grad_norm": 0.2896948754787445, + "learning_rate": 1.3768022908248196e-05, + "loss": 0.2495, + "step": 20308 + }, + { + "epoch": 0.3770565454862748, + "grad_norm": 0.49638232588768005, + "learning_rate": 1.3766942369653222e-05, + "loss": 0.2503, + "step": 20310 + }, + { + "epoch": 0.3770936756236935, + "grad_norm": 0.23471103608608246, + "learning_rate": 1.3765861779801448e-05, + "loss": 0.3026, + "step": 20312 + }, + { + "epoch": 0.37713080576111213, + "grad_norm": 0.36043858528137207, + "learning_rate": 1.3764781138707573e-05, + "loss": 0.2585, + "step": 20314 + }, + { + "epoch": 0.37716793589853076, + "grad_norm": 0.6257306337356567, + "learning_rate": 1.3763700446386306e-05, + "loss": 0.1778, + "step": 20316 + }, + { + "epoch": 0.3772050660359494, + "grad_norm": 0.253715842962265, + "learning_rate": 1.3762619702852347e-05, + "loss": 0.2372, + "step": 20318 + }, + { + "epoch": 0.377242196173368, + "grad_norm": 0.38302934169769287, + "learning_rate": 1.3761538908120404e-05, + "loss": 0.3603, + "step": 20320 + }, + { + "epoch": 0.3772793263107867, + "grad_norm": 0.3651839792728424, + "learning_rate": 1.3760458062205185e-05, + "loss": 0.4339, + "step": 20322 + }, + { + "epoch": 0.37731645644820533, + "grad_norm": 0.3702917695045471, + "learning_rate": 1.3759377165121397e-05, + "loss": 0.2648, + "step": 20324 + }, + { + "epoch": 0.37735358658562396, + "grad_norm": 0.36105290055274963, + "learning_rate": 1.3758296216883744e-05, + "loss": 0.4399, + "step": 20326 + }, + { + "epoch": 0.3773907167230426, + "grad_norm": 0.3261259198188782, + "learning_rate": 1.3757215217506936e-05, + "loss": 0.2304, + "step": 20328 + }, + { + "epoch": 0.3774278468604612, + "grad_norm": 0.3453252911567688, + "learning_rate": 1.3756134167005686e-05, + "loss": 0.2952, + "step": 20330 + }, + { + "epoch": 0.37746497699787984, + "grad_norm": 0.3901367485523224, + "learning_rate": 1.37550530653947e-05, + "loss": 0.4554, + "step": 20332 + }, + { + "epoch": 0.3775021071352985, + "grad_norm": 0.29137834906578064, + "learning_rate": 1.3753971912688688e-05, + "loss": 0.3514, + "step": 20334 + }, + { + "epoch": 0.37753923727271715, + "grad_norm": 0.3398778438568115, + "learning_rate": 1.3752890708902366e-05, + "loss": 0.3795, + "step": 20336 + }, + { + "epoch": 0.3775763674101358, + "grad_norm": 0.39813128113746643, + "learning_rate": 1.375180945405044e-05, + "loss": 0.4359, + "step": 20338 + }, + { + "epoch": 0.3776134975475544, + "grad_norm": 0.46451589465141296, + "learning_rate": 1.3750728148147625e-05, + "loss": 0.3998, + "step": 20340 + }, + { + "epoch": 0.37765062768497304, + "grad_norm": 0.5691365003585815, + "learning_rate": 1.3749646791208635e-05, + "loss": 0.2756, + "step": 20342 + }, + { + "epoch": 0.3776877578223917, + "grad_norm": 0.30361229181289673, + "learning_rate": 1.3748565383248187e-05, + "loss": 0.2064, + "step": 20344 + }, + { + "epoch": 0.37772488795981035, + "grad_norm": 0.43304821848869324, + "learning_rate": 1.3747483924280989e-05, + "loss": 0.3243, + "step": 20346 + }, + { + "epoch": 0.377762018097229, + "grad_norm": 0.49435749650001526, + "learning_rate": 1.374640241432176e-05, + "loss": 0.2634, + "step": 20348 + }, + { + "epoch": 0.3777991482346476, + "grad_norm": 0.4735288918018341, + "learning_rate": 1.3745320853385215e-05, + "loss": 0.2771, + "step": 20350 + }, + { + "epoch": 0.37783627837206624, + "grad_norm": 0.4788779616355896, + "learning_rate": 1.3744239241486072e-05, + "loss": 0.3688, + "step": 20352 + }, + { + "epoch": 0.3778734085094849, + "grad_norm": 0.40775758028030396, + "learning_rate": 1.3743157578639053e-05, + "loss": 0.4317, + "step": 20354 + }, + { + "epoch": 0.37791053864690355, + "grad_norm": 0.3843857944011688, + "learning_rate": 1.3742075864858866e-05, + "loss": 0.3096, + "step": 20356 + }, + { + "epoch": 0.3779476687843222, + "grad_norm": 0.3461295962333679, + "learning_rate": 1.3740994100160236e-05, + "loss": 0.3152, + "step": 20358 + }, + { + "epoch": 0.3779847989217408, + "grad_norm": 0.35579144954681396, + "learning_rate": 1.3739912284557884e-05, + "loss": 0.2204, + "step": 20360 + }, + { + "epoch": 0.37802192905915943, + "grad_norm": 0.3175497353076935, + "learning_rate": 1.3738830418066526e-05, + "loss": 0.3939, + "step": 20362 + }, + { + "epoch": 0.37805905919657806, + "grad_norm": 0.4204776883125305, + "learning_rate": 1.3737748500700888e-05, + "loss": 0.2641, + "step": 20364 + }, + { + "epoch": 0.37809618933399675, + "grad_norm": 0.3607107400894165, + "learning_rate": 1.3736666532475684e-05, + "loss": 0.3211, + "step": 20366 + }, + { + "epoch": 0.3781333194714154, + "grad_norm": 0.4271625578403473, + "learning_rate": 1.3735584513405647e-05, + "loss": 0.1706, + "step": 20368 + }, + { + "epoch": 0.378170449608834, + "grad_norm": 0.18619593977928162, + "learning_rate": 1.3734502443505488e-05, + "loss": 0.3222, + "step": 20370 + }, + { + "epoch": 0.37820757974625263, + "grad_norm": 0.4362223744392395, + "learning_rate": 1.3733420322789939e-05, + "loss": 0.284, + "step": 20372 + }, + { + "epoch": 0.37824470988367126, + "grad_norm": 0.4553815424442291, + "learning_rate": 1.3732338151273723e-05, + "loss": 0.3589, + "step": 20374 + }, + { + "epoch": 0.37828184002108994, + "grad_norm": 0.4997227191925049, + "learning_rate": 1.3731255928971567e-05, + "loss": 0.2397, + "step": 20376 + }, + { + "epoch": 0.37831897015850857, + "grad_norm": 0.3272298276424408, + "learning_rate": 1.3730173655898191e-05, + "loss": 0.3295, + "step": 20378 + }, + { + "epoch": 0.3783561002959272, + "grad_norm": 0.3228127360343933, + "learning_rate": 1.3729091332068327e-05, + "loss": 0.3993, + "step": 20380 + }, + { + "epoch": 0.37839323043334583, + "grad_norm": 0.35237592458724976, + "learning_rate": 1.3728008957496697e-05, + "loss": 0.3585, + "step": 20382 + }, + { + "epoch": 0.37843036057076446, + "grad_norm": 0.5042340755462646, + "learning_rate": 1.3726926532198035e-05, + "loss": 0.2242, + "step": 20384 + }, + { + "epoch": 0.3784674907081831, + "grad_norm": 0.4173370897769928, + "learning_rate": 1.3725844056187065e-05, + "loss": 0.3806, + "step": 20386 + }, + { + "epoch": 0.37850462084560177, + "grad_norm": 0.49067965149879456, + "learning_rate": 1.3724761529478516e-05, + "loss": 0.2187, + "step": 20388 + }, + { + "epoch": 0.3785417509830204, + "grad_norm": 0.3630199432373047, + "learning_rate": 1.3723678952087118e-05, + "loss": 0.1061, + "step": 20390 + }, + { + "epoch": 0.378578881120439, + "grad_norm": 0.42760494351387024, + "learning_rate": 1.3722596324027608e-05, + "loss": 0.5486, + "step": 20392 + }, + { + "epoch": 0.37861601125785765, + "grad_norm": 0.9868090152740479, + "learning_rate": 1.372151364531471e-05, + "loss": 0.295, + "step": 20394 + }, + { + "epoch": 0.3786531413952763, + "grad_norm": 0.3924976885318756, + "learning_rate": 1.3720430915963159e-05, + "loss": 0.2392, + "step": 20396 + }, + { + "epoch": 0.37869027153269497, + "grad_norm": 0.6594648361206055, + "learning_rate": 1.371934813598769e-05, + "loss": 0.2606, + "step": 20398 + }, + { + "epoch": 0.3787274016701136, + "grad_norm": 0.3596559762954712, + "learning_rate": 1.371826530540303e-05, + "loss": 0.1531, + "step": 20400 + }, + { + "epoch": 0.3787645318075322, + "grad_norm": 0.4754466116428375, + "learning_rate": 1.3717182424223914e-05, + "loss": 0.2637, + "step": 20402 + }, + { + "epoch": 0.37880166194495085, + "grad_norm": 0.5291134119033813, + "learning_rate": 1.3716099492465086e-05, + "loss": 0.2715, + "step": 20404 + }, + { + "epoch": 0.3788387920823695, + "grad_norm": 0.3805255591869354, + "learning_rate": 1.3715016510141268e-05, + "loss": 0.214, + "step": 20406 + }, + { + "epoch": 0.3788759222197881, + "grad_norm": 0.2791435420513153, + "learning_rate": 1.3713933477267207e-05, + "loss": 0.384, + "step": 20408 + }, + { + "epoch": 0.3789130523572068, + "grad_norm": 0.3975648880004883, + "learning_rate": 1.3712850393857636e-05, + "loss": 0.3435, + "step": 20410 + }, + { + "epoch": 0.3789501824946254, + "grad_norm": 0.49346694350242615, + "learning_rate": 1.3711767259927294e-05, + "loss": 0.3415, + "step": 20412 + }, + { + "epoch": 0.37898731263204405, + "grad_norm": 0.5182379484176636, + "learning_rate": 1.3710684075490914e-05, + "loss": 0.2778, + "step": 20414 + }, + { + "epoch": 0.3790244427694627, + "grad_norm": 0.3699735999107361, + "learning_rate": 1.370960084056324e-05, + "loss": 0.3075, + "step": 20416 + }, + { + "epoch": 0.3790615729068813, + "grad_norm": 0.3503396213054657, + "learning_rate": 1.3708517555159011e-05, + "loss": 0.27, + "step": 20418 + }, + { + "epoch": 0.3790987030443, + "grad_norm": 0.3092592656612396, + "learning_rate": 1.3707434219292966e-05, + "loss": 0.3359, + "step": 20420 + }, + { + "epoch": 0.3791358331817186, + "grad_norm": 0.3436076045036316, + "learning_rate": 1.3706350832979845e-05, + "loss": 0.3574, + "step": 20422 + }, + { + "epoch": 0.37917296331913725, + "grad_norm": 0.42176553606987, + "learning_rate": 1.3705267396234393e-05, + "loss": 0.422, + "step": 20424 + }, + { + "epoch": 0.3792100934565559, + "grad_norm": 0.48506563901901245, + "learning_rate": 1.3704183909071346e-05, + "loss": 0.2037, + "step": 20426 + }, + { + "epoch": 0.3792472235939745, + "grad_norm": 0.3283042311668396, + "learning_rate": 1.3703100371505458e-05, + "loss": 0.3238, + "step": 20428 + }, + { + "epoch": 0.3792843537313932, + "grad_norm": 0.5023981928825378, + "learning_rate": 1.3702016783551464e-05, + "loss": 0.294, + "step": 20430 + }, + { + "epoch": 0.3793214838688118, + "grad_norm": 0.4818968176841736, + "learning_rate": 1.3700933145224107e-05, + "loss": 0.3597, + "step": 20432 + }, + { + "epoch": 0.37935861400623044, + "grad_norm": 0.2782365679740906, + "learning_rate": 1.3699849456538137e-05, + "loss": 0.2668, + "step": 20434 + }, + { + "epoch": 0.37939574414364907, + "grad_norm": 0.4432222843170166, + "learning_rate": 1.36987657175083e-05, + "loss": 0.3965, + "step": 20436 + }, + { + "epoch": 0.3794328742810677, + "grad_norm": 0.3050204813480377, + "learning_rate": 1.369768192814934e-05, + "loss": 0.2497, + "step": 20438 + }, + { + "epoch": 0.37947000441848633, + "grad_norm": 0.2594740092754364, + "learning_rate": 1.3696598088476003e-05, + "loss": 0.3189, + "step": 20440 + }, + { + "epoch": 0.379507134555905, + "grad_norm": 0.3082748055458069, + "learning_rate": 1.369551419850304e-05, + "loss": 0.2091, + "step": 20442 + }, + { + "epoch": 0.37954426469332364, + "grad_norm": 0.23749855160713196, + "learning_rate": 1.3694430258245197e-05, + "loss": 0.1367, + "step": 20444 + }, + { + "epoch": 0.37958139483074227, + "grad_norm": 0.4700714945793152, + "learning_rate": 1.3693346267717225e-05, + "loss": 0.3756, + "step": 20446 + }, + { + "epoch": 0.3796185249681609, + "grad_norm": 0.3161366581916809, + "learning_rate": 1.3692262226933874e-05, + "loss": 0.3495, + "step": 20448 + }, + { + "epoch": 0.3796556551055795, + "grad_norm": 0.5001155138015747, + "learning_rate": 1.3691178135909896e-05, + "loss": 0.2445, + "step": 20450 + }, + { + "epoch": 0.3796927852429982, + "grad_norm": 0.38339486718177795, + "learning_rate": 1.3690093994660035e-05, + "loss": 0.1631, + "step": 20452 + }, + { + "epoch": 0.37972991538041684, + "grad_norm": 0.3934367001056671, + "learning_rate": 1.3689009803199047e-05, + "loss": 0.1345, + "step": 20454 + }, + { + "epoch": 0.37976704551783547, + "grad_norm": 0.29895511269569397, + "learning_rate": 1.368792556154169e-05, + "loss": 0.1783, + "step": 20456 + }, + { + "epoch": 0.3798041756552541, + "grad_norm": 0.45116275548934937, + "learning_rate": 1.3686841269702708e-05, + "loss": 0.1813, + "step": 20458 + }, + { + "epoch": 0.3798413057926727, + "grad_norm": 0.4946790337562561, + "learning_rate": 1.3685756927696865e-05, + "loss": 0.2913, + "step": 20460 + }, + { + "epoch": 0.37987843593009135, + "grad_norm": 0.28817740082740784, + "learning_rate": 1.368467253553891e-05, + "loss": 0.258, + "step": 20462 + }, + { + "epoch": 0.37991556606751004, + "grad_norm": 0.4325798451900482, + "learning_rate": 1.3683588093243594e-05, + "loss": 0.2569, + "step": 20464 + }, + { + "epoch": 0.37995269620492866, + "grad_norm": 0.4484389126300812, + "learning_rate": 1.368250360082568e-05, + "loss": 0.4341, + "step": 20466 + }, + { + "epoch": 0.3799898263423473, + "grad_norm": 0.315391480922699, + "learning_rate": 1.3681419058299925e-05, + "loss": 0.3907, + "step": 20468 + }, + { + "epoch": 0.3800269564797659, + "grad_norm": 0.648390531539917, + "learning_rate": 1.3680334465681082e-05, + "loss": 0.1124, + "step": 20470 + }, + { + "epoch": 0.38006408661718455, + "grad_norm": 0.3056327998638153, + "learning_rate": 1.3679249822983908e-05, + "loss": 0.3437, + "step": 20472 + }, + { + "epoch": 0.38010121675460323, + "grad_norm": 0.32072412967681885, + "learning_rate": 1.3678165130223169e-05, + "loss": 0.3675, + "step": 20474 + }, + { + "epoch": 0.38013834689202186, + "grad_norm": 0.3778831958770752, + "learning_rate": 1.3677080387413617e-05, + "loss": 0.244, + "step": 20476 + }, + { + "epoch": 0.3801754770294405, + "grad_norm": 0.31670743227005005, + "learning_rate": 1.3675995594570016e-05, + "loss": 0.3022, + "step": 20478 + }, + { + "epoch": 0.3802126071668591, + "grad_norm": 0.25902318954467773, + "learning_rate": 1.3674910751707125e-05, + "loss": 0.3219, + "step": 20480 + }, + { + "epoch": 0.38024973730427775, + "grad_norm": 0.35113245248794556, + "learning_rate": 1.3673825858839708e-05, + "loss": 0.3814, + "step": 20482 + }, + { + "epoch": 0.3802868674416964, + "grad_norm": 0.35644057393074036, + "learning_rate": 1.3672740915982523e-05, + "loss": 0.2453, + "step": 20484 + }, + { + "epoch": 0.38032399757911506, + "grad_norm": 0.320034384727478, + "learning_rate": 1.3671655923150338e-05, + "loss": 0.3547, + "step": 20486 + }, + { + "epoch": 0.3803611277165337, + "grad_norm": 0.7678956985473633, + "learning_rate": 1.3670570880357912e-05, + "loss": 0.1925, + "step": 20488 + }, + { + "epoch": 0.3803982578539523, + "grad_norm": 0.28292229771614075, + "learning_rate": 1.3669485787620009e-05, + "loss": 0.2242, + "step": 20490 + }, + { + "epoch": 0.38043538799137094, + "grad_norm": 0.28078579902648926, + "learning_rate": 1.3668400644951399e-05, + "loss": 0.4978, + "step": 20492 + }, + { + "epoch": 0.38047251812878957, + "grad_norm": 0.4316692054271698, + "learning_rate": 1.3667315452366844e-05, + "loss": 0.3052, + "step": 20494 + }, + { + "epoch": 0.38050964826620826, + "grad_norm": 0.4917985796928406, + "learning_rate": 1.366623020988111e-05, + "loss": 0.4659, + "step": 20496 + }, + { + "epoch": 0.3805467784036269, + "grad_norm": 0.4542140066623688, + "learning_rate": 1.3665144917508963e-05, + "loss": 0.2941, + "step": 20498 + }, + { + "epoch": 0.3805839085410455, + "grad_norm": 0.4768338203430176, + "learning_rate": 1.3664059575265175e-05, + "loss": 0.1233, + "step": 20500 + }, + { + "epoch": 0.38062103867846414, + "grad_norm": 0.3003799319267273, + "learning_rate": 1.3662974183164512e-05, + "loss": 0.1661, + "step": 20502 + }, + { + "epoch": 0.38065816881588277, + "grad_norm": 0.4999409317970276, + "learning_rate": 1.3661888741221738e-05, + "loss": 0.4353, + "step": 20504 + }, + { + "epoch": 0.38069529895330145, + "grad_norm": 0.2663573622703552, + "learning_rate": 1.3660803249451631e-05, + "loss": 0.4115, + "step": 20506 + }, + { + "epoch": 0.3807324290907201, + "grad_norm": 1.525360107421875, + "learning_rate": 1.3659717707868953e-05, + "loss": 0.3512, + "step": 20508 + }, + { + "epoch": 0.3807695592281387, + "grad_norm": 0.4079039692878723, + "learning_rate": 1.365863211648848e-05, + "loss": 0.2369, + "step": 20510 + }, + { + "epoch": 0.38080668936555734, + "grad_norm": 0.3989945948123932, + "learning_rate": 1.3657546475324985e-05, + "loss": 0.4508, + "step": 20512 + }, + { + "epoch": 0.38084381950297597, + "grad_norm": 0.36864134669303894, + "learning_rate": 1.3656460784393239e-05, + "loss": 0.4803, + "step": 20514 + }, + { + "epoch": 0.3808809496403946, + "grad_norm": 0.4028772711753845, + "learning_rate": 1.365537504370801e-05, + "loss": 0.1342, + "step": 20516 + }, + { + "epoch": 0.3809180797778133, + "grad_norm": 0.35606274008750916, + "learning_rate": 1.365428925328408e-05, + "loss": 0.4585, + "step": 20518 + }, + { + "epoch": 0.3809552099152319, + "grad_norm": 0.2720849812030792, + "learning_rate": 1.3653203413136215e-05, + "loss": 0.3143, + "step": 20520 + }, + { + "epoch": 0.38099234005265054, + "grad_norm": 0.45939990878105164, + "learning_rate": 1.3652117523279198e-05, + "loss": 0.3574, + "step": 20522 + }, + { + "epoch": 0.38102947019006916, + "grad_norm": 0.37470516562461853, + "learning_rate": 1.36510315837278e-05, + "loss": 0.1504, + "step": 20524 + }, + { + "epoch": 0.3810666003274878, + "grad_norm": 0.3238503336906433, + "learning_rate": 1.3649945594496799e-05, + "loss": 0.2712, + "step": 20526 + }, + { + "epoch": 0.3811037304649065, + "grad_norm": 0.2754843533039093, + "learning_rate": 1.3648859555600965e-05, + "loss": 0.3999, + "step": 20528 + }, + { + "epoch": 0.3811408606023251, + "grad_norm": 0.36991745233535767, + "learning_rate": 1.364777346705509e-05, + "loss": 0.5372, + "step": 20530 + }, + { + "epoch": 0.38117799073974373, + "grad_norm": 0.43377766013145447, + "learning_rate": 1.364668732887394e-05, + "loss": 0.2779, + "step": 20532 + }, + { + "epoch": 0.38121512087716236, + "grad_norm": 0.37827566266059875, + "learning_rate": 1.36456011410723e-05, + "loss": 0.2897, + "step": 20534 + }, + { + "epoch": 0.381252251014581, + "grad_norm": 0.3209867477416992, + "learning_rate": 1.3644514903664948e-05, + "loss": 0.213, + "step": 20536 + }, + { + "epoch": 0.3812893811519996, + "grad_norm": 0.7252829074859619, + "learning_rate": 1.3643428616666666e-05, + "loss": 0.425, + "step": 20538 + }, + { + "epoch": 0.3813265112894183, + "grad_norm": 0.3354499042034149, + "learning_rate": 1.3642342280092232e-05, + "loss": 0.3663, + "step": 20540 + }, + { + "epoch": 0.38136364142683693, + "grad_norm": 0.4506750702857971, + "learning_rate": 1.3641255893956429e-05, + "loss": 0.3269, + "step": 20542 + }, + { + "epoch": 0.38140077156425556, + "grad_norm": 0.4305640161037445, + "learning_rate": 1.3640169458274043e-05, + "loss": 0.506, + "step": 20544 + }, + { + "epoch": 0.3814379017016742, + "grad_norm": 0.5406296253204346, + "learning_rate": 1.3639082973059853e-05, + "loss": 0.4419, + "step": 20546 + }, + { + "epoch": 0.3814750318390928, + "grad_norm": 0.6192275881767273, + "learning_rate": 1.3637996438328642e-05, + "loss": 0.3148, + "step": 20548 + }, + { + "epoch": 0.3815121619765115, + "grad_norm": 0.34689196944236755, + "learning_rate": 1.3636909854095199e-05, + "loss": 0.3298, + "step": 20550 + }, + { + "epoch": 0.3815492921139301, + "grad_norm": 0.5247117877006531, + "learning_rate": 1.3635823220374307e-05, + "loss": 0.2452, + "step": 20552 + }, + { + "epoch": 0.38158642225134876, + "grad_norm": 0.31703129410743713, + "learning_rate": 1.363473653718075e-05, + "loss": 0.0497, + "step": 20554 + }, + { + "epoch": 0.3816235523887674, + "grad_norm": 0.4014911353588104, + "learning_rate": 1.3633649804529319e-05, + "loss": 0.2181, + "step": 20556 + }, + { + "epoch": 0.381660682526186, + "grad_norm": 0.3600418269634247, + "learning_rate": 1.3632563022434795e-05, + "loss": 0.2458, + "step": 20558 + }, + { + "epoch": 0.38169781266360464, + "grad_norm": 0.3815743327140808, + "learning_rate": 1.3631476190911967e-05, + "loss": 0.3458, + "step": 20560 + }, + { + "epoch": 0.3817349428010233, + "grad_norm": 0.2056141048669815, + "learning_rate": 1.363038930997563e-05, + "loss": 0.3166, + "step": 20562 + }, + { + "epoch": 0.38177207293844195, + "grad_norm": 0.4001248776912689, + "learning_rate": 1.3629302379640566e-05, + "loss": 0.4189, + "step": 20564 + }, + { + "epoch": 0.3818092030758606, + "grad_norm": 0.46026602387428284, + "learning_rate": 1.362821539992157e-05, + "loss": 0.2763, + "step": 20566 + }, + { + "epoch": 0.3818463332132792, + "grad_norm": 0.3800026774406433, + "learning_rate": 1.3627128370833428e-05, + "loss": 0.2214, + "step": 20568 + }, + { + "epoch": 0.38188346335069784, + "grad_norm": 0.382589727640152, + "learning_rate": 1.3626041292390935e-05, + "loss": 0.291, + "step": 20570 + }, + { + "epoch": 0.3819205934881165, + "grad_norm": 0.5096129179000854, + "learning_rate": 1.362495416460888e-05, + "loss": 0.2788, + "step": 20572 + }, + { + "epoch": 0.38195772362553515, + "grad_norm": 0.4187592566013336, + "learning_rate": 1.3623866987502058e-05, + "loss": 0.3177, + "step": 20574 + }, + { + "epoch": 0.3819948537629538, + "grad_norm": 0.47894489765167236, + "learning_rate": 1.3622779761085261e-05, + "loss": 0.3229, + "step": 20576 + }, + { + "epoch": 0.3820319839003724, + "grad_norm": 0.49744951725006104, + "learning_rate": 1.362169248537328e-05, + "loss": 0.3892, + "step": 20578 + }, + { + "epoch": 0.38206911403779104, + "grad_norm": 0.49890074133872986, + "learning_rate": 1.3620605160380915e-05, + "loss": 0.1438, + "step": 20580 + }, + { + "epoch": 0.3821062441752097, + "grad_norm": 0.5018547773361206, + "learning_rate": 1.361951778612296e-05, + "loss": 0.3175, + "step": 20582 + }, + { + "epoch": 0.38214337431262835, + "grad_norm": 0.3734203577041626, + "learning_rate": 1.3618430362614207e-05, + "loss": 0.3407, + "step": 20584 + }, + { + "epoch": 0.382180504450047, + "grad_norm": 0.33536097407341003, + "learning_rate": 1.3617342889869457e-05, + "loss": 0.283, + "step": 20586 + }, + { + "epoch": 0.3822176345874656, + "grad_norm": 0.5764023065567017, + "learning_rate": 1.3616255367903506e-05, + "loss": 0.3569, + "step": 20588 + }, + { + "epoch": 0.38225476472488423, + "grad_norm": 0.26459458470344543, + "learning_rate": 1.361516779673115e-05, + "loss": 0.3067, + "step": 20590 + }, + { + "epoch": 0.38229189486230286, + "grad_norm": 0.4419151246547699, + "learning_rate": 1.3614080176367189e-05, + "loss": 0.3166, + "step": 20592 + }, + { + "epoch": 0.38232902499972155, + "grad_norm": 0.34582841396331787, + "learning_rate": 1.3612992506826421e-05, + "loss": 0.298, + "step": 20594 + }, + { + "epoch": 0.3823661551371402, + "grad_norm": 0.34223949909210205, + "learning_rate": 1.3611904788123649e-05, + "loss": 0.4859, + "step": 20596 + }, + { + "epoch": 0.3824032852745588, + "grad_norm": 0.3498494625091553, + "learning_rate": 1.3610817020273673e-05, + "loss": 0.1676, + "step": 20598 + }, + { + "epoch": 0.38244041541197743, + "grad_norm": 0.3764305114746094, + "learning_rate": 1.3609729203291293e-05, + "loss": 0.3048, + "step": 20600 + }, + { + "epoch": 0.38247754554939606, + "grad_norm": 0.4367479681968689, + "learning_rate": 1.3608641337191308e-05, + "loss": 0.2758, + "step": 20602 + }, + { + "epoch": 0.38251467568681474, + "grad_norm": 0.6940366625785828, + "learning_rate": 1.3607553421988522e-05, + "loss": 0.2847, + "step": 20604 + }, + { + "epoch": 0.38255180582423337, + "grad_norm": 0.28144943714141846, + "learning_rate": 1.3606465457697745e-05, + "loss": 0.3064, + "step": 20606 + }, + { + "epoch": 0.382588935961652, + "grad_norm": 0.4265226125717163, + "learning_rate": 1.3605377444333774e-05, + "loss": 0.1447, + "step": 20608 + }, + { + "epoch": 0.3826260660990706, + "grad_norm": 0.48617100715637207, + "learning_rate": 1.3604289381911412e-05, + "loss": 0.2759, + "step": 20610 + }, + { + "epoch": 0.38266319623648926, + "grad_norm": 0.537524402141571, + "learning_rate": 1.3603201270445472e-05, + "loss": 0.4904, + "step": 20612 + }, + { + "epoch": 0.3827003263739079, + "grad_norm": 0.43786805868148804, + "learning_rate": 1.3602113109950754e-05, + "loss": 0.2519, + "step": 20614 + }, + { + "epoch": 0.38273745651132657, + "grad_norm": 0.3110393285751343, + "learning_rate": 1.3601024900442066e-05, + "loss": 0.3612, + "step": 20616 + }, + { + "epoch": 0.3827745866487452, + "grad_norm": 0.4043773114681244, + "learning_rate": 1.3599936641934216e-05, + "loss": 0.3865, + "step": 20618 + }, + { + "epoch": 0.3828117167861638, + "grad_norm": 0.47072672843933105, + "learning_rate": 1.3598848334442012e-05, + "loss": 0.186, + "step": 20620 + }, + { + "epoch": 0.38284884692358245, + "grad_norm": 0.2767280340194702, + "learning_rate": 1.3597759977980258e-05, + "loss": 0.3127, + "step": 20622 + }, + { + "epoch": 0.3828859770610011, + "grad_norm": 0.35014182329177856, + "learning_rate": 1.359667157256377e-05, + "loss": 0.2374, + "step": 20624 + }, + { + "epoch": 0.38292310719841977, + "grad_norm": 0.5060983896255493, + "learning_rate": 1.3595583118207357e-05, + "loss": 0.4177, + "step": 20626 + }, + { + "epoch": 0.3829602373358384, + "grad_norm": 0.3250192701816559, + "learning_rate": 1.3594494614925823e-05, + "loss": 0.4588, + "step": 20628 + }, + { + "epoch": 0.382997367473257, + "grad_norm": 0.4417460262775421, + "learning_rate": 1.359340606273399e-05, + "loss": 0.4208, + "step": 20630 + }, + { + "epoch": 0.38303449761067565, + "grad_norm": 0.626419186592102, + "learning_rate": 1.359231746164666e-05, + "loss": 0.2677, + "step": 20632 + }, + { + "epoch": 0.3830716277480943, + "grad_norm": 0.3920382261276245, + "learning_rate": 1.3591228811678652e-05, + "loss": 0.2439, + "step": 20634 + }, + { + "epoch": 0.3831087578855129, + "grad_norm": 0.40673375129699707, + "learning_rate": 1.3590140112844772e-05, + "loss": 0.3038, + "step": 20636 + }, + { + "epoch": 0.3831458880229316, + "grad_norm": 0.3109780550003052, + "learning_rate": 1.3589051365159845e-05, + "loss": 0.2835, + "step": 20638 + }, + { + "epoch": 0.3831830181603502, + "grad_norm": 0.4205111563205719, + "learning_rate": 1.3587962568638678e-05, + "loss": 0.3475, + "step": 20640 + }, + { + "epoch": 0.38322014829776885, + "grad_norm": 0.3837061822414398, + "learning_rate": 1.3586873723296084e-05, + "loss": 0.5226, + "step": 20642 + }, + { + "epoch": 0.3832572784351875, + "grad_norm": 0.3452624976634979, + "learning_rate": 1.3585784829146887e-05, + "loss": 0.3252, + "step": 20644 + }, + { + "epoch": 0.3832944085726061, + "grad_norm": 0.2979359030723572, + "learning_rate": 1.3584695886205894e-05, + "loss": 0.3098, + "step": 20646 + }, + { + "epoch": 0.3833315387100248, + "grad_norm": 0.3637213110923767, + "learning_rate": 1.358360689448793e-05, + "loss": 0.3067, + "step": 20648 + }, + { + "epoch": 0.3833686688474434, + "grad_norm": 0.34143176674842834, + "learning_rate": 1.358251785400781e-05, + "loss": 0.274, + "step": 20650 + }, + { + "epoch": 0.38340579898486205, + "grad_norm": 0.4378577768802643, + "learning_rate": 1.3581428764780356e-05, + "loss": 0.4916, + "step": 20652 + }, + { + "epoch": 0.3834429291222807, + "grad_norm": 0.39235880970954895, + "learning_rate": 1.358033962682038e-05, + "loss": 0.2964, + "step": 20654 + }, + { + "epoch": 0.3834800592596993, + "grad_norm": 0.3207988440990448, + "learning_rate": 1.3579250440142708e-05, + "loss": 0.3585, + "step": 20656 + }, + { + "epoch": 0.383517189397118, + "grad_norm": 0.3703027367591858, + "learning_rate": 1.357816120476216e-05, + "loss": 0.2508, + "step": 20658 + }, + { + "epoch": 0.3835543195345366, + "grad_norm": 0.44120365381240845, + "learning_rate": 1.3577071920693555e-05, + "loss": 0.2725, + "step": 20660 + }, + { + "epoch": 0.38359144967195524, + "grad_norm": 0.36774852871894836, + "learning_rate": 1.3575982587951712e-05, + "loss": 0.2028, + "step": 20662 + }, + { + "epoch": 0.38362857980937387, + "grad_norm": 0.5156617760658264, + "learning_rate": 1.3574893206551463e-05, + "loss": 0.1843, + "step": 20664 + }, + { + "epoch": 0.3836657099467925, + "grad_norm": 0.36022359132766724, + "learning_rate": 1.357380377650762e-05, + "loss": 0.4465, + "step": 20666 + }, + { + "epoch": 0.3837028400842111, + "grad_norm": 0.3445912301540375, + "learning_rate": 1.3572714297835015e-05, + "loss": 0.1971, + "step": 20668 + }, + { + "epoch": 0.3837399702216298, + "grad_norm": 0.3245997130870819, + "learning_rate": 1.357162477054847e-05, + "loss": 0.2748, + "step": 20670 + }, + { + "epoch": 0.38377710035904844, + "grad_norm": 0.2201475352048874, + "learning_rate": 1.3570535194662812e-05, + "loss": 0.3352, + "step": 20672 + }, + { + "epoch": 0.38381423049646707, + "grad_norm": 0.24271944165229797, + "learning_rate": 1.3569445570192863e-05, + "loss": 0.2269, + "step": 20674 + }, + { + "epoch": 0.3838513606338857, + "grad_norm": 0.28391996026039124, + "learning_rate": 1.3568355897153451e-05, + "loss": 0.1698, + "step": 20676 + }, + { + "epoch": 0.3838884907713043, + "grad_norm": 0.5780355334281921, + "learning_rate": 1.3567266175559403e-05, + "loss": 0.3052, + "step": 20678 + }, + { + "epoch": 0.383925620908723, + "grad_norm": 0.3626141846179962, + "learning_rate": 1.3566176405425549e-05, + "loss": 0.3149, + "step": 20680 + }, + { + "epoch": 0.38396275104614164, + "grad_norm": 0.3780469298362732, + "learning_rate": 1.3565086586766716e-05, + "loss": 0.4655, + "step": 20682 + }, + { + "epoch": 0.38399988118356027, + "grad_norm": 0.351436048746109, + "learning_rate": 1.3563996719597735e-05, + "loss": 0.387, + "step": 20684 + }, + { + "epoch": 0.3840370113209789, + "grad_norm": 0.5046899914741516, + "learning_rate": 1.356290680393343e-05, + "loss": 0.1605, + "step": 20686 + }, + { + "epoch": 0.3840741414583975, + "grad_norm": 0.4094959497451782, + "learning_rate": 1.356181683978864e-05, + "loss": 0.3558, + "step": 20688 + }, + { + "epoch": 0.38411127159581615, + "grad_norm": 0.5119794011116028, + "learning_rate": 1.3560726827178188e-05, + "loss": 0.2913, + "step": 20690 + }, + { + "epoch": 0.38414840173323483, + "grad_norm": 0.6190775632858276, + "learning_rate": 1.3559636766116913e-05, + "loss": 0.2832, + "step": 20692 + }, + { + "epoch": 0.38418553187065346, + "grad_norm": 0.3524041175842285, + "learning_rate": 1.3558546656619642e-05, + "loss": 0.3811, + "step": 20694 + }, + { + "epoch": 0.3842226620080721, + "grad_norm": 0.4896256625652313, + "learning_rate": 1.3557456498701208e-05, + "loss": 0.1852, + "step": 20696 + }, + { + "epoch": 0.3842597921454907, + "grad_norm": 0.40523743629455566, + "learning_rate": 1.3556366292376449e-05, + "loss": 0.2495, + "step": 20698 + }, + { + "epoch": 0.38429692228290935, + "grad_norm": 0.38215065002441406, + "learning_rate": 1.3555276037660194e-05, + "loss": 0.2913, + "step": 20700 + }, + { + "epoch": 0.38433405242032803, + "grad_norm": 0.36358264088630676, + "learning_rate": 1.3554185734567287e-05, + "loss": 0.263, + "step": 20702 + }, + { + "epoch": 0.38437118255774666, + "grad_norm": 0.3265949487686157, + "learning_rate": 1.3553095383112559e-05, + "loss": 0.4035, + "step": 20704 + }, + { + "epoch": 0.3844083126951653, + "grad_norm": 0.36384525895118713, + "learning_rate": 1.355200498331084e-05, + "loss": 0.398, + "step": 20706 + }, + { + "epoch": 0.3844454428325839, + "grad_norm": 0.37665513157844543, + "learning_rate": 1.3550914535176976e-05, + "loss": 0.2416, + "step": 20708 + }, + { + "epoch": 0.38448257297000255, + "grad_norm": 0.31690457463264465, + "learning_rate": 1.3549824038725798e-05, + "loss": 0.3089, + "step": 20710 + }, + { + "epoch": 0.3845197031074212, + "grad_norm": 0.336553692817688, + "learning_rate": 1.3548733493972149e-05, + "loss": 0.3224, + "step": 20712 + }, + { + "epoch": 0.38455683324483986, + "grad_norm": 0.41726386547088623, + "learning_rate": 1.354764290093087e-05, + "loss": 0.4078, + "step": 20714 + }, + { + "epoch": 0.3845939633822585, + "grad_norm": 0.31295037269592285, + "learning_rate": 1.3546552259616796e-05, + "loss": 0.3534, + "step": 20716 + }, + { + "epoch": 0.3846310935196771, + "grad_norm": 0.3717849850654602, + "learning_rate": 1.3545461570044767e-05, + "loss": 0.1314, + "step": 20718 + }, + { + "epoch": 0.38466822365709574, + "grad_norm": 0.37344080209732056, + "learning_rate": 1.3544370832229627e-05, + "loss": 0.296, + "step": 20720 + }, + { + "epoch": 0.38470535379451437, + "grad_norm": 0.6680738925933838, + "learning_rate": 1.3543280046186218e-05, + "loss": 0.3636, + "step": 20722 + }, + { + "epoch": 0.38474248393193305, + "grad_norm": 0.35376036167144775, + "learning_rate": 1.354218921192938e-05, + "loss": 0.492, + "step": 20724 + }, + { + "epoch": 0.3847796140693517, + "grad_norm": 0.45670193433761597, + "learning_rate": 1.3541098329473959e-05, + "loss": 0.0976, + "step": 20726 + }, + { + "epoch": 0.3848167442067703, + "grad_norm": 0.36153843998908997, + "learning_rate": 1.354000739883479e-05, + "loss": 0.2532, + "step": 20728 + }, + { + "epoch": 0.38485387434418894, + "grad_norm": 0.21934334933757782, + "learning_rate": 1.3538916420026729e-05, + "loss": 0.294, + "step": 20730 + }, + { + "epoch": 0.38489100448160757, + "grad_norm": 0.48715871572494507, + "learning_rate": 1.3537825393064617e-05, + "loss": 0.6024, + "step": 20732 + }, + { + "epoch": 0.38492813461902625, + "grad_norm": 0.39648228883743286, + "learning_rate": 1.3536734317963295e-05, + "loss": 0.1849, + "step": 20734 + }, + { + "epoch": 0.3849652647564449, + "grad_norm": 0.3398877680301666, + "learning_rate": 1.3535643194737615e-05, + "loss": 0.4288, + "step": 20736 + }, + { + "epoch": 0.3850023948938635, + "grad_norm": 0.44927507638931274, + "learning_rate": 1.3534552023402419e-05, + "loss": 0.3296, + "step": 20738 + }, + { + "epoch": 0.38503952503128214, + "grad_norm": 0.3417001962661743, + "learning_rate": 1.353346080397256e-05, + "loss": 0.3033, + "step": 20740 + }, + { + "epoch": 0.38507665516870077, + "grad_norm": 0.49154606461524963, + "learning_rate": 1.3532369536462882e-05, + "loss": 0.2953, + "step": 20742 + }, + { + "epoch": 0.3851137853061194, + "grad_norm": 0.6155595183372498, + "learning_rate": 1.3531278220888236e-05, + "loss": 0.2304, + "step": 20744 + }, + { + "epoch": 0.3851509154435381, + "grad_norm": 0.566434919834137, + "learning_rate": 1.3530186857263472e-05, + "loss": 0.296, + "step": 20746 + }, + { + "epoch": 0.3851880455809567, + "grad_norm": 0.42195311188697815, + "learning_rate": 1.3529095445603436e-05, + "loss": 0.3149, + "step": 20748 + }, + { + "epoch": 0.38522517571837533, + "grad_norm": 0.41019490361213684, + "learning_rate": 1.3528003985922982e-05, + "loss": 0.2655, + "step": 20750 + }, + { + "epoch": 0.38526230585579396, + "grad_norm": 0.6509824991226196, + "learning_rate": 1.3526912478236964e-05, + "loss": 0.3241, + "step": 20752 + }, + { + "epoch": 0.3852994359932126, + "grad_norm": 0.4650309085845947, + "learning_rate": 1.3525820922560229e-05, + "loss": 0.2569, + "step": 20754 + }, + { + "epoch": 0.3853365661306313, + "grad_norm": 0.3566668629646301, + "learning_rate": 1.3524729318907634e-05, + "loss": 0.23, + "step": 20756 + }, + { + "epoch": 0.3853736962680499, + "grad_norm": 0.29631632566452026, + "learning_rate": 1.3523637667294031e-05, + "loss": 0.4055, + "step": 20758 + }, + { + "epoch": 0.38541082640546853, + "grad_norm": 0.3497377634048462, + "learning_rate": 1.3522545967734273e-05, + "loss": 0.2774, + "step": 20760 + }, + { + "epoch": 0.38544795654288716, + "grad_norm": 0.3390210270881653, + "learning_rate": 1.3521454220243213e-05, + "loss": 0.277, + "step": 20762 + }, + { + "epoch": 0.3854850866803058, + "grad_norm": 0.41099813580513, + "learning_rate": 1.3520362424835713e-05, + "loss": 0.2541, + "step": 20764 + }, + { + "epoch": 0.3855222168177244, + "grad_norm": 0.4419342577457428, + "learning_rate": 1.3519270581526622e-05, + "loss": 0.115, + "step": 20766 + }, + { + "epoch": 0.3855593469551431, + "grad_norm": 0.40069812536239624, + "learning_rate": 1.3518178690330804e-05, + "loss": 0.362, + "step": 20768 + }, + { + "epoch": 0.38559647709256173, + "grad_norm": 0.4005439877510071, + "learning_rate": 1.3517086751263111e-05, + "loss": 0.3497, + "step": 20770 + }, + { + "epoch": 0.38563360722998036, + "grad_norm": 0.4062248170375824, + "learning_rate": 1.3515994764338399e-05, + "loss": 0.2436, + "step": 20772 + }, + { + "epoch": 0.385670737367399, + "grad_norm": 0.4341149926185608, + "learning_rate": 1.3514902729571531e-05, + "loss": 0.4543, + "step": 20774 + }, + { + "epoch": 0.3857078675048176, + "grad_norm": 0.3939974308013916, + "learning_rate": 1.3513810646977368e-05, + "loss": 0.2545, + "step": 20776 + }, + { + "epoch": 0.3857449976422363, + "grad_norm": 0.5217578411102295, + "learning_rate": 1.3512718516570766e-05, + "loss": 0.2443, + "step": 20778 + }, + { + "epoch": 0.3857821277796549, + "grad_norm": 0.2592640817165375, + "learning_rate": 1.3511626338366583e-05, + "loss": 0.0874, + "step": 20780 + }, + { + "epoch": 0.38581925791707355, + "grad_norm": 0.3430674970149994, + "learning_rate": 1.3510534112379688e-05, + "loss": 0.3298, + "step": 20782 + }, + { + "epoch": 0.3858563880544922, + "grad_norm": 0.35523903369903564, + "learning_rate": 1.350944183862494e-05, + "loss": 0.2056, + "step": 20784 + }, + { + "epoch": 0.3858935181919108, + "grad_norm": 0.44332513213157654, + "learning_rate": 1.3508349517117197e-05, + "loss": 0.1999, + "step": 20786 + }, + { + "epoch": 0.38593064832932944, + "grad_norm": 0.4697682857513428, + "learning_rate": 1.3507257147871328e-05, + "loss": 0.6506, + "step": 20788 + }, + { + "epoch": 0.3859677784667481, + "grad_norm": 0.50628262758255, + "learning_rate": 1.3506164730902195e-05, + "loss": 0.2437, + "step": 20790 + }, + { + "epoch": 0.38600490860416675, + "grad_norm": 0.4779333174228668, + "learning_rate": 1.350507226622466e-05, + "loss": 0.1921, + "step": 20792 + }, + { + "epoch": 0.3860420387415854, + "grad_norm": 0.2679564356803894, + "learning_rate": 1.3503979753853591e-05, + "loss": 0.5024, + "step": 20794 + }, + { + "epoch": 0.386079168879004, + "grad_norm": 0.5327733755111694, + "learning_rate": 1.3502887193803856e-05, + "loss": 0.3846, + "step": 20796 + }, + { + "epoch": 0.38611629901642264, + "grad_norm": 0.3848426342010498, + "learning_rate": 1.3501794586090313e-05, + "loss": 0.2346, + "step": 20798 + }, + { + "epoch": 0.3861534291538413, + "grad_norm": 0.47144296765327454, + "learning_rate": 1.350070193072784e-05, + "loss": 0.3231, + "step": 20800 + }, + { + "epoch": 0.38619055929125995, + "grad_norm": 0.31437861919403076, + "learning_rate": 1.3499609227731298e-05, + "loss": 0.263, + "step": 20802 + }, + { + "epoch": 0.3862276894286786, + "grad_norm": 0.3516876697540283, + "learning_rate": 1.3498516477115558e-05, + "loss": 0.3418, + "step": 20804 + }, + { + "epoch": 0.3862648195660972, + "grad_norm": 0.31398946046829224, + "learning_rate": 1.3497423678895484e-05, + "loss": 0.241, + "step": 20806 + }, + { + "epoch": 0.38630194970351583, + "grad_norm": 0.43738794326782227, + "learning_rate": 1.3496330833085955e-05, + "loss": 0.2829, + "step": 20808 + }, + { + "epoch": 0.3863390798409345, + "grad_norm": 0.47186732292175293, + "learning_rate": 1.3495237939701833e-05, + "loss": 0.2678, + "step": 20810 + }, + { + "epoch": 0.38637620997835315, + "grad_norm": 0.40803611278533936, + "learning_rate": 1.3494144998757992e-05, + "loss": 0.3319, + "step": 20812 + }, + { + "epoch": 0.3864133401157718, + "grad_norm": 0.32951849699020386, + "learning_rate": 1.3493052010269305e-05, + "loss": 0.2858, + "step": 20814 + }, + { + "epoch": 0.3864504702531904, + "grad_norm": 0.6528772711753845, + "learning_rate": 1.3491958974250642e-05, + "loss": 0.2179, + "step": 20816 + }, + { + "epoch": 0.38648760039060903, + "grad_norm": 0.3366418480873108, + "learning_rate": 1.3490865890716878e-05, + "loss": 0.3621, + "step": 20818 + }, + { + "epoch": 0.38652473052802766, + "grad_norm": 0.33307650685310364, + "learning_rate": 1.3489772759682884e-05, + "loss": 0.2444, + "step": 20820 + }, + { + "epoch": 0.38656186066544634, + "grad_norm": 0.5108956098556519, + "learning_rate": 1.3488679581163538e-05, + "loss": 0.2383, + "step": 20822 + }, + { + "epoch": 0.386598990802865, + "grad_norm": 0.2978900969028473, + "learning_rate": 1.3487586355173711e-05, + "loss": 0.4003, + "step": 20824 + }, + { + "epoch": 0.3866361209402836, + "grad_norm": 0.3805393576622009, + "learning_rate": 1.348649308172828e-05, + "loss": 0.3751, + "step": 20826 + }, + { + "epoch": 0.38667325107770223, + "grad_norm": 0.42926251888275146, + "learning_rate": 1.3485399760842124e-05, + "loss": 0.1912, + "step": 20828 + }, + { + "epoch": 0.38671038121512086, + "grad_norm": 0.40642642974853516, + "learning_rate": 1.3484306392530115e-05, + "loss": 0.2548, + "step": 20830 + }, + { + "epoch": 0.38674751135253954, + "grad_norm": 0.4774445593357086, + "learning_rate": 1.3483212976807134e-05, + "loss": 0.239, + "step": 20832 + }, + { + "epoch": 0.38678464148995817, + "grad_norm": 0.27411189675331116, + "learning_rate": 1.3482119513688057e-05, + "loss": 0.3896, + "step": 20834 + }, + { + "epoch": 0.3868217716273768, + "grad_norm": 0.4208177924156189, + "learning_rate": 1.3481026003187762e-05, + "loss": 0.2264, + "step": 20836 + }, + { + "epoch": 0.3868589017647954, + "grad_norm": 0.3298663794994354, + "learning_rate": 1.3479932445321132e-05, + "loss": 0.4131, + "step": 20838 + }, + { + "epoch": 0.38689603190221405, + "grad_norm": 0.2186870574951172, + "learning_rate": 1.3478838840103045e-05, + "loss": 0.3937, + "step": 20840 + }, + { + "epoch": 0.3869331620396327, + "grad_norm": 0.354340523481369, + "learning_rate": 1.3477745187548381e-05, + "loss": 0.152, + "step": 20842 + }, + { + "epoch": 0.38697029217705137, + "grad_norm": 0.35978081822395325, + "learning_rate": 1.3476651487672023e-05, + "loss": 0.6011, + "step": 20844 + }, + { + "epoch": 0.38700742231447, + "grad_norm": 0.4790162742137909, + "learning_rate": 1.3475557740488851e-05, + "loss": 0.4519, + "step": 20846 + }, + { + "epoch": 0.3870445524518886, + "grad_norm": 0.32861942052841187, + "learning_rate": 1.347446394601375e-05, + "loss": 0.2626, + "step": 20848 + }, + { + "epoch": 0.38708168258930725, + "grad_norm": 0.3664741814136505, + "learning_rate": 1.3473370104261599e-05, + "loss": 0.257, + "step": 20850 + }, + { + "epoch": 0.3871188127267259, + "grad_norm": 0.3495136499404907, + "learning_rate": 1.3472276215247287e-05, + "loss": 0.3904, + "step": 20852 + }, + { + "epoch": 0.38715594286414456, + "grad_norm": 0.22909832000732422, + "learning_rate": 1.3471182278985697e-05, + "loss": 0.3477, + "step": 20854 + }, + { + "epoch": 0.3871930730015632, + "grad_norm": 0.42756387591362, + "learning_rate": 1.3470088295491712e-05, + "loss": 0.1728, + "step": 20856 + }, + { + "epoch": 0.3872302031389818, + "grad_norm": 0.2829986810684204, + "learning_rate": 1.346899426478022e-05, + "loss": 0.2708, + "step": 20858 + }, + { + "epoch": 0.38726733327640045, + "grad_norm": 0.543093740940094, + "learning_rate": 1.3467900186866107e-05, + "loss": 0.2961, + "step": 20860 + }, + { + "epoch": 0.3873044634138191, + "grad_norm": 0.36149317026138306, + "learning_rate": 1.3466806061764261e-05, + "loss": 0.2172, + "step": 20862 + }, + { + "epoch": 0.3873415935512377, + "grad_norm": 0.3609108328819275, + "learning_rate": 1.3465711889489566e-05, + "loss": 0.3758, + "step": 20864 + }, + { + "epoch": 0.3873787236886564, + "grad_norm": 0.4260145425796509, + "learning_rate": 1.3464617670056917e-05, + "loss": 0.3615, + "step": 20866 + }, + { + "epoch": 0.387415853826075, + "grad_norm": 0.36468422412872314, + "learning_rate": 1.3463523403481195e-05, + "loss": 0.4057, + "step": 20868 + }, + { + "epoch": 0.38745298396349365, + "grad_norm": 0.3786851763725281, + "learning_rate": 1.3462429089777296e-05, + "loss": 0.2455, + "step": 20870 + }, + { + "epoch": 0.3874901141009123, + "grad_norm": 0.2598850727081299, + "learning_rate": 1.346133472896011e-05, + "loss": 0.3806, + "step": 20872 + }, + { + "epoch": 0.3875272442383309, + "grad_norm": 0.5214970111846924, + "learning_rate": 1.3460240321044525e-05, + "loss": 0.3802, + "step": 20874 + }, + { + "epoch": 0.3875643743757496, + "grad_norm": 0.3710053563117981, + "learning_rate": 1.3459145866045433e-05, + "loss": 0.414, + "step": 20876 + }, + { + "epoch": 0.3876015045131682, + "grad_norm": 0.4426306486129761, + "learning_rate": 1.3458051363977728e-05, + "loss": 0.2758, + "step": 20878 + }, + { + "epoch": 0.38763863465058684, + "grad_norm": 0.2110464721918106, + "learning_rate": 1.3456956814856302e-05, + "loss": 0.2671, + "step": 20880 + }, + { + "epoch": 0.3876757647880055, + "grad_norm": 0.3493625521659851, + "learning_rate": 1.3455862218696045e-05, + "loss": 0.2062, + "step": 20882 + }, + { + "epoch": 0.3877128949254241, + "grad_norm": 0.5118810534477234, + "learning_rate": 1.3454767575511862e-05, + "loss": 0.2884, + "step": 20884 + }, + { + "epoch": 0.3877500250628428, + "grad_norm": 0.5153297185897827, + "learning_rate": 1.3453672885318636e-05, + "loss": 0.3604, + "step": 20886 + }, + { + "epoch": 0.3877871552002614, + "grad_norm": 0.3881579637527466, + "learning_rate": 1.3452578148131265e-05, + "loss": 0.3028, + "step": 20888 + }, + { + "epoch": 0.38782428533768004, + "grad_norm": 0.25400811433792114, + "learning_rate": 1.345148336396465e-05, + "loss": 0.2939, + "step": 20890 + }, + { + "epoch": 0.38786141547509867, + "grad_norm": 1.0255693197250366, + "learning_rate": 1.3450388532833685e-05, + "loss": 0.3004, + "step": 20892 + }, + { + "epoch": 0.3878985456125173, + "grad_norm": 0.24599038064479828, + "learning_rate": 1.3449293654753266e-05, + "loss": 0.2602, + "step": 20894 + }, + { + "epoch": 0.3879356757499359, + "grad_norm": 0.39052486419677734, + "learning_rate": 1.3448198729738295e-05, + "loss": 0.185, + "step": 20896 + }, + { + "epoch": 0.3879728058873546, + "grad_norm": 0.29600226879119873, + "learning_rate": 1.3447103757803666e-05, + "loss": 0.2882, + "step": 20898 + }, + { + "epoch": 0.38800993602477324, + "grad_norm": 0.527108371257782, + "learning_rate": 1.3446008738964276e-05, + "loss": 0.2831, + "step": 20900 + }, + { + "epoch": 0.38804706616219187, + "grad_norm": 0.36527591943740845, + "learning_rate": 1.3444913673235034e-05, + "loss": 0.1595, + "step": 20902 + }, + { + "epoch": 0.3880841962996105, + "grad_norm": 0.274984747171402, + "learning_rate": 1.3443818560630834e-05, + "loss": 0.3654, + "step": 20904 + }, + { + "epoch": 0.3881213264370291, + "grad_norm": 0.27965012192726135, + "learning_rate": 1.344272340116658e-05, + "loss": 0.2232, + "step": 20906 + }, + { + "epoch": 0.3881584565744478, + "grad_norm": 0.6494016051292419, + "learning_rate": 1.344162819485717e-05, + "loss": 0.5417, + "step": 20908 + }, + { + "epoch": 0.38819558671186644, + "grad_norm": 0.4156023859977722, + "learning_rate": 1.3440532941717513e-05, + "loss": 0.4697, + "step": 20910 + }, + { + "epoch": 0.38823271684928506, + "grad_norm": 0.46189770102500916, + "learning_rate": 1.3439437641762505e-05, + "loss": 0.3199, + "step": 20912 + }, + { + "epoch": 0.3882698469867037, + "grad_norm": 0.5394167304039001, + "learning_rate": 1.3438342295007054e-05, + "loss": 0.2194, + "step": 20914 + }, + { + "epoch": 0.3883069771241223, + "grad_norm": 0.37195292115211487, + "learning_rate": 1.3437246901466066e-05, + "loss": 0.4286, + "step": 20916 + }, + { + "epoch": 0.38834410726154095, + "grad_norm": 0.31552475690841675, + "learning_rate": 1.3436151461154441e-05, + "loss": 0.1736, + "step": 20918 + }, + { + "epoch": 0.38838123739895963, + "grad_norm": 0.5918641090393066, + "learning_rate": 1.3435055974087083e-05, + "loss": 0.2236, + "step": 20920 + }, + { + "epoch": 0.38841836753637826, + "grad_norm": 0.3376547694206238, + "learning_rate": 1.3433960440278908e-05, + "loss": 0.2717, + "step": 20922 + }, + { + "epoch": 0.3884554976737969, + "grad_norm": 0.35671812295913696, + "learning_rate": 1.3432864859744816e-05, + "loss": 0.4821, + "step": 20924 + }, + { + "epoch": 0.3884926278112155, + "grad_norm": 0.40426918864250183, + "learning_rate": 1.3431769232499716e-05, + "loss": 0.2692, + "step": 20926 + }, + { + "epoch": 0.38852975794863415, + "grad_norm": 0.45418408513069153, + "learning_rate": 1.3430673558558516e-05, + "loss": 0.4624, + "step": 20928 + }, + { + "epoch": 0.38856688808605283, + "grad_norm": 0.3226243257522583, + "learning_rate": 1.3429577837936125e-05, + "loss": 0.297, + "step": 20930 + }, + { + "epoch": 0.38860401822347146, + "grad_norm": 0.4090017080307007, + "learning_rate": 1.342848207064745e-05, + "loss": 0.1396, + "step": 20932 + }, + { + "epoch": 0.3886411483608901, + "grad_norm": 0.46387729048728943, + "learning_rate": 1.3427386256707407e-05, + "loss": 0.3157, + "step": 20934 + }, + { + "epoch": 0.3886782784983087, + "grad_norm": 0.49868059158325195, + "learning_rate": 1.34262903961309e-05, + "loss": 0.3909, + "step": 20936 + }, + { + "epoch": 0.38871540863572734, + "grad_norm": 0.33697232604026794, + "learning_rate": 1.3425194488932847e-05, + "loss": 0.3319, + "step": 20938 + }, + { + "epoch": 0.388752538773146, + "grad_norm": 0.4747236669063568, + "learning_rate": 1.3424098535128155e-05, + "loss": 0.25, + "step": 20940 + }, + { + "epoch": 0.38878966891056466, + "grad_norm": 0.2780747711658478, + "learning_rate": 1.342300253473174e-05, + "loss": 0.2726, + "step": 20942 + }, + { + "epoch": 0.3888267990479833, + "grad_norm": 0.36898523569107056, + "learning_rate": 1.342190648775851e-05, + "loss": 0.1427, + "step": 20944 + }, + { + "epoch": 0.3888639291854019, + "grad_norm": 0.330413818359375, + "learning_rate": 1.3420810394223385e-05, + "loss": 0.2066, + "step": 20946 + }, + { + "epoch": 0.38890105932282054, + "grad_norm": 0.3008469045162201, + "learning_rate": 1.3419714254141278e-05, + "loss": 0.439, + "step": 20948 + }, + { + "epoch": 0.38893818946023917, + "grad_norm": 0.4299360513687134, + "learning_rate": 1.3418618067527101e-05, + "loss": 0.3182, + "step": 20950 + }, + { + "epoch": 0.38897531959765785, + "grad_norm": 0.31372538208961487, + "learning_rate": 1.341752183439577e-05, + "loss": 0.1605, + "step": 20952 + }, + { + "epoch": 0.3890124497350765, + "grad_norm": 0.3003089427947998, + "learning_rate": 1.3416425554762209e-05, + "loss": 0.4386, + "step": 20954 + }, + { + "epoch": 0.3890495798724951, + "grad_norm": 0.39285245537757874, + "learning_rate": 1.3415329228641326e-05, + "loss": 0.4065, + "step": 20956 + }, + { + "epoch": 0.38908671000991374, + "grad_norm": 0.5915917158126831, + "learning_rate": 1.3414232856048045e-05, + "loss": 0.2302, + "step": 20958 + }, + { + "epoch": 0.38912384014733237, + "grad_norm": 0.2580043375492096, + "learning_rate": 1.3413136436997278e-05, + "loss": 0.3308, + "step": 20960 + }, + { + "epoch": 0.38916097028475105, + "grad_norm": 0.30734193325042725, + "learning_rate": 1.3412039971503949e-05, + "loss": 0.1003, + "step": 20962 + }, + { + "epoch": 0.3891981004221697, + "grad_norm": 0.3624153137207031, + "learning_rate": 1.3410943459582977e-05, + "loss": 0.2541, + "step": 20964 + }, + { + "epoch": 0.3892352305595883, + "grad_norm": 0.444143146276474, + "learning_rate": 1.3409846901249282e-05, + "loss": 0.2639, + "step": 20966 + }, + { + "epoch": 0.38927236069700694, + "grad_norm": 0.4695796072483063, + "learning_rate": 1.3408750296517786e-05, + "loss": 0.4196, + "step": 20968 + }, + { + "epoch": 0.38930949083442556, + "grad_norm": 0.48529860377311707, + "learning_rate": 1.3407653645403405e-05, + "loss": 0.2694, + "step": 20970 + }, + { + "epoch": 0.3893466209718442, + "grad_norm": 0.3905702233314514, + "learning_rate": 1.3406556947921066e-05, + "loss": 0.3957, + "step": 20972 + }, + { + "epoch": 0.3893837511092629, + "grad_norm": 0.4604209065437317, + "learning_rate": 1.340546020408569e-05, + "loss": 0.3678, + "step": 20974 + }, + { + "epoch": 0.3894208812466815, + "grad_norm": 0.5386272072792053, + "learning_rate": 1.34043634139122e-05, + "loss": 0.3906, + "step": 20976 + }, + { + "epoch": 0.38945801138410013, + "grad_norm": 0.4299006462097168, + "learning_rate": 1.3403266577415526e-05, + "loss": 0.2727, + "step": 20978 + }, + { + "epoch": 0.38949514152151876, + "grad_norm": 0.3433120846748352, + "learning_rate": 1.3402169694610585e-05, + "loss": 0.2322, + "step": 20980 + }, + { + "epoch": 0.3895322716589374, + "grad_norm": 0.316511332988739, + "learning_rate": 1.3401072765512304e-05, + "loss": 0.3579, + "step": 20982 + }, + { + "epoch": 0.3895694017963561, + "grad_norm": 0.3686107099056244, + "learning_rate": 1.3399975790135613e-05, + "loss": 0.3703, + "step": 20984 + }, + { + "epoch": 0.3896065319337747, + "grad_norm": 0.3704521358013153, + "learning_rate": 1.3398878768495434e-05, + "loss": 0.2759, + "step": 20986 + }, + { + "epoch": 0.38964366207119333, + "grad_norm": 0.5123729109764099, + "learning_rate": 1.3397781700606694e-05, + "loss": 0.2449, + "step": 20988 + }, + { + "epoch": 0.38968079220861196, + "grad_norm": 0.41569241881370544, + "learning_rate": 1.3396684586484325e-05, + "loss": 0.3517, + "step": 20990 + }, + { + "epoch": 0.3897179223460306, + "grad_norm": 0.28489747643470764, + "learning_rate": 1.3395587426143255e-05, + "loss": 0.2565, + "step": 20992 + }, + { + "epoch": 0.3897550524834492, + "grad_norm": 0.6491680145263672, + "learning_rate": 1.3394490219598406e-05, + "loss": 0.3088, + "step": 20994 + }, + { + "epoch": 0.3897921826208679, + "grad_norm": 0.41795191168785095, + "learning_rate": 1.3393392966864714e-05, + "loss": 0.3171, + "step": 20996 + }, + { + "epoch": 0.38982931275828653, + "grad_norm": 0.9456215500831604, + "learning_rate": 1.3392295667957111e-05, + "loss": 0.2891, + "step": 20998 + }, + { + "epoch": 0.38986644289570516, + "grad_norm": 0.28814780712127686, + "learning_rate": 1.3391198322890524e-05, + "loss": 0.2366, + "step": 21000 + }, + { + "epoch": 0.3899035730331238, + "grad_norm": 0.4209561347961426, + "learning_rate": 1.3390100931679886e-05, + "loss": 0.3493, + "step": 21002 + }, + { + "epoch": 0.3899407031705424, + "grad_norm": 0.2765824794769287, + "learning_rate": 1.3389003494340127e-05, + "loss": 0.2164, + "step": 21004 + }, + { + "epoch": 0.3899778333079611, + "grad_norm": 0.4336777627468109, + "learning_rate": 1.3387906010886182e-05, + "loss": 0.5543, + "step": 21006 + }, + { + "epoch": 0.3900149634453797, + "grad_norm": 0.522078812122345, + "learning_rate": 1.3386808481332985e-05, + "loss": 0.4384, + "step": 21008 + }, + { + "epoch": 0.39005209358279835, + "grad_norm": 0.3964535593986511, + "learning_rate": 1.338571090569547e-05, + "loss": 0.1105, + "step": 21010 + }, + { + "epoch": 0.390089223720217, + "grad_norm": 0.5796847939491272, + "learning_rate": 1.338461328398857e-05, + "loss": 0.2128, + "step": 21012 + }, + { + "epoch": 0.3901263538576356, + "grad_norm": 0.3732931613922119, + "learning_rate": 1.338351561622722e-05, + "loss": 0.2689, + "step": 21014 + }, + { + "epoch": 0.39016348399505424, + "grad_norm": 0.44324392080307007, + "learning_rate": 1.3382417902426358e-05, + "loss": 0.4351, + "step": 21016 + }, + { + "epoch": 0.3902006141324729, + "grad_norm": 0.29921138286590576, + "learning_rate": 1.3381320142600922e-05, + "loss": 0.3281, + "step": 21018 + }, + { + "epoch": 0.39023774426989155, + "grad_norm": 0.2649189233779907, + "learning_rate": 1.3380222336765844e-05, + "loss": 0.2626, + "step": 21020 + }, + { + "epoch": 0.3902748744073102, + "grad_norm": 0.3623538017272949, + "learning_rate": 1.337912448493607e-05, + "loss": 0.288, + "step": 21022 + }, + { + "epoch": 0.3903120045447288, + "grad_norm": 0.3688085675239563, + "learning_rate": 1.337802658712653e-05, + "loss": 0.2343, + "step": 21024 + }, + { + "epoch": 0.39034913468214744, + "grad_norm": 0.4979703724384308, + "learning_rate": 1.3376928643352165e-05, + "loss": 0.2031, + "step": 21026 + }, + { + "epoch": 0.3903862648195661, + "grad_norm": 0.34725460410118103, + "learning_rate": 1.3375830653627914e-05, + "loss": 0.2158, + "step": 21028 + }, + { + "epoch": 0.39042339495698475, + "grad_norm": 0.48986560106277466, + "learning_rate": 1.3374732617968725e-05, + "loss": 0.2763, + "step": 21030 + }, + { + "epoch": 0.3904605250944034, + "grad_norm": 0.25171974301338196, + "learning_rate": 1.3373634536389533e-05, + "loss": 0.4196, + "step": 21032 + }, + { + "epoch": 0.390497655231822, + "grad_norm": 0.33803626894950867, + "learning_rate": 1.3372536408905279e-05, + "loss": 0.257, + "step": 21034 + }, + { + "epoch": 0.39053478536924063, + "grad_norm": 0.31209734082221985, + "learning_rate": 1.3371438235530907e-05, + "loss": 0.2631, + "step": 21036 + }, + { + "epoch": 0.3905719155066593, + "grad_norm": 0.39501842856407166, + "learning_rate": 1.3370340016281356e-05, + "loss": 0.3451, + "step": 21038 + }, + { + "epoch": 0.39060904564407795, + "grad_norm": 0.2956770956516266, + "learning_rate": 1.3369241751171575e-05, + "loss": 0.4203, + "step": 21040 + }, + { + "epoch": 0.3906461757814966, + "grad_norm": 0.2859567403793335, + "learning_rate": 1.3368143440216508e-05, + "loss": 0.2804, + "step": 21042 + }, + { + "epoch": 0.3906833059189152, + "grad_norm": 0.40659254789352417, + "learning_rate": 1.3367045083431096e-05, + "loss": 0.3244, + "step": 21044 + }, + { + "epoch": 0.39072043605633383, + "grad_norm": 0.4834231734275818, + "learning_rate": 1.3365946680830285e-05, + "loss": 0.2023, + "step": 21046 + }, + { + "epoch": 0.39075756619375246, + "grad_norm": 0.4338100552558899, + "learning_rate": 1.3364848232429025e-05, + "loss": 0.2128, + "step": 21048 + }, + { + "epoch": 0.39079469633117114, + "grad_norm": 0.36406126618385315, + "learning_rate": 1.3363749738242253e-05, + "loss": 0.1364, + "step": 21050 + }, + { + "epoch": 0.39083182646858977, + "grad_norm": 0.3659365773200989, + "learning_rate": 1.3362651198284929e-05, + "loss": 0.3412, + "step": 21052 + }, + { + "epoch": 0.3908689566060084, + "grad_norm": 0.3697330355644226, + "learning_rate": 1.3361552612571993e-05, + "loss": 0.1963, + "step": 21054 + }, + { + "epoch": 0.39090608674342703, + "grad_norm": 0.3940783739089966, + "learning_rate": 1.3360453981118392e-05, + "loss": 0.4194, + "step": 21056 + }, + { + "epoch": 0.39094321688084566, + "grad_norm": 0.34696316719055176, + "learning_rate": 1.3359355303939079e-05, + "loss": 0.2441, + "step": 21058 + }, + { + "epoch": 0.39098034701826434, + "grad_norm": 0.34688594937324524, + "learning_rate": 1.3358256581049005e-05, + "loss": 0.3986, + "step": 21060 + }, + { + "epoch": 0.39101747715568297, + "grad_norm": 0.29138773679733276, + "learning_rate": 1.3357157812463116e-05, + "loss": 0.1161, + "step": 21062 + }, + { + "epoch": 0.3910546072931016, + "grad_norm": 0.3353050947189331, + "learning_rate": 1.3356058998196366e-05, + "loss": 0.2321, + "step": 21064 + }, + { + "epoch": 0.3910917374305202, + "grad_norm": 0.5530006885528564, + "learning_rate": 1.3354960138263706e-05, + "loss": 0.284, + "step": 21066 + }, + { + "epoch": 0.39112886756793885, + "grad_norm": 0.5759733319282532, + "learning_rate": 1.3353861232680085e-05, + "loss": 0.2761, + "step": 21068 + }, + { + "epoch": 0.3911659977053575, + "grad_norm": 0.4651113450527191, + "learning_rate": 1.335276228146046e-05, + "loss": 0.1315, + "step": 21070 + }, + { + "epoch": 0.39120312784277617, + "grad_norm": 0.45784854888916016, + "learning_rate": 1.3351663284619784e-05, + "loss": 0.5345, + "step": 21072 + }, + { + "epoch": 0.3912402579801948, + "grad_norm": 0.6664987206459045, + "learning_rate": 1.3350564242173012e-05, + "loss": 0.4362, + "step": 21074 + }, + { + "epoch": 0.3912773881176134, + "grad_norm": 0.3597731590270996, + "learning_rate": 1.3349465154135094e-05, + "loss": 0.1609, + "step": 21076 + }, + { + "epoch": 0.39131451825503205, + "grad_norm": 0.3273705840110779, + "learning_rate": 1.3348366020520989e-05, + "loss": 0.4168, + "step": 21078 + }, + { + "epoch": 0.3913516483924507, + "grad_norm": 0.3557974696159363, + "learning_rate": 1.3347266841345652e-05, + "loss": 0.2151, + "step": 21080 + }, + { + "epoch": 0.39138877852986936, + "grad_norm": 0.21370138227939606, + "learning_rate": 1.334616761662404e-05, + "loss": 0.1439, + "step": 21082 + }, + { + "epoch": 0.391425908667288, + "grad_norm": 0.338091105222702, + "learning_rate": 1.3345068346371108e-05, + "loss": 0.3926, + "step": 21084 + }, + { + "epoch": 0.3914630388047066, + "grad_norm": 0.42032161355018616, + "learning_rate": 1.3343969030601818e-05, + "loss": 0.13, + "step": 21086 + }, + { + "epoch": 0.39150016894212525, + "grad_norm": 0.23689033091068268, + "learning_rate": 1.3342869669331123e-05, + "loss": 0.4157, + "step": 21088 + }, + { + "epoch": 0.3915372990795439, + "grad_norm": 0.4776283800601959, + "learning_rate": 1.3341770262573986e-05, + "loss": 0.3817, + "step": 21090 + }, + { + "epoch": 0.3915744292169625, + "grad_norm": 0.5193130373954773, + "learning_rate": 1.3340670810345366e-05, + "loss": 0.3639, + "step": 21092 + }, + { + "epoch": 0.3916115593543812, + "grad_norm": 0.4199797511100769, + "learning_rate": 1.333957131266022e-05, + "loss": 0.354, + "step": 21094 + }, + { + "epoch": 0.3916486894917998, + "grad_norm": 0.2748834788799286, + "learning_rate": 1.3338471769533517e-05, + "loss": 0.1861, + "step": 21096 + }, + { + "epoch": 0.39168581962921845, + "grad_norm": 0.517143726348877, + "learning_rate": 1.3337372180980208e-05, + "loss": 0.304, + "step": 21098 + }, + { + "epoch": 0.3917229497666371, + "grad_norm": 0.40330085158348083, + "learning_rate": 1.3336272547015263e-05, + "loss": 0.3512, + "step": 21100 + }, + { + "epoch": 0.3917600799040557, + "grad_norm": 0.36303767561912537, + "learning_rate": 1.3335172867653642e-05, + "loss": 0.38, + "step": 21102 + }, + { + "epoch": 0.3917972100414744, + "grad_norm": 0.38747358322143555, + "learning_rate": 1.3334073142910307e-05, + "loss": 0.3439, + "step": 21104 + }, + { + "epoch": 0.391834340178893, + "grad_norm": 0.43079546093940735, + "learning_rate": 1.3332973372800224e-05, + "loss": 0.3644, + "step": 21106 + }, + { + "epoch": 0.39187147031631164, + "grad_norm": 0.4817890524864197, + "learning_rate": 1.3331873557338355e-05, + "loss": 0.3224, + "step": 21108 + }, + { + "epoch": 0.39190860045373027, + "grad_norm": 0.2721439301967621, + "learning_rate": 1.3330773696539669e-05, + "loss": 0.3213, + "step": 21110 + }, + { + "epoch": 0.3919457305911489, + "grad_norm": 0.47233209013938904, + "learning_rate": 1.3329673790419128e-05, + "loss": 0.2544, + "step": 21112 + }, + { + "epoch": 0.3919828607285676, + "grad_norm": 0.5644276142120361, + "learning_rate": 1.33285738389917e-05, + "loss": 0.386, + "step": 21114 + }, + { + "epoch": 0.3920199908659862, + "grad_norm": 0.36235418915748596, + "learning_rate": 1.3327473842272356e-05, + "loss": 0.4345, + "step": 21116 + }, + { + "epoch": 0.39205712100340484, + "grad_norm": 0.4895215630531311, + "learning_rate": 1.3326373800276057e-05, + "loss": 0.3474, + "step": 21118 + }, + { + "epoch": 0.39209425114082347, + "grad_norm": 0.32556769251823425, + "learning_rate": 1.3325273713017775e-05, + "loss": 0.2586, + "step": 21120 + }, + { + "epoch": 0.3921313812782421, + "grad_norm": 0.4315909743309021, + "learning_rate": 1.3324173580512478e-05, + "loss": 0.2239, + "step": 21122 + }, + { + "epoch": 0.3921685114156607, + "grad_norm": 0.530937671661377, + "learning_rate": 1.3323073402775135e-05, + "loss": 0.2066, + "step": 21124 + }, + { + "epoch": 0.3922056415530794, + "grad_norm": 0.4639502763748169, + "learning_rate": 1.3321973179820717e-05, + "loss": 0.2685, + "step": 21126 + }, + { + "epoch": 0.39224277169049804, + "grad_norm": 0.24358013272285461, + "learning_rate": 1.3320872911664195e-05, + "loss": 0.2083, + "step": 21128 + }, + { + "epoch": 0.39227990182791667, + "grad_norm": 0.3938358426094055, + "learning_rate": 1.3319772598320543e-05, + "loss": 0.2416, + "step": 21130 + }, + { + "epoch": 0.3923170319653353, + "grad_norm": 0.37304943799972534, + "learning_rate": 1.3318672239804724e-05, + "loss": 0.1616, + "step": 21132 + }, + { + "epoch": 0.3923541621027539, + "grad_norm": 0.39024823904037476, + "learning_rate": 1.3317571836131718e-05, + "loss": 0.2498, + "step": 21134 + }, + { + "epoch": 0.3923912922401726, + "grad_norm": 0.5496432781219482, + "learning_rate": 1.3316471387316499e-05, + "loss": 0.3569, + "step": 21136 + }, + { + "epoch": 0.39242842237759124, + "grad_norm": 0.30583733320236206, + "learning_rate": 1.3315370893374038e-05, + "loss": 0.3339, + "step": 21138 + }, + { + "epoch": 0.39246555251500986, + "grad_norm": 0.4832291007041931, + "learning_rate": 1.3314270354319308e-05, + "loss": 0.301, + "step": 21140 + }, + { + "epoch": 0.3925026826524285, + "grad_norm": 0.4621949791908264, + "learning_rate": 1.3313169770167287e-05, + "loss": 0.2119, + "step": 21142 + }, + { + "epoch": 0.3925398127898471, + "grad_norm": 0.37230128049850464, + "learning_rate": 1.3312069140932948e-05, + "loss": 0.3119, + "step": 21144 + }, + { + "epoch": 0.39257694292726575, + "grad_norm": 0.30157050490379333, + "learning_rate": 1.3310968466631268e-05, + "loss": 0.5064, + "step": 21146 + }, + { + "epoch": 0.39261407306468443, + "grad_norm": 0.3491836190223694, + "learning_rate": 1.3309867747277227e-05, + "loss": 0.2235, + "step": 21148 + }, + { + "epoch": 0.39265120320210306, + "grad_norm": 0.4691217839717865, + "learning_rate": 1.3308766982885803e-05, + "loss": 0.3598, + "step": 21150 + }, + { + "epoch": 0.3926883333395217, + "grad_norm": 0.32685142755508423, + "learning_rate": 1.3307666173471966e-05, + "loss": 0.4718, + "step": 21152 + }, + { + "epoch": 0.3927254634769403, + "grad_norm": 0.35390961170196533, + "learning_rate": 1.3306565319050702e-05, + "loss": 0.2644, + "step": 21154 + }, + { + "epoch": 0.39276259361435895, + "grad_norm": 0.45941615104675293, + "learning_rate": 1.3305464419636988e-05, + "loss": 0.3608, + "step": 21156 + }, + { + "epoch": 0.39279972375177763, + "grad_norm": 0.3114888072013855, + "learning_rate": 1.3304363475245805e-05, + "loss": 0.2937, + "step": 21158 + }, + { + "epoch": 0.39283685388919626, + "grad_norm": 0.4053100049495697, + "learning_rate": 1.3303262485892132e-05, + "loss": 0.236, + "step": 21160 + }, + { + "epoch": 0.3928739840266149, + "grad_norm": 0.35260215401649475, + "learning_rate": 1.3302161451590953e-05, + "loss": 0.266, + "step": 21162 + }, + { + "epoch": 0.3929111141640335, + "grad_norm": 0.4710381031036377, + "learning_rate": 1.3301060372357247e-05, + "loss": 0.4269, + "step": 21164 + }, + { + "epoch": 0.39294824430145214, + "grad_norm": 0.2651098966598511, + "learning_rate": 1.3299959248205994e-05, + "loss": 0.314, + "step": 21166 + }, + { + "epoch": 0.39298537443887077, + "grad_norm": 0.3585895299911499, + "learning_rate": 1.3298858079152184e-05, + "loss": 0.2222, + "step": 21168 + }, + { + "epoch": 0.39302250457628946, + "grad_norm": 0.30114251375198364, + "learning_rate": 1.3297756865210794e-05, + "loss": 0.2756, + "step": 21170 + }, + { + "epoch": 0.3930596347137081, + "grad_norm": 0.29901087284088135, + "learning_rate": 1.3296655606396814e-05, + "loss": 0.2678, + "step": 21172 + }, + { + "epoch": 0.3930967648511267, + "grad_norm": 0.33698150515556335, + "learning_rate": 1.3295554302725223e-05, + "loss": 0.4412, + "step": 21174 + }, + { + "epoch": 0.39313389498854534, + "grad_norm": 0.3872765004634857, + "learning_rate": 1.3294452954211008e-05, + "loss": 0.2959, + "step": 21176 + }, + { + "epoch": 0.39317102512596397, + "grad_norm": 0.1867680698633194, + "learning_rate": 1.329335156086916e-05, + "loss": 0.1974, + "step": 21178 + }, + { + "epoch": 0.39320815526338265, + "grad_norm": 0.30476659536361694, + "learning_rate": 1.329225012271466e-05, + "loss": 0.4242, + "step": 21180 + }, + { + "epoch": 0.3932452854008013, + "grad_norm": 0.37183302640914917, + "learning_rate": 1.32911486397625e-05, + "loss": 0.354, + "step": 21182 + }, + { + "epoch": 0.3932824155382199, + "grad_norm": 0.43083375692367554, + "learning_rate": 1.3290047112027661e-05, + "loss": 0.4312, + "step": 21184 + }, + { + "epoch": 0.39331954567563854, + "grad_norm": 0.2652100920677185, + "learning_rate": 1.3288945539525138e-05, + "loss": 0.3595, + "step": 21186 + }, + { + "epoch": 0.39335667581305717, + "grad_norm": 0.35747572779655457, + "learning_rate": 1.3287843922269916e-05, + "loss": 0.3319, + "step": 21188 + }, + { + "epoch": 0.39339380595047585, + "grad_norm": 0.34798482060432434, + "learning_rate": 1.328674226027699e-05, + "loss": 0.4377, + "step": 21190 + }, + { + "epoch": 0.3934309360878945, + "grad_norm": 0.2766435742378235, + "learning_rate": 1.3285640553561345e-05, + "loss": 0.1935, + "step": 21192 + }, + { + "epoch": 0.3934680662253131, + "grad_norm": 0.5573251247406006, + "learning_rate": 1.3284538802137972e-05, + "loss": 0.2626, + "step": 21194 + }, + { + "epoch": 0.39350519636273174, + "grad_norm": 0.39734238386154175, + "learning_rate": 1.3283437006021864e-05, + "loss": 0.1784, + "step": 21196 + }, + { + "epoch": 0.39354232650015036, + "grad_norm": 0.45531079173088074, + "learning_rate": 1.3282335165228016e-05, + "loss": 0.4027, + "step": 21198 + }, + { + "epoch": 0.393579456637569, + "grad_norm": 0.3285204768180847, + "learning_rate": 1.3281233279771418e-05, + "loss": 0.1327, + "step": 21200 + }, + { + "epoch": 0.3936165867749877, + "grad_norm": 0.26710113883018494, + "learning_rate": 1.3280131349667062e-05, + "loss": 0.3392, + "step": 21202 + }, + { + "epoch": 0.3936537169124063, + "grad_norm": 0.36151015758514404, + "learning_rate": 1.3279029374929944e-05, + "loss": 0.3119, + "step": 21204 + }, + { + "epoch": 0.39369084704982493, + "grad_norm": 0.33402392268180847, + "learning_rate": 1.3277927355575057e-05, + "loss": 0.143, + "step": 21206 + }, + { + "epoch": 0.39372797718724356, + "grad_norm": 0.3668234050273895, + "learning_rate": 1.3276825291617399e-05, + "loss": 0.3995, + "step": 21208 + }, + { + "epoch": 0.3937651073246622, + "grad_norm": 0.40789994597435, + "learning_rate": 1.3275723183071962e-05, + "loss": 0.3212, + "step": 21210 + }, + { + "epoch": 0.3938022374620809, + "grad_norm": 0.5132853388786316, + "learning_rate": 1.3274621029953747e-05, + "loss": 0.0871, + "step": 21212 + }, + { + "epoch": 0.3938393675994995, + "grad_norm": 0.6429086923599243, + "learning_rate": 1.3273518832277747e-05, + "loss": 0.3234, + "step": 21214 + }, + { + "epoch": 0.39387649773691813, + "grad_norm": 0.5411849021911621, + "learning_rate": 1.327241659005896e-05, + "loss": 0.3408, + "step": 21216 + }, + { + "epoch": 0.39391362787433676, + "grad_norm": 0.3104947507381439, + "learning_rate": 1.3271314303312388e-05, + "loss": 0.2007, + "step": 21218 + }, + { + "epoch": 0.3939507580117554, + "grad_norm": 0.4175235629081726, + "learning_rate": 1.3270211972053024e-05, + "loss": 0.2485, + "step": 21220 + }, + { + "epoch": 0.393987888149174, + "grad_norm": 0.35643407702445984, + "learning_rate": 1.3269109596295873e-05, + "loss": 0.3487, + "step": 21222 + }, + { + "epoch": 0.3940250182865927, + "grad_norm": 0.32255619764328003, + "learning_rate": 1.3268007176055932e-05, + "loss": 0.209, + "step": 21224 + }, + { + "epoch": 0.3940621484240113, + "grad_norm": 0.6464990973472595, + "learning_rate": 1.3266904711348202e-05, + "loss": 0.2817, + "step": 21226 + }, + { + "epoch": 0.39409927856142996, + "grad_norm": 0.9786666035652161, + "learning_rate": 1.3265802202187682e-05, + "loss": 0.2629, + "step": 21228 + }, + { + "epoch": 0.3941364086988486, + "grad_norm": 0.3321402072906494, + "learning_rate": 1.3264699648589381e-05, + "loss": 0.3024, + "step": 21230 + }, + { + "epoch": 0.3941735388362672, + "grad_norm": 0.25988873839378357, + "learning_rate": 1.326359705056829e-05, + "loss": 0.3272, + "step": 21232 + }, + { + "epoch": 0.3942106689736859, + "grad_norm": 0.43860870599746704, + "learning_rate": 1.3262494408139426e-05, + "loss": 0.4614, + "step": 21234 + }, + { + "epoch": 0.3942477991111045, + "grad_norm": 0.2583576440811157, + "learning_rate": 1.326139172131778e-05, + "loss": 0.2008, + "step": 21236 + }, + { + "epoch": 0.39428492924852315, + "grad_norm": 0.3852914571762085, + "learning_rate": 1.3260288990118364e-05, + "loss": 0.2093, + "step": 21238 + }, + { + "epoch": 0.3943220593859418, + "grad_norm": 0.32271698117256165, + "learning_rate": 1.3259186214556178e-05, + "loss": 0.2465, + "step": 21240 + }, + { + "epoch": 0.3943591895233604, + "grad_norm": 0.3636697828769684, + "learning_rate": 1.3258083394646233e-05, + "loss": 0.3989, + "step": 21242 + }, + { + "epoch": 0.39439631966077904, + "grad_norm": 0.5882134437561035, + "learning_rate": 1.3256980530403533e-05, + "loss": 0.3478, + "step": 21244 + }, + { + "epoch": 0.3944334497981977, + "grad_norm": 0.3851419985294342, + "learning_rate": 1.3255877621843084e-05, + "loss": 0.4214, + "step": 21246 + }, + { + "epoch": 0.39447057993561635, + "grad_norm": 0.39672455191612244, + "learning_rate": 1.3254774668979889e-05, + "loss": 0.481, + "step": 21248 + }, + { + "epoch": 0.394507710073035, + "grad_norm": 0.32517409324645996, + "learning_rate": 1.3253671671828963e-05, + "loss": 0.2591, + "step": 21250 + }, + { + "epoch": 0.3945448402104536, + "grad_norm": 0.38770776987075806, + "learning_rate": 1.325256863040531e-05, + "loss": 0.3072, + "step": 21252 + }, + { + "epoch": 0.39458197034787224, + "grad_norm": 0.48269984126091003, + "learning_rate": 1.325146554472394e-05, + "loss": 0.2211, + "step": 21254 + }, + { + "epoch": 0.3946191004852909, + "grad_norm": 0.4763682782649994, + "learning_rate": 1.3250362414799866e-05, + "loss": 0.2234, + "step": 21256 + }, + { + "epoch": 0.39465623062270955, + "grad_norm": 1.129381775856018, + "learning_rate": 1.3249259240648093e-05, + "loss": 0.3374, + "step": 21258 + }, + { + "epoch": 0.3946933607601282, + "grad_norm": 0.5001177787780762, + "learning_rate": 1.3248156022283634e-05, + "loss": 0.4029, + "step": 21260 + }, + { + "epoch": 0.3947304908975468, + "grad_norm": 0.5001845955848694, + "learning_rate": 1.3247052759721504e-05, + "loss": 0.4877, + "step": 21262 + }, + { + "epoch": 0.39476762103496543, + "grad_norm": 0.3281537592411041, + "learning_rate": 1.3245949452976707e-05, + "loss": 0.34, + "step": 21264 + }, + { + "epoch": 0.3948047511723841, + "grad_norm": 0.2567083537578583, + "learning_rate": 1.3244846102064266e-05, + "loss": 0.2163, + "step": 21266 + }, + { + "epoch": 0.39484188130980274, + "grad_norm": 0.40553757548332214, + "learning_rate": 1.3243742706999187e-05, + "loss": 0.2292, + "step": 21268 + }, + { + "epoch": 0.3948790114472214, + "grad_norm": 0.5182636380195618, + "learning_rate": 1.3242639267796484e-05, + "loss": 0.246, + "step": 21270 + }, + { + "epoch": 0.39491614158464, + "grad_norm": 0.2763744294643402, + "learning_rate": 1.3241535784471173e-05, + "loss": 0.2178, + "step": 21272 + }, + { + "epoch": 0.39495327172205863, + "grad_norm": 0.3817051947116852, + "learning_rate": 1.3240432257038274e-05, + "loss": 0.4699, + "step": 21274 + }, + { + "epoch": 0.39499040185947726, + "grad_norm": 0.37756794691085815, + "learning_rate": 1.3239328685512796e-05, + "loss": 0.4243, + "step": 21276 + }, + { + "epoch": 0.39502753199689594, + "grad_norm": 0.3087480664253235, + "learning_rate": 1.3238225069909757e-05, + "loss": 0.2684, + "step": 21278 + }, + { + "epoch": 0.39506466213431457, + "grad_norm": 0.4551987946033478, + "learning_rate": 1.3237121410244174e-05, + "loss": 0.341, + "step": 21280 + }, + { + "epoch": 0.3951017922717332, + "grad_norm": 0.28957197070121765, + "learning_rate": 1.3236017706531066e-05, + "loss": 0.2887, + "step": 21282 + }, + { + "epoch": 0.3951389224091518, + "grad_norm": 0.36098745465278625, + "learning_rate": 1.323491395878545e-05, + "loss": 0.179, + "step": 21284 + }, + { + "epoch": 0.39517605254657046, + "grad_norm": 0.23374038934707642, + "learning_rate": 1.3233810167022343e-05, + "loss": 0.2984, + "step": 21286 + }, + { + "epoch": 0.39521318268398914, + "grad_norm": 0.35284125804901123, + "learning_rate": 1.3232706331256768e-05, + "loss": 0.1746, + "step": 21288 + }, + { + "epoch": 0.39525031282140777, + "grad_norm": 0.3048955798149109, + "learning_rate": 1.3231602451503743e-05, + "loss": 0.28, + "step": 21290 + }, + { + "epoch": 0.3952874429588264, + "grad_norm": 0.38226455450057983, + "learning_rate": 1.3230498527778285e-05, + "loss": 0.3399, + "step": 21292 + }, + { + "epoch": 0.395324573096245, + "grad_norm": 0.3001598119735718, + "learning_rate": 1.3229394560095421e-05, + "loss": 0.5027, + "step": 21294 + }, + { + "epoch": 0.39536170323366365, + "grad_norm": 0.4842562675476074, + "learning_rate": 1.322829054847017e-05, + "loss": 0.3179, + "step": 21296 + }, + { + "epoch": 0.3953988333710823, + "grad_norm": 0.38519835472106934, + "learning_rate": 1.3227186492917557e-05, + "loss": 0.4255, + "step": 21298 + }, + { + "epoch": 0.39543596350850096, + "grad_norm": 0.3944139778614044, + "learning_rate": 1.3226082393452599e-05, + "loss": 0.1568, + "step": 21300 + }, + { + "epoch": 0.3954730936459196, + "grad_norm": 0.27098917961120605, + "learning_rate": 1.3224978250090323e-05, + "loss": 0.2928, + "step": 21302 + }, + { + "epoch": 0.3955102237833382, + "grad_norm": 0.2932801842689514, + "learning_rate": 1.3223874062845755e-05, + "loss": 0.3957, + "step": 21304 + }, + { + "epoch": 0.39554735392075685, + "grad_norm": 0.364400178194046, + "learning_rate": 1.3222769831733918e-05, + "loss": 0.2877, + "step": 21306 + }, + { + "epoch": 0.3955844840581755, + "grad_norm": 0.45450127124786377, + "learning_rate": 1.3221665556769833e-05, + "loss": 0.2072, + "step": 21308 + }, + { + "epoch": 0.39562161419559416, + "grad_norm": 0.4417363107204437, + "learning_rate": 1.3220561237968531e-05, + "loss": 0.5069, + "step": 21310 + }, + { + "epoch": 0.3956587443330128, + "grad_norm": 0.4484076201915741, + "learning_rate": 1.321945687534504e-05, + "loss": 0.3622, + "step": 21312 + }, + { + "epoch": 0.3956958744704314, + "grad_norm": 0.38349708914756775, + "learning_rate": 1.3218352468914381e-05, + "loss": 0.3303, + "step": 21314 + }, + { + "epoch": 0.39573300460785005, + "grad_norm": 0.33295178413391113, + "learning_rate": 1.3217248018691589e-05, + "loss": 0.2341, + "step": 21316 + }, + { + "epoch": 0.3957701347452687, + "grad_norm": 0.4847041964530945, + "learning_rate": 1.3216143524691684e-05, + "loss": 0.4044, + "step": 21318 + }, + { + "epoch": 0.3958072648826873, + "grad_norm": 0.4032294750213623, + "learning_rate": 1.3215038986929702e-05, + "loss": 0.2625, + "step": 21320 + }, + { + "epoch": 0.395844395020106, + "grad_norm": 0.5330463647842407, + "learning_rate": 1.321393440542067e-05, + "loss": 0.2502, + "step": 21322 + }, + { + "epoch": 0.3958815251575246, + "grad_norm": 0.35369816422462463, + "learning_rate": 1.3212829780179614e-05, + "loss": 0.4269, + "step": 21324 + }, + { + "epoch": 0.39591865529494324, + "grad_norm": 0.31855523586273193, + "learning_rate": 1.3211725111221571e-05, + "loss": 0.268, + "step": 21326 + }, + { + "epoch": 0.3959557854323619, + "grad_norm": 0.41287457942962646, + "learning_rate": 1.3210620398561568e-05, + "loss": 0.2168, + "step": 21328 + }, + { + "epoch": 0.3959929155697805, + "grad_norm": 0.461836576461792, + "learning_rate": 1.3209515642214642e-05, + "loss": 0.2727, + "step": 21330 + }, + { + "epoch": 0.3960300457071992, + "grad_norm": 0.3211372494697571, + "learning_rate": 1.3208410842195818e-05, + "loss": 0.2892, + "step": 21332 + }, + { + "epoch": 0.3960671758446178, + "grad_norm": 0.25807759165763855, + "learning_rate": 1.3207305998520133e-05, + "loss": 0.2613, + "step": 21334 + }, + { + "epoch": 0.39610430598203644, + "grad_norm": 0.36174342036247253, + "learning_rate": 1.3206201111202619e-05, + "loss": 0.3193, + "step": 21336 + }, + { + "epoch": 0.39614143611945507, + "grad_norm": 0.3498457670211792, + "learning_rate": 1.3205096180258314e-05, + "loss": 0.3475, + "step": 21338 + }, + { + "epoch": 0.3961785662568737, + "grad_norm": 0.37105366587638855, + "learning_rate": 1.320399120570225e-05, + "loss": 0.334, + "step": 21340 + }, + { + "epoch": 0.3962156963942924, + "grad_norm": 0.38353902101516724, + "learning_rate": 1.3202886187549465e-05, + "loss": 0.2405, + "step": 21342 + }, + { + "epoch": 0.396252826531711, + "grad_norm": 0.3860570192337036, + "learning_rate": 1.320178112581499e-05, + "loss": 0.2292, + "step": 21344 + }, + { + "epoch": 0.39628995666912964, + "grad_norm": 0.48549535870552063, + "learning_rate": 1.3200676020513866e-05, + "loss": 0.1837, + "step": 21346 + }, + { + "epoch": 0.39632708680654827, + "grad_norm": 0.5339629054069519, + "learning_rate": 1.3199570871661124e-05, + "loss": 0.384, + "step": 21348 + }, + { + "epoch": 0.3963642169439669, + "grad_norm": 0.4551251232624054, + "learning_rate": 1.319846567927181e-05, + "loss": 0.3641, + "step": 21350 + }, + { + "epoch": 0.3964013470813855, + "grad_norm": 0.4270746409893036, + "learning_rate": 1.319736044336096e-05, + "loss": 0.2495, + "step": 21352 + }, + { + "epoch": 0.3964384772188042, + "grad_norm": 0.32252249121665955, + "learning_rate": 1.3196255163943608e-05, + "loss": 0.2225, + "step": 21354 + }, + { + "epoch": 0.39647560735622284, + "grad_norm": 0.34464919567108154, + "learning_rate": 1.31951498410348e-05, + "loss": 0.4904, + "step": 21356 + }, + { + "epoch": 0.39651273749364147, + "grad_norm": 0.3619462847709656, + "learning_rate": 1.3194044474649572e-05, + "loss": 0.2859, + "step": 21358 + }, + { + "epoch": 0.3965498676310601, + "grad_norm": 0.4266261160373688, + "learning_rate": 1.3192939064802966e-05, + "loss": 0.2435, + "step": 21360 + }, + { + "epoch": 0.3965869977684787, + "grad_norm": 0.5674176216125488, + "learning_rate": 1.3191833611510024e-05, + "loss": 0.3594, + "step": 21362 + }, + { + "epoch": 0.3966241279058974, + "grad_norm": 0.44203129410743713, + "learning_rate": 1.3190728114785784e-05, + "loss": 0.2809, + "step": 21364 + }, + { + "epoch": 0.39666125804331603, + "grad_norm": 0.45486965775489807, + "learning_rate": 1.3189622574645295e-05, + "loss": 0.2336, + "step": 21366 + }, + { + "epoch": 0.39669838818073466, + "grad_norm": 0.3379833698272705, + "learning_rate": 1.3188516991103595e-05, + "loss": 0.3201, + "step": 21368 + }, + { + "epoch": 0.3967355183181533, + "grad_norm": 0.3028241991996765, + "learning_rate": 1.318741136417573e-05, + "loss": 0.2787, + "step": 21370 + }, + { + "epoch": 0.3967726484555719, + "grad_norm": 0.3652766942977905, + "learning_rate": 1.3186305693876746e-05, + "loss": 0.3897, + "step": 21372 + }, + { + "epoch": 0.39680977859299055, + "grad_norm": 0.39875528216362, + "learning_rate": 1.3185199980221683e-05, + "loss": 0.3577, + "step": 21374 + }, + { + "epoch": 0.39684690873040923, + "grad_norm": 0.3411742150783539, + "learning_rate": 1.3184094223225592e-05, + "loss": 0.2498, + "step": 21376 + }, + { + "epoch": 0.39688403886782786, + "grad_norm": 0.4538913667201996, + "learning_rate": 1.3182988422903513e-05, + "loss": 0.3293, + "step": 21378 + }, + { + "epoch": 0.3969211690052465, + "grad_norm": 0.46343672275543213, + "learning_rate": 1.3181882579270495e-05, + "loss": 0.5112, + "step": 21380 + }, + { + "epoch": 0.3969582991426651, + "grad_norm": 2.7093591690063477, + "learning_rate": 1.3180776692341589e-05, + "loss": 0.3928, + "step": 21382 + }, + { + "epoch": 0.39699542928008374, + "grad_norm": 0.44910404086112976, + "learning_rate": 1.3179670762131838e-05, + "loss": 0.2048, + "step": 21384 + }, + { + "epoch": 0.39703255941750243, + "grad_norm": 0.3405029773712158, + "learning_rate": 1.3178564788656291e-05, + "loss": 0.2839, + "step": 21386 + }, + { + "epoch": 0.39706968955492106, + "grad_norm": 0.32834893465042114, + "learning_rate": 1.317745877193e-05, + "loss": 0.1788, + "step": 21388 + }, + { + "epoch": 0.3971068196923397, + "grad_norm": 0.42046496272087097, + "learning_rate": 1.3176352711968013e-05, + "loss": 0.362, + "step": 21390 + }, + { + "epoch": 0.3971439498297583, + "grad_norm": 0.4401164948940277, + "learning_rate": 1.317524660878538e-05, + "loss": 0.4253, + "step": 21392 + }, + { + "epoch": 0.39718107996717694, + "grad_norm": 0.3752206563949585, + "learning_rate": 1.3174140462397152e-05, + "loss": 0.3122, + "step": 21394 + }, + { + "epoch": 0.39721821010459557, + "grad_norm": 0.4886712431907654, + "learning_rate": 1.3173034272818378e-05, + "loss": 0.3552, + "step": 21396 + }, + { + "epoch": 0.39725534024201425, + "grad_norm": 0.340210884809494, + "learning_rate": 1.317192804006411e-05, + "loss": 0.3417, + "step": 21398 + }, + { + "epoch": 0.3972924703794329, + "grad_norm": 0.3552398085594177, + "learning_rate": 1.3170821764149406e-05, + "loss": 0.2166, + "step": 21400 + }, + { + "epoch": 0.3973296005168515, + "grad_norm": 0.36921200156211853, + "learning_rate": 1.3169715445089315e-05, + "loss": 0.3026, + "step": 21402 + }, + { + "epoch": 0.39736673065427014, + "grad_norm": 0.34347596764564514, + "learning_rate": 1.3168609082898892e-05, + "loss": 0.3594, + "step": 21404 + }, + { + "epoch": 0.39740386079168877, + "grad_norm": 0.24991226196289062, + "learning_rate": 1.3167502677593187e-05, + "loss": 0.2462, + "step": 21406 + }, + { + "epoch": 0.39744099092910745, + "grad_norm": 0.49743711948394775, + "learning_rate": 1.3166396229187262e-05, + "loss": 0.4166, + "step": 21408 + }, + { + "epoch": 0.3974781210665261, + "grad_norm": 3.030057430267334, + "learning_rate": 1.3165289737696165e-05, + "loss": 0.0842, + "step": 21410 + }, + { + "epoch": 0.3975152512039447, + "grad_norm": 0.5286726355552673, + "learning_rate": 1.3164183203134957e-05, + "loss": 0.3606, + "step": 21412 + }, + { + "epoch": 0.39755238134136334, + "grad_norm": 0.5562342405319214, + "learning_rate": 1.3163076625518699e-05, + "loss": 0.2306, + "step": 21414 + }, + { + "epoch": 0.39758951147878197, + "grad_norm": 0.3972679674625397, + "learning_rate": 1.3161970004862437e-05, + "loss": 0.3435, + "step": 21416 + }, + { + "epoch": 0.39762664161620065, + "grad_norm": 0.362491637468338, + "learning_rate": 1.3160863341181234e-05, + "loss": 0.5325, + "step": 21418 + }, + { + "epoch": 0.3976637717536193, + "grad_norm": 0.4831483066082001, + "learning_rate": 1.315975663449015e-05, + "loss": 0.3474, + "step": 21420 + }, + { + "epoch": 0.3977009018910379, + "grad_norm": 0.40423882007598877, + "learning_rate": 1.3158649884804242e-05, + "loss": 0.1777, + "step": 21422 + }, + { + "epoch": 0.39773803202845653, + "grad_norm": 0.3032251000404358, + "learning_rate": 1.3157543092138574e-05, + "loss": 0.3354, + "step": 21424 + }, + { + "epoch": 0.39777516216587516, + "grad_norm": 0.3344537317752838, + "learning_rate": 1.31564362565082e-05, + "loss": 0.3274, + "step": 21426 + }, + { + "epoch": 0.3978122923032938, + "grad_norm": 0.5467990636825562, + "learning_rate": 1.3155329377928183e-05, + "loss": 0.2467, + "step": 21428 + }, + { + "epoch": 0.3978494224407125, + "grad_norm": 0.2842448055744171, + "learning_rate": 1.3154222456413584e-05, + "loss": 0.2789, + "step": 21430 + }, + { + "epoch": 0.3978865525781311, + "grad_norm": 0.6168789863586426, + "learning_rate": 1.3153115491979467e-05, + "loss": 0.3106, + "step": 21432 + }, + { + "epoch": 0.39792368271554973, + "grad_norm": 0.4014568328857422, + "learning_rate": 1.3152008484640891e-05, + "loss": 0.2412, + "step": 21434 + }, + { + "epoch": 0.39796081285296836, + "grad_norm": 0.2806049883365631, + "learning_rate": 1.3150901434412921e-05, + "loss": 0.2462, + "step": 21436 + }, + { + "epoch": 0.397997942990387, + "grad_norm": 0.2523127496242523, + "learning_rate": 1.3149794341310623e-05, + "loss": 0.2597, + "step": 21438 + }, + { + "epoch": 0.39803507312780567, + "grad_norm": 0.44269904494285583, + "learning_rate": 1.3148687205349055e-05, + "loss": 0.3978, + "step": 21440 + }, + { + "epoch": 0.3980722032652243, + "grad_norm": 0.3841804563999176, + "learning_rate": 1.3147580026543288e-05, + "loss": 0.4216, + "step": 21442 + }, + { + "epoch": 0.39810933340264293, + "grad_norm": 0.5604090690612793, + "learning_rate": 1.3146472804908386e-05, + "loss": 0.3827, + "step": 21444 + }, + { + "epoch": 0.39814646354006156, + "grad_norm": 0.5545257329940796, + "learning_rate": 1.3145365540459412e-05, + "loss": 0.2932, + "step": 21446 + }, + { + "epoch": 0.3981835936774802, + "grad_norm": 0.28801146149635315, + "learning_rate": 1.3144258233211434e-05, + "loss": 0.2779, + "step": 21448 + }, + { + "epoch": 0.3982207238148988, + "grad_norm": 0.29569265246391296, + "learning_rate": 1.3143150883179519e-05, + "loss": 0.2422, + "step": 21450 + }, + { + "epoch": 0.3982578539523175, + "grad_norm": 0.4048759937286377, + "learning_rate": 1.3142043490378737e-05, + "loss": 0.3219, + "step": 21452 + }, + { + "epoch": 0.3982949840897361, + "grad_norm": 0.5841191411018372, + "learning_rate": 1.3140936054824153e-05, + "loss": 0.3237, + "step": 21454 + }, + { + "epoch": 0.39833211422715475, + "grad_norm": 0.3433621823787689, + "learning_rate": 1.313982857653084e-05, + "loss": 0.2875, + "step": 21456 + }, + { + "epoch": 0.3983692443645734, + "grad_norm": 0.541553795337677, + "learning_rate": 1.3138721055513866e-05, + "loss": 0.2948, + "step": 21458 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.32442864775657654, + "learning_rate": 1.3137613491788295e-05, + "loss": 0.1831, + "step": 21460 + }, + { + "epoch": 0.3984435046394107, + "grad_norm": 0.45583680272102356, + "learning_rate": 1.3136505885369203e-05, + "loss": 0.1952, + "step": 21462 + }, + { + "epoch": 0.3984806347768293, + "grad_norm": 0.41744568943977356, + "learning_rate": 1.3135398236271666e-05, + "loss": 0.4591, + "step": 21464 + }, + { + "epoch": 0.39851776491424795, + "grad_norm": 0.4317241311073303, + "learning_rate": 1.3134290544510746e-05, + "loss": 0.238, + "step": 21466 + }, + { + "epoch": 0.3985548950516666, + "grad_norm": 0.40772545337677, + "learning_rate": 1.3133182810101524e-05, + "loss": 0.1757, + "step": 21468 + }, + { + "epoch": 0.3985920251890852, + "grad_norm": 0.3294576406478882, + "learning_rate": 1.3132075033059067e-05, + "loss": 0.2891, + "step": 21470 + }, + { + "epoch": 0.39862915532650384, + "grad_norm": 0.3333803713321686, + "learning_rate": 1.3130967213398448e-05, + "loss": 0.2023, + "step": 21472 + }, + { + "epoch": 0.3986662854639225, + "grad_norm": 0.26052340865135193, + "learning_rate": 1.3129859351134746e-05, + "loss": 0.3567, + "step": 21474 + }, + { + "epoch": 0.39870341560134115, + "grad_norm": 0.29549750685691833, + "learning_rate": 1.3128751446283032e-05, + "loss": 0.2689, + "step": 21476 + }, + { + "epoch": 0.3987405457387598, + "grad_norm": 0.3896428942680359, + "learning_rate": 1.3127643498858386e-05, + "loss": 0.3316, + "step": 21478 + }, + { + "epoch": 0.3987776758761784, + "grad_norm": 0.5215996503829956, + "learning_rate": 1.3126535508875878e-05, + "loss": 0.4622, + "step": 21480 + }, + { + "epoch": 0.39881480601359703, + "grad_norm": 0.3271085023880005, + "learning_rate": 1.3125427476350586e-05, + "loss": 0.2186, + "step": 21482 + }, + { + "epoch": 0.3988519361510157, + "grad_norm": 0.36718741059303284, + "learning_rate": 1.3124319401297587e-05, + "loss": 0.376, + "step": 21484 + }, + { + "epoch": 0.39888906628843435, + "grad_norm": 0.4738021790981293, + "learning_rate": 1.312321128373196e-05, + "loss": 0.3363, + "step": 21486 + }, + { + "epoch": 0.398926196425853, + "grad_norm": 0.35821533203125, + "learning_rate": 1.3122103123668783e-05, + "loss": 0.2736, + "step": 21488 + }, + { + "epoch": 0.3989633265632716, + "grad_norm": 0.4866703152656555, + "learning_rate": 1.3120994921123136e-05, + "loss": 0.3008, + "step": 21490 + }, + { + "epoch": 0.39900045670069023, + "grad_norm": 0.6055618524551392, + "learning_rate": 1.3119886676110094e-05, + "loss": 0.1778, + "step": 21492 + }, + { + "epoch": 0.3990375868381089, + "grad_norm": 0.38749030232429504, + "learning_rate": 1.3118778388644739e-05, + "loss": 0.4815, + "step": 21494 + }, + { + "epoch": 0.39907471697552754, + "grad_norm": 0.32699114084243774, + "learning_rate": 1.3117670058742156e-05, + "loss": 0.4089, + "step": 21496 + }, + { + "epoch": 0.39911184711294617, + "grad_norm": 0.37155407667160034, + "learning_rate": 1.3116561686417419e-05, + "loss": 0.3324, + "step": 21498 + }, + { + "epoch": 0.3991489772503648, + "grad_norm": 0.38316500186920166, + "learning_rate": 1.3115453271685614e-05, + "loss": 0.1814, + "step": 21500 + }, + { + "epoch": 0.39918610738778343, + "grad_norm": 0.5003445744514465, + "learning_rate": 1.3114344814561822e-05, + "loss": 0.3445, + "step": 21502 + }, + { + "epoch": 0.39922323752520206, + "grad_norm": 0.3085777759552002, + "learning_rate": 1.3113236315061122e-05, + "loss": 0.3262, + "step": 21504 + }, + { + "epoch": 0.39926036766262074, + "grad_norm": 0.28057214617729187, + "learning_rate": 1.3112127773198605e-05, + "loss": 0.2253, + "step": 21506 + }, + { + "epoch": 0.39929749780003937, + "grad_norm": 0.34965234994888306, + "learning_rate": 1.311101918898935e-05, + "loss": 0.2385, + "step": 21508 + }, + { + "epoch": 0.399334627937458, + "grad_norm": 0.2598631978034973, + "learning_rate": 1.3109910562448441e-05, + "loss": 0.2407, + "step": 21510 + }, + { + "epoch": 0.3993717580748766, + "grad_norm": 0.518075704574585, + "learning_rate": 1.3108801893590965e-05, + "loss": 0.1029, + "step": 21512 + }, + { + "epoch": 0.39940888821229525, + "grad_norm": 0.5881736874580383, + "learning_rate": 1.3107693182432009e-05, + "loss": 0.3001, + "step": 21514 + }, + { + "epoch": 0.39944601834971394, + "grad_norm": 0.34947633743286133, + "learning_rate": 1.3106584428986655e-05, + "loss": 0.3163, + "step": 21516 + }, + { + "epoch": 0.39948314848713257, + "grad_norm": 0.4749600291252136, + "learning_rate": 1.310547563326999e-05, + "loss": 0.378, + "step": 21518 + }, + { + "epoch": 0.3995202786245512, + "grad_norm": 0.4823436141014099, + "learning_rate": 1.310436679529711e-05, + "loss": 0.345, + "step": 21520 + }, + { + "epoch": 0.3995574087619698, + "grad_norm": 0.41014525294303894, + "learning_rate": 1.3103257915083094e-05, + "loss": 0.3339, + "step": 21522 + }, + { + "epoch": 0.39959453889938845, + "grad_norm": 0.415578693151474, + "learning_rate": 1.3102148992643031e-05, + "loss": 0.2949, + "step": 21524 + }, + { + "epoch": 0.3996316690368071, + "grad_norm": 0.30829232931137085, + "learning_rate": 1.3101040027992013e-05, + "loss": 0.2404, + "step": 21526 + }, + { + "epoch": 0.39966879917422576, + "grad_norm": 0.2791459858417511, + "learning_rate": 1.3099931021145128e-05, + "loss": 0.2947, + "step": 21528 + }, + { + "epoch": 0.3997059293116444, + "grad_norm": 0.4334114193916321, + "learning_rate": 1.3098821972117472e-05, + "loss": 0.3971, + "step": 21530 + }, + { + "epoch": 0.399743059449063, + "grad_norm": 0.3893239200115204, + "learning_rate": 1.3097712880924127e-05, + "loss": 0.3549, + "step": 21532 + }, + { + "epoch": 0.39978018958648165, + "grad_norm": 0.5558448433876038, + "learning_rate": 1.309660374758019e-05, + "loss": 0.2364, + "step": 21534 + }, + { + "epoch": 0.3998173197239003, + "grad_norm": 0.5246394872665405, + "learning_rate": 1.3095494572100749e-05, + "loss": 0.2716, + "step": 21536 + }, + { + "epoch": 0.39985444986131896, + "grad_norm": 0.35002467036247253, + "learning_rate": 1.30943853545009e-05, + "loss": 0.327, + "step": 21538 + }, + { + "epoch": 0.3998915799987376, + "grad_norm": 0.42251402139663696, + "learning_rate": 1.3093276094795738e-05, + "loss": 0.3332, + "step": 21540 + }, + { + "epoch": 0.3999287101361562, + "grad_norm": 0.3282107710838318, + "learning_rate": 1.3092166793000351e-05, + "loss": 0.3129, + "step": 21542 + }, + { + "epoch": 0.39996584027357485, + "grad_norm": 0.367582768201828, + "learning_rate": 1.3091057449129837e-05, + "loss": 0.3116, + "step": 21544 + }, + { + "epoch": 0.4000029704109935, + "grad_norm": 0.28339990973472595, + "learning_rate": 1.308994806319929e-05, + "loss": 0.1659, + "step": 21546 + }, + { + "epoch": 0.4000401005484121, + "grad_norm": 0.44281962513923645, + "learning_rate": 1.3088838635223802e-05, + "loss": 0.2373, + "step": 21548 + }, + { + "epoch": 0.4000772306858308, + "grad_norm": 0.5008556246757507, + "learning_rate": 1.3087729165218475e-05, + "loss": 0.2859, + "step": 21550 + }, + { + "epoch": 0.4001143608232494, + "grad_norm": 0.47648438811302185, + "learning_rate": 1.3086619653198405e-05, + "loss": 0.361, + "step": 21552 + }, + { + "epoch": 0.40015149096066804, + "grad_norm": 0.3815816044807434, + "learning_rate": 1.3085510099178684e-05, + "loss": 0.2648, + "step": 21554 + }, + { + "epoch": 0.40018862109808667, + "grad_norm": 0.28419145941734314, + "learning_rate": 1.3084400503174413e-05, + "loss": 0.24, + "step": 21556 + }, + { + "epoch": 0.4002257512355053, + "grad_norm": 0.24789069592952728, + "learning_rate": 1.3083290865200691e-05, + "loss": 0.1528, + "step": 21558 + }, + { + "epoch": 0.400262881372924, + "grad_norm": 0.39886751770973206, + "learning_rate": 1.3082181185272616e-05, + "loss": 0.4838, + "step": 21560 + }, + { + "epoch": 0.4003000115103426, + "grad_norm": 0.3590618371963501, + "learning_rate": 1.3081071463405286e-05, + "loss": 0.3311, + "step": 21562 + }, + { + "epoch": 0.40033714164776124, + "grad_norm": 0.4033971428871155, + "learning_rate": 1.3079961699613803e-05, + "loss": 0.3016, + "step": 21564 + }, + { + "epoch": 0.40037427178517987, + "grad_norm": 0.4263336658477783, + "learning_rate": 1.3078851893913267e-05, + "loss": 0.5059, + "step": 21566 + }, + { + "epoch": 0.4004114019225985, + "grad_norm": 0.3186505138874054, + "learning_rate": 1.3077742046318777e-05, + "loss": 0.2991, + "step": 21568 + }, + { + "epoch": 0.4004485320600172, + "grad_norm": 0.26430410146713257, + "learning_rate": 1.3076632156845438e-05, + "loss": 0.1828, + "step": 21570 + }, + { + "epoch": 0.4004856621974358, + "grad_norm": 0.4302446246147156, + "learning_rate": 1.3075522225508352e-05, + "loss": 0.2867, + "step": 21572 + }, + { + "epoch": 0.40052279233485444, + "grad_norm": 0.34651967883110046, + "learning_rate": 1.3074412252322622e-05, + "loss": 0.0938, + "step": 21574 + }, + { + "epoch": 0.40055992247227307, + "grad_norm": 0.38205254077911377, + "learning_rate": 1.3073302237303347e-05, + "loss": 0.2982, + "step": 21576 + }, + { + "epoch": 0.4005970526096917, + "grad_norm": 0.333000510931015, + "learning_rate": 1.3072192180465638e-05, + "loss": 0.2751, + "step": 21578 + }, + { + "epoch": 0.4006341827471103, + "grad_norm": 0.28413447737693787, + "learning_rate": 1.307108208182459e-05, + "loss": 0.1093, + "step": 21580 + }, + { + "epoch": 0.400671312884529, + "grad_norm": 0.20403039455413818, + "learning_rate": 1.3069971941395319e-05, + "loss": 0.2215, + "step": 21582 + }, + { + "epoch": 0.40070844302194764, + "grad_norm": 0.3771042227745056, + "learning_rate": 1.3068861759192926e-05, + "loss": 0.3641, + "step": 21584 + }, + { + "epoch": 0.40074557315936626, + "grad_norm": 0.36434975266456604, + "learning_rate": 1.3067751535232511e-05, + "loss": 0.1067, + "step": 21586 + }, + { + "epoch": 0.4007827032967849, + "grad_norm": 0.2780624032020569, + "learning_rate": 1.306664126952919e-05, + "loss": 0.2565, + "step": 21588 + }, + { + "epoch": 0.4008198334342035, + "grad_norm": 0.368868887424469, + "learning_rate": 1.3065530962098068e-05, + "loss": 0.4183, + "step": 21590 + }, + { + "epoch": 0.4008569635716222, + "grad_norm": 0.48014897108078003, + "learning_rate": 1.306442061295425e-05, + "loss": 0.2594, + "step": 21592 + }, + { + "epoch": 0.40089409370904083, + "grad_norm": 0.4141768515110016, + "learning_rate": 1.3063310222112847e-05, + "loss": 0.3086, + "step": 21594 + }, + { + "epoch": 0.40093122384645946, + "grad_norm": 0.762408435344696, + "learning_rate": 1.3062199789588969e-05, + "loss": 0.3154, + "step": 21596 + }, + { + "epoch": 0.4009683539838781, + "grad_norm": 0.4782993495464325, + "learning_rate": 1.3061089315397721e-05, + "loss": 0.3074, + "step": 21598 + }, + { + "epoch": 0.4010054841212967, + "grad_norm": 0.46653079986572266, + "learning_rate": 1.3059978799554218e-05, + "loss": 0.2632, + "step": 21600 + }, + { + "epoch": 0.40104261425871535, + "grad_norm": 0.3656485676765442, + "learning_rate": 1.3058868242073572e-05, + "loss": 0.3791, + "step": 21602 + }, + { + "epoch": 0.40107974439613403, + "grad_norm": 0.40548112988471985, + "learning_rate": 1.3057757642970887e-05, + "loss": 0.4675, + "step": 21604 + }, + { + "epoch": 0.40111687453355266, + "grad_norm": 0.3303479552268982, + "learning_rate": 1.3056647002261283e-05, + "loss": 0.4633, + "step": 21606 + }, + { + "epoch": 0.4011540046709713, + "grad_norm": 0.36539745330810547, + "learning_rate": 1.3055536319959868e-05, + "loss": 0.3138, + "step": 21608 + }, + { + "epoch": 0.4011911348083899, + "grad_norm": 0.5113174915313721, + "learning_rate": 1.3054425596081757e-05, + "loss": 0.2921, + "step": 21610 + }, + { + "epoch": 0.40122826494580854, + "grad_norm": 0.3386172950267792, + "learning_rate": 1.305331483064206e-05, + "loss": 0.1756, + "step": 21612 + }, + { + "epoch": 0.4012653950832272, + "grad_norm": 0.513117790222168, + "learning_rate": 1.3052204023655897e-05, + "loss": 0.4476, + "step": 21614 + }, + { + "epoch": 0.40130252522064586, + "grad_norm": 0.4559982717037201, + "learning_rate": 1.3051093175138379e-05, + "loss": 0.3686, + "step": 21616 + }, + { + "epoch": 0.4013396553580645, + "grad_norm": 0.378753125667572, + "learning_rate": 1.3049982285104618e-05, + "loss": 0.3007, + "step": 21618 + }, + { + "epoch": 0.4013767854954831, + "grad_norm": 0.37841469049453735, + "learning_rate": 1.3048871353569737e-05, + "loss": 0.3634, + "step": 21620 + }, + { + "epoch": 0.40141391563290174, + "grad_norm": 0.28112712502479553, + "learning_rate": 1.304776038054885e-05, + "loss": 0.2533, + "step": 21622 + }, + { + "epoch": 0.40145104577032037, + "grad_norm": 0.389801949262619, + "learning_rate": 1.304664936605707e-05, + "loss": 0.2219, + "step": 21624 + }, + { + "epoch": 0.40148817590773905, + "grad_norm": 0.2653312087059021, + "learning_rate": 1.3045538310109524e-05, + "loss": 0.2627, + "step": 21626 + }, + { + "epoch": 0.4015253060451577, + "grad_norm": 0.5216689109802246, + "learning_rate": 1.304442721272132e-05, + "loss": 0.2086, + "step": 21628 + }, + { + "epoch": 0.4015624361825763, + "grad_norm": 1.0068920850753784, + "learning_rate": 1.3043316073907582e-05, + "loss": 0.2471, + "step": 21630 + }, + { + "epoch": 0.40159956631999494, + "grad_norm": 0.42398956418037415, + "learning_rate": 1.3042204893683425e-05, + "loss": 0.3777, + "step": 21632 + }, + { + "epoch": 0.40163669645741357, + "grad_norm": 0.4337816536426544, + "learning_rate": 1.3041093672063976e-05, + "loss": 0.4506, + "step": 21634 + }, + { + "epoch": 0.40167382659483225, + "grad_norm": 0.32641860842704773, + "learning_rate": 1.303998240906435e-05, + "loss": 0.32, + "step": 21636 + }, + { + "epoch": 0.4017109567322509, + "grad_norm": 0.4616388976573944, + "learning_rate": 1.3038871104699666e-05, + "loss": 0.2949, + "step": 21638 + }, + { + "epoch": 0.4017480868696695, + "grad_norm": 0.41880515217781067, + "learning_rate": 1.3037759758985053e-05, + "loss": 0.142, + "step": 21640 + }, + { + "epoch": 0.40178521700708814, + "grad_norm": 0.9421810507774353, + "learning_rate": 1.3036648371935628e-05, + "loss": 0.3271, + "step": 21642 + }, + { + "epoch": 0.40182234714450676, + "grad_norm": 0.2598293423652649, + "learning_rate": 1.3035536943566512e-05, + "loss": 0.2909, + "step": 21644 + }, + { + "epoch": 0.40185947728192545, + "grad_norm": 0.4688859283924103, + "learning_rate": 1.3034425473892833e-05, + "loss": 0.199, + "step": 21646 + }, + { + "epoch": 0.4018966074193441, + "grad_norm": 0.28260838985443115, + "learning_rate": 1.3033313962929713e-05, + "loss": 0.2756, + "step": 21648 + }, + { + "epoch": 0.4019337375567627, + "grad_norm": 0.4188922345638275, + "learning_rate": 1.3032202410692274e-05, + "loss": 0.4869, + "step": 21650 + }, + { + "epoch": 0.40197086769418133, + "grad_norm": 0.25489652156829834, + "learning_rate": 1.3031090817195641e-05, + "loss": 0.3904, + "step": 21652 + }, + { + "epoch": 0.40200799783159996, + "grad_norm": 0.691049337387085, + "learning_rate": 1.3029979182454944e-05, + "loss": 0.295, + "step": 21654 + }, + { + "epoch": 0.4020451279690186, + "grad_norm": 0.4149673283100128, + "learning_rate": 1.3028867506485305e-05, + "loss": 0.201, + "step": 21656 + }, + { + "epoch": 0.4020822581064373, + "grad_norm": 0.38414302468299866, + "learning_rate": 1.3027755789301853e-05, + "loss": 0.3103, + "step": 21658 + }, + { + "epoch": 0.4021193882438559, + "grad_norm": 0.360568106174469, + "learning_rate": 1.3026644030919715e-05, + "loss": 0.1855, + "step": 21660 + }, + { + "epoch": 0.40215651838127453, + "grad_norm": 0.41245150566101074, + "learning_rate": 1.3025532231354014e-05, + "loss": 0.2322, + "step": 21662 + }, + { + "epoch": 0.40219364851869316, + "grad_norm": 0.38634243607521057, + "learning_rate": 1.3024420390619882e-05, + "loss": 0.2568, + "step": 21664 + }, + { + "epoch": 0.4022307786561118, + "grad_norm": 0.41453200578689575, + "learning_rate": 1.302330850873245e-05, + "loss": 0.2059, + "step": 21666 + }, + { + "epoch": 0.40226790879353047, + "grad_norm": 0.3939494490623474, + "learning_rate": 1.3022196585706848e-05, + "loss": 0.1023, + "step": 21668 + }, + { + "epoch": 0.4023050389309491, + "grad_norm": 0.5226161479949951, + "learning_rate": 1.3021084621558197e-05, + "loss": 0.4945, + "step": 21670 + }, + { + "epoch": 0.4023421690683677, + "grad_norm": 0.5636163949966431, + "learning_rate": 1.3019972616301637e-05, + "loss": 0.3663, + "step": 21672 + }, + { + "epoch": 0.40237929920578636, + "grad_norm": 0.31683680415153503, + "learning_rate": 1.3018860569952292e-05, + "loss": 0.3103, + "step": 21674 + }, + { + "epoch": 0.402416429343205, + "grad_norm": 0.3009171783924103, + "learning_rate": 1.30177484825253e-05, + "loss": 0.3506, + "step": 21676 + }, + { + "epoch": 0.4024535594806236, + "grad_norm": 0.4664005935192108, + "learning_rate": 1.3016636354035793e-05, + "loss": 0.2831, + "step": 21678 + }, + { + "epoch": 0.4024906896180423, + "grad_norm": 0.5344043374061584, + "learning_rate": 1.3015524184498895e-05, + "loss": 0.097, + "step": 21680 + }, + { + "epoch": 0.4025278197554609, + "grad_norm": 0.5564618706703186, + "learning_rate": 1.301441197392975e-05, + "loss": 0.4211, + "step": 21682 + }, + { + "epoch": 0.40256494989287955, + "grad_norm": 0.5117163062095642, + "learning_rate": 1.3013299722343487e-05, + "loss": 0.277, + "step": 21684 + }, + { + "epoch": 0.4026020800302982, + "grad_norm": 0.34625816345214844, + "learning_rate": 1.3012187429755237e-05, + "loss": 0.2664, + "step": 21686 + }, + { + "epoch": 0.4026392101677168, + "grad_norm": 0.9470846652984619, + "learning_rate": 1.3011075096180142e-05, + "loss": 0.3186, + "step": 21688 + }, + { + "epoch": 0.4026763403051355, + "grad_norm": 0.3190852999687195, + "learning_rate": 1.3009962721633333e-05, + "loss": 0.4491, + "step": 21690 + }, + { + "epoch": 0.4027134704425541, + "grad_norm": 0.2218458652496338, + "learning_rate": 1.3008850306129948e-05, + "loss": 0.1908, + "step": 21692 + }, + { + "epoch": 0.40275060057997275, + "grad_norm": 0.40608659386634827, + "learning_rate": 1.300773784968512e-05, + "loss": 0.4634, + "step": 21694 + }, + { + "epoch": 0.4027877307173914, + "grad_norm": 0.33153679966926575, + "learning_rate": 1.3006625352313991e-05, + "loss": 0.4621, + "step": 21696 + }, + { + "epoch": 0.40282486085481, + "grad_norm": 0.29292234778404236, + "learning_rate": 1.3005512814031697e-05, + "loss": 0.2983, + "step": 21698 + }, + { + "epoch": 0.40286199099222864, + "grad_norm": 0.4406436085700989, + "learning_rate": 1.3004400234853378e-05, + "loss": 0.2223, + "step": 21700 + }, + { + "epoch": 0.4028991211296473, + "grad_norm": 0.3828709125518799, + "learning_rate": 1.300328761479417e-05, + "loss": 0.4096, + "step": 21702 + }, + { + "epoch": 0.40293625126706595, + "grad_norm": 0.4048078656196594, + "learning_rate": 1.3002174953869214e-05, + "loss": 0.2852, + "step": 21704 + }, + { + "epoch": 0.4029733814044846, + "grad_norm": 0.3186642527580261, + "learning_rate": 1.3001062252093645e-05, + "loss": 0.1662, + "step": 21706 + }, + { + "epoch": 0.4030105115419032, + "grad_norm": 0.39670151472091675, + "learning_rate": 1.299994950948261e-05, + "loss": 0.249, + "step": 21708 + }, + { + "epoch": 0.40304764167932183, + "grad_norm": 0.3205305337905884, + "learning_rate": 1.2998836726051252e-05, + "loss": 0.3076, + "step": 21710 + }, + { + "epoch": 0.4030847718167405, + "grad_norm": 0.26526880264282227, + "learning_rate": 1.2997723901814707e-05, + "loss": 0.236, + "step": 21712 + }, + { + "epoch": 0.40312190195415915, + "grad_norm": 0.3418671488761902, + "learning_rate": 1.2996611036788117e-05, + "loss": 0.2131, + "step": 21714 + }, + { + "epoch": 0.4031590320915778, + "grad_norm": 0.3721555769443512, + "learning_rate": 1.2995498130986627e-05, + "loss": 0.2716, + "step": 21716 + }, + { + "epoch": 0.4031961622289964, + "grad_norm": 0.28217655420303345, + "learning_rate": 1.299438518442538e-05, + "loss": 0.0978, + "step": 21718 + }, + { + "epoch": 0.40323329236641503, + "grad_norm": 0.25930649042129517, + "learning_rate": 1.2993272197119522e-05, + "loss": 0.256, + "step": 21720 + }, + { + "epoch": 0.4032704225038337, + "grad_norm": 0.2782888114452362, + "learning_rate": 1.2992159169084194e-05, + "loss": 0.5646, + "step": 21722 + }, + { + "epoch": 0.40330755264125234, + "grad_norm": 0.398754745721817, + "learning_rate": 1.2991046100334541e-05, + "loss": 0.3, + "step": 21724 + }, + { + "epoch": 0.40334468277867097, + "grad_norm": 0.3119527995586395, + "learning_rate": 1.298993299088571e-05, + "loss": 0.3481, + "step": 21726 + }, + { + "epoch": 0.4033818129160896, + "grad_norm": 0.353280633687973, + "learning_rate": 1.2988819840752849e-05, + "loss": 0.0875, + "step": 21728 + }, + { + "epoch": 0.4034189430535082, + "grad_norm": 0.49691230058670044, + "learning_rate": 1.29877066499511e-05, + "loss": 0.3868, + "step": 21730 + }, + { + "epoch": 0.40345607319092686, + "grad_norm": 0.4137439429759979, + "learning_rate": 1.2986593418495614e-05, + "loss": 0.3547, + "step": 21732 + }, + { + "epoch": 0.40349320332834554, + "grad_norm": 0.3058105707168579, + "learning_rate": 1.2985480146401537e-05, + "loss": 0.2253, + "step": 21734 + }, + { + "epoch": 0.40353033346576417, + "grad_norm": 0.2684827446937561, + "learning_rate": 1.2984366833684019e-05, + "loss": 0.2719, + "step": 21736 + }, + { + "epoch": 0.4035674636031828, + "grad_norm": 0.5444830060005188, + "learning_rate": 1.2983253480358205e-05, + "loss": 0.3607, + "step": 21738 + }, + { + "epoch": 0.4036045937406014, + "grad_norm": 0.5136457085609436, + "learning_rate": 1.2982140086439248e-05, + "loss": 0.1674, + "step": 21740 + }, + { + "epoch": 0.40364172387802005, + "grad_norm": 0.3209325671195984, + "learning_rate": 1.2981026651942299e-05, + "loss": 0.4467, + "step": 21742 + }, + { + "epoch": 0.40367885401543874, + "grad_norm": 0.2790270447731018, + "learning_rate": 1.2979913176882505e-05, + "loss": 0.1646, + "step": 21744 + }, + { + "epoch": 0.40371598415285737, + "grad_norm": 0.3251958191394806, + "learning_rate": 1.297879966127502e-05, + "loss": 0.2531, + "step": 21746 + }, + { + "epoch": 0.403753114290276, + "grad_norm": 0.47938621044158936, + "learning_rate": 1.2977686105134993e-05, + "loss": 0.215, + "step": 21748 + }, + { + "epoch": 0.4037902444276946, + "grad_norm": 0.38935449719429016, + "learning_rate": 1.2976572508477577e-05, + "loss": 0.1579, + "step": 21750 + }, + { + "epoch": 0.40382737456511325, + "grad_norm": 0.3315196633338928, + "learning_rate": 1.2975458871317925e-05, + "loss": 0.2163, + "step": 21752 + }, + { + "epoch": 0.4038645047025319, + "grad_norm": 0.5032125115394592, + "learning_rate": 1.2974345193671192e-05, + "loss": 0.188, + "step": 21754 + }, + { + "epoch": 0.40390163483995056, + "grad_norm": 0.37132251262664795, + "learning_rate": 1.2973231475552526e-05, + "loss": 0.1701, + "step": 21756 + }, + { + "epoch": 0.4039387649773692, + "grad_norm": 0.39295119047164917, + "learning_rate": 1.2972117716977088e-05, + "loss": 0.1931, + "step": 21758 + }, + { + "epoch": 0.4039758951147878, + "grad_norm": 0.4047181010246277, + "learning_rate": 1.2971003917960032e-05, + "loss": 0.4227, + "step": 21760 + }, + { + "epoch": 0.40401302525220645, + "grad_norm": 0.5436265468597412, + "learning_rate": 1.2969890078516509e-05, + "loss": 0.2296, + "step": 21762 + }, + { + "epoch": 0.4040501553896251, + "grad_norm": 0.3423088788986206, + "learning_rate": 1.296877619866168e-05, + "loss": 0.4142, + "step": 21764 + }, + { + "epoch": 0.40408728552704376, + "grad_norm": 0.4568958878517151, + "learning_rate": 1.29676622784107e-05, + "loss": 0.1749, + "step": 21766 + }, + { + "epoch": 0.4041244156644624, + "grad_norm": 0.36838066577911377, + "learning_rate": 1.2966548317778722e-05, + "loss": 0.2227, + "step": 21768 + }, + { + "epoch": 0.404161545801881, + "grad_norm": 0.44415345788002014, + "learning_rate": 1.296543431678091e-05, + "loss": 0.2351, + "step": 21770 + }, + { + "epoch": 0.40419867593929965, + "grad_norm": 0.32770678400993347, + "learning_rate": 1.2964320275432418e-05, + "loss": 0.41, + "step": 21772 + }, + { + "epoch": 0.4042358060767183, + "grad_norm": 0.33450430631637573, + "learning_rate": 1.296320619374841e-05, + "loss": 0.4396, + "step": 21774 + }, + { + "epoch": 0.4042729362141369, + "grad_norm": 0.7526260614395142, + "learning_rate": 1.2962092071744037e-05, + "loss": 0.17, + "step": 21776 + }, + { + "epoch": 0.4043100663515556, + "grad_norm": 0.3169478178024292, + "learning_rate": 1.2960977909434465e-05, + "loss": 0.2449, + "step": 21778 + }, + { + "epoch": 0.4043471964889742, + "grad_norm": 0.5039459466934204, + "learning_rate": 1.2959863706834852e-05, + "loss": 0.1448, + "step": 21780 + }, + { + "epoch": 0.40438432662639284, + "grad_norm": 0.2954540252685547, + "learning_rate": 1.2958749463960361e-05, + "loss": 0.1503, + "step": 21782 + }, + { + "epoch": 0.40442145676381147, + "grad_norm": 0.47662824392318726, + "learning_rate": 1.295763518082615e-05, + "loss": 0.3386, + "step": 21784 + }, + { + "epoch": 0.4044585869012301, + "grad_norm": 0.30216357111930847, + "learning_rate": 1.2956520857447389e-05, + "loss": 0.3492, + "step": 21786 + }, + { + "epoch": 0.4044957170386488, + "grad_norm": 0.39031192660331726, + "learning_rate": 1.295540649383923e-05, + "loss": 0.2154, + "step": 21788 + }, + { + "epoch": 0.4045328471760674, + "grad_norm": 0.32937124371528625, + "learning_rate": 1.295429209001684e-05, + "loss": 0.1658, + "step": 21790 + }, + { + "epoch": 0.40456997731348604, + "grad_norm": 0.34788423776626587, + "learning_rate": 1.2953177645995387e-05, + "loss": 0.3279, + "step": 21792 + }, + { + "epoch": 0.40460710745090467, + "grad_norm": 0.20182204246520996, + "learning_rate": 1.2952063161790032e-05, + "loss": 0.3672, + "step": 21794 + }, + { + "epoch": 0.4046442375883233, + "grad_norm": 0.45234495401382446, + "learning_rate": 1.2950948637415938e-05, + "loss": 0.267, + "step": 21796 + }, + { + "epoch": 0.404681367725742, + "grad_norm": 0.49033886194229126, + "learning_rate": 1.2949834072888275e-05, + "loss": 0.3331, + "step": 21798 + }, + { + "epoch": 0.4047184978631606, + "grad_norm": 0.5745718479156494, + "learning_rate": 1.2948719468222204e-05, + "loss": 0.2635, + "step": 21800 + }, + { + "epoch": 0.40475562800057924, + "grad_norm": 0.43127498030662537, + "learning_rate": 1.294760482343289e-05, + "loss": 0.3136, + "step": 21802 + }, + { + "epoch": 0.40479275813799787, + "grad_norm": 0.36452364921569824, + "learning_rate": 1.294649013853551e-05, + "loss": 0.1716, + "step": 21804 + }, + { + "epoch": 0.4048298882754165, + "grad_norm": 0.21352314949035645, + "learning_rate": 1.2945375413545222e-05, + "loss": 0.3057, + "step": 21806 + }, + { + "epoch": 0.4048670184128351, + "grad_norm": 0.5029972195625305, + "learning_rate": 1.2944260648477196e-05, + "loss": 0.2712, + "step": 21808 + }, + { + "epoch": 0.4049041485502538, + "grad_norm": 0.4876079261302948, + "learning_rate": 1.2943145843346605e-05, + "loss": 0.2478, + "step": 21810 + }, + { + "epoch": 0.40494127868767243, + "grad_norm": 0.37176385521888733, + "learning_rate": 1.294203099816861e-05, + "loss": 0.2913, + "step": 21812 + }, + { + "epoch": 0.40497840882509106, + "grad_norm": 0.4429399073123932, + "learning_rate": 1.2940916112958388e-05, + "loss": 0.3614, + "step": 21814 + }, + { + "epoch": 0.4050155389625097, + "grad_norm": 0.26407429575920105, + "learning_rate": 1.2939801187731108e-05, + "loss": 0.2098, + "step": 21816 + }, + { + "epoch": 0.4050526690999283, + "grad_norm": 0.3279687166213989, + "learning_rate": 1.2938686222501939e-05, + "loss": 0.3351, + "step": 21818 + }, + { + "epoch": 0.405089799237347, + "grad_norm": 0.37407180666923523, + "learning_rate": 1.2937571217286052e-05, + "loss": 0.1892, + "step": 21820 + }, + { + "epoch": 0.40512692937476563, + "grad_norm": 0.31828615069389343, + "learning_rate": 1.2936456172098616e-05, + "loss": 0.3469, + "step": 21822 + }, + { + "epoch": 0.40516405951218426, + "grad_norm": 0.34677034616470337, + "learning_rate": 1.2935341086954813e-05, + "loss": 0.2379, + "step": 21824 + }, + { + "epoch": 0.4052011896496029, + "grad_norm": 0.2904433608055115, + "learning_rate": 1.2934225961869809e-05, + "loss": 0.2955, + "step": 21826 + }, + { + "epoch": 0.4052383197870215, + "grad_norm": 0.2883409857749939, + "learning_rate": 1.2933110796858778e-05, + "loss": 0.0993, + "step": 21828 + }, + { + "epoch": 0.40527544992444015, + "grad_norm": 0.44709712266921997, + "learning_rate": 1.2931995591936897e-05, + "loss": 0.2151, + "step": 21830 + }, + { + "epoch": 0.40531258006185883, + "grad_norm": 0.36038461327552795, + "learning_rate": 1.2930880347119335e-05, + "loss": 0.2366, + "step": 21832 + }, + { + "epoch": 0.40534971019927746, + "grad_norm": 0.33324363827705383, + "learning_rate": 1.2929765062421268e-05, + "loss": 0.33, + "step": 21834 + }, + { + "epoch": 0.4053868403366961, + "grad_norm": 0.47948741912841797, + "learning_rate": 1.292864973785788e-05, + "loss": 0.4498, + "step": 21836 + }, + { + "epoch": 0.4054239704741147, + "grad_norm": 0.303437739610672, + "learning_rate": 1.2927534373444342e-05, + "loss": 0.3818, + "step": 21838 + }, + { + "epoch": 0.40546110061153334, + "grad_norm": 0.465300589799881, + "learning_rate": 1.2926418969195827e-05, + "loss": 0.1835, + "step": 21840 + }, + { + "epoch": 0.405498230748952, + "grad_norm": 0.45483431220054626, + "learning_rate": 1.2925303525127515e-05, + "loss": 0.2159, + "step": 21842 + }, + { + "epoch": 0.40553536088637065, + "grad_norm": 0.5572876334190369, + "learning_rate": 1.2924188041254583e-05, + "loss": 0.3726, + "step": 21844 + }, + { + "epoch": 0.4055724910237893, + "grad_norm": 0.309108167886734, + "learning_rate": 1.2923072517592212e-05, + "loss": 0.2518, + "step": 21846 + }, + { + "epoch": 0.4056096211612079, + "grad_norm": 0.2934266924858093, + "learning_rate": 1.2921956954155583e-05, + "loss": 0.1943, + "step": 21848 + }, + { + "epoch": 0.40564675129862654, + "grad_norm": 0.3023587465286255, + "learning_rate": 1.2920841350959869e-05, + "loss": 0.3293, + "step": 21850 + }, + { + "epoch": 0.40568388143604517, + "grad_norm": 0.8941232562065125, + "learning_rate": 1.2919725708020254e-05, + "loss": 0.2897, + "step": 21852 + }, + { + "epoch": 0.40572101157346385, + "grad_norm": 0.39852002263069153, + "learning_rate": 1.2918610025351919e-05, + "loss": 0.3711, + "step": 21854 + }, + { + "epoch": 0.4057581417108825, + "grad_norm": 0.39841628074645996, + "learning_rate": 1.2917494302970041e-05, + "loss": 0.2896, + "step": 21856 + }, + { + "epoch": 0.4057952718483011, + "grad_norm": 0.3771861493587494, + "learning_rate": 1.2916378540889804e-05, + "loss": 0.3221, + "step": 21858 + }, + { + "epoch": 0.40583240198571974, + "grad_norm": 0.3386521339416504, + "learning_rate": 1.2915262739126395e-05, + "loss": 0.4418, + "step": 21860 + }, + { + "epoch": 0.40586953212313837, + "grad_norm": 0.3377368748188019, + "learning_rate": 1.291414689769499e-05, + "loss": 0.184, + "step": 21862 + }, + { + "epoch": 0.40590666226055705, + "grad_norm": 0.2260040044784546, + "learning_rate": 1.2913031016610773e-05, + "loss": 0.1264, + "step": 21864 + }, + { + "epoch": 0.4059437923979757, + "grad_norm": 0.33107268810272217, + "learning_rate": 1.2911915095888929e-05, + "loss": 0.3793, + "step": 21866 + }, + { + "epoch": 0.4059809225353943, + "grad_norm": 0.6162815690040588, + "learning_rate": 1.2910799135544646e-05, + "loss": 0.2911, + "step": 21868 + }, + { + "epoch": 0.40601805267281293, + "grad_norm": 0.34075483679771423, + "learning_rate": 1.2909683135593105e-05, + "loss": 0.2123, + "step": 21870 + }, + { + "epoch": 0.40605518281023156, + "grad_norm": 0.6916275024414062, + "learning_rate": 1.290856709604949e-05, + "loss": 0.5729, + "step": 21872 + }, + { + "epoch": 0.40609231294765025, + "grad_norm": 0.35797375440597534, + "learning_rate": 1.2907451016928991e-05, + "loss": 0.2361, + "step": 21874 + }, + { + "epoch": 0.4061294430850689, + "grad_norm": 0.330986350774765, + "learning_rate": 1.290633489824679e-05, + "loss": 0.2909, + "step": 21876 + }, + { + "epoch": 0.4061665732224875, + "grad_norm": 0.4496910870075226, + "learning_rate": 1.2905218740018077e-05, + "loss": 0.3276, + "step": 21878 + }, + { + "epoch": 0.40620370335990613, + "grad_norm": 0.29114681482315063, + "learning_rate": 1.2904102542258041e-05, + "loss": 0.2864, + "step": 21880 + }, + { + "epoch": 0.40624083349732476, + "grad_norm": 0.26612186431884766, + "learning_rate": 1.290298630498187e-05, + "loss": 0.1163, + "step": 21882 + }, + { + "epoch": 0.4062779636347434, + "grad_norm": 0.32967260479927063, + "learning_rate": 1.2901870028204746e-05, + "loss": 0.141, + "step": 21884 + }, + { + "epoch": 0.4063150937721621, + "grad_norm": 0.45753490924835205, + "learning_rate": 1.2900753711941866e-05, + "loss": 0.1653, + "step": 21886 + }, + { + "epoch": 0.4063522239095807, + "grad_norm": 0.3833034634590149, + "learning_rate": 1.2899637356208417e-05, + "loss": 0.1963, + "step": 21888 + }, + { + "epoch": 0.40638935404699933, + "grad_norm": 0.30614739656448364, + "learning_rate": 1.2898520961019587e-05, + "loss": 0.3235, + "step": 21890 + }, + { + "epoch": 0.40642648418441796, + "grad_norm": 0.4653598964214325, + "learning_rate": 1.289740452639057e-05, + "loss": 0.3921, + "step": 21892 + }, + { + "epoch": 0.4064636143218366, + "grad_norm": 0.33255210518836975, + "learning_rate": 1.2896288052336556e-05, + "loss": 0.3772, + "step": 21894 + }, + { + "epoch": 0.40650074445925527, + "grad_norm": 0.34993740916252136, + "learning_rate": 1.2895171538872738e-05, + "loss": 0.1821, + "step": 21896 + }, + { + "epoch": 0.4065378745966739, + "grad_norm": 0.4239446520805359, + "learning_rate": 1.2894054986014305e-05, + "loss": 0.1997, + "step": 21898 + }, + { + "epoch": 0.4065750047340925, + "grad_norm": 0.23019862174987793, + "learning_rate": 1.2892938393776453e-05, + "loss": 0.4514, + "step": 21900 + }, + { + "epoch": 0.40661213487151115, + "grad_norm": 0.24404466152191162, + "learning_rate": 1.2891821762174376e-05, + "loss": 0.2215, + "step": 21902 + }, + { + "epoch": 0.4066492650089298, + "grad_norm": 0.281076580286026, + "learning_rate": 1.2890705091223265e-05, + "loss": 0.2858, + "step": 21904 + }, + { + "epoch": 0.4066863951463484, + "grad_norm": 0.3514172434806824, + "learning_rate": 1.2889588380938318e-05, + "loss": 0.1443, + "step": 21906 + }, + { + "epoch": 0.4067235252837671, + "grad_norm": 0.39859738945961, + "learning_rate": 1.288847163133473e-05, + "loss": 0.2289, + "step": 21908 + }, + { + "epoch": 0.4067606554211857, + "grad_norm": 1.7944791316986084, + "learning_rate": 1.288735484242769e-05, + "loss": 0.1953, + "step": 21910 + }, + { + "epoch": 0.40679778555860435, + "grad_norm": 0.3340701162815094, + "learning_rate": 1.2886238014232404e-05, + "loss": 0.3154, + "step": 21912 + }, + { + "epoch": 0.406834915696023, + "grad_norm": 0.4775649607181549, + "learning_rate": 1.2885121146764064e-05, + "loss": 0.2809, + "step": 21914 + }, + { + "epoch": 0.4068720458334416, + "grad_norm": 0.5364314913749695, + "learning_rate": 1.2884004240037863e-05, + "loss": 0.2985, + "step": 21916 + }, + { + "epoch": 0.4069091759708603, + "grad_norm": 0.41578948497772217, + "learning_rate": 1.2882887294069007e-05, + "loss": 0.3321, + "step": 21918 + }, + { + "epoch": 0.4069463061082789, + "grad_norm": 0.33452993631362915, + "learning_rate": 1.2881770308872687e-05, + "loss": 0.3258, + "step": 21920 + }, + { + "epoch": 0.40698343624569755, + "grad_norm": 0.4842405915260315, + "learning_rate": 1.288065328446411e-05, + "loss": 0.2687, + "step": 21922 + }, + { + "epoch": 0.4070205663831162, + "grad_norm": 0.4110822081565857, + "learning_rate": 1.2879536220858467e-05, + "loss": 0.2408, + "step": 21924 + }, + { + "epoch": 0.4070576965205348, + "grad_norm": 0.1997343897819519, + "learning_rate": 1.2878419118070959e-05, + "loss": 0.2086, + "step": 21926 + }, + { + "epoch": 0.40709482665795343, + "grad_norm": 0.3725113272666931, + "learning_rate": 1.287730197611679e-05, + "loss": 0.245, + "step": 21928 + }, + { + "epoch": 0.4071319567953721, + "grad_norm": 0.41921934485435486, + "learning_rate": 1.2876184795011162e-05, + "loss": 0.238, + "step": 21930 + }, + { + "epoch": 0.40716908693279075, + "grad_norm": 0.36538413166999817, + "learning_rate": 1.2875067574769271e-05, + "loss": 0.3131, + "step": 21932 + }, + { + "epoch": 0.4072062170702094, + "grad_norm": 0.4104001224040985, + "learning_rate": 1.2873950315406325e-05, + "loss": 0.3004, + "step": 21934 + }, + { + "epoch": 0.407243347207628, + "grad_norm": 0.31808042526245117, + "learning_rate": 1.2872833016937523e-05, + "loss": 0.4164, + "step": 21936 + }, + { + "epoch": 0.40728047734504663, + "grad_norm": 0.4039733111858368, + "learning_rate": 1.2871715679378068e-05, + "loss": 0.2781, + "step": 21938 + }, + { + "epoch": 0.4073176074824653, + "grad_norm": 0.352021723985672, + "learning_rate": 1.2870598302743163e-05, + "loss": 0.4339, + "step": 21940 + }, + { + "epoch": 0.40735473761988394, + "grad_norm": 0.2531554400920868, + "learning_rate": 1.2869480887048012e-05, + "loss": 0.3418, + "step": 21942 + }, + { + "epoch": 0.4073918677573026, + "grad_norm": 0.6839932203292847, + "learning_rate": 1.286836343230783e-05, + "loss": 0.381, + "step": 21944 + }, + { + "epoch": 0.4074289978947212, + "grad_norm": 0.3953968286514282, + "learning_rate": 1.2867245938537804e-05, + "loss": 0.2606, + "step": 21946 + }, + { + "epoch": 0.40746612803213983, + "grad_norm": 0.36634135246276855, + "learning_rate": 1.2866128405753153e-05, + "loss": 0.4976, + "step": 21948 + }, + { + "epoch": 0.4075032581695585, + "grad_norm": 0.9666898846626282, + "learning_rate": 1.286501083396908e-05, + "loss": 0.3888, + "step": 21950 + }, + { + "epoch": 0.40754038830697714, + "grad_norm": 0.37718817591667175, + "learning_rate": 1.286389322320079e-05, + "loss": 0.1564, + "step": 21952 + }, + { + "epoch": 0.40757751844439577, + "grad_norm": 0.28650951385498047, + "learning_rate": 1.2862775573463492e-05, + "loss": 0.1909, + "step": 21954 + }, + { + "epoch": 0.4076146485818144, + "grad_norm": 0.3960302174091339, + "learning_rate": 1.2861657884772397e-05, + "loss": 0.2756, + "step": 21956 + }, + { + "epoch": 0.407651778719233, + "grad_norm": 0.5276435017585754, + "learning_rate": 1.2860540157142705e-05, + "loss": 0.2076, + "step": 21958 + }, + { + "epoch": 0.40768890885665166, + "grad_norm": 0.32526475191116333, + "learning_rate": 1.2859422390589629e-05, + "loss": 0.3485, + "step": 21960 + }, + { + "epoch": 0.40772603899407034, + "grad_norm": 0.4247753620147705, + "learning_rate": 1.2858304585128382e-05, + "loss": 0.1784, + "step": 21962 + }, + { + "epoch": 0.40776316913148897, + "grad_norm": 0.4074791371822357, + "learning_rate": 1.285718674077417e-05, + "loss": 0.2451, + "step": 21964 + }, + { + "epoch": 0.4078002992689076, + "grad_norm": 1.5009312629699707, + "learning_rate": 1.2856068857542205e-05, + "loss": 0.4021, + "step": 21966 + }, + { + "epoch": 0.4078374294063262, + "grad_norm": 0.5415545105934143, + "learning_rate": 1.28549509354477e-05, + "loss": 0.3435, + "step": 21968 + }, + { + "epoch": 0.40787455954374485, + "grad_norm": 0.5598306059837341, + "learning_rate": 1.2853832974505863e-05, + "loss": 0.2738, + "step": 21970 + }, + { + "epoch": 0.40791168968116354, + "grad_norm": 0.42792728543281555, + "learning_rate": 1.2852714974731903e-05, + "loss": 0.2435, + "step": 21972 + }, + { + "epoch": 0.40794881981858216, + "grad_norm": 0.6369269490242004, + "learning_rate": 1.2851596936141042e-05, + "loss": 0.3702, + "step": 21974 + }, + { + "epoch": 0.4079859499560008, + "grad_norm": 0.34136971831321716, + "learning_rate": 1.285047885874849e-05, + "loss": 0.133, + "step": 21976 + }, + { + "epoch": 0.4080230800934194, + "grad_norm": 0.3024825155735016, + "learning_rate": 1.2849360742569455e-05, + "loss": 0.2843, + "step": 21978 + }, + { + "epoch": 0.40806021023083805, + "grad_norm": 0.26109573245048523, + "learning_rate": 1.2848242587619154e-05, + "loss": 0.1244, + "step": 21980 + }, + { + "epoch": 0.4080973403682567, + "grad_norm": 0.34682828187942505, + "learning_rate": 1.2847124393912806e-05, + "loss": 0.4284, + "step": 21982 + }, + { + "epoch": 0.40813447050567536, + "grad_norm": 0.3292953372001648, + "learning_rate": 1.284600616146562e-05, + "loss": 0.2863, + "step": 21984 + }, + { + "epoch": 0.408171600643094, + "grad_norm": 0.4602740406990051, + "learning_rate": 1.2844887890292821e-05, + "loss": 0.335, + "step": 21986 + }, + { + "epoch": 0.4082087307805126, + "grad_norm": 0.3390699028968811, + "learning_rate": 1.2843769580409616e-05, + "loss": 0.2223, + "step": 21988 + }, + { + "epoch": 0.40824586091793125, + "grad_norm": 0.2238067388534546, + "learning_rate": 1.2842651231831223e-05, + "loss": 0.2637, + "step": 21990 + }, + { + "epoch": 0.4082829910553499, + "grad_norm": 0.3772287964820862, + "learning_rate": 1.2841532844572862e-05, + "loss": 0.184, + "step": 21992 + }, + { + "epoch": 0.40832012119276856, + "grad_norm": 0.4175773859024048, + "learning_rate": 1.284041441864975e-05, + "loss": 0.2449, + "step": 21994 + }, + { + "epoch": 0.4083572513301872, + "grad_norm": 0.30667364597320557, + "learning_rate": 1.2839295954077108e-05, + "loss": 0.2868, + "step": 21996 + }, + { + "epoch": 0.4083943814676058, + "grad_norm": 0.2907668650150299, + "learning_rate": 1.2838177450870153e-05, + "loss": 0.328, + "step": 21998 + }, + { + "epoch": 0.40843151160502444, + "grad_norm": 0.3119083046913147, + "learning_rate": 1.2837058909044102e-05, + "loss": 0.2762, + "step": 22000 + }, + { + "epoch": 0.4084686417424431, + "grad_norm": 0.42587146162986755, + "learning_rate": 1.2835940328614178e-05, + "loss": 0.3885, + "step": 22002 + }, + { + "epoch": 0.4085057718798617, + "grad_norm": 0.3464655876159668, + "learning_rate": 1.2834821709595599e-05, + "loss": 0.1904, + "step": 22004 + }, + { + "epoch": 0.4085429020172804, + "grad_norm": 0.23605437576770782, + "learning_rate": 1.2833703052003592e-05, + "loss": 0.2812, + "step": 22006 + }, + { + "epoch": 0.408580032154699, + "grad_norm": 0.5717463493347168, + "learning_rate": 1.2832584355853372e-05, + "loss": 0.2866, + "step": 22008 + }, + { + "epoch": 0.40861716229211764, + "grad_norm": 0.47225040197372437, + "learning_rate": 1.2831465621160163e-05, + "loss": 0.2274, + "step": 22010 + }, + { + "epoch": 0.40865429242953627, + "grad_norm": 0.4087161123752594, + "learning_rate": 1.283034684793919e-05, + "loss": 0.27, + "step": 22012 + }, + { + "epoch": 0.4086914225669549, + "grad_norm": 0.43267086148262024, + "learning_rate": 1.2829228036205672e-05, + "loss": 0.2401, + "step": 22014 + }, + { + "epoch": 0.4087285527043736, + "grad_norm": 0.3372718393802643, + "learning_rate": 1.2828109185974835e-05, + "loss": 0.2642, + "step": 22016 + }, + { + "epoch": 0.4087656828417922, + "grad_norm": 0.29518356919288635, + "learning_rate": 1.2826990297261905e-05, + "loss": 0.4242, + "step": 22018 + }, + { + "epoch": 0.40880281297921084, + "grad_norm": 0.40041399002075195, + "learning_rate": 1.2825871370082106e-05, + "loss": 0.3102, + "step": 22020 + }, + { + "epoch": 0.40883994311662947, + "grad_norm": 0.3825591802597046, + "learning_rate": 1.2824752404450658e-05, + "loss": 0.2562, + "step": 22022 + }, + { + "epoch": 0.4088770732540481, + "grad_norm": 0.3004355728626251, + "learning_rate": 1.2823633400382792e-05, + "loss": 0.1737, + "step": 22024 + }, + { + "epoch": 0.4089142033914668, + "grad_norm": 0.3956766128540039, + "learning_rate": 1.2822514357893736e-05, + "loss": 0.5191, + "step": 22026 + }, + { + "epoch": 0.4089513335288854, + "grad_norm": 0.7151725888252258, + "learning_rate": 1.2821395276998714e-05, + "loss": 0.3229, + "step": 22028 + }, + { + "epoch": 0.40898846366630404, + "grad_norm": 0.2683252990245819, + "learning_rate": 1.282027615771295e-05, + "loss": 0.3193, + "step": 22030 + }, + { + "epoch": 0.40902559380372266, + "grad_norm": 0.30411496758461, + "learning_rate": 1.2819157000051676e-05, + "loss": 0.3319, + "step": 22032 + }, + { + "epoch": 0.4090627239411413, + "grad_norm": 0.3963678777217865, + "learning_rate": 1.281803780403012e-05, + "loss": 0.3166, + "step": 22034 + }, + { + "epoch": 0.4090998540785599, + "grad_norm": 0.35781484842300415, + "learning_rate": 1.281691856966351e-05, + "loss": 0.1533, + "step": 22036 + }, + { + "epoch": 0.4091369842159786, + "grad_norm": 0.3196694850921631, + "learning_rate": 1.2815799296967077e-05, + "loss": 0.2475, + "step": 22038 + }, + { + "epoch": 0.40917411435339723, + "grad_norm": 0.4347062408924103, + "learning_rate": 1.2814679985956051e-05, + "loss": 0.227, + "step": 22040 + }, + { + "epoch": 0.40921124449081586, + "grad_norm": 0.42302682995796204, + "learning_rate": 1.2813560636645658e-05, + "loss": 0.4828, + "step": 22042 + }, + { + "epoch": 0.4092483746282345, + "grad_norm": 0.2995123565196991, + "learning_rate": 1.2812441249051136e-05, + "loss": 0.1105, + "step": 22044 + }, + { + "epoch": 0.4092855047656531, + "grad_norm": 0.5302488803863525, + "learning_rate": 1.281132182318771e-05, + "loss": 0.3285, + "step": 22046 + }, + { + "epoch": 0.4093226349030718, + "grad_norm": 0.495243638753891, + "learning_rate": 1.2810202359070614e-05, + "loss": 0.4314, + "step": 22048 + }, + { + "epoch": 0.40935976504049043, + "grad_norm": 0.46075671911239624, + "learning_rate": 1.2809082856715084e-05, + "loss": 0.2619, + "step": 22050 + }, + { + "epoch": 0.40939689517790906, + "grad_norm": 0.36969444155693054, + "learning_rate": 1.2807963316136354e-05, + "loss": 0.3702, + "step": 22052 + }, + { + "epoch": 0.4094340253153277, + "grad_norm": 0.4912140667438507, + "learning_rate": 1.2806843737349648e-05, + "loss": 0.2102, + "step": 22054 + }, + { + "epoch": 0.4094711554527463, + "grad_norm": 0.3260190486907959, + "learning_rate": 1.2805724120370208e-05, + "loss": 0.169, + "step": 22056 + }, + { + "epoch": 0.40950828559016494, + "grad_norm": 0.31940963864326477, + "learning_rate": 1.2804604465213267e-05, + "loss": 0.3333, + "step": 22058 + }, + { + "epoch": 0.40954541572758363, + "grad_norm": 0.28856950998306274, + "learning_rate": 1.2803484771894059e-05, + "loss": 0.2899, + "step": 22060 + }, + { + "epoch": 0.40958254586500226, + "grad_norm": 0.4783872365951538, + "learning_rate": 1.2802365040427824e-05, + "loss": 0.3011, + "step": 22062 + }, + { + "epoch": 0.4096196760024209, + "grad_norm": 0.713039755821228, + "learning_rate": 1.2801245270829791e-05, + "loss": 0.3008, + "step": 22064 + }, + { + "epoch": 0.4096568061398395, + "grad_norm": 0.3580036461353302, + "learning_rate": 1.2800125463115201e-05, + "loss": 0.3416, + "step": 22066 + }, + { + "epoch": 0.40969393627725814, + "grad_norm": 0.34160560369491577, + "learning_rate": 1.2799005617299292e-05, + "loss": 0.395, + "step": 22068 + }, + { + "epoch": 0.4097310664146768, + "grad_norm": 0.21054638922214508, + "learning_rate": 1.27978857333973e-05, + "loss": 0.1851, + "step": 22070 + }, + { + "epoch": 0.40976819655209545, + "grad_norm": 0.36528873443603516, + "learning_rate": 1.2796765811424465e-05, + "loss": 0.4689, + "step": 22072 + }, + { + "epoch": 0.4098053266895141, + "grad_norm": 0.42593032121658325, + "learning_rate": 1.2795645851396024e-05, + "loss": 0.1817, + "step": 22074 + }, + { + "epoch": 0.4098424568269327, + "grad_norm": 1.1457351446151733, + "learning_rate": 1.2794525853327218e-05, + "loss": 0.3701, + "step": 22076 + }, + { + "epoch": 0.40987958696435134, + "grad_norm": 0.3434610366821289, + "learning_rate": 1.2793405817233282e-05, + "loss": 0.2395, + "step": 22078 + }, + { + "epoch": 0.40991671710176997, + "grad_norm": 0.9202415347099304, + "learning_rate": 1.2792285743129462e-05, + "loss": 0.1899, + "step": 22080 + }, + { + "epoch": 0.40995384723918865, + "grad_norm": 0.32737934589385986, + "learning_rate": 1.2791165631030999e-05, + "loss": 0.1559, + "step": 22082 + }, + { + "epoch": 0.4099909773766073, + "grad_norm": 0.5351212620735168, + "learning_rate": 1.279004548095313e-05, + "loss": 0.2897, + "step": 22084 + }, + { + "epoch": 0.4100281075140259, + "grad_norm": 0.41636592149734497, + "learning_rate": 1.27889252929111e-05, + "loss": 0.1397, + "step": 22086 + }, + { + "epoch": 0.41006523765144454, + "grad_norm": 0.3152400553226471, + "learning_rate": 1.2787805066920152e-05, + "loss": 0.1116, + "step": 22088 + }, + { + "epoch": 0.41010236778886316, + "grad_norm": 0.3018081784248352, + "learning_rate": 1.2786684802995523e-05, + "loss": 0.2394, + "step": 22090 + }, + { + "epoch": 0.41013949792628185, + "grad_norm": 0.38363754749298096, + "learning_rate": 1.2785564501152466e-05, + "loss": 0.4681, + "step": 22092 + }, + { + "epoch": 0.4101766280637005, + "grad_norm": 0.42933177947998047, + "learning_rate": 1.2784444161406217e-05, + "loss": 0.215, + "step": 22094 + }, + { + "epoch": 0.4102137582011191, + "grad_norm": 0.3766684830188751, + "learning_rate": 1.2783323783772024e-05, + "loss": 0.3584, + "step": 22096 + }, + { + "epoch": 0.41025088833853773, + "grad_norm": 0.42244377732276917, + "learning_rate": 1.2782203368265132e-05, + "loss": 0.1763, + "step": 22098 + }, + { + "epoch": 0.41028801847595636, + "grad_norm": 0.5051140785217285, + "learning_rate": 1.2781082914900784e-05, + "loss": 0.3556, + "step": 22100 + }, + { + "epoch": 0.410325148613375, + "grad_norm": 0.8686238527297974, + "learning_rate": 1.2779962423694228e-05, + "loss": 0.3756, + "step": 22102 + }, + { + "epoch": 0.4103622787507937, + "grad_norm": 0.2839677333831787, + "learning_rate": 1.2778841894660711e-05, + "loss": 0.3638, + "step": 22104 + }, + { + "epoch": 0.4103994088882123, + "grad_norm": 0.3259103298187256, + "learning_rate": 1.277772132781548e-05, + "loss": 0.2873, + "step": 22106 + }, + { + "epoch": 0.41043653902563093, + "grad_norm": 0.48589566349983215, + "learning_rate": 1.2776600723173781e-05, + "loss": 0.2614, + "step": 22108 + }, + { + "epoch": 0.41047366916304956, + "grad_norm": 0.3626291751861572, + "learning_rate": 1.2775480080750865e-05, + "loss": 0.1987, + "step": 22110 + }, + { + "epoch": 0.4105107993004682, + "grad_norm": 0.3636896312236786, + "learning_rate": 1.2774359400561978e-05, + "loss": 0.2602, + "step": 22112 + }, + { + "epoch": 0.41054792943788687, + "grad_norm": 0.27297523617744446, + "learning_rate": 1.2773238682622369e-05, + "loss": 0.2658, + "step": 22114 + }, + { + "epoch": 0.4105850595753055, + "grad_norm": 0.5583170652389526, + "learning_rate": 1.2772117926947288e-05, + "loss": 0.2219, + "step": 22116 + }, + { + "epoch": 0.41062218971272413, + "grad_norm": 0.24498596787452698, + "learning_rate": 1.2770997133551985e-05, + "loss": 0.1947, + "step": 22118 + }, + { + "epoch": 0.41065931985014276, + "grad_norm": 0.40482643246650696, + "learning_rate": 1.2769876302451713e-05, + "loss": 0.1495, + "step": 22120 + }, + { + "epoch": 0.4106964499875614, + "grad_norm": 0.4663492441177368, + "learning_rate": 1.276875543366172e-05, + "loss": 0.1769, + "step": 22122 + }, + { + "epoch": 0.41073358012498007, + "grad_norm": 0.33320194482803345, + "learning_rate": 1.276763452719726e-05, + "loss": 0.2146, + "step": 22124 + }, + { + "epoch": 0.4107707102623987, + "grad_norm": 0.2746712267398834, + "learning_rate": 1.2766513583073584e-05, + "loss": 0.3031, + "step": 22126 + }, + { + "epoch": 0.4108078403998173, + "grad_norm": 0.38440099358558655, + "learning_rate": 1.2765392601305941e-05, + "loss": 0.2659, + "step": 22128 + }, + { + "epoch": 0.41084497053723595, + "grad_norm": 0.3118140995502472, + "learning_rate": 1.2764271581909591e-05, + "loss": 0.4069, + "step": 22130 + }, + { + "epoch": 0.4108821006746546, + "grad_norm": 0.34607967734336853, + "learning_rate": 1.2763150524899785e-05, + "loss": 0.2222, + "step": 22132 + }, + { + "epoch": 0.4109192308120732, + "grad_norm": 0.39724650979042053, + "learning_rate": 1.2762029430291775e-05, + "loss": 0.2466, + "step": 22134 + }, + { + "epoch": 0.4109563609494919, + "grad_norm": 0.48744913935661316, + "learning_rate": 1.2760908298100822e-05, + "loss": 0.5332, + "step": 22136 + }, + { + "epoch": 0.4109934910869105, + "grad_norm": 0.41839271783828735, + "learning_rate": 1.2759787128342175e-05, + "loss": 0.2966, + "step": 22138 + }, + { + "epoch": 0.41103062122432915, + "grad_norm": 0.3997511565685272, + "learning_rate": 1.275866592103109e-05, + "loss": 0.4353, + "step": 22140 + }, + { + "epoch": 0.4110677513617478, + "grad_norm": 0.3391873240470886, + "learning_rate": 1.2757544676182825e-05, + "loss": 0.1147, + "step": 22142 + }, + { + "epoch": 0.4111048814991664, + "grad_norm": 0.3180474042892456, + "learning_rate": 1.2756423393812641e-05, + "loss": 0.3028, + "step": 22144 + }, + { + "epoch": 0.4111420116365851, + "grad_norm": 0.4800836741924286, + "learning_rate": 1.2755302073935787e-05, + "loss": 0.249, + "step": 22146 + }, + { + "epoch": 0.4111791417740037, + "grad_norm": 0.30945470929145813, + "learning_rate": 1.2754180716567526e-05, + "loss": 0.2923, + "step": 22148 + }, + { + "epoch": 0.41121627191142235, + "grad_norm": 0.460316002368927, + "learning_rate": 1.2753059321723113e-05, + "loss": 0.2798, + "step": 22150 + }, + { + "epoch": 0.411253402048841, + "grad_norm": 0.37005358934402466, + "learning_rate": 1.2751937889417812e-05, + "loss": 0.4806, + "step": 22152 + }, + { + "epoch": 0.4112905321862596, + "grad_norm": 0.47213301062583923, + "learning_rate": 1.2750816419666875e-05, + "loss": 0.4558, + "step": 22154 + }, + { + "epoch": 0.41132766232367823, + "grad_norm": 0.4230639934539795, + "learning_rate": 1.2749694912485573e-05, + "loss": 0.3416, + "step": 22156 + }, + { + "epoch": 0.4113647924610969, + "grad_norm": 0.6258881092071533, + "learning_rate": 1.2748573367889157e-05, + "loss": 0.2694, + "step": 22158 + }, + { + "epoch": 0.41140192259851555, + "grad_norm": 0.4091075360774994, + "learning_rate": 1.2747451785892888e-05, + "loss": 0.2415, + "step": 22160 + }, + { + "epoch": 0.4114390527359342, + "grad_norm": 0.43477752804756165, + "learning_rate": 1.274633016651203e-05, + "loss": 0.3638, + "step": 22162 + }, + { + "epoch": 0.4114761828733528, + "grad_norm": 0.43976837396621704, + "learning_rate": 1.2745208509761846e-05, + "loss": 0.472, + "step": 22164 + }, + { + "epoch": 0.41151331301077143, + "grad_norm": 0.3615671694278717, + "learning_rate": 1.2744086815657599e-05, + "loss": 0.2014, + "step": 22166 + }, + { + "epoch": 0.4115504431481901, + "grad_norm": 0.3114439845085144, + "learning_rate": 1.2742965084214547e-05, + "loss": 0.4001, + "step": 22168 + }, + { + "epoch": 0.41158757328560874, + "grad_norm": 0.29664692282676697, + "learning_rate": 1.2741843315447958e-05, + "loss": 0.3341, + "step": 22170 + }, + { + "epoch": 0.41162470342302737, + "grad_norm": 0.2777436375617981, + "learning_rate": 1.274072150937309e-05, + "loss": 0.305, + "step": 22172 + }, + { + "epoch": 0.411661833560446, + "grad_norm": 0.48372167348861694, + "learning_rate": 1.2739599666005216e-05, + "loss": 0.2438, + "step": 22174 + }, + { + "epoch": 0.41169896369786463, + "grad_norm": 0.3963071405887604, + "learning_rate": 1.2738477785359595e-05, + "loss": 0.2792, + "step": 22176 + }, + { + "epoch": 0.41173609383528326, + "grad_norm": 0.2918238341808319, + "learning_rate": 1.2737355867451498e-05, + "loss": 0.3107, + "step": 22178 + }, + { + "epoch": 0.41177322397270194, + "grad_norm": 0.4556460678577423, + "learning_rate": 1.273623391229618e-05, + "loss": 0.4217, + "step": 22180 + }, + { + "epoch": 0.41181035411012057, + "grad_norm": 0.321087121963501, + "learning_rate": 1.273511191990892e-05, + "loss": 0.2012, + "step": 22182 + }, + { + "epoch": 0.4118474842475392, + "grad_norm": 0.33717501163482666, + "learning_rate": 1.2733989890304976e-05, + "loss": 0.4066, + "step": 22184 + }, + { + "epoch": 0.4118846143849578, + "grad_norm": 0.4401673376560211, + "learning_rate": 1.273286782349962e-05, + "loss": 0.2485, + "step": 22186 + }, + { + "epoch": 0.41192174452237645, + "grad_norm": 0.28662893176078796, + "learning_rate": 1.2731745719508119e-05, + "loss": 0.429, + "step": 22188 + }, + { + "epoch": 0.41195887465979514, + "grad_norm": 0.5214123129844666, + "learning_rate": 1.2730623578345742e-05, + "loss": 0.3119, + "step": 22190 + }, + { + "epoch": 0.41199600479721377, + "grad_norm": 0.2936370372772217, + "learning_rate": 1.2729501400027757e-05, + "loss": 0.2931, + "step": 22192 + }, + { + "epoch": 0.4120331349346324, + "grad_norm": 0.32686617970466614, + "learning_rate": 1.272837918456943e-05, + "loss": 0.2659, + "step": 22194 + }, + { + "epoch": 0.412070265072051, + "grad_norm": 0.3903789520263672, + "learning_rate": 1.2727256931986041e-05, + "loss": 0.4379, + "step": 22196 + }, + { + "epoch": 0.41210739520946965, + "grad_norm": 0.3727833032608032, + "learning_rate": 1.272613464229285e-05, + "loss": 0.367, + "step": 22198 + }, + { + "epoch": 0.41214452534688834, + "grad_norm": 0.608617901802063, + "learning_rate": 1.2725012315505135e-05, + "loss": 0.4806, + "step": 22200 + }, + { + "epoch": 0.41218165548430696, + "grad_norm": 0.3271966874599457, + "learning_rate": 1.2723889951638164e-05, + "loss": 0.2447, + "step": 22202 + }, + { + "epoch": 0.4122187856217256, + "grad_norm": 0.38452741503715515, + "learning_rate": 1.272276755070721e-05, + "loss": 0.2558, + "step": 22204 + }, + { + "epoch": 0.4122559157591442, + "grad_norm": 0.31904590129852295, + "learning_rate": 1.272164511272754e-05, + "loss": 0.4097, + "step": 22206 + }, + { + "epoch": 0.41229304589656285, + "grad_norm": 0.38718876242637634, + "learning_rate": 1.2720522637714438e-05, + "loss": 0.282, + "step": 22208 + }, + { + "epoch": 0.4123301760339815, + "grad_norm": 0.23350363969802856, + "learning_rate": 1.2719400125683173e-05, + "loss": 0.2003, + "step": 22210 + }, + { + "epoch": 0.41236730617140016, + "grad_norm": 0.3466246724128723, + "learning_rate": 1.2718277576649013e-05, + "loss": 0.1488, + "step": 22212 + }, + { + "epoch": 0.4124044363088188, + "grad_norm": 0.3530449867248535, + "learning_rate": 1.2717154990627241e-05, + "loss": 0.3411, + "step": 22214 + }, + { + "epoch": 0.4124415664462374, + "grad_norm": 0.5673527121543884, + "learning_rate": 1.2716032367633127e-05, + "loss": 0.2394, + "step": 22216 + }, + { + "epoch": 0.41247869658365605, + "grad_norm": 0.48155415058135986, + "learning_rate": 1.2714909707681947e-05, + "loss": 0.2724, + "step": 22218 + }, + { + "epoch": 0.4125158267210747, + "grad_norm": 0.4941686689853668, + "learning_rate": 1.271378701078898e-05, + "loss": 0.2185, + "step": 22220 + }, + { + "epoch": 0.41255295685849336, + "grad_norm": 0.45109835267066956, + "learning_rate": 1.27126642769695e-05, + "loss": 0.1631, + "step": 22222 + }, + { + "epoch": 0.412590086995912, + "grad_norm": 0.3319634795188904, + "learning_rate": 1.2711541506238783e-05, + "loss": 0.3316, + "step": 22224 + }, + { + "epoch": 0.4126272171333306, + "grad_norm": 0.2841742932796478, + "learning_rate": 1.2710418698612111e-05, + "loss": 0.2583, + "step": 22226 + }, + { + "epoch": 0.41266434727074924, + "grad_norm": 0.3774219751358032, + "learning_rate": 1.2709295854104754e-05, + "loss": 0.254, + "step": 22228 + }, + { + "epoch": 0.41270147740816787, + "grad_norm": 0.36438465118408203, + "learning_rate": 1.2708172972732e-05, + "loss": 0.3751, + "step": 22230 + }, + { + "epoch": 0.4127386075455865, + "grad_norm": 0.4523380696773529, + "learning_rate": 1.2707050054509122e-05, + "loss": 0.218, + "step": 22232 + }, + { + "epoch": 0.4127757376830052, + "grad_norm": 0.2547382712364197, + "learning_rate": 1.2705927099451402e-05, + "loss": 0.3666, + "step": 22234 + }, + { + "epoch": 0.4128128678204238, + "grad_norm": 0.494395911693573, + "learning_rate": 1.2704804107574118e-05, + "loss": 0.3002, + "step": 22236 + }, + { + "epoch": 0.41284999795784244, + "grad_norm": 0.4235498905181885, + "learning_rate": 1.2703681078892554e-05, + "loss": 0.1947, + "step": 22238 + }, + { + "epoch": 0.41288712809526107, + "grad_norm": 0.20970535278320312, + "learning_rate": 1.2702558013421989e-05, + "loss": 0.2835, + "step": 22240 + }, + { + "epoch": 0.4129242582326797, + "grad_norm": 0.4145757555961609, + "learning_rate": 1.2701434911177705e-05, + "loss": 0.2779, + "step": 22242 + }, + { + "epoch": 0.4129613883700984, + "grad_norm": 0.3728560209274292, + "learning_rate": 1.270031177217498e-05, + "loss": 0.3807, + "step": 22244 + }, + { + "epoch": 0.412998518507517, + "grad_norm": 0.37832915782928467, + "learning_rate": 1.2699188596429104e-05, + "loss": 0.4804, + "step": 22246 + }, + { + "epoch": 0.41303564864493564, + "grad_norm": 0.3881000578403473, + "learning_rate": 1.2698065383955353e-05, + "loss": 0.1524, + "step": 22248 + }, + { + "epoch": 0.41307277878235427, + "grad_norm": 0.45861899852752686, + "learning_rate": 1.2696942134769019e-05, + "loss": 0.2758, + "step": 22250 + }, + { + "epoch": 0.4131099089197729, + "grad_norm": 0.3362489640712738, + "learning_rate": 1.2695818848885377e-05, + "loss": 0.2918, + "step": 22252 + }, + { + "epoch": 0.4131470390571915, + "grad_norm": 0.32611092925071716, + "learning_rate": 1.2694695526319714e-05, + "loss": 0.5259, + "step": 22254 + }, + { + "epoch": 0.4131841691946102, + "grad_norm": 0.3833596110343933, + "learning_rate": 1.2693572167087318e-05, + "loss": 0.4025, + "step": 22256 + }, + { + "epoch": 0.41322129933202884, + "grad_norm": 0.5344322323799133, + "learning_rate": 1.2692448771203473e-05, + "loss": 0.3344, + "step": 22258 + }, + { + "epoch": 0.41325842946944746, + "grad_norm": 0.3957487642765045, + "learning_rate": 1.2691325338683464e-05, + "loss": 0.1913, + "step": 22260 + }, + { + "epoch": 0.4132955596068661, + "grad_norm": 0.41014614701271057, + "learning_rate": 1.2690201869542582e-05, + "loss": 0.2992, + "step": 22262 + }, + { + "epoch": 0.4133326897442847, + "grad_norm": 0.34932592511177063, + "learning_rate": 1.268907836379611e-05, + "loss": 0.4096, + "step": 22264 + }, + { + "epoch": 0.4133698198817034, + "grad_norm": 0.4497375190258026, + "learning_rate": 1.2687954821459334e-05, + "loss": 0.4504, + "step": 22266 + }, + { + "epoch": 0.41340695001912203, + "grad_norm": 0.38725030422210693, + "learning_rate": 1.2686831242547544e-05, + "loss": 0.4117, + "step": 22268 + }, + { + "epoch": 0.41344408015654066, + "grad_norm": 0.31301382184028625, + "learning_rate": 1.268570762707603e-05, + "loss": 0.4188, + "step": 22270 + }, + { + "epoch": 0.4134812102939593, + "grad_norm": 0.39095038175582886, + "learning_rate": 1.2684583975060079e-05, + "loss": 0.153, + "step": 22272 + }, + { + "epoch": 0.4135183404313779, + "grad_norm": 0.27216750383377075, + "learning_rate": 1.2683460286514982e-05, + "loss": 0.3461, + "step": 22274 + }, + { + "epoch": 0.4135554705687966, + "grad_norm": 0.3232177197933197, + "learning_rate": 1.2682336561456029e-05, + "loss": 0.13, + "step": 22276 + }, + { + "epoch": 0.41359260070621523, + "grad_norm": 0.6255667209625244, + "learning_rate": 1.2681212799898507e-05, + "loss": 0.2016, + "step": 22278 + }, + { + "epoch": 0.41362973084363386, + "grad_norm": 0.5035505890846252, + "learning_rate": 1.2680089001857712e-05, + "loss": 0.329, + "step": 22280 + }, + { + "epoch": 0.4136668609810525, + "grad_norm": 0.5952460765838623, + "learning_rate": 1.2678965167348936e-05, + "loss": 0.1998, + "step": 22282 + }, + { + "epoch": 0.4137039911184711, + "grad_norm": 0.3756662905216217, + "learning_rate": 1.2677841296387465e-05, + "loss": 0.2137, + "step": 22284 + }, + { + "epoch": 0.41374112125588974, + "grad_norm": 0.5828529000282288, + "learning_rate": 1.2676717388988595e-05, + "loss": 0.1433, + "step": 22286 + }, + { + "epoch": 0.4137782513933084, + "grad_norm": 0.42227739095687866, + "learning_rate": 1.267559344516762e-05, + "loss": 0.3524, + "step": 22288 + }, + { + "epoch": 0.41381538153072706, + "grad_norm": 0.4134095013141632, + "learning_rate": 1.2674469464939834e-05, + "loss": 0.3187, + "step": 22290 + }, + { + "epoch": 0.4138525116681457, + "grad_norm": 0.8603450655937195, + "learning_rate": 1.2673345448320527e-05, + "loss": 0.2461, + "step": 22292 + }, + { + "epoch": 0.4138896418055643, + "grad_norm": 0.43120262026786804, + "learning_rate": 1.2672221395324997e-05, + "loss": 0.5413, + "step": 22294 + }, + { + "epoch": 0.41392677194298294, + "grad_norm": 0.36243775486946106, + "learning_rate": 1.2671097305968541e-05, + "loss": 0.2902, + "step": 22296 + }, + { + "epoch": 0.4139639020804016, + "grad_norm": 0.35519498586654663, + "learning_rate": 1.2669973180266446e-05, + "loss": 0.3629, + "step": 22298 + }, + { + "epoch": 0.41400103221782025, + "grad_norm": 0.3112615644931793, + "learning_rate": 1.2668849018234018e-05, + "loss": 0.1534, + "step": 22300 + }, + { + "epoch": 0.4140381623552389, + "grad_norm": 0.4090019762516022, + "learning_rate": 1.2667724819886547e-05, + "loss": 0.3092, + "step": 22302 + }, + { + "epoch": 0.4140752924926575, + "grad_norm": 0.3079445958137512, + "learning_rate": 1.2666600585239332e-05, + "loss": 0.2301, + "step": 22304 + }, + { + "epoch": 0.41411242263007614, + "grad_norm": 0.3032417595386505, + "learning_rate": 1.2665476314307669e-05, + "loss": 0.3444, + "step": 22306 + }, + { + "epoch": 0.41414955276749477, + "grad_norm": 0.38366255164146423, + "learning_rate": 1.2664352007106861e-05, + "loss": 0.3142, + "step": 22308 + }, + { + "epoch": 0.41418668290491345, + "grad_norm": 0.3512505888938904, + "learning_rate": 1.26632276636522e-05, + "loss": 0.3726, + "step": 22310 + }, + { + "epoch": 0.4142238130423321, + "grad_norm": 0.4796908497810364, + "learning_rate": 1.2662103283958988e-05, + "loss": 0.388, + "step": 22312 + }, + { + "epoch": 0.4142609431797507, + "grad_norm": 0.34870582818984985, + "learning_rate": 1.2660978868042527e-05, + "loss": 0.3169, + "step": 22314 + }, + { + "epoch": 0.41429807331716934, + "grad_norm": 0.32598769664764404, + "learning_rate": 1.2659854415918113e-05, + "loss": 0.2753, + "step": 22316 + }, + { + "epoch": 0.41433520345458796, + "grad_norm": 0.32522955536842346, + "learning_rate": 1.2658729927601046e-05, + "loss": 0.257, + "step": 22318 + }, + { + "epoch": 0.41437233359200665, + "grad_norm": 0.358354777097702, + "learning_rate": 1.2657605403106628e-05, + "loss": 0.2733, + "step": 22320 + }, + { + "epoch": 0.4144094637294253, + "grad_norm": 0.5097821950912476, + "learning_rate": 1.2656480842450162e-05, + "loss": 0.2829, + "step": 22322 + }, + { + "epoch": 0.4144465938668439, + "grad_norm": 0.33971095085144043, + "learning_rate": 1.2655356245646948e-05, + "loss": 0.3393, + "step": 22324 + }, + { + "epoch": 0.41448372400426253, + "grad_norm": 0.36624693870544434, + "learning_rate": 1.2654231612712292e-05, + "loss": 0.3593, + "step": 22326 + }, + { + "epoch": 0.41452085414168116, + "grad_norm": 0.5099690556526184, + "learning_rate": 1.2653106943661492e-05, + "loss": 0.642, + "step": 22328 + }, + { + "epoch": 0.4145579842790998, + "grad_norm": 0.5320605635643005, + "learning_rate": 1.2651982238509854e-05, + "loss": 0.2412, + "step": 22330 + }, + { + "epoch": 0.4145951144165185, + "grad_norm": 1.5672521591186523, + "learning_rate": 1.2650857497272679e-05, + "loss": 0.3139, + "step": 22332 + }, + { + "epoch": 0.4146322445539371, + "grad_norm": 0.30454984307289124, + "learning_rate": 1.2649732719965277e-05, + "loss": 0.2804, + "step": 22334 + }, + { + "epoch": 0.41466937469135573, + "grad_norm": 0.2134273797273636, + "learning_rate": 1.264860790660295e-05, + "loss": 0.2725, + "step": 22336 + }, + { + "epoch": 0.41470650482877436, + "grad_norm": 0.4693770706653595, + "learning_rate": 1.2647483057201e-05, + "loss": 0.2274, + "step": 22338 + }, + { + "epoch": 0.414743634966193, + "grad_norm": 0.5466782450675964, + "learning_rate": 1.2646358171774738e-05, + "loss": 0.1077, + "step": 22340 + }, + { + "epoch": 0.41478076510361167, + "grad_norm": 0.38015276193618774, + "learning_rate": 1.2645233250339465e-05, + "loss": 0.3335, + "step": 22342 + }, + { + "epoch": 0.4148178952410303, + "grad_norm": 0.4040320813655853, + "learning_rate": 1.2644108292910493e-05, + "loss": 0.195, + "step": 22344 + }, + { + "epoch": 0.4148550253784489, + "grad_norm": 0.5064042806625366, + "learning_rate": 1.264298329950313e-05, + "loss": 0.3448, + "step": 22346 + }, + { + "epoch": 0.41489215551586756, + "grad_norm": 0.48571261763572693, + "learning_rate": 1.2641858270132676e-05, + "loss": 0.3193, + "step": 22348 + }, + { + "epoch": 0.4149292856532862, + "grad_norm": 0.262954443693161, + "learning_rate": 1.2640733204814446e-05, + "loss": 0.299, + "step": 22350 + }, + { + "epoch": 0.41496641579070487, + "grad_norm": 0.4457722306251526, + "learning_rate": 1.2639608103563747e-05, + "loss": 0.2353, + "step": 22352 + }, + { + "epoch": 0.4150035459281235, + "grad_norm": 0.29154732823371887, + "learning_rate": 1.2638482966395888e-05, + "loss": 0.2643, + "step": 22354 + }, + { + "epoch": 0.4150406760655421, + "grad_norm": 0.3535892069339752, + "learning_rate": 1.2637357793326179e-05, + "loss": 0.2678, + "step": 22356 + }, + { + "epoch": 0.41507780620296075, + "grad_norm": 0.36658260226249695, + "learning_rate": 1.2636232584369931e-05, + "loss": 0.404, + "step": 22358 + }, + { + "epoch": 0.4151149363403794, + "grad_norm": 0.20808948576450348, + "learning_rate": 1.2635107339542457e-05, + "loss": 0.2026, + "step": 22360 + }, + { + "epoch": 0.415152066477798, + "grad_norm": 0.36776214838027954, + "learning_rate": 1.2633982058859058e-05, + "loss": 0.3276, + "step": 22362 + }, + { + "epoch": 0.4151891966152167, + "grad_norm": 0.36909955739974976, + "learning_rate": 1.2632856742335056e-05, + "loss": 0.145, + "step": 22364 + }, + { + "epoch": 0.4152263267526353, + "grad_norm": 0.5838977694511414, + "learning_rate": 1.2631731389985762e-05, + "loss": 0.2485, + "step": 22366 + }, + { + "epoch": 0.41526345689005395, + "grad_norm": 0.39687880873680115, + "learning_rate": 1.2630606001826486e-05, + "loss": 0.2355, + "step": 22368 + }, + { + "epoch": 0.4153005870274726, + "grad_norm": 0.39901086688041687, + "learning_rate": 1.2629480577872541e-05, + "loss": 0.3431, + "step": 22370 + }, + { + "epoch": 0.4153377171648912, + "grad_norm": 0.43962743878364563, + "learning_rate": 1.2628355118139242e-05, + "loss": 0.3904, + "step": 22372 + }, + { + "epoch": 0.4153748473023099, + "grad_norm": 0.6133963465690613, + "learning_rate": 1.2627229622641903e-05, + "loss": 0.3298, + "step": 22374 + }, + { + "epoch": 0.4154119774397285, + "grad_norm": 0.31781327724456787, + "learning_rate": 1.2626104091395836e-05, + "loss": 0.2589, + "step": 22376 + }, + { + "epoch": 0.41544910757714715, + "grad_norm": 0.5248299837112427, + "learning_rate": 1.2624978524416363e-05, + "loss": 0.2843, + "step": 22378 + }, + { + "epoch": 0.4154862377145658, + "grad_norm": 0.3978639543056488, + "learning_rate": 1.2623852921718791e-05, + "loss": 0.3733, + "step": 22380 + }, + { + "epoch": 0.4155233678519844, + "grad_norm": 0.32916125655174255, + "learning_rate": 1.262272728331844e-05, + "loss": 0.3335, + "step": 22382 + }, + { + "epoch": 0.41556049798940303, + "grad_norm": 0.3002493679523468, + "learning_rate": 1.262160160923063e-05, + "loss": 0.1836, + "step": 22384 + }, + { + "epoch": 0.4155976281268217, + "grad_norm": 0.46988898515701294, + "learning_rate": 1.2620475899470672e-05, + "loss": 0.3031, + "step": 22386 + }, + { + "epoch": 0.41563475826424034, + "grad_norm": 0.377679705619812, + "learning_rate": 1.2619350154053885e-05, + "loss": 0.1563, + "step": 22388 + }, + { + "epoch": 0.415671888401659, + "grad_norm": 0.5000572204589844, + "learning_rate": 1.2618224372995593e-05, + "loss": 0.3355, + "step": 22390 + }, + { + "epoch": 0.4157090185390776, + "grad_norm": 0.41688838601112366, + "learning_rate": 1.2617098556311107e-05, + "loss": 0.3628, + "step": 22392 + }, + { + "epoch": 0.41574614867649623, + "grad_norm": 0.3701666593551636, + "learning_rate": 1.2615972704015746e-05, + "loss": 0.4207, + "step": 22394 + }, + { + "epoch": 0.4157832788139149, + "grad_norm": 0.371471107006073, + "learning_rate": 1.2614846816124834e-05, + "loss": 0.2842, + "step": 22396 + }, + { + "epoch": 0.41582040895133354, + "grad_norm": 0.4221228063106537, + "learning_rate": 1.2613720892653692e-05, + "loss": 0.3154, + "step": 22398 + }, + { + "epoch": 0.41585753908875217, + "grad_norm": 0.4081903100013733, + "learning_rate": 1.2612594933617636e-05, + "loss": 0.3374, + "step": 22400 + }, + { + "epoch": 0.4158946692261708, + "grad_norm": 0.2494126707315445, + "learning_rate": 1.2611468939031988e-05, + "loss": 0.4441, + "step": 22402 + }, + { + "epoch": 0.4159317993635894, + "grad_norm": 0.3489457964897156, + "learning_rate": 1.2610342908912072e-05, + "loss": 0.4704, + "step": 22404 + }, + { + "epoch": 0.41596892950100806, + "grad_norm": 0.3283196985721588, + "learning_rate": 1.2609216843273205e-05, + "loss": 0.2223, + "step": 22406 + }, + { + "epoch": 0.41600605963842674, + "grad_norm": 0.23611219227313995, + "learning_rate": 1.2608090742130712e-05, + "loss": 0.326, + "step": 22408 + }, + { + "epoch": 0.41604318977584537, + "grad_norm": 0.3353910744190216, + "learning_rate": 1.2606964605499918e-05, + "loss": 0.3564, + "step": 22410 + }, + { + "epoch": 0.416080319913264, + "grad_norm": 0.24398206174373627, + "learning_rate": 1.2605838433396147e-05, + "loss": 0.3901, + "step": 22412 + }, + { + "epoch": 0.4161174500506826, + "grad_norm": 0.4506359100341797, + "learning_rate": 1.2604712225834716e-05, + "loss": 0.1377, + "step": 22414 + }, + { + "epoch": 0.41615458018810125, + "grad_norm": 0.3832830488681793, + "learning_rate": 1.2603585982830956e-05, + "loss": 0.2633, + "step": 22416 + }, + { + "epoch": 0.41619171032551994, + "grad_norm": 0.3120003938674927, + "learning_rate": 1.2602459704400188e-05, + "loss": 0.2865, + "step": 22418 + }, + { + "epoch": 0.41622884046293857, + "grad_norm": 0.3582679033279419, + "learning_rate": 1.260133339055774e-05, + "loss": 0.4833, + "step": 22420 + }, + { + "epoch": 0.4162659706003572, + "grad_norm": 1.9840152263641357, + "learning_rate": 1.2600207041318937e-05, + "loss": 0.2441, + "step": 22422 + }, + { + "epoch": 0.4163031007377758, + "grad_norm": 0.55437833070755, + "learning_rate": 1.2599080656699102e-05, + "loss": 0.2911, + "step": 22424 + }, + { + "epoch": 0.41634023087519445, + "grad_norm": 0.18342788517475128, + "learning_rate": 1.2597954236713563e-05, + "loss": 0.2221, + "step": 22426 + }, + { + "epoch": 0.41637736101261313, + "grad_norm": 0.39264431595802307, + "learning_rate": 1.2596827781377655e-05, + "loss": 0.1615, + "step": 22428 + }, + { + "epoch": 0.41641449115003176, + "grad_norm": 0.3323133885860443, + "learning_rate": 1.2595701290706695e-05, + "loss": 0.2976, + "step": 22430 + }, + { + "epoch": 0.4164516212874504, + "grad_norm": 0.5766331553459167, + "learning_rate": 1.2594574764716017e-05, + "loss": 0.1977, + "step": 22432 + }, + { + "epoch": 0.416488751424869, + "grad_norm": 0.3951661288738251, + "learning_rate": 1.2593448203420947e-05, + "loss": 0.4084, + "step": 22434 + }, + { + "epoch": 0.41652588156228765, + "grad_norm": 0.5520362257957458, + "learning_rate": 1.2592321606836815e-05, + "loss": 0.2932, + "step": 22436 + }, + { + "epoch": 0.4165630116997063, + "grad_norm": 0.3689362108707428, + "learning_rate": 1.259119497497895e-05, + "loss": 0.186, + "step": 22438 + }, + { + "epoch": 0.41660014183712496, + "grad_norm": 0.37370771169662476, + "learning_rate": 1.2590068307862682e-05, + "loss": 0.2458, + "step": 22440 + }, + { + "epoch": 0.4166372719745436, + "grad_norm": 0.45080283284187317, + "learning_rate": 1.2588941605503347e-05, + "loss": 0.2814, + "step": 22442 + }, + { + "epoch": 0.4166744021119622, + "grad_norm": 0.4246978759765625, + "learning_rate": 1.258781486791627e-05, + "loss": 0.2053, + "step": 22444 + }, + { + "epoch": 0.41671153224938084, + "grad_norm": 0.3312555253505707, + "learning_rate": 1.2586688095116782e-05, + "loss": 0.3394, + "step": 22446 + }, + { + "epoch": 0.4167486623867995, + "grad_norm": 0.34370651841163635, + "learning_rate": 1.258556128712022e-05, + "loss": 0.3435, + "step": 22448 + }, + { + "epoch": 0.41678579252421816, + "grad_norm": 0.5265771746635437, + "learning_rate": 1.2584434443941911e-05, + "loss": 0.204, + "step": 22450 + }, + { + "epoch": 0.4168229226616368, + "grad_norm": 0.2886512279510498, + "learning_rate": 1.2583307565597192e-05, + "loss": 0.2267, + "step": 22452 + }, + { + "epoch": 0.4168600527990554, + "grad_norm": 0.37400022149086, + "learning_rate": 1.2582180652101394e-05, + "loss": 0.2839, + "step": 22454 + }, + { + "epoch": 0.41689718293647404, + "grad_norm": 0.3268842399120331, + "learning_rate": 1.2581053703469852e-05, + "loss": 0.3035, + "step": 22456 + }, + { + "epoch": 0.41693431307389267, + "grad_norm": 0.47995254397392273, + "learning_rate": 1.2579926719717899e-05, + "loss": 0.3242, + "step": 22458 + }, + { + "epoch": 0.4169714432113113, + "grad_norm": 0.4329659342765808, + "learning_rate": 1.2578799700860876e-05, + "loss": 0.2703, + "step": 22460 + }, + { + "epoch": 0.41700857334873, + "grad_norm": 0.43291178345680237, + "learning_rate": 1.2577672646914107e-05, + "loss": 0.4337, + "step": 22462 + }, + { + "epoch": 0.4170457034861486, + "grad_norm": 0.35983771085739136, + "learning_rate": 1.2576545557892939e-05, + "loss": 0.2319, + "step": 22464 + }, + { + "epoch": 0.41708283362356724, + "grad_norm": 0.391220360994339, + "learning_rate": 1.2575418433812702e-05, + "loss": 0.2065, + "step": 22466 + }, + { + "epoch": 0.41711996376098587, + "grad_norm": 0.3316316604614258, + "learning_rate": 1.2574291274688734e-05, + "loss": 0.3326, + "step": 22468 + }, + { + "epoch": 0.4171570938984045, + "grad_norm": 0.3946816325187683, + "learning_rate": 1.257316408053637e-05, + "loss": 0.3084, + "step": 22470 + }, + { + "epoch": 0.4171942240358232, + "grad_norm": 0.31929251551628113, + "learning_rate": 1.2572036851370956e-05, + "loss": 0.4248, + "step": 22472 + }, + { + "epoch": 0.4172313541732418, + "grad_norm": 0.5548383593559265, + "learning_rate": 1.2570909587207822e-05, + "loss": 0.4839, + "step": 22474 + }, + { + "epoch": 0.41726848431066044, + "grad_norm": 0.47951415181159973, + "learning_rate": 1.2569782288062309e-05, + "loss": 0.2928, + "step": 22476 + }, + { + "epoch": 0.41730561444807907, + "grad_norm": 0.36975565552711487, + "learning_rate": 1.2568654953949755e-05, + "loss": 0.3424, + "step": 22478 + }, + { + "epoch": 0.4173427445854977, + "grad_norm": 0.21453146636486053, + "learning_rate": 1.2567527584885503e-05, + "loss": 0.1939, + "step": 22480 + }, + { + "epoch": 0.4173798747229163, + "grad_norm": 0.43721315264701843, + "learning_rate": 1.256640018088489e-05, + "loss": 0.2002, + "step": 22482 + }, + { + "epoch": 0.417417004860335, + "grad_norm": 0.369391530752182, + "learning_rate": 1.2565272741963262e-05, + "loss": 0.199, + "step": 22484 + }, + { + "epoch": 0.41745413499775363, + "grad_norm": 0.28911253809928894, + "learning_rate": 1.2564145268135952e-05, + "loss": 0.1586, + "step": 22486 + }, + { + "epoch": 0.41749126513517226, + "grad_norm": 0.2809910774230957, + "learning_rate": 1.2563017759418305e-05, + "loss": 0.1899, + "step": 22488 + }, + { + "epoch": 0.4175283952725909, + "grad_norm": 0.4824793040752411, + "learning_rate": 1.2561890215825662e-05, + "loss": 0.4619, + "step": 22490 + }, + { + "epoch": 0.4175655254100095, + "grad_norm": 9.8869047164917, + "learning_rate": 1.2560762637373371e-05, + "loss": 0.2899, + "step": 22492 + }, + { + "epoch": 0.4176026555474282, + "grad_norm": 0.4115208685398102, + "learning_rate": 1.2559635024076769e-05, + "loss": 0.3026, + "step": 22494 + }, + { + "epoch": 0.41763978568484683, + "grad_norm": 0.5028640031814575, + "learning_rate": 1.2558507375951204e-05, + "loss": 0.2202, + "step": 22496 + }, + { + "epoch": 0.41767691582226546, + "grad_norm": 0.4057249426841736, + "learning_rate": 1.2557379693012015e-05, + "loss": 0.4076, + "step": 22498 + }, + { + "epoch": 0.4177140459596841, + "grad_norm": 0.3594348728656769, + "learning_rate": 1.2556251975274547e-05, + "loss": 0.2056, + "step": 22500 + }, + { + "epoch": 0.4177511760971027, + "grad_norm": 0.2122144252061844, + "learning_rate": 1.2555124222754148e-05, + "loss": 0.251, + "step": 22502 + }, + { + "epoch": 0.4177883062345214, + "grad_norm": 0.35639718174934387, + "learning_rate": 1.2553996435466163e-05, + "loss": 0.4011, + "step": 22504 + }, + { + "epoch": 0.41782543637194003, + "grad_norm": 0.36366695165634155, + "learning_rate": 1.2552868613425935e-05, + "loss": 0.1872, + "step": 22506 + }, + { + "epoch": 0.41786256650935866, + "grad_norm": 0.38499191403388977, + "learning_rate": 1.255174075664881e-05, + "loss": 0.2026, + "step": 22508 + }, + { + "epoch": 0.4178996966467773, + "grad_norm": 0.3735295832157135, + "learning_rate": 1.2550612865150141e-05, + "loss": 0.3605, + "step": 22510 + }, + { + "epoch": 0.4179368267841959, + "grad_norm": 0.37961629033088684, + "learning_rate": 1.2549484938945267e-05, + "loss": 0.2803, + "step": 22512 + }, + { + "epoch": 0.41797395692161454, + "grad_norm": 0.5305289626121521, + "learning_rate": 1.254835697804954e-05, + "loss": 0.2571, + "step": 22514 + }, + { + "epoch": 0.4180110870590332, + "grad_norm": 0.4585077166557312, + "learning_rate": 1.254722898247831e-05, + "loss": 0.185, + "step": 22516 + }, + { + "epoch": 0.41804821719645185, + "grad_norm": 0.44458502531051636, + "learning_rate": 1.2546100952246925e-05, + "loss": 0.2868, + "step": 22518 + }, + { + "epoch": 0.4180853473338705, + "grad_norm": 0.6276764273643494, + "learning_rate": 1.2544972887370727e-05, + "loss": 0.2004, + "step": 22520 + }, + { + "epoch": 0.4181224774712891, + "grad_norm": 0.485355943441391, + "learning_rate": 1.2543844787865074e-05, + "loss": 0.4108, + "step": 22522 + }, + { + "epoch": 0.41815960760870774, + "grad_norm": 0.27617180347442627, + "learning_rate": 1.2542716653745312e-05, + "loss": 0.0921, + "step": 22524 + }, + { + "epoch": 0.4181967377461264, + "grad_norm": 0.36838507652282715, + "learning_rate": 1.2541588485026794e-05, + "loss": 0.2826, + "step": 22526 + }, + { + "epoch": 0.41823386788354505, + "grad_norm": 0.33345288038253784, + "learning_rate": 1.2540460281724871e-05, + "loss": 0.3104, + "step": 22528 + }, + { + "epoch": 0.4182709980209637, + "grad_norm": 0.4041168987751007, + "learning_rate": 1.2539332043854893e-05, + "loss": 0.2697, + "step": 22530 + }, + { + "epoch": 0.4183081281583823, + "grad_norm": 0.2819782495498657, + "learning_rate": 1.2538203771432209e-05, + "loss": 0.2312, + "step": 22532 + }, + { + "epoch": 0.41834525829580094, + "grad_norm": 0.7115983366966248, + "learning_rate": 1.2537075464472174e-05, + "loss": 0.3205, + "step": 22534 + }, + { + "epoch": 0.41838238843321957, + "grad_norm": 0.20556685328483582, + "learning_rate": 1.2535947122990146e-05, + "loss": 0.2175, + "step": 22536 + }, + { + "epoch": 0.41841951857063825, + "grad_norm": 0.30183717608451843, + "learning_rate": 1.2534818747001471e-05, + "loss": 0.2709, + "step": 22538 + }, + { + "epoch": 0.4184566487080569, + "grad_norm": 0.31497636437416077, + "learning_rate": 1.2533690336521507e-05, + "loss": 0.175, + "step": 22540 + }, + { + "epoch": 0.4184937788454755, + "grad_norm": 0.38196489214897156, + "learning_rate": 1.2532561891565604e-05, + "loss": 0.1586, + "step": 22542 + }, + { + "epoch": 0.41853090898289413, + "grad_norm": 0.3567071557044983, + "learning_rate": 1.253143341214912e-05, + "loss": 0.0769, + "step": 22544 + }, + { + "epoch": 0.41856803912031276, + "grad_norm": 0.3131732642650604, + "learning_rate": 1.2530304898287411e-05, + "loss": 0.3743, + "step": 22546 + }, + { + "epoch": 0.41860516925773145, + "grad_norm": 0.32282838225364685, + "learning_rate": 1.2529176349995833e-05, + "loss": 0.3831, + "step": 22548 + }, + { + "epoch": 0.4186422993951501, + "grad_norm": 0.32722216844558716, + "learning_rate": 1.252804776728974e-05, + "loss": 0.3444, + "step": 22550 + }, + { + "epoch": 0.4186794295325687, + "grad_norm": 0.5492792129516602, + "learning_rate": 1.2526919150184487e-05, + "loss": 0.4061, + "step": 22552 + }, + { + "epoch": 0.41871655966998733, + "grad_norm": 0.27537593245506287, + "learning_rate": 1.2525790498695434e-05, + "loss": 0.3589, + "step": 22554 + }, + { + "epoch": 0.41875368980740596, + "grad_norm": 0.4112614095211029, + "learning_rate": 1.2524661812837936e-05, + "loss": 0.3125, + "step": 22556 + }, + { + "epoch": 0.4187908199448246, + "grad_norm": 0.5140699744224548, + "learning_rate": 1.2523533092627357e-05, + "loss": 0.2321, + "step": 22558 + }, + { + "epoch": 0.41882795008224327, + "grad_norm": 0.42734774947166443, + "learning_rate": 1.2522404338079054e-05, + "loss": 0.4071, + "step": 22560 + }, + { + "epoch": 0.4188650802196619, + "grad_norm": 0.30502915382385254, + "learning_rate": 1.2521275549208375e-05, + "loss": 0.3844, + "step": 22562 + }, + { + "epoch": 0.41890221035708053, + "grad_norm": 0.32507753372192383, + "learning_rate": 1.2520146726030692e-05, + "loss": 0.2428, + "step": 22564 + }, + { + "epoch": 0.41893934049449916, + "grad_norm": 0.2918146252632141, + "learning_rate": 1.251901786856136e-05, + "loss": 0.3123, + "step": 22566 + }, + { + "epoch": 0.4189764706319178, + "grad_norm": 0.5561637282371521, + "learning_rate": 1.2517888976815743e-05, + "loss": 0.3694, + "step": 22568 + }, + { + "epoch": 0.41901360076933647, + "grad_norm": 0.3774101138114929, + "learning_rate": 1.2516760050809198e-05, + "loss": 0.4536, + "step": 22570 + }, + { + "epoch": 0.4190507309067551, + "grad_norm": 0.4797917604446411, + "learning_rate": 1.2515631090557086e-05, + "loss": 0.3997, + "step": 22572 + }, + { + "epoch": 0.4190878610441737, + "grad_norm": 0.27623823285102844, + "learning_rate": 1.251450209607477e-05, + "loss": 0.3785, + "step": 22574 + }, + { + "epoch": 0.41912499118159235, + "grad_norm": 0.3862324655056, + "learning_rate": 1.2513373067377613e-05, + "loss": 0.2645, + "step": 22576 + }, + { + "epoch": 0.419162121319011, + "grad_norm": 0.6311046481132507, + "learning_rate": 1.2512244004480976e-05, + "loss": 0.2798, + "step": 22578 + }, + { + "epoch": 0.41919925145642967, + "grad_norm": 0.2939143478870392, + "learning_rate": 1.2511114907400224e-05, + "loss": 0.3434, + "step": 22580 + }, + { + "epoch": 0.4192363815938483, + "grad_norm": 0.5135506987571716, + "learning_rate": 1.2509985776150719e-05, + "loss": 0.4155, + "step": 22582 + }, + { + "epoch": 0.4192735117312669, + "grad_norm": 0.3979550898075104, + "learning_rate": 1.2508856610747826e-05, + "loss": 0.3452, + "step": 22584 + }, + { + "epoch": 0.41931064186868555, + "grad_norm": 0.3651474714279175, + "learning_rate": 1.250772741120691e-05, + "loss": 0.2031, + "step": 22586 + }, + { + "epoch": 0.4193477720061042, + "grad_norm": 0.33344167470932007, + "learning_rate": 1.2506598177543336e-05, + "loss": 0.3693, + "step": 22588 + }, + { + "epoch": 0.4193849021435228, + "grad_norm": 0.3398403227329254, + "learning_rate": 1.250546890977247e-05, + "loss": 0.3254, + "step": 22590 + }, + { + "epoch": 0.4194220322809415, + "grad_norm": 0.41044193506240845, + "learning_rate": 1.2504339607909674e-05, + "loss": 0.491, + "step": 22592 + }, + { + "epoch": 0.4194591624183601, + "grad_norm": 0.6559653878211975, + "learning_rate": 1.2503210271970319e-05, + "loss": 0.391, + "step": 22594 + }, + { + "epoch": 0.41949629255577875, + "grad_norm": 0.4157891869544983, + "learning_rate": 1.2502080901969768e-05, + "loss": 0.2938, + "step": 22596 + }, + { + "epoch": 0.4195334226931974, + "grad_norm": 0.45225077867507935, + "learning_rate": 1.2500951497923395e-05, + "loss": 0.3571, + "step": 22598 + }, + { + "epoch": 0.419570552830616, + "grad_norm": 0.42949336767196655, + "learning_rate": 1.2499822059846558e-05, + "loss": 0.4214, + "step": 22600 + }, + { + "epoch": 0.4196076829680347, + "grad_norm": 0.468717485666275, + "learning_rate": 1.2498692587754633e-05, + "loss": 0.3102, + "step": 22602 + }, + { + "epoch": 0.4196448131054533, + "grad_norm": 0.35520678758621216, + "learning_rate": 1.2497563081662986e-05, + "loss": 0.4355, + "step": 22604 + }, + { + "epoch": 0.41968194324287195, + "grad_norm": 0.5001177191734314, + "learning_rate": 1.2496433541586988e-05, + "loss": 0.4724, + "step": 22606 + }, + { + "epoch": 0.4197190733802906, + "grad_norm": 0.31420400738716125, + "learning_rate": 1.2495303967542006e-05, + "loss": 0.3317, + "step": 22608 + }, + { + "epoch": 0.4197562035177092, + "grad_norm": 0.4527732729911804, + "learning_rate": 1.249417435954341e-05, + "loss": 0.248, + "step": 22610 + }, + { + "epoch": 0.41979333365512783, + "grad_norm": 0.24900026619434357, + "learning_rate": 1.2493044717606578e-05, + "loss": 0.2381, + "step": 22612 + }, + { + "epoch": 0.4198304637925465, + "grad_norm": 0.4950977563858032, + "learning_rate": 1.2491915041746867e-05, + "loss": 0.1906, + "step": 22614 + }, + { + "epoch": 0.41986759392996514, + "grad_norm": 0.35937047004699707, + "learning_rate": 1.2490785331979657e-05, + "loss": 0.4202, + "step": 22616 + }, + { + "epoch": 0.41990472406738377, + "grad_norm": 0.3663603961467743, + "learning_rate": 1.2489655588320324e-05, + "loss": 0.3175, + "step": 22618 + }, + { + "epoch": 0.4199418542048024, + "grad_norm": 0.31991150975227356, + "learning_rate": 1.2488525810784234e-05, + "loss": 0.3025, + "step": 22620 + }, + { + "epoch": 0.41997898434222103, + "grad_norm": 0.33774426579475403, + "learning_rate": 1.2487395999386763e-05, + "loss": 0.5691, + "step": 22622 + }, + { + "epoch": 0.4200161144796397, + "grad_norm": 0.2390291839838028, + "learning_rate": 1.2486266154143282e-05, + "loss": 0.2675, + "step": 22624 + }, + { + "epoch": 0.42005324461705834, + "grad_norm": 0.47351783514022827, + "learning_rate": 1.2485136275069166e-05, + "loss": 0.1541, + "step": 22626 + }, + { + "epoch": 0.42009037475447697, + "grad_norm": 0.5223966240882874, + "learning_rate": 1.2484006362179786e-05, + "loss": 0.2042, + "step": 22628 + }, + { + "epoch": 0.4201275048918956, + "grad_norm": 0.34706950187683105, + "learning_rate": 1.2482876415490523e-05, + "loss": 0.3336, + "step": 22630 + }, + { + "epoch": 0.4201646350293142, + "grad_norm": 0.3015008866786957, + "learning_rate": 1.2481746435016749e-05, + "loss": 0.5155, + "step": 22632 + }, + { + "epoch": 0.42020176516673285, + "grad_norm": 0.37568390369415283, + "learning_rate": 1.248061642077384e-05, + "loss": 0.3653, + "step": 22634 + }, + { + "epoch": 0.42023889530415154, + "grad_norm": 0.3156394958496094, + "learning_rate": 1.2479486372777172e-05, + "loss": 0.2329, + "step": 22636 + }, + { + "epoch": 0.42027602544157017, + "grad_norm": 0.41168591380119324, + "learning_rate": 1.247835629104212e-05, + "loss": 0.4531, + "step": 22638 + }, + { + "epoch": 0.4203131555789888, + "grad_norm": 0.4071168303489685, + "learning_rate": 1.2477226175584061e-05, + "loss": 0.3128, + "step": 22640 + }, + { + "epoch": 0.4203502857164074, + "grad_norm": 0.3467046618461609, + "learning_rate": 1.2476096026418376e-05, + "loss": 0.1717, + "step": 22642 + }, + { + "epoch": 0.42038741585382605, + "grad_norm": 0.4623037278652191, + "learning_rate": 1.2474965843560443e-05, + "loss": 0.1231, + "step": 22644 + }, + { + "epoch": 0.42042454599124474, + "grad_norm": 0.3470204174518585, + "learning_rate": 1.2473835627025634e-05, + "loss": 0.3367, + "step": 22646 + }, + { + "epoch": 0.42046167612866336, + "grad_norm": 0.3762527108192444, + "learning_rate": 1.2472705376829333e-05, + "loss": 0.1833, + "step": 22648 + }, + { + "epoch": 0.420498806266082, + "grad_norm": 0.3082398474216461, + "learning_rate": 1.247157509298692e-05, + "loss": 0.3068, + "step": 22650 + }, + { + "epoch": 0.4205359364035006, + "grad_norm": 0.5302447080612183, + "learning_rate": 1.2470444775513773e-05, + "loss": 0.1479, + "step": 22652 + }, + { + "epoch": 0.42057306654091925, + "grad_norm": 0.3482266962528229, + "learning_rate": 1.2469314424425272e-05, + "loss": 0.3229, + "step": 22654 + }, + { + "epoch": 0.42061019667833793, + "grad_norm": 0.283134788274765, + "learning_rate": 1.2468184039736799e-05, + "loss": 0.4041, + "step": 22656 + }, + { + "epoch": 0.42064732681575656, + "grad_norm": 0.5044047236442566, + "learning_rate": 1.2467053621463734e-05, + "loss": 0.3357, + "step": 22658 + }, + { + "epoch": 0.4206844569531752, + "grad_norm": 0.31700700521469116, + "learning_rate": 1.2465923169621456e-05, + "loss": 0.2638, + "step": 22660 + }, + { + "epoch": 0.4207215870905938, + "grad_norm": 0.41031935811042786, + "learning_rate": 1.2464792684225355e-05, + "loss": 0.2693, + "step": 22662 + }, + { + "epoch": 0.42075871722801245, + "grad_norm": 0.45200008153915405, + "learning_rate": 1.2463662165290804e-05, + "loss": 0.2813, + "step": 22664 + }, + { + "epoch": 0.4207958473654311, + "grad_norm": 0.27047303318977356, + "learning_rate": 1.2462531612833194e-05, + "loss": 0.2714, + "step": 22666 + }, + { + "epoch": 0.42083297750284976, + "grad_norm": 0.28923970460891724, + "learning_rate": 1.2461401026867904e-05, + "loss": 0.3784, + "step": 22668 + }, + { + "epoch": 0.4208701076402684, + "grad_norm": 0.17238780856132507, + "learning_rate": 1.2460270407410318e-05, + "loss": 0.2301, + "step": 22670 + }, + { + "epoch": 0.420907237777687, + "grad_norm": 0.5400007367134094, + "learning_rate": 1.2459139754475818e-05, + "loss": 0.3487, + "step": 22672 + }, + { + "epoch": 0.42094436791510564, + "grad_norm": 0.3310256600379944, + "learning_rate": 1.2458009068079797e-05, + "loss": 0.3329, + "step": 22674 + }, + { + "epoch": 0.4209814980525243, + "grad_norm": 0.5805754661560059, + "learning_rate": 1.2456878348237632e-05, + "loss": 0.2925, + "step": 22676 + }, + { + "epoch": 0.42101862818994296, + "grad_norm": 0.5921448469161987, + "learning_rate": 1.2455747594964713e-05, + "loss": 0.2806, + "step": 22678 + }, + { + "epoch": 0.4210557583273616, + "grad_norm": 0.4316427409648895, + "learning_rate": 1.2454616808276428e-05, + "loss": 0.2188, + "step": 22680 + }, + { + "epoch": 0.4210928884647802, + "grad_norm": 0.3620373010635376, + "learning_rate": 1.2453485988188156e-05, + "loss": 0.287, + "step": 22682 + }, + { + "epoch": 0.42113001860219884, + "grad_norm": 0.2724496126174927, + "learning_rate": 1.2452355134715291e-05, + "loss": 0.4246, + "step": 22684 + }, + { + "epoch": 0.42116714873961747, + "grad_norm": 0.2627083659172058, + "learning_rate": 1.2451224247873217e-05, + "loss": 0.2528, + "step": 22686 + }, + { + "epoch": 0.4212042788770361, + "grad_norm": 0.338580459356308, + "learning_rate": 1.2450093327677325e-05, + "loss": 0.1854, + "step": 22688 + }, + { + "epoch": 0.4212414090144548, + "grad_norm": 0.4994960427284241, + "learning_rate": 1.2448962374143001e-05, + "loss": 0.3067, + "step": 22690 + }, + { + "epoch": 0.4212785391518734, + "grad_norm": 0.48993536829948425, + "learning_rate": 1.2447831387285631e-05, + "loss": 0.2692, + "step": 22692 + }, + { + "epoch": 0.42131566928929204, + "grad_norm": 0.5791990160942078, + "learning_rate": 1.2446700367120614e-05, + "loss": 0.2534, + "step": 22694 + }, + { + "epoch": 0.42135279942671067, + "grad_norm": 0.6208539605140686, + "learning_rate": 1.2445569313663333e-05, + "loss": 0.2369, + "step": 22696 + }, + { + "epoch": 0.4213899295641293, + "grad_norm": 0.4211345314979553, + "learning_rate": 1.2444438226929174e-05, + "loss": 0.4635, + "step": 22698 + }, + { + "epoch": 0.421427059701548, + "grad_norm": 0.47431862354278564, + "learning_rate": 1.2443307106933538e-05, + "loss": 0.0952, + "step": 22700 + }, + { + "epoch": 0.4214641898389666, + "grad_norm": 0.4242370128631592, + "learning_rate": 1.2442175953691806e-05, + "loss": 0.3361, + "step": 22702 + }, + { + "epoch": 0.42150131997638524, + "grad_norm": 0.36016300320625305, + "learning_rate": 1.2441044767219378e-05, + "loss": 0.2444, + "step": 22704 + }, + { + "epoch": 0.42153845011380386, + "grad_norm": 0.2902941107749939, + "learning_rate": 1.2439913547531641e-05, + "loss": 0.4214, + "step": 22706 + }, + { + "epoch": 0.4215755802512225, + "grad_norm": 0.43286773562431335, + "learning_rate": 1.243878229464399e-05, + "loss": 0.3672, + "step": 22708 + }, + { + "epoch": 0.4216127103886411, + "grad_norm": 0.48602020740509033, + "learning_rate": 1.2437651008571816e-05, + "loss": 0.2056, + "step": 22710 + }, + { + "epoch": 0.4216498405260598, + "grad_norm": 0.31060197949409485, + "learning_rate": 1.2436519689330516e-05, + "loss": 0.217, + "step": 22712 + }, + { + "epoch": 0.42168697066347843, + "grad_norm": 0.33403152227401733, + "learning_rate": 1.2435388336935476e-05, + "loss": 0.2793, + "step": 22714 + }, + { + "epoch": 0.42172410080089706, + "grad_norm": 0.28068020939826965, + "learning_rate": 1.2434256951402097e-05, + "loss": 0.0453, + "step": 22716 + }, + { + "epoch": 0.4217612309383157, + "grad_norm": 0.38120952248573303, + "learning_rate": 1.2433125532745776e-05, + "loss": 0.388, + "step": 22718 + }, + { + "epoch": 0.4217983610757343, + "grad_norm": 0.33326345682144165, + "learning_rate": 1.2431994080981902e-05, + "loss": 0.4343, + "step": 22720 + }, + { + "epoch": 0.421835491213153, + "grad_norm": 0.3045865297317505, + "learning_rate": 1.2430862596125875e-05, + "loss": 0.1846, + "step": 22722 + }, + { + "epoch": 0.42187262135057163, + "grad_norm": 0.36258482933044434, + "learning_rate": 1.2429731078193088e-05, + "loss": 0.3848, + "step": 22724 + }, + { + "epoch": 0.42190975148799026, + "grad_norm": 0.4740740656852722, + "learning_rate": 1.2428599527198936e-05, + "loss": 0.3284, + "step": 22726 + }, + { + "epoch": 0.4219468816254089, + "grad_norm": 0.40667107701301575, + "learning_rate": 1.2427467943158824e-05, + "loss": 0.3763, + "step": 22728 + }, + { + "epoch": 0.4219840117628275, + "grad_norm": 0.37334051728248596, + "learning_rate": 1.2426336326088142e-05, + "loss": 0.2557, + "step": 22730 + }, + { + "epoch": 0.4220211419002462, + "grad_norm": 0.4673433005809784, + "learning_rate": 1.2425204676002291e-05, + "loss": 0.3418, + "step": 22732 + }, + { + "epoch": 0.42205827203766483, + "grad_norm": 0.28942012786865234, + "learning_rate": 1.2424072992916666e-05, + "loss": 0.2359, + "step": 22734 + }, + { + "epoch": 0.42209540217508346, + "grad_norm": 0.3647730052471161, + "learning_rate": 1.2422941276846672e-05, + "loss": 0.1397, + "step": 22736 + }, + { + "epoch": 0.4221325323125021, + "grad_norm": 0.9200834035873413, + "learning_rate": 1.2421809527807705e-05, + "loss": 0.2261, + "step": 22738 + }, + { + "epoch": 0.4221696624499207, + "grad_norm": 0.29546844959259033, + "learning_rate": 1.2420677745815161e-05, + "loss": 0.3452, + "step": 22740 + }, + { + "epoch": 0.42220679258733934, + "grad_norm": 0.3888777494430542, + "learning_rate": 1.2419545930884447e-05, + "loss": 0.2735, + "step": 22742 + }, + { + "epoch": 0.422243922724758, + "grad_norm": 0.2747851610183716, + "learning_rate": 1.2418414083030958e-05, + "loss": 0.2599, + "step": 22744 + }, + { + "epoch": 0.42228105286217665, + "grad_norm": 0.4163869619369507, + "learning_rate": 1.2417282202270099e-05, + "loss": 0.2856, + "step": 22746 + }, + { + "epoch": 0.4223181829995953, + "grad_norm": 0.41317400336265564, + "learning_rate": 1.2416150288617268e-05, + "loss": 0.2423, + "step": 22748 + }, + { + "epoch": 0.4223553131370139, + "grad_norm": 0.36191147565841675, + "learning_rate": 1.2415018342087872e-05, + "loss": 0.3766, + "step": 22750 + }, + { + "epoch": 0.42239244327443254, + "grad_norm": 0.3260866403579712, + "learning_rate": 1.2413886362697307e-05, + "loss": 0.2617, + "step": 22752 + }, + { + "epoch": 0.4224295734118512, + "grad_norm": 0.3463301360607147, + "learning_rate": 1.2412754350460978e-05, + "loss": 0.2133, + "step": 22754 + }, + { + "epoch": 0.42246670354926985, + "grad_norm": 0.3350251615047455, + "learning_rate": 1.2411622305394294e-05, + "loss": 0.3557, + "step": 22756 + }, + { + "epoch": 0.4225038336866885, + "grad_norm": 0.4382261335849762, + "learning_rate": 1.241049022751265e-05, + "loss": 0.3999, + "step": 22758 + }, + { + "epoch": 0.4225409638241071, + "grad_norm": 0.36749404668807983, + "learning_rate": 1.2409358116831456e-05, + "loss": 0.4108, + "step": 22760 + }, + { + "epoch": 0.42257809396152574, + "grad_norm": 0.26575928926467896, + "learning_rate": 1.2408225973366117e-05, + "loss": 0.2619, + "step": 22762 + }, + { + "epoch": 0.42261522409894436, + "grad_norm": 0.3456897735595703, + "learning_rate": 1.2407093797132033e-05, + "loss": 0.333, + "step": 22764 + }, + { + "epoch": 0.42265235423636305, + "grad_norm": 0.22343279421329498, + "learning_rate": 1.2405961588144611e-05, + "loss": 0.2969, + "step": 22766 + }, + { + "epoch": 0.4226894843737817, + "grad_norm": 0.31952428817749023, + "learning_rate": 1.240482934641926e-05, + "loss": 0.1943, + "step": 22768 + }, + { + "epoch": 0.4227266145112003, + "grad_norm": 0.33623000979423523, + "learning_rate": 1.2403697071971386e-05, + "loss": 0.3037, + "step": 22770 + }, + { + "epoch": 0.42276374464861893, + "grad_norm": 0.438067227602005, + "learning_rate": 1.2402564764816396e-05, + "loss": 0.2707, + "step": 22772 + }, + { + "epoch": 0.42280087478603756, + "grad_norm": 0.2880840301513672, + "learning_rate": 1.2401432424969693e-05, + "loss": 0.2182, + "step": 22774 + }, + { + "epoch": 0.42283800492345625, + "grad_norm": 0.3437395691871643, + "learning_rate": 1.240030005244669e-05, + "loss": 0.2097, + "step": 22776 + }, + { + "epoch": 0.4228751350608749, + "grad_norm": 0.2337070107460022, + "learning_rate": 1.2399167647262791e-05, + "loss": 0.3433, + "step": 22778 + }, + { + "epoch": 0.4229122651982935, + "grad_norm": 0.3288826048374176, + "learning_rate": 1.2398035209433407e-05, + "loss": 0.2974, + "step": 22780 + }, + { + "epoch": 0.42294939533571213, + "grad_norm": 0.38473883271217346, + "learning_rate": 1.2396902738973951e-05, + "loss": 0.2397, + "step": 22782 + }, + { + "epoch": 0.42298652547313076, + "grad_norm": 0.3544543981552124, + "learning_rate": 1.2395770235899821e-05, + "loss": 0.1966, + "step": 22784 + }, + { + "epoch": 0.4230236556105494, + "grad_norm": 0.4153810143470764, + "learning_rate": 1.239463770022644e-05, + "loss": 0.2791, + "step": 22786 + }, + { + "epoch": 0.42306078574796807, + "grad_norm": 0.5816980600357056, + "learning_rate": 1.239350513196921e-05, + "loss": 0.3479, + "step": 22788 + }, + { + "epoch": 0.4230979158853867, + "grad_norm": 0.22781512141227722, + "learning_rate": 1.2392372531143545e-05, + "loss": 0.3277, + "step": 22790 + }, + { + "epoch": 0.42313504602280533, + "grad_norm": 0.37092310190200806, + "learning_rate": 1.2391239897764857e-05, + "loss": 0.2714, + "step": 22792 + }, + { + "epoch": 0.42317217616022396, + "grad_norm": 0.3977445960044861, + "learning_rate": 1.2390107231848557e-05, + "loss": 0.3608, + "step": 22794 + }, + { + "epoch": 0.4232093062976426, + "grad_norm": 0.38939419388771057, + "learning_rate": 1.2388974533410054e-05, + "loss": 0.3195, + "step": 22796 + }, + { + "epoch": 0.42324643643506127, + "grad_norm": 0.4520815908908844, + "learning_rate": 1.2387841802464764e-05, + "loss": 0.2828, + "step": 22798 + }, + { + "epoch": 0.4232835665724799, + "grad_norm": 0.37553831934928894, + "learning_rate": 1.2386709039028103e-05, + "loss": 0.2603, + "step": 22800 + }, + { + "epoch": 0.4233206967098985, + "grad_norm": 0.3826637268066406, + "learning_rate": 1.2385576243115476e-05, + "loss": 0.3561, + "step": 22802 + }, + { + "epoch": 0.42335782684731715, + "grad_norm": 0.5578433871269226, + "learning_rate": 1.2384443414742304e-05, + "loss": 0.2962, + "step": 22804 + }, + { + "epoch": 0.4233949569847358, + "grad_norm": 0.28151801228523254, + "learning_rate": 1.2383310553924002e-05, + "loss": 0.4536, + "step": 22806 + }, + { + "epoch": 0.42343208712215447, + "grad_norm": 0.4643591046333313, + "learning_rate": 1.2382177660675979e-05, + "loss": 0.2425, + "step": 22808 + }, + { + "epoch": 0.4234692172595731, + "grad_norm": 0.29180800914764404, + "learning_rate": 1.2381044735013652e-05, + "loss": 0.4458, + "step": 22810 + }, + { + "epoch": 0.4235063473969917, + "grad_norm": 0.47145482897758484, + "learning_rate": 1.2379911776952443e-05, + "loss": 0.197, + "step": 22812 + }, + { + "epoch": 0.42354347753441035, + "grad_norm": 0.3320649266242981, + "learning_rate": 1.2378778786507761e-05, + "loss": 0.2019, + "step": 22814 + }, + { + "epoch": 0.423580607671829, + "grad_norm": 0.34492138028144836, + "learning_rate": 1.2377645763695024e-05, + "loss": 0.3681, + "step": 22816 + }, + { + "epoch": 0.4236177378092476, + "grad_norm": 0.31292101740837097, + "learning_rate": 1.2376512708529649e-05, + "loss": 0.3541, + "step": 22818 + }, + { + "epoch": 0.4236548679466663, + "grad_norm": 0.7579625844955444, + "learning_rate": 1.2375379621027057e-05, + "loss": 0.254, + "step": 22820 + }, + { + "epoch": 0.4236919980840849, + "grad_norm": 0.4231375753879547, + "learning_rate": 1.2374246501202663e-05, + "loss": 0.3349, + "step": 22822 + }, + { + "epoch": 0.42372912822150355, + "grad_norm": 0.4542980492115021, + "learning_rate": 1.2373113349071889e-05, + "loss": 0.2038, + "step": 22824 + }, + { + "epoch": 0.4237662583589222, + "grad_norm": 0.6494877934455872, + "learning_rate": 1.2371980164650145e-05, + "loss": 0.3464, + "step": 22826 + }, + { + "epoch": 0.4238033884963408, + "grad_norm": 0.34000924229621887, + "learning_rate": 1.237084694795286e-05, + "loss": 0.3373, + "step": 22828 + }, + { + "epoch": 0.4238405186337595, + "grad_norm": 0.2824628949165344, + "learning_rate": 1.2369713698995444e-05, + "loss": 0.2161, + "step": 22830 + }, + { + "epoch": 0.4238776487711781, + "grad_norm": 0.26404258608818054, + "learning_rate": 1.2368580417793329e-05, + "loss": 0.1449, + "step": 22832 + }, + { + "epoch": 0.42391477890859675, + "grad_norm": 0.29113391041755676, + "learning_rate": 1.2367447104361926e-05, + "loss": 0.1401, + "step": 22834 + }, + { + "epoch": 0.4239519090460154, + "grad_norm": 0.31812843680381775, + "learning_rate": 1.236631375871666e-05, + "loss": 0.4179, + "step": 22836 + }, + { + "epoch": 0.423989039183434, + "grad_norm": 0.3559657335281372, + "learning_rate": 1.2365180380872952e-05, + "loss": 0.2882, + "step": 22838 + }, + { + "epoch": 0.42402616932085263, + "grad_norm": 0.3740849494934082, + "learning_rate": 1.2364046970846221e-05, + "loss": 0.3081, + "step": 22840 + }, + { + "epoch": 0.4240632994582713, + "grad_norm": 0.38844820857048035, + "learning_rate": 1.2362913528651894e-05, + "loss": 0.3419, + "step": 22842 + }, + { + "epoch": 0.42410042959568994, + "grad_norm": 0.3080359995365143, + "learning_rate": 1.2361780054305392e-05, + "loss": 0.3599, + "step": 22844 + }, + { + "epoch": 0.42413755973310857, + "grad_norm": 0.2991790175437927, + "learning_rate": 1.2360646547822139e-05, + "loss": 0.1865, + "step": 22846 + }, + { + "epoch": 0.4241746898705272, + "grad_norm": 0.3395121097564697, + "learning_rate": 1.2359513009217556e-05, + "loss": 0.3303, + "step": 22848 + }, + { + "epoch": 0.42421182000794583, + "grad_norm": 0.28079748153686523, + "learning_rate": 1.2358379438507069e-05, + "loss": 0.4024, + "step": 22850 + }, + { + "epoch": 0.4242489501453645, + "grad_norm": 0.29016152024269104, + "learning_rate": 1.23572458357061e-05, + "loss": 0.2816, + "step": 22852 + }, + { + "epoch": 0.42428608028278314, + "grad_norm": 0.36855748295783997, + "learning_rate": 1.2356112200830077e-05, + "loss": 0.2084, + "step": 22854 + }, + { + "epoch": 0.42432321042020177, + "grad_norm": 1.0051796436309814, + "learning_rate": 1.2354978533894424e-05, + "loss": 0.2515, + "step": 22856 + }, + { + "epoch": 0.4243603405576204, + "grad_norm": 0.4385787844657898, + "learning_rate": 1.235384483491457e-05, + "loss": 0.3012, + "step": 22858 + }, + { + "epoch": 0.424397470695039, + "grad_norm": 0.4704073667526245, + "learning_rate": 1.2352711103905934e-05, + "loss": 0.3389, + "step": 22860 + }, + { + "epoch": 0.42443460083245765, + "grad_norm": 0.37719833850860596, + "learning_rate": 1.2351577340883948e-05, + "loss": 0.2799, + "step": 22862 + }, + { + "epoch": 0.42447173096987634, + "grad_norm": 0.3356054723262787, + "learning_rate": 1.235044354586404e-05, + "loss": 0.3124, + "step": 22864 + }, + { + "epoch": 0.42450886110729497, + "grad_norm": 0.4583863914012909, + "learning_rate": 1.2349309718861638e-05, + "loss": 0.1776, + "step": 22866 + }, + { + "epoch": 0.4245459912447136, + "grad_norm": 0.4187314212322235, + "learning_rate": 1.2348175859892164e-05, + "loss": 0.3005, + "step": 22868 + }, + { + "epoch": 0.4245831213821322, + "grad_norm": 0.4396700859069824, + "learning_rate": 1.2347041968971052e-05, + "loss": 0.4134, + "step": 22870 + }, + { + "epoch": 0.42462025151955085, + "grad_norm": 0.2254665642976761, + "learning_rate": 1.2345908046113726e-05, + "loss": 0.1278, + "step": 22872 + }, + { + "epoch": 0.42465738165696953, + "grad_norm": 0.35500895977020264, + "learning_rate": 1.2344774091335618e-05, + "loss": 0.1753, + "step": 22874 + }, + { + "epoch": 0.42469451179438816, + "grad_norm": 0.6753424406051636, + "learning_rate": 1.2343640104652163e-05, + "loss": 0.2017, + "step": 22876 + }, + { + "epoch": 0.4247316419318068, + "grad_norm": 0.4725152850151062, + "learning_rate": 1.2342506086078785e-05, + "loss": 0.2733, + "step": 22878 + }, + { + "epoch": 0.4247687720692254, + "grad_norm": 0.40250149369239807, + "learning_rate": 1.2341372035630912e-05, + "loss": 0.2067, + "step": 22880 + }, + { + "epoch": 0.42480590220664405, + "grad_norm": 0.3511542081832886, + "learning_rate": 1.2340237953323983e-05, + "loss": 0.1954, + "step": 22882 + }, + { + "epoch": 0.42484303234406273, + "grad_norm": 0.4057621955871582, + "learning_rate": 1.2339103839173423e-05, + "loss": 0.2613, + "step": 22884 + }, + { + "epoch": 0.42488016248148136, + "grad_norm": 0.4731575846672058, + "learning_rate": 1.2337969693194665e-05, + "loss": 0.3361, + "step": 22886 + }, + { + "epoch": 0.4249172926189, + "grad_norm": 0.44400572776794434, + "learning_rate": 1.2336835515403147e-05, + "loss": 0.2467, + "step": 22888 + }, + { + "epoch": 0.4249544227563186, + "grad_norm": 0.38770273327827454, + "learning_rate": 1.2335701305814294e-05, + "loss": 0.2607, + "step": 22890 + }, + { + "epoch": 0.42499155289373725, + "grad_norm": 0.3193565309047699, + "learning_rate": 1.233456706444354e-05, + "loss": 0.197, + "step": 22892 + }, + { + "epoch": 0.4250286830311559, + "grad_norm": 0.39757582545280457, + "learning_rate": 1.2333432791306323e-05, + "loss": 0.3951, + "step": 22894 + }, + { + "epoch": 0.42506581316857456, + "grad_norm": 0.4024692475795746, + "learning_rate": 1.2332298486418077e-05, + "loss": 0.3592, + "step": 22896 + }, + { + "epoch": 0.4251029433059932, + "grad_norm": 0.3067563772201538, + "learning_rate": 1.2331164149794234e-05, + "loss": 0.437, + "step": 22898 + }, + { + "epoch": 0.4251400734434118, + "grad_norm": 0.2389126867055893, + "learning_rate": 1.233002978145023e-05, + "loss": 0.2956, + "step": 22900 + }, + { + "epoch": 0.42517720358083044, + "grad_norm": 0.3209879994392395, + "learning_rate": 1.23288953814015e-05, + "loss": 0.2473, + "step": 22902 + }, + { + "epoch": 0.42521433371824907, + "grad_norm": 0.9317252039909363, + "learning_rate": 1.2327760949663477e-05, + "loss": 0.2747, + "step": 22904 + }, + { + "epoch": 0.42525146385566776, + "grad_norm": 0.38047611713409424, + "learning_rate": 1.23266264862516e-05, + "loss": 0.2258, + "step": 22906 + }, + { + "epoch": 0.4252885939930864, + "grad_norm": 0.23447075486183167, + "learning_rate": 1.2325491991181309e-05, + "loss": 0.194, + "step": 22908 + }, + { + "epoch": 0.425325724130505, + "grad_norm": 0.325809121131897, + "learning_rate": 1.2324357464468037e-05, + "loss": 0.3452, + "step": 22910 + }, + { + "epoch": 0.42536285426792364, + "grad_norm": 0.2812923491001129, + "learning_rate": 1.2323222906127222e-05, + "loss": 0.2049, + "step": 22912 + }, + { + "epoch": 0.42539998440534227, + "grad_norm": 0.29659613966941833, + "learning_rate": 1.2322088316174301e-05, + "loss": 0.1808, + "step": 22914 + }, + { + "epoch": 0.4254371145427609, + "grad_norm": 0.22950735688209534, + "learning_rate": 1.2320953694624712e-05, + "loss": 0.3864, + "step": 22916 + }, + { + "epoch": 0.4254742446801796, + "grad_norm": 0.2875741124153137, + "learning_rate": 1.2319819041493897e-05, + "loss": 0.3914, + "step": 22918 + }, + { + "epoch": 0.4255113748175982, + "grad_norm": 0.2793431878089905, + "learning_rate": 1.2318684356797297e-05, + "loss": 0.3962, + "step": 22920 + }, + { + "epoch": 0.42554850495501684, + "grad_norm": 0.3640422821044922, + "learning_rate": 1.2317549640550344e-05, + "loss": 0.2095, + "step": 22922 + }, + { + "epoch": 0.42558563509243547, + "grad_norm": 0.4431861937046051, + "learning_rate": 1.2316414892768482e-05, + "loss": 0.2884, + "step": 22924 + }, + { + "epoch": 0.4256227652298541, + "grad_norm": 0.4358106851577759, + "learning_rate": 1.2315280113467152e-05, + "loss": 0.2747, + "step": 22926 + }, + { + "epoch": 0.4256598953672728, + "grad_norm": 0.30239108204841614, + "learning_rate": 1.2314145302661795e-05, + "loss": 0.3204, + "step": 22928 + }, + { + "epoch": 0.4256970255046914, + "grad_norm": 0.40962329506874084, + "learning_rate": 1.2313010460367853e-05, + "loss": 0.3144, + "step": 22930 + }, + { + "epoch": 0.42573415564211003, + "grad_norm": 0.28338727355003357, + "learning_rate": 1.2311875586600768e-05, + "loss": 0.2308, + "step": 22932 + }, + { + "epoch": 0.42577128577952866, + "grad_norm": 0.3081931173801422, + "learning_rate": 1.231074068137598e-05, + "loss": 0.3556, + "step": 22934 + }, + { + "epoch": 0.4258084159169473, + "grad_norm": 0.4209566116333008, + "learning_rate": 1.230960574470893e-05, + "loss": 0.4912, + "step": 22936 + }, + { + "epoch": 0.4258455460543659, + "grad_norm": 0.506011962890625, + "learning_rate": 1.2308470776615065e-05, + "loss": 0.1949, + "step": 22938 + }, + { + "epoch": 0.4258826761917846, + "grad_norm": 0.6085726022720337, + "learning_rate": 1.2307335777109831e-05, + "loss": 0.4255, + "step": 22940 + }, + { + "epoch": 0.42591980632920323, + "grad_norm": 0.42970091104507446, + "learning_rate": 1.2306200746208668e-05, + "loss": 0.2556, + "step": 22942 + }, + { + "epoch": 0.42595693646662186, + "grad_norm": 0.3490724563598633, + "learning_rate": 1.2305065683927015e-05, + "loss": 0.3008, + "step": 22944 + }, + { + "epoch": 0.4259940666040405, + "grad_norm": 0.3265831470489502, + "learning_rate": 1.2303930590280328e-05, + "loss": 0.4057, + "step": 22946 + }, + { + "epoch": 0.4260311967414591, + "grad_norm": 0.4583674967288971, + "learning_rate": 1.2302795465284046e-05, + "loss": 0.1731, + "step": 22948 + }, + { + "epoch": 0.4260683268788778, + "grad_norm": 0.3717461824417114, + "learning_rate": 1.2301660308953614e-05, + "loss": 0.3712, + "step": 22950 + }, + { + "epoch": 0.42610545701629643, + "grad_norm": 0.25856301188468933, + "learning_rate": 1.2300525121304481e-05, + "loss": 0.1683, + "step": 22952 + }, + { + "epoch": 0.42614258715371506, + "grad_norm": 0.25912532210350037, + "learning_rate": 1.229938990235209e-05, + "loss": 0.2344, + "step": 22954 + }, + { + "epoch": 0.4261797172911337, + "grad_norm": 0.41414007544517517, + "learning_rate": 1.2298254652111889e-05, + "loss": 0.3619, + "step": 22956 + }, + { + "epoch": 0.4262168474285523, + "grad_norm": 0.2439611554145813, + "learning_rate": 1.2297119370599328e-05, + "loss": 0.1103, + "step": 22958 + }, + { + "epoch": 0.426253977565971, + "grad_norm": 0.18834145367145538, + "learning_rate": 1.2295984057829853e-05, + "loss": 0.1764, + "step": 22960 + }, + { + "epoch": 0.4262911077033896, + "grad_norm": 0.5146392583847046, + "learning_rate": 1.2294848713818914e-05, + "loss": 0.1639, + "step": 22962 + }, + { + "epoch": 0.42632823784080826, + "grad_norm": 0.37873223423957825, + "learning_rate": 1.2293713338581957e-05, + "loss": 0.3562, + "step": 22964 + }, + { + "epoch": 0.4263653679782269, + "grad_norm": 0.35179558396339417, + "learning_rate": 1.229257793213443e-05, + "loss": 0.3914, + "step": 22966 + }, + { + "epoch": 0.4264024981156455, + "grad_norm": 0.33815863728523254, + "learning_rate": 1.2291442494491786e-05, + "loss": 0.154, + "step": 22968 + }, + { + "epoch": 0.42643962825306414, + "grad_norm": 0.4831031858921051, + "learning_rate": 1.2290307025669473e-05, + "loss": 0.2434, + "step": 22970 + }, + { + "epoch": 0.4264767583904828, + "grad_norm": 0.4986487925052643, + "learning_rate": 1.228917152568294e-05, + "loss": 0.2098, + "step": 22972 + }, + { + "epoch": 0.42651388852790145, + "grad_norm": 0.38299787044525146, + "learning_rate": 1.2288035994547642e-05, + "loss": 0.2932, + "step": 22974 + }, + { + "epoch": 0.4265510186653201, + "grad_norm": 0.28940391540527344, + "learning_rate": 1.2286900432279025e-05, + "loss": 0.3772, + "step": 22976 + }, + { + "epoch": 0.4265881488027387, + "grad_norm": 0.24480387568473816, + "learning_rate": 1.2285764838892544e-05, + "loss": 0.1235, + "step": 22978 + }, + { + "epoch": 0.42662527894015734, + "grad_norm": 0.6749100089073181, + "learning_rate": 1.2284629214403651e-05, + "loss": 0.1893, + "step": 22980 + }, + { + "epoch": 0.426662409077576, + "grad_norm": 0.5375548601150513, + "learning_rate": 1.22834935588278e-05, + "loss": 0.4104, + "step": 22982 + }, + { + "epoch": 0.42669953921499465, + "grad_norm": 0.30733543634414673, + "learning_rate": 1.228235787218044e-05, + "loss": 0.264, + "step": 22984 + }, + { + "epoch": 0.4267366693524133, + "grad_norm": 0.338520348072052, + "learning_rate": 1.2281222154477021e-05, + "loss": 0.3547, + "step": 22986 + }, + { + "epoch": 0.4267737994898319, + "grad_norm": 0.27566567063331604, + "learning_rate": 1.2280086405733005e-05, + "loss": 0.1892, + "step": 22988 + }, + { + "epoch": 0.42681092962725053, + "grad_norm": 0.26056286692619324, + "learning_rate": 1.2278950625963843e-05, + "loss": 0.2179, + "step": 22990 + }, + { + "epoch": 0.42684805976466916, + "grad_norm": 0.3909897208213806, + "learning_rate": 1.2277814815184989e-05, + "loss": 0.4415, + "step": 22992 + }, + { + "epoch": 0.42688518990208785, + "grad_norm": 0.3203113377094269, + "learning_rate": 1.22766789734119e-05, + "loss": 0.3038, + "step": 22994 + }, + { + "epoch": 0.4269223200395065, + "grad_norm": 0.4672233760356903, + "learning_rate": 1.227554310066003e-05, + "loss": 0.2978, + "step": 22996 + }, + { + "epoch": 0.4269594501769251, + "grad_norm": 0.4107931852340698, + "learning_rate": 1.2274407196944831e-05, + "loss": 0.2269, + "step": 22998 + }, + { + "epoch": 0.42699658031434373, + "grad_norm": 0.40406060218811035, + "learning_rate": 1.2273271262281762e-05, + "loss": 0.2408, + "step": 23000 + }, + { + "epoch": 0.42703371045176236, + "grad_norm": 0.39488640427589417, + "learning_rate": 1.2272135296686285e-05, + "loss": 0.2529, + "step": 23002 + }, + { + "epoch": 0.42707084058918104, + "grad_norm": 0.3654564619064331, + "learning_rate": 1.2270999300173849e-05, + "loss": 0.3488, + "step": 23004 + }, + { + "epoch": 0.4271079707265997, + "grad_norm": 0.6523936986923218, + "learning_rate": 1.2269863272759916e-05, + "loss": 0.3418, + "step": 23006 + }, + { + "epoch": 0.4271451008640183, + "grad_norm": 0.547085165977478, + "learning_rate": 1.2268727214459944e-05, + "loss": 0.4111, + "step": 23008 + }, + { + "epoch": 0.42718223100143693, + "grad_norm": 0.3856388330459595, + "learning_rate": 1.2267591125289388e-05, + "loss": 0.1991, + "step": 23010 + }, + { + "epoch": 0.42721936113885556, + "grad_norm": 0.38456717133522034, + "learning_rate": 1.2266455005263709e-05, + "loss": 0.1407, + "step": 23012 + }, + { + "epoch": 0.4272564912762742, + "grad_norm": 0.612034022808075, + "learning_rate": 1.2265318854398366e-05, + "loss": 0.3702, + "step": 23014 + }, + { + "epoch": 0.42729362141369287, + "grad_norm": 0.40548208355903625, + "learning_rate": 1.2264182672708823e-05, + "loss": 0.3399, + "step": 23016 + }, + { + "epoch": 0.4273307515511115, + "grad_norm": 0.3133481442928314, + "learning_rate": 1.2263046460210532e-05, + "loss": 0.3199, + "step": 23018 + }, + { + "epoch": 0.4273678816885301, + "grad_norm": 0.42254459857940674, + "learning_rate": 1.2261910216918957e-05, + "loss": 0.2897, + "step": 23020 + }, + { + "epoch": 0.42740501182594876, + "grad_norm": 0.33620908856391907, + "learning_rate": 1.2260773942849562e-05, + "loss": 0.4578, + "step": 23022 + }, + { + "epoch": 0.4274421419633674, + "grad_norm": 0.335371196269989, + "learning_rate": 1.2259637638017803e-05, + "loss": 0.2779, + "step": 23024 + }, + { + "epoch": 0.42747927210078607, + "grad_norm": 0.3532801866531372, + "learning_rate": 1.2258501302439145e-05, + "loss": 0.1636, + "step": 23026 + }, + { + "epoch": 0.4275164022382047, + "grad_norm": 0.2666946053504944, + "learning_rate": 1.225736493612905e-05, + "loss": 0.2908, + "step": 23028 + }, + { + "epoch": 0.4275535323756233, + "grad_norm": 0.2571125328540802, + "learning_rate": 1.2256228539102979e-05, + "loss": 0.3319, + "step": 23030 + }, + { + "epoch": 0.42759066251304195, + "grad_norm": 0.42697784304618835, + "learning_rate": 1.2255092111376395e-05, + "loss": 0.1265, + "step": 23032 + }, + { + "epoch": 0.4276277926504606, + "grad_norm": 0.3501487672328949, + "learning_rate": 1.2253955652964762e-05, + "loss": 0.3816, + "step": 23034 + }, + { + "epoch": 0.42766492278787926, + "grad_norm": 0.4579354226589203, + "learning_rate": 1.2252819163883548e-05, + "loss": 0.2369, + "step": 23036 + }, + { + "epoch": 0.4277020529252979, + "grad_norm": 0.3472498059272766, + "learning_rate": 1.2251682644148208e-05, + "loss": 0.3058, + "step": 23038 + }, + { + "epoch": 0.4277391830627165, + "grad_norm": 0.4118129312992096, + "learning_rate": 1.2250546093774214e-05, + "loss": 0.2865, + "step": 23040 + }, + { + "epoch": 0.42777631320013515, + "grad_norm": 0.37376001477241516, + "learning_rate": 1.2249409512777029e-05, + "loss": 0.1539, + "step": 23042 + }, + { + "epoch": 0.4278134433375538, + "grad_norm": 0.4816228747367859, + "learning_rate": 1.2248272901172116e-05, + "loss": 0.3309, + "step": 23044 + }, + { + "epoch": 0.4278505734749724, + "grad_norm": 0.3830767869949341, + "learning_rate": 1.2247136258974945e-05, + "loss": 0.413, + "step": 23046 + }, + { + "epoch": 0.4278877036123911, + "grad_norm": 0.44802048802375793, + "learning_rate": 1.2245999586200982e-05, + "loss": 0.3644, + "step": 23048 + }, + { + "epoch": 0.4279248337498097, + "grad_norm": 0.28901875019073486, + "learning_rate": 1.2244862882865691e-05, + "loss": 0.2763, + "step": 23050 + }, + { + "epoch": 0.42796196388722835, + "grad_norm": 0.7705404758453369, + "learning_rate": 1.2243726148984541e-05, + "loss": 0.4502, + "step": 23052 + }, + { + "epoch": 0.427999094024647, + "grad_norm": 0.29758206009864807, + "learning_rate": 1.2242589384572996e-05, + "loss": 0.1601, + "step": 23054 + }, + { + "epoch": 0.4280362241620656, + "grad_norm": 0.4036569595336914, + "learning_rate": 1.2241452589646528e-05, + "loss": 0.3428, + "step": 23056 + }, + { + "epoch": 0.4280733542994843, + "grad_norm": 0.416521817445755, + "learning_rate": 1.2240315764220604e-05, + "loss": 0.1801, + "step": 23058 + }, + { + "epoch": 0.4281104844369029, + "grad_norm": 0.40393149852752686, + "learning_rate": 1.2239178908310697e-05, + "loss": 0.3821, + "step": 23060 + }, + { + "epoch": 0.42814761457432154, + "grad_norm": 0.42451438307762146, + "learning_rate": 1.2238042021932268e-05, + "loss": 0.1351, + "step": 23062 + }, + { + "epoch": 0.4281847447117402, + "grad_norm": 0.29696065187454224, + "learning_rate": 1.223690510510079e-05, + "loss": 0.1528, + "step": 23064 + }, + { + "epoch": 0.4282218748491588, + "grad_norm": 0.37280505895614624, + "learning_rate": 1.2235768157831738e-05, + "loss": 0.2149, + "step": 23066 + }, + { + "epoch": 0.42825900498657743, + "grad_norm": 0.33905187249183655, + "learning_rate": 1.2234631180140575e-05, + "loss": 0.3213, + "step": 23068 + }, + { + "epoch": 0.4282961351239961, + "grad_norm": 0.36966705322265625, + "learning_rate": 1.2233494172042777e-05, + "loss": 0.3788, + "step": 23070 + }, + { + "epoch": 0.42833326526141474, + "grad_norm": 0.2549493610858917, + "learning_rate": 1.2232357133553812e-05, + "loss": 0.2205, + "step": 23072 + }, + { + "epoch": 0.42837039539883337, + "grad_norm": 0.4327949285507202, + "learning_rate": 1.2231220064689153e-05, + "loss": 0.4365, + "step": 23074 + }, + { + "epoch": 0.428407525536252, + "grad_norm": 0.2787284553050995, + "learning_rate": 1.2230082965464271e-05, + "loss": 0.3685, + "step": 23076 + }, + { + "epoch": 0.4284446556736706, + "grad_norm": 0.3058987855911255, + "learning_rate": 1.2228945835894644e-05, + "loss": 0.3338, + "step": 23078 + }, + { + "epoch": 0.4284817858110893, + "grad_norm": 0.39583471417427063, + "learning_rate": 1.2227808675995738e-05, + "loss": 0.2665, + "step": 23080 + }, + { + "epoch": 0.42851891594850794, + "grad_norm": 0.34660831093788147, + "learning_rate": 1.2226671485783028e-05, + "loss": 0.3431, + "step": 23082 + }, + { + "epoch": 0.42855604608592657, + "grad_norm": 0.349342405796051, + "learning_rate": 1.222553426527199e-05, + "loss": 0.2833, + "step": 23084 + }, + { + "epoch": 0.4285931762233452, + "grad_norm": 0.39499565958976746, + "learning_rate": 1.2224397014478098e-05, + "loss": 0.3762, + "step": 23086 + }, + { + "epoch": 0.4286303063607638, + "grad_norm": 0.46083858609199524, + "learning_rate": 1.2223259733416823e-05, + "loss": 0.2188, + "step": 23088 + }, + { + "epoch": 0.42866743649818245, + "grad_norm": 3.4553961753845215, + "learning_rate": 1.2222122422103646e-05, + "loss": 0.3337, + "step": 23090 + }, + { + "epoch": 0.42870456663560114, + "grad_norm": 0.4417554438114166, + "learning_rate": 1.2220985080554035e-05, + "loss": 0.2312, + "step": 23092 + }, + { + "epoch": 0.42874169677301976, + "grad_norm": 0.24031218886375427, + "learning_rate": 1.221984770878347e-05, + "loss": 0.3668, + "step": 23094 + }, + { + "epoch": 0.4287788269104384, + "grad_norm": 0.3566148579120636, + "learning_rate": 1.2218710306807429e-05, + "loss": 0.3607, + "step": 23096 + }, + { + "epoch": 0.428815957047857, + "grad_norm": 0.43841347098350525, + "learning_rate": 1.2217572874641386e-05, + "loss": 0.3739, + "step": 23098 + }, + { + "epoch": 0.42885308718527565, + "grad_norm": 0.3723064363002777, + "learning_rate": 1.2216435412300818e-05, + "loss": 0.3829, + "step": 23100 + }, + { + "epoch": 0.42889021732269433, + "grad_norm": 0.36878836154937744, + "learning_rate": 1.22152979198012e-05, + "loss": 0.116, + "step": 23102 + }, + { + "epoch": 0.42892734746011296, + "grad_norm": 0.4086867868900299, + "learning_rate": 1.221416039715802e-05, + "loss": 0.1755, + "step": 23104 + }, + { + "epoch": 0.4289644775975316, + "grad_norm": 0.7433109283447266, + "learning_rate": 1.2213022844386743e-05, + "loss": 0.2151, + "step": 23106 + }, + { + "epoch": 0.4290016077349502, + "grad_norm": 0.3579424321651459, + "learning_rate": 1.2211885261502855e-05, + "loss": 0.329, + "step": 23108 + }, + { + "epoch": 0.42903873787236885, + "grad_norm": 0.5112415552139282, + "learning_rate": 1.2210747648521837e-05, + "loss": 0.2966, + "step": 23110 + }, + { + "epoch": 0.42907586800978753, + "grad_norm": 0.3081870377063751, + "learning_rate": 1.2209610005459164e-05, + "loss": 0.2573, + "step": 23112 + }, + { + "epoch": 0.42911299814720616, + "grad_norm": 0.32011377811431885, + "learning_rate": 1.2208472332330318e-05, + "loss": 0.2732, + "step": 23114 + }, + { + "epoch": 0.4291501282846248, + "grad_norm": 0.5502941012382507, + "learning_rate": 1.2207334629150776e-05, + "loss": 0.445, + "step": 23116 + }, + { + "epoch": 0.4291872584220434, + "grad_norm": 0.43634793162345886, + "learning_rate": 1.2206196895936023e-05, + "loss": 0.3416, + "step": 23118 + }, + { + "epoch": 0.42922438855946204, + "grad_norm": 0.3453655242919922, + "learning_rate": 1.220505913270154e-05, + "loss": 0.2658, + "step": 23120 + }, + { + "epoch": 0.4292615186968807, + "grad_norm": 0.35604044795036316, + "learning_rate": 1.2203921339462805e-05, + "loss": 0.3373, + "step": 23122 + }, + { + "epoch": 0.42929864883429936, + "grad_norm": 0.401803195476532, + "learning_rate": 1.2202783516235303e-05, + "loss": 0.2712, + "step": 23124 + }, + { + "epoch": 0.429335778971718, + "grad_norm": 0.3024548292160034, + "learning_rate": 1.2201645663034514e-05, + "loss": 0.4558, + "step": 23126 + }, + { + "epoch": 0.4293729091091366, + "grad_norm": 0.8156460523605347, + "learning_rate": 1.2200507779875925e-05, + "loss": 0.375, + "step": 23128 + }, + { + "epoch": 0.42941003924655524, + "grad_norm": 0.3897463381290436, + "learning_rate": 1.2199369866775012e-05, + "loss": 0.4592, + "step": 23130 + }, + { + "epoch": 0.42944716938397387, + "grad_norm": 0.3447442054748535, + "learning_rate": 1.2198231923747267e-05, + "loss": 0.362, + "step": 23132 + }, + { + "epoch": 0.42948429952139255, + "grad_norm": 0.4403630495071411, + "learning_rate": 1.2197093950808173e-05, + "loss": 0.3008, + "step": 23134 + }, + { + "epoch": 0.4295214296588112, + "grad_norm": 0.33020108938217163, + "learning_rate": 1.2195955947973203e-05, + "loss": 0.3821, + "step": 23136 + }, + { + "epoch": 0.4295585597962298, + "grad_norm": 0.3349105417728424, + "learning_rate": 1.2194817915257855e-05, + "loss": 0.1456, + "step": 23138 + }, + { + "epoch": 0.42959568993364844, + "grad_norm": 0.3247455060482025, + "learning_rate": 1.219367985267761e-05, + "loss": 0.2925, + "step": 23140 + }, + { + "epoch": 0.42963282007106707, + "grad_norm": 0.2528064548969269, + "learning_rate": 1.2192541760247953e-05, + "loss": 0.3945, + "step": 23142 + }, + { + "epoch": 0.4296699502084857, + "grad_norm": 0.31630587577819824, + "learning_rate": 1.2191403637984367e-05, + "loss": 0.2378, + "step": 23144 + }, + { + "epoch": 0.4297070803459044, + "grad_norm": 0.23515333235263824, + "learning_rate": 1.2190265485902343e-05, + "loss": 0.2519, + "step": 23146 + }, + { + "epoch": 0.429744210483323, + "grad_norm": 0.45384711027145386, + "learning_rate": 1.2189127304017367e-05, + "loss": 0.4155, + "step": 23148 + }, + { + "epoch": 0.42978134062074164, + "grad_norm": 0.28183314204216003, + "learning_rate": 1.2187989092344925e-05, + "loss": 0.2654, + "step": 23150 + }, + { + "epoch": 0.42981847075816026, + "grad_norm": 0.5351614356040955, + "learning_rate": 1.2186850850900507e-05, + "loss": 0.3583, + "step": 23152 + }, + { + "epoch": 0.4298556008955789, + "grad_norm": 0.34302783012390137, + "learning_rate": 1.2185712579699598e-05, + "loss": 0.1633, + "step": 23154 + }, + { + "epoch": 0.4298927310329976, + "grad_norm": 0.18280485272407532, + "learning_rate": 1.2184574278757688e-05, + "loss": 0.1829, + "step": 23156 + }, + { + "epoch": 0.4299298611704162, + "grad_norm": 0.44055184721946716, + "learning_rate": 1.2183435948090263e-05, + "loss": 0.2283, + "step": 23158 + }, + { + "epoch": 0.42996699130783483, + "grad_norm": 0.3919624090194702, + "learning_rate": 1.218229758771282e-05, + "loss": 0.2851, + "step": 23160 + }, + { + "epoch": 0.43000412144525346, + "grad_norm": 0.32525837421417236, + "learning_rate": 1.2181159197640838e-05, + "loss": 0.4172, + "step": 23162 + }, + { + "epoch": 0.4300412515826721, + "grad_norm": 0.436370313167572, + "learning_rate": 1.2180020777889815e-05, + "loss": 0.2238, + "step": 23164 + }, + { + "epoch": 0.4300783817200907, + "grad_norm": 0.3954935073852539, + "learning_rate": 1.2178882328475244e-05, + "loss": 0.3807, + "step": 23166 + }, + { + "epoch": 0.4301155118575094, + "grad_norm": 0.3737346827983856, + "learning_rate": 1.2177743849412605e-05, + "loss": 0.3215, + "step": 23168 + }, + { + "epoch": 0.43015264199492803, + "grad_norm": 0.5883674025535583, + "learning_rate": 1.2176605340717395e-05, + "loss": 0.4364, + "step": 23170 + }, + { + "epoch": 0.43018977213234666, + "grad_norm": 0.391816109418869, + "learning_rate": 1.217546680240511e-05, + "loss": 0.2544, + "step": 23172 + }, + { + "epoch": 0.4302269022697653, + "grad_norm": 0.5665327310562134, + "learning_rate": 1.2174328234491235e-05, + "loss": 0.251, + "step": 23174 + }, + { + "epoch": 0.4302640324071839, + "grad_norm": 0.36060065031051636, + "learning_rate": 1.2173189636991266e-05, + "loss": 0.2896, + "step": 23176 + }, + { + "epoch": 0.4303011625446026, + "grad_norm": 0.40805038809776306, + "learning_rate": 1.2172051009920698e-05, + "loss": 0.5056, + "step": 23178 + }, + { + "epoch": 0.43033829268202123, + "grad_norm": 0.29368507862091064, + "learning_rate": 1.2170912353295017e-05, + "loss": 0.2301, + "step": 23180 + }, + { + "epoch": 0.43037542281943986, + "grad_norm": 0.2922075688838959, + "learning_rate": 1.2169773667129725e-05, + "loss": 0.3532, + "step": 23182 + }, + { + "epoch": 0.4304125529568585, + "grad_norm": 0.374648779630661, + "learning_rate": 1.2168634951440312e-05, + "loss": 0.2379, + "step": 23184 + }, + { + "epoch": 0.4304496830942771, + "grad_norm": 0.38597720861434937, + "learning_rate": 1.2167496206242276e-05, + "loss": 0.3382, + "step": 23186 + }, + { + "epoch": 0.4304868132316958, + "grad_norm": 0.3847152292728424, + "learning_rate": 1.2166357431551103e-05, + "loss": 0.3534, + "step": 23188 + }, + { + "epoch": 0.4305239433691144, + "grad_norm": 0.40399467945098877, + "learning_rate": 1.2165218627382296e-05, + "loss": 0.3624, + "step": 23190 + }, + { + "epoch": 0.43056107350653305, + "grad_norm": 0.3807084560394287, + "learning_rate": 1.2164079793751353e-05, + "loss": 0.2474, + "step": 23192 + }, + { + "epoch": 0.4305982036439517, + "grad_norm": 0.25879350304603577, + "learning_rate": 1.216294093067376e-05, + "loss": 0.2076, + "step": 23194 + }, + { + "epoch": 0.4306353337813703, + "grad_norm": 0.36035412549972534, + "learning_rate": 1.2161802038165028e-05, + "loss": 0.3596, + "step": 23196 + }, + { + "epoch": 0.43067246391878894, + "grad_norm": 0.39120063185691833, + "learning_rate": 1.2160663116240641e-05, + "loss": 0.3354, + "step": 23198 + }, + { + "epoch": 0.4307095940562076, + "grad_norm": 0.2907465696334839, + "learning_rate": 1.2159524164916099e-05, + "loss": 0.2905, + "step": 23200 + }, + { + "epoch": 0.43074672419362625, + "grad_norm": 0.4556128978729248, + "learning_rate": 1.2158385184206902e-05, + "loss": 0.3241, + "step": 23202 + }, + { + "epoch": 0.4307838543310449, + "grad_norm": 0.4247533082962036, + "learning_rate": 1.2157246174128553e-05, + "loss": 0.3166, + "step": 23204 + }, + { + "epoch": 0.4308209844684635, + "grad_norm": 0.47747036814689636, + "learning_rate": 1.2156107134696543e-05, + "loss": 0.182, + "step": 23206 + }, + { + "epoch": 0.43085811460588214, + "grad_norm": 0.4764822721481323, + "learning_rate": 1.2154968065926369e-05, + "loss": 0.3656, + "step": 23208 + }, + { + "epoch": 0.4308952447433008, + "grad_norm": 0.3703691065311432, + "learning_rate": 1.2153828967833539e-05, + "loss": 0.2498, + "step": 23210 + }, + { + "epoch": 0.43093237488071945, + "grad_norm": 0.3465287387371063, + "learning_rate": 1.2152689840433545e-05, + "loss": 0.354, + "step": 23212 + }, + { + "epoch": 0.4309695050181381, + "grad_norm": 0.42818284034729004, + "learning_rate": 1.215155068374189e-05, + "loss": 0.4737, + "step": 23214 + }, + { + "epoch": 0.4310066351555567, + "grad_norm": 0.37709662318229675, + "learning_rate": 1.2150411497774077e-05, + "loss": 0.2949, + "step": 23216 + }, + { + "epoch": 0.43104376529297533, + "grad_norm": 0.4030877649784088, + "learning_rate": 1.2149272282545608e-05, + "loss": 0.2289, + "step": 23218 + }, + { + "epoch": 0.43108089543039396, + "grad_norm": 0.3285463750362396, + "learning_rate": 1.2148133038071976e-05, + "loss": 0.1606, + "step": 23220 + }, + { + "epoch": 0.43111802556781265, + "grad_norm": 0.2906420826911926, + "learning_rate": 1.2146993764368688e-05, + "loss": 0.2696, + "step": 23222 + }, + { + "epoch": 0.4311551557052313, + "grad_norm": 0.319789320230484, + "learning_rate": 1.2145854461451248e-05, + "loss": 0.2895, + "step": 23224 + }, + { + "epoch": 0.4311922858426499, + "grad_norm": 0.5244007110595703, + "learning_rate": 1.2144715129335157e-05, + "loss": 0.315, + "step": 23226 + }, + { + "epoch": 0.43122941598006853, + "grad_norm": 0.35446733236312866, + "learning_rate": 1.2143575768035914e-05, + "loss": 0.4828, + "step": 23228 + }, + { + "epoch": 0.43126654611748716, + "grad_norm": 0.6110371351242065, + "learning_rate": 1.2142436377569027e-05, + "loss": 0.291, + "step": 23230 + }, + { + "epoch": 0.43130367625490584, + "grad_norm": 0.38392752408981323, + "learning_rate": 1.2141296957949997e-05, + "loss": 0.3001, + "step": 23232 + }, + { + "epoch": 0.43134080639232447, + "grad_norm": 0.2700739800930023, + "learning_rate": 1.214015750919433e-05, + "loss": 0.261, + "step": 23234 + }, + { + "epoch": 0.4313779365297431, + "grad_norm": 0.38205358386039734, + "learning_rate": 1.2139018031317533e-05, + "loss": 0.2886, + "step": 23236 + }, + { + "epoch": 0.43141506666716173, + "grad_norm": 0.42201003432273865, + "learning_rate": 1.2137878524335106e-05, + "loss": 0.2207, + "step": 23238 + }, + { + "epoch": 0.43145219680458036, + "grad_norm": 0.4867798686027527, + "learning_rate": 1.2136738988262555e-05, + "loss": 0.2635, + "step": 23240 + }, + { + "epoch": 0.431489326941999, + "grad_norm": 0.38359594345092773, + "learning_rate": 1.2135599423115387e-05, + "loss": 0.3509, + "step": 23242 + }, + { + "epoch": 0.43152645707941767, + "grad_norm": 0.33045750856399536, + "learning_rate": 1.2134459828909107e-05, + "loss": 0.5437, + "step": 23244 + }, + { + "epoch": 0.4315635872168363, + "grad_norm": 0.35055261850357056, + "learning_rate": 1.2133320205659223e-05, + "loss": 0.2281, + "step": 23246 + }, + { + "epoch": 0.4316007173542549, + "grad_norm": 0.4190187156200409, + "learning_rate": 1.2132180553381242e-05, + "loss": 0.3006, + "step": 23248 + }, + { + "epoch": 0.43163784749167355, + "grad_norm": 0.5887752771377563, + "learning_rate": 1.213104087209067e-05, + "loss": 0.2004, + "step": 23250 + }, + { + "epoch": 0.4316749776290922, + "grad_norm": 0.2555970847606659, + "learning_rate": 1.2129901161803013e-05, + "loss": 0.167, + "step": 23252 + }, + { + "epoch": 0.43171210776651087, + "grad_norm": 0.5476435422897339, + "learning_rate": 1.2128761422533781e-05, + "loss": 0.5467, + "step": 23254 + }, + { + "epoch": 0.4317492379039295, + "grad_norm": 0.2246808111667633, + "learning_rate": 1.2127621654298482e-05, + "loss": 0.3526, + "step": 23256 + }, + { + "epoch": 0.4317863680413481, + "grad_norm": 0.4039890170097351, + "learning_rate": 1.2126481857112629e-05, + "loss": 0.3415, + "step": 23258 + }, + { + "epoch": 0.43182349817876675, + "grad_norm": 0.4147579073905945, + "learning_rate": 1.2125342030991726e-05, + "loss": 0.3932, + "step": 23260 + }, + { + "epoch": 0.4318606283161854, + "grad_norm": 0.3487914204597473, + "learning_rate": 1.2124202175951283e-05, + "loss": 0.3348, + "step": 23262 + }, + { + "epoch": 0.43189775845360406, + "grad_norm": 0.3068159818649292, + "learning_rate": 1.2123062292006811e-05, + "loss": 0.2338, + "step": 23264 + }, + { + "epoch": 0.4319348885910227, + "grad_norm": 0.42535725235939026, + "learning_rate": 1.2121922379173818e-05, + "loss": 0.3939, + "step": 23266 + }, + { + "epoch": 0.4319720187284413, + "grad_norm": 0.5371565818786621, + "learning_rate": 1.2120782437467821e-05, + "loss": 0.3613, + "step": 23268 + }, + { + "epoch": 0.43200914886585995, + "grad_norm": 0.39797282218933105, + "learning_rate": 1.2119642466904328e-05, + "loss": 0.3979, + "step": 23270 + }, + { + "epoch": 0.4320462790032786, + "grad_norm": 0.44049230217933655, + "learning_rate": 1.2118502467498849e-05, + "loss": 0.5261, + "step": 23272 + }, + { + "epoch": 0.4320834091406972, + "grad_norm": 0.3842976689338684, + "learning_rate": 1.2117362439266898e-05, + "loss": 0.3972, + "step": 23274 + }, + { + "epoch": 0.4321205392781159, + "grad_norm": 0.48244377970695496, + "learning_rate": 1.2116222382223983e-05, + "loss": 0.1613, + "step": 23276 + }, + { + "epoch": 0.4321576694155345, + "grad_norm": 0.23082970082759857, + "learning_rate": 1.2115082296385623e-05, + "loss": 0.2896, + "step": 23278 + }, + { + "epoch": 0.43219479955295315, + "grad_norm": 0.4386811852455139, + "learning_rate": 1.211394218176733e-05, + "loss": 0.2761, + "step": 23280 + }, + { + "epoch": 0.4322319296903718, + "grad_norm": 0.3757105767726898, + "learning_rate": 1.2112802038384612e-05, + "loss": 0.3827, + "step": 23282 + }, + { + "epoch": 0.4322690598277904, + "grad_norm": 0.6001486778259277, + "learning_rate": 1.2111661866252988e-05, + "loss": 0.5113, + "step": 23284 + }, + { + "epoch": 0.4323061899652091, + "grad_norm": 0.38551756739616394, + "learning_rate": 1.2110521665387973e-05, + "loss": 0.309, + "step": 23286 + }, + { + "epoch": 0.4323433201026277, + "grad_norm": 0.3928928077220917, + "learning_rate": 1.210938143580508e-05, + "loss": 0.4282, + "step": 23288 + }, + { + "epoch": 0.43238045024004634, + "grad_norm": 0.5995832085609436, + "learning_rate": 1.2108241177519821e-05, + "loss": 0.2396, + "step": 23290 + }, + { + "epoch": 0.43241758037746497, + "grad_norm": 0.331207275390625, + "learning_rate": 1.2107100890547718e-05, + "loss": 0.3683, + "step": 23292 + }, + { + "epoch": 0.4324547105148836, + "grad_norm": 0.44506579637527466, + "learning_rate": 1.2105960574904282e-05, + "loss": 0.3635, + "step": 23294 + }, + { + "epoch": 0.43249184065230223, + "grad_norm": 0.3518659472465515, + "learning_rate": 1.2104820230605028e-05, + "loss": 0.2474, + "step": 23296 + }, + { + "epoch": 0.4325289707897209, + "grad_norm": 0.2805148959159851, + "learning_rate": 1.2103679857665477e-05, + "loss": 0.3096, + "step": 23298 + }, + { + "epoch": 0.43256610092713954, + "grad_norm": 0.40993937849998474, + "learning_rate": 1.2102539456101145e-05, + "loss": 0.36, + "step": 23300 + }, + { + "epoch": 0.43260323106455817, + "grad_norm": 0.29210585355758667, + "learning_rate": 1.210139902592755e-05, + "loss": 0.2525, + "step": 23302 + }, + { + "epoch": 0.4326403612019768, + "grad_norm": 0.2924707233905792, + "learning_rate": 1.2100258567160207e-05, + "loss": 0.1766, + "step": 23304 + }, + { + "epoch": 0.4326774913393954, + "grad_norm": 0.5818803310394287, + "learning_rate": 1.2099118079814636e-05, + "loss": 0.3899, + "step": 23306 + }, + { + "epoch": 0.4327146214768141, + "grad_norm": 0.5296061635017395, + "learning_rate": 1.2097977563906356e-05, + "loss": 0.177, + "step": 23308 + }, + { + "epoch": 0.43275175161423274, + "grad_norm": 0.36435437202453613, + "learning_rate": 1.2096837019450884e-05, + "loss": 0.3098, + "step": 23310 + }, + { + "epoch": 0.43278888175165137, + "grad_norm": 0.49574407935142517, + "learning_rate": 1.2095696446463746e-05, + "loss": 0.3023, + "step": 23312 + }, + { + "epoch": 0.43282601188907, + "grad_norm": 0.3840111196041107, + "learning_rate": 1.2094555844960452e-05, + "loss": 0.3476, + "step": 23314 + }, + { + "epoch": 0.4328631420264886, + "grad_norm": 0.3309880197048187, + "learning_rate": 1.2093415214956526e-05, + "loss": 0.1849, + "step": 23316 + }, + { + "epoch": 0.43290027216390725, + "grad_norm": 0.3271883726119995, + "learning_rate": 1.2092274556467492e-05, + "loss": 0.1865, + "step": 23318 + }, + { + "epoch": 0.43293740230132594, + "grad_norm": 0.4441545903682709, + "learning_rate": 1.2091133869508867e-05, + "loss": 0.3206, + "step": 23320 + }, + { + "epoch": 0.43297453243874456, + "grad_norm": 0.3447020351886749, + "learning_rate": 1.2089993154096176e-05, + "loss": 0.2285, + "step": 23322 + }, + { + "epoch": 0.4330116625761632, + "grad_norm": 0.36402827501296997, + "learning_rate": 1.2088852410244939e-05, + "loss": 0.2962, + "step": 23324 + }, + { + "epoch": 0.4330487927135818, + "grad_norm": 0.4426742494106293, + "learning_rate": 1.2087711637970672e-05, + "loss": 0.2773, + "step": 23326 + }, + { + "epoch": 0.43308592285100045, + "grad_norm": 0.30313706398010254, + "learning_rate": 1.2086570837288904e-05, + "loss": 0.4364, + "step": 23328 + }, + { + "epoch": 0.43312305298841913, + "grad_norm": 0.25074154138565063, + "learning_rate": 1.2085430008215162e-05, + "loss": 0.3632, + "step": 23330 + }, + { + "epoch": 0.43316018312583776, + "grad_norm": 0.2982826232910156, + "learning_rate": 1.208428915076496e-05, + "loss": 0.235, + "step": 23332 + }, + { + "epoch": 0.4331973132632564, + "grad_norm": 0.2748717665672302, + "learning_rate": 1.2083148264953828e-05, + "loss": 0.1569, + "step": 23334 + }, + { + "epoch": 0.433234443400675, + "grad_norm": 0.36050736904144287, + "learning_rate": 1.2082007350797289e-05, + "loss": 0.308, + "step": 23336 + }, + { + "epoch": 0.43327157353809365, + "grad_norm": 0.41918471455574036, + "learning_rate": 1.2080866408310864e-05, + "loss": 0.2041, + "step": 23338 + }, + { + "epoch": 0.43330870367551233, + "grad_norm": 0.39146679639816284, + "learning_rate": 1.2079725437510077e-05, + "loss": 0.2156, + "step": 23340 + }, + { + "epoch": 0.43334583381293096, + "grad_norm": 0.580489456653595, + "learning_rate": 1.2078584438410462e-05, + "loss": 0.2443, + "step": 23342 + }, + { + "epoch": 0.4333829639503496, + "grad_norm": 0.38199475407600403, + "learning_rate": 1.2077443411027539e-05, + "loss": 0.2843, + "step": 23344 + }, + { + "epoch": 0.4334200940877682, + "grad_norm": 0.3835553824901581, + "learning_rate": 1.2076302355376827e-05, + "loss": 0.1446, + "step": 23346 + }, + { + "epoch": 0.43345722422518684, + "grad_norm": 0.4048469066619873, + "learning_rate": 1.2075161271473866e-05, + "loss": 0.523, + "step": 23348 + }, + { + "epoch": 0.43349435436260547, + "grad_norm": 0.19754807651042938, + "learning_rate": 1.2074020159334175e-05, + "loss": 0.1556, + "step": 23350 + }, + { + "epoch": 0.43353148450002416, + "grad_norm": 0.45872822403907776, + "learning_rate": 1.2072879018973277e-05, + "loss": 0.4246, + "step": 23352 + }, + { + "epoch": 0.4335686146374428, + "grad_norm": 0.5624155402183533, + "learning_rate": 1.2071737850406711e-05, + "loss": 0.3328, + "step": 23354 + }, + { + "epoch": 0.4336057447748614, + "grad_norm": 0.567057728767395, + "learning_rate": 1.2070596653649996e-05, + "loss": 0.2451, + "step": 23356 + }, + { + "epoch": 0.43364287491228004, + "grad_norm": 0.33670100569725037, + "learning_rate": 1.206945542871866e-05, + "loss": 0.4138, + "step": 23358 + }, + { + "epoch": 0.43368000504969867, + "grad_norm": 0.2645980417728424, + "learning_rate": 1.2068314175628237e-05, + "loss": 0.3229, + "step": 23360 + }, + { + "epoch": 0.43371713518711735, + "grad_norm": 0.4024185240268707, + "learning_rate": 1.2067172894394254e-05, + "loss": 0.2547, + "step": 23362 + }, + { + "epoch": 0.433754265324536, + "grad_norm": 0.3469744026660919, + "learning_rate": 1.2066031585032239e-05, + "loss": 0.2468, + "step": 23364 + }, + { + "epoch": 0.4337913954619546, + "grad_norm": 0.37204432487487793, + "learning_rate": 1.2064890247557721e-05, + "loss": 0.3668, + "step": 23366 + }, + { + "epoch": 0.43382852559937324, + "grad_norm": 0.46184876561164856, + "learning_rate": 1.2063748881986235e-05, + "loss": 0.2843, + "step": 23368 + }, + { + "epoch": 0.43386565573679187, + "grad_norm": 0.419368177652359, + "learning_rate": 1.2062607488333306e-05, + "loss": 0.1764, + "step": 23370 + }, + { + "epoch": 0.4339027858742105, + "grad_norm": 0.330716609954834, + "learning_rate": 1.2061466066614467e-05, + "loss": 0.4845, + "step": 23372 + }, + { + "epoch": 0.4339399160116292, + "grad_norm": 0.55287104845047, + "learning_rate": 1.2060324616845252e-05, + "loss": 0.2753, + "step": 23374 + }, + { + "epoch": 0.4339770461490478, + "grad_norm": 0.44660496711730957, + "learning_rate": 1.2059183139041188e-05, + "loss": 0.2021, + "step": 23376 + }, + { + "epoch": 0.43401417628646644, + "grad_norm": 0.4001522958278656, + "learning_rate": 1.2058041633217812e-05, + "loss": 0.3987, + "step": 23378 + }, + { + "epoch": 0.43405130642388506, + "grad_norm": 0.45645761489868164, + "learning_rate": 1.2056900099390651e-05, + "loss": 0.3296, + "step": 23380 + }, + { + "epoch": 0.4340884365613037, + "grad_norm": 0.5197969079017639, + "learning_rate": 1.2055758537575241e-05, + "loss": 0.4612, + "step": 23382 + }, + { + "epoch": 0.4341255666987224, + "grad_norm": 0.36933034658432007, + "learning_rate": 1.2054616947787117e-05, + "loss": 0.3072, + "step": 23384 + }, + { + "epoch": 0.434162696836141, + "grad_norm": 0.3709254562854767, + "learning_rate": 1.2053475330041811e-05, + "loss": 0.2236, + "step": 23386 + }, + { + "epoch": 0.43419982697355963, + "grad_norm": 0.31581318378448486, + "learning_rate": 1.2052333684354856e-05, + "loss": 0.349, + "step": 23388 + }, + { + "epoch": 0.43423695711097826, + "grad_norm": 0.30725303292274475, + "learning_rate": 1.2051192010741786e-05, + "loss": 0.3808, + "step": 23390 + }, + { + "epoch": 0.4342740872483969, + "grad_norm": 0.34022271633148193, + "learning_rate": 1.2050050309218136e-05, + "loss": 0.2695, + "step": 23392 + }, + { + "epoch": 0.4343112173858155, + "grad_norm": 0.26789337396621704, + "learning_rate": 1.2048908579799444e-05, + "loss": 0.306, + "step": 23394 + }, + { + "epoch": 0.4343483475232342, + "grad_norm": 0.3022186756134033, + "learning_rate": 1.2047766822501244e-05, + "loss": 0.3314, + "step": 23396 + }, + { + "epoch": 0.43438547766065283, + "grad_norm": 0.3700431287288666, + "learning_rate": 1.2046625037339067e-05, + "loss": 0.3632, + "step": 23398 + }, + { + "epoch": 0.43442260779807146, + "grad_norm": 0.37892642617225647, + "learning_rate": 1.2045483224328458e-05, + "loss": 0.0968, + "step": 23400 + }, + { + "epoch": 0.4344597379354901, + "grad_norm": 0.5065642595291138, + "learning_rate": 1.2044341383484946e-05, + "loss": 0.448, + "step": 23402 + }, + { + "epoch": 0.4344968680729087, + "grad_norm": 0.4491575360298157, + "learning_rate": 1.2043199514824071e-05, + "loss": 0.4688, + "step": 23404 + }, + { + "epoch": 0.4345339982103274, + "grad_norm": 0.31397414207458496, + "learning_rate": 1.2042057618361374e-05, + "loss": 0.2925, + "step": 23406 + }, + { + "epoch": 0.434571128347746, + "grad_norm": 0.25237804651260376, + "learning_rate": 1.204091569411239e-05, + "loss": 0.4011, + "step": 23408 + }, + { + "epoch": 0.43460825848516466, + "grad_norm": 0.34165745973587036, + "learning_rate": 1.2039773742092654e-05, + "loss": 0.1067, + "step": 23410 + }, + { + "epoch": 0.4346453886225833, + "grad_norm": 0.45153725147247314, + "learning_rate": 1.2038631762317708e-05, + "loss": 0.454, + "step": 23412 + }, + { + "epoch": 0.4346825187600019, + "grad_norm": 0.4539611041545868, + "learning_rate": 1.2037489754803088e-05, + "loss": 0.4256, + "step": 23414 + }, + { + "epoch": 0.4347196488974206, + "grad_norm": 0.4070637822151184, + "learning_rate": 1.2036347719564337e-05, + "loss": 0.2729, + "step": 23416 + }, + { + "epoch": 0.4347567790348392, + "grad_norm": 0.6854671835899353, + "learning_rate": 1.2035205656616998e-05, + "loss": 0.2036, + "step": 23418 + }, + { + "epoch": 0.43479390917225785, + "grad_norm": 0.28789129853248596, + "learning_rate": 1.20340635659766e-05, + "loss": 0.2542, + "step": 23420 + }, + { + "epoch": 0.4348310393096765, + "grad_norm": 0.43835821747779846, + "learning_rate": 1.203292144765869e-05, + "loss": 0.316, + "step": 23422 + }, + { + "epoch": 0.4348681694470951, + "grad_norm": 0.3816711902618408, + "learning_rate": 1.2031779301678812e-05, + "loss": 0.1848, + "step": 23424 + }, + { + "epoch": 0.43490529958451374, + "grad_norm": 0.38157516717910767, + "learning_rate": 1.2030637128052499e-05, + "loss": 0.2261, + "step": 23426 + }, + { + "epoch": 0.4349424297219324, + "grad_norm": 0.5143613815307617, + "learning_rate": 1.20294949267953e-05, + "loss": 0.3824, + "step": 23428 + }, + { + "epoch": 0.43497955985935105, + "grad_norm": 0.4589637219905853, + "learning_rate": 1.2028352697922757e-05, + "loss": 0.334, + "step": 23430 + }, + { + "epoch": 0.4350166899967697, + "grad_norm": 0.30158838629722595, + "learning_rate": 1.2027210441450404e-05, + "loss": 0.4638, + "step": 23432 + }, + { + "epoch": 0.4350538201341883, + "grad_norm": 0.37864673137664795, + "learning_rate": 1.202606815739379e-05, + "loss": 0.4138, + "step": 23434 + }, + { + "epoch": 0.43509095027160694, + "grad_norm": 0.3741653263568878, + "learning_rate": 1.2024925845768457e-05, + "loss": 0.2642, + "step": 23436 + }, + { + "epoch": 0.4351280804090256, + "grad_norm": 0.5454931259155273, + "learning_rate": 1.202378350658995e-05, + "loss": 0.2884, + "step": 23438 + }, + { + "epoch": 0.43516521054644425, + "grad_norm": 0.3262859880924225, + "learning_rate": 1.2022641139873811e-05, + "loss": 0.3105, + "step": 23440 + }, + { + "epoch": 0.4352023406838629, + "grad_norm": 0.31813907623291016, + "learning_rate": 1.2021498745635585e-05, + "loss": 0.2357, + "step": 23442 + }, + { + "epoch": 0.4352394708212815, + "grad_norm": 0.39694613218307495, + "learning_rate": 1.2020356323890815e-05, + "loss": 0.315, + "step": 23444 + }, + { + "epoch": 0.43527660095870013, + "grad_norm": 0.4749318063259125, + "learning_rate": 1.201921387465505e-05, + "loss": 0.4181, + "step": 23446 + }, + { + "epoch": 0.43531373109611876, + "grad_norm": 0.25391390919685364, + "learning_rate": 1.2018071397943829e-05, + "loss": 0.332, + "step": 23448 + }, + { + "epoch": 0.43535086123353745, + "grad_norm": 0.3346829414367676, + "learning_rate": 1.2016928893772706e-05, + "loss": 0.4412, + "step": 23450 + }, + { + "epoch": 0.4353879913709561, + "grad_norm": 0.30323755741119385, + "learning_rate": 1.2015786362157215e-05, + "loss": 0.3074, + "step": 23452 + }, + { + "epoch": 0.4354251215083747, + "grad_norm": 0.3684828281402588, + "learning_rate": 1.2014643803112916e-05, + "loss": 0.3547, + "step": 23454 + }, + { + "epoch": 0.43546225164579333, + "grad_norm": 0.4753077030181885, + "learning_rate": 1.2013501216655345e-05, + "loss": 0.3697, + "step": 23456 + }, + { + "epoch": 0.43549938178321196, + "grad_norm": 0.4252491295337677, + "learning_rate": 1.2012358602800056e-05, + "loss": 0.2645, + "step": 23458 + }, + { + "epoch": 0.43553651192063064, + "grad_norm": 0.391164630651474, + "learning_rate": 1.2011215961562595e-05, + "loss": 0.2626, + "step": 23460 + }, + { + "epoch": 0.43557364205804927, + "grad_norm": 0.33539018034935, + "learning_rate": 1.2010073292958506e-05, + "loss": 0.2308, + "step": 23462 + }, + { + "epoch": 0.4356107721954679, + "grad_norm": 0.19928646087646484, + "learning_rate": 1.2008930597003341e-05, + "loss": 0.287, + "step": 23464 + }, + { + "epoch": 0.4356479023328865, + "grad_norm": 0.5480028390884399, + "learning_rate": 1.2007787873712649e-05, + "loss": 0.2732, + "step": 23466 + }, + { + "epoch": 0.43568503247030516, + "grad_norm": 0.3385656774044037, + "learning_rate": 1.200664512310198e-05, + "loss": 0.4161, + "step": 23468 + }, + { + "epoch": 0.4357221626077238, + "grad_norm": 0.27215296030044556, + "learning_rate": 1.2005502345186878e-05, + "loss": 0.2598, + "step": 23470 + }, + { + "epoch": 0.43575929274514247, + "grad_norm": 0.2901736795902252, + "learning_rate": 1.2004359539982897e-05, + "loss": 0.2295, + "step": 23472 + }, + { + "epoch": 0.4357964228825611, + "grad_norm": 0.3861478567123413, + "learning_rate": 1.2003216707505587e-05, + "loss": 0.2296, + "step": 23474 + }, + { + "epoch": 0.4358335530199797, + "grad_norm": 0.42177239060401917, + "learning_rate": 1.2002073847770499e-05, + "loss": 0.1907, + "step": 23476 + }, + { + "epoch": 0.43587068315739835, + "grad_norm": 0.32222235202789307, + "learning_rate": 1.200093096079318e-05, + "loss": 0.165, + "step": 23478 + }, + { + "epoch": 0.435907813294817, + "grad_norm": 0.27451735734939575, + "learning_rate": 1.1999788046589188e-05, + "loss": 0.212, + "step": 23480 + }, + { + "epoch": 0.43594494343223567, + "grad_norm": 0.2757277488708496, + "learning_rate": 1.1998645105174069e-05, + "loss": 0.301, + "step": 23482 + }, + { + "epoch": 0.4359820735696543, + "grad_norm": 0.4462694525718689, + "learning_rate": 1.1997502136563375e-05, + "loss": 0.1161, + "step": 23484 + }, + { + "epoch": 0.4360192037070729, + "grad_norm": 0.39914610981941223, + "learning_rate": 1.1996359140772661e-05, + "loss": 0.2204, + "step": 23486 + }, + { + "epoch": 0.43605633384449155, + "grad_norm": 0.5718931555747986, + "learning_rate": 1.199521611781748e-05, + "loss": 0.0884, + "step": 23488 + }, + { + "epoch": 0.4360934639819102, + "grad_norm": 0.4829443097114563, + "learning_rate": 1.1994073067713383e-05, + "loss": 0.3983, + "step": 23490 + }, + { + "epoch": 0.43613059411932886, + "grad_norm": 0.3673272728919983, + "learning_rate": 1.1992929990475926e-05, + "loss": 0.2151, + "step": 23492 + }, + { + "epoch": 0.4361677242567475, + "grad_norm": 0.3773770034313202, + "learning_rate": 1.199178688612066e-05, + "loss": 0.1144, + "step": 23494 + }, + { + "epoch": 0.4362048543941661, + "grad_norm": 0.24898436665534973, + "learning_rate": 1.199064375466314e-05, + "loss": 0.2372, + "step": 23496 + }, + { + "epoch": 0.43624198453158475, + "grad_norm": 0.42697516083717346, + "learning_rate": 1.1989500596118921e-05, + "loss": 0.1878, + "step": 23498 + }, + { + "epoch": 0.4362791146690034, + "grad_norm": 0.4543319642543793, + "learning_rate": 1.1988357410503562e-05, + "loss": 0.2946, + "step": 23500 + }, + { + "epoch": 0.436316244806422, + "grad_norm": 0.228993758559227, + "learning_rate": 1.1987214197832611e-05, + "loss": 0.3176, + "step": 23502 + }, + { + "epoch": 0.4363533749438407, + "grad_norm": 0.4548444449901581, + "learning_rate": 1.1986070958121627e-05, + "loss": 0.4259, + "step": 23504 + }, + { + "epoch": 0.4363905050812593, + "grad_norm": 0.7008549571037292, + "learning_rate": 1.1984927691386167e-05, + "loss": 0.3127, + "step": 23506 + }, + { + "epoch": 0.43642763521867795, + "grad_norm": 0.3774678707122803, + "learning_rate": 1.1983784397641787e-05, + "loss": 0.3254, + "step": 23508 + }, + { + "epoch": 0.4364647653560966, + "grad_norm": 0.31446534395217896, + "learning_rate": 1.1982641076904042e-05, + "loss": 0.2699, + "step": 23510 + }, + { + "epoch": 0.4365018954935152, + "grad_norm": 0.3595420718193054, + "learning_rate": 1.1981497729188495e-05, + "loss": 0.3007, + "step": 23512 + }, + { + "epoch": 0.4365390256309339, + "grad_norm": 0.235508531332016, + "learning_rate": 1.1980354354510696e-05, + "loss": 0.2891, + "step": 23514 + }, + { + "epoch": 0.4365761557683525, + "grad_norm": 0.3506704270839691, + "learning_rate": 1.1979210952886205e-05, + "loss": 0.1837, + "step": 23516 + }, + { + "epoch": 0.43661328590577114, + "grad_norm": 0.30411526560783386, + "learning_rate": 1.1978067524330582e-05, + "loss": 0.2854, + "step": 23518 + }, + { + "epoch": 0.43665041604318977, + "grad_norm": 0.29133695363998413, + "learning_rate": 1.1976924068859384e-05, + "loss": 0.2478, + "step": 23520 + }, + { + "epoch": 0.4366875461806084, + "grad_norm": 0.39450761675834656, + "learning_rate": 1.1975780586488172e-05, + "loss": 0.2582, + "step": 23522 + }, + { + "epoch": 0.436724676318027, + "grad_norm": 0.2995562255382538, + "learning_rate": 1.1974637077232505e-05, + "loss": 0.2765, + "step": 23524 + }, + { + "epoch": 0.4367618064554457, + "grad_norm": 0.46240922808647156, + "learning_rate": 1.1973493541107944e-05, + "loss": 0.1299, + "step": 23526 + }, + { + "epoch": 0.43679893659286434, + "grad_norm": 0.8839894533157349, + "learning_rate": 1.1972349978130045e-05, + "loss": 0.2663, + "step": 23528 + }, + { + "epoch": 0.43683606673028297, + "grad_norm": 0.33137866854667664, + "learning_rate": 1.1971206388314368e-05, + "loss": 0.1237, + "step": 23530 + }, + { + "epoch": 0.4368731968677016, + "grad_norm": 0.7207096219062805, + "learning_rate": 1.197006277167648e-05, + "loss": 0.257, + "step": 23532 + }, + { + "epoch": 0.4369103270051202, + "grad_norm": 0.2938666343688965, + "learning_rate": 1.196891912823194e-05, + "loss": 0.2525, + "step": 23534 + }, + { + "epoch": 0.4369474571425389, + "grad_norm": 0.4168083369731903, + "learning_rate": 1.1967775457996304e-05, + "loss": 0.2091, + "step": 23536 + }, + { + "epoch": 0.43698458727995754, + "grad_norm": 0.39647307991981506, + "learning_rate": 1.1966631760985142e-05, + "loss": 0.2094, + "step": 23538 + }, + { + "epoch": 0.43702171741737617, + "grad_norm": 0.3388347029685974, + "learning_rate": 1.1965488037214011e-05, + "loss": 0.3695, + "step": 23540 + }, + { + "epoch": 0.4370588475547948, + "grad_norm": 0.33060359954833984, + "learning_rate": 1.1964344286698473e-05, + "loss": 0.2225, + "step": 23542 + }, + { + "epoch": 0.4370959776922134, + "grad_norm": 0.5126566886901855, + "learning_rate": 1.1963200509454096e-05, + "loss": 0.2986, + "step": 23544 + }, + { + "epoch": 0.43713310782963205, + "grad_norm": 0.37752553820610046, + "learning_rate": 1.196205670549644e-05, + "loss": 0.133, + "step": 23546 + }, + { + "epoch": 0.43717023796705073, + "grad_norm": 0.4504946768283844, + "learning_rate": 1.1960912874841068e-05, + "loss": 0.2805, + "step": 23548 + }, + { + "epoch": 0.43720736810446936, + "grad_norm": 0.335664302110672, + "learning_rate": 1.1959769017503548e-05, + "loss": 0.1968, + "step": 23550 + }, + { + "epoch": 0.437244498241888, + "grad_norm": 0.3718658983707428, + "learning_rate": 1.195862513349944e-05, + "loss": 0.3762, + "step": 23552 + }, + { + "epoch": 0.4372816283793066, + "grad_norm": 0.34397536516189575, + "learning_rate": 1.1957481222844309e-05, + "loss": 0.3097, + "step": 23554 + }, + { + "epoch": 0.43731875851672525, + "grad_norm": 0.38907334208488464, + "learning_rate": 1.1956337285553725e-05, + "loss": 0.1681, + "step": 23556 + }, + { + "epoch": 0.43735588865414393, + "grad_norm": 0.33358630537986755, + "learning_rate": 1.195519332164325e-05, + "loss": 0.4205, + "step": 23558 + }, + { + "epoch": 0.43739301879156256, + "grad_norm": 0.32918423414230347, + "learning_rate": 1.1954049331128447e-05, + "loss": 0.275, + "step": 23560 + }, + { + "epoch": 0.4374301489289812, + "grad_norm": 0.3052709400653839, + "learning_rate": 1.1952905314024887e-05, + "loss": 0.3273, + "step": 23562 + }, + { + "epoch": 0.4374672790663998, + "grad_norm": 0.2730685770511627, + "learning_rate": 1.1951761270348139e-05, + "loss": 0.2047, + "step": 23564 + }, + { + "epoch": 0.43750440920381845, + "grad_norm": 0.4017612636089325, + "learning_rate": 1.1950617200113765e-05, + "loss": 0.2397, + "step": 23566 + }, + { + "epoch": 0.43754153934123713, + "grad_norm": 0.3625796139240265, + "learning_rate": 1.1949473103337333e-05, + "loss": 0.427, + "step": 23568 + }, + { + "epoch": 0.43757866947865576, + "grad_norm": 0.30939817428588867, + "learning_rate": 1.1948328980034412e-05, + "loss": 0.3075, + "step": 23570 + }, + { + "epoch": 0.4376157996160744, + "grad_norm": 0.2783633768558502, + "learning_rate": 1.1947184830220566e-05, + "loss": 0.2993, + "step": 23572 + }, + { + "epoch": 0.437652929753493, + "grad_norm": 0.3197158873081207, + "learning_rate": 1.1946040653911368e-05, + "loss": 0.2296, + "step": 23574 + }, + { + "epoch": 0.43769005989091164, + "grad_norm": 0.23380829393863678, + "learning_rate": 1.1944896451122387e-05, + "loss": 0.2422, + "step": 23576 + }, + { + "epoch": 0.43772719002833027, + "grad_norm": 0.5042902231216431, + "learning_rate": 1.1943752221869194e-05, + "loss": 0.3204, + "step": 23578 + }, + { + "epoch": 0.43776432016574895, + "grad_norm": 0.4200644791126251, + "learning_rate": 1.1942607966167353e-05, + "loss": 0.2944, + "step": 23580 + }, + { + "epoch": 0.4378014503031676, + "grad_norm": 0.2486756145954132, + "learning_rate": 1.1941463684032435e-05, + "loss": 0.3438, + "step": 23582 + }, + { + "epoch": 0.4378385804405862, + "grad_norm": 0.30275389552116394, + "learning_rate": 1.1940319375480012e-05, + "loss": 0.2734, + "step": 23584 + }, + { + "epoch": 0.43787571057800484, + "grad_norm": 0.34503835439682007, + "learning_rate": 1.1939175040525655e-05, + "loss": 0.2646, + "step": 23586 + }, + { + "epoch": 0.43791284071542347, + "grad_norm": 0.34359151124954224, + "learning_rate": 1.1938030679184936e-05, + "loss": 0.266, + "step": 23588 + }, + { + "epoch": 0.43794997085284215, + "grad_norm": 0.37743625044822693, + "learning_rate": 1.1936886291473421e-05, + "loss": 0.4051, + "step": 23590 + }, + { + "epoch": 0.4379871009902608, + "grad_norm": 0.3425731360912323, + "learning_rate": 1.1935741877406685e-05, + "loss": 0.2631, + "step": 23592 + }, + { + "epoch": 0.4380242311276794, + "grad_norm": 0.39717477560043335, + "learning_rate": 1.1934597437000304e-05, + "loss": 0.5267, + "step": 23594 + }, + { + "epoch": 0.43806136126509804, + "grad_norm": 0.7706037163734436, + "learning_rate": 1.1933452970269843e-05, + "loss": 0.308, + "step": 23596 + }, + { + "epoch": 0.43809849140251667, + "grad_norm": 0.604516863822937, + "learning_rate": 1.1932308477230883e-05, + "loss": 0.2436, + "step": 23598 + }, + { + "epoch": 0.4381356215399353, + "grad_norm": 0.33222606778144836, + "learning_rate": 1.1931163957898988e-05, + "loss": 0.3729, + "step": 23600 + }, + { + "epoch": 0.438172751677354, + "grad_norm": 0.4277777671813965, + "learning_rate": 1.1930019412289738e-05, + "loss": 0.1177, + "step": 23602 + }, + { + "epoch": 0.4382098818147726, + "grad_norm": 0.40444886684417725, + "learning_rate": 1.1928874840418705e-05, + "loss": 0.2541, + "step": 23604 + }, + { + "epoch": 0.43824701195219123, + "grad_norm": 0.2931002080440521, + "learning_rate": 1.192773024230146e-05, + "loss": 0.2252, + "step": 23606 + }, + { + "epoch": 0.43828414208960986, + "grad_norm": 0.26695096492767334, + "learning_rate": 1.1926585617953586e-05, + "loss": 0.2187, + "step": 23608 + }, + { + "epoch": 0.4383212722270285, + "grad_norm": 0.2957378923892975, + "learning_rate": 1.192544096739065e-05, + "loss": 0.3458, + "step": 23610 + }, + { + "epoch": 0.4383584023644472, + "grad_norm": 0.40524229407310486, + "learning_rate": 1.1924296290628228e-05, + "loss": 0.2635, + "step": 23612 + }, + { + "epoch": 0.4383955325018658, + "grad_norm": 0.3096320927143097, + "learning_rate": 1.19231515876819e-05, + "loss": 0.2552, + "step": 23614 + }, + { + "epoch": 0.43843266263928443, + "grad_norm": 0.7302179336547852, + "learning_rate": 1.1922006858567239e-05, + "loss": 0.193, + "step": 23616 + }, + { + "epoch": 0.43846979277670306, + "grad_norm": 0.4016776978969574, + "learning_rate": 1.1920862103299822e-05, + "loss": 0.2798, + "step": 23618 + }, + { + "epoch": 0.4385069229141217, + "grad_norm": 0.3226209580898285, + "learning_rate": 1.1919717321895226e-05, + "loss": 0.4318, + "step": 23620 + }, + { + "epoch": 0.4385440530515403, + "grad_norm": 0.40980634093284607, + "learning_rate": 1.1918572514369024e-05, + "loss": 0.141, + "step": 23622 + }, + { + "epoch": 0.438581183188959, + "grad_norm": 0.35381022095680237, + "learning_rate": 1.1917427680736798e-05, + "loss": 0.3719, + "step": 23624 + }, + { + "epoch": 0.43861831332637763, + "grad_norm": 0.45407426357269287, + "learning_rate": 1.1916282821014127e-05, + "loss": 0.3829, + "step": 23626 + }, + { + "epoch": 0.43865544346379626, + "grad_norm": 0.4319459795951843, + "learning_rate": 1.1915137935216583e-05, + "loss": 0.3205, + "step": 23628 + }, + { + "epoch": 0.4386925736012149, + "grad_norm": 0.5747219920158386, + "learning_rate": 1.1913993023359751e-05, + "loss": 0.3315, + "step": 23630 + }, + { + "epoch": 0.4387297037386335, + "grad_norm": 0.372081458568573, + "learning_rate": 1.1912848085459209e-05, + "loss": 0.2112, + "step": 23632 + }, + { + "epoch": 0.4387668338760522, + "grad_norm": 0.3500063419342041, + "learning_rate": 1.1911703121530532e-05, + "loss": 0.1823, + "step": 23634 + }, + { + "epoch": 0.4388039640134708, + "grad_norm": 0.2596704959869385, + "learning_rate": 1.1910558131589298e-05, + "loss": 0.2484, + "step": 23636 + }, + { + "epoch": 0.43884109415088945, + "grad_norm": 0.5212854743003845, + "learning_rate": 1.1909413115651096e-05, + "loss": 0.3309, + "step": 23638 + }, + { + "epoch": 0.4388782242883081, + "grad_norm": 0.42243945598602295, + "learning_rate": 1.1908268073731497e-05, + "loss": 0.1906, + "step": 23640 + }, + { + "epoch": 0.4389153544257267, + "grad_norm": 0.48382827639579773, + "learning_rate": 1.1907123005846089e-05, + "loss": 0.2125, + "step": 23642 + }, + { + "epoch": 0.4389524845631454, + "grad_norm": 0.39249637722969055, + "learning_rate": 1.1905977912010447e-05, + "loss": 0.2454, + "step": 23644 + }, + { + "epoch": 0.438989614700564, + "grad_norm": 0.48454025387763977, + "learning_rate": 1.1904832792240157e-05, + "loss": 0.2909, + "step": 23646 + }, + { + "epoch": 0.43902674483798265, + "grad_norm": 0.394525945186615, + "learning_rate": 1.1903687646550795e-05, + "loss": 0.3435, + "step": 23648 + }, + { + "epoch": 0.4390638749754013, + "grad_norm": 0.39680492877960205, + "learning_rate": 1.190254247495795e-05, + "loss": 0.3474, + "step": 23650 + }, + { + "epoch": 0.4391010051128199, + "grad_norm": 0.36667600274086, + "learning_rate": 1.19013972774772e-05, + "loss": 0.2419, + "step": 23652 + }, + { + "epoch": 0.43913813525023854, + "grad_norm": 0.2906239926815033, + "learning_rate": 1.1900252054124127e-05, + "loss": 0.263, + "step": 23654 + }, + { + "epoch": 0.4391752653876572, + "grad_norm": 0.4001409113407135, + "learning_rate": 1.1899106804914313e-05, + "loss": 0.3431, + "step": 23656 + }, + { + "epoch": 0.43921239552507585, + "grad_norm": 0.37662196159362793, + "learning_rate": 1.1897961529863346e-05, + "loss": 0.4826, + "step": 23658 + }, + { + "epoch": 0.4392495256624945, + "grad_norm": 0.35898080468177795, + "learning_rate": 1.1896816228986809e-05, + "loss": 0.3105, + "step": 23660 + }, + { + "epoch": 0.4392866557999131, + "grad_norm": 0.40890440344810486, + "learning_rate": 1.1895670902300285e-05, + "loss": 0.2809, + "step": 23662 + }, + { + "epoch": 0.43932378593733173, + "grad_norm": 0.38531020283699036, + "learning_rate": 1.1894525549819358e-05, + "loss": 0.3465, + "step": 23664 + }, + { + "epoch": 0.4393609160747504, + "grad_norm": 0.553290069103241, + "learning_rate": 1.189338017155961e-05, + "loss": 0.3095, + "step": 23666 + }, + { + "epoch": 0.43939804621216905, + "grad_norm": 0.3739621043205261, + "learning_rate": 1.189223476753663e-05, + "loss": 0.3406, + "step": 23668 + }, + { + "epoch": 0.4394351763495877, + "grad_norm": 0.27504613995552063, + "learning_rate": 1.1891089337766006e-05, + "loss": 0.0928, + "step": 23670 + }, + { + "epoch": 0.4394723064870063, + "grad_norm": 0.3084995746612549, + "learning_rate": 1.188994388226332e-05, + "loss": 0.2113, + "step": 23672 + }, + { + "epoch": 0.43950943662442493, + "grad_norm": 0.49022436141967773, + "learning_rate": 1.1888798401044158e-05, + "loss": 0.3225, + "step": 23674 + }, + { + "epoch": 0.43954656676184356, + "grad_norm": 0.5140630006790161, + "learning_rate": 1.1887652894124107e-05, + "loss": 0.1536, + "step": 23676 + }, + { + "epoch": 0.43958369689926224, + "grad_norm": 0.3043373227119446, + "learning_rate": 1.188650736151875e-05, + "loss": 0.238, + "step": 23678 + }, + { + "epoch": 0.4396208270366809, + "grad_norm": 0.23684245347976685, + "learning_rate": 1.1885361803243685e-05, + "loss": 0.109, + "step": 23680 + }, + { + "epoch": 0.4396579571740995, + "grad_norm": 0.3821539580821991, + "learning_rate": 1.188421621931449e-05, + "loss": 0.1975, + "step": 23682 + }, + { + "epoch": 0.43969508731151813, + "grad_norm": 0.413874089717865, + "learning_rate": 1.1883070609746758e-05, + "loss": 0.2326, + "step": 23684 + }, + { + "epoch": 0.43973221744893676, + "grad_norm": 0.6154585480690002, + "learning_rate": 1.1881924974556074e-05, + "loss": 0.2247, + "step": 23686 + }, + { + "epoch": 0.43976934758635544, + "grad_norm": 0.33350494503974915, + "learning_rate": 1.1880779313758026e-05, + "loss": 0.2178, + "step": 23688 + }, + { + "epoch": 0.43980647772377407, + "grad_norm": 0.2938879430294037, + "learning_rate": 1.187963362736821e-05, + "loss": 0.2546, + "step": 23690 + }, + { + "epoch": 0.4398436078611927, + "grad_norm": 0.2042945921421051, + "learning_rate": 1.1878487915402209e-05, + "loss": 0.3589, + "step": 23692 + }, + { + "epoch": 0.4398807379986113, + "grad_norm": 0.3872981369495392, + "learning_rate": 1.1877342177875613e-05, + "loss": 0.4697, + "step": 23694 + }, + { + "epoch": 0.43991786813602995, + "grad_norm": 0.34465670585632324, + "learning_rate": 1.1876196414804014e-05, + "loss": 0.288, + "step": 23696 + }, + { + "epoch": 0.4399549982734486, + "grad_norm": 0.2642345130443573, + "learning_rate": 1.1875050626203e-05, + "loss": 0.4191, + "step": 23698 + }, + { + "epoch": 0.43999212841086727, + "grad_norm": 0.5179166793823242, + "learning_rate": 1.1873904812088163e-05, + "loss": 0.3563, + "step": 23700 + }, + { + "epoch": 0.4400292585482859, + "grad_norm": 0.34532374143600464, + "learning_rate": 1.1872758972475097e-05, + "loss": 0.4796, + "step": 23702 + }, + { + "epoch": 0.4400663886857045, + "grad_norm": 0.24797675013542175, + "learning_rate": 1.1871613107379392e-05, + "loss": 0.4058, + "step": 23704 + }, + { + "epoch": 0.44010351882312315, + "grad_norm": 1.0722007751464844, + "learning_rate": 1.1870467216816635e-05, + "loss": 0.1337, + "step": 23706 + }, + { + "epoch": 0.4401406489605418, + "grad_norm": 0.5361062288284302, + "learning_rate": 1.1869321300802425e-05, + "loss": 0.3732, + "step": 23708 + }, + { + "epoch": 0.44017777909796046, + "grad_norm": 0.4638739228248596, + "learning_rate": 1.186817535935235e-05, + "loss": 0.2358, + "step": 23710 + }, + { + "epoch": 0.4402149092353791, + "grad_norm": 0.37871047854423523, + "learning_rate": 1.1867029392482004e-05, + "loss": 0.466, + "step": 23712 + }, + { + "epoch": 0.4402520393727977, + "grad_norm": 0.3585980236530304, + "learning_rate": 1.1865883400206982e-05, + "loss": 0.1976, + "step": 23714 + }, + { + "epoch": 0.44028916951021635, + "grad_norm": 0.38213521242141724, + "learning_rate": 1.1864737382542874e-05, + "loss": 0.5324, + "step": 23716 + }, + { + "epoch": 0.440326299647635, + "grad_norm": 0.26332977414131165, + "learning_rate": 1.1863591339505276e-05, + "loss": 0.2894, + "step": 23718 + }, + { + "epoch": 0.44036342978505366, + "grad_norm": 0.23357310891151428, + "learning_rate": 1.1862445271109781e-05, + "loss": 0.1982, + "step": 23720 + }, + { + "epoch": 0.4404005599224723, + "grad_norm": 0.37500178813934326, + "learning_rate": 1.1861299177371986e-05, + "loss": 0.2802, + "step": 23722 + }, + { + "epoch": 0.4404376900598909, + "grad_norm": 0.6693291068077087, + "learning_rate": 1.1860153058307482e-05, + "loss": 0.249, + "step": 23724 + }, + { + "epoch": 0.44047482019730955, + "grad_norm": 0.3429964482784271, + "learning_rate": 1.1859006913931871e-05, + "loss": 0.248, + "step": 23726 + }, + { + "epoch": 0.4405119503347282, + "grad_norm": 0.5627470016479492, + "learning_rate": 1.1857860744260743e-05, + "loss": 0.2844, + "step": 23728 + }, + { + "epoch": 0.4405490804721468, + "grad_norm": 0.4331391453742981, + "learning_rate": 1.1856714549309694e-05, + "loss": 0.2493, + "step": 23730 + }, + { + "epoch": 0.4405862106095655, + "grad_norm": 0.43830960988998413, + "learning_rate": 1.185556832909432e-05, + "loss": 0.215, + "step": 23732 + }, + { + "epoch": 0.4406233407469841, + "grad_norm": 0.5378835797309875, + "learning_rate": 1.1854422083630222e-05, + "loss": 0.2995, + "step": 23734 + }, + { + "epoch": 0.44066047088440274, + "grad_norm": 0.4547078609466553, + "learning_rate": 1.1853275812932994e-05, + "loss": 0.3756, + "step": 23736 + }, + { + "epoch": 0.4406976010218214, + "grad_norm": 0.34700706601142883, + "learning_rate": 1.1852129517018232e-05, + "loss": 0.2541, + "step": 23738 + }, + { + "epoch": 0.44073473115924, + "grad_norm": 0.40170758962631226, + "learning_rate": 1.1850983195901535e-05, + "loss": 0.2866, + "step": 23740 + }, + { + "epoch": 0.4407718612966587, + "grad_norm": 0.3848995864391327, + "learning_rate": 1.1849836849598501e-05, + "loss": 0.3374, + "step": 23742 + }, + { + "epoch": 0.4408089914340773, + "grad_norm": 0.3300943374633789, + "learning_rate": 1.184869047812473e-05, + "loss": 0.2351, + "step": 23744 + }, + { + "epoch": 0.44084612157149594, + "grad_norm": 0.3932442367076874, + "learning_rate": 1.1847544081495816e-05, + "loss": 0.2037, + "step": 23746 + }, + { + "epoch": 0.44088325170891457, + "grad_norm": 0.5142634510993958, + "learning_rate": 1.1846397659727367e-05, + "loss": 0.1404, + "step": 23748 + }, + { + "epoch": 0.4409203818463332, + "grad_norm": 0.3607427775859833, + "learning_rate": 1.1845251212834969e-05, + "loss": 0.1879, + "step": 23750 + }, + { + "epoch": 0.4409575119837518, + "grad_norm": 0.36212384700775146, + "learning_rate": 1.1844104740834235e-05, + "loss": 0.294, + "step": 23752 + }, + { + "epoch": 0.4409946421211705, + "grad_norm": 0.3918575346469879, + "learning_rate": 1.1842958243740756e-05, + "loss": 0.3842, + "step": 23754 + }, + { + "epoch": 0.44103177225858914, + "grad_norm": 0.4928266704082489, + "learning_rate": 1.1841811721570138e-05, + "loss": 0.4172, + "step": 23756 + }, + { + "epoch": 0.44106890239600777, + "grad_norm": 0.35291337966918945, + "learning_rate": 1.1840665174337977e-05, + "loss": 0.3658, + "step": 23758 + }, + { + "epoch": 0.4411060325334264, + "grad_norm": 0.5061340928077698, + "learning_rate": 1.1839518602059877e-05, + "loss": 0.0446, + "step": 23760 + }, + { + "epoch": 0.441143162670845, + "grad_norm": 0.42473292350769043, + "learning_rate": 1.1838372004751436e-05, + "loss": 0.182, + "step": 23762 + }, + { + "epoch": 0.4411802928082637, + "grad_norm": 0.4381121098995209, + "learning_rate": 1.1837225382428263e-05, + "loss": 0.2985, + "step": 23764 + }, + { + "epoch": 0.44121742294568234, + "grad_norm": 0.5044626593589783, + "learning_rate": 1.1836078735105954e-05, + "loss": 0.1455, + "step": 23766 + }, + { + "epoch": 0.44125455308310096, + "grad_norm": 0.42887911200523376, + "learning_rate": 1.1834932062800115e-05, + "loss": 0.4054, + "step": 23768 + }, + { + "epoch": 0.4412916832205196, + "grad_norm": 0.6691617369651794, + "learning_rate": 1.1833785365526344e-05, + "loss": 0.3122, + "step": 23770 + }, + { + "epoch": 0.4413288133579382, + "grad_norm": 0.35354384779930115, + "learning_rate": 1.1832638643300249e-05, + "loss": 0.3718, + "step": 23772 + }, + { + "epoch": 0.44136594349535685, + "grad_norm": 0.5450619459152222, + "learning_rate": 1.183149189613743e-05, + "loss": 0.3814, + "step": 23774 + }, + { + "epoch": 0.44140307363277553, + "grad_norm": 0.23925697803497314, + "learning_rate": 1.1830345124053493e-05, + "loss": 0.1398, + "step": 23776 + }, + { + "epoch": 0.44144020377019416, + "grad_norm": 0.7135844826698303, + "learning_rate": 1.1829198327064042e-05, + "loss": 0.1999, + "step": 23778 + }, + { + "epoch": 0.4414773339076128, + "grad_norm": 0.7205314040184021, + "learning_rate": 1.182805150518468e-05, + "loss": 0.3509, + "step": 23780 + }, + { + "epoch": 0.4415144640450314, + "grad_norm": 0.29555100202560425, + "learning_rate": 1.1826904658431014e-05, + "loss": 0.3551, + "step": 23782 + }, + { + "epoch": 0.44155159418245005, + "grad_norm": 0.7239555716514587, + "learning_rate": 1.1825757786818646e-05, + "loss": 0.2727, + "step": 23784 + }, + { + "epoch": 0.44158872431986873, + "grad_norm": 0.34976357221603394, + "learning_rate": 1.1824610890363185e-05, + "loss": 0.1807, + "step": 23786 + }, + { + "epoch": 0.44162585445728736, + "grad_norm": 0.4200124740600586, + "learning_rate": 1.1823463969080234e-05, + "loss": 0.2017, + "step": 23788 + }, + { + "epoch": 0.441662984594706, + "grad_norm": 0.41671186685562134, + "learning_rate": 1.1822317022985401e-05, + "loss": 0.3845, + "step": 23790 + }, + { + "epoch": 0.4417001147321246, + "grad_norm": 0.30348482728004456, + "learning_rate": 1.182117005209429e-05, + "loss": 0.1164, + "step": 23792 + }, + { + "epoch": 0.44173724486954324, + "grad_norm": 0.260517418384552, + "learning_rate": 1.182002305642251e-05, + "loss": 0.5546, + "step": 23794 + }, + { + "epoch": 0.44177437500696193, + "grad_norm": 0.36911827325820923, + "learning_rate": 1.1818876035985669e-05, + "loss": 0.2633, + "step": 23796 + }, + { + "epoch": 0.44181150514438056, + "grad_norm": 0.49242958426475525, + "learning_rate": 1.1817728990799374e-05, + "loss": 0.2084, + "step": 23798 + }, + { + "epoch": 0.4418486352817992, + "grad_norm": 0.3496561348438263, + "learning_rate": 1.181658192087923e-05, + "loss": 0.2069, + "step": 23800 + }, + { + "epoch": 0.4418857654192178, + "grad_norm": 0.4719865024089813, + "learning_rate": 1.181543482624085e-05, + "loss": 0.2234, + "step": 23802 + }, + { + "epoch": 0.44192289555663644, + "grad_norm": 0.33670270442962646, + "learning_rate": 1.1814287706899836e-05, + "loss": 0.4826, + "step": 23804 + }, + { + "epoch": 0.44196002569405507, + "grad_norm": 0.30540457367897034, + "learning_rate": 1.1813140562871803e-05, + "loss": 0.3552, + "step": 23806 + }, + { + "epoch": 0.44199715583147375, + "grad_norm": 0.67805415391922, + "learning_rate": 1.1811993394172356e-05, + "loss": 0.2221, + "step": 23808 + }, + { + "epoch": 0.4420342859688924, + "grad_norm": 0.3721048831939697, + "learning_rate": 1.1810846200817112e-05, + "loss": 0.3098, + "step": 23810 + }, + { + "epoch": 0.442071416106311, + "grad_norm": 0.3707892894744873, + "learning_rate": 1.180969898282167e-05, + "loss": 0.2358, + "step": 23812 + }, + { + "epoch": 0.44210854624372964, + "grad_norm": 0.28123393654823303, + "learning_rate": 1.1808551740201649e-05, + "loss": 0.3635, + "step": 23814 + }, + { + "epoch": 0.44214567638114827, + "grad_norm": 0.43573126196861267, + "learning_rate": 1.1807404472972655e-05, + "loss": 0.2, + "step": 23816 + }, + { + "epoch": 0.44218280651856695, + "grad_norm": 0.34867316484451294, + "learning_rate": 1.1806257181150298e-05, + "loss": 0.2057, + "step": 23818 + }, + { + "epoch": 0.4422199366559856, + "grad_norm": 0.4324578642845154, + "learning_rate": 1.1805109864750193e-05, + "loss": 0.3184, + "step": 23820 + }, + { + "epoch": 0.4422570667934042, + "grad_norm": 0.3645049035549164, + "learning_rate": 1.1803962523787948e-05, + "loss": 0.3173, + "step": 23822 + }, + { + "epoch": 0.44229419693082284, + "grad_norm": 0.27655619382858276, + "learning_rate": 1.1802815158279178e-05, + "loss": 0.3596, + "step": 23824 + }, + { + "epoch": 0.44233132706824146, + "grad_norm": 0.48209938406944275, + "learning_rate": 1.180166776823949e-05, + "loss": 0.4345, + "step": 23826 + }, + { + "epoch": 0.4423684572056601, + "grad_norm": 0.2817263603210449, + "learning_rate": 1.1800520353684505e-05, + "loss": 0.248, + "step": 23828 + }, + { + "epoch": 0.4424055873430788, + "grad_norm": 0.3553263247013092, + "learning_rate": 1.179937291462983e-05, + "loss": 0.3545, + "step": 23830 + }, + { + "epoch": 0.4424427174804974, + "grad_norm": 0.30517029762268066, + "learning_rate": 1.179822545109108e-05, + "loss": 0.3027, + "step": 23832 + }, + { + "epoch": 0.44247984761791603, + "grad_norm": 0.3245173990726471, + "learning_rate": 1.1797077963083869e-05, + "loss": 0.4029, + "step": 23834 + }, + { + "epoch": 0.44251697775533466, + "grad_norm": 0.4065669775009155, + "learning_rate": 1.1795930450623807e-05, + "loss": 0.2569, + "step": 23836 + }, + { + "epoch": 0.4425541078927533, + "grad_norm": 0.35312923789024353, + "learning_rate": 1.179478291372651e-05, + "loss": 0.1981, + "step": 23838 + }, + { + "epoch": 0.442591238030172, + "grad_norm": 0.42727893590927124, + "learning_rate": 1.1793635352407598e-05, + "loss": 0.208, + "step": 23840 + }, + { + "epoch": 0.4426283681675906, + "grad_norm": 0.30203187465667725, + "learning_rate": 1.1792487766682679e-05, + "loss": 0.3275, + "step": 23842 + }, + { + "epoch": 0.44266549830500923, + "grad_norm": 0.3186185657978058, + "learning_rate": 1.179134015656737e-05, + "loss": 0.1899, + "step": 23844 + }, + { + "epoch": 0.44270262844242786, + "grad_norm": 0.28849631547927856, + "learning_rate": 1.1790192522077286e-05, + "loss": 0.4682, + "step": 23846 + }, + { + "epoch": 0.4427397585798465, + "grad_norm": 0.6674695014953613, + "learning_rate": 1.1789044863228047e-05, + "loss": 0.2275, + "step": 23848 + }, + { + "epoch": 0.4427768887172651, + "grad_norm": 0.4135274589061737, + "learning_rate": 1.1787897180035263e-05, + "loss": 0.3184, + "step": 23850 + }, + { + "epoch": 0.4428140188546838, + "grad_norm": 0.3842872679233551, + "learning_rate": 1.1786749472514556e-05, + "loss": 0.3361, + "step": 23852 + }, + { + "epoch": 0.44285114899210243, + "grad_norm": 0.3466397523880005, + "learning_rate": 1.1785601740681542e-05, + "loss": 0.2423, + "step": 23854 + }, + { + "epoch": 0.44288827912952106, + "grad_norm": 0.30038174986839294, + "learning_rate": 1.1784453984551833e-05, + "loss": 0.2812, + "step": 23856 + }, + { + "epoch": 0.4429254092669397, + "grad_norm": 0.3503870666027069, + "learning_rate": 1.1783306204141052e-05, + "loss": 0.2705, + "step": 23858 + }, + { + "epoch": 0.4429625394043583, + "grad_norm": 0.5904318690299988, + "learning_rate": 1.1782158399464816e-05, + "loss": 0.1788, + "step": 23860 + }, + { + "epoch": 0.442999669541777, + "grad_norm": 0.28318244218826294, + "learning_rate": 1.1781010570538742e-05, + "loss": 0.3146, + "step": 23862 + }, + { + "epoch": 0.4430367996791956, + "grad_norm": 0.5092008113861084, + "learning_rate": 1.1779862717378449e-05, + "loss": 0.2174, + "step": 23864 + }, + { + "epoch": 0.44307392981661425, + "grad_norm": 0.4855818748474121, + "learning_rate": 1.1778714839999556e-05, + "loss": 0.3217, + "step": 23866 + }, + { + "epoch": 0.4431110599540329, + "grad_norm": 0.4310131371021271, + "learning_rate": 1.1777566938417681e-05, + "loss": 0.4751, + "step": 23868 + }, + { + "epoch": 0.4431481900914515, + "grad_norm": 0.4504663348197937, + "learning_rate": 1.1776419012648443e-05, + "loss": 0.3063, + "step": 23870 + }, + { + "epoch": 0.4431853202288702, + "grad_norm": 0.3513050675392151, + "learning_rate": 1.1775271062707468e-05, + "loss": 0.2177, + "step": 23872 + }, + { + "epoch": 0.4432224503662888, + "grad_norm": 0.4568898677825928, + "learning_rate": 1.1774123088610369e-05, + "loss": 0.1516, + "step": 23874 + }, + { + "epoch": 0.44325958050370745, + "grad_norm": 0.3882018029689789, + "learning_rate": 1.1772975090372766e-05, + "loss": 0.2372, + "step": 23876 + }, + { + "epoch": 0.4432967106411261, + "grad_norm": 0.27046525478363037, + "learning_rate": 1.1771827068010286e-05, + "loss": 0.167, + "step": 23878 + }, + { + "epoch": 0.4433338407785447, + "grad_norm": 0.36586055159568787, + "learning_rate": 1.1770679021538545e-05, + "loss": 0.1866, + "step": 23880 + }, + { + "epoch": 0.44337097091596334, + "grad_norm": 0.4313051402568817, + "learning_rate": 1.1769530950973167e-05, + "loss": 0.2696, + "step": 23882 + }, + { + "epoch": 0.443408101053382, + "grad_norm": 0.5722182393074036, + "learning_rate": 1.1768382856329774e-05, + "loss": 0.22, + "step": 23884 + }, + { + "epoch": 0.44344523119080065, + "grad_norm": 0.24752557277679443, + "learning_rate": 1.1767234737623988e-05, + "loss": 0.3267, + "step": 23886 + }, + { + "epoch": 0.4434823613282193, + "grad_norm": 0.4271041750907898, + "learning_rate": 1.176608659487143e-05, + "loss": 0.329, + "step": 23888 + }, + { + "epoch": 0.4435194914656379, + "grad_norm": 0.4507372975349426, + "learning_rate": 1.176493842808772e-05, + "loss": 0.1859, + "step": 23890 + }, + { + "epoch": 0.44355662160305653, + "grad_norm": 0.30965831875801086, + "learning_rate": 1.1763790237288488e-05, + "loss": 0.4126, + "step": 23892 + }, + { + "epoch": 0.4435937517404752, + "grad_norm": 0.4449214041233063, + "learning_rate": 1.1762642022489355e-05, + "loss": 0.2956, + "step": 23894 + }, + { + "epoch": 0.44363088187789385, + "grad_norm": 0.6289788484573364, + "learning_rate": 1.176149378370594e-05, + "loss": 0.2853, + "step": 23896 + }, + { + "epoch": 0.4436680120153125, + "grad_norm": 0.47286567091941833, + "learning_rate": 1.1760345520953875e-05, + "loss": 0.2164, + "step": 23898 + }, + { + "epoch": 0.4437051421527311, + "grad_norm": 0.5417950749397278, + "learning_rate": 1.1759197234248775e-05, + "loss": 0.3351, + "step": 23900 + }, + { + "epoch": 0.44374227229014973, + "grad_norm": 0.6363476514816284, + "learning_rate": 1.1758048923606274e-05, + "loss": 0.4697, + "step": 23902 + }, + { + "epoch": 0.44377940242756836, + "grad_norm": 0.3745894432067871, + "learning_rate": 1.1756900589041993e-05, + "loss": 0.3303, + "step": 23904 + }, + { + "epoch": 0.44381653256498704, + "grad_norm": 0.4443109631538391, + "learning_rate": 1.1755752230571557e-05, + "loss": 0.3082, + "step": 23906 + }, + { + "epoch": 0.44385366270240567, + "grad_norm": 0.5911005139350891, + "learning_rate": 1.1754603848210593e-05, + "loss": 0.4053, + "step": 23908 + }, + { + "epoch": 0.4438907928398243, + "grad_norm": 0.34702086448669434, + "learning_rate": 1.1753455441974726e-05, + "loss": 0.3088, + "step": 23910 + }, + { + "epoch": 0.44392792297724293, + "grad_norm": 0.3851543068885803, + "learning_rate": 1.175230701187958e-05, + "loss": 0.4825, + "step": 23912 + }, + { + "epoch": 0.44396505311466156, + "grad_norm": 0.4213216304779053, + "learning_rate": 1.1751158557940787e-05, + "loss": 0.3531, + "step": 23914 + }, + { + "epoch": 0.44400218325208024, + "grad_norm": 0.3401533365249634, + "learning_rate": 1.175001008017397e-05, + "loss": 0.2891, + "step": 23916 + }, + { + "epoch": 0.44403931338949887, + "grad_norm": 0.530329704284668, + "learning_rate": 1.174886157859476e-05, + "loss": 0.2451, + "step": 23918 + }, + { + "epoch": 0.4440764435269175, + "grad_norm": 0.33848339319229126, + "learning_rate": 1.1747713053218782e-05, + "loss": 0.2495, + "step": 23920 + }, + { + "epoch": 0.4441135736643361, + "grad_norm": 0.26268675923347473, + "learning_rate": 1.1746564504061663e-05, + "loss": 0.3251, + "step": 23922 + }, + { + "epoch": 0.44415070380175475, + "grad_norm": 0.39955538511276245, + "learning_rate": 1.1745415931139033e-05, + "loss": 0.2323, + "step": 23924 + }, + { + "epoch": 0.4441878339391734, + "grad_norm": 0.38394472002983093, + "learning_rate": 1.174426733446652e-05, + "loss": 0.3164, + "step": 23926 + }, + { + "epoch": 0.44422496407659207, + "grad_norm": 0.3943612277507782, + "learning_rate": 1.1743118714059754e-05, + "loss": 0.3199, + "step": 23928 + }, + { + "epoch": 0.4442620942140107, + "grad_norm": 0.40309277176856995, + "learning_rate": 1.1741970069934364e-05, + "loss": 0.496, + "step": 23930 + }, + { + "epoch": 0.4442992243514293, + "grad_norm": 0.34559857845306396, + "learning_rate": 1.1740821402105975e-05, + "loss": 0.3281, + "step": 23932 + }, + { + "epoch": 0.44433635448884795, + "grad_norm": 0.32255464792251587, + "learning_rate": 1.1739672710590224e-05, + "loss": 0.2037, + "step": 23934 + }, + { + "epoch": 0.4443734846262666, + "grad_norm": 0.18587647378444672, + "learning_rate": 1.1738523995402739e-05, + "loss": 0.2626, + "step": 23936 + }, + { + "epoch": 0.44441061476368526, + "grad_norm": 0.3890645205974579, + "learning_rate": 1.1737375256559152e-05, + "loss": 0.3201, + "step": 23938 + }, + { + "epoch": 0.4444477449011039, + "grad_norm": 0.4232846796512604, + "learning_rate": 1.1736226494075086e-05, + "loss": 0.2023, + "step": 23940 + }, + { + "epoch": 0.4444848750385225, + "grad_norm": 0.39195990562438965, + "learning_rate": 1.1735077707966183e-05, + "loss": 0.1999, + "step": 23942 + }, + { + "epoch": 0.44452200517594115, + "grad_norm": 0.3769840896129608, + "learning_rate": 1.1733928898248069e-05, + "loss": 0.4931, + "step": 23944 + }, + { + "epoch": 0.4445591353133598, + "grad_norm": 0.5406650304794312, + "learning_rate": 1.1732780064936374e-05, + "loss": 0.2801, + "step": 23946 + }, + { + "epoch": 0.44459626545077846, + "grad_norm": 2.4105215072631836, + "learning_rate": 1.1731631208046736e-05, + "loss": 0.4158, + "step": 23948 + }, + { + "epoch": 0.4446333955881971, + "grad_norm": 0.4223223626613617, + "learning_rate": 1.1730482327594781e-05, + "loss": 0.3205, + "step": 23950 + }, + { + "epoch": 0.4446705257256157, + "grad_norm": 0.3691438138484955, + "learning_rate": 1.1729333423596145e-05, + "loss": 0.2326, + "step": 23952 + }, + { + "epoch": 0.44470765586303435, + "grad_norm": 0.28826966881752014, + "learning_rate": 1.1728184496066462e-05, + "loss": 0.2155, + "step": 23954 + }, + { + "epoch": 0.444744786000453, + "grad_norm": 0.350644588470459, + "learning_rate": 1.1727035545021363e-05, + "loss": 0.4619, + "step": 23956 + }, + { + "epoch": 0.4447819161378716, + "grad_norm": 0.4109569191932678, + "learning_rate": 1.1725886570476486e-05, + "loss": 0.2696, + "step": 23958 + }, + { + "epoch": 0.4448190462752903, + "grad_norm": 0.33637312054634094, + "learning_rate": 1.1724737572447462e-05, + "loss": 0.3739, + "step": 23960 + }, + { + "epoch": 0.4448561764127089, + "grad_norm": 0.42051035165786743, + "learning_rate": 1.1723588550949925e-05, + "loss": 0.3735, + "step": 23962 + }, + { + "epoch": 0.44489330655012754, + "grad_norm": 0.26516294479370117, + "learning_rate": 1.1722439505999508e-05, + "loss": 0.3908, + "step": 23964 + }, + { + "epoch": 0.44493043668754617, + "grad_norm": 0.3965071439743042, + "learning_rate": 1.1721290437611854e-05, + "loss": 0.259, + "step": 23966 + }, + { + "epoch": 0.4449675668249648, + "grad_norm": 0.3436717987060547, + "learning_rate": 1.1720141345802588e-05, + "loss": 0.4519, + "step": 23968 + }, + { + "epoch": 0.4450046969623835, + "grad_norm": 0.4568477272987366, + "learning_rate": 1.1718992230587355e-05, + "loss": 0.4053, + "step": 23970 + }, + { + "epoch": 0.4450418270998021, + "grad_norm": 0.6536632776260376, + "learning_rate": 1.1717843091981782e-05, + "loss": 0.4454, + "step": 23972 + }, + { + "epoch": 0.44507895723722074, + "grad_norm": 0.3625437021255493, + "learning_rate": 1.1716693930001515e-05, + "loss": 0.436, + "step": 23974 + }, + { + "epoch": 0.44511608737463937, + "grad_norm": 0.3941170871257782, + "learning_rate": 1.1715544744662182e-05, + "loss": 0.4027, + "step": 23976 + }, + { + "epoch": 0.445153217512058, + "grad_norm": 0.3628867268562317, + "learning_rate": 1.1714395535979424e-05, + "loss": 0.3325, + "step": 23978 + }, + { + "epoch": 0.4451903476494766, + "grad_norm": 0.429275244474411, + "learning_rate": 1.1713246303968882e-05, + "loss": 0.3416, + "step": 23980 + }, + { + "epoch": 0.4452274777868953, + "grad_norm": 0.5084751844406128, + "learning_rate": 1.1712097048646186e-05, + "loss": 0.2719, + "step": 23982 + }, + { + "epoch": 0.44526460792431394, + "grad_norm": 0.2849515378475189, + "learning_rate": 1.1710947770026977e-05, + "loss": 0.2419, + "step": 23984 + }, + { + "epoch": 0.44530173806173257, + "grad_norm": 0.33132898807525635, + "learning_rate": 1.1709798468126896e-05, + "loss": 0.342, + "step": 23986 + }, + { + "epoch": 0.4453388681991512, + "grad_norm": 0.44278115034103394, + "learning_rate": 1.1708649142961576e-05, + "loss": 0.1917, + "step": 23988 + }, + { + "epoch": 0.4453759983365698, + "grad_norm": 0.345720499753952, + "learning_rate": 1.1707499794546662e-05, + "loss": 0.441, + "step": 23990 + }, + { + "epoch": 0.4454131284739885, + "grad_norm": 0.29425013065338135, + "learning_rate": 1.1706350422897792e-05, + "loss": 0.4168, + "step": 23992 + }, + { + "epoch": 0.44545025861140713, + "grad_norm": 0.24771080911159515, + "learning_rate": 1.17052010280306e-05, + "loss": 0.446, + "step": 23994 + }, + { + "epoch": 0.44548738874882576, + "grad_norm": 0.5992075800895691, + "learning_rate": 1.1704051609960729e-05, + "loss": 0.1989, + "step": 23996 + }, + { + "epoch": 0.4455245188862444, + "grad_norm": 0.3501785695552826, + "learning_rate": 1.1702902168703823e-05, + "loss": 0.2966, + "step": 23998 + }, + { + "epoch": 0.445561649023663, + "grad_norm": 0.46936652064323425, + "learning_rate": 1.1701752704275518e-05, + "loss": 0.3691, + "step": 24000 + }, + { + "epoch": 0.44559877916108165, + "grad_norm": 0.36993396282196045, + "learning_rate": 1.1700603216691458e-05, + "loss": 0.4871, + "step": 24002 + }, + { + "epoch": 0.44563590929850033, + "grad_norm": 0.42775020003318787, + "learning_rate": 1.169945370596728e-05, + "loss": 0.2937, + "step": 24004 + }, + { + "epoch": 0.44567303943591896, + "grad_norm": 0.3824927508831024, + "learning_rate": 1.1698304172118627e-05, + "loss": 0.389, + "step": 24006 + }, + { + "epoch": 0.4457101695733376, + "grad_norm": 0.35369062423706055, + "learning_rate": 1.1697154615161142e-05, + "loss": 0.3447, + "step": 24008 + }, + { + "epoch": 0.4457472997107562, + "grad_norm": 0.3623195290565491, + "learning_rate": 1.1696005035110468e-05, + "loss": 0.2547, + "step": 24010 + }, + { + "epoch": 0.44578442984817485, + "grad_norm": 0.33387210965156555, + "learning_rate": 1.1694855431982243e-05, + "loss": 0.5037, + "step": 24012 + }, + { + "epoch": 0.44582155998559353, + "grad_norm": 0.29337361454963684, + "learning_rate": 1.1693705805792112e-05, + "loss": 0.4434, + "step": 24014 + }, + { + "epoch": 0.44585869012301216, + "grad_norm": 0.47422701120376587, + "learning_rate": 1.1692556156555719e-05, + "loss": 0.2834, + "step": 24016 + }, + { + "epoch": 0.4458958202604308, + "grad_norm": 0.5289061665534973, + "learning_rate": 1.1691406484288708e-05, + "loss": 0.213, + "step": 24018 + }, + { + "epoch": 0.4459329503978494, + "grad_norm": 0.27482104301452637, + "learning_rate": 1.169025678900672e-05, + "loss": 0.3034, + "step": 24020 + }, + { + "epoch": 0.44597008053526804, + "grad_norm": 0.299385130405426, + "learning_rate": 1.16891070707254e-05, + "loss": 0.2953, + "step": 24022 + }, + { + "epoch": 0.4460072106726867, + "grad_norm": 0.5817592144012451, + "learning_rate": 1.1687957329460393e-05, + "loss": 0.2229, + "step": 24024 + }, + { + "epoch": 0.44604434081010536, + "grad_norm": 0.5666360259056091, + "learning_rate": 1.1686807565227341e-05, + "loss": 0.4658, + "step": 24026 + }, + { + "epoch": 0.446081470947524, + "grad_norm": 0.35040393471717834, + "learning_rate": 1.168565777804189e-05, + "loss": 0.2929, + "step": 24028 + }, + { + "epoch": 0.4461186010849426, + "grad_norm": 0.3602055013179779, + "learning_rate": 1.168450796791969e-05, + "loss": 0.2605, + "step": 24030 + }, + { + "epoch": 0.44615573122236124, + "grad_norm": 0.34880051016807556, + "learning_rate": 1.1683358134876377e-05, + "loss": 0.446, + "step": 24032 + }, + { + "epoch": 0.44619286135977987, + "grad_norm": 0.7034657001495361, + "learning_rate": 1.1682208278927607e-05, + "loss": 0.5569, + "step": 24034 + }, + { + "epoch": 0.44622999149719855, + "grad_norm": 0.33224138617515564, + "learning_rate": 1.168105840008902e-05, + "loss": 0.3605, + "step": 24036 + }, + { + "epoch": 0.4462671216346172, + "grad_norm": 0.24132730066776276, + "learning_rate": 1.167990849837626e-05, + "loss": 0.2734, + "step": 24038 + }, + { + "epoch": 0.4463042517720358, + "grad_norm": 0.2993573844432831, + "learning_rate": 1.167875857380498e-05, + "loss": 0.4002, + "step": 24040 + }, + { + "epoch": 0.44634138190945444, + "grad_norm": 0.33143749833106995, + "learning_rate": 1.1677608626390824e-05, + "loss": 0.3191, + "step": 24042 + }, + { + "epoch": 0.44637851204687307, + "grad_norm": 0.6321326494216919, + "learning_rate": 1.167645865614944e-05, + "loss": 0.5384, + "step": 24044 + }, + { + "epoch": 0.44641564218429175, + "grad_norm": 0.3029479384422302, + "learning_rate": 1.1675308663096473e-05, + "loss": 0.164, + "step": 24046 + }, + { + "epoch": 0.4464527723217104, + "grad_norm": 0.3932936191558838, + "learning_rate": 1.1674158647247578e-05, + "loss": 0.269, + "step": 24048 + }, + { + "epoch": 0.446489902459129, + "grad_norm": 0.36209365725517273, + "learning_rate": 1.1673008608618392e-05, + "loss": 0.4002, + "step": 24050 + }, + { + "epoch": 0.44652703259654764, + "grad_norm": 0.3835330605506897, + "learning_rate": 1.1671858547224574e-05, + "loss": 0.2055, + "step": 24052 + }, + { + "epoch": 0.44656416273396626, + "grad_norm": 0.3269481956958771, + "learning_rate": 1.167070846308177e-05, + "loss": 0.299, + "step": 24054 + }, + { + "epoch": 0.4466012928713849, + "grad_norm": 0.2737219035625458, + "learning_rate": 1.1669558356205628e-05, + "loss": 0.3431, + "step": 24056 + }, + { + "epoch": 0.4466384230088036, + "grad_norm": 0.35054299235343933, + "learning_rate": 1.1668408226611793e-05, + "loss": 0.1714, + "step": 24058 + }, + { + "epoch": 0.4466755531462222, + "grad_norm": 0.3232698440551758, + "learning_rate": 1.1667258074315924e-05, + "loss": 0.3528, + "step": 24060 + }, + { + "epoch": 0.44671268328364083, + "grad_norm": 0.3683943748474121, + "learning_rate": 1.1666107899333665e-05, + "loss": 0.3006, + "step": 24062 + }, + { + "epoch": 0.44674981342105946, + "grad_norm": 0.31169185042381287, + "learning_rate": 1.166495770168067e-05, + "loss": 0.1771, + "step": 24064 + }, + { + "epoch": 0.4467869435584781, + "grad_norm": 0.23884519934654236, + "learning_rate": 1.1663807481372587e-05, + "loss": 0.2089, + "step": 24066 + }, + { + "epoch": 0.4468240736958968, + "grad_norm": 0.5430421829223633, + "learning_rate": 1.1662657238425065e-05, + "loss": 0.3875, + "step": 24068 + }, + { + "epoch": 0.4468612038333154, + "grad_norm": 0.41073131561279297, + "learning_rate": 1.1661506972853762e-05, + "loss": 0.2629, + "step": 24070 + }, + { + "epoch": 0.44689833397073403, + "grad_norm": 0.5527470111846924, + "learning_rate": 1.1660356684674323e-05, + "loss": 0.2775, + "step": 24072 + }, + { + "epoch": 0.44693546410815266, + "grad_norm": 4.22423791885376, + "learning_rate": 1.1659206373902405e-05, + "loss": 0.246, + "step": 24074 + }, + { + "epoch": 0.4469725942455713, + "grad_norm": 0.45812708139419556, + "learning_rate": 1.1658056040553658e-05, + "loss": 0.3123, + "step": 24076 + }, + { + "epoch": 0.4470097243829899, + "grad_norm": 0.49532121419906616, + "learning_rate": 1.1656905684643733e-05, + "loss": 0.2828, + "step": 24078 + }, + { + "epoch": 0.4470468545204086, + "grad_norm": 0.27955830097198486, + "learning_rate": 1.1655755306188288e-05, + "loss": 0.4633, + "step": 24080 + }, + { + "epoch": 0.4470839846578272, + "grad_norm": 0.7527885437011719, + "learning_rate": 1.1654604905202968e-05, + "loss": 0.2245, + "step": 24082 + }, + { + "epoch": 0.44712111479524586, + "grad_norm": 0.5109013319015503, + "learning_rate": 1.1653454481703433e-05, + "loss": 0.2999, + "step": 24084 + }, + { + "epoch": 0.4471582449326645, + "grad_norm": 0.4185657799243927, + "learning_rate": 1.1652304035705337e-05, + "loss": 0.2853, + "step": 24086 + }, + { + "epoch": 0.4471953750700831, + "grad_norm": 0.3150821030139923, + "learning_rate": 1.1651153567224332e-05, + "loss": 0.227, + "step": 24088 + }, + { + "epoch": 0.4472325052075018, + "grad_norm": 0.3207242786884308, + "learning_rate": 1.1650003076276071e-05, + "loss": 0.4943, + "step": 24090 + }, + { + "epoch": 0.4472696353449204, + "grad_norm": 0.4198581874370575, + "learning_rate": 1.1648852562876214e-05, + "loss": 0.2121, + "step": 24092 + }, + { + "epoch": 0.44730676548233905, + "grad_norm": 0.5249981880187988, + "learning_rate": 1.1647702027040408e-05, + "loss": 0.4632, + "step": 24094 + }, + { + "epoch": 0.4473438956197577, + "grad_norm": 0.3403719961643219, + "learning_rate": 1.1646551468784315e-05, + "loss": 0.1828, + "step": 24096 + }, + { + "epoch": 0.4473810257571763, + "grad_norm": 0.4295170307159424, + "learning_rate": 1.1645400888123588e-05, + "loss": 0.2504, + "step": 24098 + }, + { + "epoch": 0.447418155894595, + "grad_norm": 0.34082096815109253, + "learning_rate": 1.1644250285073886e-05, + "loss": 0.2214, + "step": 24100 + }, + { + "epoch": 0.4474552860320136, + "grad_norm": 0.2675071060657501, + "learning_rate": 1.1643099659650858e-05, + "loss": 0.2623, + "step": 24102 + }, + { + "epoch": 0.44749241616943225, + "grad_norm": 0.29089948534965515, + "learning_rate": 1.1641949011870169e-05, + "loss": 0.2752, + "step": 24104 + }, + { + "epoch": 0.4475295463068509, + "grad_norm": 0.3934226632118225, + "learning_rate": 1.164079834174747e-05, + "loss": 0.4361, + "step": 24106 + }, + { + "epoch": 0.4475666764442695, + "grad_norm": 0.3137322664260864, + "learning_rate": 1.1639647649298423e-05, + "loss": 0.2348, + "step": 24108 + }, + { + "epoch": 0.44760380658168814, + "grad_norm": 0.41590625047683716, + "learning_rate": 1.163849693453868e-05, + "loss": 0.3274, + "step": 24110 + }, + { + "epoch": 0.4476409367191068, + "grad_norm": 0.43023356795310974, + "learning_rate": 1.1637346197483902e-05, + "loss": 0.194, + "step": 24112 + }, + { + "epoch": 0.44767806685652545, + "grad_norm": 0.441462904214859, + "learning_rate": 1.1636195438149749e-05, + "loss": 0.1265, + "step": 24114 + }, + { + "epoch": 0.4477151969939441, + "grad_norm": 0.3871336579322815, + "learning_rate": 1.1635044656551875e-05, + "loss": 0.2088, + "step": 24116 + }, + { + "epoch": 0.4477523271313627, + "grad_norm": 0.30690649151802063, + "learning_rate": 1.1633893852705943e-05, + "loss": 0.3618, + "step": 24118 + }, + { + "epoch": 0.44778945726878133, + "grad_norm": 0.4032331109046936, + "learning_rate": 1.1632743026627606e-05, + "loss": 0.4696, + "step": 24120 + }, + { + "epoch": 0.4478265874062, + "grad_norm": 0.443729430437088, + "learning_rate": 1.163159217833253e-05, + "loss": 0.299, + "step": 24122 + }, + { + "epoch": 0.44786371754361864, + "grad_norm": 0.28050127625465393, + "learning_rate": 1.1630441307836371e-05, + "loss": 0.1166, + "step": 24124 + }, + { + "epoch": 0.4479008476810373, + "grad_norm": 0.21925316751003265, + "learning_rate": 1.1629290415154788e-05, + "loss": 0.2606, + "step": 24126 + }, + { + "epoch": 0.4479379778184559, + "grad_norm": 0.3937493562698364, + "learning_rate": 1.1628139500303446e-05, + "loss": 0.43, + "step": 24128 + }, + { + "epoch": 0.44797510795587453, + "grad_norm": 0.18187883496284485, + "learning_rate": 1.1626988563298e-05, + "loss": 0.2802, + "step": 24130 + }, + { + "epoch": 0.44801223809329316, + "grad_norm": 0.47616690397262573, + "learning_rate": 1.1625837604154113e-05, + "loss": 0.2522, + "step": 24132 + }, + { + "epoch": 0.44804936823071184, + "grad_norm": 0.4741135835647583, + "learning_rate": 1.1624686622887444e-05, + "loss": 0.186, + "step": 24134 + }, + { + "epoch": 0.44808649836813047, + "grad_norm": 0.3838316798210144, + "learning_rate": 1.1623535619513659e-05, + "loss": 0.2108, + "step": 24136 + }, + { + "epoch": 0.4481236285055491, + "grad_norm": 0.2918176054954529, + "learning_rate": 1.1622384594048418e-05, + "loss": 0.3597, + "step": 24138 + }, + { + "epoch": 0.4481607586429677, + "grad_norm": 0.33782634139060974, + "learning_rate": 1.1621233546507382e-05, + "loss": 0.3898, + "step": 24140 + }, + { + "epoch": 0.44819788878038636, + "grad_norm": 0.25396132469177246, + "learning_rate": 1.1620082476906212e-05, + "loss": 0.3019, + "step": 24142 + }, + { + "epoch": 0.44823501891780504, + "grad_norm": 0.5581413507461548, + "learning_rate": 1.1618931385260574e-05, + "loss": 0.2709, + "step": 24144 + }, + { + "epoch": 0.44827214905522367, + "grad_norm": 0.3434060513973236, + "learning_rate": 1.1617780271586128e-05, + "loss": 0.383, + "step": 24146 + }, + { + "epoch": 0.4483092791926423, + "grad_norm": 0.46190202236175537, + "learning_rate": 1.1616629135898538e-05, + "loss": 0.2948, + "step": 24148 + }, + { + "epoch": 0.4483464093300609, + "grad_norm": 0.3322995603084564, + "learning_rate": 1.1615477978213471e-05, + "loss": 0.2121, + "step": 24150 + }, + { + "epoch": 0.44838353946747955, + "grad_norm": 0.28766751289367676, + "learning_rate": 1.1614326798546585e-05, + "loss": 0.4175, + "step": 24152 + }, + { + "epoch": 0.4484206696048982, + "grad_norm": 0.7717507481575012, + "learning_rate": 1.1613175596913546e-05, + "loss": 0.3161, + "step": 24154 + }, + { + "epoch": 0.44845779974231686, + "grad_norm": 0.7002748250961304, + "learning_rate": 1.161202437333002e-05, + "loss": 0.3003, + "step": 24156 + }, + { + "epoch": 0.4484949298797355, + "grad_norm": 0.3266995847225189, + "learning_rate": 1.161087312781167e-05, + "loss": 0.4411, + "step": 24158 + }, + { + "epoch": 0.4485320600171541, + "grad_norm": 0.33778637647628784, + "learning_rate": 1.1609721860374164e-05, + "loss": 0.1441, + "step": 24160 + }, + { + "epoch": 0.44856919015457275, + "grad_norm": 0.371554434299469, + "learning_rate": 1.1608570571033164e-05, + "loss": 0.2539, + "step": 24162 + }, + { + "epoch": 0.4486063202919914, + "grad_norm": 0.37389928102493286, + "learning_rate": 1.1607419259804336e-05, + "loss": 0.2845, + "step": 24164 + }, + { + "epoch": 0.44864345042941006, + "grad_norm": 0.2681103050708771, + "learning_rate": 1.1606267926703347e-05, + "loss": 0.3067, + "step": 24166 + }, + { + "epoch": 0.4486805805668287, + "grad_norm": 0.34936437010765076, + "learning_rate": 1.1605116571745864e-05, + "loss": 0.2126, + "step": 24168 + }, + { + "epoch": 0.4487177107042473, + "grad_norm": 0.339579701423645, + "learning_rate": 1.160396519494755e-05, + "loss": 0.253, + "step": 24170 + }, + { + "epoch": 0.44875484084166595, + "grad_norm": 0.4387122690677643, + "learning_rate": 1.1602813796324077e-05, + "loss": 0.2073, + "step": 24172 + }, + { + "epoch": 0.4487919709790846, + "grad_norm": 0.45612820982933044, + "learning_rate": 1.160166237589111e-05, + "loss": 0.3018, + "step": 24174 + }, + { + "epoch": 0.44882910111650326, + "grad_norm": 0.3554868996143341, + "learning_rate": 1.160051093366431e-05, + "loss": 0.2237, + "step": 24176 + }, + { + "epoch": 0.4488662312539219, + "grad_norm": 0.7070173025131226, + "learning_rate": 1.1599359469659355e-05, + "loss": 0.3964, + "step": 24178 + }, + { + "epoch": 0.4489033613913405, + "grad_norm": 0.3109132647514343, + "learning_rate": 1.1598207983891907e-05, + "loss": 0.3791, + "step": 24180 + }, + { + "epoch": 0.44894049152875914, + "grad_norm": 0.33284395933151245, + "learning_rate": 1.1597056476377637e-05, + "loss": 0.225, + "step": 24182 + }, + { + "epoch": 0.4489776216661778, + "grad_norm": 0.3307919502258301, + "learning_rate": 1.1595904947132209e-05, + "loss": 0.2377, + "step": 24184 + }, + { + "epoch": 0.4490147518035964, + "grad_norm": 0.3260999023914337, + "learning_rate": 1.1594753396171295e-05, + "loss": 0.345, + "step": 24186 + }, + { + "epoch": 0.4490518819410151, + "grad_norm": 0.2680944502353668, + "learning_rate": 1.1593601823510568e-05, + "loss": 0.3512, + "step": 24188 + }, + { + "epoch": 0.4490890120784337, + "grad_norm": 0.26139354705810547, + "learning_rate": 1.1592450229165688e-05, + "loss": 0.1844, + "step": 24190 + }, + { + "epoch": 0.44912614221585234, + "grad_norm": 0.28114280104637146, + "learning_rate": 1.1591298613152336e-05, + "loss": 0.2858, + "step": 24192 + }, + { + "epoch": 0.44916327235327097, + "grad_norm": 0.28542953729629517, + "learning_rate": 1.1590146975486173e-05, + "loss": 0.5018, + "step": 24194 + }, + { + "epoch": 0.4492004024906896, + "grad_norm": 0.41173604130744934, + "learning_rate": 1.1588995316182874e-05, + "loss": 0.2398, + "step": 24196 + }, + { + "epoch": 0.4492375326281083, + "grad_norm": 0.5837909579277039, + "learning_rate": 1.1587843635258107e-05, + "loss": 0.2741, + "step": 24198 + }, + { + "epoch": 0.4492746627655269, + "grad_norm": 0.33508795499801636, + "learning_rate": 1.1586691932727547e-05, + "loss": 0.1162, + "step": 24200 + }, + { + "epoch": 0.44931179290294554, + "grad_norm": 0.17821308970451355, + "learning_rate": 1.1585540208606863e-05, + "loss": 0.3627, + "step": 24202 + }, + { + "epoch": 0.44934892304036417, + "grad_norm": 0.841312050819397, + "learning_rate": 1.1584388462911723e-05, + "loss": 0.247, + "step": 24204 + }, + { + "epoch": 0.4493860531777828, + "grad_norm": 0.4229127764701843, + "learning_rate": 1.1583236695657802e-05, + "loss": 0.3341, + "step": 24206 + }, + { + "epoch": 0.4494231833152014, + "grad_norm": 0.3443514406681061, + "learning_rate": 1.1582084906860772e-05, + "loss": 0.3204, + "step": 24208 + }, + { + "epoch": 0.4494603134526201, + "grad_norm": 0.3607397675514221, + "learning_rate": 1.1580933096536304e-05, + "loss": 0.2103, + "step": 24210 + }, + { + "epoch": 0.44949744359003874, + "grad_norm": 0.28506115078926086, + "learning_rate": 1.1579781264700075e-05, + "loss": 0.4519, + "step": 24212 + }, + { + "epoch": 0.44953457372745736, + "grad_norm": 0.4430101215839386, + "learning_rate": 1.1578629411367755e-05, + "loss": 0.2865, + "step": 24214 + }, + { + "epoch": 0.449571703864876, + "grad_norm": 0.8077531456947327, + "learning_rate": 1.1577477536555014e-05, + "loss": 0.4132, + "step": 24216 + }, + { + "epoch": 0.4496088340022946, + "grad_norm": 0.4098680019378662, + "learning_rate": 1.1576325640277532e-05, + "loss": 0.3987, + "step": 24218 + }, + { + "epoch": 0.4496459641397133, + "grad_norm": 0.20865680277347565, + "learning_rate": 1.1575173722550978e-05, + "loss": 0.1289, + "step": 24220 + }, + { + "epoch": 0.44968309427713193, + "grad_norm": 0.38363873958587646, + "learning_rate": 1.1574021783391027e-05, + "loss": 0.2021, + "step": 24222 + }, + { + "epoch": 0.44972022441455056, + "grad_norm": 0.5751428008079529, + "learning_rate": 1.1572869822813356e-05, + "loss": 0.5319, + "step": 24224 + }, + { + "epoch": 0.4497573545519692, + "grad_norm": 0.2770703136920929, + "learning_rate": 1.1571717840833638e-05, + "loss": 0.4279, + "step": 24226 + }, + { + "epoch": 0.4497944846893878, + "grad_norm": 0.24907010793685913, + "learning_rate": 1.1570565837467546e-05, + "loss": 0.2851, + "step": 24228 + }, + { + "epoch": 0.44983161482680645, + "grad_norm": 0.3281336724758148, + "learning_rate": 1.156941381273076e-05, + "loss": 0.1999, + "step": 24230 + }, + { + "epoch": 0.44986874496422513, + "grad_norm": 0.4506210684776306, + "learning_rate": 1.1568261766638954e-05, + "loss": 0.2597, + "step": 24232 + }, + { + "epoch": 0.44990587510164376, + "grad_norm": 0.23393791913986206, + "learning_rate": 1.1567109699207801e-05, + "loss": 0.1556, + "step": 24234 + }, + { + "epoch": 0.4499430052390624, + "grad_norm": 0.7359989285469055, + "learning_rate": 1.1565957610452979e-05, + "loss": 0.6043, + "step": 24236 + }, + { + "epoch": 0.449980135376481, + "grad_norm": 0.22738473117351532, + "learning_rate": 1.1564805500390164e-05, + "loss": 0.2146, + "step": 24238 + }, + { + "epoch": 0.45001726551389964, + "grad_norm": 0.3872811496257782, + "learning_rate": 1.156365336903503e-05, + "loss": 0.2651, + "step": 24240 + }, + { + "epoch": 0.45005439565131833, + "grad_norm": 0.34296178817749023, + "learning_rate": 1.1562501216403261e-05, + "loss": 0.342, + "step": 24242 + }, + { + "epoch": 0.45009152578873696, + "grad_norm": 0.4160754680633545, + "learning_rate": 1.1561349042510532e-05, + "loss": 0.3028, + "step": 24244 + }, + { + "epoch": 0.4501286559261556, + "grad_norm": 0.6919189095497131, + "learning_rate": 1.156019684737252e-05, + "loss": 0.3078, + "step": 24246 + }, + { + "epoch": 0.4501657860635742, + "grad_norm": 0.36379021406173706, + "learning_rate": 1.1559044631004898e-05, + "loss": 0.303, + "step": 24248 + }, + { + "epoch": 0.45020291620099284, + "grad_norm": 0.35951507091522217, + "learning_rate": 1.1557892393423351e-05, + "loss": 0.2808, + "step": 24250 + }, + { + "epoch": 0.4502400463384115, + "grad_norm": 0.46340298652648926, + "learning_rate": 1.1556740134643554e-05, + "loss": 0.2855, + "step": 24252 + }, + { + "epoch": 0.45027717647583015, + "grad_norm": 0.33905476331710815, + "learning_rate": 1.1555587854681187e-05, + "loss": 0.4119, + "step": 24254 + }, + { + "epoch": 0.4503143066132488, + "grad_norm": 0.5160535573959351, + "learning_rate": 1.1554435553551933e-05, + "loss": 0.1015, + "step": 24256 + }, + { + "epoch": 0.4503514367506674, + "grad_norm": 0.3321382999420166, + "learning_rate": 1.1553283231271462e-05, + "loss": 0.2341, + "step": 24258 + }, + { + "epoch": 0.45038856688808604, + "grad_norm": 0.4394192397594452, + "learning_rate": 1.1552130887855457e-05, + "loss": 0.5611, + "step": 24260 + }, + { + "epoch": 0.45042569702550467, + "grad_norm": 0.7245113849639893, + "learning_rate": 1.1550978523319603e-05, + "loss": 0.3062, + "step": 24262 + }, + { + "epoch": 0.45046282716292335, + "grad_norm": 0.40561386942863464, + "learning_rate": 1.1549826137679578e-05, + "loss": 0.2109, + "step": 24264 + }, + { + "epoch": 0.450499957300342, + "grad_norm": 0.4951251745223999, + "learning_rate": 1.154867373095106e-05, + "loss": 0.2717, + "step": 24266 + }, + { + "epoch": 0.4505370874377606, + "grad_norm": 0.3473818004131317, + "learning_rate": 1.154752130314973e-05, + "loss": 0.2167, + "step": 24268 + }, + { + "epoch": 0.45057421757517924, + "grad_norm": 0.214506134390831, + "learning_rate": 1.1546368854291275e-05, + "loss": 0.164, + "step": 24270 + }, + { + "epoch": 0.45061134771259787, + "grad_norm": 0.4057979881763458, + "learning_rate": 1.154521638439137e-05, + "loss": 0.1924, + "step": 24272 + }, + { + "epoch": 0.45064847785001655, + "grad_norm": 0.4874143600463867, + "learning_rate": 1.1544063893465695e-05, + "loss": 0.4362, + "step": 24274 + }, + { + "epoch": 0.4506856079874352, + "grad_norm": 0.49551358819007874, + "learning_rate": 1.1542911381529937e-05, + "loss": 0.351, + "step": 24276 + }, + { + "epoch": 0.4507227381248538, + "grad_norm": 0.7293763756752014, + "learning_rate": 1.154175884859978e-05, + "loss": 0.388, + "step": 24278 + }, + { + "epoch": 0.45075986826227243, + "grad_norm": 0.3577684760093689, + "learning_rate": 1.15406062946909e-05, + "loss": 0.3224, + "step": 24280 + }, + { + "epoch": 0.45079699839969106, + "grad_norm": 0.34816378355026245, + "learning_rate": 1.1539453719818985e-05, + "loss": 0.2899, + "step": 24282 + }, + { + "epoch": 0.4508341285371097, + "grad_norm": 0.27185338735580444, + "learning_rate": 1.1538301123999713e-05, + "loss": 0.2826, + "step": 24284 + }, + { + "epoch": 0.4508712586745284, + "grad_norm": 0.3021487593650818, + "learning_rate": 1.1537148507248774e-05, + "loss": 0.3028, + "step": 24286 + }, + { + "epoch": 0.450908388811947, + "grad_norm": 0.49784591794013977, + "learning_rate": 1.1535995869581846e-05, + "loss": 0.4266, + "step": 24288 + }, + { + "epoch": 0.45094551894936563, + "grad_norm": 0.4350736737251282, + "learning_rate": 1.1534843211014614e-05, + "loss": 0.1592, + "step": 24290 + }, + { + "epoch": 0.45098264908678426, + "grad_norm": 0.5790149569511414, + "learning_rate": 1.1533690531562765e-05, + "loss": 0.4709, + "step": 24292 + }, + { + "epoch": 0.4510197792242029, + "grad_norm": 0.43375521898269653, + "learning_rate": 1.1532537831241983e-05, + "loss": 0.2505, + "step": 24294 + }, + { + "epoch": 0.45105690936162157, + "grad_norm": 0.3203457295894623, + "learning_rate": 1.153138511006795e-05, + "loss": 0.4867, + "step": 24296 + }, + { + "epoch": 0.4510940394990402, + "grad_norm": 0.30377355217933655, + "learning_rate": 1.1530232368056354e-05, + "loss": 0.2107, + "step": 24298 + }, + { + "epoch": 0.45113116963645883, + "grad_norm": 0.28263983130455017, + "learning_rate": 1.1529079605222876e-05, + "loss": 0.4215, + "step": 24300 + }, + { + "epoch": 0.45116829977387746, + "grad_norm": 0.41750532388687134, + "learning_rate": 1.1527926821583207e-05, + "loss": 0.2489, + "step": 24302 + }, + { + "epoch": 0.4512054299112961, + "grad_norm": 0.5356645584106445, + "learning_rate": 1.1526774017153029e-05, + "loss": 0.2804, + "step": 24304 + }, + { + "epoch": 0.4512425600487147, + "grad_norm": 0.5181710720062256, + "learning_rate": 1.1525621191948031e-05, + "loss": 0.5768, + "step": 24306 + }, + { + "epoch": 0.4512796901861334, + "grad_norm": 0.3529236316680908, + "learning_rate": 1.15244683459839e-05, + "loss": 0.3728, + "step": 24308 + }, + { + "epoch": 0.451316820323552, + "grad_norm": 0.4705997407436371, + "learning_rate": 1.1523315479276318e-05, + "loss": 0.4743, + "step": 24310 + }, + { + "epoch": 0.45135395046097065, + "grad_norm": 0.4588668942451477, + "learning_rate": 1.1522162591840976e-05, + "loss": 0.2983, + "step": 24312 + }, + { + "epoch": 0.4513910805983893, + "grad_norm": 0.2729796767234802, + "learning_rate": 1.1521009683693563e-05, + "loss": 0.1774, + "step": 24314 + }, + { + "epoch": 0.4514282107358079, + "grad_norm": 0.31937137246131897, + "learning_rate": 1.1519856754849761e-05, + "loss": 0.1868, + "step": 24316 + }, + { + "epoch": 0.4514653408732266, + "grad_norm": 0.40577009320259094, + "learning_rate": 1.1518703805325264e-05, + "loss": 0.3799, + "step": 24318 + }, + { + "epoch": 0.4515024710106452, + "grad_norm": 0.4380381107330322, + "learning_rate": 1.1517550835135756e-05, + "loss": 0.4497, + "step": 24320 + }, + { + "epoch": 0.45153960114806385, + "grad_norm": 0.7073315382003784, + "learning_rate": 1.1516397844296925e-05, + "loss": 0.4003, + "step": 24322 + }, + { + "epoch": 0.4515767312854825, + "grad_norm": 0.44480425119400024, + "learning_rate": 1.151524483282446e-05, + "loss": 0.3039, + "step": 24324 + }, + { + "epoch": 0.4516138614229011, + "grad_norm": 0.32456642389297485, + "learning_rate": 1.1514091800734058e-05, + "loss": 0.3176, + "step": 24326 + }, + { + "epoch": 0.4516509915603198, + "grad_norm": 0.5175254940986633, + "learning_rate": 1.1512938748041396e-05, + "loss": 0.3516, + "step": 24328 + }, + { + "epoch": 0.4516881216977384, + "grad_norm": 0.36971572041511536, + "learning_rate": 1.1511785674762173e-05, + "loss": 0.2398, + "step": 24330 + }, + { + "epoch": 0.45172525183515705, + "grad_norm": 0.4066765308380127, + "learning_rate": 1.1510632580912075e-05, + "loss": 0.5428, + "step": 24332 + }, + { + "epoch": 0.4517623819725757, + "grad_norm": 0.25535905361175537, + "learning_rate": 1.150947946650679e-05, + "loss": 0.4229, + "step": 24334 + }, + { + "epoch": 0.4517995121099943, + "grad_norm": 0.37440025806427, + "learning_rate": 1.1508326331562012e-05, + "loss": 0.268, + "step": 24336 + }, + { + "epoch": 0.45183664224741293, + "grad_norm": 0.4801817834377289, + "learning_rate": 1.1507173176093432e-05, + "loss": 0.3184, + "step": 24338 + }, + { + "epoch": 0.4518737723848316, + "grad_norm": 0.3520878851413727, + "learning_rate": 1.1506020000116739e-05, + "loss": 0.2632, + "step": 24340 + }, + { + "epoch": 0.45191090252225025, + "grad_norm": 0.35772955417633057, + "learning_rate": 1.1504866803647622e-05, + "loss": 0.2517, + "step": 24342 + }, + { + "epoch": 0.4519480326596689, + "grad_norm": 0.42108869552612305, + "learning_rate": 1.1503713586701777e-05, + "loss": 0.2571, + "step": 24344 + }, + { + "epoch": 0.4519851627970875, + "grad_norm": 0.4936422109603882, + "learning_rate": 1.1502560349294896e-05, + "loss": 0.2214, + "step": 24346 + }, + { + "epoch": 0.45202229293450613, + "grad_norm": 0.4932944178581238, + "learning_rate": 1.1501407091442667e-05, + "loss": 0.3233, + "step": 24348 + }, + { + "epoch": 0.4520594230719248, + "grad_norm": 0.32448112964630127, + "learning_rate": 1.1500253813160786e-05, + "loss": 0.2351, + "step": 24350 + }, + { + "epoch": 0.45209655320934344, + "grad_norm": 0.41791751980781555, + "learning_rate": 1.1499100514464947e-05, + "loss": 0.3427, + "step": 24352 + }, + { + "epoch": 0.45213368334676207, + "grad_norm": 0.4049769937992096, + "learning_rate": 1.1497947195370835e-05, + "loss": 0.3262, + "step": 24354 + }, + { + "epoch": 0.4521708134841807, + "grad_norm": 0.38210293650627136, + "learning_rate": 1.1496793855894151e-05, + "loss": 0.1533, + "step": 24356 + }, + { + "epoch": 0.45220794362159933, + "grad_norm": 0.4687541425228119, + "learning_rate": 1.1495640496050588e-05, + "loss": 0.3664, + "step": 24358 + }, + { + "epoch": 0.45224507375901796, + "grad_norm": 0.5578746199607849, + "learning_rate": 1.1494487115855834e-05, + "loss": 0.4291, + "step": 24360 + }, + { + "epoch": 0.45228220389643664, + "grad_norm": 0.34367769956588745, + "learning_rate": 1.149333371532559e-05, + "loss": 0.4349, + "step": 24362 + }, + { + "epoch": 0.45231933403385527, + "grad_norm": 0.30054235458374023, + "learning_rate": 1.1492180294475548e-05, + "loss": 0.3398, + "step": 24364 + }, + { + "epoch": 0.4523564641712739, + "grad_norm": 0.7804811596870422, + "learning_rate": 1.1491026853321398e-05, + "loss": 0.3537, + "step": 24366 + }, + { + "epoch": 0.4523935943086925, + "grad_norm": 0.24323876202106476, + "learning_rate": 1.148987339187884e-05, + "loss": 0.2364, + "step": 24368 + }, + { + "epoch": 0.45243072444611115, + "grad_norm": 0.5521492958068848, + "learning_rate": 1.1488719910163569e-05, + "loss": 0.5575, + "step": 24370 + }, + { + "epoch": 0.45246785458352984, + "grad_norm": 0.3839251399040222, + "learning_rate": 1.148756640819128e-05, + "loss": 0.2073, + "step": 24372 + }, + { + "epoch": 0.45250498472094847, + "grad_norm": 0.2829572856426239, + "learning_rate": 1.1486412885977664e-05, + "loss": 0.4, + "step": 24374 + }, + { + "epoch": 0.4525421148583671, + "grad_norm": 0.45137205719947815, + "learning_rate": 1.1485259343538426e-05, + "loss": 0.5329, + "step": 24376 + }, + { + "epoch": 0.4525792449957857, + "grad_norm": 0.4369995594024658, + "learning_rate": 1.1484105780889253e-05, + "loss": 0.2938, + "step": 24378 + }, + { + "epoch": 0.45261637513320435, + "grad_norm": 0.2578050494194031, + "learning_rate": 1.1482952198045846e-05, + "loss": 0.2111, + "step": 24380 + }, + { + "epoch": 0.452653505270623, + "grad_norm": 0.4284721612930298, + "learning_rate": 1.1481798595023905e-05, + "loss": 0.226, + "step": 24382 + }, + { + "epoch": 0.45269063540804166, + "grad_norm": 0.7003005146980286, + "learning_rate": 1.1480644971839124e-05, + "loss": 0.3373, + "step": 24384 + }, + { + "epoch": 0.4527277655454603, + "grad_norm": 0.34780243039131165, + "learning_rate": 1.1479491328507195e-05, + "loss": 0.3447, + "step": 24386 + }, + { + "epoch": 0.4527648956828789, + "grad_norm": 0.4656771719455719, + "learning_rate": 1.1478337665043822e-05, + "loss": 0.3761, + "step": 24388 + }, + { + "epoch": 0.45280202582029755, + "grad_norm": 0.3914140462875366, + "learning_rate": 1.1477183981464704e-05, + "loss": 0.1832, + "step": 24390 + }, + { + "epoch": 0.4528391559577162, + "grad_norm": 0.4113878309726715, + "learning_rate": 1.1476030277785534e-05, + "loss": 0.3029, + "step": 24392 + }, + { + "epoch": 0.45287628609513486, + "grad_norm": 0.3519241213798523, + "learning_rate": 1.1474876554022016e-05, + "loss": 0.3168, + "step": 24394 + }, + { + "epoch": 0.4529134162325535, + "grad_norm": 0.2517205774784088, + "learning_rate": 1.1473722810189845e-05, + "loss": 0.2098, + "step": 24396 + }, + { + "epoch": 0.4529505463699721, + "grad_norm": 0.562839925289154, + "learning_rate": 1.147256904630472e-05, + "loss": 0.1431, + "step": 24398 + }, + { + "epoch": 0.45298767650739075, + "grad_norm": 0.4227277636528015, + "learning_rate": 1.147141526238234e-05, + "loss": 0.2299, + "step": 24400 + }, + { + "epoch": 0.4530248066448094, + "grad_norm": 0.24431215226650238, + "learning_rate": 1.1470261458438408e-05, + "loss": 0.1003, + "step": 24402 + }, + { + "epoch": 0.45306193678222806, + "grad_norm": 0.3886420428752899, + "learning_rate": 1.146910763448862e-05, + "loss": 0.3595, + "step": 24404 + }, + { + "epoch": 0.4530990669196467, + "grad_norm": 0.5953771471977234, + "learning_rate": 1.1467953790548678e-05, + "loss": 0.2306, + "step": 24406 + }, + { + "epoch": 0.4531361970570653, + "grad_norm": 0.410057932138443, + "learning_rate": 1.1466799926634284e-05, + "loss": 0.2469, + "step": 24408 + }, + { + "epoch": 0.45317332719448394, + "grad_norm": 0.3346385955810547, + "learning_rate": 1.1465646042761134e-05, + "loss": 0.4124, + "step": 24410 + }, + { + "epoch": 0.45321045733190257, + "grad_norm": 0.5741729140281677, + "learning_rate": 1.146449213894493e-05, + "loss": 0.227, + "step": 24412 + }, + { + "epoch": 0.4532475874693212, + "grad_norm": 0.4406915307044983, + "learning_rate": 1.1463338215201378e-05, + "loss": 0.388, + "step": 24414 + }, + { + "epoch": 0.4532847176067399, + "grad_norm": 0.5334178805351257, + "learning_rate": 1.1462184271546177e-05, + "loss": 0.4765, + "step": 24416 + }, + { + "epoch": 0.4533218477441585, + "grad_norm": 0.4429932236671448, + "learning_rate": 1.1461030307995023e-05, + "loss": 0.1941, + "step": 24418 + }, + { + "epoch": 0.45335897788157714, + "grad_norm": 0.51790452003479, + "learning_rate": 1.1459876324563628e-05, + "loss": 0.2628, + "step": 24420 + }, + { + "epoch": 0.45339610801899577, + "grad_norm": 0.4175902009010315, + "learning_rate": 1.1458722321267685e-05, + "loss": 0.2257, + "step": 24422 + }, + { + "epoch": 0.4534332381564144, + "grad_norm": 0.2585659623146057, + "learning_rate": 1.1457568298122903e-05, + "loss": 0.1457, + "step": 24424 + }, + { + "epoch": 0.4534703682938331, + "grad_norm": 0.31611302495002747, + "learning_rate": 1.1456414255144981e-05, + "loss": 0.2361, + "step": 24426 + }, + { + "epoch": 0.4535074984312517, + "grad_norm": 0.5543726682662964, + "learning_rate": 1.1455260192349625e-05, + "loss": 0.1923, + "step": 24428 + }, + { + "epoch": 0.45354462856867034, + "grad_norm": 1.5112606287002563, + "learning_rate": 1.1454106109752535e-05, + "loss": 0.3767, + "step": 24430 + }, + { + "epoch": 0.45358175870608897, + "grad_norm": 0.3529212474822998, + "learning_rate": 1.1452952007369416e-05, + "loss": 0.428, + "step": 24432 + }, + { + "epoch": 0.4536188888435076, + "grad_norm": 0.2337718904018402, + "learning_rate": 1.1451797885215974e-05, + "loss": 0.2674, + "step": 24434 + }, + { + "epoch": 0.4536560189809262, + "grad_norm": 0.566755473613739, + "learning_rate": 1.1450643743307913e-05, + "loss": 0.1397, + "step": 24436 + }, + { + "epoch": 0.4536931491183449, + "grad_norm": 0.38359037041664124, + "learning_rate": 1.1449489581660931e-05, + "loss": 0.3229, + "step": 24438 + }, + { + "epoch": 0.45373027925576354, + "grad_norm": 0.2971417009830475, + "learning_rate": 1.1448335400290741e-05, + "loss": 0.3266, + "step": 24440 + }, + { + "epoch": 0.45376740939318216, + "grad_norm": 0.47858765721321106, + "learning_rate": 1.1447181199213043e-05, + "loss": 0.1577, + "step": 24442 + }, + { + "epoch": 0.4538045395306008, + "grad_norm": 0.44255489110946655, + "learning_rate": 1.1446026978443542e-05, + "loss": 0.2982, + "step": 24444 + }, + { + "epoch": 0.4538416696680194, + "grad_norm": 0.40738019347190857, + "learning_rate": 1.144487273799795e-05, + "loss": 0.3056, + "step": 24446 + }, + { + "epoch": 0.4538787998054381, + "grad_norm": 0.329432874917984, + "learning_rate": 1.1443718477891968e-05, + "loss": 0.2561, + "step": 24448 + }, + { + "epoch": 0.45391592994285673, + "grad_norm": 0.391357421875, + "learning_rate": 1.1442564198141297e-05, + "loss": 0.1623, + "step": 24450 + }, + { + "epoch": 0.45395306008027536, + "grad_norm": 0.5092080235481262, + "learning_rate": 1.1441409898761652e-05, + "loss": 0.2887, + "step": 24452 + }, + { + "epoch": 0.453990190217694, + "grad_norm": 0.3103174865245819, + "learning_rate": 1.1440255579768733e-05, + "loss": 0.4682, + "step": 24454 + }, + { + "epoch": 0.4540273203551126, + "grad_norm": 0.3614620566368103, + "learning_rate": 1.1439101241178252e-05, + "loss": 0.191, + "step": 24456 + }, + { + "epoch": 0.45406445049253125, + "grad_norm": 1.1077080965042114, + "learning_rate": 1.1437946883005915e-05, + "loss": 0.4011, + "step": 24458 + }, + { + "epoch": 0.45410158062994993, + "grad_norm": 0.4501185715198517, + "learning_rate": 1.1436792505267425e-05, + "loss": 0.348, + "step": 24460 + }, + { + "epoch": 0.45413871076736856, + "grad_norm": 0.33334261178970337, + "learning_rate": 1.1435638107978491e-05, + "loss": 0.1522, + "step": 24462 + }, + { + "epoch": 0.4541758409047872, + "grad_norm": 0.3686613142490387, + "learning_rate": 1.1434483691154828e-05, + "loss": 0.1669, + "step": 24464 + }, + { + "epoch": 0.4542129710422058, + "grad_norm": 0.4061455726623535, + "learning_rate": 1.1433329254812133e-05, + "loss": 0.1999, + "step": 24466 + }, + { + "epoch": 0.45425010117962444, + "grad_norm": 0.2769937515258789, + "learning_rate": 1.1432174798966124e-05, + "loss": 0.1557, + "step": 24468 + }, + { + "epoch": 0.4542872313170431, + "grad_norm": 0.4050350487232208, + "learning_rate": 1.1431020323632505e-05, + "loss": 0.5059, + "step": 24470 + }, + { + "epoch": 0.45432436145446176, + "grad_norm": 0.4567606747150421, + "learning_rate": 1.1429865828826987e-05, + "loss": 0.2729, + "step": 24472 + }, + { + "epoch": 0.4543614915918804, + "grad_norm": 0.48036959767341614, + "learning_rate": 1.1428711314565275e-05, + "loss": 0.4366, + "step": 24474 + }, + { + "epoch": 0.454398621729299, + "grad_norm": 0.2840084433555603, + "learning_rate": 1.1427556780863083e-05, + "loss": 0.457, + "step": 24476 + }, + { + "epoch": 0.45443575186671764, + "grad_norm": 0.2606395184993744, + "learning_rate": 1.1426402227736123e-05, + "loss": 0.4134, + "step": 24478 + }, + { + "epoch": 0.4544728820041363, + "grad_norm": 0.3812691867351532, + "learning_rate": 1.1425247655200096e-05, + "loss": 0.2857, + "step": 24480 + }, + { + "epoch": 0.45451001214155495, + "grad_norm": 0.3884766101837158, + "learning_rate": 1.1424093063270719e-05, + "loss": 0.3528, + "step": 24482 + }, + { + "epoch": 0.4545471422789736, + "grad_norm": 0.21928803622722626, + "learning_rate": 1.1422938451963702e-05, + "loss": 0.1583, + "step": 24484 + }, + { + "epoch": 0.4545842724163922, + "grad_norm": 0.273173451423645, + "learning_rate": 1.1421783821294752e-05, + "loss": 0.429, + "step": 24486 + }, + { + "epoch": 0.45462140255381084, + "grad_norm": 0.358315110206604, + "learning_rate": 1.1420629171279587e-05, + "loss": 0.2361, + "step": 24488 + }, + { + "epoch": 0.45465853269122947, + "grad_norm": 0.3807966113090515, + "learning_rate": 1.1419474501933912e-05, + "loss": 0.3653, + "step": 24490 + }, + { + "epoch": 0.45469566282864815, + "grad_norm": 0.35825005173683167, + "learning_rate": 1.141831981327344e-05, + "loss": 0.4645, + "step": 24492 + }, + { + "epoch": 0.4547327929660668, + "grad_norm": 0.6739867925643921, + "learning_rate": 1.1417165105313884e-05, + "loss": 0.1992, + "step": 24494 + }, + { + "epoch": 0.4547699231034854, + "grad_norm": 0.29355382919311523, + "learning_rate": 1.1416010378070958e-05, + "loss": 0.2054, + "step": 24496 + }, + { + "epoch": 0.45480705324090404, + "grad_norm": 0.529712438583374, + "learning_rate": 1.141485563156037e-05, + "loss": 0.3194, + "step": 24498 + }, + { + "epoch": 0.45484418337832266, + "grad_norm": 0.454432874917984, + "learning_rate": 1.1413700865797836e-05, + "loss": 0.1373, + "step": 24500 + }, + { + "epoch": 0.45488131351574135, + "grad_norm": 0.3998640775680542, + "learning_rate": 1.141254608079907e-05, + "loss": 0.2565, + "step": 24502 + }, + { + "epoch": 0.45491844365316, + "grad_norm": 0.32677263021469116, + "learning_rate": 1.141139127657978e-05, + "loss": 0.1635, + "step": 24504 + }, + { + "epoch": 0.4549555737905786, + "grad_norm": 0.35489946603775024, + "learning_rate": 1.1410236453155681e-05, + "loss": 0.2178, + "step": 24506 + }, + { + "epoch": 0.45499270392799723, + "grad_norm": 0.44876015186309814, + "learning_rate": 1.1409081610542489e-05, + "loss": 0.4058, + "step": 24508 + }, + { + "epoch": 0.45502983406541586, + "grad_norm": 0.32283416390419006, + "learning_rate": 1.140792674875592e-05, + "loss": 0.527, + "step": 24510 + }, + { + "epoch": 0.4550669642028345, + "grad_norm": 0.31632810831069946, + "learning_rate": 1.1406771867811682e-05, + "loss": 0.3693, + "step": 24512 + }, + { + "epoch": 0.4551040943402532, + "grad_norm": 0.32010865211486816, + "learning_rate": 1.1405616967725493e-05, + "loss": 0.3847, + "step": 24514 + }, + { + "epoch": 0.4551412244776718, + "grad_norm": 0.28043118119239807, + "learning_rate": 1.1404462048513067e-05, + "loss": 0.2551, + "step": 24516 + }, + { + "epoch": 0.45517835461509043, + "grad_norm": 0.3210687041282654, + "learning_rate": 1.140330711019012e-05, + "loss": 0.3279, + "step": 24518 + }, + { + "epoch": 0.45521548475250906, + "grad_norm": 0.5838424563407898, + "learning_rate": 1.1402152152772368e-05, + "loss": 0.3853, + "step": 24520 + }, + { + "epoch": 0.4552526148899277, + "grad_norm": 0.37957724928855896, + "learning_rate": 1.1400997176275525e-05, + "loss": 0.4725, + "step": 24522 + }, + { + "epoch": 0.45528974502734637, + "grad_norm": 0.6297940611839294, + "learning_rate": 1.1399842180715306e-05, + "loss": 0.2272, + "step": 24524 + }, + { + "epoch": 0.455326875164765, + "grad_norm": 0.5752727389335632, + "learning_rate": 1.1398687166107424e-05, + "loss": 0.3823, + "step": 24526 + }, + { + "epoch": 0.4553640053021836, + "grad_norm": 0.25575560331344604, + "learning_rate": 1.1397532132467606e-05, + "loss": 0.2138, + "step": 24528 + }, + { + "epoch": 0.45540113543960226, + "grad_norm": 0.4276936948299408, + "learning_rate": 1.1396377079811558e-05, + "loss": 0.3427, + "step": 24530 + }, + { + "epoch": 0.4554382655770209, + "grad_norm": 0.3437599241733551, + "learning_rate": 1.1395222008155002e-05, + "loss": 0.2192, + "step": 24532 + }, + { + "epoch": 0.4554753957144395, + "grad_norm": 0.28980952501296997, + "learning_rate": 1.1394066917513656e-05, + "loss": 0.3994, + "step": 24534 + }, + { + "epoch": 0.4555125258518582, + "grad_norm": 0.5903273224830627, + "learning_rate": 1.139291180790323e-05, + "loss": 0.2171, + "step": 24536 + }, + { + "epoch": 0.4555496559892768, + "grad_norm": 0.25357452034950256, + "learning_rate": 1.1391756679339448e-05, + "loss": 0.2338, + "step": 24538 + }, + { + "epoch": 0.45558678612669545, + "grad_norm": 0.41084229946136475, + "learning_rate": 1.1390601531838029e-05, + "loss": 0.1131, + "step": 24540 + }, + { + "epoch": 0.4556239162641141, + "grad_norm": 0.3623978793621063, + "learning_rate": 1.1389446365414685e-05, + "loss": 0.2403, + "step": 24542 + }, + { + "epoch": 0.4556610464015327, + "grad_norm": 0.36783409118652344, + "learning_rate": 1.1388291180085138e-05, + "loss": 0.3034, + "step": 24544 + }, + { + "epoch": 0.4556981765389514, + "grad_norm": 0.3583255708217621, + "learning_rate": 1.1387135975865109e-05, + "loss": 0.2518, + "step": 24546 + }, + { + "epoch": 0.45573530667637, + "grad_norm": 0.41535913944244385, + "learning_rate": 1.1385980752770314e-05, + "loss": 0.3909, + "step": 24548 + }, + { + "epoch": 0.45577243681378865, + "grad_norm": 0.38567739725112915, + "learning_rate": 1.1384825510816468e-05, + "loss": 0.3593, + "step": 24550 + }, + { + "epoch": 0.4558095669512073, + "grad_norm": 0.3696776330471039, + "learning_rate": 1.1383670250019302e-05, + "loss": 0.3823, + "step": 24552 + }, + { + "epoch": 0.4558466970886259, + "grad_norm": 0.2262435406446457, + "learning_rate": 1.1382514970394524e-05, + "loss": 0.3393, + "step": 24554 + }, + { + "epoch": 0.4558838272260446, + "grad_norm": 0.23637531697750092, + "learning_rate": 1.138135967195786e-05, + "loss": 0.2319, + "step": 24556 + }, + { + "epoch": 0.4559209573634632, + "grad_norm": 0.43223726749420166, + "learning_rate": 1.1380204354725024e-05, + "loss": 0.3389, + "step": 24558 + }, + { + "epoch": 0.45595808750088185, + "grad_norm": 0.3760911226272583, + "learning_rate": 1.1379049018711747e-05, + "loss": 0.3885, + "step": 24560 + }, + { + "epoch": 0.4559952176383005, + "grad_norm": 0.5189329981803894, + "learning_rate": 1.1377893663933739e-05, + "loss": 0.3595, + "step": 24562 + }, + { + "epoch": 0.4560323477757191, + "grad_norm": 0.4941771328449249, + "learning_rate": 1.137673829040673e-05, + "loss": 0.2323, + "step": 24564 + }, + { + "epoch": 0.45606947791313773, + "grad_norm": 0.3823826014995575, + "learning_rate": 1.1375582898146436e-05, + "loss": 0.404, + "step": 24566 + }, + { + "epoch": 0.4561066080505564, + "grad_norm": 0.42866823077201843, + "learning_rate": 1.1374427487168575e-05, + "loss": 0.4135, + "step": 24568 + }, + { + "epoch": 0.45614373818797505, + "grad_norm": 0.4048547148704529, + "learning_rate": 1.1373272057488875e-05, + "loss": 0.2561, + "step": 24570 + }, + { + "epoch": 0.4561808683253937, + "grad_norm": 0.3053908348083496, + "learning_rate": 1.1372116609123056e-05, + "loss": 0.3646, + "step": 24572 + }, + { + "epoch": 0.4562179984628123, + "grad_norm": 0.36771899461746216, + "learning_rate": 1.1370961142086843e-05, + "loss": 0.3279, + "step": 24574 + }, + { + "epoch": 0.45625512860023093, + "grad_norm": 0.47602155804634094, + "learning_rate": 1.1369805656395949e-05, + "loss": 0.3204, + "step": 24576 + }, + { + "epoch": 0.4562922587376496, + "grad_norm": 0.33591604232788086, + "learning_rate": 1.1368650152066109e-05, + "loss": 0.3125, + "step": 24578 + }, + { + "epoch": 0.45632938887506824, + "grad_norm": 0.37706390023231506, + "learning_rate": 1.1367494629113036e-05, + "loss": 0.2787, + "step": 24580 + }, + { + "epoch": 0.45636651901248687, + "grad_norm": 0.44107377529144287, + "learning_rate": 1.1366339087552458e-05, + "loss": 0.3419, + "step": 24582 + }, + { + "epoch": 0.4564036491499055, + "grad_norm": 0.3912048935890198, + "learning_rate": 1.1365183527400099e-05, + "loss": 0.2214, + "step": 24584 + }, + { + "epoch": 0.4564407792873241, + "grad_norm": 0.4682738482952118, + "learning_rate": 1.136402794867168e-05, + "loss": 0.4859, + "step": 24586 + }, + { + "epoch": 0.45647790942474276, + "grad_norm": 0.3997812569141388, + "learning_rate": 1.1362872351382927e-05, + "loss": 0.214, + "step": 24588 + }, + { + "epoch": 0.45651503956216144, + "grad_norm": 0.4531441628932953, + "learning_rate": 1.1361716735549565e-05, + "loss": 0.1835, + "step": 24590 + }, + { + "epoch": 0.45655216969958007, + "grad_norm": 0.38408786058425903, + "learning_rate": 1.1360561101187316e-05, + "loss": 0.2306, + "step": 24592 + }, + { + "epoch": 0.4565892998369987, + "grad_norm": 0.3898982107639313, + "learning_rate": 1.1359405448311905e-05, + "loss": 0.3398, + "step": 24594 + }, + { + "epoch": 0.4566264299744173, + "grad_norm": 0.3453042507171631, + "learning_rate": 1.1358249776939056e-05, + "loss": 0.2156, + "step": 24596 + }, + { + "epoch": 0.45666356011183595, + "grad_norm": 0.28083473443984985, + "learning_rate": 1.13570940870845e-05, + "loss": 0.3232, + "step": 24598 + }, + { + "epoch": 0.45670069024925464, + "grad_norm": 0.3714282512664795, + "learning_rate": 1.1355938378763955e-05, + "loss": 0.0918, + "step": 24600 + }, + { + "epoch": 0.45673782038667327, + "grad_norm": 0.41268599033355713, + "learning_rate": 1.135478265199315e-05, + "loss": 0.169, + "step": 24602 + }, + { + "epoch": 0.4567749505240919, + "grad_norm": 0.4764304757118225, + "learning_rate": 1.1353626906787815e-05, + "loss": 0.1995, + "step": 24604 + }, + { + "epoch": 0.4568120806615105, + "grad_norm": 0.6143215894699097, + "learning_rate": 1.1352471143163673e-05, + "loss": 0.2432, + "step": 24606 + }, + { + "epoch": 0.45684921079892915, + "grad_norm": 0.4299054443836212, + "learning_rate": 1.1351315361136443e-05, + "loss": 0.1698, + "step": 24608 + }, + { + "epoch": 0.4568863409363478, + "grad_norm": 0.4971691370010376, + "learning_rate": 1.1350159560721865e-05, + "loss": 0.4941, + "step": 24610 + }, + { + "epoch": 0.45692347107376646, + "grad_norm": 0.4644004702568054, + "learning_rate": 1.1349003741935656e-05, + "loss": 0.2069, + "step": 24612 + }, + { + "epoch": 0.4569606012111851, + "grad_norm": 0.3718923032283783, + "learning_rate": 1.1347847904793547e-05, + "loss": 0.509, + "step": 24614 + }, + { + "epoch": 0.4569977313486037, + "grad_norm": 0.38594526052474976, + "learning_rate": 1.1346692049311267e-05, + "loss": 0.3381, + "step": 24616 + }, + { + "epoch": 0.45703486148602235, + "grad_norm": 0.46163222193717957, + "learning_rate": 1.1345536175504544e-05, + "loss": 0.256, + "step": 24618 + }, + { + "epoch": 0.457071991623441, + "grad_norm": 0.4183988571166992, + "learning_rate": 1.13443802833891e-05, + "loss": 0.2962, + "step": 24620 + }, + { + "epoch": 0.45710912176085966, + "grad_norm": 0.27365225553512573, + "learning_rate": 1.1343224372980668e-05, + "loss": 0.2902, + "step": 24622 + }, + { + "epoch": 0.4571462518982783, + "grad_norm": 0.4900434613227844, + "learning_rate": 1.1342068444294976e-05, + "loss": 0.3922, + "step": 24624 + }, + { + "epoch": 0.4571833820356969, + "grad_norm": 0.3156397342681885, + "learning_rate": 1.1340912497347753e-05, + "loss": 0.2715, + "step": 24626 + }, + { + "epoch": 0.45722051217311555, + "grad_norm": 0.44852909445762634, + "learning_rate": 1.1339756532154728e-05, + "loss": 0.2205, + "step": 24628 + }, + { + "epoch": 0.4572576423105342, + "grad_norm": 0.3294508159160614, + "learning_rate": 1.1338600548731628e-05, + "loss": 0.2872, + "step": 24630 + }, + { + "epoch": 0.45729477244795286, + "grad_norm": 0.5000939965248108, + "learning_rate": 1.1337444547094186e-05, + "loss": 0.2379, + "step": 24632 + }, + { + "epoch": 0.4573319025853715, + "grad_norm": 0.5787004232406616, + "learning_rate": 1.1336288527258125e-05, + "loss": 0.3117, + "step": 24634 + }, + { + "epoch": 0.4573690327227901, + "grad_norm": 0.5235352516174316, + "learning_rate": 1.1335132489239187e-05, + "loss": 0.3567, + "step": 24636 + }, + { + "epoch": 0.45740616286020874, + "grad_norm": 0.3239721953868866, + "learning_rate": 1.1333976433053092e-05, + "loss": 0.2903, + "step": 24638 + }, + { + "epoch": 0.45744329299762737, + "grad_norm": 0.5485575795173645, + "learning_rate": 1.1332820358715572e-05, + "loss": 0.4664, + "step": 24640 + }, + { + "epoch": 0.457480423135046, + "grad_norm": 0.26542019844055176, + "learning_rate": 1.133166426624236e-05, + "loss": 0.5019, + "step": 24642 + }, + { + "epoch": 0.4575175532724647, + "grad_norm": 0.3433041274547577, + "learning_rate": 1.1330508155649187e-05, + "loss": 0.1303, + "step": 24644 + }, + { + "epoch": 0.4575546834098833, + "grad_norm": 0.5400921702384949, + "learning_rate": 1.1329352026951781e-05, + "loss": 0.3413, + "step": 24646 + }, + { + "epoch": 0.45759181354730194, + "grad_norm": 0.3454822897911072, + "learning_rate": 1.132819588016588e-05, + "loss": 0.4525, + "step": 24648 + }, + { + "epoch": 0.45762894368472057, + "grad_norm": 0.29039466381073, + "learning_rate": 1.1327039715307208e-05, + "loss": 0.3117, + "step": 24650 + }, + { + "epoch": 0.4576660738221392, + "grad_norm": 0.46030300855636597, + "learning_rate": 1.13258835323915e-05, + "loss": 0.3963, + "step": 24652 + }, + { + "epoch": 0.4577032039595579, + "grad_norm": 0.35532301664352417, + "learning_rate": 1.1324727331434491e-05, + "loss": 0.2821, + "step": 24654 + }, + { + "epoch": 0.4577403340969765, + "grad_norm": 0.30830031633377075, + "learning_rate": 1.132357111245191e-05, + "loss": 0.3755, + "step": 24656 + }, + { + "epoch": 0.45777746423439514, + "grad_norm": 0.3656231462955475, + "learning_rate": 1.1322414875459492e-05, + "loss": 0.2861, + "step": 24658 + }, + { + "epoch": 0.45781459437181377, + "grad_norm": 0.39429420232772827, + "learning_rate": 1.1321258620472969e-05, + "loss": 0.4696, + "step": 24660 + }, + { + "epoch": 0.4578517245092324, + "grad_norm": 0.577022910118103, + "learning_rate": 1.1320102347508068e-05, + "loss": 0.3116, + "step": 24662 + }, + { + "epoch": 0.457888854646651, + "grad_norm": 0.6878483891487122, + "learning_rate": 1.1318946056580534e-05, + "loss": 0.5136, + "step": 24664 + }, + { + "epoch": 0.4579259847840697, + "grad_norm": 0.32243695855140686, + "learning_rate": 1.1317789747706093e-05, + "loss": 0.3957, + "step": 24666 + }, + { + "epoch": 0.45796311492148833, + "grad_norm": 0.29694223403930664, + "learning_rate": 1.131663342090048e-05, + "loss": 0.4932, + "step": 24668 + }, + { + "epoch": 0.45800024505890696, + "grad_norm": 0.39853808283805847, + "learning_rate": 1.131547707617943e-05, + "loss": 0.3442, + "step": 24670 + }, + { + "epoch": 0.4580373751963256, + "grad_norm": 0.3936103582382202, + "learning_rate": 1.1314320713558682e-05, + "loss": 0.1655, + "step": 24672 + }, + { + "epoch": 0.4580745053337442, + "grad_norm": 0.382458359003067, + "learning_rate": 1.1313164333053962e-05, + "loss": 0.45, + "step": 24674 + }, + { + "epoch": 0.4581116354711629, + "grad_norm": 0.35554081201553345, + "learning_rate": 1.1312007934681006e-05, + "loss": 0.3633, + "step": 24676 + }, + { + "epoch": 0.45814876560858153, + "grad_norm": 0.3385803997516632, + "learning_rate": 1.1310851518455555e-05, + "loss": 0.4222, + "step": 24678 + }, + { + "epoch": 0.45818589574600016, + "grad_norm": 0.3930008113384247, + "learning_rate": 1.1309695084393342e-05, + "loss": 0.2888, + "step": 24680 + }, + { + "epoch": 0.4582230258834188, + "grad_norm": 0.42606255412101746, + "learning_rate": 1.1308538632510099e-05, + "loss": 0.3203, + "step": 24682 + }, + { + "epoch": 0.4582601560208374, + "grad_norm": 0.4616042375564575, + "learning_rate": 1.1307382162821568e-05, + "loss": 0.2457, + "step": 24684 + }, + { + "epoch": 0.45829728615825605, + "grad_norm": 0.3962433636188507, + "learning_rate": 1.130622567534348e-05, + "loss": 0.3392, + "step": 24686 + }, + { + "epoch": 0.45833441629567473, + "grad_norm": 0.32231810688972473, + "learning_rate": 1.1305069170091575e-05, + "loss": 0.203, + "step": 24688 + }, + { + "epoch": 0.45837154643309336, + "grad_norm": 0.6415740847587585, + "learning_rate": 1.1303912647081585e-05, + "loss": 0.1647, + "step": 24690 + }, + { + "epoch": 0.458408676570512, + "grad_norm": 0.38293391466140747, + "learning_rate": 1.1302756106329253e-05, + "loss": 0.099, + "step": 24692 + }, + { + "epoch": 0.4584458067079306, + "grad_norm": 0.3291141390800476, + "learning_rate": 1.130159954785031e-05, + "loss": 0.3832, + "step": 24694 + }, + { + "epoch": 0.45848293684534924, + "grad_norm": 0.3407718539237976, + "learning_rate": 1.1300442971660494e-05, + "loss": 0.3806, + "step": 24696 + }, + { + "epoch": 0.4585200669827679, + "grad_norm": 0.41238492727279663, + "learning_rate": 1.129928637777555e-05, + "loss": 0.3569, + "step": 24698 + }, + { + "epoch": 0.45855719712018655, + "grad_norm": 0.44277670979499817, + "learning_rate": 1.1298129766211205e-05, + "loss": 0.2663, + "step": 24700 + }, + { + "epoch": 0.4585943272576052, + "grad_norm": 0.54229736328125, + "learning_rate": 1.1296973136983205e-05, + "loss": 0.5922, + "step": 24702 + }, + { + "epoch": 0.4586314573950238, + "grad_norm": 0.31878790259361267, + "learning_rate": 1.1295816490107287e-05, + "loss": 0.4151, + "step": 24704 + }, + { + "epoch": 0.45866858753244244, + "grad_norm": 0.40891537070274353, + "learning_rate": 1.1294659825599185e-05, + "loss": 0.2835, + "step": 24706 + }, + { + "epoch": 0.4587057176698611, + "grad_norm": 0.4385741949081421, + "learning_rate": 1.129350314347464e-05, + "loss": 0.253, + "step": 24708 + }, + { + "epoch": 0.45874284780727975, + "grad_norm": 0.3405691683292389, + "learning_rate": 1.1292346443749395e-05, + "loss": 0.2806, + "step": 24710 + }, + { + "epoch": 0.4587799779446984, + "grad_norm": 0.32953062653541565, + "learning_rate": 1.1291189726439184e-05, + "loss": 0.2548, + "step": 24712 + }, + { + "epoch": 0.458817108082117, + "grad_norm": 0.3323723077774048, + "learning_rate": 1.1290032991559748e-05, + "loss": 0.4626, + "step": 24714 + }, + { + "epoch": 0.45885423821953564, + "grad_norm": 0.37523239850997925, + "learning_rate": 1.1288876239126827e-05, + "loss": 0.2649, + "step": 24716 + }, + { + "epoch": 0.45889136835695427, + "grad_norm": 0.6160756349563599, + "learning_rate": 1.1287719469156161e-05, + "loss": 0.2367, + "step": 24718 + }, + { + "epoch": 0.45892849849437295, + "grad_norm": 0.2867957055568695, + "learning_rate": 1.128656268166349e-05, + "loss": 0.1872, + "step": 24720 + }, + { + "epoch": 0.4589656286317916, + "grad_norm": 0.3562508523464203, + "learning_rate": 1.1285405876664558e-05, + "loss": 0.1574, + "step": 24722 + }, + { + "epoch": 0.4590027587692102, + "grad_norm": 0.241835817694664, + "learning_rate": 1.1284249054175101e-05, + "loss": 0.1733, + "step": 24724 + }, + { + "epoch": 0.45903988890662883, + "grad_norm": 0.47047892212867737, + "learning_rate": 1.1283092214210858e-05, + "loss": 0.2462, + "step": 24726 + }, + { + "epoch": 0.45907701904404746, + "grad_norm": 0.4878137707710266, + "learning_rate": 1.1281935356787574e-05, + "loss": 0.5055, + "step": 24728 + }, + { + "epoch": 0.45911414918146615, + "grad_norm": 0.5731932520866394, + "learning_rate": 1.1280778481920993e-05, + "loss": 0.2046, + "step": 24730 + }, + { + "epoch": 0.4591512793188848, + "grad_norm": 0.8377817273139954, + "learning_rate": 1.1279621589626852e-05, + "loss": 0.2873, + "step": 24732 + }, + { + "epoch": 0.4591884094563034, + "grad_norm": 0.2459387481212616, + "learning_rate": 1.127846467992089e-05, + "loss": 0.2742, + "step": 24734 + }, + { + "epoch": 0.45922553959372203, + "grad_norm": 0.3931157886981964, + "learning_rate": 1.127730775281886e-05, + "loss": 0.4358, + "step": 24736 + }, + { + "epoch": 0.45926266973114066, + "grad_norm": 0.46490150690078735, + "learning_rate": 1.1276150808336493e-05, + "loss": 0.2828, + "step": 24738 + }, + { + "epoch": 0.4592997998685593, + "grad_norm": 0.36766037344932556, + "learning_rate": 1.1274993846489535e-05, + "loss": 0.1467, + "step": 24740 + }, + { + "epoch": 0.459336930005978, + "grad_norm": 0.4598161578178406, + "learning_rate": 1.1273836867293732e-05, + "loss": 0.1934, + "step": 24742 + }, + { + "epoch": 0.4593740601433966, + "grad_norm": 0.42807143926620483, + "learning_rate": 1.1272679870764827e-05, + "loss": 0.3257, + "step": 24744 + }, + { + "epoch": 0.45941119028081523, + "grad_norm": 0.22443221509456635, + "learning_rate": 1.1271522856918556e-05, + "loss": 0.3506, + "step": 24746 + }, + { + "epoch": 0.45944832041823386, + "grad_norm": 0.32144659757614136, + "learning_rate": 1.1270365825770671e-05, + "loss": 0.2601, + "step": 24748 + }, + { + "epoch": 0.4594854505556525, + "grad_norm": 0.4392704367637634, + "learning_rate": 1.126920877733691e-05, + "loss": 0.3422, + "step": 24750 + }, + { + "epoch": 0.45952258069307117, + "grad_norm": 0.41745316982269287, + "learning_rate": 1.1268051711633019e-05, + "loss": 0.3134, + "step": 24752 + }, + { + "epoch": 0.4595597108304898, + "grad_norm": 5.275430679321289, + "learning_rate": 1.1266894628674746e-05, + "loss": 0.2722, + "step": 24754 + }, + { + "epoch": 0.4595968409679084, + "grad_norm": 0.32729780673980713, + "learning_rate": 1.1265737528477831e-05, + "loss": 0.3244, + "step": 24756 + }, + { + "epoch": 0.45963397110532705, + "grad_norm": 0.5418132543563843, + "learning_rate": 1.1264580411058017e-05, + "loss": 0.258, + "step": 24758 + }, + { + "epoch": 0.4596711012427457, + "grad_norm": 0.35150980949401855, + "learning_rate": 1.1263423276431051e-05, + "loss": 0.1977, + "step": 24760 + }, + { + "epoch": 0.4597082313801643, + "grad_norm": 0.3756154477596283, + "learning_rate": 1.1262266124612684e-05, + "loss": 0.3478, + "step": 24762 + }, + { + "epoch": 0.459745361517583, + "grad_norm": 0.32415974140167236, + "learning_rate": 1.1261108955618654e-05, + "loss": 0.2035, + "step": 24764 + }, + { + "epoch": 0.4597824916550016, + "grad_norm": 0.4630662798881531, + "learning_rate": 1.1259951769464706e-05, + "loss": 0.2426, + "step": 24766 + }, + { + "epoch": 0.45981962179242025, + "grad_norm": 0.2896573543548584, + "learning_rate": 1.1258794566166588e-05, + "loss": 0.2515, + "step": 24768 + }, + { + "epoch": 0.4598567519298389, + "grad_norm": 0.4056151211261749, + "learning_rate": 1.1257637345740048e-05, + "loss": 0.1871, + "step": 24770 + }, + { + "epoch": 0.4598938820672575, + "grad_norm": 0.20932339131832123, + "learning_rate": 1.1256480108200829e-05, + "loss": 0.067, + "step": 24772 + }, + { + "epoch": 0.4599310122046762, + "grad_norm": 0.379909873008728, + "learning_rate": 1.1255322853564683e-05, + "loss": 0.2877, + "step": 24774 + }, + { + "epoch": 0.4599681423420948, + "grad_norm": 0.31244438886642456, + "learning_rate": 1.1254165581847352e-05, + "loss": 0.1886, + "step": 24776 + }, + { + "epoch": 0.46000527247951345, + "grad_norm": 0.4667509198188782, + "learning_rate": 1.1253008293064583e-05, + "loss": 0.4993, + "step": 24778 + }, + { + "epoch": 0.4600424026169321, + "grad_norm": 0.36874887347221375, + "learning_rate": 1.1251850987232124e-05, + "loss": 0.3163, + "step": 24780 + }, + { + "epoch": 0.4600795327543507, + "grad_norm": 0.5158633589744568, + "learning_rate": 1.1250693664365725e-05, + "loss": 0.3736, + "step": 24782 + }, + { + "epoch": 0.4601166628917694, + "grad_norm": 0.4242803156375885, + "learning_rate": 1.1249536324481127e-05, + "loss": 0.4671, + "step": 24784 + }, + { + "epoch": 0.460153793029188, + "grad_norm": 0.49340811371803284, + "learning_rate": 1.124837896759409e-05, + "loss": 0.3054, + "step": 24786 + }, + { + "epoch": 0.46019092316660665, + "grad_norm": 0.2399706095457077, + "learning_rate": 1.1247221593720349e-05, + "loss": 0.3092, + "step": 24788 + }, + { + "epoch": 0.4602280533040253, + "grad_norm": 0.37511441111564636, + "learning_rate": 1.1246064202875657e-05, + "loss": 0.3362, + "step": 24790 + }, + { + "epoch": 0.4602651834414439, + "grad_norm": 0.33872243762016296, + "learning_rate": 1.1244906795075764e-05, + "loss": 0.3581, + "step": 24792 + }, + { + "epoch": 0.46030231357886253, + "grad_norm": 0.4021400809288025, + "learning_rate": 1.1243749370336421e-05, + "loss": 0.3554, + "step": 24794 + }, + { + "epoch": 0.4603394437162812, + "grad_norm": 0.25100380182266235, + "learning_rate": 1.1242591928673374e-05, + "loss": 0.1628, + "step": 24796 + }, + { + "epoch": 0.46037657385369984, + "grad_norm": 0.44247281551361084, + "learning_rate": 1.1241434470102373e-05, + "loss": 0.2754, + "step": 24798 + }, + { + "epoch": 0.4604137039911185, + "grad_norm": 0.3028048872947693, + "learning_rate": 1.1240276994639166e-05, + "loss": 0.2695, + "step": 24800 + }, + { + "epoch": 0.4604508341285371, + "grad_norm": 0.3111465275287628, + "learning_rate": 1.1239119502299505e-05, + "loss": 0.4772, + "step": 24802 + }, + { + "epoch": 0.46048796426595573, + "grad_norm": 0.3493809700012207, + "learning_rate": 1.1237961993099135e-05, + "loss": 0.4162, + "step": 24804 + }, + { + "epoch": 0.4605250944033744, + "grad_norm": 0.3528115451335907, + "learning_rate": 1.1236804467053816e-05, + "loss": 0.4133, + "step": 24806 + }, + { + "epoch": 0.46056222454079304, + "grad_norm": 0.40689849853515625, + "learning_rate": 1.1235646924179292e-05, + "loss": 0.2373, + "step": 24808 + }, + { + "epoch": 0.46059935467821167, + "grad_norm": 0.4057770073413849, + "learning_rate": 1.1234489364491315e-05, + "loss": 0.1969, + "step": 24810 + }, + { + "epoch": 0.4606364848156303, + "grad_norm": 0.42430856823921204, + "learning_rate": 1.1233331788005635e-05, + "loss": 0.1991, + "step": 24812 + }, + { + "epoch": 0.4606736149530489, + "grad_norm": 0.34488871693611145, + "learning_rate": 1.1232174194738002e-05, + "loss": 0.2081, + "step": 24814 + }, + { + "epoch": 0.46071074509046755, + "grad_norm": 0.6896771788597107, + "learning_rate": 1.123101658470417e-05, + "loss": 0.3053, + "step": 24816 + }, + { + "epoch": 0.46074787522788624, + "grad_norm": 0.33918485045433044, + "learning_rate": 1.122985895791989e-05, + "loss": 0.3627, + "step": 24818 + }, + { + "epoch": 0.46078500536530487, + "grad_norm": 0.46250197291374207, + "learning_rate": 1.1228701314400911e-05, + "loss": 0.3811, + "step": 24820 + }, + { + "epoch": 0.4608221355027235, + "grad_norm": 0.3258797824382782, + "learning_rate": 1.122754365416299e-05, + "loss": 0.2223, + "step": 24822 + }, + { + "epoch": 0.4608592656401421, + "grad_norm": 0.3176458477973938, + "learning_rate": 1.1226385977221876e-05, + "loss": 0.2942, + "step": 24824 + }, + { + "epoch": 0.46089639577756075, + "grad_norm": 0.4141657054424286, + "learning_rate": 1.1225228283593323e-05, + "loss": 0.243, + "step": 24826 + }, + { + "epoch": 0.46093352591497944, + "grad_norm": 0.33861052989959717, + "learning_rate": 1.1224070573293082e-05, + "loss": 0.2692, + "step": 24828 + }, + { + "epoch": 0.46097065605239806, + "grad_norm": 0.2734968066215515, + "learning_rate": 1.1222912846336912e-05, + "loss": 0.3567, + "step": 24830 + }, + { + "epoch": 0.4610077861898167, + "grad_norm": 0.3466772437095642, + "learning_rate": 1.1221755102740555e-05, + "loss": 0.2999, + "step": 24832 + }, + { + "epoch": 0.4610449163272353, + "grad_norm": 0.35979652404785156, + "learning_rate": 1.122059734251977e-05, + "loss": 0.2568, + "step": 24834 + }, + { + "epoch": 0.46108204646465395, + "grad_norm": 0.40245288610458374, + "learning_rate": 1.1219439565690315e-05, + "loss": 0.3132, + "step": 24836 + }, + { + "epoch": 0.4611191766020726, + "grad_norm": 0.28886058926582336, + "learning_rate": 1.1218281772267938e-05, + "loss": 0.1643, + "step": 24838 + }, + { + "epoch": 0.46115630673949126, + "grad_norm": 0.35303306579589844, + "learning_rate": 1.1217123962268398e-05, + "loss": 0.258, + "step": 24840 + }, + { + "epoch": 0.4611934368769099, + "grad_norm": 0.5408716201782227, + "learning_rate": 1.121596613570744e-05, + "loss": 0.2441, + "step": 24842 + }, + { + "epoch": 0.4612305670143285, + "grad_norm": 0.443154901266098, + "learning_rate": 1.1214808292600833e-05, + "loss": 0.2506, + "step": 24844 + }, + { + "epoch": 0.46126769715174715, + "grad_norm": 0.5537800788879395, + "learning_rate": 1.121365043296432e-05, + "loss": 0.315, + "step": 24846 + }, + { + "epoch": 0.4613048272891658, + "grad_norm": 0.41054776310920715, + "learning_rate": 1.1212492556813662e-05, + "loss": 0.3117, + "step": 24848 + }, + { + "epoch": 0.46134195742658446, + "grad_norm": 0.3634506165981293, + "learning_rate": 1.121133466416461e-05, + "loss": 0.2409, + "step": 24850 + }, + { + "epoch": 0.4613790875640031, + "grad_norm": 0.3884308636188507, + "learning_rate": 1.1210176755032922e-05, + "loss": 0.1973, + "step": 24852 + }, + { + "epoch": 0.4614162177014217, + "grad_norm": 0.45028001070022583, + "learning_rate": 1.1209018829434352e-05, + "loss": 0.4625, + "step": 24854 + }, + { + "epoch": 0.46145334783884034, + "grad_norm": 0.2986210882663727, + "learning_rate": 1.120786088738466e-05, + "loss": 0.1974, + "step": 24856 + }, + { + "epoch": 0.461490477976259, + "grad_norm": 0.2877485156059265, + "learning_rate": 1.1206702928899598e-05, + "loss": 0.28, + "step": 24858 + }, + { + "epoch": 0.46152760811367766, + "grad_norm": 0.4148741662502289, + "learning_rate": 1.1205544953994925e-05, + "loss": 0.312, + "step": 24860 + }, + { + "epoch": 0.4615647382510963, + "grad_norm": 0.23268738389015198, + "learning_rate": 1.1204386962686396e-05, + "loss": 0.4079, + "step": 24862 + }, + { + "epoch": 0.4616018683885149, + "grad_norm": 0.3963732123374939, + "learning_rate": 1.1203228954989766e-05, + "loss": 0.4038, + "step": 24864 + }, + { + "epoch": 0.46163899852593354, + "grad_norm": 0.7261603474617004, + "learning_rate": 1.1202070930920794e-05, + "loss": 0.3564, + "step": 24866 + }, + { + "epoch": 0.46167612866335217, + "grad_norm": 0.33898189663887024, + "learning_rate": 1.1200912890495239e-05, + "loss": 0.2924, + "step": 24868 + }, + { + "epoch": 0.4617132588007708, + "grad_norm": 0.3380786180496216, + "learning_rate": 1.1199754833728858e-05, + "loss": 0.4501, + "step": 24870 + }, + { + "epoch": 0.4617503889381895, + "grad_norm": 0.4006290137767792, + "learning_rate": 1.1198596760637403e-05, + "loss": 0.4165, + "step": 24872 + }, + { + "epoch": 0.4617875190756081, + "grad_norm": 0.344072550535202, + "learning_rate": 1.119743867123664e-05, + "loss": 0.2894, + "step": 24874 + }, + { + "epoch": 0.46182464921302674, + "grad_norm": 0.2505243718624115, + "learning_rate": 1.1196280565542322e-05, + "loss": 0.1822, + "step": 24876 + }, + { + "epoch": 0.46186177935044537, + "grad_norm": 0.31893032789230347, + "learning_rate": 1.1195122443570205e-05, + "loss": 0.187, + "step": 24878 + }, + { + "epoch": 0.461898909487864, + "grad_norm": 0.23779061436653137, + "learning_rate": 1.1193964305336058e-05, + "loss": 0.2853, + "step": 24880 + }, + { + "epoch": 0.4619360396252827, + "grad_norm": 0.3483765721321106, + "learning_rate": 1.1192806150855631e-05, + "loss": 0.2709, + "step": 24882 + }, + { + "epoch": 0.4619731697627013, + "grad_norm": 0.34843936562538147, + "learning_rate": 1.1191647980144681e-05, + "loss": 0.3114, + "step": 24884 + }, + { + "epoch": 0.46201029990011994, + "grad_norm": 0.46261489391326904, + "learning_rate": 1.1190489793218975e-05, + "loss": 0.3064, + "step": 24886 + }, + { + "epoch": 0.46204743003753856, + "grad_norm": 0.4472907483577728, + "learning_rate": 1.118933159009427e-05, + "loss": 0.226, + "step": 24888 + }, + { + "epoch": 0.4620845601749572, + "grad_norm": 0.3483368754386902, + "learning_rate": 1.1188173370786321e-05, + "loss": 0.3393, + "step": 24890 + }, + { + "epoch": 0.4621216903123758, + "grad_norm": 0.33254578709602356, + "learning_rate": 1.1187015135310897e-05, + "loss": 0.2397, + "step": 24892 + }, + { + "epoch": 0.4621588204497945, + "grad_norm": 0.28217658400535583, + "learning_rate": 1.118585688368375e-05, + "loss": 0.321, + "step": 24894 + }, + { + "epoch": 0.46219595058721313, + "grad_norm": 0.23270997405052185, + "learning_rate": 1.1184698615920639e-05, + "loss": 0.2167, + "step": 24896 + }, + { + "epoch": 0.46223308072463176, + "grad_norm": 0.2630429267883301, + "learning_rate": 1.118354033203733e-05, + "loss": 0.2045, + "step": 24898 + }, + { + "epoch": 0.4622702108620504, + "grad_norm": 0.31943827867507935, + "learning_rate": 1.1182382032049586e-05, + "loss": 0.5334, + "step": 24900 + }, + { + "epoch": 0.462307340999469, + "grad_norm": 0.37029388546943665, + "learning_rate": 1.1181223715973162e-05, + "loss": 0.1842, + "step": 24902 + }, + { + "epoch": 0.4623444711368877, + "grad_norm": 0.2004348635673523, + "learning_rate": 1.1180065383823821e-05, + "loss": 0.1687, + "step": 24904 + }, + { + "epoch": 0.46238160127430633, + "grad_norm": 0.3348277509212494, + "learning_rate": 1.1178907035617325e-05, + "loss": 0.2744, + "step": 24906 + }, + { + "epoch": 0.46241873141172496, + "grad_norm": 0.5614563226699829, + "learning_rate": 1.1177748671369435e-05, + "loss": 0.3372, + "step": 24908 + }, + { + "epoch": 0.4624558615491436, + "grad_norm": 0.38579264283180237, + "learning_rate": 1.1176590291095912e-05, + "loss": 0.3123, + "step": 24910 + }, + { + "epoch": 0.4624929916865622, + "grad_norm": 0.3339270353317261, + "learning_rate": 1.1175431894812524e-05, + "loss": 0.2235, + "step": 24912 + }, + { + "epoch": 0.46253012182398084, + "grad_norm": 0.49250340461730957, + "learning_rate": 1.1174273482535028e-05, + "loss": 0.4845, + "step": 24914 + }, + { + "epoch": 0.46256725196139953, + "grad_norm": 0.4855112135410309, + "learning_rate": 1.1173115054279182e-05, + "loss": 0.2975, + "step": 24916 + }, + { + "epoch": 0.46260438209881816, + "grad_norm": 0.4341851770877838, + "learning_rate": 1.117195661006076e-05, + "loss": 0.2032, + "step": 24918 + }, + { + "epoch": 0.4626415122362368, + "grad_norm": 0.4957031309604645, + "learning_rate": 1.1170798149895514e-05, + "loss": 0.2484, + "step": 24920 + }, + { + "epoch": 0.4626786423736554, + "grad_norm": 0.3832385241985321, + "learning_rate": 1.1169639673799213e-05, + "loss": 0.2838, + "step": 24922 + }, + { + "epoch": 0.46271577251107404, + "grad_norm": 0.4815942347049713, + "learning_rate": 1.1168481181787622e-05, + "loss": 0.2357, + "step": 24924 + }, + { + "epoch": 0.4627529026484927, + "grad_norm": 0.38324955105781555, + "learning_rate": 1.1167322673876501e-05, + "loss": 0.3678, + "step": 24926 + }, + { + "epoch": 0.46279003278591135, + "grad_norm": 0.25386250019073486, + "learning_rate": 1.1166164150081616e-05, + "loss": 0.3157, + "step": 24928 + }, + { + "epoch": 0.46282716292333, + "grad_norm": 0.5275769829750061, + "learning_rate": 1.1165005610418726e-05, + "loss": 0.1944, + "step": 24930 + }, + { + "epoch": 0.4628642930607486, + "grad_norm": 0.8354781270027161, + "learning_rate": 1.1163847054903605e-05, + "loss": 0.2927, + "step": 24932 + }, + { + "epoch": 0.46290142319816724, + "grad_norm": 0.4561612606048584, + "learning_rate": 1.1162688483552009e-05, + "loss": 0.3681, + "step": 24934 + }, + { + "epoch": 0.46293855333558587, + "grad_norm": 0.6024540662765503, + "learning_rate": 1.1161529896379703e-05, + "loss": 0.2533, + "step": 24936 + }, + { + "epoch": 0.46297568347300455, + "grad_norm": 0.19927014410495758, + "learning_rate": 1.1160371293402458e-05, + "loss": 0.2018, + "step": 24938 + }, + { + "epoch": 0.4630128136104232, + "grad_norm": 0.4563714265823364, + "learning_rate": 1.1159212674636036e-05, + "loss": 0.3889, + "step": 24940 + }, + { + "epoch": 0.4630499437478418, + "grad_norm": 0.3190673589706421, + "learning_rate": 1.1158054040096198e-05, + "loss": 0.2855, + "step": 24942 + }, + { + "epoch": 0.46308707388526044, + "grad_norm": 0.6262931823730469, + "learning_rate": 1.1156895389798714e-05, + "loss": 0.281, + "step": 24944 + }, + { + "epoch": 0.46312420402267906, + "grad_norm": 0.35368961095809937, + "learning_rate": 1.1155736723759352e-05, + "loss": 0.4124, + "step": 24946 + }, + { + "epoch": 0.46316133416009775, + "grad_norm": 0.2934551239013672, + "learning_rate": 1.1154578041993874e-05, + "loss": 0.2523, + "step": 24948 + }, + { + "epoch": 0.4631984642975164, + "grad_norm": 0.3389197289943695, + "learning_rate": 1.1153419344518047e-05, + "loss": 0.4044, + "step": 24950 + }, + { + "epoch": 0.463235594434935, + "grad_norm": 0.4323364198207855, + "learning_rate": 1.1152260631347634e-05, + "loss": 0.1507, + "step": 24952 + }, + { + "epoch": 0.46327272457235363, + "grad_norm": 0.3411848247051239, + "learning_rate": 1.115110190249841e-05, + "loss": 0.2434, + "step": 24954 + }, + { + "epoch": 0.46330985470977226, + "grad_norm": 0.3328692615032196, + "learning_rate": 1.1149943157986136e-05, + "loss": 0.5382, + "step": 24956 + }, + { + "epoch": 0.46334698484719095, + "grad_norm": 0.3350425958633423, + "learning_rate": 1.1148784397826578e-05, + "loss": 0.416, + "step": 24958 + }, + { + "epoch": 0.4633841149846096, + "grad_norm": 0.4458329975605011, + "learning_rate": 1.1147625622035506e-05, + "loss": 0.2581, + "step": 24960 + }, + { + "epoch": 0.4634212451220282, + "grad_norm": 0.3829217851161957, + "learning_rate": 1.1146466830628688e-05, + "loss": 0.1953, + "step": 24962 + }, + { + "epoch": 0.46345837525944683, + "grad_norm": 0.8776864409446716, + "learning_rate": 1.1145308023621887e-05, + "loss": 0.2634, + "step": 24964 + }, + { + "epoch": 0.46349550539686546, + "grad_norm": 0.31340283155441284, + "learning_rate": 1.1144149201030881e-05, + "loss": 0.1179, + "step": 24966 + }, + { + "epoch": 0.4635326355342841, + "grad_norm": 0.3972989618778229, + "learning_rate": 1.1142990362871426e-05, + "loss": 0.4119, + "step": 24968 + }, + { + "epoch": 0.46356976567170277, + "grad_norm": 0.41166236996650696, + "learning_rate": 1.1141831509159298e-05, + "loss": 0.4087, + "step": 24970 + }, + { + "epoch": 0.4636068958091214, + "grad_norm": 0.41264697909355164, + "learning_rate": 1.1140672639910261e-05, + "loss": 0.2968, + "step": 24972 + }, + { + "epoch": 0.46364402594654003, + "grad_norm": 0.3329342305660248, + "learning_rate": 1.1139513755140087e-05, + "loss": 0.2092, + "step": 24974 + }, + { + "epoch": 0.46368115608395866, + "grad_norm": 0.39777353405952454, + "learning_rate": 1.1138354854864549e-05, + "loss": 0.256, + "step": 24976 + }, + { + "epoch": 0.4637182862213773, + "grad_norm": 0.3956587612628937, + "learning_rate": 1.1137195939099408e-05, + "loss": 0.2352, + "step": 24978 + }, + { + "epoch": 0.46375541635879597, + "grad_norm": 0.4261299669742584, + "learning_rate": 1.1136037007860435e-05, + "loss": 0.1663, + "step": 24980 + }, + { + "epoch": 0.4637925464962146, + "grad_norm": 0.2760485112667084, + "learning_rate": 1.1134878061163401e-05, + "loss": 0.3332, + "step": 24982 + }, + { + "epoch": 0.4638296766336332, + "grad_norm": 0.43186715245246887, + "learning_rate": 1.1133719099024076e-05, + "loss": 0.208, + "step": 24984 + }, + { + "epoch": 0.46386680677105185, + "grad_norm": 0.5443124175071716, + "learning_rate": 1.1132560121458234e-05, + "loss": 0.2296, + "step": 24986 + }, + { + "epoch": 0.4639039369084705, + "grad_norm": 0.5913235545158386, + "learning_rate": 1.1131401128481638e-05, + "loss": 0.2801, + "step": 24988 + }, + { + "epoch": 0.4639410670458891, + "grad_norm": 0.35802584886550903, + "learning_rate": 1.113024212011006e-05, + "loss": 0.3241, + "step": 24990 + }, + { + "epoch": 0.4639781971833078, + "grad_norm": 0.25926247239112854, + "learning_rate": 1.1129083096359273e-05, + "loss": 0.2435, + "step": 24992 + }, + { + "epoch": 0.4640153273207264, + "grad_norm": 0.39737045764923096, + "learning_rate": 1.112792405724505e-05, + "loss": 0.1968, + "step": 24994 + }, + { + "epoch": 0.46405245745814505, + "grad_norm": 0.7752198576927185, + "learning_rate": 1.1126765002783155e-05, + "loss": 0.1905, + "step": 24996 + }, + { + "epoch": 0.4640895875955637, + "grad_norm": 0.179983988404274, + "learning_rate": 1.1125605932989367e-05, + "loss": 0.2536, + "step": 24998 + }, + { + "epoch": 0.4641267177329823, + "grad_norm": 0.33478426933288574, + "learning_rate": 1.1124446847879454e-05, + "loss": 0.4343, + "step": 25000 + }, + { + "epoch": 0.464163847870401, + "grad_norm": 0.31066006422042847, + "learning_rate": 1.1123287747469183e-05, + "loss": 0.2546, + "step": 25002 + }, + { + "epoch": 0.4642009780078196, + "grad_norm": 0.5079692602157593, + "learning_rate": 1.1122128631774331e-05, + "loss": 0.3384, + "step": 25004 + }, + { + "epoch": 0.46423810814523825, + "grad_norm": 0.3383162021636963, + "learning_rate": 1.1120969500810672e-05, + "loss": 0.171, + "step": 25006 + }, + { + "epoch": 0.4642752382826569, + "grad_norm": 0.47008898854255676, + "learning_rate": 1.111981035459398e-05, + "loss": 0.3149, + "step": 25008 + }, + { + "epoch": 0.4643123684200755, + "grad_norm": 0.46213915944099426, + "learning_rate": 1.1118651193140016e-05, + "loss": 0.3812, + "step": 25010 + }, + { + "epoch": 0.46434949855749413, + "grad_norm": 0.5277388691902161, + "learning_rate": 1.1117492016464562e-05, + "loss": 0.519, + "step": 25012 + }, + { + "epoch": 0.4643866286949128, + "grad_norm": 0.35149312019348145, + "learning_rate": 1.111633282458339e-05, + "loss": 0.2365, + "step": 25014 + }, + { + "epoch": 0.46442375883233145, + "grad_norm": 0.42730915546417236, + "learning_rate": 1.1115173617512271e-05, + "loss": 0.2942, + "step": 25016 + }, + { + "epoch": 0.4644608889697501, + "grad_norm": 0.23597809672355652, + "learning_rate": 1.111401439526698e-05, + "loss": 0.3043, + "step": 25018 + }, + { + "epoch": 0.4644980191071687, + "grad_norm": 0.483123779296875, + "learning_rate": 1.111285515786329e-05, + "loss": 0.3044, + "step": 25020 + }, + { + "epoch": 0.46453514924458733, + "grad_norm": 0.2866550385951996, + "learning_rate": 1.1111695905316976e-05, + "loss": 0.2033, + "step": 25022 + }, + { + "epoch": 0.464572279382006, + "grad_norm": 0.557029664516449, + "learning_rate": 1.1110536637643807e-05, + "loss": 0.3902, + "step": 25024 + }, + { + "epoch": 0.46460940951942464, + "grad_norm": 0.33052733540534973, + "learning_rate": 1.1109377354859565e-05, + "loss": 0.2201, + "step": 25026 + }, + { + "epoch": 0.46464653965684327, + "grad_norm": 0.42713308334350586, + "learning_rate": 1.1108218056980017e-05, + "loss": 0.3562, + "step": 25028 + }, + { + "epoch": 0.4646836697942619, + "grad_norm": 0.3552428185939789, + "learning_rate": 1.1107058744020945e-05, + "loss": 0.1674, + "step": 25030 + }, + { + "epoch": 0.46472079993168053, + "grad_norm": 0.4714275300502777, + "learning_rate": 1.1105899415998116e-05, + "loss": 0.3118, + "step": 25032 + }, + { + "epoch": 0.4647579300690992, + "grad_norm": 0.39476677775382996, + "learning_rate": 1.1104740072927309e-05, + "loss": 0.2278, + "step": 25034 + }, + { + "epoch": 0.46479506020651784, + "grad_norm": 0.24415062367916107, + "learning_rate": 1.1103580714824298e-05, + "loss": 0.2332, + "step": 25036 + }, + { + "epoch": 0.46483219034393647, + "grad_norm": 0.238313227891922, + "learning_rate": 1.1102421341704861e-05, + "loss": 0.3035, + "step": 25038 + }, + { + "epoch": 0.4648693204813551, + "grad_norm": 0.586731493473053, + "learning_rate": 1.110126195358477e-05, + "loss": 0.2039, + "step": 25040 + }, + { + "epoch": 0.4649064506187737, + "grad_norm": 0.3552999794483185, + "learning_rate": 1.1100102550479803e-05, + "loss": 0.3484, + "step": 25042 + }, + { + "epoch": 0.46494358075619235, + "grad_norm": 0.36459195613861084, + "learning_rate": 1.1098943132405735e-05, + "loss": 0.3841, + "step": 25044 + }, + { + "epoch": 0.46498071089361104, + "grad_norm": 0.40709102153778076, + "learning_rate": 1.1097783699378344e-05, + "loss": 0.3617, + "step": 25046 + }, + { + "epoch": 0.46501784103102967, + "grad_norm": 0.1688622236251831, + "learning_rate": 1.1096624251413401e-05, + "loss": 0.3131, + "step": 25048 + }, + { + "epoch": 0.4650549711684483, + "grad_norm": 0.3475238084793091, + "learning_rate": 1.1095464788526693e-05, + "loss": 0.4155, + "step": 25050 + }, + { + "epoch": 0.4650921013058669, + "grad_norm": 0.4512470066547394, + "learning_rate": 1.1094305310733988e-05, + "loss": 0.2479, + "step": 25052 + }, + { + "epoch": 0.46512923144328555, + "grad_norm": 0.2326812595129013, + "learning_rate": 1.1093145818051063e-05, + "loss": 0.3289, + "step": 25054 + }, + { + "epoch": 0.46516636158070424, + "grad_norm": 0.3711986839771271, + "learning_rate": 1.10919863104937e-05, + "loss": 0.3219, + "step": 25056 + }, + { + "epoch": 0.46520349171812286, + "grad_norm": 0.419181227684021, + "learning_rate": 1.1090826788077675e-05, + "loss": 0.3731, + "step": 25058 + }, + { + "epoch": 0.4652406218555415, + "grad_norm": 2.2664918899536133, + "learning_rate": 1.1089667250818763e-05, + "loss": 0.2771, + "step": 25060 + }, + { + "epoch": 0.4652777519929601, + "grad_norm": 0.29841262102127075, + "learning_rate": 1.1088507698732745e-05, + "loss": 0.3939, + "step": 25062 + }, + { + "epoch": 0.46531488213037875, + "grad_norm": 0.354340523481369, + "learning_rate": 1.1087348131835399e-05, + "loss": 0.1534, + "step": 25064 + }, + { + "epoch": 0.4653520122677974, + "grad_norm": 0.45017555356025696, + "learning_rate": 1.1086188550142498e-05, + "loss": 0.2657, + "step": 25066 + }, + { + "epoch": 0.46538914240521606, + "grad_norm": 0.5132289528846741, + "learning_rate": 1.1085028953669824e-05, + "loss": 0.2018, + "step": 25068 + }, + { + "epoch": 0.4654262725426347, + "grad_norm": 0.30213218927383423, + "learning_rate": 1.108386934243316e-05, + "loss": 0.2668, + "step": 25070 + }, + { + "epoch": 0.4654634026800533, + "grad_norm": 0.5752504467964172, + "learning_rate": 1.1082709716448281e-05, + "loss": 0.1883, + "step": 25072 + }, + { + "epoch": 0.46550053281747195, + "grad_norm": 1.1086883544921875, + "learning_rate": 1.1081550075730962e-05, + "loss": 0.2311, + "step": 25074 + }, + { + "epoch": 0.4655376629548906, + "grad_norm": 0.35609790682792664, + "learning_rate": 1.1080390420296989e-05, + "loss": 0.2149, + "step": 25076 + }, + { + "epoch": 0.46557479309230926, + "grad_norm": 0.3935128152370453, + "learning_rate": 1.1079230750162137e-05, + "loss": 0.1225, + "step": 25078 + }, + { + "epoch": 0.4656119232297279, + "grad_norm": 0.53044593334198, + "learning_rate": 1.1078071065342187e-05, + "loss": 0.2086, + "step": 25080 + }, + { + "epoch": 0.4656490533671465, + "grad_norm": 0.4616038203239441, + "learning_rate": 1.1076911365852921e-05, + "loss": 0.2539, + "step": 25082 + }, + { + "epoch": 0.46568618350456514, + "grad_norm": 0.3729253113269806, + "learning_rate": 1.1075751651710117e-05, + "loss": 0.1893, + "step": 25084 + }, + { + "epoch": 0.46572331364198377, + "grad_norm": 0.28270411491394043, + "learning_rate": 1.1074591922929553e-05, + "loss": 0.408, + "step": 25086 + }, + { + "epoch": 0.4657604437794024, + "grad_norm": 0.6389529705047607, + "learning_rate": 1.1073432179527014e-05, + "loss": 0.2549, + "step": 25088 + }, + { + "epoch": 0.4657975739168211, + "grad_norm": 0.39129218459129333, + "learning_rate": 1.1072272421518275e-05, + "loss": 0.4089, + "step": 25090 + }, + { + "epoch": 0.4658347040542397, + "grad_norm": 0.5073806643486023, + "learning_rate": 1.1071112648919123e-05, + "loss": 0.6049, + "step": 25092 + }, + { + "epoch": 0.46587183419165834, + "grad_norm": 0.5053232908248901, + "learning_rate": 1.1069952861745336e-05, + "loss": 0.2074, + "step": 25094 + }, + { + "epoch": 0.46590896432907697, + "grad_norm": 0.6206637620925903, + "learning_rate": 1.1068793060012695e-05, + "loss": 0.2796, + "step": 25096 + }, + { + "epoch": 0.4659460944664956, + "grad_norm": 0.4092426300048828, + "learning_rate": 1.1067633243736982e-05, + "loss": 0.3888, + "step": 25098 + }, + { + "epoch": 0.4659832246039143, + "grad_norm": 0.3590167462825775, + "learning_rate": 1.1066473412933975e-05, + "loss": 0.2997, + "step": 25100 + }, + { + "epoch": 0.4660203547413329, + "grad_norm": 0.41744059324264526, + "learning_rate": 1.1065313567619464e-05, + "loss": 0.3785, + "step": 25102 + }, + { + "epoch": 0.46605748487875154, + "grad_norm": 0.29979026317596436, + "learning_rate": 1.1064153707809226e-05, + "loss": 0.1919, + "step": 25104 + }, + { + "epoch": 0.46609461501617017, + "grad_norm": 0.3698369562625885, + "learning_rate": 1.1062993833519043e-05, + "loss": 0.6105, + "step": 25106 + }, + { + "epoch": 0.4661317451535888, + "grad_norm": 0.494426965713501, + "learning_rate": 1.10618339447647e-05, + "loss": 0.279, + "step": 25108 + }, + { + "epoch": 0.4661688752910075, + "grad_norm": 0.38214918971061707, + "learning_rate": 1.1060674041561972e-05, + "loss": 0.0795, + "step": 25110 + }, + { + "epoch": 0.4662060054284261, + "grad_norm": 0.3748513460159302, + "learning_rate": 1.105951412392665e-05, + "loss": 0.2659, + "step": 25112 + }, + { + "epoch": 0.46624313556584474, + "grad_norm": 0.21436573565006256, + "learning_rate": 1.1058354191874516e-05, + "loss": 0.3447, + "step": 25114 + }, + { + "epoch": 0.46628026570326336, + "grad_norm": 0.5077074766159058, + "learning_rate": 1.105719424542135e-05, + "loss": 0.2142, + "step": 25116 + }, + { + "epoch": 0.466317395840682, + "grad_norm": 0.2975423038005829, + "learning_rate": 1.1056034284582937e-05, + "loss": 0.2708, + "step": 25118 + }, + { + "epoch": 0.4663545259781006, + "grad_norm": 0.5866014361381531, + "learning_rate": 1.105487430937506e-05, + "loss": 0.2119, + "step": 25120 + }, + { + "epoch": 0.4663916561155193, + "grad_norm": 0.3719688057899475, + "learning_rate": 1.1053714319813504e-05, + "loss": 0.2184, + "step": 25122 + }, + { + "epoch": 0.46642878625293793, + "grad_norm": 0.49235957860946655, + "learning_rate": 1.1052554315914056e-05, + "loss": 0.3743, + "step": 25124 + }, + { + "epoch": 0.46646591639035656, + "grad_norm": 0.4199400842189789, + "learning_rate": 1.1051394297692493e-05, + "loss": 0.3393, + "step": 25126 + }, + { + "epoch": 0.4665030465277752, + "grad_norm": 0.6567108035087585, + "learning_rate": 1.10502342651646e-05, + "loss": 0.2389, + "step": 25128 + }, + { + "epoch": 0.4665401766651938, + "grad_norm": 0.3141202926635742, + "learning_rate": 1.1049074218346167e-05, + "loss": 0.2315, + "step": 25130 + }, + { + "epoch": 0.4665773068026125, + "grad_norm": 0.44798415899276733, + "learning_rate": 1.1047914157252978e-05, + "loss": 0.3434, + "step": 25132 + }, + { + "epoch": 0.46661443694003113, + "grad_norm": 0.5066092014312744, + "learning_rate": 1.1046754081900815e-05, + "loss": 0.2428, + "step": 25134 + }, + { + "epoch": 0.46665156707744976, + "grad_norm": 0.5264660120010376, + "learning_rate": 1.1045593992305466e-05, + "loss": 0.3802, + "step": 25136 + }, + { + "epoch": 0.4666886972148684, + "grad_norm": 0.31128212809562683, + "learning_rate": 1.104443388848271e-05, + "loss": 0.348, + "step": 25138 + }, + { + "epoch": 0.466725827352287, + "grad_norm": 0.3445548713207245, + "learning_rate": 1.1043273770448342e-05, + "loss": 0.4487, + "step": 25140 + }, + { + "epoch": 0.46676295748970564, + "grad_norm": 0.5792197585105896, + "learning_rate": 1.1042113638218141e-05, + "loss": 0.1849, + "step": 25142 + }, + { + "epoch": 0.4668000876271243, + "grad_norm": 0.4217890501022339, + "learning_rate": 1.1040953491807893e-05, + "loss": 0.391, + "step": 25144 + }, + { + "epoch": 0.46683721776454296, + "grad_norm": 0.3623315691947937, + "learning_rate": 1.103979333123339e-05, + "loss": 0.268, + "step": 25146 + }, + { + "epoch": 0.4668743479019616, + "grad_norm": 0.38843193650245667, + "learning_rate": 1.1038633156510413e-05, + "loss": 0.4919, + "step": 25148 + }, + { + "epoch": 0.4669114780393802, + "grad_norm": 0.38810229301452637, + "learning_rate": 1.1037472967654748e-05, + "loss": 0.5589, + "step": 25150 + }, + { + "epoch": 0.46694860817679884, + "grad_norm": 0.5897156000137329, + "learning_rate": 1.1036312764682186e-05, + "loss": 0.4108, + "step": 25152 + }, + { + "epoch": 0.4669857383142175, + "grad_norm": 0.34713464975357056, + "learning_rate": 1.1035152547608507e-05, + "loss": 0.3496, + "step": 25154 + }, + { + "epoch": 0.46702286845163615, + "grad_norm": 0.6029224395751953, + "learning_rate": 1.103399231644951e-05, + "loss": 0.3305, + "step": 25156 + }, + { + "epoch": 0.4670599985890548, + "grad_norm": 0.32493284344673157, + "learning_rate": 1.103283207122097e-05, + "loss": 0.324, + "step": 25158 + }, + { + "epoch": 0.4670971287264734, + "grad_norm": 0.35407131910324097, + "learning_rate": 1.1031671811938679e-05, + "loss": 0.3839, + "step": 25160 + }, + { + "epoch": 0.46713425886389204, + "grad_norm": 0.4451431632041931, + "learning_rate": 1.1030511538618423e-05, + "loss": 0.1529, + "step": 25162 + }, + { + "epoch": 0.46717138900131067, + "grad_norm": 0.38851088285446167, + "learning_rate": 1.1029351251275996e-05, + "loss": 0.4896, + "step": 25164 + }, + { + "epoch": 0.46720851913872935, + "grad_norm": 0.37447088956832886, + "learning_rate": 1.1028190949927177e-05, + "loss": 0.3102, + "step": 25166 + }, + { + "epoch": 0.467245649276148, + "grad_norm": 0.32535800337791443, + "learning_rate": 1.1027030634587763e-05, + "loss": 0.319, + "step": 25168 + }, + { + "epoch": 0.4672827794135666, + "grad_norm": 0.6257903575897217, + "learning_rate": 1.1025870305273539e-05, + "loss": 0.3255, + "step": 25170 + }, + { + "epoch": 0.46731990955098524, + "grad_norm": 0.365803599357605, + "learning_rate": 1.1024709962000288e-05, + "loss": 0.3271, + "step": 25172 + }, + { + "epoch": 0.46735703968840386, + "grad_norm": 0.3670259416103363, + "learning_rate": 1.1023549604783807e-05, + "loss": 0.3423, + "step": 25174 + }, + { + "epoch": 0.46739416982582255, + "grad_norm": 0.568078339099884, + "learning_rate": 1.102238923363988e-05, + "loss": 0.3, + "step": 25176 + }, + { + "epoch": 0.4674312999632412, + "grad_norm": 0.6759040951728821, + "learning_rate": 1.1021228848584302e-05, + "loss": 0.2391, + "step": 25178 + }, + { + "epoch": 0.4674684301006598, + "grad_norm": 0.34443145990371704, + "learning_rate": 1.1020068449632855e-05, + "loss": 0.0866, + "step": 25180 + }, + { + "epoch": 0.46750556023807843, + "grad_norm": 0.34142881631851196, + "learning_rate": 1.1018908036801333e-05, + "loss": 0.2949, + "step": 25182 + }, + { + "epoch": 0.46754269037549706, + "grad_norm": 0.3911501169204712, + "learning_rate": 1.1017747610105524e-05, + "loss": 0.3449, + "step": 25184 + }, + { + "epoch": 0.46757982051291574, + "grad_norm": 0.3580528497695923, + "learning_rate": 1.1016587169561219e-05, + "loss": 0.112, + "step": 25186 + }, + { + "epoch": 0.4676169506503344, + "grad_norm": 0.33637839555740356, + "learning_rate": 1.1015426715184208e-05, + "loss": 0.1929, + "step": 25188 + }, + { + "epoch": 0.467654080787753, + "grad_norm": 0.4197739064693451, + "learning_rate": 1.1014266246990282e-05, + "loss": 0.3444, + "step": 25190 + }, + { + "epoch": 0.46769121092517163, + "grad_norm": 0.4633869528770447, + "learning_rate": 1.1013105764995226e-05, + "loss": 0.4417, + "step": 25192 + }, + { + "epoch": 0.46772834106259026, + "grad_norm": 0.3353174924850464, + "learning_rate": 1.1011945269214838e-05, + "loss": 0.3327, + "step": 25194 + }, + { + "epoch": 0.4677654712000089, + "grad_norm": 0.49701496958732605, + "learning_rate": 1.1010784759664908e-05, + "loss": 0.2775, + "step": 25196 + }, + { + "epoch": 0.46780260133742757, + "grad_norm": 0.33493027091026306, + "learning_rate": 1.1009624236361221e-05, + "loss": 0.2803, + "step": 25198 + }, + { + "epoch": 0.4678397314748462, + "grad_norm": 0.3537202775478363, + "learning_rate": 1.1008463699319576e-05, + "loss": 0.3812, + "step": 25200 + }, + { + "epoch": 0.4678768616122648, + "grad_norm": 0.39976415038108826, + "learning_rate": 1.1007303148555759e-05, + "loss": 0.4311, + "step": 25202 + }, + { + "epoch": 0.46791399174968346, + "grad_norm": 0.4124237596988678, + "learning_rate": 1.1006142584085565e-05, + "loss": 0.3013, + "step": 25204 + }, + { + "epoch": 0.4679511218871021, + "grad_norm": 0.30356067419052124, + "learning_rate": 1.100498200592478e-05, + "loss": 0.3088, + "step": 25206 + }, + { + "epoch": 0.46798825202452077, + "grad_norm": 0.3849911093711853, + "learning_rate": 1.1003821414089204e-05, + "loss": 0.2899, + "step": 25208 + }, + { + "epoch": 0.4680253821619394, + "grad_norm": 0.3869044780731201, + "learning_rate": 1.1002660808594625e-05, + "loss": 0.3382, + "step": 25210 + }, + { + "epoch": 0.468062512299358, + "grad_norm": 0.32600852847099304, + "learning_rate": 1.1001500189456833e-05, + "loss": 0.2665, + "step": 25212 + }, + { + "epoch": 0.46809964243677665, + "grad_norm": 0.3954405188560486, + "learning_rate": 1.1000339556691626e-05, + "loss": 0.2853, + "step": 25214 + }, + { + "epoch": 0.4681367725741953, + "grad_norm": 0.2962939441204071, + "learning_rate": 1.0999178910314793e-05, + "loss": 0.2734, + "step": 25216 + }, + { + "epoch": 0.4681739027116139, + "grad_norm": 0.34606990218162537, + "learning_rate": 1.0998018250342129e-05, + "loss": 0.174, + "step": 25218 + }, + { + "epoch": 0.4682110328490326, + "grad_norm": 0.3607162535190582, + "learning_rate": 1.0996857576789424e-05, + "loss": 0.3884, + "step": 25220 + }, + { + "epoch": 0.4682481629864512, + "grad_norm": 0.39793089032173157, + "learning_rate": 1.0995696889672475e-05, + "loss": 0.2161, + "step": 25222 + }, + { + "epoch": 0.46828529312386985, + "grad_norm": 0.34819117188453674, + "learning_rate": 1.0994536189007072e-05, + "loss": 0.3701, + "step": 25224 + }, + { + "epoch": 0.4683224232612885, + "grad_norm": 0.4439598023891449, + "learning_rate": 1.0993375474809012e-05, + "loss": 0.2586, + "step": 25226 + }, + { + "epoch": 0.4683595533987071, + "grad_norm": 0.4569803774356842, + "learning_rate": 1.0992214747094087e-05, + "loss": 0.2141, + "step": 25228 + }, + { + "epoch": 0.4683966835361258, + "grad_norm": 0.3045515716075897, + "learning_rate": 1.099105400587809e-05, + "loss": 0.4746, + "step": 25230 + }, + { + "epoch": 0.4684338136735444, + "grad_norm": 0.38938406109809875, + "learning_rate": 1.098989325117682e-05, + "loss": 0.413, + "step": 25232 + }, + { + "epoch": 0.46847094381096305, + "grad_norm": 0.5721226334571838, + "learning_rate": 1.0988732483006065e-05, + "loss": 0.3424, + "step": 25234 + }, + { + "epoch": 0.4685080739483817, + "grad_norm": 0.30690598487854004, + "learning_rate": 1.0987571701381622e-05, + "loss": 0.215, + "step": 25236 + }, + { + "epoch": 0.4685452040858003, + "grad_norm": 0.28986838459968567, + "learning_rate": 1.098641090631929e-05, + "loss": 0.3765, + "step": 25238 + }, + { + "epoch": 0.46858233422321893, + "grad_norm": 0.3085126280784607, + "learning_rate": 1.0985250097834856e-05, + "loss": 0.4014, + "step": 25240 + }, + { + "epoch": 0.4686194643606376, + "grad_norm": 0.3355043828487396, + "learning_rate": 1.0984089275944124e-05, + "loss": 0.3825, + "step": 25242 + }, + { + "epoch": 0.46865659449805624, + "grad_norm": 0.3309830129146576, + "learning_rate": 1.098292844066288e-05, + "loss": 0.213, + "step": 25244 + }, + { + "epoch": 0.4686937246354749, + "grad_norm": 0.4079948365688324, + "learning_rate": 1.0981767592006926e-05, + "loss": 0.3177, + "step": 25246 + }, + { + "epoch": 0.4687308547728935, + "grad_norm": 0.3680943250656128, + "learning_rate": 1.0980606729992057e-05, + "loss": 0.3049, + "step": 25248 + }, + { + "epoch": 0.46876798491031213, + "grad_norm": 0.37264418601989746, + "learning_rate": 1.0979445854634063e-05, + "loss": 0.2529, + "step": 25250 + }, + { + "epoch": 0.4688051150477308, + "grad_norm": 0.4549490511417389, + "learning_rate": 1.0978284965948749e-05, + "loss": 0.2483, + "step": 25252 + }, + { + "epoch": 0.46884224518514944, + "grad_norm": 0.4595699906349182, + "learning_rate": 1.0977124063951907e-05, + "loss": 0.3032, + "step": 25254 + }, + { + "epoch": 0.46887937532256807, + "grad_norm": 0.5923108458518982, + "learning_rate": 1.0975963148659332e-05, + "loss": 0.2797, + "step": 25256 + }, + { + "epoch": 0.4689165054599867, + "grad_norm": 0.3875150680541992, + "learning_rate": 1.097480222008682e-05, + "loss": 0.2685, + "step": 25258 + }, + { + "epoch": 0.4689536355974053, + "grad_norm": 0.580773651599884, + "learning_rate": 1.0973641278250174e-05, + "loss": 0.3159, + "step": 25260 + }, + { + "epoch": 0.468990765734824, + "grad_norm": 0.35838016867637634, + "learning_rate": 1.0972480323165184e-05, + "loss": 0.4134, + "step": 25262 + }, + { + "epoch": 0.46902789587224264, + "grad_norm": 0.41542139649391174, + "learning_rate": 1.097131935484765e-05, + "loss": 0.2316, + "step": 25264 + }, + { + "epoch": 0.46906502600966127, + "grad_norm": 0.4149010181427002, + "learning_rate": 1.0970158373313372e-05, + "loss": 0.4205, + "step": 25266 + }, + { + "epoch": 0.4691021561470799, + "grad_norm": 0.582599401473999, + "learning_rate": 1.096899737857814e-05, + "loss": 0.325, + "step": 25268 + }, + { + "epoch": 0.4691392862844985, + "grad_norm": 0.4867689609527588, + "learning_rate": 1.0967836370657756e-05, + "loss": 0.3488, + "step": 25270 + }, + { + "epoch": 0.46917641642191715, + "grad_norm": 0.34824836254119873, + "learning_rate": 1.0966675349568022e-05, + "loss": 0.2708, + "step": 25272 + }, + { + "epoch": 0.46921354655933584, + "grad_norm": 0.4124637842178345, + "learning_rate": 1.096551431532473e-05, + "loss": 0.3572, + "step": 25274 + }, + { + "epoch": 0.46925067669675447, + "grad_norm": 0.28803399205207825, + "learning_rate": 1.0964353267943676e-05, + "loss": 0.2799, + "step": 25276 + }, + { + "epoch": 0.4692878068341731, + "grad_norm": 0.3415999114513397, + "learning_rate": 1.096319220744067e-05, + "loss": 0.3573, + "step": 25278 + }, + { + "epoch": 0.4693249369715917, + "grad_norm": 0.6424931883811951, + "learning_rate": 1.09620311338315e-05, + "loss": 0.4905, + "step": 25280 + }, + { + "epoch": 0.46936206710901035, + "grad_norm": 0.3480803966522217, + "learning_rate": 1.0960870047131964e-05, + "loss": 0.2942, + "step": 25282 + }, + { + "epoch": 0.46939919724642903, + "grad_norm": 0.3467440903186798, + "learning_rate": 1.095970894735787e-05, + "loss": 0.2624, + "step": 25284 + }, + { + "epoch": 0.46943632738384766, + "grad_norm": 0.32604917883872986, + "learning_rate": 1.0958547834525014e-05, + "loss": 0.3658, + "step": 25286 + }, + { + "epoch": 0.4694734575212663, + "grad_norm": 0.5554585456848145, + "learning_rate": 1.0957386708649186e-05, + "loss": 0.2983, + "step": 25288 + }, + { + "epoch": 0.4695105876586849, + "grad_norm": 0.3255753517150879, + "learning_rate": 1.0956225569746197e-05, + "loss": 0.4388, + "step": 25290 + }, + { + "epoch": 0.46954771779610355, + "grad_norm": 0.4001261591911316, + "learning_rate": 1.095506441783184e-05, + "loss": 0.335, + "step": 25292 + }, + { + "epoch": 0.4695848479335222, + "grad_norm": 0.4568459987640381, + "learning_rate": 1.095390325292192e-05, + "loss": 0.2287, + "step": 25294 + }, + { + "epoch": 0.46962197807094086, + "grad_norm": 0.35111257433891296, + "learning_rate": 1.0952742075032233e-05, + "loss": 0.2003, + "step": 25296 + }, + { + "epoch": 0.4696591082083595, + "grad_norm": 0.23794105648994446, + "learning_rate": 1.0951580884178578e-05, + "loss": 0.287, + "step": 25298 + }, + { + "epoch": 0.4696962383457781, + "grad_norm": 0.44977840781211853, + "learning_rate": 1.0950419680376758e-05, + "loss": 0.3437, + "step": 25300 + }, + { + "epoch": 0.46973336848319674, + "grad_norm": 0.37733030319213867, + "learning_rate": 1.0949258463642573e-05, + "loss": 0.1901, + "step": 25302 + }, + { + "epoch": 0.4697704986206154, + "grad_norm": 0.4485464096069336, + "learning_rate": 1.0948097233991826e-05, + "loss": 0.215, + "step": 25304 + }, + { + "epoch": 0.46980762875803406, + "grad_norm": 0.3021954894065857, + "learning_rate": 1.0946935991440316e-05, + "loss": 0.2614, + "step": 25306 + }, + { + "epoch": 0.4698447588954527, + "grad_norm": 0.33837538957595825, + "learning_rate": 1.094577473600384e-05, + "loss": 0.2, + "step": 25308 + }, + { + "epoch": 0.4698818890328713, + "grad_norm": 0.3244591951370239, + "learning_rate": 1.0944613467698206e-05, + "loss": 0.2979, + "step": 25310 + }, + { + "epoch": 0.46991901917028994, + "grad_norm": 0.40208226442337036, + "learning_rate": 1.094345218653921e-05, + "loss": 0.3295, + "step": 25312 + }, + { + "epoch": 0.46995614930770857, + "grad_norm": 0.4616546332836151, + "learning_rate": 1.0942290892542655e-05, + "loss": 0.6368, + "step": 25314 + }, + { + "epoch": 0.4699932794451272, + "grad_norm": 0.3548000454902649, + "learning_rate": 1.0941129585724349e-05, + "loss": 0.3495, + "step": 25316 + }, + { + "epoch": 0.4700304095825459, + "grad_norm": 0.28740596771240234, + "learning_rate": 1.0939968266100082e-05, + "loss": 0.1909, + "step": 25318 + }, + { + "epoch": 0.4700675397199645, + "grad_norm": 0.31129199266433716, + "learning_rate": 1.0938806933685664e-05, + "loss": 0.1985, + "step": 25320 + }, + { + "epoch": 0.47010466985738314, + "grad_norm": 0.31043151021003723, + "learning_rate": 1.0937645588496897e-05, + "loss": 0.3829, + "step": 25322 + }, + { + "epoch": 0.47014179999480177, + "grad_norm": 0.3982013165950775, + "learning_rate": 1.093648423054958e-05, + "loss": 0.1303, + "step": 25324 + }, + { + "epoch": 0.4701789301322204, + "grad_norm": 0.45914027094841003, + "learning_rate": 1.093532285985952e-05, + "loss": 0.3286, + "step": 25326 + }, + { + "epoch": 0.4702160602696391, + "grad_norm": 0.3020865321159363, + "learning_rate": 1.0934161476442517e-05, + "loss": 0.3722, + "step": 25328 + }, + { + "epoch": 0.4702531904070577, + "grad_norm": 0.43697452545166016, + "learning_rate": 1.0933000080314373e-05, + "loss": 0.2866, + "step": 25330 + }, + { + "epoch": 0.47029032054447634, + "grad_norm": 0.5363171100616455, + "learning_rate": 1.0931838671490893e-05, + "loss": 0.3378, + "step": 25332 + }, + { + "epoch": 0.47032745068189497, + "grad_norm": 0.30566373467445374, + "learning_rate": 1.0930677249987882e-05, + "loss": 0.1571, + "step": 25334 + }, + { + "epoch": 0.4703645808193136, + "grad_norm": 0.3327628970146179, + "learning_rate": 1.0929515815821139e-05, + "loss": 0.1733, + "step": 25336 + }, + { + "epoch": 0.4704017109567323, + "grad_norm": 0.4444446563720703, + "learning_rate": 1.092835436900647e-05, + "loss": 0.2512, + "step": 25338 + }, + { + "epoch": 0.4704388410941509, + "grad_norm": 0.5063433647155762, + "learning_rate": 1.0927192909559678e-05, + "loss": 0.4599, + "step": 25340 + }, + { + "epoch": 0.47047597123156953, + "grad_norm": 0.3016646206378937, + "learning_rate": 1.0926031437496571e-05, + "loss": 0.0265, + "step": 25342 + }, + { + "epoch": 0.47051310136898816, + "grad_norm": 0.1943443864583969, + "learning_rate": 1.0924869952832949e-05, + "loss": 0.1243, + "step": 25344 + }, + { + "epoch": 0.4705502315064068, + "grad_norm": 0.47387853264808655, + "learning_rate": 1.0923708455584616e-05, + "loss": 0.2959, + "step": 25346 + }, + { + "epoch": 0.4705873616438254, + "grad_norm": 0.40176093578338623, + "learning_rate": 1.092254694576738e-05, + "loss": 0.2729, + "step": 25348 + }, + { + "epoch": 0.4706244917812441, + "grad_norm": 0.4118253290653229, + "learning_rate": 1.092138542339704e-05, + "loss": 0.2129, + "step": 25350 + }, + { + "epoch": 0.47066162191866273, + "grad_norm": 0.6900607943534851, + "learning_rate": 1.0920223888489405e-05, + "loss": 0.2445, + "step": 25352 + }, + { + "epoch": 0.47069875205608136, + "grad_norm": 0.42615848779678345, + "learning_rate": 1.091906234106028e-05, + "loss": 0.346, + "step": 25354 + }, + { + "epoch": 0.4707358821935, + "grad_norm": 0.4939764738082886, + "learning_rate": 1.091790078112547e-05, + "loss": 0.2522, + "step": 25356 + }, + { + "epoch": 0.4707730123309186, + "grad_norm": 0.40538614988327026, + "learning_rate": 1.091673920870078e-05, + "loss": 0.3133, + "step": 25358 + }, + { + "epoch": 0.4708101424683373, + "grad_norm": 0.3616763949394226, + "learning_rate": 1.0915577623802017e-05, + "loss": 0.232, + "step": 25360 + }, + { + "epoch": 0.47084727260575593, + "grad_norm": 0.3461170494556427, + "learning_rate": 1.091441602644498e-05, + "loss": 0.4564, + "step": 25362 + }, + { + "epoch": 0.47088440274317456, + "grad_norm": 0.5732318758964539, + "learning_rate": 1.0913254416645483e-05, + "loss": 0.4667, + "step": 25364 + }, + { + "epoch": 0.4709215328805932, + "grad_norm": 0.4554021656513214, + "learning_rate": 1.091209279441933e-05, + "loss": 0.4004, + "step": 25366 + }, + { + "epoch": 0.4709586630180118, + "grad_norm": 0.31949055194854736, + "learning_rate": 1.091093115978232e-05, + "loss": 0.165, + "step": 25368 + }, + { + "epoch": 0.47099579315543044, + "grad_norm": 0.41777971386909485, + "learning_rate": 1.0909769512750272e-05, + "loss": 0.2034, + "step": 25370 + }, + { + "epoch": 0.4710329232928491, + "grad_norm": 0.3102082908153534, + "learning_rate": 1.0908607853338986e-05, + "loss": 0.2731, + "step": 25372 + }, + { + "epoch": 0.47107005343026775, + "grad_norm": 0.4500311017036438, + "learning_rate": 1.0907446181564266e-05, + "loss": 0.4227, + "step": 25374 + }, + { + "epoch": 0.4711071835676864, + "grad_norm": 0.5790847539901733, + "learning_rate": 1.0906284497441922e-05, + "loss": 0.2848, + "step": 25376 + }, + { + "epoch": 0.471144313705105, + "grad_norm": 0.29279616475105286, + "learning_rate": 1.0905122800987762e-05, + "loss": 0.2126, + "step": 25378 + }, + { + "epoch": 0.47118144384252364, + "grad_norm": 0.317947655916214, + "learning_rate": 1.0903961092217593e-05, + "loss": 0.277, + "step": 25380 + }, + { + "epoch": 0.4712185739799423, + "grad_norm": 0.4463728666305542, + "learning_rate": 1.0902799371147217e-05, + "loss": 0.2892, + "step": 25382 + }, + { + "epoch": 0.47125570411736095, + "grad_norm": 0.39299476146698, + "learning_rate": 1.0901637637792447e-05, + "loss": 0.2097, + "step": 25384 + }, + { + "epoch": 0.4712928342547796, + "grad_norm": 0.40631529688835144, + "learning_rate": 1.0900475892169088e-05, + "loss": 0.4494, + "step": 25386 + }, + { + "epoch": 0.4713299643921982, + "grad_norm": 0.29346928000450134, + "learning_rate": 1.0899314134292953e-05, + "loss": 0.3716, + "step": 25388 + }, + { + "epoch": 0.47136709452961684, + "grad_norm": 0.4119087755680084, + "learning_rate": 1.0898152364179846e-05, + "loss": 0.2156, + "step": 25390 + }, + { + "epoch": 0.47140422466703547, + "grad_norm": 0.322813481092453, + "learning_rate": 1.0896990581845575e-05, + "loss": 0.3095, + "step": 25392 + }, + { + "epoch": 0.47144135480445415, + "grad_norm": 0.4258841574192047, + "learning_rate": 1.0895828787305949e-05, + "loss": 0.3963, + "step": 25394 + }, + { + "epoch": 0.4714784849418728, + "grad_norm": 0.4616994261741638, + "learning_rate": 1.0894666980576773e-05, + "loss": 0.2597, + "step": 25396 + }, + { + "epoch": 0.4715156150792914, + "grad_norm": 0.36743608117103577, + "learning_rate": 1.0893505161673865e-05, + "loss": 0.4485, + "step": 25398 + }, + { + "epoch": 0.47155274521671003, + "grad_norm": 0.5619312524795532, + "learning_rate": 1.0892343330613026e-05, + "loss": 0.3208, + "step": 25400 + }, + { + "epoch": 0.47158987535412866, + "grad_norm": 0.26258498430252075, + "learning_rate": 1.0891181487410066e-05, + "loss": 0.215, + "step": 25402 + }, + { + "epoch": 0.47162700549154735, + "grad_norm": 0.26424798369407654, + "learning_rate": 1.0890019632080798e-05, + "loss": 0.4018, + "step": 25404 + }, + { + "epoch": 0.471664135628966, + "grad_norm": 0.3750796616077423, + "learning_rate": 1.0888857764641023e-05, + "loss": 0.3046, + "step": 25406 + }, + { + "epoch": 0.4717012657663846, + "grad_norm": 0.4208914041519165, + "learning_rate": 1.088769588510656e-05, + "loss": 0.6732, + "step": 25408 + }, + { + "epoch": 0.47173839590380323, + "grad_norm": 0.31493067741394043, + "learning_rate": 1.0886533993493216e-05, + "loss": 0.3852, + "step": 25410 + }, + { + "epoch": 0.47177552604122186, + "grad_norm": 0.3870280385017395, + "learning_rate": 1.08853720898168e-05, + "loss": 0.3499, + "step": 25412 + }, + { + "epoch": 0.47181265617864054, + "grad_norm": 0.30913302302360535, + "learning_rate": 1.088421017409312e-05, + "loss": 0.2119, + "step": 25414 + }, + { + "epoch": 0.47184978631605917, + "grad_norm": 0.23520545661449432, + "learning_rate": 1.0883048246337988e-05, + "loss": 0.1693, + "step": 25416 + }, + { + "epoch": 0.4718869164534778, + "grad_norm": 0.41441458463668823, + "learning_rate": 1.0881886306567216e-05, + "loss": 0.3543, + "step": 25418 + }, + { + "epoch": 0.47192404659089643, + "grad_norm": 0.38247382640838623, + "learning_rate": 1.088072435479661e-05, + "loss": 0.1647, + "step": 25420 + }, + { + "epoch": 0.47196117672831506, + "grad_norm": 0.5761268734931946, + "learning_rate": 1.0879562391041987e-05, + "loss": 0.2509, + "step": 25422 + }, + { + "epoch": 0.4719983068657337, + "grad_norm": 0.35303500294685364, + "learning_rate": 1.0878400415319152e-05, + "loss": 0.3159, + "step": 25424 + }, + { + "epoch": 0.47203543700315237, + "grad_norm": 0.2870520055294037, + "learning_rate": 1.087723842764392e-05, + "loss": 0.3038, + "step": 25426 + }, + { + "epoch": 0.472072567140571, + "grad_norm": 0.36818113923072815, + "learning_rate": 1.0876076428032098e-05, + "loss": 0.3283, + "step": 25428 + }, + { + "epoch": 0.4721096972779896, + "grad_norm": 0.3182864487171173, + "learning_rate": 1.0874914416499502e-05, + "loss": 0.2689, + "step": 25430 + }, + { + "epoch": 0.47214682741540825, + "grad_norm": 0.35932067036628723, + "learning_rate": 1.0873752393061946e-05, + "loss": 0.4433, + "step": 25432 + }, + { + "epoch": 0.4721839575528269, + "grad_norm": 0.4307151436805725, + "learning_rate": 1.0872590357735228e-05, + "loss": 0.1923, + "step": 25434 + }, + { + "epoch": 0.47222108769024557, + "grad_norm": 0.33131757378578186, + "learning_rate": 1.0871428310535178e-05, + "loss": 0.3132, + "step": 25436 + }, + { + "epoch": 0.4722582178276642, + "grad_norm": 0.43724343180656433, + "learning_rate": 1.0870266251477592e-05, + "loss": 0.1883, + "step": 25438 + }, + { + "epoch": 0.4722953479650828, + "grad_norm": 0.3439030051231384, + "learning_rate": 1.086910418057829e-05, + "loss": 0.3815, + "step": 25440 + }, + { + "epoch": 0.47233247810250145, + "grad_norm": 0.40140312910079956, + "learning_rate": 1.0867942097853086e-05, + "loss": 0.4382, + "step": 25442 + }, + { + "epoch": 0.4723696082399201, + "grad_norm": 0.3417443037033081, + "learning_rate": 1.086678000331779e-05, + "loss": 0.2102, + "step": 25444 + }, + { + "epoch": 0.4724067383773387, + "grad_norm": 0.39919495582580566, + "learning_rate": 1.0865617896988212e-05, + "loss": 0.2417, + "step": 25446 + }, + { + "epoch": 0.4724438685147574, + "grad_norm": 0.39805692434310913, + "learning_rate": 1.0864455778880169e-05, + "loss": 0.1616, + "step": 25448 + }, + { + "epoch": 0.472480998652176, + "grad_norm": 0.4144118130207062, + "learning_rate": 1.0863293649009472e-05, + "loss": 0.3664, + "step": 25450 + }, + { + "epoch": 0.47251812878959465, + "grad_norm": 0.3545556664466858, + "learning_rate": 1.0862131507391933e-05, + "loss": 0.2719, + "step": 25452 + }, + { + "epoch": 0.4725552589270133, + "grad_norm": 0.27268433570861816, + "learning_rate": 1.0860969354043368e-05, + "loss": 0.2867, + "step": 25454 + }, + { + "epoch": 0.4725923890644319, + "grad_norm": 0.4307922422885895, + "learning_rate": 1.085980718897959e-05, + "loss": 0.287, + "step": 25456 + }, + { + "epoch": 0.4726295192018506, + "grad_norm": 0.4680244028568268, + "learning_rate": 1.0858645012216408e-05, + "loss": 0.187, + "step": 25458 + }, + { + "epoch": 0.4726666493392692, + "grad_norm": 0.24568291008472443, + "learning_rate": 1.0857482823769643e-05, + "loss": 0.2515, + "step": 25460 + }, + { + "epoch": 0.47270377947668785, + "grad_norm": 0.36726653575897217, + "learning_rate": 1.0856320623655103e-05, + "loss": 0.3427, + "step": 25462 + }, + { + "epoch": 0.4727409096141065, + "grad_norm": 0.4769476652145386, + "learning_rate": 1.0855158411888606e-05, + "loss": 0.4239, + "step": 25464 + }, + { + "epoch": 0.4727780397515251, + "grad_norm": 0.513557493686676, + "learning_rate": 1.0853996188485963e-05, + "loss": 0.173, + "step": 25466 + }, + { + "epoch": 0.47281516988894373, + "grad_norm": 0.49371397495269775, + "learning_rate": 1.0852833953462992e-05, + "loss": 0.4302, + "step": 25468 + }, + { + "epoch": 0.4728523000263624, + "grad_norm": 0.4058995544910431, + "learning_rate": 1.08516717068355e-05, + "loss": 0.457, + "step": 25470 + }, + { + "epoch": 0.47288943016378104, + "grad_norm": 0.6060823202133179, + "learning_rate": 1.085050944861931e-05, + "loss": 0.3163, + "step": 25472 + }, + { + "epoch": 0.47292656030119967, + "grad_norm": 0.3792531490325928, + "learning_rate": 1.0849347178830236e-05, + "loss": 0.2224, + "step": 25474 + }, + { + "epoch": 0.4729636904386183, + "grad_norm": 0.2610177993774414, + "learning_rate": 1.0848184897484092e-05, + "loss": 0.2312, + "step": 25476 + }, + { + "epoch": 0.47300082057603693, + "grad_norm": 0.45387837290763855, + "learning_rate": 1.0847022604596688e-05, + "loss": 0.3001, + "step": 25478 + }, + { + "epoch": 0.4730379507134556, + "grad_norm": 0.4042569696903229, + "learning_rate": 1.0845860300183848e-05, + "loss": 0.205, + "step": 25480 + }, + { + "epoch": 0.47307508085087424, + "grad_norm": 0.2229710817337036, + "learning_rate": 1.0844697984261378e-05, + "loss": 0.3165, + "step": 25482 + }, + { + "epoch": 0.47311221098829287, + "grad_norm": 0.2741638422012329, + "learning_rate": 1.0843535656845104e-05, + "loss": 0.249, + "step": 25484 + }, + { + "epoch": 0.4731493411257115, + "grad_norm": 0.3848188817501068, + "learning_rate": 1.0842373317950835e-05, + "loss": 0.233, + "step": 25486 + }, + { + "epoch": 0.4731864712631301, + "grad_norm": 0.3627490699291229, + "learning_rate": 1.0841210967594385e-05, + "loss": 0.1811, + "step": 25488 + }, + { + "epoch": 0.4732236014005488, + "grad_norm": 0.34293967485427856, + "learning_rate": 1.0840048605791573e-05, + "loss": 0.3531, + "step": 25490 + }, + { + "epoch": 0.47326073153796744, + "grad_norm": 0.4283178746700287, + "learning_rate": 1.0838886232558219e-05, + "loss": 0.2431, + "step": 25492 + }, + { + "epoch": 0.47329786167538607, + "grad_norm": 0.4440542161464691, + "learning_rate": 1.0837723847910135e-05, + "loss": 0.3731, + "step": 25494 + }, + { + "epoch": 0.4733349918128047, + "grad_norm": 0.5516613721847534, + "learning_rate": 1.0836561451863138e-05, + "loss": 0.367, + "step": 25496 + }, + { + "epoch": 0.4733721219502233, + "grad_norm": 0.26940014958381653, + "learning_rate": 1.0835399044433048e-05, + "loss": 0.2869, + "step": 25498 + }, + { + "epoch": 0.47340925208764195, + "grad_norm": 0.3848629295825958, + "learning_rate": 1.0834236625635678e-05, + "loss": 0.2989, + "step": 25500 + }, + { + "epoch": 0.47344638222506064, + "grad_norm": 0.5390532612800598, + "learning_rate": 1.0833074195486844e-05, + "loss": 0.4017, + "step": 25502 + }, + { + "epoch": 0.47348351236247926, + "grad_norm": 0.2716115117073059, + "learning_rate": 1.0831911754002367e-05, + "loss": 0.3809, + "step": 25504 + }, + { + "epoch": 0.4735206424998979, + "grad_norm": 0.4002685248851776, + "learning_rate": 1.0830749301198062e-05, + "loss": 0.3151, + "step": 25506 + }, + { + "epoch": 0.4735577726373165, + "grad_norm": 0.4001477360725403, + "learning_rate": 1.0829586837089752e-05, + "loss": 0.2348, + "step": 25508 + }, + { + "epoch": 0.47359490277473515, + "grad_norm": 0.3563826382160187, + "learning_rate": 1.0828424361693247e-05, + "loss": 0.1986, + "step": 25510 + }, + { + "epoch": 0.47363203291215383, + "grad_norm": 0.36404919624328613, + "learning_rate": 1.0827261875024366e-05, + "loss": 0.3029, + "step": 25512 + }, + { + "epoch": 0.47366916304957246, + "grad_norm": 0.48661258816719055, + "learning_rate": 1.0826099377098931e-05, + "loss": 0.3824, + "step": 25514 + }, + { + "epoch": 0.4737062931869911, + "grad_norm": 0.3092491328716278, + "learning_rate": 1.0824936867932759e-05, + "loss": 0.2552, + "step": 25516 + }, + { + "epoch": 0.4737434233244097, + "grad_norm": 0.591813862323761, + "learning_rate": 1.0823774347541664e-05, + "loss": 0.2769, + "step": 25518 + }, + { + "epoch": 0.47378055346182835, + "grad_norm": 0.2924026846885681, + "learning_rate": 1.0822611815941468e-05, + "loss": 0.1458, + "step": 25520 + }, + { + "epoch": 0.473817683599247, + "grad_norm": 0.26555246114730835, + "learning_rate": 1.082144927314799e-05, + "loss": 0.2173, + "step": 25522 + }, + { + "epoch": 0.47385481373666566, + "grad_norm": 0.26442041993141174, + "learning_rate": 1.082028671917705e-05, + "loss": 0.2171, + "step": 25524 + }, + { + "epoch": 0.4738919438740843, + "grad_norm": 0.3518792390823364, + "learning_rate": 1.0819124154044463e-05, + "loss": 0.3947, + "step": 25526 + }, + { + "epoch": 0.4739290740115029, + "grad_norm": 0.37692686915397644, + "learning_rate": 1.081796157776605e-05, + "loss": 0.4507, + "step": 25528 + }, + { + "epoch": 0.47396620414892154, + "grad_norm": 0.5003446340560913, + "learning_rate": 1.0816798990357631e-05, + "loss": 0.2765, + "step": 25530 + }, + { + "epoch": 0.47400333428634017, + "grad_norm": 0.22410474717617035, + "learning_rate": 1.0815636391835023e-05, + "loss": 0.1679, + "step": 25532 + }, + { + "epoch": 0.47404046442375886, + "grad_norm": 0.480230450630188, + "learning_rate": 1.0814473782214045e-05, + "loss": 0.2638, + "step": 25534 + }, + { + "epoch": 0.4740775945611775, + "grad_norm": 0.33393579721450806, + "learning_rate": 1.0813311161510522e-05, + "loss": 0.4679, + "step": 25536 + }, + { + "epoch": 0.4741147246985961, + "grad_norm": 0.5020701289176941, + "learning_rate": 1.0812148529740269e-05, + "loss": 0.362, + "step": 25538 + }, + { + "epoch": 0.47415185483601474, + "grad_norm": 0.5628647208213806, + "learning_rate": 1.0810985886919108e-05, + "loss": 0.5066, + "step": 25540 + }, + { + "epoch": 0.47418898497343337, + "grad_norm": 0.5545311570167542, + "learning_rate": 1.0809823233062858e-05, + "loss": 0.2333, + "step": 25542 + }, + { + "epoch": 0.474226115110852, + "grad_norm": 0.37639284133911133, + "learning_rate": 1.0808660568187337e-05, + "loss": 0.6017, + "step": 25544 + }, + { + "epoch": 0.4742632452482707, + "grad_norm": 0.38127413392066956, + "learning_rate": 1.0807497892308367e-05, + "loss": 0.2783, + "step": 25546 + }, + { + "epoch": 0.4743003753856893, + "grad_norm": 0.3367161452770233, + "learning_rate": 1.0806335205441774e-05, + "loss": 0.3212, + "step": 25548 + }, + { + "epoch": 0.47433750552310794, + "grad_norm": 0.26724082231521606, + "learning_rate": 1.0805172507603374e-05, + "loss": 0.3105, + "step": 25550 + }, + { + "epoch": 0.47437463566052657, + "grad_norm": 0.429168164730072, + "learning_rate": 1.0804009798808983e-05, + "loss": 0.3194, + "step": 25552 + }, + { + "epoch": 0.4744117657979452, + "grad_norm": 0.4205878674983978, + "learning_rate": 1.080284707907443e-05, + "loss": 0.3499, + "step": 25554 + }, + { + "epoch": 0.4744488959353639, + "grad_norm": 0.3207207918167114, + "learning_rate": 1.0801684348415534e-05, + "loss": 0.0743, + "step": 25556 + }, + { + "epoch": 0.4744860260727825, + "grad_norm": 0.5857090950012207, + "learning_rate": 1.0800521606848114e-05, + "loss": 0.4589, + "step": 25558 + }, + { + "epoch": 0.47452315621020114, + "grad_norm": 0.43266546726226807, + "learning_rate": 1.0799358854387994e-05, + "loss": 0.2773, + "step": 25560 + }, + { + "epoch": 0.47456028634761976, + "grad_norm": 0.5602853298187256, + "learning_rate": 1.0798196091050994e-05, + "loss": 0.3807, + "step": 25562 + }, + { + "epoch": 0.4745974164850384, + "grad_norm": 0.4169822037220001, + "learning_rate": 1.0797033316852934e-05, + "loss": 0.2354, + "step": 25564 + }, + { + "epoch": 0.4746345466224571, + "grad_norm": 0.31028062105178833, + "learning_rate": 1.079587053180964e-05, + "loss": 0.3197, + "step": 25566 + }, + { + "epoch": 0.4746716767598757, + "grad_norm": 0.4163873493671417, + "learning_rate": 1.0794707735936932e-05, + "loss": 0.1939, + "step": 25568 + }, + { + "epoch": 0.47470880689729433, + "grad_norm": 0.4786476492881775, + "learning_rate": 1.0793544929250632e-05, + "loss": 0.5305, + "step": 25570 + }, + { + "epoch": 0.47474593703471296, + "grad_norm": 0.3375973105430603, + "learning_rate": 1.0792382111766562e-05, + "loss": 0.3852, + "step": 25572 + }, + { + "epoch": 0.4747830671721316, + "grad_norm": 0.31792545318603516, + "learning_rate": 1.0791219283500545e-05, + "loss": 0.2206, + "step": 25574 + }, + { + "epoch": 0.4748201973095502, + "grad_norm": 0.356144517660141, + "learning_rate": 1.0790056444468404e-05, + "loss": 0.4568, + "step": 25576 + }, + { + "epoch": 0.4748573274469689, + "grad_norm": 0.19883421063423157, + "learning_rate": 1.0788893594685959e-05, + "loss": 0.3621, + "step": 25578 + }, + { + "epoch": 0.47489445758438753, + "grad_norm": 0.2478051334619522, + "learning_rate": 1.0787730734169038e-05, + "loss": 0.4037, + "step": 25580 + }, + { + "epoch": 0.47493158772180616, + "grad_norm": 0.5600211024284363, + "learning_rate": 1.078656786293346e-05, + "loss": 0.6172, + "step": 25582 + }, + { + "epoch": 0.4749687178592248, + "grad_norm": 0.4443606436252594, + "learning_rate": 1.0785404980995051e-05, + "loss": 0.2629, + "step": 25584 + }, + { + "epoch": 0.4750058479966434, + "grad_norm": 0.31411972641944885, + "learning_rate": 1.078424208836963e-05, + "loss": 0.179, + "step": 25586 + }, + { + "epoch": 0.4750429781340621, + "grad_norm": 0.3916686475276947, + "learning_rate": 1.0783079185073024e-05, + "loss": 0.2815, + "step": 25588 + }, + { + "epoch": 0.4750801082714807, + "grad_norm": 0.2799230217933655, + "learning_rate": 1.0781916271121055e-05, + "loss": 0.3454, + "step": 25590 + }, + { + "epoch": 0.47511723840889936, + "grad_norm": 0.31751441955566406, + "learning_rate": 1.078075334652955e-05, + "loss": 0.4885, + "step": 25592 + }, + { + "epoch": 0.475154368546318, + "grad_norm": 0.4760719835758209, + "learning_rate": 1.0779590411314334e-05, + "loss": 0.3011, + "step": 25594 + }, + { + "epoch": 0.4751914986837366, + "grad_norm": 0.5476299524307251, + "learning_rate": 1.077842746549122e-05, + "loss": 0.4214, + "step": 25596 + }, + { + "epoch": 0.47522862882115524, + "grad_norm": 0.37174248695373535, + "learning_rate": 1.0777264509076045e-05, + "loss": 0.4231, + "step": 25598 + }, + { + "epoch": 0.4752657589585739, + "grad_norm": 0.6096732020378113, + "learning_rate": 1.0776101542084628e-05, + "loss": 0.3826, + "step": 25600 + }, + { + "epoch": 0.47530288909599255, + "grad_norm": 0.596465528011322, + "learning_rate": 1.077493856453279e-05, + "loss": 0.2254, + "step": 25602 + }, + { + "epoch": 0.4753400192334112, + "grad_norm": 0.33112844824790955, + "learning_rate": 1.0773775576436363e-05, + "loss": 0.3118, + "step": 25604 + }, + { + "epoch": 0.4753771493708298, + "grad_norm": 0.2534058094024658, + "learning_rate": 1.0772612577811167e-05, + "loss": 0.2804, + "step": 25606 + }, + { + "epoch": 0.47541427950824844, + "grad_norm": 0.32938072085380554, + "learning_rate": 1.0771449568673027e-05, + "loss": 0.2835, + "step": 25608 + }, + { + "epoch": 0.4754514096456671, + "grad_norm": 0.49696657061576843, + "learning_rate": 1.077028654903777e-05, + "loss": 0.264, + "step": 25610 + }, + { + "epoch": 0.47548853978308575, + "grad_norm": 0.43749210238456726, + "learning_rate": 1.0769123518921221e-05, + "loss": 0.0831, + "step": 25612 + }, + { + "epoch": 0.4755256699205044, + "grad_norm": 0.3019997477531433, + "learning_rate": 1.0767960478339204e-05, + "loss": 0.1734, + "step": 25614 + }, + { + "epoch": 0.475562800057923, + "grad_norm": 0.3572320342063904, + "learning_rate": 1.0766797427307544e-05, + "loss": 0.3879, + "step": 25616 + }, + { + "epoch": 0.47559993019534164, + "grad_norm": 0.3345218896865845, + "learning_rate": 1.076563436584207e-05, + "loss": 0.3348, + "step": 25618 + }, + { + "epoch": 0.47563706033276026, + "grad_norm": 0.3520383834838867, + "learning_rate": 1.0764471293958602e-05, + "loss": 0.3428, + "step": 25620 + }, + { + "epoch": 0.47567419047017895, + "grad_norm": 0.3539745807647705, + "learning_rate": 1.0763308211672972e-05, + "loss": 0.2175, + "step": 25622 + }, + { + "epoch": 0.4757113206075976, + "grad_norm": 0.4124791920185089, + "learning_rate": 1.0762145119001009e-05, + "loss": 0.4167, + "step": 25624 + }, + { + "epoch": 0.4757484507450162, + "grad_norm": 0.55335533618927, + "learning_rate": 1.0760982015958526e-05, + "loss": 0.2349, + "step": 25626 + }, + { + "epoch": 0.47578558088243483, + "grad_norm": 0.4106026291847229, + "learning_rate": 1.0759818902561358e-05, + "loss": 0.1539, + "step": 25628 + }, + { + "epoch": 0.47582271101985346, + "grad_norm": 0.31891921162605286, + "learning_rate": 1.0758655778825334e-05, + "loss": 0.1954, + "step": 25630 + }, + { + "epoch": 0.47585984115727215, + "grad_norm": 0.3169207274913788, + "learning_rate": 1.0757492644766273e-05, + "loss": 0.586, + "step": 25632 + }, + { + "epoch": 0.4758969712946908, + "grad_norm": 0.7749449014663696, + "learning_rate": 1.0756329500400011e-05, + "loss": 0.4366, + "step": 25634 + }, + { + "epoch": 0.4759341014321094, + "grad_norm": 0.37495163083076477, + "learning_rate": 1.0755166345742368e-05, + "loss": 0.3604, + "step": 25636 + }, + { + "epoch": 0.47597123156952803, + "grad_norm": 0.4163711667060852, + "learning_rate": 1.0754003180809173e-05, + "loss": 0.4244, + "step": 25638 + }, + { + "epoch": 0.47600836170694666, + "grad_norm": 0.4245756268501282, + "learning_rate": 1.0752840005616251e-05, + "loss": 0.3348, + "step": 25640 + }, + { + "epoch": 0.47604549184436534, + "grad_norm": 0.36342230439186096, + "learning_rate": 1.0751676820179433e-05, + "loss": 0.2902, + "step": 25642 + }, + { + "epoch": 0.47608262198178397, + "grad_norm": 0.5366952419281006, + "learning_rate": 1.0750513624514546e-05, + "loss": 0.5316, + "step": 25644 + }, + { + "epoch": 0.4761197521192026, + "grad_norm": 0.3358038365840912, + "learning_rate": 1.0749350418637419e-05, + "loss": 0.409, + "step": 25646 + }, + { + "epoch": 0.47615688225662123, + "grad_norm": 0.3410406708717346, + "learning_rate": 1.0748187202563873e-05, + "loss": 0.307, + "step": 25648 + }, + { + "epoch": 0.47619401239403986, + "grad_norm": 0.36222317814826965, + "learning_rate": 1.0747023976309743e-05, + "loss": 0.2426, + "step": 25650 + }, + { + "epoch": 0.4762311425314585, + "grad_norm": 0.47697383165359497, + "learning_rate": 1.0745860739890852e-05, + "loss": 0.2783, + "step": 25652 + }, + { + "epoch": 0.47626827266887717, + "grad_norm": 0.2913890480995178, + "learning_rate": 1.0744697493323033e-05, + "loss": 0.1946, + "step": 25654 + }, + { + "epoch": 0.4763054028062958, + "grad_norm": 0.4798871874809265, + "learning_rate": 1.074353423662211e-05, + "loss": 0.4034, + "step": 25656 + }, + { + "epoch": 0.4763425329437144, + "grad_norm": 0.46947070956230164, + "learning_rate": 1.0742370969803912e-05, + "loss": 0.3679, + "step": 25658 + }, + { + "epoch": 0.47637966308113305, + "grad_norm": 0.5112821459770203, + "learning_rate": 1.074120769288427e-05, + "loss": 0.3529, + "step": 25660 + }, + { + "epoch": 0.4764167932185517, + "grad_norm": 0.2952834665775299, + "learning_rate": 1.0740044405879012e-05, + "loss": 0.3201, + "step": 25662 + }, + { + "epoch": 0.47645392335597037, + "grad_norm": 0.4461756944656372, + "learning_rate": 1.0738881108803965e-05, + "loss": 0.3363, + "step": 25664 + }, + { + "epoch": 0.476491053493389, + "grad_norm": 0.34060725569725037, + "learning_rate": 1.073771780167496e-05, + "loss": 0.4085, + "step": 25666 + }, + { + "epoch": 0.4765281836308076, + "grad_norm": 0.28135544061660767, + "learning_rate": 1.0736554484507829e-05, + "loss": 0.2788, + "step": 25668 + }, + { + "epoch": 0.47656531376822625, + "grad_norm": 0.5438511967658997, + "learning_rate": 1.0735391157318393e-05, + "loss": 0.3363, + "step": 25670 + }, + { + "epoch": 0.4766024439056449, + "grad_norm": 0.4071270227432251, + "learning_rate": 1.0734227820122487e-05, + "loss": 0.3527, + "step": 25672 + }, + { + "epoch": 0.4766395740430635, + "grad_norm": 0.33415988087654114, + "learning_rate": 1.0733064472935939e-05, + "loss": 0.4921, + "step": 25674 + }, + { + "epoch": 0.4766767041804822, + "grad_norm": 0.4653334319591522, + "learning_rate": 1.0731901115774582e-05, + "loss": 0.3871, + "step": 25676 + }, + { + "epoch": 0.4767138343179008, + "grad_norm": 0.3142509162425995, + "learning_rate": 1.0730737748654241e-05, + "loss": 0.3771, + "step": 25678 + }, + { + "epoch": 0.47675096445531945, + "grad_norm": 0.38125258684158325, + "learning_rate": 1.0729574371590747e-05, + "loss": 0.4175, + "step": 25680 + }, + { + "epoch": 0.4767880945927381, + "grad_norm": 0.5916948318481445, + "learning_rate": 1.0728410984599935e-05, + "loss": 0.3363, + "step": 25682 + }, + { + "epoch": 0.4768252247301567, + "grad_norm": 0.4913061559200287, + "learning_rate": 1.0727247587697627e-05, + "loss": 0.2275, + "step": 25684 + }, + { + "epoch": 0.4768623548675754, + "grad_norm": 0.40251681208610535, + "learning_rate": 1.0726084180899661e-05, + "loss": 0.5608, + "step": 25686 + }, + { + "epoch": 0.476899485004994, + "grad_norm": 0.5869660377502441, + "learning_rate": 1.0724920764221864e-05, + "loss": 0.357, + "step": 25688 + }, + { + "epoch": 0.47693661514241265, + "grad_norm": 0.35948267579078674, + "learning_rate": 1.0723757337680065e-05, + "loss": 0.1441, + "step": 25690 + }, + { + "epoch": 0.4769737452798313, + "grad_norm": 0.35727715492248535, + "learning_rate": 1.0722593901290097e-05, + "loss": 0.3451, + "step": 25692 + }, + { + "epoch": 0.4770108754172499, + "grad_norm": 0.3327142298221588, + "learning_rate": 1.0721430455067791e-05, + "loss": 0.2602, + "step": 25694 + }, + { + "epoch": 0.47704800555466853, + "grad_norm": 0.305571973323822, + "learning_rate": 1.0720266999028976e-05, + "loss": 0.4308, + "step": 25696 + }, + { + "epoch": 0.4770851356920872, + "grad_norm": 0.2570231854915619, + "learning_rate": 1.0719103533189488e-05, + "loss": 0.2282, + "step": 25698 + }, + { + "epoch": 0.47712226582950584, + "grad_norm": 0.34305548667907715, + "learning_rate": 1.0717940057565153e-05, + "loss": 0.2475, + "step": 25700 + }, + { + "epoch": 0.47715939596692447, + "grad_norm": 0.4209199547767639, + "learning_rate": 1.0716776572171803e-05, + "loss": 0.3068, + "step": 25702 + }, + { + "epoch": 0.4771965261043431, + "grad_norm": 0.2675342857837677, + "learning_rate": 1.0715613077025271e-05, + "loss": 0.3316, + "step": 25704 + }, + { + "epoch": 0.47723365624176173, + "grad_norm": 0.32007548213005066, + "learning_rate": 1.071444957214139e-05, + "loss": 0.2145, + "step": 25706 + }, + { + "epoch": 0.4772707863791804, + "grad_norm": 0.32496023178100586, + "learning_rate": 1.071328605753599e-05, + "loss": 0.466, + "step": 25708 + }, + { + "epoch": 0.47730791651659904, + "grad_norm": 0.3902900516986847, + "learning_rate": 1.0712122533224903e-05, + "loss": 0.1209, + "step": 25710 + }, + { + "epoch": 0.47734504665401767, + "grad_norm": 0.3667261600494385, + "learning_rate": 1.071095899922396e-05, + "loss": 0.3444, + "step": 25712 + }, + { + "epoch": 0.4773821767914363, + "grad_norm": 0.37834101915359497, + "learning_rate": 1.0709795455548996e-05, + "loss": 0.2929, + "step": 25714 + }, + { + "epoch": 0.4774193069288549, + "grad_norm": 0.32055819034576416, + "learning_rate": 1.070863190221584e-05, + "loss": 0.1999, + "step": 25716 + }, + { + "epoch": 0.4774564370662736, + "grad_norm": 0.5087131857872009, + "learning_rate": 1.070746833924033e-05, + "loss": 0.2359, + "step": 25718 + }, + { + "epoch": 0.47749356720369224, + "grad_norm": 2.0127739906311035, + "learning_rate": 1.0706304766638296e-05, + "loss": 0.3208, + "step": 25720 + }, + { + "epoch": 0.47753069734111087, + "grad_norm": 0.47244301438331604, + "learning_rate": 1.0705141184425565e-05, + "loss": 0.4341, + "step": 25722 + }, + { + "epoch": 0.4775678274785295, + "grad_norm": 0.35016411542892456, + "learning_rate": 1.0703977592617975e-05, + "loss": 0.1843, + "step": 25724 + }, + { + "epoch": 0.4776049576159481, + "grad_norm": 0.3687516152858734, + "learning_rate": 1.0702813991231363e-05, + "loss": 0.1912, + "step": 25726 + }, + { + "epoch": 0.47764208775336675, + "grad_norm": 0.45923924446105957, + "learning_rate": 1.0701650380281552e-05, + "loss": 0.452, + "step": 25728 + }, + { + "epoch": 0.47767921789078543, + "grad_norm": 0.3328072428703308, + "learning_rate": 1.0700486759784385e-05, + "loss": 0.3818, + "step": 25730 + }, + { + "epoch": 0.47771634802820406, + "grad_norm": 0.2896766662597656, + "learning_rate": 1.0699323129755692e-05, + "loss": 0.3868, + "step": 25732 + }, + { + "epoch": 0.4777534781656227, + "grad_norm": 0.41523781418800354, + "learning_rate": 1.0698159490211304e-05, + "loss": 0.1679, + "step": 25734 + }, + { + "epoch": 0.4777906083030413, + "grad_norm": 0.33768758177757263, + "learning_rate": 1.0696995841167055e-05, + "loss": 0.2751, + "step": 25736 + }, + { + "epoch": 0.47782773844045995, + "grad_norm": 0.378711462020874, + "learning_rate": 1.0695832182638783e-05, + "loss": 0.3012, + "step": 25738 + }, + { + "epoch": 0.47786486857787863, + "grad_norm": 0.41520217061042786, + "learning_rate": 1.069466851464232e-05, + "loss": 0.2973, + "step": 25740 + }, + { + "epoch": 0.47790199871529726, + "grad_norm": 0.5890416502952576, + "learning_rate": 1.0693504837193495e-05, + "loss": 0.1234, + "step": 25742 + }, + { + "epoch": 0.4779391288527159, + "grad_norm": 0.33692285418510437, + "learning_rate": 1.0692341150308153e-05, + "loss": 0.386, + "step": 25744 + }, + { + "epoch": 0.4779762589901345, + "grad_norm": 0.5813331604003906, + "learning_rate": 1.0691177454002115e-05, + "loss": 0.348, + "step": 25746 + }, + { + "epoch": 0.47801338912755315, + "grad_norm": 0.36305853724479675, + "learning_rate": 1.0690013748291226e-05, + "loss": 0.3424, + "step": 25748 + }, + { + "epoch": 0.4780505192649718, + "grad_norm": 0.5061136484146118, + "learning_rate": 1.0688850033191317e-05, + "loss": 0.679, + "step": 25750 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.3417928218841553, + "learning_rate": 1.0687686308718221e-05, + "loss": 0.2822, + "step": 25752 + }, + { + "epoch": 0.4781247795398091, + "grad_norm": 0.5205857157707214, + "learning_rate": 1.0686522574887774e-05, + "loss": 0.2745, + "step": 25754 + }, + { + "epoch": 0.4781619096772277, + "grad_norm": 0.30825477838516235, + "learning_rate": 1.0685358831715811e-05, + "loss": 0.4334, + "step": 25756 + }, + { + "epoch": 0.47819903981464634, + "grad_norm": 0.26875045895576477, + "learning_rate": 1.068419507921817e-05, + "loss": 0.4397, + "step": 25758 + }, + { + "epoch": 0.47823616995206497, + "grad_norm": 0.6560484766960144, + "learning_rate": 1.068303131741068e-05, + "loss": 0.3197, + "step": 25760 + }, + { + "epoch": 0.47827330008948366, + "grad_norm": 0.2730303704738617, + "learning_rate": 1.068186754630918e-05, + "loss": 0.3246, + "step": 25762 + }, + { + "epoch": 0.4783104302269023, + "grad_norm": 0.8990581035614014, + "learning_rate": 1.0680703765929508e-05, + "loss": 0.1455, + "step": 25764 + }, + { + "epoch": 0.4783475603643209, + "grad_norm": 0.257618248462677, + "learning_rate": 1.0679539976287492e-05, + "loss": 0.2712, + "step": 25766 + }, + { + "epoch": 0.47838469050173954, + "grad_norm": 0.2745590806007385, + "learning_rate": 1.0678376177398974e-05, + "loss": 0.3214, + "step": 25768 + }, + { + "epoch": 0.47842182063915817, + "grad_norm": 0.30385804176330566, + "learning_rate": 1.067721236927979e-05, + "loss": 0.3032, + "step": 25770 + }, + { + "epoch": 0.4784589507765768, + "grad_norm": 0.5318928956985474, + "learning_rate": 1.0676048551945775e-05, + "loss": 0.3133, + "step": 25772 + }, + { + "epoch": 0.4784960809139955, + "grad_norm": 0.5492009520530701, + "learning_rate": 1.0674884725412761e-05, + "loss": 0.4143, + "step": 25774 + }, + { + "epoch": 0.4785332110514141, + "grad_norm": 0.4106338322162628, + "learning_rate": 1.0673720889696591e-05, + "loss": 0.299, + "step": 25776 + }, + { + "epoch": 0.47857034118883274, + "grad_norm": 0.3753180503845215, + "learning_rate": 1.0672557044813094e-05, + "loss": 0.3786, + "step": 25778 + }, + { + "epoch": 0.47860747132625137, + "grad_norm": 0.3486310839653015, + "learning_rate": 1.0671393190778112e-05, + "loss": 0.4971, + "step": 25780 + }, + { + "epoch": 0.47864460146367, + "grad_norm": 0.7115786075592041, + "learning_rate": 1.067022932760748e-05, + "loss": 0.4516, + "step": 25782 + }, + { + "epoch": 0.4786817316010887, + "grad_norm": 0.38976845145225525, + "learning_rate": 1.0669065455317037e-05, + "loss": 0.209, + "step": 25784 + }, + { + "epoch": 0.4787188617385073, + "grad_norm": 0.225276917219162, + "learning_rate": 1.0667901573922612e-05, + "loss": 0.333, + "step": 25786 + }, + { + "epoch": 0.47875599187592593, + "grad_norm": 0.38125425577163696, + "learning_rate": 1.066673768344005e-05, + "loss": 0.2695, + "step": 25788 + }, + { + "epoch": 0.47879312201334456, + "grad_norm": 0.3854677081108093, + "learning_rate": 1.0665573783885184e-05, + "loss": 0.2037, + "step": 25790 + }, + { + "epoch": 0.4788302521507632, + "grad_norm": 0.34418985247612, + "learning_rate": 1.0664409875273853e-05, + "loss": 0.1517, + "step": 25792 + }, + { + "epoch": 0.4788673822881819, + "grad_norm": 0.4653046727180481, + "learning_rate": 1.0663245957621892e-05, + "loss": 0.3215, + "step": 25794 + }, + { + "epoch": 0.4789045124256005, + "grad_norm": 0.2933042347431183, + "learning_rate": 1.0662082030945144e-05, + "loss": 0.2735, + "step": 25796 + }, + { + "epoch": 0.47894164256301913, + "grad_norm": 0.220557302236557, + "learning_rate": 1.066091809525944e-05, + "loss": 0.0732, + "step": 25798 + }, + { + "epoch": 0.47897877270043776, + "grad_norm": 0.3641858398914337, + "learning_rate": 1.0659754150580622e-05, + "loss": 0.2119, + "step": 25800 + }, + { + "epoch": 0.4790159028378564, + "grad_norm": 0.3173206150531769, + "learning_rate": 1.0658590196924526e-05, + "loss": 0.3479, + "step": 25802 + }, + { + "epoch": 0.479053032975275, + "grad_norm": 0.6831305027008057, + "learning_rate": 1.065742623430699e-05, + "loss": 0.387, + "step": 25804 + }, + { + "epoch": 0.4790901631126937, + "grad_norm": 0.3711341321468353, + "learning_rate": 1.0656262262743851e-05, + "loss": 0.3475, + "step": 25806 + }, + { + "epoch": 0.47912729325011233, + "grad_norm": 0.3482346534729004, + "learning_rate": 1.0655098282250948e-05, + "loss": 0.3159, + "step": 25808 + }, + { + "epoch": 0.47916442338753096, + "grad_norm": 0.34346863627433777, + "learning_rate": 1.065393429284412e-05, + "loss": 0.239, + "step": 25810 + }, + { + "epoch": 0.4792015535249496, + "grad_norm": 0.3327527642250061, + "learning_rate": 1.0652770294539206e-05, + "loss": 0.1992, + "step": 25812 + }, + { + "epoch": 0.4792386836623682, + "grad_norm": 0.6932603120803833, + "learning_rate": 1.0651606287352042e-05, + "loss": 0.49, + "step": 25814 + }, + { + "epoch": 0.4792758137997869, + "grad_norm": 0.25849881768226624, + "learning_rate": 1.0650442271298472e-05, + "loss": 0.5894, + "step": 25816 + }, + { + "epoch": 0.4793129439372055, + "grad_norm": 0.45103219151496887, + "learning_rate": 1.0649278246394328e-05, + "loss": 0.4128, + "step": 25818 + }, + { + "epoch": 0.47935007407462416, + "grad_norm": 0.3949589133262634, + "learning_rate": 1.0648114212655452e-05, + "loss": 0.3025, + "step": 25820 + }, + { + "epoch": 0.4793872042120428, + "grad_norm": 0.45555379986763, + "learning_rate": 1.0646950170097683e-05, + "loss": 0.1345, + "step": 25822 + }, + { + "epoch": 0.4794243343494614, + "grad_norm": 0.39250314235687256, + "learning_rate": 1.064578611873686e-05, + "loss": 0.2505, + "step": 25824 + }, + { + "epoch": 0.47946146448688004, + "grad_norm": 0.2591609060764313, + "learning_rate": 1.0644622058588825e-05, + "loss": 0.2313, + "step": 25826 + }, + { + "epoch": 0.4794985946242987, + "grad_norm": 0.3499685227870941, + "learning_rate": 1.064345798966941e-05, + "loss": 0.4806, + "step": 25828 + }, + { + "epoch": 0.47953572476171735, + "grad_norm": 0.33018285036087036, + "learning_rate": 1.064229391199446e-05, + "loss": 0.3458, + "step": 25830 + }, + { + "epoch": 0.479572854899136, + "grad_norm": 0.28483545780181885, + "learning_rate": 1.0641129825579817e-05, + "loss": 0.2113, + "step": 25832 + }, + { + "epoch": 0.4796099850365546, + "grad_norm": 0.3753323256969452, + "learning_rate": 1.0639965730441312e-05, + "loss": 0.3736, + "step": 25834 + }, + { + "epoch": 0.47964711517397324, + "grad_norm": 0.3545713424682617, + "learning_rate": 1.0638801626594796e-05, + "loss": 0.2362, + "step": 25836 + }, + { + "epoch": 0.4796842453113919, + "grad_norm": 0.33421653509140015, + "learning_rate": 1.06376375140561e-05, + "loss": 0.3229, + "step": 25838 + }, + { + "epoch": 0.47972137544881055, + "grad_norm": 0.40005314350128174, + "learning_rate": 1.0636473392841069e-05, + "loss": 0.1842, + "step": 25840 + }, + { + "epoch": 0.4797585055862292, + "grad_norm": 0.3972844183444977, + "learning_rate": 1.063530926296554e-05, + "loss": 0.0998, + "step": 25842 + }, + { + "epoch": 0.4797956357236478, + "grad_norm": 0.5402910709381104, + "learning_rate": 1.0634145124445356e-05, + "loss": 0.2587, + "step": 25844 + }, + { + "epoch": 0.47983276586106643, + "grad_norm": 0.379145085811615, + "learning_rate": 1.0632980977296359e-05, + "loss": 0.4205, + "step": 25846 + }, + { + "epoch": 0.47986989599848506, + "grad_norm": 0.3514607548713684, + "learning_rate": 1.0631816821534385e-05, + "loss": 0.3258, + "step": 25848 + }, + { + "epoch": 0.47990702613590375, + "grad_norm": 0.28997018933296204, + "learning_rate": 1.0630652657175273e-05, + "loss": 0.2114, + "step": 25850 + }, + { + "epoch": 0.4799441562733224, + "grad_norm": 0.27410557866096497, + "learning_rate": 1.0629488484234871e-05, + "loss": 0.2977, + "step": 25852 + }, + { + "epoch": 0.479981286410741, + "grad_norm": 0.5679981112480164, + "learning_rate": 1.0628324302729013e-05, + "loss": 0.2115, + "step": 25854 + }, + { + "epoch": 0.48001841654815963, + "grad_norm": 0.3682366609573364, + "learning_rate": 1.0627160112673549e-05, + "loss": 0.4061, + "step": 25856 + }, + { + "epoch": 0.48005554668557826, + "grad_norm": 0.3065265715122223, + "learning_rate": 1.0625995914084313e-05, + "loss": 0.3097, + "step": 25858 + }, + { + "epoch": 0.48009267682299694, + "grad_norm": 0.3950786292552948, + "learning_rate": 1.0624831706977144e-05, + "loss": 0.2392, + "step": 25860 + }, + { + "epoch": 0.4801298069604156, + "grad_norm": 0.5891554951667786, + "learning_rate": 1.0623667491367886e-05, + "loss": 0.3198, + "step": 25862 + }, + { + "epoch": 0.4801669370978342, + "grad_norm": 0.3438788652420044, + "learning_rate": 1.0622503267272387e-05, + "loss": 0.3419, + "step": 25864 + }, + { + "epoch": 0.48020406723525283, + "grad_norm": 0.31763380765914917, + "learning_rate": 1.062133903470648e-05, + "loss": 0.3149, + "step": 25866 + }, + { + "epoch": 0.48024119737267146, + "grad_norm": 0.47302332520484924, + "learning_rate": 1.0620174793686012e-05, + "loss": 0.2338, + "step": 25868 + }, + { + "epoch": 0.48027832751009014, + "grad_norm": 0.33269596099853516, + "learning_rate": 1.061901054422682e-05, + "loss": 0.3203, + "step": 25870 + }, + { + "epoch": 0.48031545764750877, + "grad_norm": 0.3056771159172058, + "learning_rate": 1.0617846286344749e-05, + "loss": 0.4243, + "step": 25872 + }, + { + "epoch": 0.4803525877849274, + "grad_norm": 0.4264232814311981, + "learning_rate": 1.061668202005564e-05, + "loss": 0.3966, + "step": 25874 + }, + { + "epoch": 0.480389717922346, + "grad_norm": 0.25301283597946167, + "learning_rate": 1.061551774537534e-05, + "loss": 0.2643, + "step": 25876 + }, + { + "epoch": 0.48042684805976466, + "grad_norm": 0.3553018569946289, + "learning_rate": 1.0614353462319684e-05, + "loss": 0.1603, + "step": 25878 + }, + { + "epoch": 0.4804639781971833, + "grad_norm": 0.35869327187538147, + "learning_rate": 1.0613189170904518e-05, + "loss": 0.2116, + "step": 25880 + }, + { + "epoch": 0.48050110833460197, + "grad_norm": 0.3207240402698517, + "learning_rate": 1.0612024871145683e-05, + "loss": 0.2569, + "step": 25882 + }, + { + "epoch": 0.4805382384720206, + "grad_norm": 0.3287566900253296, + "learning_rate": 1.0610860563059025e-05, + "loss": 0.3241, + "step": 25884 + }, + { + "epoch": 0.4805753686094392, + "grad_norm": 0.4188264310359955, + "learning_rate": 1.0609696246660382e-05, + "loss": 0.288, + "step": 25886 + }, + { + "epoch": 0.48061249874685785, + "grad_norm": 0.2996284067630768, + "learning_rate": 1.0608531921965602e-05, + "loss": 0.2205, + "step": 25888 + }, + { + "epoch": 0.4806496288842765, + "grad_norm": 0.34011662006378174, + "learning_rate": 1.0607367588990525e-05, + "loss": 0.1616, + "step": 25890 + }, + { + "epoch": 0.48068675902169516, + "grad_norm": 0.50629061460495, + "learning_rate": 1.0606203247750993e-05, + "loss": 0.2595, + "step": 25892 + }, + { + "epoch": 0.4807238891591138, + "grad_norm": 0.296122282743454, + "learning_rate": 1.060503889826285e-05, + "loss": 0.1873, + "step": 25894 + }, + { + "epoch": 0.4807610192965324, + "grad_norm": 0.4013459384441376, + "learning_rate": 1.060387454054194e-05, + "loss": 0.3102, + "step": 25896 + }, + { + "epoch": 0.48079814943395105, + "grad_norm": 0.4383610188961029, + "learning_rate": 1.0602710174604107e-05, + "loss": 0.2114, + "step": 25898 + }, + { + "epoch": 0.4808352795713697, + "grad_norm": 0.4170689284801483, + "learning_rate": 1.0601545800465197e-05, + "loss": 0.2498, + "step": 25900 + }, + { + "epoch": 0.4808724097087883, + "grad_norm": 0.5081580877304077, + "learning_rate": 1.0600381418141047e-05, + "loss": 0.3481, + "step": 25902 + }, + { + "epoch": 0.480909539846207, + "grad_norm": 0.3706102669239044, + "learning_rate": 1.0599217027647504e-05, + "loss": 0.2647, + "step": 25904 + }, + { + "epoch": 0.4809466699836256, + "grad_norm": 0.49083277583122253, + "learning_rate": 1.0598052629000412e-05, + "loss": 0.2301, + "step": 25906 + }, + { + "epoch": 0.48098380012104425, + "grad_norm": 0.36529672145843506, + "learning_rate": 1.0596888222215618e-05, + "loss": 0.3074, + "step": 25908 + }, + { + "epoch": 0.4810209302584629, + "grad_norm": 0.24706918001174927, + "learning_rate": 1.0595723807308963e-05, + "loss": 0.2356, + "step": 25910 + }, + { + "epoch": 0.4810580603958815, + "grad_norm": 0.571923017501831, + "learning_rate": 1.0594559384296286e-05, + "loss": 0.3954, + "step": 25912 + }, + { + "epoch": 0.4810951905333002, + "grad_norm": 0.42421993613243103, + "learning_rate": 1.0593394953193443e-05, + "loss": 0.132, + "step": 25914 + }, + { + "epoch": 0.4811323206707188, + "grad_norm": 0.3086685836315155, + "learning_rate": 1.0592230514016268e-05, + "loss": 0.3092, + "step": 25916 + }, + { + "epoch": 0.48116945080813744, + "grad_norm": 0.42523038387298584, + "learning_rate": 1.0591066066780609e-05, + "loss": 0.3185, + "step": 25918 + }, + { + "epoch": 0.4812065809455561, + "grad_norm": 0.3302091658115387, + "learning_rate": 1.0589901611502315e-05, + "loss": 0.2911, + "step": 25920 + }, + { + "epoch": 0.4812437110829747, + "grad_norm": 0.4887673556804657, + "learning_rate": 1.0588737148197225e-05, + "loss": 0.3035, + "step": 25922 + }, + { + "epoch": 0.48128084122039333, + "grad_norm": 0.4397135376930237, + "learning_rate": 1.0587572676881186e-05, + "loss": 0.2615, + "step": 25924 + }, + { + "epoch": 0.481317971357812, + "grad_norm": 0.39657092094421387, + "learning_rate": 1.0586408197570041e-05, + "loss": 0.3352, + "step": 25926 + }, + { + "epoch": 0.48135510149523064, + "grad_norm": 0.3883914351463318, + "learning_rate": 1.0585243710279643e-05, + "loss": 0.3096, + "step": 25928 + }, + { + "epoch": 0.48139223163264927, + "grad_norm": 0.36172398924827576, + "learning_rate": 1.0584079215025826e-05, + "loss": 0.3868, + "step": 25930 + }, + { + "epoch": 0.4814293617700679, + "grad_norm": 0.30220121145248413, + "learning_rate": 1.058291471182444e-05, + "loss": 0.2566, + "step": 25932 + }, + { + "epoch": 0.4814664919074865, + "grad_norm": 0.517325758934021, + "learning_rate": 1.0581750200691334e-05, + "loss": 0.2998, + "step": 25934 + }, + { + "epoch": 0.4815036220449052, + "grad_norm": 0.3371557593345642, + "learning_rate": 1.0580585681642348e-05, + "loss": 0.27, + "step": 25936 + }, + { + "epoch": 0.48154075218232384, + "grad_norm": 0.420112669467926, + "learning_rate": 1.057942115469333e-05, + "loss": 0.219, + "step": 25938 + }, + { + "epoch": 0.48157788231974247, + "grad_norm": 0.4462807774543762, + "learning_rate": 1.0578256619860128e-05, + "loss": 0.1767, + "step": 25940 + }, + { + "epoch": 0.4816150124571611, + "grad_norm": 0.24513261020183563, + "learning_rate": 1.0577092077158583e-05, + "loss": 0.3393, + "step": 25942 + }, + { + "epoch": 0.4816521425945797, + "grad_norm": 0.22974194586277008, + "learning_rate": 1.0575927526604544e-05, + "loss": 0.3438, + "step": 25944 + }, + { + "epoch": 0.4816892727319984, + "grad_norm": 0.6168479323387146, + "learning_rate": 1.0574762968213857e-05, + "loss": 0.1242, + "step": 25946 + }, + { + "epoch": 0.48172640286941704, + "grad_norm": 0.25510695576667786, + "learning_rate": 1.0573598402002368e-05, + "loss": 0.1252, + "step": 25948 + }, + { + "epoch": 0.48176353300683566, + "grad_norm": 0.26457667350769043, + "learning_rate": 1.057243382798592e-05, + "loss": 0.3293, + "step": 25950 + }, + { + "epoch": 0.4818006631442543, + "grad_norm": 0.23546819388866425, + "learning_rate": 1.0571269246180365e-05, + "loss": 0.1639, + "step": 25952 + }, + { + "epoch": 0.4818377932816729, + "grad_norm": 0.34259888529777527, + "learning_rate": 1.0570104656601548e-05, + "loss": 0.302, + "step": 25954 + }, + { + "epoch": 0.48187492341909155, + "grad_norm": 0.397570937871933, + "learning_rate": 1.056894005926531e-05, + "loss": 0.2128, + "step": 25956 + }, + { + "epoch": 0.48191205355651023, + "grad_norm": 0.20074734091758728, + "learning_rate": 1.0567775454187505e-05, + "loss": 0.2137, + "step": 25958 + }, + { + "epoch": 0.48194918369392886, + "grad_norm": 0.3152163326740265, + "learning_rate": 1.0566610841383975e-05, + "loss": 0.3188, + "step": 25960 + }, + { + "epoch": 0.4819863138313475, + "grad_norm": 0.5078747868537903, + "learning_rate": 1.0565446220870573e-05, + "loss": 0.1864, + "step": 25962 + }, + { + "epoch": 0.4820234439687661, + "grad_norm": 0.48174622654914856, + "learning_rate": 1.0564281592663135e-05, + "loss": 0.2456, + "step": 25964 + }, + { + "epoch": 0.48206057410618475, + "grad_norm": 0.38263198733329773, + "learning_rate": 1.056311695677752e-05, + "loss": 0.2717, + "step": 25966 + }, + { + "epoch": 0.48209770424360343, + "grad_norm": 0.3269321620464325, + "learning_rate": 1.0561952313229567e-05, + "loss": 0.4477, + "step": 25968 + }, + { + "epoch": 0.48213483438102206, + "grad_norm": 0.34150880575180054, + "learning_rate": 1.0560787662035126e-05, + "loss": 0.1791, + "step": 25970 + }, + { + "epoch": 0.4821719645184407, + "grad_norm": 0.241269052028656, + "learning_rate": 1.0559623003210047e-05, + "loss": 0.1179, + "step": 25972 + }, + { + "epoch": 0.4822090946558593, + "grad_norm": 0.2618664801120758, + "learning_rate": 1.0558458336770173e-05, + "loss": 0.2245, + "step": 25974 + }, + { + "epoch": 0.48224622479327794, + "grad_norm": 0.23175397515296936, + "learning_rate": 1.0557293662731357e-05, + "loss": 0.1045, + "step": 25976 + }, + { + "epoch": 0.4822833549306966, + "grad_norm": 0.3808254897594452, + "learning_rate": 1.0556128981109442e-05, + "loss": 0.2747, + "step": 25978 + }, + { + "epoch": 0.48232048506811526, + "grad_norm": 0.36512550711631775, + "learning_rate": 1.0554964291920276e-05, + "loss": 0.3013, + "step": 25980 + }, + { + "epoch": 0.4823576152055339, + "grad_norm": 0.3891080915927887, + "learning_rate": 1.0553799595179707e-05, + "loss": 0.2552, + "step": 25982 + }, + { + "epoch": 0.4823947453429525, + "grad_norm": 0.6203058362007141, + "learning_rate": 1.055263489090359e-05, + "loss": 0.276, + "step": 25984 + }, + { + "epoch": 0.48243187548037114, + "grad_norm": 0.40197300910949707, + "learning_rate": 1.0551470179107763e-05, + "loss": 0.2428, + "step": 25986 + }, + { + "epoch": 0.48246900561778977, + "grad_norm": 0.2707844078540802, + "learning_rate": 1.055030545980808e-05, + "loss": 0.1677, + "step": 25988 + }, + { + "epoch": 0.48250613575520845, + "grad_norm": 0.45740237832069397, + "learning_rate": 1.0549140733020386e-05, + "loss": 0.218, + "step": 25990 + }, + { + "epoch": 0.4825432658926271, + "grad_norm": 0.38537925481796265, + "learning_rate": 1.0547975998760534e-05, + "loss": 0.2205, + "step": 25992 + }, + { + "epoch": 0.4825803960300457, + "grad_norm": 0.5144960880279541, + "learning_rate": 1.054681125704437e-05, + "loss": 0.3108, + "step": 25994 + }, + { + "epoch": 0.48261752616746434, + "grad_norm": 0.27678626775741577, + "learning_rate": 1.0545646507887744e-05, + "loss": 0.1682, + "step": 25996 + }, + { + "epoch": 0.48265465630488297, + "grad_norm": 0.3633998930454254, + "learning_rate": 1.05444817513065e-05, + "loss": 0.3683, + "step": 25998 + }, + { + "epoch": 0.4826917864423016, + "grad_norm": 0.3564842641353607, + "learning_rate": 1.0543316987316489e-05, + "loss": 0.2931, + "step": 26000 + }, + { + "epoch": 0.4827289165797203, + "grad_norm": 0.3071421682834625, + "learning_rate": 1.0542152215933563e-05, + "loss": 0.3321, + "step": 26002 + }, + { + "epoch": 0.4827660467171389, + "grad_norm": 0.46005627512931824, + "learning_rate": 1.0540987437173571e-05, + "loss": 0.2435, + "step": 26004 + }, + { + "epoch": 0.48280317685455754, + "grad_norm": 0.47856539487838745, + "learning_rate": 1.053982265105236e-05, + "loss": 0.3589, + "step": 26006 + }, + { + "epoch": 0.48284030699197616, + "grad_norm": 0.4329420328140259, + "learning_rate": 1.0538657857585779e-05, + "loss": 0.2202, + "step": 26008 + }, + { + "epoch": 0.4828774371293948, + "grad_norm": 0.3009786307811737, + "learning_rate": 1.0537493056789678e-05, + "loss": 0.1983, + "step": 26010 + }, + { + "epoch": 0.4829145672668135, + "grad_norm": 0.5520566701889038, + "learning_rate": 1.0536328248679908e-05, + "loss": 0.3142, + "step": 26012 + }, + { + "epoch": 0.4829516974042321, + "grad_norm": 0.3896366059780121, + "learning_rate": 1.0535163433272315e-05, + "loss": 0.262, + "step": 26014 + }, + { + "epoch": 0.48298882754165073, + "grad_norm": 0.47369384765625, + "learning_rate": 1.0533998610582754e-05, + "loss": 0.4611, + "step": 26016 + }, + { + "epoch": 0.48302595767906936, + "grad_norm": 0.7444141507148743, + "learning_rate": 1.0532833780627067e-05, + "loss": 0.1764, + "step": 26018 + }, + { + "epoch": 0.483063087816488, + "grad_norm": 0.3476954996585846, + "learning_rate": 1.053166894342111e-05, + "loss": 0.4405, + "step": 26020 + }, + { + "epoch": 0.4831002179539067, + "grad_norm": 0.5313552021980286, + "learning_rate": 1.0530504098980732e-05, + "loss": 0.1211, + "step": 26022 + }, + { + "epoch": 0.4831373480913253, + "grad_norm": 0.40248793363571167, + "learning_rate": 1.052933924732178e-05, + "loss": 0.2196, + "step": 26024 + }, + { + "epoch": 0.48317447822874393, + "grad_norm": 0.39716479182243347, + "learning_rate": 1.0528174388460108e-05, + "loss": 0.3005, + "step": 26026 + }, + { + "epoch": 0.48321160836616256, + "grad_norm": 0.2691101133823395, + "learning_rate": 1.0527009522411567e-05, + "loss": 0.3618, + "step": 26028 + }, + { + "epoch": 0.4832487385035812, + "grad_norm": 0.3173148036003113, + "learning_rate": 1.0525844649192001e-05, + "loss": 0.247, + "step": 26030 + }, + { + "epoch": 0.4832858686409998, + "grad_norm": 0.6390669941902161, + "learning_rate": 1.0524679768817264e-05, + "loss": 0.492, + "step": 26032 + }, + { + "epoch": 0.4833229987784185, + "grad_norm": 0.344378799200058, + "learning_rate": 1.0523514881303211e-05, + "loss": 0.3025, + "step": 26034 + }, + { + "epoch": 0.48336012891583713, + "grad_norm": 0.3831423819065094, + "learning_rate": 1.0522349986665685e-05, + "loss": 0.3224, + "step": 26036 + }, + { + "epoch": 0.48339725905325576, + "grad_norm": 0.42256098985671997, + "learning_rate": 1.0521185084920544e-05, + "loss": 0.1753, + "step": 26038 + }, + { + "epoch": 0.4834343891906744, + "grad_norm": 0.5278465747833252, + "learning_rate": 1.0520020176083634e-05, + "loss": 0.3064, + "step": 26040 + }, + { + "epoch": 0.483471519328093, + "grad_norm": 0.2826898992061615, + "learning_rate": 1.0518855260170803e-05, + "loss": 0.0739, + "step": 26042 + }, + { + "epoch": 0.4835086494655117, + "grad_norm": 0.2939543128013611, + "learning_rate": 1.051769033719791e-05, + "loss": 0.2762, + "step": 26044 + }, + { + "epoch": 0.4835457796029303, + "grad_norm": 0.35585254430770874, + "learning_rate": 1.0516525407180801e-05, + "loss": 0.377, + "step": 26046 + }, + { + "epoch": 0.48358290974034895, + "grad_norm": 0.3219221830368042, + "learning_rate": 1.0515360470135329e-05, + "loss": 0.3918, + "step": 26048 + }, + { + "epoch": 0.4836200398777676, + "grad_norm": 0.5564090013504028, + "learning_rate": 1.0514195526077344e-05, + "loss": 0.4431, + "step": 26050 + }, + { + "epoch": 0.4836571700151862, + "grad_norm": 0.9560893774032593, + "learning_rate": 1.0513030575022698e-05, + "loss": 0.2544, + "step": 26052 + }, + { + "epoch": 0.48369430015260484, + "grad_norm": 0.21328911185264587, + "learning_rate": 1.0511865616987245e-05, + "loss": 0.3638, + "step": 26054 + }, + { + "epoch": 0.4837314302900235, + "grad_norm": 0.37342801690101624, + "learning_rate": 1.0510700651986829e-05, + "loss": 0.2924, + "step": 26056 + }, + { + "epoch": 0.48376856042744215, + "grad_norm": 0.4090507924556732, + "learning_rate": 1.0509535680037312e-05, + "loss": 0.3017, + "step": 26058 + }, + { + "epoch": 0.4838056905648608, + "grad_norm": 0.5802554488182068, + "learning_rate": 1.050837070115454e-05, + "loss": 0.2673, + "step": 26060 + }, + { + "epoch": 0.4838428207022794, + "grad_norm": 0.31440210342407227, + "learning_rate": 1.0507205715354363e-05, + "loss": 0.2877, + "step": 26062 + }, + { + "epoch": 0.48387995083969804, + "grad_norm": 0.44029486179351807, + "learning_rate": 1.0506040722652635e-05, + "loss": 0.3705, + "step": 26064 + }, + { + "epoch": 0.4839170809771167, + "grad_norm": 0.4129449427127838, + "learning_rate": 1.0504875723065212e-05, + "loss": 0.3597, + "step": 26066 + }, + { + "epoch": 0.48395421111453535, + "grad_norm": 0.30198583006858826, + "learning_rate": 1.050371071660794e-05, + "loss": 0.4239, + "step": 26068 + }, + { + "epoch": 0.483991341251954, + "grad_norm": 0.4920758605003357, + "learning_rate": 1.0502545703296673e-05, + "loss": 0.3841, + "step": 26070 + }, + { + "epoch": 0.4840284713893726, + "grad_norm": 0.9584425091743469, + "learning_rate": 1.0501380683147269e-05, + "loss": 0.2691, + "step": 26072 + }, + { + "epoch": 0.48406560152679123, + "grad_norm": 0.4189384877681732, + "learning_rate": 1.0500215656175571e-05, + "loss": 0.2275, + "step": 26074 + }, + { + "epoch": 0.48410273166420986, + "grad_norm": 0.27028873562812805, + "learning_rate": 1.0499050622397437e-05, + "loss": 0.3389, + "step": 26076 + }, + { + "epoch": 0.48413986180162855, + "grad_norm": 0.39562690258026123, + "learning_rate": 1.0497885581828721e-05, + "loss": 0.2539, + "step": 26078 + }, + { + "epoch": 0.4841769919390472, + "grad_norm": 0.28657814860343933, + "learning_rate": 1.0496720534485274e-05, + "loss": 0.2297, + "step": 26080 + }, + { + "epoch": 0.4842141220764658, + "grad_norm": 0.33497828245162964, + "learning_rate": 1.0495555480382946e-05, + "loss": 0.3391, + "step": 26082 + }, + { + "epoch": 0.48425125221388443, + "grad_norm": 0.3151029050350189, + "learning_rate": 1.049439041953759e-05, + "loss": 0.3636, + "step": 26084 + }, + { + "epoch": 0.48428838235130306, + "grad_norm": 0.37022027373313904, + "learning_rate": 1.0493225351965066e-05, + "loss": 0.1166, + "step": 26086 + }, + { + "epoch": 0.48432551248872174, + "grad_norm": 0.2984742820262909, + "learning_rate": 1.049206027768122e-05, + "loss": 0.2928, + "step": 26088 + }, + { + "epoch": 0.48436264262614037, + "grad_norm": 0.2641547918319702, + "learning_rate": 1.0490895196701909e-05, + "loss": 0.285, + "step": 26090 + }, + { + "epoch": 0.484399772763559, + "grad_norm": 0.24549604952335358, + "learning_rate": 1.0489730109042986e-05, + "loss": 0.2447, + "step": 26092 + }, + { + "epoch": 0.48443690290097763, + "grad_norm": 0.5230951309204102, + "learning_rate": 1.04885650147203e-05, + "loss": 0.433, + "step": 26094 + }, + { + "epoch": 0.48447403303839626, + "grad_norm": 0.30966290831565857, + "learning_rate": 1.0487399913749707e-05, + "loss": 0.2986, + "step": 26096 + }, + { + "epoch": 0.48451116317581494, + "grad_norm": 0.4245856702327728, + "learning_rate": 1.0486234806147064e-05, + "loss": 0.4317, + "step": 26098 + }, + { + "epoch": 0.48454829331323357, + "grad_norm": 0.4193677008152008, + "learning_rate": 1.048506969192822e-05, + "loss": 0.3559, + "step": 26100 + }, + { + "epoch": 0.4845854234506522, + "grad_norm": 0.3671071529388428, + "learning_rate": 1.0483904571109031e-05, + "loss": 0.0657, + "step": 26102 + }, + { + "epoch": 0.4846225535880708, + "grad_norm": 0.3049987256526947, + "learning_rate": 1.0482739443705352e-05, + "loss": 0.1637, + "step": 26104 + }, + { + "epoch": 0.48465968372548945, + "grad_norm": 0.3050650358200073, + "learning_rate": 1.0481574309733032e-05, + "loss": 0.2439, + "step": 26106 + }, + { + "epoch": 0.4846968138629081, + "grad_norm": 0.5235344171524048, + "learning_rate": 1.0480409169207928e-05, + "loss": 0.3414, + "step": 26108 + }, + { + "epoch": 0.48473394400032677, + "grad_norm": 0.28188449144363403, + "learning_rate": 1.0479244022145897e-05, + "loss": 0.2146, + "step": 26110 + }, + { + "epoch": 0.4847710741377454, + "grad_norm": 0.42991575598716736, + "learning_rate": 1.0478078868562789e-05, + "loss": 0.2435, + "step": 26112 + }, + { + "epoch": 0.484808204275164, + "grad_norm": 0.25120148062705994, + "learning_rate": 1.047691370847446e-05, + "loss": 0.3591, + "step": 26114 + }, + { + "epoch": 0.48484533441258265, + "grad_norm": 0.5862785577774048, + "learning_rate": 1.0475748541896765e-05, + "loss": 0.4838, + "step": 26116 + }, + { + "epoch": 0.4848824645500013, + "grad_norm": 0.38598841428756714, + "learning_rate": 1.0474583368845552e-05, + "loss": 0.1274, + "step": 26118 + }, + { + "epoch": 0.48491959468741996, + "grad_norm": 0.39472344517707825, + "learning_rate": 1.0473418189336686e-05, + "loss": 0.3618, + "step": 26120 + }, + { + "epoch": 0.4849567248248386, + "grad_norm": 0.5864745378494263, + "learning_rate": 1.0472253003386017e-05, + "loss": 0.2096, + "step": 26122 + }, + { + "epoch": 0.4849938549622572, + "grad_norm": 0.6655839085578918, + "learning_rate": 1.0471087811009396e-05, + "loss": 0.3477, + "step": 26124 + }, + { + "epoch": 0.48503098509967585, + "grad_norm": 0.27581992745399475, + "learning_rate": 1.0469922612222683e-05, + "loss": 0.3604, + "step": 26126 + }, + { + "epoch": 0.4850681152370945, + "grad_norm": 0.38262125849723816, + "learning_rate": 1.0468757407041728e-05, + "loss": 0.1624, + "step": 26128 + }, + { + "epoch": 0.4851052453745131, + "grad_norm": 0.28196388483047485, + "learning_rate": 1.0467592195482393e-05, + "loss": 0.3331, + "step": 26130 + }, + { + "epoch": 0.4851423755119318, + "grad_norm": 0.3221602439880371, + "learning_rate": 1.046642697756053e-05, + "loss": 0.2333, + "step": 26132 + }, + { + "epoch": 0.4851795056493504, + "grad_norm": 0.2928047180175781, + "learning_rate": 1.0465261753291987e-05, + "loss": 0.1963, + "step": 26134 + }, + { + "epoch": 0.48521663578676905, + "grad_norm": 0.45597997307777405, + "learning_rate": 1.0464096522692628e-05, + "loss": 0.2277, + "step": 26136 + }, + { + "epoch": 0.4852537659241877, + "grad_norm": 0.24692007899284363, + "learning_rate": 1.0462931285778304e-05, + "loss": 0.356, + "step": 26138 + }, + { + "epoch": 0.4852908960616063, + "grad_norm": 0.44605571031570435, + "learning_rate": 1.0461766042564873e-05, + "loss": 0.4442, + "step": 26140 + }, + { + "epoch": 0.485328026199025, + "grad_norm": 0.2032811939716339, + "learning_rate": 1.046060079306819e-05, + "loss": 0.1136, + "step": 26142 + }, + { + "epoch": 0.4853651563364436, + "grad_norm": 0.4138736426830292, + "learning_rate": 1.0459435537304113e-05, + "loss": 0.2724, + "step": 26144 + }, + { + "epoch": 0.48540228647386224, + "grad_norm": 0.5404435992240906, + "learning_rate": 1.0458270275288489e-05, + "loss": 0.3294, + "step": 26146 + }, + { + "epoch": 0.48543941661128087, + "grad_norm": 0.354728102684021, + "learning_rate": 1.045710500703718e-05, + "loss": 0.1184, + "step": 26148 + }, + { + "epoch": 0.4854765467486995, + "grad_norm": 0.3683900535106659, + "learning_rate": 1.0455939732566042e-05, + "loss": 0.2005, + "step": 26150 + }, + { + "epoch": 0.48551367688611813, + "grad_norm": 0.3551870286464691, + "learning_rate": 1.045477445189093e-05, + "loss": 0.1388, + "step": 26152 + }, + { + "epoch": 0.4855508070235368, + "grad_norm": 0.3943800628185272, + "learning_rate": 1.0453609165027702e-05, + "loss": 0.2921, + "step": 26154 + }, + { + "epoch": 0.48558793716095544, + "grad_norm": 0.5617119669914246, + "learning_rate": 1.045244387199221e-05, + "loss": 0.3084, + "step": 26156 + }, + { + "epoch": 0.48562506729837407, + "grad_norm": 0.47015583515167236, + "learning_rate": 1.0451278572800312e-05, + "loss": 0.1243, + "step": 26158 + }, + { + "epoch": 0.4856621974357927, + "grad_norm": 0.35810357332229614, + "learning_rate": 1.0450113267467865e-05, + "loss": 0.2815, + "step": 26160 + }, + { + "epoch": 0.4856993275732113, + "grad_norm": 0.6324231624603271, + "learning_rate": 1.0448947956010722e-05, + "loss": 0.2323, + "step": 26162 + }, + { + "epoch": 0.48573645771063, + "grad_norm": 0.35674723982810974, + "learning_rate": 1.0447782638444746e-05, + "loss": 0.2651, + "step": 26164 + }, + { + "epoch": 0.48577358784804864, + "grad_norm": 0.513820469379425, + "learning_rate": 1.0446617314785787e-05, + "loss": 0.2444, + "step": 26166 + }, + { + "epoch": 0.48581071798546727, + "grad_norm": 0.38412192463874817, + "learning_rate": 1.0445451985049706e-05, + "loss": 0.1859, + "step": 26168 + }, + { + "epoch": 0.4858478481228859, + "grad_norm": 0.6000122427940369, + "learning_rate": 1.0444286649252356e-05, + "loss": 0.3476, + "step": 26170 + }, + { + "epoch": 0.4858849782603045, + "grad_norm": 0.4199706017971039, + "learning_rate": 1.0443121307409594e-05, + "loss": 0.171, + "step": 26172 + }, + { + "epoch": 0.4859221083977232, + "grad_norm": 0.28809675574302673, + "learning_rate": 1.0441955959537282e-05, + "loss": 0.3298, + "step": 26174 + }, + { + "epoch": 0.48595923853514184, + "grad_norm": 0.3716817796230316, + "learning_rate": 1.0440790605651274e-05, + "loss": 0.2936, + "step": 26176 + }, + { + "epoch": 0.48599636867256046, + "grad_norm": 0.2462117075920105, + "learning_rate": 1.0439625245767422e-05, + "loss": 0.1726, + "step": 26178 + }, + { + "epoch": 0.4860334988099791, + "grad_norm": 0.16147224605083466, + "learning_rate": 1.043845987990159e-05, + "loss": 0.1436, + "step": 26180 + }, + { + "epoch": 0.4860706289473977, + "grad_norm": 0.2748958170413971, + "learning_rate": 1.0437294508069631e-05, + "loss": 0.2125, + "step": 26182 + }, + { + "epoch": 0.48610775908481635, + "grad_norm": 0.3940703570842743, + "learning_rate": 1.0436129130287405e-05, + "loss": 0.4976, + "step": 26184 + }, + { + "epoch": 0.48614488922223503, + "grad_norm": 0.3703155517578125, + "learning_rate": 1.0434963746570767e-05, + "loss": 0.2278, + "step": 26186 + }, + { + "epoch": 0.48618201935965366, + "grad_norm": 0.3123624324798584, + "learning_rate": 1.0433798356935575e-05, + "loss": 0.0522, + "step": 26188 + }, + { + "epoch": 0.4862191494970723, + "grad_norm": 0.3375261723995209, + "learning_rate": 1.0432632961397686e-05, + "loss": 0.1961, + "step": 26190 + }, + { + "epoch": 0.4862562796344909, + "grad_norm": 0.27479127049446106, + "learning_rate": 1.0431467559972961e-05, + "loss": 0.4013, + "step": 26192 + }, + { + "epoch": 0.48629340977190955, + "grad_norm": 0.37649446725845337, + "learning_rate": 1.0430302152677251e-05, + "loss": 0.41, + "step": 26194 + }, + { + "epoch": 0.48633053990932823, + "grad_norm": 0.41825172305107117, + "learning_rate": 1.0429136739526423e-05, + "loss": 0.319, + "step": 26196 + }, + { + "epoch": 0.48636767004674686, + "grad_norm": 0.3431108593940735, + "learning_rate": 1.0427971320536325e-05, + "loss": 0.4502, + "step": 26198 + }, + { + "epoch": 0.4864048001841655, + "grad_norm": 0.34142208099365234, + "learning_rate": 1.042680589572282e-05, + "loss": 0.2738, + "step": 26200 + }, + { + "epoch": 0.4864419303215841, + "grad_norm": 0.2936791777610779, + "learning_rate": 1.0425640465101765e-05, + "loss": 0.3192, + "step": 26202 + }, + { + "epoch": 0.48647906045900274, + "grad_norm": 0.5433480143547058, + "learning_rate": 1.0424475028689021e-05, + "loss": 0.3617, + "step": 26204 + }, + { + "epoch": 0.48651619059642137, + "grad_norm": 0.3263556957244873, + "learning_rate": 1.042330958650044e-05, + "loss": 0.2207, + "step": 26206 + }, + { + "epoch": 0.48655332073384006, + "grad_norm": 0.44900795817375183, + "learning_rate": 1.0422144138551885e-05, + "loss": 0.2401, + "step": 26208 + }, + { + "epoch": 0.4865904508712587, + "grad_norm": 0.3900314271450043, + "learning_rate": 1.0420978684859212e-05, + "loss": 0.3598, + "step": 26210 + }, + { + "epoch": 0.4866275810086773, + "grad_norm": 0.336601197719574, + "learning_rate": 1.041981322543828e-05, + "loss": 0.4088, + "step": 26212 + }, + { + "epoch": 0.48666471114609594, + "grad_norm": 0.2085847407579422, + "learning_rate": 1.0418647760304948e-05, + "loss": 0.2696, + "step": 26214 + }, + { + "epoch": 0.48670184128351457, + "grad_norm": 0.23566988110542297, + "learning_rate": 1.0417482289475074e-05, + "loss": 0.278, + "step": 26216 + }, + { + "epoch": 0.48673897142093325, + "grad_norm": 0.36089327931404114, + "learning_rate": 1.0416316812964517e-05, + "loss": 0.1912, + "step": 26218 + }, + { + "epoch": 0.4867761015583519, + "grad_norm": 0.4002305865287781, + "learning_rate": 1.0415151330789135e-05, + "loss": 0.2394, + "step": 26220 + }, + { + "epoch": 0.4868132316957705, + "grad_norm": 0.315372496843338, + "learning_rate": 1.0413985842964785e-05, + "loss": 0.2527, + "step": 26222 + }, + { + "epoch": 0.48685036183318914, + "grad_norm": 0.2926592528820038, + "learning_rate": 1.0412820349507332e-05, + "loss": 0.4929, + "step": 26224 + }, + { + "epoch": 0.48688749197060777, + "grad_norm": 0.4769361913204193, + "learning_rate": 1.0411654850432627e-05, + "loss": 0.3774, + "step": 26226 + }, + { + "epoch": 0.4869246221080264, + "grad_norm": 0.46475571393966675, + "learning_rate": 1.0410489345756533e-05, + "loss": 0.2797, + "step": 26228 + }, + { + "epoch": 0.4869617522454451, + "grad_norm": 1.1983230113983154, + "learning_rate": 1.0409323835494912e-05, + "loss": 0.336, + "step": 26230 + }, + { + "epoch": 0.4869988823828637, + "grad_norm": 0.6116618514060974, + "learning_rate": 1.0408158319663612e-05, + "loss": 0.3149, + "step": 26232 + }, + { + "epoch": 0.48703601252028234, + "grad_norm": 0.42179369926452637, + "learning_rate": 1.0406992798278505e-05, + "loss": 0.3866, + "step": 26234 + }, + { + "epoch": 0.48707314265770096, + "grad_norm": 0.4507051706314087, + "learning_rate": 1.0405827271355446e-05, + "loss": 0.4276, + "step": 26236 + }, + { + "epoch": 0.4871102727951196, + "grad_norm": 0.5983268022537231, + "learning_rate": 1.0404661738910293e-05, + "loss": 0.3079, + "step": 26238 + }, + { + "epoch": 0.4871474029325383, + "grad_norm": 0.353651762008667, + "learning_rate": 1.0403496200958904e-05, + "loss": 0.2173, + "step": 26240 + }, + { + "epoch": 0.4871845330699569, + "grad_norm": 0.5611802339553833, + "learning_rate": 1.040233065751714e-05, + "loss": 0.1258, + "step": 26242 + }, + { + "epoch": 0.48722166320737553, + "grad_norm": 0.35338711738586426, + "learning_rate": 1.0401165108600863e-05, + "loss": 0.3382, + "step": 26244 + }, + { + "epoch": 0.48725879334479416, + "grad_norm": 0.4124941825866699, + "learning_rate": 1.0399999554225928e-05, + "loss": 0.3234, + "step": 26246 + }, + { + "epoch": 0.4872959234822128, + "grad_norm": 0.41682717204093933, + "learning_rate": 1.03988339944082e-05, + "loss": 0.468, + "step": 26248 + }, + { + "epoch": 0.4873330536196315, + "grad_norm": 0.36976101994514465, + "learning_rate": 1.0397668429163537e-05, + "loss": 0.4514, + "step": 26250 + }, + { + "epoch": 0.4873701837570501, + "grad_norm": 0.36929577589035034, + "learning_rate": 1.0396502858507792e-05, + "loss": 0.1731, + "step": 26252 + }, + { + "epoch": 0.48740731389446873, + "grad_norm": 0.4354987144470215, + "learning_rate": 1.0395337282456834e-05, + "loss": 0.2372, + "step": 26254 + }, + { + "epoch": 0.48744444403188736, + "grad_norm": 0.47942647337913513, + "learning_rate": 1.0394171701026522e-05, + "loss": 0.3352, + "step": 26256 + }, + { + "epoch": 0.487481574169306, + "grad_norm": 0.3604249954223633, + "learning_rate": 1.0393006114232712e-05, + "loss": 0.2125, + "step": 26258 + }, + { + "epoch": 0.4875187043067246, + "grad_norm": 0.2972436547279358, + "learning_rate": 1.0391840522091265e-05, + "loss": 0.3072, + "step": 26260 + }, + { + "epoch": 0.4875558344441433, + "grad_norm": 0.21511903405189514, + "learning_rate": 1.0390674924618046e-05, + "loss": 0.2936, + "step": 26262 + }, + { + "epoch": 0.4875929645815619, + "grad_norm": 0.33157965540885925, + "learning_rate": 1.0389509321828905e-05, + "loss": 0.1684, + "step": 26264 + }, + { + "epoch": 0.48763009471898056, + "grad_norm": 0.6565263867378235, + "learning_rate": 1.0388343713739712e-05, + "loss": 0.2897, + "step": 26266 + }, + { + "epoch": 0.4876672248563992, + "grad_norm": 0.3367452919483185, + "learning_rate": 1.0387178100366326e-05, + "loss": 0.1929, + "step": 26268 + }, + { + "epoch": 0.4877043549938178, + "grad_norm": 0.3277583420276642, + "learning_rate": 1.0386012481724606e-05, + "loss": 0.1916, + "step": 26270 + }, + { + "epoch": 0.4877414851312365, + "grad_norm": 0.37614673376083374, + "learning_rate": 1.038484685783041e-05, + "loss": 0.2698, + "step": 26272 + }, + { + "epoch": 0.4877786152686551, + "grad_norm": 0.5407077074050903, + "learning_rate": 1.0383681228699602e-05, + "loss": 0.2743, + "step": 26274 + }, + { + "epoch": 0.48781574540607375, + "grad_norm": 0.4296466112136841, + "learning_rate": 1.0382515594348042e-05, + "loss": 0.206, + "step": 26276 + }, + { + "epoch": 0.4878528755434924, + "grad_norm": 0.31616124510765076, + "learning_rate": 1.0381349954791588e-05, + "loss": 0.3692, + "step": 26278 + }, + { + "epoch": 0.487890005680911, + "grad_norm": 0.4597640633583069, + "learning_rate": 1.0380184310046106e-05, + "loss": 0.2951, + "step": 26280 + }, + { + "epoch": 0.48792713581832964, + "grad_norm": 0.3943755626678467, + "learning_rate": 1.0379018660127456e-05, + "loss": 0.2196, + "step": 26282 + }, + { + "epoch": 0.4879642659557483, + "grad_norm": 0.30830249190330505, + "learning_rate": 1.0377853005051495e-05, + "loss": 0.3263, + "step": 26284 + }, + { + "epoch": 0.48800139609316695, + "grad_norm": 0.2921684682369232, + "learning_rate": 1.0376687344834087e-05, + "loss": 0.2259, + "step": 26286 + }, + { + "epoch": 0.4880385262305856, + "grad_norm": 0.27995535731315613, + "learning_rate": 1.0375521679491089e-05, + "loss": 0.287, + "step": 26288 + }, + { + "epoch": 0.4880756563680042, + "grad_norm": 0.3882593512535095, + "learning_rate": 1.0374356009038368e-05, + "loss": 0.3016, + "step": 26290 + }, + { + "epoch": 0.48811278650542284, + "grad_norm": 0.32382476329803467, + "learning_rate": 1.0373190333491788e-05, + "loss": 0.2859, + "step": 26292 + }, + { + "epoch": 0.4881499166428415, + "grad_norm": 0.4555091857910156, + "learning_rate": 1.03720246528672e-05, + "loss": 0.3855, + "step": 26294 + }, + { + "epoch": 0.48818704678026015, + "grad_norm": 0.23767714202404022, + "learning_rate": 1.0370858967180472e-05, + "loss": 0.081, + "step": 26296 + }, + { + "epoch": 0.4882241769176788, + "grad_norm": 0.3204522430896759, + "learning_rate": 1.0369693276447464e-05, + "loss": 0.22, + "step": 26298 + }, + { + "epoch": 0.4882613070550974, + "grad_norm": 0.3012599050998688, + "learning_rate": 1.0368527580684038e-05, + "loss": 0.3562, + "step": 26300 + }, + { + "epoch": 0.48829843719251603, + "grad_norm": 0.29556670784950256, + "learning_rate": 1.0367361879906057e-05, + "loss": 0.4121, + "step": 26302 + }, + { + "epoch": 0.48833556732993466, + "grad_norm": 0.26569175720214844, + "learning_rate": 1.0366196174129379e-05, + "loss": 0.2434, + "step": 26304 + }, + { + "epoch": 0.48837269746735334, + "grad_norm": 0.4446493089199066, + "learning_rate": 1.0365030463369869e-05, + "loss": 0.4108, + "step": 26306 + }, + { + "epoch": 0.488409827604772, + "grad_norm": 0.42011359333992004, + "learning_rate": 1.0363864747643388e-05, + "loss": 0.4548, + "step": 26308 + }, + { + "epoch": 0.4884469577421906, + "grad_norm": 0.28344857692718506, + "learning_rate": 1.0362699026965798e-05, + "loss": 0.2177, + "step": 26310 + }, + { + "epoch": 0.48848408787960923, + "grad_norm": 0.26737070083618164, + "learning_rate": 1.036153330135296e-05, + "loss": 0.2392, + "step": 26312 + }, + { + "epoch": 0.48852121801702786, + "grad_norm": 0.4713113009929657, + "learning_rate": 1.0360367570820737e-05, + "loss": 0.2023, + "step": 26314 + }, + { + "epoch": 0.48855834815444654, + "grad_norm": 0.42013394832611084, + "learning_rate": 1.0359201835384989e-05, + "loss": 0.2361, + "step": 26316 + }, + { + "epoch": 0.48859547829186517, + "grad_norm": 0.302665650844574, + "learning_rate": 1.0358036095061583e-05, + "loss": 0.2088, + "step": 26318 + }, + { + "epoch": 0.4886326084292838, + "grad_norm": 0.5672982335090637, + "learning_rate": 1.0356870349866376e-05, + "loss": 0.4448, + "step": 26320 + }, + { + "epoch": 0.4886697385667024, + "grad_norm": 0.474040150642395, + "learning_rate": 1.0355704599815235e-05, + "loss": 0.3476, + "step": 26322 + }, + { + "epoch": 0.48870686870412106, + "grad_norm": 0.6736011505126953, + "learning_rate": 1.0354538844924018e-05, + "loss": 0.331, + "step": 26324 + }, + { + "epoch": 0.48874399884153974, + "grad_norm": 0.28990036249160767, + "learning_rate": 1.0353373085208588e-05, + "loss": 0.4096, + "step": 26326 + }, + { + "epoch": 0.48878112897895837, + "grad_norm": 0.33474770188331604, + "learning_rate": 1.035220732068481e-05, + "loss": 0.2461, + "step": 26328 + }, + { + "epoch": 0.488818259116377, + "grad_norm": 0.2854396402835846, + "learning_rate": 1.0351041551368545e-05, + "loss": 0.2418, + "step": 26330 + }, + { + "epoch": 0.4888553892537956, + "grad_norm": 0.24770832061767578, + "learning_rate": 1.0349875777275658e-05, + "loss": 0.2065, + "step": 26332 + }, + { + "epoch": 0.48889251939121425, + "grad_norm": 0.3799992799758911, + "learning_rate": 1.0348709998422009e-05, + "loss": 0.3352, + "step": 26334 + }, + { + "epoch": 0.4889296495286329, + "grad_norm": 0.4896249771118164, + "learning_rate": 1.0347544214823457e-05, + "loss": 0.2304, + "step": 26336 + }, + { + "epoch": 0.48896677966605157, + "grad_norm": 0.5061522722244263, + "learning_rate": 1.0346378426495876e-05, + "loss": 0.2901, + "step": 26338 + }, + { + "epoch": 0.4890039098034702, + "grad_norm": 0.3306157886981964, + "learning_rate": 1.0345212633455118e-05, + "loss": 0.1861, + "step": 26340 + }, + { + "epoch": 0.4890410399408888, + "grad_norm": 0.45157331228256226, + "learning_rate": 1.034404683571705e-05, + "loss": 0.3889, + "step": 26342 + }, + { + "epoch": 0.48907817007830745, + "grad_norm": 0.27555394172668457, + "learning_rate": 1.0342881033297536e-05, + "loss": 0.3142, + "step": 26344 + }, + { + "epoch": 0.4891153002157261, + "grad_norm": 0.5186547636985779, + "learning_rate": 1.0341715226212438e-05, + "loss": 0.263, + "step": 26346 + }, + { + "epoch": 0.48915243035314476, + "grad_norm": 0.3780006766319275, + "learning_rate": 1.034054941447762e-05, + "loss": 0.4711, + "step": 26348 + }, + { + "epoch": 0.4891895604905634, + "grad_norm": 0.4200582802295685, + "learning_rate": 1.0339383598108946e-05, + "loss": 0.1823, + "step": 26350 + }, + { + "epoch": 0.489226690627982, + "grad_norm": 0.2983166575431824, + "learning_rate": 1.0338217777122275e-05, + "loss": 0.1176, + "step": 26352 + }, + { + "epoch": 0.48926382076540065, + "grad_norm": 0.367590993642807, + "learning_rate": 1.0337051951533474e-05, + "loss": 0.2617, + "step": 26354 + }, + { + "epoch": 0.4893009509028193, + "grad_norm": 0.23578345775604248, + "learning_rate": 1.0335886121358407e-05, + "loss": 0.1151, + "step": 26356 + }, + { + "epoch": 0.4893380810402379, + "grad_norm": 0.34940141439437866, + "learning_rate": 1.0334720286612933e-05, + "loss": 0.3644, + "step": 26358 + }, + { + "epoch": 0.4893752111776566, + "grad_norm": 0.4192293584346771, + "learning_rate": 1.0333554447312919e-05, + "loss": 0.2118, + "step": 26360 + }, + { + "epoch": 0.4894123413150752, + "grad_norm": 0.438501238822937, + "learning_rate": 1.033238860347423e-05, + "loss": 0.2045, + "step": 26362 + }, + { + "epoch": 0.48944947145249385, + "grad_norm": 0.22159920632839203, + "learning_rate": 1.0331222755112726e-05, + "loss": 0.2492, + "step": 26364 + }, + { + "epoch": 0.4894866015899125, + "grad_norm": 0.36134669184684753, + "learning_rate": 1.0330056902244273e-05, + "loss": 0.2277, + "step": 26366 + }, + { + "epoch": 0.4895237317273311, + "grad_norm": 0.2913864254951477, + "learning_rate": 1.0328891044884737e-05, + "loss": 0.349, + "step": 26368 + }, + { + "epoch": 0.4895608618647498, + "grad_norm": 0.3037043809890747, + "learning_rate": 1.0327725183049977e-05, + "loss": 0.2406, + "step": 26370 + }, + { + "epoch": 0.4895979920021684, + "grad_norm": 0.3260821998119354, + "learning_rate": 1.0326559316755855e-05, + "loss": 0.3151, + "step": 26372 + }, + { + "epoch": 0.48963512213958704, + "grad_norm": 0.30364322662353516, + "learning_rate": 1.0325393446018243e-05, + "loss": 0.3556, + "step": 26374 + }, + { + "epoch": 0.48967225227700567, + "grad_norm": 0.2487402707338333, + "learning_rate": 1.0324227570853002e-05, + "loss": 0.353, + "step": 26376 + }, + { + "epoch": 0.4897093824144243, + "grad_norm": 0.5113440155982971, + "learning_rate": 1.0323061691275993e-05, + "loss": 0.205, + "step": 26378 + }, + { + "epoch": 0.4897465125518429, + "grad_norm": 0.4941776692867279, + "learning_rate": 1.0321895807303082e-05, + "loss": 0.3058, + "step": 26380 + }, + { + "epoch": 0.4897836426892616, + "grad_norm": 0.3544394373893738, + "learning_rate": 1.0320729918950134e-05, + "loss": 0.1782, + "step": 26382 + }, + { + "epoch": 0.48982077282668024, + "grad_norm": 0.46059635281562805, + "learning_rate": 1.031956402623301e-05, + "loss": 0.2408, + "step": 26384 + }, + { + "epoch": 0.48985790296409887, + "grad_norm": 0.4019808769226074, + "learning_rate": 1.031839812916758e-05, + "loss": 0.2676, + "step": 26386 + }, + { + "epoch": 0.4898950331015175, + "grad_norm": 0.41258150339126587, + "learning_rate": 1.0317232227769705e-05, + "loss": 0.1422, + "step": 26388 + }, + { + "epoch": 0.4899321632389361, + "grad_norm": 0.39506393671035767, + "learning_rate": 1.0316066322055247e-05, + "loss": 0.2838, + "step": 26390 + }, + { + "epoch": 0.4899692933763548, + "grad_norm": 0.4231836497783661, + "learning_rate": 1.0314900412040073e-05, + "loss": 0.2662, + "step": 26392 + }, + { + "epoch": 0.49000642351377344, + "grad_norm": 0.5631445050239563, + "learning_rate": 1.031373449774005e-05, + "loss": 0.3403, + "step": 26394 + }, + { + "epoch": 0.49004355365119207, + "grad_norm": 0.33038315176963806, + "learning_rate": 1.031256857917104e-05, + "loss": 0.2205, + "step": 26396 + }, + { + "epoch": 0.4900806837886107, + "grad_norm": 0.3521159887313843, + "learning_rate": 1.0311402656348909e-05, + "loss": 0.068, + "step": 26398 + }, + { + "epoch": 0.4901178139260293, + "grad_norm": 0.3315359652042389, + "learning_rate": 1.031023672928952e-05, + "loss": 0.1829, + "step": 26400 + }, + { + "epoch": 0.490154944063448, + "grad_norm": 0.31536856293678284, + "learning_rate": 1.0309070798008736e-05, + "loss": 0.2038, + "step": 26402 + }, + { + "epoch": 0.49019207420086663, + "grad_norm": 0.3454214632511139, + "learning_rate": 1.0307904862522425e-05, + "loss": 0.2291, + "step": 26404 + }, + { + "epoch": 0.49022920433828526, + "grad_norm": 0.32094013690948486, + "learning_rate": 1.0306738922846453e-05, + "loss": 0.2592, + "step": 26406 + }, + { + "epoch": 0.4902663344757039, + "grad_norm": 0.4113997220993042, + "learning_rate": 1.0305572978996683e-05, + "loss": 0.2533, + "step": 26408 + }, + { + "epoch": 0.4903034646131225, + "grad_norm": 0.6455010771751404, + "learning_rate": 1.0304407030988976e-05, + "loss": 0.2174, + "step": 26410 + }, + { + "epoch": 0.49034059475054115, + "grad_norm": 0.4021225869655609, + "learning_rate": 1.0303241078839206e-05, + "loss": 0.2666, + "step": 26412 + }, + { + "epoch": 0.49037772488795983, + "grad_norm": 0.31682640314102173, + "learning_rate": 1.0302075122563229e-05, + "loss": 0.0896, + "step": 26414 + }, + { + "epoch": 0.49041485502537846, + "grad_norm": 0.4463070034980774, + "learning_rate": 1.0300909162176915e-05, + "loss": 0.3754, + "step": 26416 + }, + { + "epoch": 0.4904519851627971, + "grad_norm": 0.3955119550228119, + "learning_rate": 1.0299743197696133e-05, + "loss": 0.2614, + "step": 26418 + }, + { + "epoch": 0.4904891153002157, + "grad_norm": 0.47725868225097656, + "learning_rate": 1.0298577229136741e-05, + "loss": 0.2706, + "step": 26420 + }, + { + "epoch": 0.49052624543763435, + "grad_norm": 0.3843446969985962, + "learning_rate": 1.0297411256514606e-05, + "loss": 0.0535, + "step": 26422 + }, + { + "epoch": 0.49056337557505303, + "grad_norm": 0.6384403109550476, + "learning_rate": 1.0296245279845596e-05, + "loss": 0.4368, + "step": 26424 + }, + { + "epoch": 0.49060050571247166, + "grad_norm": 0.40317273139953613, + "learning_rate": 1.0295079299145576e-05, + "loss": 0.1785, + "step": 26426 + }, + { + "epoch": 0.4906376358498903, + "grad_norm": 0.40376192331314087, + "learning_rate": 1.0293913314430408e-05, + "loss": 0.2981, + "step": 26428 + }, + { + "epoch": 0.4906747659873089, + "grad_norm": 0.292687326669693, + "learning_rate": 1.0292747325715963e-05, + "loss": 0.2739, + "step": 26430 + }, + { + "epoch": 0.49071189612472754, + "grad_norm": 0.47509098052978516, + "learning_rate": 1.0291581333018104e-05, + "loss": 0.2739, + "step": 26432 + }, + { + "epoch": 0.49074902626214617, + "grad_norm": 0.32268375158309937, + "learning_rate": 1.0290415336352692e-05, + "loss": 0.3287, + "step": 26434 + }, + { + "epoch": 0.49078615639956485, + "grad_norm": 0.3302516043186188, + "learning_rate": 1.02892493357356e-05, + "loss": 0.2066, + "step": 26436 + }, + { + "epoch": 0.4908232865369835, + "grad_norm": 0.3668462038040161, + "learning_rate": 1.0288083331182692e-05, + "loss": 0.1934, + "step": 26438 + }, + { + "epoch": 0.4908604166744021, + "grad_norm": 0.3946560323238373, + "learning_rate": 1.0286917322709835e-05, + "loss": 0.268, + "step": 26440 + }, + { + "epoch": 0.49089754681182074, + "grad_norm": 0.3784308433532715, + "learning_rate": 1.0285751310332887e-05, + "loss": 0.433, + "step": 26442 + }, + { + "epoch": 0.49093467694923937, + "grad_norm": 0.4192458689212799, + "learning_rate": 1.0284585294067724e-05, + "loss": 0.349, + "step": 26444 + }, + { + "epoch": 0.49097180708665805, + "grad_norm": 0.24136297404766083, + "learning_rate": 1.0283419273930204e-05, + "loss": 0.1592, + "step": 26446 + }, + { + "epoch": 0.4910089372240767, + "grad_norm": 0.31866827607154846, + "learning_rate": 1.0282253249936198e-05, + "loss": 0.3945, + "step": 26448 + }, + { + "epoch": 0.4910460673614953, + "grad_norm": 0.48707345128059387, + "learning_rate": 1.0281087222101571e-05, + "loss": 0.3245, + "step": 26450 + }, + { + "epoch": 0.49108319749891394, + "grad_norm": 0.29982709884643555, + "learning_rate": 1.027992119044219e-05, + "loss": 0.283, + "step": 26452 + }, + { + "epoch": 0.49112032763633257, + "grad_norm": 1.4749685525894165, + "learning_rate": 1.0278755154973918e-05, + "loss": 0.3387, + "step": 26454 + }, + { + "epoch": 0.4911574577737512, + "grad_norm": 0.24318383634090424, + "learning_rate": 1.0277589115712625e-05, + "loss": 0.1674, + "step": 26456 + }, + { + "epoch": 0.4911945879111699, + "grad_norm": 0.26913678646087646, + "learning_rate": 1.0276423072674175e-05, + "loss": 0.2121, + "step": 26458 + }, + { + "epoch": 0.4912317180485885, + "grad_norm": 0.46446430683135986, + "learning_rate": 1.0275257025874436e-05, + "loss": 0.375, + "step": 26460 + }, + { + "epoch": 0.49126884818600713, + "grad_norm": 0.3124270737171173, + "learning_rate": 1.0274090975329272e-05, + "loss": 0.4954, + "step": 26462 + }, + { + "epoch": 0.49130597832342576, + "grad_norm": 0.4616926908493042, + "learning_rate": 1.0272924921054552e-05, + "loss": 0.3711, + "step": 26464 + }, + { + "epoch": 0.4913431084608444, + "grad_norm": 0.2119692862033844, + "learning_rate": 1.027175886306614e-05, + "loss": 0.2615, + "step": 26466 + }, + { + "epoch": 0.4913802385982631, + "grad_norm": 0.4567694664001465, + "learning_rate": 1.0270592801379902e-05, + "loss": 0.158, + "step": 26468 + }, + { + "epoch": 0.4914173687356817, + "grad_norm": 0.44871777296066284, + "learning_rate": 1.026942673601171e-05, + "loss": 0.1461, + "step": 26470 + }, + { + "epoch": 0.49145449887310033, + "grad_norm": 0.24175108969211578, + "learning_rate": 1.0268260666977428e-05, + "loss": 0.2603, + "step": 26472 + }, + { + "epoch": 0.49149162901051896, + "grad_norm": 0.4066236615180969, + "learning_rate": 1.0267094594292918e-05, + "loss": 0.3231, + "step": 26474 + }, + { + "epoch": 0.4915287591479376, + "grad_norm": 0.5919638276100159, + "learning_rate": 1.0265928517974053e-05, + "loss": 0.3488, + "step": 26476 + }, + { + "epoch": 0.4915658892853563, + "grad_norm": 0.4045667350292206, + "learning_rate": 1.0264762438036695e-05, + "loss": 0.4124, + "step": 26478 + }, + { + "epoch": 0.4916030194227749, + "grad_norm": 0.3669506311416626, + "learning_rate": 1.0263596354496714e-05, + "loss": 0.2583, + "step": 26480 + }, + { + "epoch": 0.49164014956019353, + "grad_norm": 0.4036490023136139, + "learning_rate": 1.0262430267369979e-05, + "loss": 0.4119, + "step": 26482 + }, + { + "epoch": 0.49167727969761216, + "grad_norm": 0.838638186454773, + "learning_rate": 1.0261264176672354e-05, + "loss": 0.4668, + "step": 26484 + }, + { + "epoch": 0.4917144098350308, + "grad_norm": 0.3675265908241272, + "learning_rate": 1.0260098082419702e-05, + "loss": 0.2731, + "step": 26486 + }, + { + "epoch": 0.4917515399724494, + "grad_norm": 0.38461047410964966, + "learning_rate": 1.0258931984627897e-05, + "loss": 0.0963, + "step": 26488 + }, + { + "epoch": 0.4917886701098681, + "grad_norm": 0.40181416273117065, + "learning_rate": 1.0257765883312804e-05, + "loss": 0.4068, + "step": 26490 + }, + { + "epoch": 0.4918258002472867, + "grad_norm": 0.5698877573013306, + "learning_rate": 1.0256599778490289e-05, + "loss": 0.2949, + "step": 26492 + }, + { + "epoch": 0.49186293038470535, + "grad_norm": 0.3826169967651367, + "learning_rate": 1.025543367017622e-05, + "loss": 0.2638, + "step": 26494 + }, + { + "epoch": 0.491900060522124, + "grad_norm": 0.2359253615140915, + "learning_rate": 1.0254267558386461e-05, + "loss": 0.2977, + "step": 26496 + }, + { + "epoch": 0.4919371906595426, + "grad_norm": 0.3967624604701996, + "learning_rate": 1.0253101443136883e-05, + "loss": 0.2672, + "step": 26498 + }, + { + "epoch": 0.4919743207969613, + "grad_norm": 0.33991748094558716, + "learning_rate": 1.0251935324443355e-05, + "loss": 0.3036, + "step": 26500 + }, + { + "epoch": 0.4920114509343799, + "grad_norm": 0.3304373323917389, + "learning_rate": 1.025076920232174e-05, + "loss": 0.3123, + "step": 26502 + }, + { + "epoch": 0.49204858107179855, + "grad_norm": 0.28889793157577515, + "learning_rate": 1.0249603076787908e-05, + "loss": 0.3092, + "step": 26504 + }, + { + "epoch": 0.4920857112092172, + "grad_norm": 0.549604058265686, + "learning_rate": 1.0248436947857725e-05, + "loss": 0.2865, + "step": 26506 + }, + { + "epoch": 0.4921228413466358, + "grad_norm": 0.3449608087539673, + "learning_rate": 1.0247270815547061e-05, + "loss": 0.1442, + "step": 26508 + }, + { + "epoch": 0.49215997148405444, + "grad_norm": 0.30500131845474243, + "learning_rate": 1.024610467987178e-05, + "loss": 0.1518, + "step": 26510 + }, + { + "epoch": 0.4921971016214731, + "grad_norm": 0.53940349817276, + "learning_rate": 1.0244938540847752e-05, + "loss": 0.3516, + "step": 26512 + }, + { + "epoch": 0.49223423175889175, + "grad_norm": 0.428055077791214, + "learning_rate": 1.0243772398490845e-05, + "loss": 0.2377, + "step": 26514 + }, + { + "epoch": 0.4922713618963104, + "grad_norm": 0.4078656733036041, + "learning_rate": 1.0242606252816925e-05, + "loss": 0.3004, + "step": 26516 + }, + { + "epoch": 0.492308492033729, + "grad_norm": 0.34508949518203735, + "learning_rate": 1.024144010384186e-05, + "loss": 0.2106, + "step": 26518 + }, + { + "epoch": 0.49234562217114763, + "grad_norm": 0.4372147023677826, + "learning_rate": 1.0240273951581521e-05, + "loss": 0.2488, + "step": 26520 + }, + { + "epoch": 0.4923827523085663, + "grad_norm": 0.36753395199775696, + "learning_rate": 1.0239107796051768e-05, + "loss": 0.2514, + "step": 26522 + }, + { + "epoch": 0.49241988244598495, + "grad_norm": 0.3852521777153015, + "learning_rate": 1.023794163726848e-05, + "loss": 0.2577, + "step": 26524 + }, + { + "epoch": 0.4924570125834036, + "grad_norm": 0.5499061942100525, + "learning_rate": 1.0236775475247518e-05, + "loss": 0.4012, + "step": 26526 + }, + { + "epoch": 0.4924941427208222, + "grad_norm": 0.37243399024009705, + "learning_rate": 1.0235609310004748e-05, + "loss": 0.268, + "step": 26528 + }, + { + "epoch": 0.49253127285824083, + "grad_norm": 0.24391011893749237, + "learning_rate": 1.023444314155604e-05, + "loss": 0.2759, + "step": 26530 + }, + { + "epoch": 0.49256840299565946, + "grad_norm": 0.34231558442115784, + "learning_rate": 1.0233276969917267e-05, + "loss": 0.4431, + "step": 26532 + }, + { + "epoch": 0.49260553313307814, + "grad_norm": 0.4161551296710968, + "learning_rate": 1.023211079510429e-05, + "loss": 0.1988, + "step": 26534 + }, + { + "epoch": 0.4926426632704968, + "grad_norm": 0.33417993783950806, + "learning_rate": 1.0230944617132985e-05, + "loss": 0.2987, + "step": 26536 + }, + { + "epoch": 0.4926797934079154, + "grad_norm": 0.44479745626449585, + "learning_rate": 1.0229778436019213e-05, + "loss": 0.3708, + "step": 26538 + }, + { + "epoch": 0.49271692354533403, + "grad_norm": 0.38209518790245056, + "learning_rate": 1.0228612251778843e-05, + "loss": 0.3813, + "step": 26540 + }, + { + "epoch": 0.49275405368275266, + "grad_norm": 0.4661446213722229, + "learning_rate": 1.0227446064427746e-05, + "loss": 0.5228, + "step": 26542 + }, + { + "epoch": 0.49279118382017134, + "grad_norm": 0.4897814393043518, + "learning_rate": 1.022627987398179e-05, + "loss": 0.4422, + "step": 26544 + }, + { + "epoch": 0.49282831395758997, + "grad_norm": 0.2601756155490875, + "learning_rate": 1.0225113680456844e-05, + "loss": 0.2809, + "step": 26546 + }, + { + "epoch": 0.4928654440950086, + "grad_norm": 0.3746464252471924, + "learning_rate": 1.0223947483868773e-05, + "loss": 0.414, + "step": 26548 + }, + { + "epoch": 0.4929025742324272, + "grad_norm": 0.4170742332935333, + "learning_rate": 1.0222781284233445e-05, + "loss": 0.2898, + "step": 26550 + }, + { + "epoch": 0.49293970436984585, + "grad_norm": 0.34457671642303467, + "learning_rate": 1.0221615081566737e-05, + "loss": 0.3377, + "step": 26552 + }, + { + "epoch": 0.49297683450726454, + "grad_norm": 0.41983088850975037, + "learning_rate": 1.0220448875884508e-05, + "loss": 0.3021, + "step": 26554 + }, + { + "epoch": 0.49301396464468317, + "grad_norm": 0.3755203187465668, + "learning_rate": 1.021928266720263e-05, + "loss": 0.3243, + "step": 26556 + }, + { + "epoch": 0.4930510947821018, + "grad_norm": 0.38865768909454346, + "learning_rate": 1.0218116455536975e-05, + "loss": 0.26, + "step": 26558 + }, + { + "epoch": 0.4930882249195204, + "grad_norm": 0.3116987943649292, + "learning_rate": 1.0216950240903404e-05, + "loss": 0.1243, + "step": 26560 + }, + { + "epoch": 0.49312535505693905, + "grad_norm": 0.23697133362293243, + "learning_rate": 1.021578402331779e-05, + "loss": 0.2304, + "step": 26562 + }, + { + "epoch": 0.4931624851943577, + "grad_norm": 0.46624237298965454, + "learning_rate": 1.0214617802796005e-05, + "loss": 0.4044, + "step": 26564 + }, + { + "epoch": 0.49319961533177636, + "grad_norm": 0.3294352889060974, + "learning_rate": 1.0213451579353913e-05, + "loss": 0.1986, + "step": 26566 + }, + { + "epoch": 0.493236745469195, + "grad_norm": 0.37446993589401245, + "learning_rate": 1.0212285353007385e-05, + "loss": 0.0786, + "step": 26568 + }, + { + "epoch": 0.4932738756066136, + "grad_norm": 0.2895089089870453, + "learning_rate": 1.0211119123772291e-05, + "loss": 0.1566, + "step": 26570 + }, + { + "epoch": 0.49331100574403225, + "grad_norm": 0.2512679398059845, + "learning_rate": 1.0209952891664495e-05, + "loss": 0.3362, + "step": 26572 + }, + { + "epoch": 0.4933481358814509, + "grad_norm": 0.37703052163124084, + "learning_rate": 1.0208786656699868e-05, + "loss": 0.2679, + "step": 26574 + }, + { + "epoch": 0.49338526601886956, + "grad_norm": 0.49722206592559814, + "learning_rate": 1.0207620418894284e-05, + "loss": 0.3751, + "step": 26576 + }, + { + "epoch": 0.4934223961562882, + "grad_norm": 0.44880250096321106, + "learning_rate": 1.0206454178263605e-05, + "loss": 0.1814, + "step": 26578 + }, + { + "epoch": 0.4934595262937068, + "grad_norm": 0.3312230408191681, + "learning_rate": 1.0205287934823704e-05, + "loss": 0.2786, + "step": 26580 + }, + { + "epoch": 0.49349665643112545, + "grad_norm": 0.42703157663345337, + "learning_rate": 1.0204121688590447e-05, + "loss": 0.3236, + "step": 26582 + }, + { + "epoch": 0.4935337865685441, + "grad_norm": 0.6197587251663208, + "learning_rate": 1.0202955439579709e-05, + "loss": 0.4815, + "step": 26584 + }, + { + "epoch": 0.4935709167059627, + "grad_norm": 0.37159937620162964, + "learning_rate": 1.0201789187807353e-05, + "loss": 0.2774, + "step": 26586 + }, + { + "epoch": 0.4936080468433814, + "grad_norm": 0.40503570437431335, + "learning_rate": 1.0200622933289252e-05, + "loss": 0.2478, + "step": 26588 + }, + { + "epoch": 0.4936451769808, + "grad_norm": 0.4374489486217499, + "learning_rate": 1.0199456676041275e-05, + "loss": 0.3199, + "step": 26590 + }, + { + "epoch": 0.49368230711821864, + "grad_norm": 0.3472742438316345, + "learning_rate": 1.0198290416079287e-05, + "loss": 0.2849, + "step": 26592 + }, + { + "epoch": 0.4937194372556373, + "grad_norm": 0.4937828779220581, + "learning_rate": 1.019712415341916e-05, + "loss": 0.3462, + "step": 26594 + }, + { + "epoch": 0.4937565673930559, + "grad_norm": 0.5056163668632507, + "learning_rate": 1.0195957888076767e-05, + "loss": 0.2618, + "step": 26596 + }, + { + "epoch": 0.4937936975304746, + "grad_norm": 0.34028157591819763, + "learning_rate": 1.0194791620067973e-05, + "loss": 0.3849, + "step": 26598 + }, + { + "epoch": 0.4938308276678932, + "grad_norm": 0.43088021874427795, + "learning_rate": 1.0193625349408646e-05, + "loss": 0.1685, + "step": 26600 + }, + { + "epoch": 0.49386795780531184, + "grad_norm": 0.4028365910053253, + "learning_rate": 1.0192459076114662e-05, + "loss": 0.3712, + "step": 26602 + }, + { + "epoch": 0.49390508794273047, + "grad_norm": 0.5389242768287659, + "learning_rate": 1.0191292800201885e-05, + "loss": 0.2741, + "step": 26604 + }, + { + "epoch": 0.4939422180801491, + "grad_norm": 0.39854696393013, + "learning_rate": 1.0190126521686183e-05, + "loss": 0.1872, + "step": 26606 + }, + { + "epoch": 0.4939793482175677, + "grad_norm": 0.21230767667293549, + "learning_rate": 1.0188960240583433e-05, + "loss": 0.229, + "step": 26608 + }, + { + "epoch": 0.4940164783549864, + "grad_norm": 0.41884365677833557, + "learning_rate": 1.0187793956909497e-05, + "loss": 0.3876, + "step": 26610 + }, + { + "epoch": 0.49405360849240504, + "grad_norm": 0.4534231424331665, + "learning_rate": 1.018662767068025e-05, + "loss": 0.3011, + "step": 26612 + }, + { + "epoch": 0.49409073862982367, + "grad_norm": 0.5291032791137695, + "learning_rate": 1.0185461381911556e-05, + "loss": 0.2397, + "step": 26614 + }, + { + "epoch": 0.4941278687672423, + "grad_norm": 0.4040442407131195, + "learning_rate": 1.0184295090619291e-05, + "loss": 0.265, + "step": 26616 + }, + { + "epoch": 0.4941649989046609, + "grad_norm": 0.28613871335983276, + "learning_rate": 1.0183128796819319e-05, + "loss": 0.344, + "step": 26618 + }, + { + "epoch": 0.4942021290420796, + "grad_norm": 0.8896944522857666, + "learning_rate": 1.0181962500527516e-05, + "loss": 0.2231, + "step": 26620 + }, + { + "epoch": 0.49423925917949824, + "grad_norm": 0.43337246775627136, + "learning_rate": 1.0180796201759748e-05, + "loss": 0.2743, + "step": 26622 + }, + { + "epoch": 0.49427638931691686, + "grad_norm": 0.4792998731136322, + "learning_rate": 1.0179629900531885e-05, + "loss": 0.3272, + "step": 26624 + }, + { + "epoch": 0.4943135194543355, + "grad_norm": 0.4641440808773041, + "learning_rate": 1.0178463596859794e-05, + "loss": 0.2588, + "step": 26626 + }, + { + "epoch": 0.4943506495917541, + "grad_norm": 0.5454699397087097, + "learning_rate": 1.0177297290759352e-05, + "loss": 0.3448, + "step": 26628 + }, + { + "epoch": 0.4943877797291728, + "grad_norm": 0.48575395345687866, + "learning_rate": 1.0176130982246424e-05, + "loss": 0.3347, + "step": 26630 + }, + { + "epoch": 0.49442490986659143, + "grad_norm": 0.16514244675636292, + "learning_rate": 1.017496467133688e-05, + "loss": 0.0776, + "step": 26632 + }, + { + "epoch": 0.49446204000401006, + "grad_norm": 0.31314805150032043, + "learning_rate": 1.0173798358046592e-05, + "loss": 0.3376, + "step": 26634 + }, + { + "epoch": 0.4944991701414287, + "grad_norm": 0.40999871492385864, + "learning_rate": 1.0172632042391428e-05, + "loss": 0.2233, + "step": 26636 + }, + { + "epoch": 0.4945363002788473, + "grad_norm": 0.5122815370559692, + "learning_rate": 1.0171465724387257e-05, + "loss": 0.3336, + "step": 26638 + }, + { + "epoch": 0.49457343041626595, + "grad_norm": 0.44166526198387146, + "learning_rate": 1.0170299404049954e-05, + "loss": 0.1839, + "step": 26640 + }, + { + "epoch": 0.49461056055368463, + "grad_norm": 0.3624061644077301, + "learning_rate": 1.0169133081395388e-05, + "loss": 0.3919, + "step": 26642 + }, + { + "epoch": 0.49464769069110326, + "grad_norm": 0.514481782913208, + "learning_rate": 1.0167966756439423e-05, + "loss": 0.294, + "step": 26644 + }, + { + "epoch": 0.4946848208285219, + "grad_norm": 0.48552820086479187, + "learning_rate": 1.0166800429197936e-05, + "loss": 0.2474, + "step": 26646 + }, + { + "epoch": 0.4947219509659405, + "grad_norm": 0.5801034569740295, + "learning_rate": 1.0165634099686794e-05, + "loss": 0.2883, + "step": 26648 + }, + { + "epoch": 0.49475908110335914, + "grad_norm": 0.28711357712745667, + "learning_rate": 1.0164467767921866e-05, + "loss": 0.2679, + "step": 26650 + }, + { + "epoch": 0.49479621124077783, + "grad_norm": 0.6777228713035583, + "learning_rate": 1.016330143391903e-05, + "loss": 0.3596, + "step": 26652 + }, + { + "epoch": 0.49483334137819646, + "grad_norm": 0.4579601287841797, + "learning_rate": 1.0162135097694145e-05, + "loss": 0.4341, + "step": 26654 + }, + { + "epoch": 0.4948704715156151, + "grad_norm": 0.3473369777202606, + "learning_rate": 1.016096875926309e-05, + "loss": 0.2744, + "step": 26656 + }, + { + "epoch": 0.4949076016530337, + "grad_norm": 0.58405601978302, + "learning_rate": 1.0159802418641734e-05, + "loss": 0.2282, + "step": 26658 + }, + { + "epoch": 0.49494473179045234, + "grad_norm": 0.3529479503631592, + "learning_rate": 1.015863607584594e-05, + "loss": 0.1739, + "step": 26660 + }, + { + "epoch": 0.49498186192787097, + "grad_norm": 0.2871580123901367, + "learning_rate": 1.015746973089159e-05, + "loss": 0.2588, + "step": 26662 + }, + { + "epoch": 0.49501899206528965, + "grad_norm": 0.30716392397880554, + "learning_rate": 1.0156303383794543e-05, + "loss": 0.3405, + "step": 26664 + }, + { + "epoch": 0.4950561222027083, + "grad_norm": 0.2876185178756714, + "learning_rate": 1.015513703457068e-05, + "loss": 0.3794, + "step": 26666 + }, + { + "epoch": 0.4950932523401269, + "grad_norm": 0.3798554241657257, + "learning_rate": 1.0153970683235863e-05, + "loss": 0.3221, + "step": 26668 + }, + { + "epoch": 0.49513038247754554, + "grad_norm": 0.47295188903808594, + "learning_rate": 1.0152804329805967e-05, + "loss": 0.151, + "step": 26670 + }, + { + "epoch": 0.49516751261496417, + "grad_norm": 0.26839762926101685, + "learning_rate": 1.0151637974296863e-05, + "loss": 0.3964, + "step": 26672 + }, + { + "epoch": 0.49520464275238285, + "grad_norm": 0.2356579750776291, + "learning_rate": 1.0150471616724424e-05, + "loss": 0.2817, + "step": 26674 + }, + { + "epoch": 0.4952417728898015, + "grad_norm": 0.5730855464935303, + "learning_rate": 1.0149305257104511e-05, + "loss": 0.3657, + "step": 26676 + }, + { + "epoch": 0.4952789030272201, + "grad_norm": 0.2966228425502777, + "learning_rate": 1.0148138895453004e-05, + "loss": 0.3649, + "step": 26678 + }, + { + "epoch": 0.49531603316463874, + "grad_norm": 0.4350131154060364, + "learning_rate": 1.0146972531785768e-05, + "loss": 0.3816, + "step": 26680 + }, + { + "epoch": 0.49535316330205736, + "grad_norm": 0.4339401423931122, + "learning_rate": 1.0145806166118677e-05, + "loss": 0.3573, + "step": 26682 + }, + { + "epoch": 0.495390293439476, + "grad_norm": 0.5247344970703125, + "learning_rate": 1.0144639798467605e-05, + "loss": 0.349, + "step": 26684 + }, + { + "epoch": 0.4954274235768947, + "grad_norm": 0.41206443309783936, + "learning_rate": 1.0143473428848414e-05, + "loss": 0.4337, + "step": 26686 + }, + { + "epoch": 0.4954645537143133, + "grad_norm": 0.45704370737075806, + "learning_rate": 1.0142307057276978e-05, + "loss": 0.5111, + "step": 26688 + }, + { + "epoch": 0.49550168385173193, + "grad_norm": 0.299344927072525, + "learning_rate": 1.0141140683769172e-05, + "loss": 0.111, + "step": 26690 + }, + { + "epoch": 0.49553881398915056, + "grad_norm": 0.2707463502883911, + "learning_rate": 1.0139974308340864e-05, + "loss": 0.1708, + "step": 26692 + }, + { + "epoch": 0.4955759441265692, + "grad_norm": 0.3805144727230072, + "learning_rate": 1.0138807931007923e-05, + "loss": 0.0725, + "step": 26694 + }, + { + "epoch": 0.4956130742639879, + "grad_norm": 0.35223814845085144, + "learning_rate": 1.0137641551786225e-05, + "loss": 0.4761, + "step": 26696 + }, + { + "epoch": 0.4956502044014065, + "grad_norm": 0.34545236825942993, + "learning_rate": 1.0136475170691635e-05, + "loss": 0.3075, + "step": 26698 + }, + { + "epoch": 0.49568733453882513, + "grad_norm": 0.3345290422439575, + "learning_rate": 1.0135308787740026e-05, + "loss": 0.3072, + "step": 26700 + }, + { + "epoch": 0.49572446467624376, + "grad_norm": 0.30638587474823, + "learning_rate": 1.0134142402947273e-05, + "loss": 0.2923, + "step": 26702 + }, + { + "epoch": 0.4957615948136624, + "grad_norm": 0.48701202869415283, + "learning_rate": 1.013297601632924e-05, + "loss": 0.3216, + "step": 26704 + }, + { + "epoch": 0.49579872495108107, + "grad_norm": 0.570892333984375, + "learning_rate": 1.0131809627901805e-05, + "loss": 0.3229, + "step": 26706 + }, + { + "epoch": 0.4958358550884997, + "grad_norm": 0.4411565065383911, + "learning_rate": 1.0130643237680835e-05, + "loss": 0.2481, + "step": 26708 + }, + { + "epoch": 0.49587298522591833, + "grad_norm": 0.23127366602420807, + "learning_rate": 1.0129476845682202e-05, + "loss": 0.2037, + "step": 26710 + }, + { + "epoch": 0.49591011536333696, + "grad_norm": 0.36747708916664124, + "learning_rate": 1.0128310451921774e-05, + "loss": 0.3093, + "step": 26712 + }, + { + "epoch": 0.4959472455007556, + "grad_norm": 0.4010242521762848, + "learning_rate": 1.012714405641543e-05, + "loss": 0.1851, + "step": 26714 + }, + { + "epoch": 0.4959843756381742, + "grad_norm": 0.4909267723560333, + "learning_rate": 1.0125977659179034e-05, + "loss": 0.3345, + "step": 26716 + }, + { + "epoch": 0.4960215057755929, + "grad_norm": 0.3525868356227875, + "learning_rate": 1.0124811260228457e-05, + "loss": 0.4439, + "step": 26718 + }, + { + "epoch": 0.4960586359130115, + "grad_norm": 0.5203554034233093, + "learning_rate": 1.0123644859579575e-05, + "loss": 0.5961, + "step": 26720 + }, + { + "epoch": 0.49609576605043015, + "grad_norm": 0.3930593430995941, + "learning_rate": 1.0122478457248258e-05, + "loss": 0.3508, + "step": 26722 + }, + { + "epoch": 0.4961328961878488, + "grad_norm": 0.45534875988960266, + "learning_rate": 1.0121312053250371e-05, + "loss": 0.1914, + "step": 26724 + }, + { + "epoch": 0.4961700263252674, + "grad_norm": 0.5005047917366028, + "learning_rate": 1.0120145647601796e-05, + "loss": 0.2731, + "step": 26726 + }, + { + "epoch": 0.4962071564626861, + "grad_norm": 0.3658311367034912, + "learning_rate": 1.0118979240318399e-05, + "loss": 0.4434, + "step": 26728 + }, + { + "epoch": 0.4962442866001047, + "grad_norm": 0.4492465853691101, + "learning_rate": 1.0117812831416048e-05, + "loss": 0.2492, + "step": 26730 + }, + { + "epoch": 0.49628141673752335, + "grad_norm": 0.41235604882240295, + "learning_rate": 1.0116646420910614e-05, + "loss": 0.3944, + "step": 26732 + }, + { + "epoch": 0.496318546874942, + "grad_norm": 0.4014976918697357, + "learning_rate": 1.0115480008817977e-05, + "loss": 0.2699, + "step": 26734 + }, + { + "epoch": 0.4963556770123606, + "grad_norm": 0.5120599865913391, + "learning_rate": 1.0114313595154002e-05, + "loss": 0.2735, + "step": 26736 + }, + { + "epoch": 0.49639280714977924, + "grad_norm": 0.29358986020088196, + "learning_rate": 1.0113147179934562e-05, + "loss": 0.2843, + "step": 26738 + }, + { + "epoch": 0.4964299372871979, + "grad_norm": 0.5018960237503052, + "learning_rate": 1.0111980763175526e-05, + "loss": 0.1552, + "step": 26740 + }, + { + "epoch": 0.49646706742461655, + "grad_norm": 0.2884894609451294, + "learning_rate": 1.0110814344892768e-05, + "loss": 0.2355, + "step": 26742 + }, + { + "epoch": 0.4965041975620352, + "grad_norm": 0.4196353852748871, + "learning_rate": 1.0109647925102155e-05, + "loss": 0.1787, + "step": 26744 + }, + { + "epoch": 0.4965413276994538, + "grad_norm": 0.2575789988040924, + "learning_rate": 1.0108481503819567e-05, + "loss": 0.1943, + "step": 26746 + }, + { + "epoch": 0.49657845783687243, + "grad_norm": 0.3614758849143982, + "learning_rate": 1.0107315081060872e-05, + "loss": 0.3847, + "step": 26748 + }, + { + "epoch": 0.4966155879742911, + "grad_norm": 0.21284295618534088, + "learning_rate": 1.0106148656841934e-05, + "loss": 0.2375, + "step": 26750 + }, + { + "epoch": 0.49665271811170975, + "grad_norm": 0.41940784454345703, + "learning_rate": 1.0104982231178635e-05, + "loss": 0.2821, + "step": 26752 + }, + { + "epoch": 0.4966898482491284, + "grad_norm": 0.42432481050491333, + "learning_rate": 1.0103815804086841e-05, + "loss": 0.3316, + "step": 26754 + }, + { + "epoch": 0.496726978386547, + "grad_norm": 0.29224902391433716, + "learning_rate": 1.0102649375582425e-05, + "loss": 0.1555, + "step": 26756 + }, + { + "epoch": 0.49676410852396563, + "grad_norm": 0.32181861996650696, + "learning_rate": 1.0101482945681261e-05, + "loss": 0.3075, + "step": 26758 + }, + { + "epoch": 0.49680123866138426, + "grad_norm": 0.5196285247802734, + "learning_rate": 1.0100316514399215e-05, + "loss": 0.1112, + "step": 26760 + }, + { + "epoch": 0.49683836879880294, + "grad_norm": 0.3820239007472992, + "learning_rate": 1.0099150081752162e-05, + "loss": 0.3735, + "step": 26762 + }, + { + "epoch": 0.49687549893622157, + "grad_norm": 0.3780510425567627, + "learning_rate": 1.0097983647755974e-05, + "loss": 0.1449, + "step": 26764 + }, + { + "epoch": 0.4969126290736402, + "grad_norm": 0.4542478322982788, + "learning_rate": 1.0096817212426522e-05, + "loss": 0.1787, + "step": 26766 + }, + { + "epoch": 0.49694975921105883, + "grad_norm": 0.30595123767852783, + "learning_rate": 1.0095650775779679e-05, + "loss": 0.316, + "step": 26768 + }, + { + "epoch": 0.49698688934847746, + "grad_norm": 0.34468504786491394, + "learning_rate": 1.0094484337831314e-05, + "loss": 0.4013, + "step": 26770 + }, + { + "epoch": 0.49702401948589614, + "grad_norm": 0.19810239970684052, + "learning_rate": 1.0093317898597301e-05, + "loss": 0.2337, + "step": 26772 + }, + { + "epoch": 0.49706114962331477, + "grad_norm": 0.3810808062553406, + "learning_rate": 1.009215145809351e-05, + "loss": 0.2599, + "step": 26774 + }, + { + "epoch": 0.4970982797607334, + "grad_norm": 0.3304886817932129, + "learning_rate": 1.0090985016335812e-05, + "loss": 0.3857, + "step": 26776 + }, + { + "epoch": 0.497135409898152, + "grad_norm": 0.33980000019073486, + "learning_rate": 1.0089818573340082e-05, + "loss": 0.4227, + "step": 26778 + }, + { + "epoch": 0.49717254003557065, + "grad_norm": 0.38496121764183044, + "learning_rate": 1.0088652129122191e-05, + "loss": 0.2173, + "step": 26780 + }, + { + "epoch": 0.49720967017298934, + "grad_norm": 0.3895653784275055, + "learning_rate": 1.008748568369801e-05, + "loss": 0.5394, + "step": 26782 + }, + { + "epoch": 0.49724680031040797, + "grad_norm": 0.3522690236568451, + "learning_rate": 1.0086319237083413e-05, + "loss": 0.1561, + "step": 26784 + }, + { + "epoch": 0.4972839304478266, + "grad_norm": 0.3797140121459961, + "learning_rate": 1.0085152789294265e-05, + "loss": 0.2534, + "step": 26786 + }, + { + "epoch": 0.4973210605852452, + "grad_norm": 0.30519089102745056, + "learning_rate": 1.0083986340346443e-05, + "loss": 0.3095, + "step": 26788 + }, + { + "epoch": 0.49735819072266385, + "grad_norm": 0.42169806361198425, + "learning_rate": 1.0082819890255824e-05, + "loss": 0.4702, + "step": 26790 + }, + { + "epoch": 0.4973953208600825, + "grad_norm": 0.23226481676101685, + "learning_rate": 1.008165343903827e-05, + "loss": 0.3178, + "step": 26792 + }, + { + "epoch": 0.49743245099750116, + "grad_norm": 0.46743327379226685, + "learning_rate": 1.0080486986709659e-05, + "loss": 0.2234, + "step": 26794 + }, + { + "epoch": 0.4974695811349198, + "grad_norm": 0.5484879016876221, + "learning_rate": 1.0079320533285858e-05, + "loss": 0.2262, + "step": 26796 + }, + { + "epoch": 0.4975067112723384, + "grad_norm": 0.4141293466091156, + "learning_rate": 1.0078154078782745e-05, + "loss": 0.4032, + "step": 26798 + }, + { + "epoch": 0.49754384140975705, + "grad_norm": 0.30675286054611206, + "learning_rate": 1.007698762321619e-05, + "loss": 0.3855, + "step": 26800 + }, + { + "epoch": 0.4975809715471757, + "grad_norm": 0.5084612965583801, + "learning_rate": 1.0075821166602062e-05, + "loss": 0.3221, + "step": 26802 + }, + { + "epoch": 0.49761810168459436, + "grad_norm": 0.2735929489135742, + "learning_rate": 1.0074654708956237e-05, + "loss": 0.3158, + "step": 26804 + }, + { + "epoch": 0.497655231822013, + "grad_norm": 0.35964760184288025, + "learning_rate": 1.007348825029458e-05, + "loss": 0.2437, + "step": 26806 + }, + { + "epoch": 0.4976923619594316, + "grad_norm": 0.5688581466674805, + "learning_rate": 1.0072321790632973e-05, + "loss": 0.2337, + "step": 26808 + }, + { + "epoch": 0.49772949209685025, + "grad_norm": 0.3958166837692261, + "learning_rate": 1.0071155329987283e-05, + "loss": 0.3309, + "step": 26810 + }, + { + "epoch": 0.4977666222342689, + "grad_norm": 0.3531842827796936, + "learning_rate": 1.0069988868373382e-05, + "loss": 0.1752, + "step": 26812 + }, + { + "epoch": 0.4978037523716875, + "grad_norm": 0.4273420572280884, + "learning_rate": 1.006882240580714e-05, + "loss": 0.2729, + "step": 26814 + }, + { + "epoch": 0.4978408825091062, + "grad_norm": 0.5354814529418945, + "learning_rate": 1.0067655942304434e-05, + "loss": 0.2849, + "step": 26816 + }, + { + "epoch": 0.4978780126465248, + "grad_norm": 0.5052360892295837, + "learning_rate": 1.006648947788113e-05, + "loss": 0.2312, + "step": 26818 + }, + { + "epoch": 0.49791514278394344, + "grad_norm": 0.4538573920726776, + "learning_rate": 1.0065323012553102e-05, + "loss": 0.1823, + "step": 26820 + }, + { + "epoch": 0.49795227292136207, + "grad_norm": 0.39478498697280884, + "learning_rate": 1.006415654633623e-05, + "loss": 0.1077, + "step": 26822 + }, + { + "epoch": 0.4979894030587807, + "grad_norm": 0.4058161675930023, + "learning_rate": 1.0062990079246376e-05, + "loss": 0.3839, + "step": 26824 + }, + { + "epoch": 0.4980265331961994, + "grad_norm": 0.36407390236854553, + "learning_rate": 1.0061823611299413e-05, + "loss": 0.2726, + "step": 26826 + }, + { + "epoch": 0.498063663333618, + "grad_norm": 0.2972058057785034, + "learning_rate": 1.0060657142511219e-05, + "loss": 0.2907, + "step": 26828 + }, + { + "epoch": 0.49810079347103664, + "grad_norm": 0.4083372950553894, + "learning_rate": 1.0059490672897663e-05, + "loss": 0.4299, + "step": 26830 + }, + { + "epoch": 0.49813792360845527, + "grad_norm": 0.4232308566570282, + "learning_rate": 1.0058324202474616e-05, + "loss": 0.3249, + "step": 26832 + }, + { + "epoch": 0.4981750537458739, + "grad_norm": 0.28382018208503723, + "learning_rate": 1.0057157731257951e-05, + "loss": 0.2256, + "step": 26834 + }, + { + "epoch": 0.4982121838832925, + "grad_norm": 0.23879002034664154, + "learning_rate": 1.0055991259263543e-05, + "loss": 0.3146, + "step": 26836 + }, + { + "epoch": 0.4982493140207112, + "grad_norm": 0.5346073508262634, + "learning_rate": 1.0054824786507257e-05, + "loss": 0.4178, + "step": 26838 + }, + { + "epoch": 0.49828644415812984, + "grad_norm": 0.4364805519580841, + "learning_rate": 1.0053658313004973e-05, + "loss": 0.3411, + "step": 26840 + }, + { + "epoch": 0.49832357429554847, + "grad_norm": 0.5216670036315918, + "learning_rate": 1.005249183877256e-05, + "loss": 0.2456, + "step": 26842 + }, + { + "epoch": 0.4983607044329671, + "grad_norm": 0.7121615409851074, + "learning_rate": 1.0051325363825892e-05, + "loss": 0.3084, + "step": 26844 + }, + { + "epoch": 0.4983978345703857, + "grad_norm": 0.2606574594974518, + "learning_rate": 1.0050158888180836e-05, + "loss": 0.214, + "step": 26846 + }, + { + "epoch": 0.4984349647078044, + "grad_norm": 0.4554485082626343, + "learning_rate": 1.0048992411853272e-05, + "loss": 0.1396, + "step": 26848 + }, + { + "epoch": 0.49847209484522303, + "grad_norm": 0.47734448313713074, + "learning_rate": 1.0047825934859063e-05, + "loss": 0.1508, + "step": 26850 + }, + { + "epoch": 0.49850922498264166, + "grad_norm": 0.38202816247940063, + "learning_rate": 1.0046659457214091e-05, + "loss": 0.3595, + "step": 26852 + }, + { + "epoch": 0.4985463551200603, + "grad_norm": 0.3905223608016968, + "learning_rate": 1.0045492978934224e-05, + "loss": 0.2276, + "step": 26854 + }, + { + "epoch": 0.4985834852574789, + "grad_norm": 0.34084296226501465, + "learning_rate": 1.0044326500035329e-05, + "loss": 0.1343, + "step": 26856 + }, + { + "epoch": 0.4986206153948976, + "grad_norm": 0.2738212049007416, + "learning_rate": 1.0043160020533284e-05, + "loss": 0.3012, + "step": 26858 + }, + { + "epoch": 0.49865774553231623, + "grad_norm": 0.4875483810901642, + "learning_rate": 1.0041993540443963e-05, + "loss": 0.2466, + "step": 26860 + }, + { + "epoch": 0.49869487566973486, + "grad_norm": 0.34649237990379333, + "learning_rate": 1.0040827059783234e-05, + "loss": 0.2201, + "step": 26862 + }, + { + "epoch": 0.4987320058071535, + "grad_norm": 0.3726114332675934, + "learning_rate": 1.0039660578566971e-05, + "loss": 0.3249, + "step": 26864 + }, + { + "epoch": 0.4987691359445721, + "grad_norm": 0.6323537826538086, + "learning_rate": 1.0038494096811049e-05, + "loss": 0.315, + "step": 26866 + }, + { + "epoch": 0.49880626608199075, + "grad_norm": 0.4168843924999237, + "learning_rate": 1.0037327614531334e-05, + "loss": 0.3033, + "step": 26868 + }, + { + "epoch": 0.49884339621940943, + "grad_norm": 0.2692236006259918, + "learning_rate": 1.0036161131743702e-05, + "loss": 0.2252, + "step": 26870 + }, + { + "epoch": 0.49888052635682806, + "grad_norm": 0.4252229332923889, + "learning_rate": 1.0034994648464025e-05, + "loss": 0.4056, + "step": 26872 + }, + { + "epoch": 0.4989176564942467, + "grad_norm": 0.503212571144104, + "learning_rate": 1.003382816470818e-05, + "loss": 0.3089, + "step": 26874 + }, + { + "epoch": 0.4989547866316653, + "grad_norm": 0.4341176450252533, + "learning_rate": 1.0032661680492033e-05, + "loss": 0.2135, + "step": 26876 + }, + { + "epoch": 0.49899191676908394, + "grad_norm": 0.4128177762031555, + "learning_rate": 1.0031495195831456e-05, + "loss": 0.3238, + "step": 26878 + }, + { + "epoch": 0.4990290469065026, + "grad_norm": 0.27954617142677307, + "learning_rate": 1.0030328710742328e-05, + "loss": 0.4152, + "step": 26880 + }, + { + "epoch": 0.49906617704392126, + "grad_norm": 0.3026414215564728, + "learning_rate": 1.0029162225240513e-05, + "loss": 0.1961, + "step": 26882 + }, + { + "epoch": 0.4991033071813399, + "grad_norm": 0.4159076511859894, + "learning_rate": 1.0027995739341889e-05, + "loss": 0.1404, + "step": 26884 + }, + { + "epoch": 0.4991404373187585, + "grad_norm": 0.2822812795639038, + "learning_rate": 1.002682925306233e-05, + "loss": 0.2133, + "step": 26886 + }, + { + "epoch": 0.49917756745617714, + "grad_norm": 0.591202974319458, + "learning_rate": 1.0025662766417698e-05, + "loss": 0.4922, + "step": 26888 + }, + { + "epoch": 0.49921469759359577, + "grad_norm": 0.5966176986694336, + "learning_rate": 1.0024496279423877e-05, + "loss": 0.3457, + "step": 26890 + }, + { + "epoch": 0.49925182773101445, + "grad_norm": 0.3594204783439636, + "learning_rate": 1.0023329792096737e-05, + "loss": 0.2015, + "step": 26892 + }, + { + "epoch": 0.4992889578684331, + "grad_norm": 0.5917710661888123, + "learning_rate": 1.0022163304452143e-05, + "loss": 0.3288, + "step": 26894 + }, + { + "epoch": 0.4993260880058517, + "grad_norm": 0.31645387411117554, + "learning_rate": 1.0020996816505977e-05, + "loss": 0.3256, + "step": 26896 + }, + { + "epoch": 0.49936321814327034, + "grad_norm": 0.5079969167709351, + "learning_rate": 1.0019830328274108e-05, + "loss": 0.4852, + "step": 26898 + }, + { + "epoch": 0.49940034828068897, + "grad_norm": 0.44086822867393494, + "learning_rate": 1.0018663839772404e-05, + "loss": 0.3844, + "step": 26900 + }, + { + "epoch": 0.49943747841810765, + "grad_norm": 0.443491131067276, + "learning_rate": 1.0017497351016742e-05, + "loss": 0.3064, + "step": 26902 + }, + { + "epoch": 0.4994746085555263, + "grad_norm": 0.5364022254943848, + "learning_rate": 1.0016330862022996e-05, + "loss": 0.3155, + "step": 26904 + }, + { + "epoch": 0.4995117386929449, + "grad_norm": 0.37668925523757935, + "learning_rate": 1.0015164372807035e-05, + "loss": 0.3362, + "step": 26906 + }, + { + "epoch": 0.49954886883036353, + "grad_norm": 0.3791133463382721, + "learning_rate": 1.001399788338473e-05, + "loss": 0.1952, + "step": 26908 + }, + { + "epoch": 0.49958599896778216, + "grad_norm": 0.36500510573387146, + "learning_rate": 1.0012831393771958e-05, + "loss": 0.4677, + "step": 26910 + }, + { + "epoch": 0.4996231291052008, + "grad_norm": 0.3498799800872803, + "learning_rate": 1.0011664903984587e-05, + "loss": 0.4296, + "step": 26912 + }, + { + "epoch": 0.4996602592426195, + "grad_norm": 0.40156617760658264, + "learning_rate": 1.0010498414038491e-05, + "loss": 0.3281, + "step": 26914 + }, + { + "epoch": 0.4996973893800381, + "grad_norm": 0.3518311083316803, + "learning_rate": 1.0009331923949548e-05, + "loss": 0.0444, + "step": 26916 + }, + { + "epoch": 0.49973451951745673, + "grad_norm": 0.5822263360023499, + "learning_rate": 1.0008165433733623e-05, + "loss": 0.1783, + "step": 26918 + }, + { + "epoch": 0.49977164965487536, + "grad_norm": 0.33593299984931946, + "learning_rate": 1.000699894340659e-05, + "loss": 0.1649, + "step": 26920 + }, + { + "epoch": 0.499808779792294, + "grad_norm": 0.3730458617210388, + "learning_rate": 1.000583245298432e-05, + "loss": 0.4119, + "step": 26922 + }, + { + "epoch": 0.4998459099297127, + "grad_norm": 0.31013160943984985, + "learning_rate": 1.0004665962482693e-05, + "loss": 0.1891, + "step": 26924 + }, + { + "epoch": 0.4998830400671313, + "grad_norm": 0.5429261922836304, + "learning_rate": 1.0003499471917573e-05, + "loss": 0.3316, + "step": 26926 + }, + { + "epoch": 0.49992017020454993, + "grad_norm": 0.4259258806705475, + "learning_rate": 1.0002332981304837e-05, + "loss": 0.3527, + "step": 26928 + }, + { + "epoch": 0.49995730034196856, + "grad_norm": 0.2676601707935333, + "learning_rate": 1.0001166490660357e-05, + "loss": 0.2827, + "step": 26930 + }, + { + "epoch": 0.4999944304793872, + "grad_norm": 0.3304806351661682, + "learning_rate": 1e-05, + "loss": 0.2581, + "step": 26932 + }, + { + "epoch": 0.5000315606168059, + "grad_norm": 0.3376041352748871, + "learning_rate": 9.998833509339646e-06, + "loss": 0.4154, + "step": 26934 + }, + { + "epoch": 0.5000686907542244, + "grad_norm": 0.4785219132900238, + "learning_rate": 9.997667018695166e-06, + "loss": 0.2627, + "step": 26936 + }, + { + "epoch": 0.5001058208916431, + "grad_norm": 0.36770257353782654, + "learning_rate": 9.996500528082428e-06, + "loss": 0.4671, + "step": 26938 + }, + { + "epoch": 0.5001429510290618, + "grad_norm": 0.3594304025173187, + "learning_rate": 9.995334037517312e-06, + "loss": 0.2399, + "step": 26940 + }, + { + "epoch": 0.5001800811664804, + "grad_norm": 0.4786681532859802, + "learning_rate": 9.994167547015681e-06, + "loss": 0.3796, + "step": 26942 + }, + { + "epoch": 0.5002172113038991, + "grad_norm": 0.35319259762763977, + "learning_rate": 9.993001056593414e-06, + "loss": 0.3072, + "step": 26944 + }, + { + "epoch": 0.5002543414413176, + "grad_norm": 0.2906579375267029, + "learning_rate": 9.99183456626638e-06, + "loss": 0.3575, + "step": 26946 + }, + { + "epoch": 0.5002914715787363, + "grad_norm": 0.9308989644050598, + "learning_rate": 9.990668076050455e-06, + "loss": 0.471, + "step": 26948 + }, + { + "epoch": 0.5003286017161549, + "grad_norm": 0.26063209772109985, + "learning_rate": 9.989501585961509e-06, + "loss": 0.2708, + "step": 26950 + }, + { + "epoch": 0.5003657318535736, + "grad_norm": 0.27027469873428345, + "learning_rate": 9.988335096015418e-06, + "loss": 0.2939, + "step": 26952 + }, + { + "epoch": 0.5004028619909923, + "grad_norm": 0.4825303852558136, + "learning_rate": 9.987168606228047e-06, + "loss": 0.3055, + "step": 26954 + }, + { + "epoch": 0.5004399921284108, + "grad_norm": 0.2751249074935913, + "learning_rate": 9.986002116615274e-06, + "loss": 0.1922, + "step": 26956 + }, + { + "epoch": 0.5004771222658295, + "grad_norm": 0.32798242568969727, + "learning_rate": 9.984835627192968e-06, + "loss": 0.207, + "step": 26958 + }, + { + "epoch": 0.5005142524032481, + "grad_norm": 0.3726319372653961, + "learning_rate": 9.983669137977008e-06, + "loss": 0.138, + "step": 26960 + }, + { + "epoch": 0.5005513825406668, + "grad_norm": 0.358015239238739, + "learning_rate": 9.98250264898326e-06, + "loss": 0.3447, + "step": 26962 + }, + { + "epoch": 0.5005885126780855, + "grad_norm": 0.36054402589797974, + "learning_rate": 9.9813361602276e-06, + "loss": 0.1727, + "step": 26964 + }, + { + "epoch": 0.500625642815504, + "grad_norm": 0.23488083481788635, + "learning_rate": 9.980169671725897e-06, + "loss": 0.2127, + "step": 26966 + }, + { + "epoch": 0.5006627729529227, + "grad_norm": 0.6105630397796631, + "learning_rate": 9.979003183494025e-06, + "loss": 0.1532, + "step": 26968 + }, + { + "epoch": 0.5006999030903413, + "grad_norm": 0.3569817543029785, + "learning_rate": 9.977836695547859e-06, + "loss": 0.1417, + "step": 26970 + }, + { + "epoch": 0.50073703322776, + "grad_norm": 0.2526264488697052, + "learning_rate": 9.976670207903268e-06, + "loss": 0.3781, + "step": 26972 + }, + { + "epoch": 0.5007741633651787, + "grad_norm": 0.1905379295349121, + "learning_rate": 9.975503720576123e-06, + "loss": 0.3267, + "step": 26974 + }, + { + "epoch": 0.5008112935025972, + "grad_norm": 0.5174555778503418, + "learning_rate": 9.974337233582301e-06, + "loss": 0.2387, + "step": 26976 + }, + { + "epoch": 0.5008484236400159, + "grad_norm": 0.4516080617904663, + "learning_rate": 9.973170746937677e-06, + "loss": 0.2597, + "step": 26978 + }, + { + "epoch": 0.5008855537774345, + "grad_norm": 0.3129976689815521, + "learning_rate": 9.972004260658114e-06, + "loss": 0.2476, + "step": 26980 + }, + { + "epoch": 0.5009226839148532, + "grad_norm": 0.40397852659225464, + "learning_rate": 9.97083777475949e-06, + "loss": 0.3502, + "step": 26982 + }, + { + "epoch": 0.5009598140522719, + "grad_norm": 0.4846981167793274, + "learning_rate": 9.969671289257677e-06, + "loss": 0.2871, + "step": 26984 + }, + { + "epoch": 0.5009969441896904, + "grad_norm": 0.46104303002357483, + "learning_rate": 9.968504804168544e-06, + "loss": 0.3291, + "step": 26986 + }, + { + "epoch": 0.5010340743271091, + "grad_norm": 0.44227516651153564, + "learning_rate": 9.967338319507967e-06, + "loss": 0.5471, + "step": 26988 + }, + { + "epoch": 0.5010712044645277, + "grad_norm": 0.3475395739078522, + "learning_rate": 9.966171835291824e-06, + "loss": 0.4743, + "step": 26990 + }, + { + "epoch": 0.5011083346019464, + "grad_norm": 0.3276630640029907, + "learning_rate": 9.965005351535977e-06, + "loss": 0.4024, + "step": 26992 + }, + { + "epoch": 0.501145464739365, + "grad_norm": 0.5174431204795837, + "learning_rate": 9.9638388682563e-06, + "loss": 0.2456, + "step": 26994 + }, + { + "epoch": 0.5011825948767836, + "grad_norm": 0.36459478735923767, + "learning_rate": 9.962672385468669e-06, + "loss": 0.4116, + "step": 26996 + }, + { + "epoch": 0.5012197250142023, + "grad_norm": 0.31800082325935364, + "learning_rate": 9.961505903188955e-06, + "loss": 0.2623, + "step": 26998 + }, + { + "epoch": 0.5012568551516209, + "grad_norm": 0.3742848038673401, + "learning_rate": 9.96033942143303e-06, + "loss": 0.2844, + "step": 27000 + }, + { + "epoch": 0.5012939852890396, + "grad_norm": 0.5424303412437439, + "learning_rate": 9.95917294021677e-06, + "loss": 0.3803, + "step": 27002 + }, + { + "epoch": 0.5013311154264581, + "grad_norm": 0.3784252405166626, + "learning_rate": 9.958006459556042e-06, + "loss": 0.2611, + "step": 27004 + }, + { + "epoch": 0.5013682455638768, + "grad_norm": 0.33086666464805603, + "learning_rate": 9.956839979466719e-06, + "loss": 0.2506, + "step": 27006 + }, + { + "epoch": 0.5014053757012955, + "grad_norm": 0.38567325472831726, + "learning_rate": 9.955673499964675e-06, + "loss": 0.2842, + "step": 27008 + }, + { + "epoch": 0.5014425058387141, + "grad_norm": 0.5531355142593384, + "learning_rate": 9.95450702106578e-06, + "loss": 0.2822, + "step": 27010 + }, + { + "epoch": 0.5014796359761328, + "grad_norm": 0.46249035000801086, + "learning_rate": 9.953340542785912e-06, + "loss": 0.4188, + "step": 27012 + }, + { + "epoch": 0.5015167661135513, + "grad_norm": 0.42776793241500854, + "learning_rate": 9.95217406514094e-06, + "loss": 0.4403, + "step": 27014 + }, + { + "epoch": 0.50155389625097, + "grad_norm": 0.39458444714546204, + "learning_rate": 9.951007588146733e-06, + "loss": 0.2046, + "step": 27016 + }, + { + "epoch": 0.5015910263883887, + "grad_norm": 0.3665659427642822, + "learning_rate": 9.949841111819167e-06, + "loss": 0.205, + "step": 27018 + }, + { + "epoch": 0.5016281565258073, + "grad_norm": 0.46325308084487915, + "learning_rate": 9.948674636174111e-06, + "loss": 0.4072, + "step": 27020 + }, + { + "epoch": 0.501665286663226, + "grad_norm": 0.4123363792896271, + "learning_rate": 9.947508161227442e-06, + "loss": 0.4834, + "step": 27022 + }, + { + "epoch": 0.5017024168006445, + "grad_norm": 0.35976356267929077, + "learning_rate": 9.946341686995027e-06, + "loss": 0.3714, + "step": 27024 + }, + { + "epoch": 0.5017395469380632, + "grad_norm": 0.3051232397556305, + "learning_rate": 9.945175213492743e-06, + "loss": 0.1394, + "step": 27026 + }, + { + "epoch": 0.5017766770754819, + "grad_norm": 0.5761842727661133, + "learning_rate": 9.944008740736462e-06, + "loss": 0.242, + "step": 27028 + }, + { + "epoch": 0.5018138072129005, + "grad_norm": 0.2948196530342102, + "learning_rate": 9.942842268742052e-06, + "loss": 0.0917, + "step": 27030 + }, + { + "epoch": 0.5018509373503192, + "grad_norm": 0.4692282974720001, + "learning_rate": 9.941675797525386e-06, + "loss": 0.1744, + "step": 27032 + }, + { + "epoch": 0.5018880674877377, + "grad_norm": 0.4681616425514221, + "learning_rate": 9.94050932710234e-06, + "loss": 0.2593, + "step": 27034 + }, + { + "epoch": 0.5019251976251564, + "grad_norm": 0.3291204869747162, + "learning_rate": 9.939342857488783e-06, + "loss": 0.1897, + "step": 27036 + }, + { + "epoch": 0.5019623277625751, + "grad_norm": 0.5584930181503296, + "learning_rate": 9.938176388700587e-06, + "loss": 0.3591, + "step": 27038 + }, + { + "epoch": 0.5019994578999937, + "grad_norm": 0.28577449917793274, + "learning_rate": 9.937009920753631e-06, + "loss": 0.2049, + "step": 27040 + }, + { + "epoch": 0.5020365880374124, + "grad_norm": 0.48338791728019714, + "learning_rate": 9.935843453663775e-06, + "loss": 0.5619, + "step": 27042 + }, + { + "epoch": 0.5020737181748309, + "grad_norm": 0.45104774832725525, + "learning_rate": 9.934676987446901e-06, + "loss": 0.352, + "step": 27044 + }, + { + "epoch": 0.5021108483122496, + "grad_norm": 0.3470545709133148, + "learning_rate": 9.933510522118874e-06, + "loss": 0.2918, + "step": 27046 + }, + { + "epoch": 0.5021479784496683, + "grad_norm": 0.37051069736480713, + "learning_rate": 9.932344057695571e-06, + "loss": 0.2505, + "step": 27048 + }, + { + "epoch": 0.5021851085870869, + "grad_norm": 0.3474474251270294, + "learning_rate": 9.931177594192861e-06, + "loss": 0.2108, + "step": 27050 + }, + { + "epoch": 0.5022222387245056, + "grad_norm": 0.29777202010154724, + "learning_rate": 9.930011131626623e-06, + "loss": 0.1484, + "step": 27052 + }, + { + "epoch": 0.5022593688619241, + "grad_norm": 0.32228752970695496, + "learning_rate": 9.92884467001272e-06, + "loss": 0.4737, + "step": 27054 + }, + { + "epoch": 0.5022964989993428, + "grad_norm": 0.48034149408340454, + "learning_rate": 9.92767820936703e-06, + "loss": 0.2923, + "step": 27056 + }, + { + "epoch": 0.5023336291367614, + "grad_norm": 0.44116613268852234, + "learning_rate": 9.926511749705422e-06, + "loss": 0.2555, + "step": 27058 + }, + { + "epoch": 0.5023707592741801, + "grad_norm": 0.324666291475296, + "learning_rate": 9.925345291043766e-06, + "loss": 0.2746, + "step": 27060 + }, + { + "epoch": 0.5024078894115988, + "grad_norm": 0.5034061670303345, + "learning_rate": 9.92417883339794e-06, + "loss": 0.2197, + "step": 27062 + }, + { + "epoch": 0.5024450195490173, + "grad_norm": 0.3064354658126831, + "learning_rate": 9.923012376783813e-06, + "loss": 0.278, + "step": 27064 + }, + { + "epoch": 0.502482149686436, + "grad_norm": 0.347992479801178, + "learning_rate": 9.921845921217257e-06, + "loss": 0.4737, + "step": 27066 + }, + { + "epoch": 0.5025192798238546, + "grad_norm": 0.39355143904685974, + "learning_rate": 9.920679466714145e-06, + "loss": 0.3105, + "step": 27068 + }, + { + "epoch": 0.5025564099612733, + "grad_norm": 0.4315430521965027, + "learning_rate": 9.919513013290344e-06, + "loss": 0.3077, + "step": 27070 + }, + { + "epoch": 0.502593540098692, + "grad_norm": 0.23621046543121338, + "learning_rate": 9.918346560961732e-06, + "loss": 0.2535, + "step": 27072 + }, + { + "epoch": 0.5026306702361105, + "grad_norm": 0.2695353031158447, + "learning_rate": 9.91718010974418e-06, + "loss": 0.2152, + "step": 27074 + }, + { + "epoch": 0.5026678003735292, + "grad_norm": 0.36385855078697205, + "learning_rate": 9.916013659653555e-06, + "loss": 0.2477, + "step": 27076 + }, + { + "epoch": 0.5027049305109478, + "grad_norm": 0.4169882833957672, + "learning_rate": 9.91484721070574e-06, + "loss": 0.2444, + "step": 27078 + }, + { + "epoch": 0.5027420606483665, + "grad_norm": 0.324619323015213, + "learning_rate": 9.913680762916594e-06, + "loss": 0.2445, + "step": 27080 + }, + { + "epoch": 0.5027791907857851, + "grad_norm": 0.4718611538410187, + "learning_rate": 9.912514316301993e-06, + "loss": 0.1802, + "step": 27082 + }, + { + "epoch": 0.5028163209232037, + "grad_norm": 0.31225594878196716, + "learning_rate": 9.91134787087781e-06, + "loss": 0.2085, + "step": 27084 + }, + { + "epoch": 0.5028534510606224, + "grad_norm": 0.35890844464302063, + "learning_rate": 9.91018142665992e-06, + "loss": 0.2661, + "step": 27086 + }, + { + "epoch": 0.502890581198041, + "grad_norm": 0.35641393065452576, + "learning_rate": 9.90901498366419e-06, + "loss": 0.1958, + "step": 27088 + }, + { + "epoch": 0.5029277113354597, + "grad_norm": 0.4119141101837158, + "learning_rate": 9.907848541906496e-06, + "loss": 0.0782, + "step": 27090 + }, + { + "epoch": 0.5029648414728783, + "grad_norm": 0.29675421118736267, + "learning_rate": 9.906682101402705e-06, + "loss": 0.1886, + "step": 27092 + }, + { + "epoch": 0.5030019716102969, + "grad_norm": 0.3640308976173401, + "learning_rate": 9.90551566216869e-06, + "loss": 0.1102, + "step": 27094 + }, + { + "epoch": 0.5030391017477156, + "grad_norm": 0.40626031160354614, + "learning_rate": 9.904349224220324e-06, + "loss": 0.3634, + "step": 27096 + }, + { + "epoch": 0.5030762318851342, + "grad_norm": 0.5021699666976929, + "learning_rate": 9.903182787573482e-06, + "loss": 0.3339, + "step": 27098 + }, + { + "epoch": 0.5031133620225529, + "grad_norm": 0.48933136463165283, + "learning_rate": 9.902016352244028e-06, + "loss": 0.4875, + "step": 27100 + }, + { + "epoch": 0.5031504921599714, + "grad_norm": 0.3093032240867615, + "learning_rate": 9.90084991824784e-06, + "loss": 0.3027, + "step": 27102 + }, + { + "epoch": 0.5031876222973901, + "grad_norm": 0.25190266966819763, + "learning_rate": 9.89968348560079e-06, + "loss": 0.2034, + "step": 27104 + }, + { + "epoch": 0.5032247524348088, + "grad_norm": 0.41998064517974854, + "learning_rate": 9.898517054318744e-06, + "loss": 0.2548, + "step": 27106 + }, + { + "epoch": 0.5032618825722274, + "grad_norm": 0.40255624055862427, + "learning_rate": 9.897350624417577e-06, + "loss": 0.3753, + "step": 27108 + }, + { + "epoch": 0.503299012709646, + "grad_norm": 0.35159605741500854, + "learning_rate": 9.89618419591316e-06, + "loss": 0.2146, + "step": 27110 + }, + { + "epoch": 0.5033361428470646, + "grad_norm": 0.55653977394104, + "learning_rate": 9.895017768821366e-06, + "loss": 0.4111, + "step": 27112 + }, + { + "epoch": 0.5033732729844833, + "grad_norm": 0.493134468793869, + "learning_rate": 9.893851343158064e-06, + "loss": 0.1941, + "step": 27114 + }, + { + "epoch": 0.503410403121902, + "grad_norm": 0.3710097372531891, + "learning_rate": 9.892684918939133e-06, + "loss": 0.2622, + "step": 27116 + }, + { + "epoch": 0.5034475332593206, + "grad_norm": 0.35841497778892517, + "learning_rate": 9.891518496180436e-06, + "loss": 0.309, + "step": 27118 + }, + { + "epoch": 0.5034846633967393, + "grad_norm": 0.3774789571762085, + "learning_rate": 9.890352074897846e-06, + "loss": 0.3103, + "step": 27120 + }, + { + "epoch": 0.5035217935341578, + "grad_norm": 0.21137604117393494, + "learning_rate": 9.889185655107235e-06, + "loss": 0.2269, + "step": 27122 + }, + { + "epoch": 0.5035589236715765, + "grad_norm": 0.3149380683898926, + "learning_rate": 9.888019236824477e-06, + "loss": 0.198, + "step": 27124 + }, + { + "epoch": 0.5035960538089952, + "grad_norm": 0.3213143050670624, + "learning_rate": 9.88685282006544e-06, + "loss": 0.4271, + "step": 27126 + }, + { + "epoch": 0.5036331839464138, + "grad_norm": 0.3381570875644684, + "learning_rate": 9.885686404846002e-06, + "loss": 0.3979, + "step": 27128 + }, + { + "epoch": 0.5036703140838324, + "grad_norm": 0.47645604610443115, + "learning_rate": 9.884519991182028e-06, + "loss": 0.2976, + "step": 27130 + }, + { + "epoch": 0.503707444221251, + "grad_norm": 0.4178716540336609, + "learning_rate": 9.883353579089388e-06, + "loss": 0.3564, + "step": 27132 + }, + { + "epoch": 0.5037445743586697, + "grad_norm": 0.40592581033706665, + "learning_rate": 9.882187168583957e-06, + "loss": 0.2043, + "step": 27134 + }, + { + "epoch": 0.5037817044960884, + "grad_norm": 0.5035300254821777, + "learning_rate": 9.881020759681604e-06, + "loss": 0.2132, + "step": 27136 + }, + { + "epoch": 0.503818834633507, + "grad_norm": 0.32951027154922485, + "learning_rate": 9.879854352398206e-06, + "loss": 0.2364, + "step": 27138 + }, + { + "epoch": 0.5038559647709256, + "grad_norm": 0.39500245451927185, + "learning_rate": 9.878687946749628e-06, + "loss": 0.4651, + "step": 27140 + }, + { + "epoch": 0.5038930949083442, + "grad_norm": 0.4188745319843292, + "learning_rate": 9.877521542751747e-06, + "loss": 0.3062, + "step": 27142 + }, + { + "epoch": 0.5039302250457629, + "grad_norm": 0.39757540822029114, + "learning_rate": 9.876355140420429e-06, + "loss": 0.3237, + "step": 27144 + }, + { + "epoch": 0.5039673551831816, + "grad_norm": 0.2403126060962677, + "learning_rate": 9.875188739771544e-06, + "loss": 0.1516, + "step": 27146 + }, + { + "epoch": 0.5040044853206002, + "grad_norm": 0.3626934885978699, + "learning_rate": 9.87402234082097e-06, + "loss": 0.084, + "step": 27148 + }, + { + "epoch": 0.5040416154580188, + "grad_norm": 0.4089568257331848, + "learning_rate": 9.872855943584575e-06, + "loss": 0.1532, + "step": 27150 + }, + { + "epoch": 0.5040787455954374, + "grad_norm": 0.3505588471889496, + "learning_rate": 9.871689548078226e-06, + "loss": 0.2182, + "step": 27152 + }, + { + "epoch": 0.5041158757328561, + "grad_norm": 0.4032585322856903, + "learning_rate": 9.870523154317803e-06, + "loss": 0.1941, + "step": 27154 + }, + { + "epoch": 0.5041530058702747, + "grad_norm": 0.37027329206466675, + "learning_rate": 9.869356762319168e-06, + "loss": 0.306, + "step": 27156 + }, + { + "epoch": 0.5041901360076934, + "grad_norm": 0.4290004074573517, + "learning_rate": 9.868190372098198e-06, + "loss": 0.3245, + "step": 27158 + }, + { + "epoch": 0.504227266145112, + "grad_norm": 0.9687259197235107, + "learning_rate": 9.867023983670761e-06, + "loss": 0.2616, + "step": 27160 + }, + { + "epoch": 0.5042643962825306, + "grad_norm": 0.4411226511001587, + "learning_rate": 9.86585759705273e-06, + "loss": 0.3786, + "step": 27162 + }, + { + "epoch": 0.5043015264199493, + "grad_norm": 0.38836535811424255, + "learning_rate": 9.864691212259974e-06, + "loss": 0.2314, + "step": 27164 + }, + { + "epoch": 0.5043386565573679, + "grad_norm": 0.3700661063194275, + "learning_rate": 9.86352482930837e-06, + "loss": 0.389, + "step": 27166 + }, + { + "epoch": 0.5043757866947866, + "grad_norm": 0.3821013569831848, + "learning_rate": 9.86235844821378e-06, + "loss": 0.4283, + "step": 27168 + }, + { + "epoch": 0.5044129168322052, + "grad_norm": 0.3961995542049408, + "learning_rate": 9.86119206899208e-06, + "loss": 0.2786, + "step": 27170 + }, + { + "epoch": 0.5044500469696238, + "grad_norm": 0.28907904028892517, + "learning_rate": 9.860025691659141e-06, + "loss": 0.396, + "step": 27172 + }, + { + "epoch": 0.5044871771070425, + "grad_norm": 0.2272331863641739, + "learning_rate": 9.858859316230831e-06, + "loss": 0.3285, + "step": 27174 + }, + { + "epoch": 0.5045243072444611, + "grad_norm": 0.29686227440834045, + "learning_rate": 9.857692942723024e-06, + "loss": 0.3677, + "step": 27176 + }, + { + "epoch": 0.5045614373818798, + "grad_norm": 0.31016772985458374, + "learning_rate": 9.856526571151593e-06, + "loss": 0.3477, + "step": 27178 + }, + { + "epoch": 0.5045985675192984, + "grad_norm": 0.49688124656677246, + "learning_rate": 9.8553602015324e-06, + "loss": 0.3045, + "step": 27180 + }, + { + "epoch": 0.504635697656717, + "grad_norm": 0.32326236367225647, + "learning_rate": 9.854193833881326e-06, + "loss": 0.4869, + "step": 27182 + }, + { + "epoch": 0.5046728277941357, + "grad_norm": 0.42438217997550964, + "learning_rate": 9.853027468214235e-06, + "loss": 0.2216, + "step": 27184 + }, + { + "epoch": 0.5047099579315543, + "grad_norm": 0.3244577944278717, + "learning_rate": 9.851861104546998e-06, + "loss": 0.2082, + "step": 27186 + }, + { + "epoch": 0.504747088068973, + "grad_norm": 0.4199690520763397, + "learning_rate": 9.850694742895488e-06, + "loss": 0.212, + "step": 27188 + }, + { + "epoch": 0.5047842182063916, + "grad_norm": 0.4289074242115021, + "learning_rate": 9.84952838327558e-06, + "loss": 0.286, + "step": 27190 + }, + { + "epoch": 0.5048213483438102, + "grad_norm": 0.3022020757198334, + "learning_rate": 9.848362025703138e-06, + "loss": 0.2173, + "step": 27192 + }, + { + "epoch": 0.5048584784812289, + "grad_norm": 0.39626890420913696, + "learning_rate": 9.847195670194036e-06, + "loss": 0.3482, + "step": 27194 + }, + { + "epoch": 0.5048956086186475, + "grad_norm": 0.31610792875289917, + "learning_rate": 9.846029316764138e-06, + "loss": 0.118, + "step": 27196 + }, + { + "epoch": 0.5049327387560661, + "grad_norm": 0.5148515105247498, + "learning_rate": 9.844862965429323e-06, + "loss": 0.2664, + "step": 27198 + }, + { + "epoch": 0.5049698688934848, + "grad_norm": 0.2193974256515503, + "learning_rate": 9.843696616205457e-06, + "loss": 0.0685, + "step": 27200 + }, + { + "epoch": 0.5050069990309034, + "grad_norm": 0.44844838976860046, + "learning_rate": 9.842530269108413e-06, + "loss": 0.3262, + "step": 27202 + }, + { + "epoch": 0.5050441291683221, + "grad_norm": 0.42538976669311523, + "learning_rate": 9.841363924154063e-06, + "loss": 0.4215, + "step": 27204 + }, + { + "epoch": 0.5050812593057407, + "grad_norm": 0.509914755821228, + "learning_rate": 9.840197581358273e-06, + "loss": 0.1893, + "step": 27206 + }, + { + "epoch": 0.5051183894431593, + "grad_norm": 0.6601515412330627, + "learning_rate": 9.839031240736913e-06, + "loss": 0.4558, + "step": 27208 + }, + { + "epoch": 0.5051555195805779, + "grad_norm": 0.48657745122909546, + "learning_rate": 9.837864902305856e-06, + "loss": 0.5459, + "step": 27210 + }, + { + "epoch": 0.5051926497179966, + "grad_norm": 0.30325713753700256, + "learning_rate": 9.836698566080972e-06, + "loss": 0.226, + "step": 27212 + }, + { + "epoch": 0.5052297798554153, + "grad_norm": 0.4423142075538635, + "learning_rate": 9.835532232078134e-06, + "loss": 0.1455, + "step": 27214 + }, + { + "epoch": 0.5052669099928339, + "grad_norm": 0.3947449028491974, + "learning_rate": 9.834365900313211e-06, + "loss": 0.268, + "step": 27216 + }, + { + "epoch": 0.5053040401302525, + "grad_norm": 0.4056461751461029, + "learning_rate": 9.833199570802069e-06, + "loss": 0.2889, + "step": 27218 + }, + { + "epoch": 0.5053411702676711, + "grad_norm": 0.3948806822299957, + "learning_rate": 9.832033243560579e-06, + "loss": 0.3232, + "step": 27220 + }, + { + "epoch": 0.5053783004050898, + "grad_norm": 0.34596171975135803, + "learning_rate": 9.830866918604615e-06, + "loss": 0.2442, + "step": 27222 + }, + { + "epoch": 0.5054154305425085, + "grad_norm": 0.3630673587322235, + "learning_rate": 9.829700595950047e-06, + "loss": 0.3623, + "step": 27224 + }, + { + "epoch": 0.505452560679927, + "grad_norm": 0.35214555263519287, + "learning_rate": 9.828534275612743e-06, + "loss": 0.2649, + "step": 27226 + }, + { + "epoch": 0.5054896908173457, + "grad_norm": 0.4082280695438385, + "learning_rate": 9.827367957608574e-06, + "loss": 0.0437, + "step": 27228 + }, + { + "epoch": 0.5055268209547643, + "grad_norm": 0.40199020504951477, + "learning_rate": 9.826201641953413e-06, + "loss": 0.2244, + "step": 27230 + }, + { + "epoch": 0.505563951092183, + "grad_norm": 0.5142661929130554, + "learning_rate": 9.825035328663123e-06, + "loss": 0.2809, + "step": 27232 + }, + { + "epoch": 0.5056010812296017, + "grad_norm": 0.4034191966056824, + "learning_rate": 9.823869017753578e-06, + "loss": 0.4427, + "step": 27234 + }, + { + "epoch": 0.5056382113670203, + "grad_norm": 0.47163230180740356, + "learning_rate": 9.822702709240652e-06, + "loss": 0.4074, + "step": 27236 + }, + { + "epoch": 0.5056753415044389, + "grad_norm": 0.3310648202896118, + "learning_rate": 9.821536403140206e-06, + "loss": 0.3334, + "step": 27238 + }, + { + "epoch": 0.5057124716418575, + "grad_norm": 0.4580729603767395, + "learning_rate": 9.820370099468115e-06, + "loss": 0.2925, + "step": 27240 + }, + { + "epoch": 0.5057496017792762, + "grad_norm": 0.4034956097602844, + "learning_rate": 9.819203798240257e-06, + "loss": 0.3419, + "step": 27242 + }, + { + "epoch": 0.5057867319166949, + "grad_norm": 0.35212817788124084, + "learning_rate": 9.818037499472486e-06, + "loss": 0.2904, + "step": 27244 + }, + { + "epoch": 0.5058238620541134, + "grad_norm": 0.35502949357032776, + "learning_rate": 9.816871203180683e-06, + "loss": 0.4541, + "step": 27246 + }, + { + "epoch": 0.5058609921915321, + "grad_norm": 0.3449929654598236, + "learning_rate": 9.815704909380712e-06, + "loss": 0.2485, + "step": 27248 + }, + { + "epoch": 0.5058981223289507, + "grad_norm": 0.31588485836982727, + "learning_rate": 9.814538618088445e-06, + "loss": 0.344, + "step": 27250 + }, + { + "epoch": 0.5059352524663694, + "grad_norm": 0.2900664210319519, + "learning_rate": 9.813372329319752e-06, + "loss": 0.186, + "step": 27252 + }, + { + "epoch": 0.505972382603788, + "grad_norm": 0.3932534158229828, + "learning_rate": 9.812206043090508e-06, + "loss": 0.2604, + "step": 27254 + }, + { + "epoch": 0.5060095127412066, + "grad_norm": 0.4577184319496155, + "learning_rate": 9.811039759416572e-06, + "loss": 0.0955, + "step": 27256 + }, + { + "epoch": 0.5060466428786253, + "grad_norm": 0.47477906942367554, + "learning_rate": 9.80987347831382e-06, + "loss": 0.4599, + "step": 27258 + }, + { + "epoch": 0.5060837730160439, + "grad_norm": 0.4280111789703369, + "learning_rate": 9.80870719979812e-06, + "loss": 0.3104, + "step": 27260 + }, + { + "epoch": 0.5061209031534626, + "grad_norm": 0.4136579930782318, + "learning_rate": 9.807540923885341e-06, + "loss": 0.3004, + "step": 27262 + }, + { + "epoch": 0.5061580332908812, + "grad_norm": 0.23737718164920807, + "learning_rate": 9.806374650591353e-06, + "loss": 0.1982, + "step": 27264 + }, + { + "epoch": 0.5061951634282998, + "grad_norm": 0.44319865107536316, + "learning_rate": 9.80520837993203e-06, + "loss": 0.161, + "step": 27266 + }, + { + "epoch": 0.5062322935657185, + "grad_norm": 0.43614593148231506, + "learning_rate": 9.804042111923238e-06, + "loss": 0.2234, + "step": 27268 + }, + { + "epoch": 0.5062694237031371, + "grad_norm": 0.29607024788856506, + "learning_rate": 9.802875846580842e-06, + "loss": 0.2625, + "step": 27270 + }, + { + "epoch": 0.5063065538405558, + "grad_norm": 0.4026789963245392, + "learning_rate": 9.801709583920717e-06, + "loss": 0.3906, + "step": 27272 + }, + { + "epoch": 0.5063436839779744, + "grad_norm": 0.5164517760276794, + "learning_rate": 9.800543323958728e-06, + "loss": 0.1882, + "step": 27274 + }, + { + "epoch": 0.506380814115393, + "grad_norm": 0.40611159801483154, + "learning_rate": 9.79937706671075e-06, + "loss": 0.1818, + "step": 27276 + }, + { + "epoch": 0.5064179442528117, + "grad_norm": 0.3482772409915924, + "learning_rate": 9.798210812192649e-06, + "loss": 0.2479, + "step": 27278 + }, + { + "epoch": 0.5064550743902303, + "grad_norm": 0.3575907349586487, + "learning_rate": 9.797044560420296e-06, + "loss": 0.3033, + "step": 27280 + }, + { + "epoch": 0.506492204527649, + "grad_norm": 0.3511286973953247, + "learning_rate": 9.795878311409554e-06, + "loss": 0.1738, + "step": 27282 + }, + { + "epoch": 0.5065293346650676, + "grad_norm": 0.48284628987312317, + "learning_rate": 9.7947120651763e-06, + "loss": 0.4909, + "step": 27284 + }, + { + "epoch": 0.5065664648024862, + "grad_norm": 0.37745344638824463, + "learning_rate": 9.793545821736396e-06, + "loss": 0.3169, + "step": 27286 + }, + { + "epoch": 0.5066035949399049, + "grad_norm": 0.21041035652160645, + "learning_rate": 9.792379581105719e-06, + "loss": 0.2035, + "step": 27288 + }, + { + "epoch": 0.5066407250773235, + "grad_norm": 0.34612002968788147, + "learning_rate": 9.791213343300132e-06, + "loss": 0.3758, + "step": 27290 + }, + { + "epoch": 0.5066778552147422, + "grad_norm": 0.40188127756118774, + "learning_rate": 9.79004710833551e-06, + "loss": 0.2681, + "step": 27292 + }, + { + "epoch": 0.5067149853521608, + "grad_norm": 0.39686840772628784, + "learning_rate": 9.788880876227714e-06, + "loss": 0.0715, + "step": 27294 + }, + { + "epoch": 0.5067521154895794, + "grad_norm": 0.6442857384681702, + "learning_rate": 9.787714646992618e-06, + "loss": 0.0954, + "step": 27296 + }, + { + "epoch": 0.5067892456269981, + "grad_norm": 0.49370160698890686, + "learning_rate": 9.78654842064609e-06, + "loss": 0.1756, + "step": 27298 + }, + { + "epoch": 0.5068263757644167, + "grad_norm": 0.2876185476779938, + "learning_rate": 9.785382197203997e-06, + "loss": 0.3837, + "step": 27300 + }, + { + "epoch": 0.5068635059018354, + "grad_norm": 0.2753759026527405, + "learning_rate": 9.78421597668221e-06, + "loss": 0.3236, + "step": 27302 + }, + { + "epoch": 0.506900636039254, + "grad_norm": 0.2751653790473938, + "learning_rate": 9.7830497590966e-06, + "loss": 0.5996, + "step": 27304 + }, + { + "epoch": 0.5069377661766726, + "grad_norm": 0.4358898997306824, + "learning_rate": 9.781883544463031e-06, + "loss": 0.2122, + "step": 27306 + }, + { + "epoch": 0.5069748963140912, + "grad_norm": 0.3494413495063782, + "learning_rate": 9.780717332797372e-06, + "loss": 0.3515, + "step": 27308 + }, + { + "epoch": 0.5070120264515099, + "grad_norm": 0.3184601068496704, + "learning_rate": 9.779551124115497e-06, + "loss": 0.2073, + "step": 27310 + }, + { + "epoch": 0.5070491565889286, + "grad_norm": 0.4273928105831146, + "learning_rate": 9.778384918433267e-06, + "loss": 0.2541, + "step": 27312 + }, + { + "epoch": 0.5070862867263471, + "grad_norm": 0.4476768672466278, + "learning_rate": 9.777218715766555e-06, + "loss": 0.308, + "step": 27314 + }, + { + "epoch": 0.5071234168637658, + "grad_norm": 0.3439481556415558, + "learning_rate": 9.776052516131229e-06, + "loss": 0.6137, + "step": 27316 + }, + { + "epoch": 0.5071605470011844, + "grad_norm": 0.4766983389854431, + "learning_rate": 9.774886319543161e-06, + "loss": 0.304, + "step": 27318 + }, + { + "epoch": 0.5071976771386031, + "grad_norm": 0.36192506551742554, + "learning_rate": 9.773720126018212e-06, + "loss": 0.2094, + "step": 27320 + }, + { + "epoch": 0.5072348072760218, + "grad_norm": 0.31431835889816284, + "learning_rate": 9.772553935572258e-06, + "loss": 0.3545, + "step": 27322 + }, + { + "epoch": 0.5072719374134403, + "grad_norm": 0.31274521350860596, + "learning_rate": 9.771387748221159e-06, + "loss": 0.3507, + "step": 27324 + }, + { + "epoch": 0.507309067550859, + "grad_norm": 0.42054906487464905, + "learning_rate": 9.77022156398079e-06, + "loss": 0.5935, + "step": 27326 + }, + { + "epoch": 0.5073461976882776, + "grad_norm": 0.4115454852581024, + "learning_rate": 9.769055382867018e-06, + "loss": 0.3246, + "step": 27328 + }, + { + "epoch": 0.5073833278256963, + "grad_norm": 0.39358383417129517, + "learning_rate": 9.767889204895711e-06, + "loss": 0.1866, + "step": 27330 + }, + { + "epoch": 0.507420457963115, + "grad_norm": 0.43749693036079407, + "learning_rate": 9.766723030082738e-06, + "loss": 0.2156, + "step": 27332 + }, + { + "epoch": 0.5074575881005335, + "grad_norm": 0.39062029123306274, + "learning_rate": 9.765556858443961e-06, + "loss": 0.3841, + "step": 27334 + }, + { + "epoch": 0.5074947182379522, + "grad_norm": 0.4628467261791229, + "learning_rate": 9.764390689995255e-06, + "loss": 0.2981, + "step": 27336 + }, + { + "epoch": 0.5075318483753708, + "grad_norm": 0.5927623510360718, + "learning_rate": 9.763224524752487e-06, + "loss": 0.4069, + "step": 27338 + }, + { + "epoch": 0.5075689785127895, + "grad_norm": 0.1744084656238556, + "learning_rate": 9.762058362731524e-06, + "loss": 0.4431, + "step": 27340 + }, + { + "epoch": 0.5076061086502082, + "grad_norm": 0.41308119893074036, + "learning_rate": 9.760892203948234e-06, + "loss": 0.2354, + "step": 27342 + }, + { + "epoch": 0.5076432387876267, + "grad_norm": 0.390735387802124, + "learning_rate": 9.759726048418485e-06, + "loss": 0.3865, + "step": 27344 + }, + { + "epoch": 0.5076803689250454, + "grad_norm": 0.3669309616088867, + "learning_rate": 9.758559896158142e-06, + "loss": 0.3652, + "step": 27346 + }, + { + "epoch": 0.507717499062464, + "grad_norm": 0.40453261137008667, + "learning_rate": 9.757393747183077e-06, + "loss": 0.1849, + "step": 27348 + }, + { + "epoch": 0.5077546291998827, + "grad_norm": 0.5804651975631714, + "learning_rate": 9.756227601509157e-06, + "loss": 0.2931, + "step": 27350 + }, + { + "epoch": 0.5077917593373014, + "grad_norm": 0.42025619745254517, + "learning_rate": 9.75506145915225e-06, + "loss": 0.4583, + "step": 27352 + }, + { + "epoch": 0.5078288894747199, + "grad_norm": 0.41886240243911743, + "learning_rate": 9.753895320128221e-06, + "loss": 0.2414, + "step": 27354 + }, + { + "epoch": 0.5078660196121386, + "grad_norm": 0.3709939420223236, + "learning_rate": 9.752729184452944e-06, + "loss": 0.1909, + "step": 27356 + }, + { + "epoch": 0.5079031497495572, + "grad_norm": 1.063105583190918, + "learning_rate": 9.751563052142278e-06, + "loss": 0.1721, + "step": 27358 + }, + { + "epoch": 0.5079402798869759, + "grad_norm": 0.492556631565094, + "learning_rate": 9.750396923212093e-06, + "loss": 0.338, + "step": 27360 + }, + { + "epoch": 0.5079774100243944, + "grad_norm": 0.25917309522628784, + "learning_rate": 9.749230797678264e-06, + "loss": 0.1779, + "step": 27362 + }, + { + "epoch": 0.5080145401618131, + "grad_norm": 0.30102917551994324, + "learning_rate": 9.748064675556647e-06, + "loss": 0.2104, + "step": 27364 + }, + { + "epoch": 0.5080516702992318, + "grad_norm": 0.33802661299705505, + "learning_rate": 9.746898556863116e-06, + "loss": 0.3439, + "step": 27366 + }, + { + "epoch": 0.5080888004366504, + "grad_norm": 0.5920265316963196, + "learning_rate": 9.745732441613542e-06, + "loss": 0.2545, + "step": 27368 + }, + { + "epoch": 0.5081259305740691, + "grad_norm": 0.3891545534133911, + "learning_rate": 9.744566329823784e-06, + "loss": 0.3775, + "step": 27370 + }, + { + "epoch": 0.5081630607114876, + "grad_norm": 0.4400654435157776, + "learning_rate": 9.743400221509713e-06, + "loss": 0.3398, + "step": 27372 + }, + { + "epoch": 0.5082001908489063, + "grad_norm": 0.19824333488941193, + "learning_rate": 9.742234116687199e-06, + "loss": 0.3071, + "step": 27374 + }, + { + "epoch": 0.508237320986325, + "grad_norm": 0.28075167536735535, + "learning_rate": 9.741068015372104e-06, + "loss": 0.2828, + "step": 27376 + }, + { + "epoch": 0.5082744511237436, + "grad_norm": 0.5438506007194519, + "learning_rate": 9.739901917580298e-06, + "loss": 0.3065, + "step": 27378 + }, + { + "epoch": 0.5083115812611623, + "grad_norm": 0.30266237258911133, + "learning_rate": 9.738735823327652e-06, + "loss": 0.2322, + "step": 27380 + }, + { + "epoch": 0.5083487113985808, + "grad_norm": 0.43106696009635925, + "learning_rate": 9.737569732630023e-06, + "loss": 0.2476, + "step": 27382 + }, + { + "epoch": 0.5083858415359995, + "grad_norm": 0.36497265100479126, + "learning_rate": 9.73640364550329e-06, + "loss": 0.3701, + "step": 27384 + }, + { + "epoch": 0.5084229716734182, + "grad_norm": 0.34190189838409424, + "learning_rate": 9.735237561963307e-06, + "loss": 0.3332, + "step": 27386 + }, + { + "epoch": 0.5084601018108368, + "grad_norm": 0.37989184260368347, + "learning_rate": 9.73407148202595e-06, + "loss": 0.3233, + "step": 27388 + }, + { + "epoch": 0.5084972319482555, + "grad_norm": 0.2973982095718384, + "learning_rate": 9.732905405707082e-06, + "loss": 0.1501, + "step": 27390 + }, + { + "epoch": 0.508534362085674, + "grad_norm": 0.495017409324646, + "learning_rate": 9.731739333022576e-06, + "loss": 0.3307, + "step": 27392 + }, + { + "epoch": 0.5085714922230927, + "grad_norm": 0.3756818473339081, + "learning_rate": 9.730573263988293e-06, + "loss": 0.2262, + "step": 27394 + }, + { + "epoch": 0.5086086223605114, + "grad_norm": 0.2919599413871765, + "learning_rate": 9.729407198620101e-06, + "loss": 0.2101, + "step": 27396 + }, + { + "epoch": 0.50864575249793, + "grad_norm": 0.3531649112701416, + "learning_rate": 9.728241136933865e-06, + "loss": 0.2044, + "step": 27398 + }, + { + "epoch": 0.5086828826353487, + "grad_norm": 0.32702580094337463, + "learning_rate": 9.727075078945451e-06, + "loss": 0.2058, + "step": 27400 + }, + { + "epoch": 0.5087200127727672, + "grad_norm": 0.42790696024894714, + "learning_rate": 9.72590902467073e-06, + "loss": 0.2116, + "step": 27402 + }, + { + "epoch": 0.5087571429101859, + "grad_norm": 0.37379634380340576, + "learning_rate": 9.724742974125567e-06, + "loss": 0.2655, + "step": 27404 + }, + { + "epoch": 0.5087942730476045, + "grad_norm": 0.3516804575920105, + "learning_rate": 9.72357692732583e-06, + "loss": 0.3382, + "step": 27406 + }, + { + "epoch": 0.5088314031850232, + "grad_norm": 0.4283115863800049, + "learning_rate": 9.722410884287378e-06, + "loss": 0.1795, + "step": 27408 + }, + { + "epoch": 0.5088685333224419, + "grad_norm": 0.5059515237808228, + "learning_rate": 9.721244845026084e-06, + "loss": 0.5063, + "step": 27410 + }, + { + "epoch": 0.5089056634598604, + "grad_norm": 0.3887662887573242, + "learning_rate": 9.720078809557813e-06, + "loss": 0.3849, + "step": 27412 + }, + { + "epoch": 0.5089427935972791, + "grad_norm": 0.2702990472316742, + "learning_rate": 9.71891277789843e-06, + "loss": 0.2979, + "step": 27414 + }, + { + "epoch": 0.5089799237346977, + "grad_norm": 0.4402986466884613, + "learning_rate": 9.717746750063803e-06, + "loss": 0.317, + "step": 27416 + }, + { + "epoch": 0.5090170538721164, + "grad_norm": 0.42464402318000793, + "learning_rate": 9.716580726069801e-06, + "loss": 0.3432, + "step": 27418 + }, + { + "epoch": 0.5090541840095351, + "grad_norm": 0.5096556544303894, + "learning_rate": 9.715414705932281e-06, + "loss": 0.1925, + "step": 27420 + }, + { + "epoch": 0.5090913141469536, + "grad_norm": 0.21243000030517578, + "learning_rate": 9.714248689667115e-06, + "loss": 0.1497, + "step": 27422 + }, + { + "epoch": 0.5091284442843723, + "grad_norm": 0.27646562457084656, + "learning_rate": 9.713082677290168e-06, + "loss": 0.2693, + "step": 27424 + }, + { + "epoch": 0.5091655744217909, + "grad_norm": 0.3234902322292328, + "learning_rate": 9.71191666881731e-06, + "loss": 0.1154, + "step": 27426 + }, + { + "epoch": 0.5092027045592096, + "grad_norm": 0.46533554792404175, + "learning_rate": 9.7107506642644e-06, + "loss": 0.1844, + "step": 27428 + }, + { + "epoch": 0.5092398346966283, + "grad_norm": 0.3183642625808716, + "learning_rate": 9.709584663647306e-06, + "loss": 0.1446, + "step": 27430 + }, + { + "epoch": 0.5092769648340468, + "grad_norm": 0.42225557565689087, + "learning_rate": 9.7084186669819e-06, + "loss": 0.2338, + "step": 27432 + }, + { + "epoch": 0.5093140949714655, + "grad_norm": 0.4061495363712311, + "learning_rate": 9.70725267428404e-06, + "loss": 0.248, + "step": 27434 + }, + { + "epoch": 0.5093512251088841, + "grad_norm": 0.3309496343135834, + "learning_rate": 9.706086685569594e-06, + "loss": 0.3234, + "step": 27436 + }, + { + "epoch": 0.5093883552463028, + "grad_norm": 0.4352647662162781, + "learning_rate": 9.704920700854428e-06, + "loss": 0.3005, + "step": 27438 + }, + { + "epoch": 0.5094254853837215, + "grad_norm": 0.4708001911640167, + "learning_rate": 9.703754720154406e-06, + "loss": 0.2254, + "step": 27440 + }, + { + "epoch": 0.50946261552114, + "grad_norm": 0.34142231941223145, + "learning_rate": 9.702588743485394e-06, + "loss": 0.3171, + "step": 27442 + }, + { + "epoch": 0.5094997456585587, + "grad_norm": 0.43381214141845703, + "learning_rate": 9.701422770863264e-06, + "loss": 0.2035, + "step": 27444 + }, + { + "epoch": 0.5095368757959773, + "grad_norm": 0.3174418807029724, + "learning_rate": 9.70025680230387e-06, + "loss": 0.2195, + "step": 27446 + }, + { + "epoch": 0.509574005933396, + "grad_norm": 0.22408679127693176, + "learning_rate": 9.699090837823088e-06, + "loss": 0.1321, + "step": 27448 + }, + { + "epoch": 0.5096111360708147, + "grad_norm": 0.3952559530735016, + "learning_rate": 9.697924877436773e-06, + "loss": 0.1671, + "step": 27450 + }, + { + "epoch": 0.5096482662082332, + "grad_norm": 0.45117366313934326, + "learning_rate": 9.696758921160797e-06, + "loss": 0.3419, + "step": 27452 + }, + { + "epoch": 0.5096853963456519, + "grad_norm": 0.27644842863082886, + "learning_rate": 9.695592969011023e-06, + "loss": 0.1969, + "step": 27454 + }, + { + "epoch": 0.5097225264830705, + "grad_norm": 0.3339453637599945, + "learning_rate": 9.694427021003322e-06, + "loss": 0.4069, + "step": 27456 + }, + { + "epoch": 0.5097596566204892, + "grad_norm": 0.36425602436065674, + "learning_rate": 9.69326107715355e-06, + "loss": 0.2779, + "step": 27458 + }, + { + "epoch": 0.5097967867579077, + "grad_norm": 0.34550178050994873, + "learning_rate": 9.692095137477579e-06, + "loss": 0.2057, + "step": 27460 + }, + { + "epoch": 0.5098339168953264, + "grad_norm": 0.3911181390285492, + "learning_rate": 9.690929201991268e-06, + "loss": 0.2168, + "step": 27462 + }, + { + "epoch": 0.5098710470327451, + "grad_norm": 0.3915248513221741, + "learning_rate": 9.689763270710484e-06, + "loss": 0.3805, + "step": 27464 + }, + { + "epoch": 0.5099081771701637, + "grad_norm": 0.3424127697944641, + "learning_rate": 9.688597343651093e-06, + "loss": 0.3677, + "step": 27466 + }, + { + "epoch": 0.5099453073075824, + "grad_norm": 0.2904253304004669, + "learning_rate": 9.687431420828963e-06, + "loss": 0.285, + "step": 27468 + }, + { + "epoch": 0.5099824374450009, + "grad_norm": 0.3368496894836426, + "learning_rate": 9.686265502259953e-06, + "loss": 0.2684, + "step": 27470 + }, + { + "epoch": 0.5100195675824196, + "grad_norm": 0.2652243971824646, + "learning_rate": 9.685099587959928e-06, + "loss": 0.3147, + "step": 27472 + }, + { + "epoch": 0.5100566977198383, + "grad_norm": 0.6277669072151184, + "learning_rate": 9.683933677944755e-06, + "loss": 0.1486, + "step": 27474 + }, + { + "epoch": 0.5100938278572569, + "grad_norm": 0.2805199921131134, + "learning_rate": 9.682767772230297e-06, + "loss": 0.34, + "step": 27476 + }, + { + "epoch": 0.5101309579946756, + "grad_norm": 0.3326171338558197, + "learning_rate": 9.681601870832422e-06, + "loss": 0.208, + "step": 27478 + }, + { + "epoch": 0.5101680881320941, + "grad_norm": 0.2681579291820526, + "learning_rate": 9.680435973766991e-06, + "loss": 0.4093, + "step": 27480 + }, + { + "epoch": 0.5102052182695128, + "grad_norm": 0.25577929615974426, + "learning_rate": 9.679270081049872e-06, + "loss": 0.1657, + "step": 27482 + }, + { + "epoch": 0.5102423484069315, + "grad_norm": 0.24016334116458893, + "learning_rate": 9.678104192696921e-06, + "loss": 0.1867, + "step": 27484 + }, + { + "epoch": 0.5102794785443501, + "grad_norm": 0.3700762987136841, + "learning_rate": 9.67693830872401e-06, + "loss": 0.1944, + "step": 27486 + }, + { + "epoch": 0.5103166086817688, + "grad_norm": 0.32547080516815186, + "learning_rate": 9.675772429147e-06, + "loss": 0.419, + "step": 27488 + }, + { + "epoch": 0.5103537388191873, + "grad_norm": 0.5849716067314148, + "learning_rate": 9.674606553981759e-06, + "loss": 0.3567, + "step": 27490 + }, + { + "epoch": 0.510390868956606, + "grad_norm": 0.2290562093257904, + "learning_rate": 9.673440683244144e-06, + "loss": 0.2867, + "step": 27492 + }, + { + "epoch": 0.5104279990940247, + "grad_norm": 0.343606173992157, + "learning_rate": 9.67227481695003e-06, + "loss": 0.3108, + "step": 27494 + }, + { + "epoch": 0.5104651292314433, + "grad_norm": 0.5379561185836792, + "learning_rate": 9.671108955115268e-06, + "loss": 0.235, + "step": 27496 + }, + { + "epoch": 0.510502259368862, + "grad_norm": 0.33252012729644775, + "learning_rate": 9.669943097755728e-06, + "loss": 0.511, + "step": 27498 + }, + { + "epoch": 0.5105393895062805, + "grad_norm": 0.45400112867355347, + "learning_rate": 9.668777244887276e-06, + "loss": 0.172, + "step": 27500 + }, + { + "epoch": 0.5105765196436992, + "grad_norm": 0.47489675879478455, + "learning_rate": 9.667611396525773e-06, + "loss": 0.2856, + "step": 27502 + }, + { + "epoch": 0.5106136497811179, + "grad_norm": 0.44201552867889404, + "learning_rate": 9.666445552687081e-06, + "loss": 0.4246, + "step": 27504 + }, + { + "epoch": 0.5106507799185365, + "grad_norm": 0.25042638182640076, + "learning_rate": 9.665279713387072e-06, + "loss": 0.2083, + "step": 27506 + }, + { + "epoch": 0.5106879100559552, + "grad_norm": 0.3039798438549042, + "learning_rate": 9.664113878641598e-06, + "loss": 0.3318, + "step": 27508 + }, + { + "epoch": 0.5107250401933737, + "grad_norm": 0.4846624732017517, + "learning_rate": 9.662948048466529e-06, + "loss": 0.3854, + "step": 27510 + }, + { + "epoch": 0.5107621703307924, + "grad_norm": 0.1738000065088272, + "learning_rate": 9.66178222287773e-06, + "loss": 0.1097, + "step": 27512 + }, + { + "epoch": 0.510799300468211, + "grad_norm": 0.26782912015914917, + "learning_rate": 9.660616401891057e-06, + "loss": 0.3062, + "step": 27514 + }, + { + "epoch": 0.5108364306056297, + "grad_norm": 0.3808671236038208, + "learning_rate": 9.659450585522382e-06, + "loss": 0.2983, + "step": 27516 + }, + { + "epoch": 0.5108735607430483, + "grad_norm": 0.3146984875202179, + "learning_rate": 9.658284773787562e-06, + "loss": 0.2566, + "step": 27518 + }, + { + "epoch": 0.5109106908804669, + "grad_norm": 0.5071689486503601, + "learning_rate": 9.657118966702467e-06, + "loss": 0.3832, + "step": 27520 + }, + { + "epoch": 0.5109478210178856, + "grad_norm": 0.37467461824417114, + "learning_rate": 9.655953164282953e-06, + "loss": 0.2528, + "step": 27522 + }, + { + "epoch": 0.5109849511553042, + "grad_norm": 0.5074592232704163, + "learning_rate": 9.654787366544887e-06, + "loss": 0.1873, + "step": 27524 + }, + { + "epoch": 0.5110220812927229, + "grad_norm": 0.34784483909606934, + "learning_rate": 9.653621573504129e-06, + "loss": 0.241, + "step": 27526 + }, + { + "epoch": 0.5110592114301415, + "grad_norm": 0.4454055726528168, + "learning_rate": 9.652455785176541e-06, + "loss": 0.3974, + "step": 27528 + }, + { + "epoch": 0.5110963415675601, + "grad_norm": 0.4718903601169586, + "learning_rate": 9.651290001577995e-06, + "loss": 0.2497, + "step": 27530 + }, + { + "epoch": 0.5111334717049788, + "grad_norm": 0.33056846261024475, + "learning_rate": 9.650124222724347e-06, + "loss": 0.4463, + "step": 27532 + }, + { + "epoch": 0.5111706018423974, + "grad_norm": 0.4470878839492798, + "learning_rate": 9.648958448631458e-06, + "loss": 0.2981, + "step": 27534 + }, + { + "epoch": 0.5112077319798161, + "grad_norm": 0.6056501269340515, + "learning_rate": 9.647792679315193e-06, + "loss": 0.3201, + "step": 27536 + }, + { + "epoch": 0.5112448621172347, + "grad_norm": 0.46294721961021423, + "learning_rate": 9.646626914791413e-06, + "loss": 0.3699, + "step": 27538 + }, + { + "epoch": 0.5112819922546533, + "grad_norm": 0.3775184750556946, + "learning_rate": 9.645461155075984e-06, + "loss": 0.3386, + "step": 27540 + }, + { + "epoch": 0.511319122392072, + "grad_norm": 0.23049336671829224, + "learning_rate": 9.64429540018477e-06, + "loss": 0.306, + "step": 27542 + }, + { + "epoch": 0.5113562525294906, + "grad_norm": 0.4777885973453522, + "learning_rate": 9.643129650133629e-06, + "loss": 0.3088, + "step": 27544 + }, + { + "epoch": 0.5113933826669093, + "grad_norm": 0.6331747770309448, + "learning_rate": 9.641963904938422e-06, + "loss": 0.1966, + "step": 27546 + }, + { + "epoch": 0.5114305128043279, + "grad_norm": 0.3435724973678589, + "learning_rate": 9.640798164615013e-06, + "loss": 0.503, + "step": 27548 + }, + { + "epoch": 0.5114676429417465, + "grad_norm": 0.3178273141384125, + "learning_rate": 9.639632429179266e-06, + "loss": 0.1723, + "step": 27550 + }, + { + "epoch": 0.5115047730791652, + "grad_norm": 0.41977745294570923, + "learning_rate": 9.638466698647044e-06, + "loss": 0.213, + "step": 27552 + }, + { + "epoch": 0.5115419032165838, + "grad_norm": 0.3878977298736572, + "learning_rate": 9.637300973034204e-06, + "loss": 0.272, + "step": 27554 + }, + { + "epoch": 0.5115790333540025, + "grad_norm": 0.3197470009326935, + "learning_rate": 9.636135252356614e-06, + "loss": 0.3322, + "step": 27556 + }, + { + "epoch": 0.511616163491421, + "grad_norm": 0.37615951895713806, + "learning_rate": 9.634969536630135e-06, + "loss": 0.1032, + "step": 27558 + }, + { + "epoch": 0.5116532936288397, + "grad_norm": 0.3494502007961273, + "learning_rate": 9.633803825870624e-06, + "loss": 0.3145, + "step": 27560 + }, + { + "epoch": 0.5116904237662584, + "grad_norm": 0.3623175024986267, + "learning_rate": 9.632638120093946e-06, + "loss": 0.2021, + "step": 27562 + }, + { + "epoch": 0.511727553903677, + "grad_norm": 0.3752327561378479, + "learning_rate": 9.631472419315965e-06, + "loss": 0.2877, + "step": 27564 + }, + { + "epoch": 0.5117646840410957, + "grad_norm": 0.36790701746940613, + "learning_rate": 9.630306723552536e-06, + "loss": 0.5292, + "step": 27566 + }, + { + "epoch": 0.5118018141785142, + "grad_norm": 0.30650147795677185, + "learning_rate": 9.62914103281953e-06, + "loss": 0.2868, + "step": 27568 + }, + { + "epoch": 0.5118389443159329, + "grad_norm": 0.2874961197376251, + "learning_rate": 9.627975347132804e-06, + "loss": 0.41, + "step": 27570 + }, + { + "epoch": 0.5118760744533516, + "grad_norm": 0.5278917551040649, + "learning_rate": 9.626809666508217e-06, + "loss": 0.2459, + "step": 27572 + }, + { + "epoch": 0.5119132045907702, + "grad_norm": 0.49385857582092285, + "learning_rate": 9.625643990961633e-06, + "loss": 0.1222, + "step": 27574 + }, + { + "epoch": 0.5119503347281888, + "grad_norm": 0.2153642475605011, + "learning_rate": 9.624478320508913e-06, + "loss": 0.2939, + "step": 27576 + }, + { + "epoch": 0.5119874648656074, + "grad_norm": 0.4303489923477173, + "learning_rate": 9.623312655165916e-06, + "loss": 0.4353, + "step": 27578 + }, + { + "epoch": 0.5120245950030261, + "grad_norm": 0.35985898971557617, + "learning_rate": 9.622146994948506e-06, + "loss": 0.1548, + "step": 27580 + }, + { + "epoch": 0.5120617251404448, + "grad_norm": 0.5189068913459778, + "learning_rate": 9.620981339872549e-06, + "loss": 0.2133, + "step": 27582 + }, + { + "epoch": 0.5120988552778634, + "grad_norm": 0.4007475674152374, + "learning_rate": 9.619815689953896e-06, + "loss": 0.3123, + "step": 27584 + }, + { + "epoch": 0.512135985415282, + "grad_norm": 0.3837631940841675, + "learning_rate": 9.618650045208415e-06, + "loss": 0.1508, + "step": 27586 + }, + { + "epoch": 0.5121731155527006, + "grad_norm": 0.43478071689605713, + "learning_rate": 9.617484405651961e-06, + "loss": 0.2845, + "step": 27588 + }, + { + "epoch": 0.5122102456901193, + "grad_norm": 0.437438428401947, + "learning_rate": 9.6163187713004e-06, + "loss": 0.3461, + "step": 27590 + }, + { + "epoch": 0.512247375827538, + "grad_norm": 0.3370918333530426, + "learning_rate": 9.615153142169592e-06, + "loss": 0.3641, + "step": 27592 + }, + { + "epoch": 0.5122845059649566, + "grad_norm": 0.32581931352615356, + "learning_rate": 9.613987518275399e-06, + "loss": 0.2029, + "step": 27594 + }, + { + "epoch": 0.5123216361023752, + "grad_norm": 1.029665231704712, + "learning_rate": 9.612821899633676e-06, + "loss": 0.1393, + "step": 27596 + }, + { + "epoch": 0.5123587662397938, + "grad_norm": 0.4986780285835266, + "learning_rate": 9.61165628626029e-06, + "loss": 0.363, + "step": 27598 + }, + { + "epoch": 0.5123958963772125, + "grad_norm": 0.35451340675354004, + "learning_rate": 9.610490678171096e-06, + "loss": 0.3287, + "step": 27600 + }, + { + "epoch": 0.5124330265146312, + "grad_norm": 0.46983006596565247, + "learning_rate": 9.609325075381958e-06, + "loss": 0.359, + "step": 27602 + }, + { + "epoch": 0.5124701566520498, + "grad_norm": 0.38885220885276794, + "learning_rate": 9.608159477908737e-06, + "loss": 0.3191, + "step": 27604 + }, + { + "epoch": 0.5125072867894684, + "grad_norm": 0.29006606340408325, + "learning_rate": 9.60699388576729e-06, + "loss": 0.2564, + "step": 27606 + }, + { + "epoch": 0.512544416926887, + "grad_norm": 0.4570528268814087, + "learning_rate": 9.605828298973483e-06, + "loss": 0.3213, + "step": 27608 + }, + { + "epoch": 0.5125815470643057, + "grad_norm": 0.38161566853523254, + "learning_rate": 9.604662717543169e-06, + "loss": 0.131, + "step": 27610 + }, + { + "epoch": 0.5126186772017243, + "grad_norm": 0.49640652537345886, + "learning_rate": 9.60349714149221e-06, + "loss": 0.3664, + "step": 27612 + }, + { + "epoch": 0.512655807339143, + "grad_norm": 0.3071663975715637, + "learning_rate": 9.602331570836467e-06, + "loss": 0.2587, + "step": 27614 + }, + { + "epoch": 0.5126929374765616, + "grad_norm": 0.34845954179763794, + "learning_rate": 9.601166005591802e-06, + "loss": 0.2551, + "step": 27616 + }, + { + "epoch": 0.5127300676139802, + "grad_norm": 0.3759823441505432, + "learning_rate": 9.600000445774073e-06, + "loss": 0.1983, + "step": 27618 + }, + { + "epoch": 0.5127671977513989, + "grad_norm": 0.3184506595134735, + "learning_rate": 9.598834891399142e-06, + "loss": 0.227, + "step": 27620 + }, + { + "epoch": 0.5128043278888175, + "grad_norm": 0.3116915225982666, + "learning_rate": 9.597669342482863e-06, + "loss": 0.2744, + "step": 27622 + }, + { + "epoch": 0.5128414580262362, + "grad_norm": 0.35930871963500977, + "learning_rate": 9.5965037990411e-06, + "loss": 0.2065, + "step": 27624 + }, + { + "epoch": 0.5128785881636548, + "grad_norm": 0.2619815170764923, + "learning_rate": 9.59533826108971e-06, + "loss": 0.3548, + "step": 27626 + }, + { + "epoch": 0.5129157183010734, + "grad_norm": 0.413163959980011, + "learning_rate": 9.594172728644557e-06, + "loss": 0.3184, + "step": 27628 + }, + { + "epoch": 0.5129528484384921, + "grad_norm": 0.4463177025318146, + "learning_rate": 9.593007201721496e-06, + "loss": 0.1993, + "step": 27630 + }, + { + "epoch": 0.5129899785759107, + "grad_norm": 0.5362995266914368, + "learning_rate": 9.591841680336391e-06, + "loss": 0.3493, + "step": 27632 + }, + { + "epoch": 0.5130271087133293, + "grad_norm": 0.30266591906547546, + "learning_rate": 9.590676164505095e-06, + "loss": 0.1993, + "step": 27634 + }, + { + "epoch": 0.513064238850748, + "grad_norm": 0.4228737950325012, + "learning_rate": 9.58951065424347e-06, + "loss": 0.3133, + "step": 27636 + }, + { + "epoch": 0.5131013689881666, + "grad_norm": 0.3493140637874603, + "learning_rate": 9.588345149567378e-06, + "loss": 0.323, + "step": 27638 + }, + { + "epoch": 0.5131384991255853, + "grad_norm": 0.5333438515663147, + "learning_rate": 9.587179650492672e-06, + "loss": 0.3137, + "step": 27640 + }, + { + "epoch": 0.5131756292630039, + "grad_norm": 0.3421265780925751, + "learning_rate": 9.586014157035215e-06, + "loss": 0.3101, + "step": 27642 + }, + { + "epoch": 0.5132127594004225, + "grad_norm": 0.49278348684310913, + "learning_rate": 9.584848669210865e-06, + "loss": 0.1394, + "step": 27644 + }, + { + "epoch": 0.5132498895378412, + "grad_norm": 0.4265742897987366, + "learning_rate": 9.583683187035486e-06, + "loss": 0.2435, + "step": 27646 + }, + { + "epoch": 0.5132870196752598, + "grad_norm": 0.4515606462955475, + "learning_rate": 9.582517710524928e-06, + "loss": 0.2817, + "step": 27648 + }, + { + "epoch": 0.5133241498126785, + "grad_norm": 0.2932735085487366, + "learning_rate": 9.581352239695055e-06, + "loss": 0.2149, + "step": 27650 + }, + { + "epoch": 0.5133612799500971, + "grad_norm": 0.32840389013290405, + "learning_rate": 9.580186774561722e-06, + "loss": 0.243, + "step": 27652 + }, + { + "epoch": 0.5133984100875157, + "grad_norm": 0.4383675158023834, + "learning_rate": 9.57902131514079e-06, + "loss": 0.221, + "step": 27654 + }, + { + "epoch": 0.5134355402249343, + "grad_norm": 0.35928624868392944, + "learning_rate": 9.577855861448115e-06, + "loss": 0.201, + "step": 27656 + }, + { + "epoch": 0.513472670362353, + "grad_norm": 0.49422237277030945, + "learning_rate": 9.576690413499564e-06, + "loss": 0.2742, + "step": 27658 + }, + { + "epoch": 0.5135098004997717, + "grad_norm": 0.36891844868659973, + "learning_rate": 9.575524971310986e-06, + "loss": 0.4366, + "step": 27660 + }, + { + "epoch": 0.5135469306371903, + "grad_norm": 0.29024386405944824, + "learning_rate": 9.574359534898237e-06, + "loss": 0.4735, + "step": 27662 + }, + { + "epoch": 0.5135840607746089, + "grad_norm": 0.35033005475997925, + "learning_rate": 9.573194104277183e-06, + "loss": 0.266, + "step": 27664 + }, + { + "epoch": 0.5136211909120275, + "grad_norm": 0.2778182029724121, + "learning_rate": 9.572028679463676e-06, + "loss": 0.2173, + "step": 27666 + }, + { + "epoch": 0.5136583210494462, + "grad_norm": 0.34723880887031555, + "learning_rate": 9.57086326047358e-06, + "loss": 0.318, + "step": 27668 + }, + { + "epoch": 0.5136954511868649, + "grad_norm": 0.5657864809036255, + "learning_rate": 9.56969784732275e-06, + "loss": 0.2148, + "step": 27670 + }, + { + "epoch": 0.5137325813242835, + "grad_norm": 0.4703945517539978, + "learning_rate": 9.568532440027044e-06, + "loss": 0.3285, + "step": 27672 + }, + { + "epoch": 0.5137697114617021, + "grad_norm": 0.23526544868946075, + "learning_rate": 9.567367038602316e-06, + "loss": 0.225, + "step": 27674 + }, + { + "epoch": 0.5138068415991207, + "grad_norm": 0.40574878454208374, + "learning_rate": 9.566201643064428e-06, + "loss": 0.3226, + "step": 27676 + }, + { + "epoch": 0.5138439717365394, + "grad_norm": 0.3196861147880554, + "learning_rate": 9.565036253429235e-06, + "loss": 0.2961, + "step": 27678 + }, + { + "epoch": 0.5138811018739581, + "grad_norm": 0.6547278761863708, + "learning_rate": 9.563870869712598e-06, + "loss": 0.3287, + "step": 27680 + }, + { + "epoch": 0.5139182320113767, + "grad_norm": 0.3820594847202301, + "learning_rate": 9.56270549193037e-06, + "loss": 0.2355, + "step": 27682 + }, + { + "epoch": 0.5139553621487953, + "grad_norm": 0.34088024497032166, + "learning_rate": 9.561540120098416e-06, + "loss": 0.2137, + "step": 27684 + }, + { + "epoch": 0.5139924922862139, + "grad_norm": 0.39357149600982666, + "learning_rate": 9.560374754232581e-06, + "loss": 0.1701, + "step": 27686 + }, + { + "epoch": 0.5140296224236326, + "grad_norm": 0.47195547819137573, + "learning_rate": 9.55920939434873e-06, + "loss": 0.3294, + "step": 27688 + }, + { + "epoch": 0.5140667525610513, + "grad_norm": 0.47095268964767456, + "learning_rate": 9.558044040462721e-06, + "loss": 0.2283, + "step": 27690 + }, + { + "epoch": 0.5141038826984698, + "grad_norm": 0.3605042099952698, + "learning_rate": 9.556878692590406e-06, + "loss": 0.3657, + "step": 27692 + }, + { + "epoch": 0.5141410128358885, + "grad_norm": 0.2524566352367401, + "learning_rate": 9.555713350747646e-06, + "loss": 0.3753, + "step": 27694 + }, + { + "epoch": 0.5141781429733071, + "grad_norm": 0.5168209075927734, + "learning_rate": 9.554548014950299e-06, + "loss": 0.1736, + "step": 27696 + }, + { + "epoch": 0.5142152731107258, + "grad_norm": 0.6899694204330444, + "learning_rate": 9.553382685214216e-06, + "loss": 0.2238, + "step": 27698 + }, + { + "epoch": 0.5142524032481445, + "grad_norm": 0.25912654399871826, + "learning_rate": 9.552217361555258e-06, + "loss": 0.3107, + "step": 27700 + }, + { + "epoch": 0.514289533385563, + "grad_norm": 0.35134774446487427, + "learning_rate": 9.55105204398928e-06, + "loss": 0.3746, + "step": 27702 + }, + { + "epoch": 0.5143266635229817, + "grad_norm": 0.47562161087989807, + "learning_rate": 9.54988673253214e-06, + "loss": 0.1151, + "step": 27704 + }, + { + "epoch": 0.5143637936604003, + "grad_norm": 0.4155772924423218, + "learning_rate": 9.54872142719969e-06, + "loss": 0.2649, + "step": 27706 + }, + { + "epoch": 0.514400923797819, + "grad_norm": 0.4811541438102722, + "learning_rate": 9.547556128007796e-06, + "loss": 0.3068, + "step": 27708 + }, + { + "epoch": 0.5144380539352376, + "grad_norm": 0.5299921035766602, + "learning_rate": 9.546390834972303e-06, + "loss": 0.1566, + "step": 27710 + }, + { + "epoch": 0.5144751840726562, + "grad_norm": 0.4587494730949402, + "learning_rate": 9.545225548109074e-06, + "loss": 0.339, + "step": 27712 + }, + { + "epoch": 0.5145123142100749, + "grad_norm": 0.2998519837856293, + "learning_rate": 9.54406026743396e-06, + "loss": 0.1944, + "step": 27714 + }, + { + "epoch": 0.5145494443474935, + "grad_norm": 0.5208114385604858, + "learning_rate": 9.542894992962821e-06, + "loss": 0.5225, + "step": 27716 + }, + { + "epoch": 0.5145865744849122, + "grad_norm": 0.33112192153930664, + "learning_rate": 9.541729724711513e-06, + "loss": 0.3084, + "step": 27718 + }, + { + "epoch": 0.5146237046223308, + "grad_norm": 0.4124448895454407, + "learning_rate": 9.54056446269589e-06, + "loss": 0.2268, + "step": 27720 + }, + { + "epoch": 0.5146608347597494, + "grad_norm": 0.7226142883300781, + "learning_rate": 9.539399206931812e-06, + "loss": 0.3854, + "step": 27722 + }, + { + "epoch": 0.5146979648971681, + "grad_norm": 0.34287604689598083, + "learning_rate": 9.53823395743513e-06, + "loss": 0.3085, + "step": 27724 + }, + { + "epoch": 0.5147350950345867, + "grad_norm": 0.3313058316707611, + "learning_rate": 9.537068714221698e-06, + "loss": 0.2894, + "step": 27726 + }, + { + "epoch": 0.5147722251720054, + "grad_norm": 0.39880281686782837, + "learning_rate": 9.535903477307375e-06, + "loss": 0.3512, + "step": 27728 + }, + { + "epoch": 0.514809355309424, + "grad_norm": 0.35275503993034363, + "learning_rate": 9.534738246708013e-06, + "loss": 0.2703, + "step": 27730 + }, + { + "epoch": 0.5148464854468426, + "grad_norm": 0.4860374331474304, + "learning_rate": 9.533573022439476e-06, + "loss": 0.222, + "step": 27732 + }, + { + "epoch": 0.5148836155842613, + "grad_norm": 0.5078479647636414, + "learning_rate": 9.53240780451761e-06, + "loss": 0.2027, + "step": 27734 + }, + { + "epoch": 0.5149207457216799, + "grad_norm": 0.25618186593055725, + "learning_rate": 9.531242592958274e-06, + "loss": 0.1937, + "step": 27736 + }, + { + "epoch": 0.5149578758590986, + "grad_norm": 0.3207758963108063, + "learning_rate": 9.53007738777732e-06, + "loss": 0.2233, + "step": 27738 + }, + { + "epoch": 0.5149950059965172, + "grad_norm": 0.2983584403991699, + "learning_rate": 9.528912188990605e-06, + "loss": 0.3669, + "step": 27740 + }, + { + "epoch": 0.5150321361339358, + "grad_norm": 0.4820159375667572, + "learning_rate": 9.527746996613985e-06, + "loss": 0.3109, + "step": 27742 + }, + { + "epoch": 0.5150692662713545, + "grad_norm": 0.3552098870277405, + "learning_rate": 9.526581810663315e-06, + "loss": 0.2484, + "step": 27744 + }, + { + "epoch": 0.5151063964087731, + "grad_norm": 0.45764485001564026, + "learning_rate": 9.525416631154452e-06, + "loss": 0.3487, + "step": 27746 + }, + { + "epoch": 0.5151435265461918, + "grad_norm": 0.41365164518356323, + "learning_rate": 9.524251458103242e-06, + "loss": 0.2458, + "step": 27748 + }, + { + "epoch": 0.5151806566836103, + "grad_norm": 0.3427340090274811, + "learning_rate": 9.523086291525544e-06, + "loss": 0.1638, + "step": 27750 + }, + { + "epoch": 0.515217786821029, + "grad_norm": 0.45375701785087585, + "learning_rate": 9.521921131437213e-06, + "loss": 0.4803, + "step": 27752 + }, + { + "epoch": 0.5152549169584477, + "grad_norm": 0.6443138718605042, + "learning_rate": 9.520755977854107e-06, + "loss": 0.3183, + "step": 27754 + }, + { + "epoch": 0.5152920470958663, + "grad_norm": 0.33075985312461853, + "learning_rate": 9.519590830792072e-06, + "loss": 0.4141, + "step": 27756 + }, + { + "epoch": 0.515329177233285, + "grad_norm": 0.3032643496990204, + "learning_rate": 9.518425690266973e-06, + "loss": 0.1921, + "step": 27758 + }, + { + "epoch": 0.5153663073707035, + "grad_norm": 0.29809653759002686, + "learning_rate": 9.517260556294655e-06, + "loss": 0.3433, + "step": 27760 + }, + { + "epoch": 0.5154034375081222, + "grad_norm": 0.8759490251541138, + "learning_rate": 9.516095428890972e-06, + "loss": 0.1899, + "step": 27762 + }, + { + "epoch": 0.5154405676455408, + "grad_norm": 0.42079952359199524, + "learning_rate": 9.514930308071781e-06, + "loss": 0.2934, + "step": 27764 + }, + { + "epoch": 0.5154776977829595, + "grad_norm": 0.4169912338256836, + "learning_rate": 9.513765193852939e-06, + "loss": 0.1118, + "step": 27766 + }, + { + "epoch": 0.5155148279203782, + "grad_norm": 0.3454352915287018, + "learning_rate": 9.512600086250295e-06, + "loss": 0.375, + "step": 27768 + }, + { + "epoch": 0.5155519580577967, + "grad_norm": 0.43056464195251465, + "learning_rate": 9.511434985279702e-06, + "loss": 0.3636, + "step": 27770 + }, + { + "epoch": 0.5155890881952154, + "grad_norm": 0.3244856297969818, + "learning_rate": 9.51026989095702e-06, + "loss": 0.3136, + "step": 27772 + }, + { + "epoch": 0.515626218332634, + "grad_norm": 0.3567945659160614, + "learning_rate": 9.509104803298095e-06, + "loss": 0.2152, + "step": 27774 + }, + { + "epoch": 0.5156633484700527, + "grad_norm": 0.4030345678329468, + "learning_rate": 9.507939722318783e-06, + "loss": 0.2069, + "step": 27776 + }, + { + "epoch": 0.5157004786074714, + "grad_norm": 0.3154042065143585, + "learning_rate": 9.506774648034937e-06, + "loss": 0.2484, + "step": 27778 + }, + { + "epoch": 0.5157376087448899, + "grad_norm": 0.3310115337371826, + "learning_rate": 9.50560958046241e-06, + "loss": 0.2657, + "step": 27780 + }, + { + "epoch": 0.5157747388823086, + "grad_norm": 0.3910242021083832, + "learning_rate": 9.504444519617056e-06, + "loss": 0.2425, + "step": 27782 + }, + { + "epoch": 0.5158118690197272, + "grad_norm": 0.47065800428390503, + "learning_rate": 9.503279465514732e-06, + "loss": 0.2677, + "step": 27784 + }, + { + "epoch": 0.5158489991571459, + "grad_norm": 0.49375393986701965, + "learning_rate": 9.502114418171282e-06, + "loss": 0.2887, + "step": 27786 + }, + { + "epoch": 0.5158861292945646, + "grad_norm": 0.302882581949234, + "learning_rate": 9.500949377602565e-06, + "loss": 0.3542, + "step": 27788 + }, + { + "epoch": 0.5159232594319831, + "grad_norm": 0.28566449880599976, + "learning_rate": 9.499784343824432e-06, + "loss": 0.2946, + "step": 27790 + }, + { + "epoch": 0.5159603895694018, + "grad_norm": 0.31632503867149353, + "learning_rate": 9.498619316852735e-06, + "loss": 0.4188, + "step": 27792 + }, + { + "epoch": 0.5159975197068204, + "grad_norm": 0.2564805746078491, + "learning_rate": 9.497454296703325e-06, + "loss": 0.1568, + "step": 27794 + }, + { + "epoch": 0.5160346498442391, + "grad_norm": 0.4565523564815521, + "learning_rate": 9.496289283392065e-06, + "loss": 0.3079, + "step": 27796 + }, + { + "epoch": 0.5160717799816578, + "grad_norm": 0.38501212000846863, + "learning_rate": 9.495124276934794e-06, + "loss": 0.1923, + "step": 27798 + }, + { + "epoch": 0.5161089101190763, + "grad_norm": 0.4378299117088318, + "learning_rate": 9.493959277347368e-06, + "loss": 0.1228, + "step": 27800 + }, + { + "epoch": 0.516146040256495, + "grad_norm": 0.5729774236679077, + "learning_rate": 9.49279428464564e-06, + "loss": 0.4596, + "step": 27802 + }, + { + "epoch": 0.5161831703939136, + "grad_norm": 0.3421713709831238, + "learning_rate": 9.491629298845463e-06, + "loss": 0.2026, + "step": 27804 + }, + { + "epoch": 0.5162203005313323, + "grad_norm": 0.36979350447654724, + "learning_rate": 9.490464319962691e-06, + "loss": 0.2302, + "step": 27806 + }, + { + "epoch": 0.5162574306687508, + "grad_norm": 0.35657092928886414, + "learning_rate": 9.489299348013171e-06, + "loss": 0.2773, + "step": 27808 + }, + { + "epoch": 0.5162945608061695, + "grad_norm": 0.5680682063102722, + "learning_rate": 9.488134383012762e-06, + "loss": 0.3563, + "step": 27810 + }, + { + "epoch": 0.5163316909435882, + "grad_norm": 0.42965248227119446, + "learning_rate": 9.486969424977305e-06, + "loss": 0.2811, + "step": 27812 + }, + { + "epoch": 0.5163688210810068, + "grad_norm": 0.35181960463523865, + "learning_rate": 9.485804473922658e-06, + "loss": 0.3888, + "step": 27814 + }, + { + "epoch": 0.5164059512184255, + "grad_norm": 0.3829491436481476, + "learning_rate": 9.484639529864673e-06, + "loss": 0.4518, + "step": 27816 + }, + { + "epoch": 0.516443081355844, + "grad_norm": 0.31823301315307617, + "learning_rate": 9.483474592819202e-06, + "loss": 0.3236, + "step": 27818 + }, + { + "epoch": 0.5164802114932627, + "grad_norm": 0.44607260823249817, + "learning_rate": 9.482309662802092e-06, + "loss": 0.3969, + "step": 27820 + }, + { + "epoch": 0.5165173416306814, + "grad_norm": 0.27846381068229675, + "learning_rate": 9.481144739829202e-06, + "loss": 0.2398, + "step": 27822 + }, + { + "epoch": 0.5165544717681, + "grad_norm": 0.5593746304512024, + "learning_rate": 9.479979823916373e-06, + "loss": 0.279, + "step": 27824 + }, + { + "epoch": 0.5165916019055187, + "grad_norm": 0.4391820728778839, + "learning_rate": 9.478814915079459e-06, + "loss": 0.2281, + "step": 27826 + }, + { + "epoch": 0.5166287320429372, + "grad_norm": 0.5220600962638855, + "learning_rate": 9.477650013334317e-06, + "loss": 0.3077, + "step": 27828 + }, + { + "epoch": 0.5166658621803559, + "grad_norm": 0.4352561831474304, + "learning_rate": 9.476485118696792e-06, + "loss": 0.3293, + "step": 27830 + }, + { + "epoch": 0.5167029923177746, + "grad_norm": 0.5185238718986511, + "learning_rate": 9.475320231182735e-06, + "loss": 0.3076, + "step": 27832 + }, + { + "epoch": 0.5167401224551932, + "grad_norm": 0.32103052735328674, + "learning_rate": 9.474155350808004e-06, + "loss": 0.3127, + "step": 27834 + }, + { + "epoch": 0.5167772525926119, + "grad_norm": 0.36642470955848694, + "learning_rate": 9.472990477588438e-06, + "loss": 0.287, + "step": 27836 + }, + { + "epoch": 0.5168143827300304, + "grad_norm": 0.4976699948310852, + "learning_rate": 9.471825611539895e-06, + "loss": 0.4656, + "step": 27838 + }, + { + "epoch": 0.5168515128674491, + "grad_norm": 0.40556374192237854, + "learning_rate": 9.470660752678222e-06, + "loss": 0.112, + "step": 27840 + }, + { + "epoch": 0.5168886430048678, + "grad_norm": 0.3443363904953003, + "learning_rate": 9.46949590101927e-06, + "loss": 0.2885, + "step": 27842 + }, + { + "epoch": 0.5169257731422864, + "grad_norm": 0.336807519197464, + "learning_rate": 9.468331056578892e-06, + "loss": 0.2502, + "step": 27844 + }, + { + "epoch": 0.5169629032797051, + "grad_norm": 0.45986685156822205, + "learning_rate": 9.467166219372934e-06, + "loss": 0.1523, + "step": 27846 + }, + { + "epoch": 0.5170000334171236, + "grad_norm": 0.45763063430786133, + "learning_rate": 9.466001389417251e-06, + "loss": 0.2274, + "step": 27848 + }, + { + "epoch": 0.5170371635545423, + "grad_norm": 0.45369982719421387, + "learning_rate": 9.464836566727686e-06, + "loss": 0.2806, + "step": 27850 + }, + { + "epoch": 0.517074293691961, + "grad_norm": 0.3789762556552887, + "learning_rate": 9.463671751320096e-06, + "loss": 0.283, + "step": 27852 + }, + { + "epoch": 0.5171114238293796, + "grad_norm": 0.5189527869224548, + "learning_rate": 9.462506943210324e-06, + "loss": 0.3772, + "step": 27854 + }, + { + "epoch": 0.5171485539667983, + "grad_norm": 0.4830610156059265, + "learning_rate": 9.461342142414221e-06, + "loss": 0.1923, + "step": 27856 + }, + { + "epoch": 0.5171856841042168, + "grad_norm": 0.3147564232349396, + "learning_rate": 9.460177348947642e-06, + "loss": 0.3133, + "step": 27858 + }, + { + "epoch": 0.5172228142416355, + "grad_norm": 0.2779880464076996, + "learning_rate": 9.459012562826432e-06, + "loss": 0.2671, + "step": 27860 + }, + { + "epoch": 0.5172599443790541, + "grad_norm": 0.5062434673309326, + "learning_rate": 9.457847784066439e-06, + "loss": 0.1997, + "step": 27862 + }, + { + "epoch": 0.5172970745164728, + "grad_norm": 0.27005279064178467, + "learning_rate": 9.456683012683514e-06, + "loss": 0.3134, + "step": 27864 + }, + { + "epoch": 0.5173342046538915, + "grad_norm": 0.3421546518802643, + "learning_rate": 9.455518248693504e-06, + "loss": 0.2436, + "step": 27866 + }, + { + "epoch": 0.51737133479131, + "grad_norm": 0.4909321963787079, + "learning_rate": 9.45435349211226e-06, + "loss": 0.2767, + "step": 27868 + }, + { + "epoch": 0.5174084649287287, + "grad_norm": 0.37666478753089905, + "learning_rate": 9.453188742955634e-06, + "loss": 0.279, + "step": 27870 + }, + { + "epoch": 0.5174455950661473, + "grad_norm": 0.3315078616142273, + "learning_rate": 9.45202400123947e-06, + "loss": 0.3265, + "step": 27872 + }, + { + "epoch": 0.517482725203566, + "grad_norm": 0.17898738384246826, + "learning_rate": 9.450859266979617e-06, + "loss": 0.1656, + "step": 27874 + }, + { + "epoch": 0.5175198553409847, + "grad_norm": 0.49123722314834595, + "learning_rate": 9.449694540191925e-06, + "loss": 0.432, + "step": 27876 + }, + { + "epoch": 0.5175569854784032, + "grad_norm": 0.38956162333488464, + "learning_rate": 9.44852982089224e-06, + "loss": 0.375, + "step": 27878 + }, + { + "epoch": 0.5175941156158219, + "grad_norm": 0.37408027052879333, + "learning_rate": 9.447365109096412e-06, + "loss": 0.3232, + "step": 27880 + }, + { + "epoch": 0.5176312457532405, + "grad_norm": 0.2778705358505249, + "learning_rate": 9.446200404820294e-06, + "loss": 0.4696, + "step": 27882 + }, + { + "epoch": 0.5176683758906592, + "grad_norm": 0.32104092836380005, + "learning_rate": 9.44503570807973e-06, + "loss": 0.3303, + "step": 27884 + }, + { + "epoch": 0.5177055060280779, + "grad_norm": 0.44376298785209656, + "learning_rate": 9.443871018890563e-06, + "loss": 0.3465, + "step": 27886 + }, + { + "epoch": 0.5177426361654964, + "grad_norm": 0.3576256334781647, + "learning_rate": 9.442706337268646e-06, + "loss": 0.2748, + "step": 27888 + }, + { + "epoch": 0.5177797663029151, + "grad_norm": 0.36411380767822266, + "learning_rate": 9.441541663229828e-06, + "loss": 0.3017, + "step": 27890 + }, + { + "epoch": 0.5178168964403337, + "grad_norm": 0.38967758417129517, + "learning_rate": 9.440376996789956e-06, + "loss": 0.6552, + "step": 27892 + }, + { + "epoch": 0.5178540265777524, + "grad_norm": 0.3927607834339142, + "learning_rate": 9.439212337964874e-06, + "loss": 0.3014, + "step": 27894 + }, + { + "epoch": 0.517891156715171, + "grad_norm": 0.41854771971702576, + "learning_rate": 9.438047686770435e-06, + "loss": 0.5006, + "step": 27896 + }, + { + "epoch": 0.5179282868525896, + "grad_norm": 0.36931127309799194, + "learning_rate": 9.436883043222485e-06, + "loss": 0.2831, + "step": 27898 + }, + { + "epoch": 0.5179654169900083, + "grad_norm": 0.5507428050041199, + "learning_rate": 9.435718407336866e-06, + "loss": 0.2454, + "step": 27900 + }, + { + "epoch": 0.5180025471274269, + "grad_norm": 0.3885299563407898, + "learning_rate": 9.434553779129432e-06, + "loss": 0.4107, + "step": 27902 + }, + { + "epoch": 0.5180396772648456, + "grad_norm": 0.26246514916419983, + "learning_rate": 9.433389158616027e-06, + "loss": 0.2621, + "step": 27904 + }, + { + "epoch": 0.5180768074022642, + "grad_norm": 0.3750298321247101, + "learning_rate": 9.432224545812497e-06, + "loss": 0.2546, + "step": 27906 + }, + { + "epoch": 0.5181139375396828, + "grad_norm": 0.5171964764595032, + "learning_rate": 9.431059940734691e-06, + "loss": 0.2664, + "step": 27908 + }, + { + "epoch": 0.5181510676771015, + "grad_norm": 0.28415557742118835, + "learning_rate": 9.429895343398459e-06, + "loss": 0.318, + "step": 27910 + }, + { + "epoch": 0.5181881978145201, + "grad_norm": 0.26206061244010925, + "learning_rate": 9.428730753819638e-06, + "loss": 0.4525, + "step": 27912 + }, + { + "epoch": 0.5182253279519388, + "grad_norm": 0.4848164916038513, + "learning_rate": 9.427566172014083e-06, + "loss": 0.2962, + "step": 27914 + }, + { + "epoch": 0.5182624580893573, + "grad_norm": 0.46354538202285767, + "learning_rate": 9.426401597997637e-06, + "loss": 0.3552, + "step": 27916 + }, + { + "epoch": 0.518299588226776, + "grad_norm": 0.5734918713569641, + "learning_rate": 9.425237031786145e-06, + "loss": 0.5306, + "step": 27918 + }, + { + "epoch": 0.5183367183641947, + "grad_norm": 0.49697911739349365, + "learning_rate": 9.424072473395457e-06, + "loss": 0.3912, + "step": 27920 + }, + { + "epoch": 0.5183738485016133, + "grad_norm": 0.3033851087093353, + "learning_rate": 9.42290792284142e-06, + "loss": 0.3631, + "step": 27922 + }, + { + "epoch": 0.518410978639032, + "grad_norm": 0.3845214545726776, + "learning_rate": 9.421743380139877e-06, + "loss": 0.4111, + "step": 27924 + }, + { + "epoch": 0.5184481087764505, + "grad_norm": 0.4567599296569824, + "learning_rate": 9.420578845306675e-06, + "loss": 0.5958, + "step": 27926 + }, + { + "epoch": 0.5184852389138692, + "grad_norm": 0.34994688630104065, + "learning_rate": 9.419414318357655e-06, + "loss": 0.3037, + "step": 27928 + }, + { + "epoch": 0.5185223690512879, + "grad_norm": 0.2931062877178192, + "learning_rate": 9.418249799308668e-06, + "loss": 0.2392, + "step": 27930 + }, + { + "epoch": 0.5185594991887065, + "grad_norm": 0.44967758655548096, + "learning_rate": 9.41708528817556e-06, + "loss": 0.1492, + "step": 27932 + }, + { + "epoch": 0.5185966293261252, + "grad_norm": 0.3435254693031311, + "learning_rate": 9.415920784974176e-06, + "loss": 0.296, + "step": 27934 + }, + { + "epoch": 0.5186337594635437, + "grad_norm": 0.5711683630943298, + "learning_rate": 9.414756289720364e-06, + "loss": 0.257, + "step": 27936 + }, + { + "epoch": 0.5186708896009624, + "grad_norm": 0.4668540060520172, + "learning_rate": 9.41359180242996e-06, + "loss": 0.3236, + "step": 27938 + }, + { + "epoch": 0.5187080197383811, + "grad_norm": 0.3586547374725342, + "learning_rate": 9.412427323118818e-06, + "loss": 0.2377, + "step": 27940 + }, + { + "epoch": 0.5187451498757997, + "grad_norm": 0.26479896903038025, + "learning_rate": 9.411262851802776e-06, + "loss": 0.1784, + "step": 27942 + }, + { + "epoch": 0.5187822800132184, + "grad_norm": 0.37328359484672546, + "learning_rate": 9.410098388497688e-06, + "loss": 0.1335, + "step": 27944 + }, + { + "epoch": 0.5188194101506369, + "grad_norm": 0.2846425771713257, + "learning_rate": 9.40893393321939e-06, + "loss": 0.3318, + "step": 27946 + }, + { + "epoch": 0.5188565402880556, + "grad_norm": 0.45188888907432556, + "learning_rate": 9.407769485983737e-06, + "loss": 0.2089, + "step": 27948 + }, + { + "epoch": 0.5188936704254743, + "grad_norm": 0.3138563632965088, + "learning_rate": 9.406605046806562e-06, + "loss": 0.3846, + "step": 27950 + }, + { + "epoch": 0.5189308005628929, + "grad_norm": 0.31780603528022766, + "learning_rate": 9.405440615703715e-06, + "loss": 0.3695, + "step": 27952 + }, + { + "epoch": 0.5189679307003116, + "grad_norm": 0.28879013657569885, + "learning_rate": 9.404276192691042e-06, + "loss": 0.4848, + "step": 27954 + }, + { + "epoch": 0.5190050608377301, + "grad_norm": 0.4851306974887848, + "learning_rate": 9.403111777784387e-06, + "loss": 0.118, + "step": 27956 + }, + { + "epoch": 0.5190421909751488, + "grad_norm": 0.4309792220592499, + "learning_rate": 9.40194737099959e-06, + "loss": 0.2774, + "step": 27958 + }, + { + "epoch": 0.5190793211125674, + "grad_norm": 0.31807616353034973, + "learning_rate": 9.4007829723525e-06, + "loss": 0.1428, + "step": 27960 + }, + { + "epoch": 0.5191164512499861, + "grad_norm": 0.44116929173469543, + "learning_rate": 9.399618581858958e-06, + "loss": 0.2778, + "step": 27962 + }, + { + "epoch": 0.5191535813874048, + "grad_norm": 0.47313371300697327, + "learning_rate": 9.398454199534807e-06, + "loss": 0.5085, + "step": 27964 + }, + { + "epoch": 0.5191907115248233, + "grad_norm": 0.3320086896419525, + "learning_rate": 9.397289825395896e-06, + "loss": 0.3695, + "step": 27966 + }, + { + "epoch": 0.519227841662242, + "grad_norm": 0.3190910816192627, + "learning_rate": 9.396125459458062e-06, + "loss": 0.2845, + "step": 27968 + }, + { + "epoch": 0.5192649717996606, + "grad_norm": 2.3000662326812744, + "learning_rate": 9.394961101737152e-06, + "loss": 0.4036, + "step": 27970 + }, + { + "epoch": 0.5193021019370793, + "grad_norm": 0.4869636297225952, + "learning_rate": 9.393796752249009e-06, + "loss": 0.2669, + "step": 27972 + }, + { + "epoch": 0.519339232074498, + "grad_norm": 0.25937286019325256, + "learning_rate": 9.39263241100948e-06, + "loss": 0.4043, + "step": 27974 + }, + { + "epoch": 0.5193763622119165, + "grad_norm": 0.21920296549797058, + "learning_rate": 9.3914680780344e-06, + "loss": 0.2946, + "step": 27976 + }, + { + "epoch": 0.5194134923493352, + "grad_norm": 0.29129552841186523, + "learning_rate": 9.39030375333962e-06, + "loss": 0.2387, + "step": 27978 + }, + { + "epoch": 0.5194506224867538, + "grad_norm": 0.5493195056915283, + "learning_rate": 9.389139436940978e-06, + "loss": 0.3199, + "step": 27980 + }, + { + "epoch": 0.5194877526241725, + "grad_norm": 0.5561234951019287, + "learning_rate": 9.387975128854317e-06, + "loss": 0.3273, + "step": 27982 + }, + { + "epoch": 0.5195248827615911, + "grad_norm": 0.3792039453983307, + "learning_rate": 9.386810829095483e-06, + "loss": 0.2395, + "step": 27984 + }, + { + "epoch": 0.5195620128990097, + "grad_norm": 0.3583786189556122, + "learning_rate": 9.385646537680321e-06, + "loss": 0.2593, + "step": 27986 + }, + { + "epoch": 0.5195991430364284, + "grad_norm": 0.3538975715637207, + "learning_rate": 9.384482254624664e-06, + "loss": 0.2736, + "step": 27988 + }, + { + "epoch": 0.519636273173847, + "grad_norm": 0.38106194138526917, + "learning_rate": 9.383317979944362e-06, + "loss": 0.2387, + "step": 27990 + }, + { + "epoch": 0.5196734033112657, + "grad_norm": 0.2817035913467407, + "learning_rate": 9.382153713655253e-06, + "loss": 0.3735, + "step": 27992 + }, + { + "epoch": 0.5197105334486843, + "grad_norm": 0.4266744554042816, + "learning_rate": 9.380989455773181e-06, + "loss": 0.2872, + "step": 27994 + }, + { + "epoch": 0.5197476635861029, + "grad_norm": 0.42260074615478516, + "learning_rate": 9.379825206313993e-06, + "loss": 0.2972, + "step": 27996 + }, + { + "epoch": 0.5197847937235216, + "grad_norm": 0.35097286105155945, + "learning_rate": 9.378660965293523e-06, + "loss": 0.1369, + "step": 27998 + }, + { + "epoch": 0.5198219238609402, + "grad_norm": 0.5277460813522339, + "learning_rate": 9.377496732727618e-06, + "loss": 0.2276, + "step": 28000 + }, + { + "epoch": 0.5198590539983589, + "grad_norm": 0.23006442189216614, + "learning_rate": 9.376332508632115e-06, + "loss": 0.2022, + "step": 28002 + }, + { + "epoch": 0.5198961841357775, + "grad_norm": 0.29697734117507935, + "learning_rate": 9.37516829302286e-06, + "loss": 0.4618, + "step": 28004 + }, + { + "epoch": 0.5199333142731961, + "grad_norm": 0.4390746057033539, + "learning_rate": 9.374004085915692e-06, + "loss": 0.3349, + "step": 28006 + }, + { + "epoch": 0.5199704444106148, + "grad_norm": 0.38183993101119995, + "learning_rate": 9.372839887326455e-06, + "loss": 0.4673, + "step": 28008 + }, + { + "epoch": 0.5200075745480334, + "grad_norm": 0.34838250279426575, + "learning_rate": 9.371675697270985e-06, + "loss": 0.4356, + "step": 28010 + }, + { + "epoch": 0.520044704685452, + "grad_norm": 0.3497498631477356, + "learning_rate": 9.370511515765134e-06, + "loss": 0.2554, + "step": 28012 + }, + { + "epoch": 0.5200818348228706, + "grad_norm": 0.33054205775260925, + "learning_rate": 9.369347342824729e-06, + "loss": 0.2306, + "step": 28014 + }, + { + "epoch": 0.5201189649602893, + "grad_norm": 0.40547436475753784, + "learning_rate": 9.36818317846562e-06, + "loss": 0.2854, + "step": 28016 + }, + { + "epoch": 0.520156095097708, + "grad_norm": 0.35170090198516846, + "learning_rate": 9.367019022703643e-06, + "loss": 0.2572, + "step": 28018 + }, + { + "epoch": 0.5201932252351266, + "grad_norm": 0.35822197794914246, + "learning_rate": 9.365854875554646e-06, + "loss": 0.3626, + "step": 28020 + }, + { + "epoch": 0.5202303553725453, + "grad_norm": 0.22573544085025787, + "learning_rate": 9.36469073703446e-06, + "loss": 0.1807, + "step": 28022 + }, + { + "epoch": 0.5202674855099638, + "grad_norm": 0.32722705602645874, + "learning_rate": 9.363526607158935e-06, + "loss": 0.3243, + "step": 28024 + }, + { + "epoch": 0.5203046156473825, + "grad_norm": 0.39880236983299255, + "learning_rate": 9.362362485943903e-06, + "loss": 0.2678, + "step": 28026 + }, + { + "epoch": 0.5203417457848012, + "grad_norm": 0.38540735840797424, + "learning_rate": 9.361198373405207e-06, + "loss": 0.1603, + "step": 28028 + }, + { + "epoch": 0.5203788759222198, + "grad_norm": 0.4847244918346405, + "learning_rate": 9.36003426955869e-06, + "loss": 0.2196, + "step": 28030 + }, + { + "epoch": 0.5204160060596384, + "grad_norm": 0.3694347143173218, + "learning_rate": 9.358870174420187e-06, + "loss": 0.3299, + "step": 28032 + }, + { + "epoch": 0.520453136197057, + "grad_norm": 0.7537364363670349, + "learning_rate": 9.35770608800554e-06, + "loss": 0.2433, + "step": 28034 + }, + { + "epoch": 0.5204902663344757, + "grad_norm": 0.31515857577323914, + "learning_rate": 9.356542010330594e-06, + "loss": 0.2468, + "step": 28036 + }, + { + "epoch": 0.5205273964718944, + "grad_norm": 0.4077747166156769, + "learning_rate": 9.355377941411182e-06, + "loss": 0.3577, + "step": 28038 + }, + { + "epoch": 0.520564526609313, + "grad_norm": 0.5394335389137268, + "learning_rate": 9.354213881263143e-06, + "loss": 0.1876, + "step": 28040 + }, + { + "epoch": 0.5206016567467316, + "grad_norm": 0.2732018530368805, + "learning_rate": 9.353049829902322e-06, + "loss": 0.2012, + "step": 28042 + }, + { + "epoch": 0.5206387868841502, + "grad_norm": 0.9316643476486206, + "learning_rate": 9.351885787344551e-06, + "loss": 0.2581, + "step": 28044 + }, + { + "epoch": 0.5206759170215689, + "grad_norm": 0.3779129385948181, + "learning_rate": 9.350721753605673e-06, + "loss": 0.1904, + "step": 28046 + }, + { + "epoch": 0.5207130471589876, + "grad_norm": 0.3806953728199005, + "learning_rate": 9.349557728701533e-06, + "loss": 0.3752, + "step": 28048 + }, + { + "epoch": 0.5207501772964062, + "grad_norm": 0.3448702096939087, + "learning_rate": 9.34839371264796e-06, + "loss": 0.5401, + "step": 28050 + }, + { + "epoch": 0.5207873074338248, + "grad_norm": 0.309926301240921, + "learning_rate": 9.3472297054608e-06, + "loss": 0.2669, + "step": 28052 + }, + { + "epoch": 0.5208244375712434, + "grad_norm": 0.470135360956192, + "learning_rate": 9.346065707155882e-06, + "loss": 0.1578, + "step": 28054 + }, + { + "epoch": 0.5208615677086621, + "grad_norm": 0.3822442591190338, + "learning_rate": 9.344901717749054e-06, + "loss": 0.392, + "step": 28056 + }, + { + "epoch": 0.5208986978460808, + "grad_norm": 0.3793529272079468, + "learning_rate": 9.34373773725615e-06, + "loss": 0.2133, + "step": 28058 + }, + { + "epoch": 0.5209358279834994, + "grad_norm": 0.34007447957992554, + "learning_rate": 9.342573765693014e-06, + "loss": 0.1548, + "step": 28060 + }, + { + "epoch": 0.520972958120918, + "grad_norm": 0.3707200288772583, + "learning_rate": 9.34140980307548e-06, + "loss": 0.3097, + "step": 28062 + }, + { + "epoch": 0.5210100882583366, + "grad_norm": 0.35025542974472046, + "learning_rate": 9.340245849419382e-06, + "loss": 0.4204, + "step": 28064 + }, + { + "epoch": 0.5210472183957553, + "grad_norm": 0.4748229682445526, + "learning_rate": 9.339081904740563e-06, + "loss": 0.3603, + "step": 28066 + }, + { + "epoch": 0.5210843485331739, + "grad_norm": 0.4065103530883789, + "learning_rate": 9.337917969054858e-06, + "loss": 0.3187, + "step": 28068 + }, + { + "epoch": 0.5211214786705926, + "grad_norm": 0.40040847659111023, + "learning_rate": 9.336754042378106e-06, + "loss": 0.2666, + "step": 28070 + }, + { + "epoch": 0.5211586088080112, + "grad_norm": 0.2917388379573822, + "learning_rate": 9.335590124726149e-06, + "loss": 0.1755, + "step": 28072 + }, + { + "epoch": 0.5211957389454298, + "grad_norm": 0.2174740731716156, + "learning_rate": 9.334426216114821e-06, + "loss": 0.1865, + "step": 28074 + }, + { + "epoch": 0.5212328690828485, + "grad_norm": 0.3238449692726135, + "learning_rate": 9.333262316559955e-06, + "loss": 0.2982, + "step": 28076 + }, + { + "epoch": 0.5212699992202671, + "grad_norm": 0.3809027671813965, + "learning_rate": 9.33209842607739e-06, + "loss": 0.4384, + "step": 28078 + }, + { + "epoch": 0.5213071293576858, + "grad_norm": 0.39081084728240967, + "learning_rate": 9.330934544682968e-06, + "loss": 0.1902, + "step": 28080 + }, + { + "epoch": 0.5213442594951044, + "grad_norm": 0.3424111604690552, + "learning_rate": 9.329770672392522e-06, + "loss": 0.2501, + "step": 28082 + }, + { + "epoch": 0.521381389632523, + "grad_norm": 0.49280112981796265, + "learning_rate": 9.32860680922189e-06, + "loss": 0.1777, + "step": 28084 + }, + { + "epoch": 0.5214185197699417, + "grad_norm": 0.5211786031723022, + "learning_rate": 9.327442955186911e-06, + "loss": 0.3472, + "step": 28086 + }, + { + "epoch": 0.5214556499073603, + "grad_norm": 0.8068822026252747, + "learning_rate": 9.326279110303414e-06, + "loss": 0.2726, + "step": 28088 + }, + { + "epoch": 0.521492780044779, + "grad_norm": 0.35349878668785095, + "learning_rate": 9.32511527458724e-06, + "loss": 0.3564, + "step": 28090 + }, + { + "epoch": 0.5215299101821976, + "grad_norm": 0.5040069222450256, + "learning_rate": 9.323951448054227e-06, + "loss": 0.2808, + "step": 28092 + }, + { + "epoch": 0.5215670403196162, + "grad_norm": 0.3312101364135742, + "learning_rate": 9.322787630720212e-06, + "loss": 0.3297, + "step": 28094 + }, + { + "epoch": 0.5216041704570349, + "grad_norm": 1.3002091646194458, + "learning_rate": 9.321623822601026e-06, + "loss": 0.2575, + "step": 28096 + }, + { + "epoch": 0.5216413005944535, + "grad_norm": 0.5194166898727417, + "learning_rate": 9.320460023712508e-06, + "loss": 0.2088, + "step": 28098 + }, + { + "epoch": 0.5216784307318721, + "grad_norm": 0.34553706645965576, + "learning_rate": 9.319296234070497e-06, + "loss": 0.3115, + "step": 28100 + }, + { + "epoch": 0.5217155608692908, + "grad_norm": 0.3347233533859253, + "learning_rate": 9.318132453690822e-06, + "loss": 0.2175, + "step": 28102 + }, + { + "epoch": 0.5217526910067094, + "grad_norm": 0.3491487503051758, + "learning_rate": 9.316968682589325e-06, + "loss": 0.2171, + "step": 28104 + }, + { + "epoch": 0.5217898211441281, + "grad_norm": 0.28038862347602844, + "learning_rate": 9.315804920781835e-06, + "loss": 0.4354, + "step": 28106 + }, + { + "epoch": 0.5218269512815467, + "grad_norm": 0.48502781987190247, + "learning_rate": 9.31464116828419e-06, + "loss": 0.3773, + "step": 28108 + }, + { + "epoch": 0.5218640814189653, + "grad_norm": 0.3009917438030243, + "learning_rate": 9.313477425112228e-06, + "loss": 0.055, + "step": 28110 + }, + { + "epoch": 0.5219012115563839, + "grad_norm": 0.315805584192276, + "learning_rate": 9.312313691281784e-06, + "loss": 0.2719, + "step": 28112 + }, + { + "epoch": 0.5219383416938026, + "grad_norm": 0.40033459663391113, + "learning_rate": 9.311149966808687e-06, + "loss": 0.1994, + "step": 28114 + }, + { + "epoch": 0.5219754718312213, + "grad_norm": 0.31273797154426575, + "learning_rate": 9.309986251708779e-06, + "loss": 0.2127, + "step": 28116 + }, + { + "epoch": 0.5220126019686399, + "grad_norm": 0.4080374240875244, + "learning_rate": 9.308822545997887e-06, + "loss": 0.4004, + "step": 28118 + }, + { + "epoch": 0.5220497321060585, + "grad_norm": 0.5991426110267639, + "learning_rate": 9.30765884969185e-06, + "loss": 0.3441, + "step": 28120 + }, + { + "epoch": 0.5220868622434771, + "grad_norm": 0.3351306617259979, + "learning_rate": 9.306495162806503e-06, + "loss": 0.2095, + "step": 28122 + }, + { + "epoch": 0.5221239923808958, + "grad_norm": 0.48446792364120483, + "learning_rate": 9.305331485357685e-06, + "loss": 0.5414, + "step": 28124 + }, + { + "epoch": 0.5221611225183145, + "grad_norm": 0.4659987688064575, + "learning_rate": 9.30416781736122e-06, + "loss": 0.0927, + "step": 28126 + }, + { + "epoch": 0.522198252655733, + "grad_norm": 0.26826244592666626, + "learning_rate": 9.303004158832948e-06, + "loss": 0.216, + "step": 28128 + }, + { + "epoch": 0.5222353827931517, + "grad_norm": 0.40166664123535156, + "learning_rate": 9.3018405097887e-06, + "loss": 0.3425, + "step": 28130 + }, + { + "epoch": 0.5222725129305703, + "grad_norm": 0.5637459754943848, + "learning_rate": 9.300676870244311e-06, + "loss": 0.3456, + "step": 28132 + }, + { + "epoch": 0.522309643067989, + "grad_norm": 0.4651561975479126, + "learning_rate": 9.299513240215617e-06, + "loss": 0.299, + "step": 28134 + }, + { + "epoch": 0.5223467732054077, + "grad_norm": 0.1611827313899994, + "learning_rate": 9.298349619718448e-06, + "loss": 0.1383, + "step": 28136 + }, + { + "epoch": 0.5223839033428263, + "grad_norm": 0.39330193400382996, + "learning_rate": 9.297186008768644e-06, + "loss": 0.2466, + "step": 28138 + }, + { + "epoch": 0.5224210334802449, + "grad_norm": 0.9296402335166931, + "learning_rate": 9.296022407382026e-06, + "loss": 0.3551, + "step": 28140 + }, + { + "epoch": 0.5224581636176635, + "grad_norm": 0.4432893395423889, + "learning_rate": 9.294858815574438e-06, + "loss": 0.3705, + "step": 28142 + }, + { + "epoch": 0.5224952937550822, + "grad_norm": 0.6663895845413208, + "learning_rate": 9.293695233361709e-06, + "loss": 0.2025, + "step": 28144 + }, + { + "epoch": 0.5225324238925009, + "grad_norm": 0.3534146249294281, + "learning_rate": 9.292531660759673e-06, + "loss": 0.1761, + "step": 28146 + }, + { + "epoch": 0.5225695540299194, + "grad_norm": 0.4033986032009125, + "learning_rate": 9.29136809778416e-06, + "loss": 0.4726, + "step": 28148 + }, + { + "epoch": 0.5226066841673381, + "grad_norm": 0.3837416172027588, + "learning_rate": 9.290204544451009e-06, + "loss": 0.2261, + "step": 28150 + }, + { + "epoch": 0.5226438143047567, + "grad_norm": 0.48960021138191223, + "learning_rate": 9.289041000776044e-06, + "loss": 0.2718, + "step": 28152 + }, + { + "epoch": 0.5226809444421754, + "grad_norm": 0.3605748414993286, + "learning_rate": 9.2878774667751e-06, + "loss": 0.3043, + "step": 28154 + }, + { + "epoch": 0.5227180745795941, + "grad_norm": 0.4591005742549896, + "learning_rate": 9.286713942464012e-06, + "loss": 0.4019, + "step": 28156 + }, + { + "epoch": 0.5227552047170126, + "grad_norm": 0.39213526248931885, + "learning_rate": 9.285550427858613e-06, + "loss": 0.1753, + "step": 28158 + }, + { + "epoch": 0.5227923348544313, + "grad_norm": 0.4175074100494385, + "learning_rate": 9.28438692297473e-06, + "loss": 0.3902, + "step": 28160 + }, + { + "epoch": 0.5228294649918499, + "grad_norm": 0.31997689604759216, + "learning_rate": 9.283223427828202e-06, + "loss": 0.2936, + "step": 28162 + }, + { + "epoch": 0.5228665951292686, + "grad_norm": 0.36666950583457947, + "learning_rate": 9.282059942434853e-06, + "loss": 0.2091, + "step": 28164 + }, + { + "epoch": 0.5229037252666872, + "grad_norm": 0.3158455491065979, + "learning_rate": 9.280896466810517e-06, + "loss": 0.319, + "step": 28166 + }, + { + "epoch": 0.5229408554041058, + "grad_norm": 0.19260850548744202, + "learning_rate": 9.279733000971026e-06, + "loss": 0.3322, + "step": 28168 + }, + { + "epoch": 0.5229779855415245, + "grad_norm": 0.7922513484954834, + "learning_rate": 9.278569544932212e-06, + "loss": 0.2031, + "step": 28170 + }, + { + "epoch": 0.5230151156789431, + "grad_norm": 0.40140706300735474, + "learning_rate": 9.277406098709904e-06, + "loss": 0.1891, + "step": 28172 + }, + { + "epoch": 0.5230522458163618, + "grad_norm": 0.3927055299282074, + "learning_rate": 9.27624266231994e-06, + "loss": 0.3235, + "step": 28174 + }, + { + "epoch": 0.5230893759537804, + "grad_norm": 0.4807228147983551, + "learning_rate": 9.275079235778141e-06, + "loss": 0.2437, + "step": 28176 + }, + { + "epoch": 0.523126506091199, + "grad_norm": 0.47512444853782654, + "learning_rate": 9.27391581910034e-06, + "loss": 0.3789, + "step": 28178 + }, + { + "epoch": 0.5231636362286177, + "grad_norm": 0.3931896388530731, + "learning_rate": 9.272752412302375e-06, + "loss": 0.2345, + "step": 28180 + }, + { + "epoch": 0.5232007663660363, + "grad_norm": 0.36406317353248596, + "learning_rate": 9.271589015400068e-06, + "loss": 0.244, + "step": 28182 + }, + { + "epoch": 0.523237896503455, + "grad_norm": 0.33124038577079773, + "learning_rate": 9.270425628409253e-06, + "loss": 0.1856, + "step": 28184 + }, + { + "epoch": 0.5232750266408736, + "grad_norm": 0.3963111937046051, + "learning_rate": 9.26926225134576e-06, + "loss": 0.3992, + "step": 28186 + }, + { + "epoch": 0.5233121567782922, + "grad_norm": 0.4703839123249054, + "learning_rate": 9.268098884225423e-06, + "loss": 0.1952, + "step": 28188 + }, + { + "epoch": 0.5233492869157109, + "grad_norm": 0.4198876619338989, + "learning_rate": 9.266935527064064e-06, + "loss": 0.3384, + "step": 28190 + }, + { + "epoch": 0.5233864170531295, + "grad_norm": 0.30893459916114807, + "learning_rate": 9.265772179877516e-06, + "loss": 0.1143, + "step": 28192 + }, + { + "epoch": 0.5234235471905482, + "grad_norm": 0.3882627785205841, + "learning_rate": 9.26460884268161e-06, + "loss": 0.3301, + "step": 28194 + }, + { + "epoch": 0.5234606773279668, + "grad_norm": 0.4515036940574646, + "learning_rate": 9.263445515492175e-06, + "loss": 0.2258, + "step": 28196 + }, + { + "epoch": 0.5234978074653854, + "grad_norm": 0.2816133499145508, + "learning_rate": 9.262282198325042e-06, + "loss": 0.2502, + "step": 28198 + }, + { + "epoch": 0.5235349376028041, + "grad_norm": 0.40757328271865845, + "learning_rate": 9.261118891196037e-06, + "loss": 0.3263, + "step": 28200 + }, + { + "epoch": 0.5235720677402227, + "grad_norm": 0.43003734946250916, + "learning_rate": 9.259955594120993e-06, + "loss": 0.2896, + "step": 28202 + }, + { + "epoch": 0.5236091978776414, + "grad_norm": 0.33691883087158203, + "learning_rate": 9.258792307115734e-06, + "loss": 0.3621, + "step": 28204 + }, + { + "epoch": 0.52364632801506, + "grad_norm": 0.4610239267349243, + "learning_rate": 9.25762903019609e-06, + "loss": 0.4275, + "step": 28206 + }, + { + "epoch": 0.5236834581524786, + "grad_norm": 0.4140169322490692, + "learning_rate": 9.256465763377893e-06, + "loss": 0.2953, + "step": 28208 + }, + { + "epoch": 0.5237205882898973, + "grad_norm": 0.4700329005718231, + "learning_rate": 9.25530250667697e-06, + "loss": 0.3914, + "step": 28210 + }, + { + "epoch": 0.5237577184273159, + "grad_norm": 0.38899728655815125, + "learning_rate": 9.254139260109154e-06, + "loss": 0.2721, + "step": 28212 + }, + { + "epoch": 0.5237948485647346, + "grad_norm": 0.40652865171432495, + "learning_rate": 9.252976023690262e-06, + "loss": 0.2205, + "step": 28214 + }, + { + "epoch": 0.5238319787021531, + "grad_norm": 0.4482325315475464, + "learning_rate": 9.25181279743613e-06, + "loss": 0.1083, + "step": 28216 + }, + { + "epoch": 0.5238691088395718, + "grad_norm": 0.5338302254676819, + "learning_rate": 9.250649581362584e-06, + "loss": 0.2671, + "step": 28218 + }, + { + "epoch": 0.5239062389769904, + "grad_norm": 0.45330163836479187, + "learning_rate": 9.249486375485455e-06, + "loss": 0.2875, + "step": 28220 + }, + { + "epoch": 0.5239433691144091, + "grad_norm": 0.4857039153575897, + "learning_rate": 9.248323179820567e-06, + "loss": 0.2919, + "step": 28222 + }, + { + "epoch": 0.5239804992518278, + "grad_norm": 0.5985873341560364, + "learning_rate": 9.247159994383749e-06, + "loss": 0.2469, + "step": 28224 + }, + { + "epoch": 0.5240176293892463, + "grad_norm": 0.5718949437141418, + "learning_rate": 9.245996819190832e-06, + "loss": 0.1501, + "step": 28226 + }, + { + "epoch": 0.524054759526665, + "grad_norm": 0.2761910557746887, + "learning_rate": 9.244833654257636e-06, + "loss": 0.3437, + "step": 28228 + }, + { + "epoch": 0.5240918896640836, + "grad_norm": 0.3907056450843811, + "learning_rate": 9.243670499599992e-06, + "loss": 0.3118, + "step": 28230 + }, + { + "epoch": 0.5241290198015023, + "grad_norm": 0.3935077488422394, + "learning_rate": 9.242507355233728e-06, + "loss": 0.381, + "step": 28232 + }, + { + "epoch": 0.524166149938921, + "grad_norm": 0.4161234200000763, + "learning_rate": 9.241344221174668e-06, + "loss": 0.354, + "step": 28234 + }, + { + "epoch": 0.5242032800763395, + "grad_norm": 0.29580315947532654, + "learning_rate": 9.240181097438642e-06, + "loss": 0.1385, + "step": 28236 + }, + { + "epoch": 0.5242404102137582, + "grad_norm": 0.3137395977973938, + "learning_rate": 9.23901798404148e-06, + "loss": 0.3989, + "step": 28238 + }, + { + "epoch": 0.5242775403511768, + "grad_norm": 0.24473921954631805, + "learning_rate": 9.237854880998998e-06, + "loss": 0.3554, + "step": 28240 + }, + { + "epoch": 0.5243146704885955, + "grad_norm": 0.3380820155143738, + "learning_rate": 9.23669178832703e-06, + "loss": 0.3023, + "step": 28242 + }, + { + "epoch": 0.5243518006260142, + "grad_norm": 0.32730650901794434, + "learning_rate": 9.2355287060414e-06, + "loss": 0.1512, + "step": 28244 + }, + { + "epoch": 0.5243889307634327, + "grad_norm": 0.4202280640602112, + "learning_rate": 9.234365634157933e-06, + "loss": 0.1769, + "step": 28246 + }, + { + "epoch": 0.5244260609008514, + "grad_norm": 0.3653663694858551, + "learning_rate": 9.233202572692457e-06, + "loss": 0.1369, + "step": 28248 + }, + { + "epoch": 0.52446319103827, + "grad_norm": 0.29243552684783936, + "learning_rate": 9.232039521660801e-06, + "loss": 0.1576, + "step": 28250 + }, + { + "epoch": 0.5245003211756887, + "grad_norm": 0.2632932662963867, + "learning_rate": 9.230876481078784e-06, + "loss": 0.2622, + "step": 28252 + }, + { + "epoch": 0.5245374513131074, + "grad_norm": 0.3520936965942383, + "learning_rate": 9.229713450962235e-06, + "loss": 0.2851, + "step": 28254 + }, + { + "epoch": 0.5245745814505259, + "grad_norm": 0.4301888942718506, + "learning_rate": 9.228550431326976e-06, + "loss": 0.2411, + "step": 28256 + }, + { + "epoch": 0.5246117115879446, + "grad_norm": 0.4315635561943054, + "learning_rate": 9.227387422188836e-06, + "loss": 0.3548, + "step": 28258 + }, + { + "epoch": 0.5246488417253632, + "grad_norm": 0.597524881362915, + "learning_rate": 9.226224423563639e-06, + "loss": 0.2527, + "step": 28260 + }, + { + "epoch": 0.5246859718627819, + "grad_norm": 0.29058656096458435, + "learning_rate": 9.225061435467211e-06, + "loss": 0.2446, + "step": 28262 + }, + { + "epoch": 0.5247231020002004, + "grad_norm": 0.40379804372787476, + "learning_rate": 9.223898457915377e-06, + "loss": 0.422, + "step": 28264 + }, + { + "epoch": 0.5247602321376191, + "grad_norm": 0.2448199838399887, + "learning_rate": 9.22273549092396e-06, + "loss": 0.0734, + "step": 28266 + }, + { + "epoch": 0.5247973622750378, + "grad_norm": 0.36263418197631836, + "learning_rate": 9.221572534508781e-06, + "loss": 0.1742, + "step": 28268 + }, + { + "epoch": 0.5248344924124564, + "grad_norm": 0.29628270864486694, + "learning_rate": 9.220409588685671e-06, + "loss": 0.3642, + "step": 28270 + }, + { + "epoch": 0.5248716225498751, + "grad_norm": 0.4896189868450165, + "learning_rate": 9.219246653470448e-06, + "loss": 0.1655, + "step": 28272 + }, + { + "epoch": 0.5249087526872936, + "grad_norm": 0.38273170590400696, + "learning_rate": 9.218083728878943e-06, + "loss": 0.2669, + "step": 28274 + }, + { + "epoch": 0.5249458828247123, + "grad_norm": 0.32097193598747253, + "learning_rate": 9.21692081492698e-06, + "loss": 0.346, + "step": 28276 + }, + { + "epoch": 0.524983012962131, + "grad_norm": 0.5770221948623657, + "learning_rate": 9.215757911630373e-06, + "loss": 0.425, + "step": 28278 + }, + { + "epoch": 0.5250201430995496, + "grad_norm": 0.45663943886756897, + "learning_rate": 9.214595019004952e-06, + "loss": 0.2861, + "step": 28280 + }, + { + "epoch": 0.5250572732369683, + "grad_norm": 0.5390895009040833, + "learning_rate": 9.213432137066541e-06, + "loss": 0.402, + "step": 28282 + }, + { + "epoch": 0.5250944033743868, + "grad_norm": 0.30878981947898865, + "learning_rate": 9.212269265830964e-06, + "loss": 0.1949, + "step": 28284 + }, + { + "epoch": 0.5251315335118055, + "grad_norm": 0.43309304118156433, + "learning_rate": 9.211106405314043e-06, + "loss": 0.2935, + "step": 28286 + }, + { + "epoch": 0.5251686636492242, + "grad_norm": 0.5291520953178406, + "learning_rate": 9.209943555531602e-06, + "loss": 0.1873, + "step": 28288 + }, + { + "epoch": 0.5252057937866428, + "grad_norm": 0.4113317131996155, + "learning_rate": 9.208780716499459e-06, + "loss": 0.3627, + "step": 28290 + }, + { + "epoch": 0.5252429239240615, + "grad_norm": 0.4700908064842224, + "learning_rate": 9.20761788823344e-06, + "loss": 0.2292, + "step": 28292 + }, + { + "epoch": 0.52528005406148, + "grad_norm": 0.3196422755718231, + "learning_rate": 9.20645507074937e-06, + "loss": 0.1897, + "step": 28294 + }, + { + "epoch": 0.5253171841988987, + "grad_norm": 0.6900250315666199, + "learning_rate": 9.20529226406307e-06, + "loss": 0.1252, + "step": 28296 + }, + { + "epoch": 0.5253543143363174, + "grad_norm": 0.2927192151546478, + "learning_rate": 9.204129468190362e-06, + "loss": 0.2493, + "step": 28298 + }, + { + "epoch": 0.525391444473736, + "grad_norm": 0.461359441280365, + "learning_rate": 9.202966683147065e-06, + "loss": 0.2976, + "step": 28300 + }, + { + "epoch": 0.5254285746111547, + "grad_norm": 0.27889353036880493, + "learning_rate": 9.201803908949011e-06, + "loss": 0.2777, + "step": 28302 + }, + { + "epoch": 0.5254657047485732, + "grad_norm": 0.3735930621623993, + "learning_rate": 9.20064114561201e-06, + "loss": 0.4009, + "step": 28304 + }, + { + "epoch": 0.5255028348859919, + "grad_norm": 0.5680769681930542, + "learning_rate": 9.19947839315189e-06, + "loss": 0.3545, + "step": 28306 + }, + { + "epoch": 0.5255399650234106, + "grad_norm": 0.47031331062316895, + "learning_rate": 9.198315651584468e-06, + "loss": 0.4245, + "step": 28308 + }, + { + "epoch": 0.5255770951608292, + "grad_norm": 0.4699476659297943, + "learning_rate": 9.19715292092557e-06, + "loss": 0.1753, + "step": 28310 + }, + { + "epoch": 0.5256142252982479, + "grad_norm": 0.5114132165908813, + "learning_rate": 9.195990201191017e-06, + "loss": 0.484, + "step": 28312 + }, + { + "epoch": 0.5256513554356664, + "grad_norm": 0.23072189092636108, + "learning_rate": 9.194827492396631e-06, + "loss": 0.4752, + "step": 28314 + }, + { + "epoch": 0.5256884855730851, + "grad_norm": 0.37426748871803284, + "learning_rate": 9.19366479455823e-06, + "loss": 0.2425, + "step": 28316 + }, + { + "epoch": 0.5257256157105037, + "grad_norm": 0.38548386096954346, + "learning_rate": 9.192502107691636e-06, + "loss": 0.4286, + "step": 28318 + }, + { + "epoch": 0.5257627458479224, + "grad_norm": 0.3310141861438751, + "learning_rate": 9.191339431812666e-06, + "loss": 0.2724, + "step": 28320 + }, + { + "epoch": 0.5257998759853411, + "grad_norm": 0.2608674466609955, + "learning_rate": 9.190176766937147e-06, + "loss": 0.1642, + "step": 28322 + }, + { + "epoch": 0.5258370061227596, + "grad_norm": 0.5083295106887817, + "learning_rate": 9.189014113080894e-06, + "loss": 0.2406, + "step": 28324 + }, + { + "epoch": 0.5258741362601783, + "grad_norm": 0.4197280704975128, + "learning_rate": 9.187851470259736e-06, + "loss": 0.1947, + "step": 28326 + }, + { + "epoch": 0.5259112663975969, + "grad_norm": 0.41674289107322693, + "learning_rate": 9.186688838489483e-06, + "loss": 0.2755, + "step": 28328 + }, + { + "epoch": 0.5259483965350156, + "grad_norm": 0.2371097356081009, + "learning_rate": 9.185526217785959e-06, + "loss": 0.2025, + "step": 28330 + }, + { + "epoch": 0.5259855266724343, + "grad_norm": 0.4113895297050476, + "learning_rate": 9.18436360816498e-06, + "loss": 0.2106, + "step": 28332 + }, + { + "epoch": 0.5260226568098528, + "grad_norm": 0.3476105034351349, + "learning_rate": 9.183201009642372e-06, + "loss": 0.2154, + "step": 28334 + }, + { + "epoch": 0.5260597869472715, + "grad_norm": 0.37932664155960083, + "learning_rate": 9.182038422233952e-06, + "loss": 0.5002, + "step": 28336 + }, + { + "epoch": 0.5260969170846901, + "grad_norm": 0.4214911162853241, + "learning_rate": 9.180875845955542e-06, + "loss": 0.1484, + "step": 28338 + }, + { + "epoch": 0.5261340472221088, + "grad_norm": 0.32194986939430237, + "learning_rate": 9.179713280822955e-06, + "loss": 0.349, + "step": 28340 + }, + { + "epoch": 0.5261711773595275, + "grad_norm": 0.49952277541160583, + "learning_rate": 9.178550726852012e-06, + "loss": 0.2022, + "step": 28342 + }, + { + "epoch": 0.526208307496946, + "grad_norm": 0.3168894648551941, + "learning_rate": 9.177388184058533e-06, + "loss": 0.2835, + "step": 28344 + }, + { + "epoch": 0.5262454376343647, + "grad_norm": 0.31756705045700073, + "learning_rate": 9.176225652458338e-06, + "loss": 0.3117, + "step": 28346 + }, + { + "epoch": 0.5262825677717833, + "grad_norm": 0.35774365067481995, + "learning_rate": 9.175063132067246e-06, + "loss": 0.3104, + "step": 28348 + }, + { + "epoch": 0.526319697909202, + "grad_norm": 0.4334663152694702, + "learning_rate": 9.17390062290107e-06, + "loss": 0.2554, + "step": 28350 + }, + { + "epoch": 0.5263568280466207, + "grad_norm": 0.39199215173721313, + "learning_rate": 9.172738124975639e-06, + "loss": 0.2479, + "step": 28352 + }, + { + "epoch": 0.5263939581840392, + "grad_norm": 0.2825665771961212, + "learning_rate": 9.171575638306758e-06, + "loss": 0.4201, + "step": 28354 + }, + { + "epoch": 0.5264310883214579, + "grad_norm": 0.51008540391922, + "learning_rate": 9.170413162910252e-06, + "loss": 0.3606, + "step": 28356 + }, + { + "epoch": 0.5264682184588765, + "grad_norm": 0.3553040027618408, + "learning_rate": 9.16925069880194e-06, + "loss": 0.2364, + "step": 28358 + }, + { + "epoch": 0.5265053485962952, + "grad_norm": 0.43595871329307556, + "learning_rate": 9.168088245997635e-06, + "loss": 0.3047, + "step": 28360 + }, + { + "epoch": 0.5265424787337138, + "grad_norm": 0.2794182002544403, + "learning_rate": 9.166925804513157e-06, + "loss": 0.3564, + "step": 28362 + }, + { + "epoch": 0.5265796088711324, + "grad_norm": 0.5841917395591736, + "learning_rate": 9.165763374364327e-06, + "loss": 0.2978, + "step": 28364 + }, + { + "epoch": 0.5266167390085511, + "grad_norm": 0.4386557936668396, + "learning_rate": 9.164600955566957e-06, + "loss": 0.2326, + "step": 28366 + }, + { + "epoch": 0.5266538691459697, + "grad_norm": 0.4702779948711395, + "learning_rate": 9.163438548136863e-06, + "loss": 0.2682, + "step": 28368 + }, + { + "epoch": 0.5266909992833884, + "grad_norm": 0.30933886766433716, + "learning_rate": 9.162276152089868e-06, + "loss": 0.3006, + "step": 28370 + }, + { + "epoch": 0.5267281294208069, + "grad_norm": 0.3269096612930298, + "learning_rate": 9.161113767441784e-06, + "loss": 0.2067, + "step": 28372 + }, + { + "epoch": 0.5267652595582256, + "grad_norm": 0.3352173864841461, + "learning_rate": 9.159951394208427e-06, + "loss": 0.4423, + "step": 28374 + }, + { + "epoch": 0.5268023896956443, + "grad_norm": 0.5195205211639404, + "learning_rate": 9.15878903240562e-06, + "loss": 0.2642, + "step": 28376 + }, + { + "epoch": 0.5268395198330629, + "grad_norm": 0.2897060811519623, + "learning_rate": 9.157626682049172e-06, + "loss": 0.2127, + "step": 28378 + }, + { + "epoch": 0.5268766499704816, + "grad_norm": 0.40540388226509094, + "learning_rate": 9.1564643431549e-06, + "loss": 0.3407, + "step": 28380 + }, + { + "epoch": 0.5269137801079001, + "grad_norm": 0.5326733589172363, + "learning_rate": 9.155302015738623e-06, + "loss": 0.2676, + "step": 28382 + }, + { + "epoch": 0.5269509102453188, + "grad_norm": 0.46720582246780396, + "learning_rate": 9.154139699816155e-06, + "loss": 0.3279, + "step": 28384 + }, + { + "epoch": 0.5269880403827375, + "grad_norm": 0.466681569814682, + "learning_rate": 9.152977395403312e-06, + "loss": 0.3336, + "step": 28386 + }, + { + "epoch": 0.5270251705201561, + "grad_norm": 0.495192289352417, + "learning_rate": 9.15181510251591e-06, + "loss": 0.0963, + "step": 28388 + }, + { + "epoch": 0.5270623006575748, + "grad_norm": 0.3837859034538269, + "learning_rate": 9.150652821169766e-06, + "loss": 0.3577, + "step": 28390 + }, + { + "epoch": 0.5270994307949933, + "grad_norm": 0.31270748376846313, + "learning_rate": 9.149490551380692e-06, + "loss": 0.2454, + "step": 28392 + }, + { + "epoch": 0.527136560932412, + "grad_norm": 0.30723896622657776, + "learning_rate": 9.148328293164502e-06, + "loss": 0.2585, + "step": 28394 + }, + { + "epoch": 0.5271736910698307, + "grad_norm": 0.4640527367591858, + "learning_rate": 9.147166046537013e-06, + "loss": 0.4626, + "step": 28396 + }, + { + "epoch": 0.5272108212072493, + "grad_norm": 0.4258682429790497, + "learning_rate": 9.146003811514039e-06, + "loss": 0.1764, + "step": 28398 + }, + { + "epoch": 0.527247951344668, + "grad_norm": 0.46926218271255493, + "learning_rate": 9.144841588111397e-06, + "loss": 0.2698, + "step": 28400 + }, + { + "epoch": 0.5272850814820865, + "grad_norm": 0.3380294442176819, + "learning_rate": 9.1436793763449e-06, + "loss": 0.2317, + "step": 28402 + }, + { + "epoch": 0.5273222116195052, + "grad_norm": 0.3501266539096832, + "learning_rate": 9.142517176230362e-06, + "loss": 0.2335, + "step": 28404 + }, + { + "epoch": 0.5273593417569239, + "grad_norm": 0.29773467779159546, + "learning_rate": 9.141354987783595e-06, + "loss": 0.2064, + "step": 28406 + }, + { + "epoch": 0.5273964718943425, + "grad_norm": 0.42288297414779663, + "learning_rate": 9.140192811020412e-06, + "loss": 0.3167, + "step": 28408 + }, + { + "epoch": 0.5274336020317612, + "grad_norm": 0.24485230445861816, + "learning_rate": 9.139030645956633e-06, + "loss": 0.0877, + "step": 28410 + }, + { + "epoch": 0.5274707321691797, + "grad_norm": 0.4290095567703247, + "learning_rate": 9.137868492608069e-06, + "loss": 0.2641, + "step": 28412 + }, + { + "epoch": 0.5275078623065984, + "grad_norm": 0.4234887361526489, + "learning_rate": 9.136706350990534e-06, + "loss": 0.359, + "step": 28414 + }, + { + "epoch": 0.527544992444017, + "grad_norm": 0.2719157934188843, + "learning_rate": 9.135544221119836e-06, + "loss": 0.3014, + "step": 28416 + }, + { + "epoch": 0.5275821225814357, + "grad_norm": 0.35349541902542114, + "learning_rate": 9.13438210301179e-06, + "loss": 0.2665, + "step": 28418 + }, + { + "epoch": 0.5276192527188543, + "grad_norm": 0.3618432879447937, + "learning_rate": 9.133219996682213e-06, + "loss": 0.2824, + "step": 28420 + }, + { + "epoch": 0.5276563828562729, + "grad_norm": 0.3793849050998688, + "learning_rate": 9.132057902146917e-06, + "loss": 0.5318, + "step": 28422 + }, + { + "epoch": 0.5276935129936916, + "grad_norm": 0.4053475856781006, + "learning_rate": 9.13089581942171e-06, + "loss": 0.3727, + "step": 28424 + }, + { + "epoch": 0.5277306431311102, + "grad_norm": 0.3458296060562134, + "learning_rate": 9.12973374852241e-06, + "loss": 0.3271, + "step": 28426 + }, + { + "epoch": 0.5277677732685289, + "grad_norm": 0.31473100185394287, + "learning_rate": 9.128571689464829e-06, + "loss": 0.05, + "step": 28428 + }, + { + "epoch": 0.5278049034059475, + "grad_norm": 0.38306495547294617, + "learning_rate": 9.127409642264773e-06, + "loss": 0.2475, + "step": 28430 + }, + { + "epoch": 0.5278420335433661, + "grad_norm": 0.2218981236219406, + "learning_rate": 9.12624760693806e-06, + "loss": 0.338, + "step": 28432 + }, + { + "epoch": 0.5278791636807848, + "grad_norm": 0.6062273979187012, + "learning_rate": 9.1250855835005e-06, + "loss": 0.5244, + "step": 28434 + }, + { + "epoch": 0.5279162938182034, + "grad_norm": 0.48028719425201416, + "learning_rate": 9.123923571967902e-06, + "loss": 0.3459, + "step": 28436 + }, + { + "epoch": 0.5279534239556221, + "grad_norm": 0.43190088868141174, + "learning_rate": 9.122761572356082e-06, + "loss": 0.4084, + "step": 28438 + }, + { + "epoch": 0.5279905540930407, + "grad_norm": 0.2599162459373474, + "learning_rate": 9.121599584680853e-06, + "loss": 0.1965, + "step": 28440 + }, + { + "epoch": 0.5280276842304593, + "grad_norm": 0.28590288758277893, + "learning_rate": 9.120437608958016e-06, + "loss": 0.2318, + "step": 28442 + }, + { + "epoch": 0.528064814367878, + "grad_norm": 0.3959429860115051, + "learning_rate": 9.119275645203394e-06, + "loss": 0.247, + "step": 28444 + }, + { + "epoch": 0.5281019445052966, + "grad_norm": 0.2837539315223694, + "learning_rate": 9.118113693432788e-06, + "loss": 0.3302, + "step": 28446 + }, + { + "epoch": 0.5281390746427153, + "grad_norm": 0.4189326763153076, + "learning_rate": 9.116951753662014e-06, + "loss": 0.4026, + "step": 28448 + }, + { + "epoch": 0.5281762047801339, + "grad_norm": 0.4856795370578766, + "learning_rate": 9.115789825906882e-06, + "loss": 0.495, + "step": 28450 + }, + { + "epoch": 0.5282133349175525, + "grad_norm": 0.47973716259002686, + "learning_rate": 9.114627910183205e-06, + "loss": 0.2553, + "step": 28452 + }, + { + "epoch": 0.5282504650549712, + "grad_norm": 0.3574253022670746, + "learning_rate": 9.113466006506787e-06, + "loss": 0.2375, + "step": 28454 + }, + { + "epoch": 0.5282875951923898, + "grad_norm": 0.30163365602493286, + "learning_rate": 9.112304114893443e-06, + "loss": 0.4018, + "step": 28456 + }, + { + "epoch": 0.5283247253298085, + "grad_norm": 0.44410839676856995, + "learning_rate": 9.11114223535898e-06, + "loss": 0.2864, + "step": 28458 + }, + { + "epoch": 0.5283618554672271, + "grad_norm": 0.7240673899650574, + "learning_rate": 9.109980367919207e-06, + "loss": 0.2536, + "step": 28460 + }, + { + "epoch": 0.5283989856046457, + "grad_norm": 0.3009434938430786, + "learning_rate": 9.108818512589936e-06, + "loss": 0.4371, + "step": 28462 + }, + { + "epoch": 0.5284361157420644, + "grad_norm": 0.2967061698436737, + "learning_rate": 9.10765666938698e-06, + "loss": 0.1812, + "step": 28464 + }, + { + "epoch": 0.528473245879483, + "grad_norm": 0.2505098879337311, + "learning_rate": 9.106494838326142e-06, + "loss": 0.1836, + "step": 28466 + }, + { + "epoch": 0.5285103760169017, + "grad_norm": 0.543154239654541, + "learning_rate": 9.105333019423229e-06, + "loss": 0.2606, + "step": 28468 + }, + { + "epoch": 0.5285475061543202, + "grad_norm": 0.35611391067504883, + "learning_rate": 9.104171212694055e-06, + "loss": 0.1888, + "step": 28470 + }, + { + "epoch": 0.5285846362917389, + "grad_norm": 0.3405942916870117, + "learning_rate": 9.103009418154427e-06, + "loss": 0.3491, + "step": 28472 + }, + { + "epoch": 0.5286217664291576, + "grad_norm": 0.41428476572036743, + "learning_rate": 9.101847635820157e-06, + "loss": 0.2503, + "step": 28474 + }, + { + "epoch": 0.5286588965665762, + "grad_norm": 0.3893307149410248, + "learning_rate": 9.100685865707049e-06, + "loss": 0.3167, + "step": 28476 + }, + { + "epoch": 0.5286960267039948, + "grad_norm": 0.3714529275894165, + "learning_rate": 9.099524107830915e-06, + "loss": 0.2076, + "step": 28478 + }, + { + "epoch": 0.5287331568414134, + "grad_norm": 0.4382079243659973, + "learning_rate": 9.098362362207557e-06, + "loss": 0.2234, + "step": 28480 + }, + { + "epoch": 0.5287702869788321, + "grad_norm": 0.48060309886932373, + "learning_rate": 9.097200628852788e-06, + "loss": 0.2515, + "step": 28482 + }, + { + "epoch": 0.5288074171162508, + "grad_norm": 0.16425210237503052, + "learning_rate": 9.096038907782412e-06, + "loss": 0.2991, + "step": 28484 + }, + { + "epoch": 0.5288445472536694, + "grad_norm": 0.3998776376247406, + "learning_rate": 9.094877199012241e-06, + "loss": 0.4614, + "step": 28486 + }, + { + "epoch": 0.528881677391088, + "grad_norm": 0.3245198130607605, + "learning_rate": 9.09371550255808e-06, + "loss": 0.2493, + "step": 28488 + }, + { + "epoch": 0.5289188075285066, + "grad_norm": 0.3437099754810333, + "learning_rate": 9.092553818435739e-06, + "loss": 0.2995, + "step": 28490 + }, + { + "epoch": 0.5289559376659253, + "grad_norm": 0.3244602084159851, + "learning_rate": 9.09139214666102e-06, + "loss": 0.5073, + "step": 28492 + }, + { + "epoch": 0.528993067803344, + "grad_norm": 0.4118961691856384, + "learning_rate": 9.09023048724973e-06, + "loss": 0.3693, + "step": 28494 + }, + { + "epoch": 0.5290301979407626, + "grad_norm": 0.4771861135959625, + "learning_rate": 9.089068840217681e-06, + "loss": 0.2352, + "step": 28496 + }, + { + "epoch": 0.5290673280781812, + "grad_norm": 0.3344717025756836, + "learning_rate": 9.087907205580675e-06, + "loss": 0.3935, + "step": 28498 + }, + { + "epoch": 0.5291044582155998, + "grad_norm": 0.7406889200210571, + "learning_rate": 9.086745583354518e-06, + "loss": 0.4143, + "step": 28500 + }, + { + "epoch": 0.5291415883530185, + "grad_norm": 0.3255789279937744, + "learning_rate": 9.085583973555024e-06, + "loss": 0.3686, + "step": 28502 + }, + { + "epoch": 0.5291787184904372, + "grad_norm": 0.36220628023147583, + "learning_rate": 9.08442237619799e-06, + "loss": 0.2573, + "step": 28504 + }, + { + "epoch": 0.5292158486278558, + "grad_norm": 0.3419683277606964, + "learning_rate": 9.083260791299223e-06, + "loss": 0.2444, + "step": 28506 + }, + { + "epoch": 0.5292529787652744, + "grad_norm": 0.3914368450641632, + "learning_rate": 9.082099218874535e-06, + "loss": 0.4161, + "step": 28508 + }, + { + "epoch": 0.529290108902693, + "grad_norm": 0.38675177097320557, + "learning_rate": 9.080937658939721e-06, + "loss": 0.2112, + "step": 28510 + }, + { + "epoch": 0.5293272390401117, + "grad_norm": 0.37038740515708923, + "learning_rate": 9.079776111510597e-06, + "loss": 0.2183, + "step": 28512 + }, + { + "epoch": 0.5293643691775304, + "grad_norm": 0.2745799124240875, + "learning_rate": 9.07861457660296e-06, + "loss": 0.2575, + "step": 28514 + }, + { + "epoch": 0.529401499314949, + "grad_norm": 0.26264482736587524, + "learning_rate": 9.077453054232625e-06, + "loss": 0.3371, + "step": 28516 + }, + { + "epoch": 0.5294386294523676, + "grad_norm": 0.43180668354034424, + "learning_rate": 9.076291544415387e-06, + "loss": 0.1423, + "step": 28518 + }, + { + "epoch": 0.5294757595897862, + "grad_norm": 0.36550915241241455, + "learning_rate": 9.075130047167056e-06, + "loss": 0.2119, + "step": 28520 + }, + { + "epoch": 0.5295128897272049, + "grad_norm": 0.2861661911010742, + "learning_rate": 9.07396856250343e-06, + "loss": 0.4124, + "step": 28522 + }, + { + "epoch": 0.5295500198646235, + "grad_norm": 0.5105169415473938, + "learning_rate": 9.072807090440321e-06, + "loss": 0.2854, + "step": 28524 + }, + { + "epoch": 0.5295871500020422, + "grad_norm": 0.2688589096069336, + "learning_rate": 9.071645630993533e-06, + "loss": 0.3294, + "step": 28526 + }, + { + "epoch": 0.5296242801394608, + "grad_norm": 0.3442625403404236, + "learning_rate": 9.070484184178865e-06, + "loss": 0.2519, + "step": 28528 + }, + { + "epoch": 0.5296614102768794, + "grad_norm": 0.4653208255767822, + "learning_rate": 9.069322750012125e-06, + "loss": 0.3002, + "step": 28530 + }, + { + "epoch": 0.5296985404142981, + "grad_norm": 0.27355819940567017, + "learning_rate": 9.06816132850911e-06, + "loss": 0.2936, + "step": 28532 + }, + { + "epoch": 0.5297356705517167, + "grad_norm": 0.20658884942531586, + "learning_rate": 9.06699991968563e-06, + "loss": 0.1736, + "step": 28534 + }, + { + "epoch": 0.5297728006891353, + "grad_norm": 0.49564096331596375, + "learning_rate": 9.065838523557485e-06, + "loss": 0.4089, + "step": 28536 + }, + { + "epoch": 0.529809930826554, + "grad_norm": 0.5529326796531677, + "learning_rate": 9.064677140140483e-06, + "loss": 0.2418, + "step": 28538 + }, + { + "epoch": 0.5298470609639726, + "grad_norm": 0.4253714680671692, + "learning_rate": 9.063515769450423e-06, + "loss": 0.1565, + "step": 28540 + }, + { + "epoch": 0.5298841911013913, + "grad_norm": 0.5363646745681763, + "learning_rate": 9.062354411503108e-06, + "loss": 0.3174, + "step": 28542 + }, + { + "epoch": 0.5299213212388099, + "grad_norm": 0.2433885931968689, + "learning_rate": 9.06119306631434e-06, + "loss": 0.3749, + "step": 28544 + }, + { + "epoch": 0.5299584513762285, + "grad_norm": 0.3358222544193268, + "learning_rate": 9.060031733899921e-06, + "loss": 0.3841, + "step": 28546 + }, + { + "epoch": 0.5299955815136472, + "grad_norm": 0.336128830909729, + "learning_rate": 9.058870414275656e-06, + "loss": 0.3237, + "step": 28548 + }, + { + "epoch": 0.5300327116510658, + "grad_norm": 0.3295275866985321, + "learning_rate": 9.057709107457347e-06, + "loss": 0.2777, + "step": 28550 + }, + { + "epoch": 0.5300698417884845, + "grad_norm": 0.31791406869888306, + "learning_rate": 9.05654781346079e-06, + "loss": 0.1994, + "step": 28552 + }, + { + "epoch": 0.5301069719259031, + "grad_norm": 0.4116072356700897, + "learning_rate": 9.055386532301799e-06, + "loss": 0.3552, + "step": 28554 + }, + { + "epoch": 0.5301441020633217, + "grad_norm": 0.5951401591300964, + "learning_rate": 9.054225263996162e-06, + "loss": 0.3792, + "step": 28556 + }, + { + "epoch": 0.5301812322007404, + "grad_norm": 0.40632858872413635, + "learning_rate": 9.053064008559686e-06, + "loss": 0.3236, + "step": 28558 + }, + { + "epoch": 0.530218362338159, + "grad_norm": 0.337022989988327, + "learning_rate": 9.051902766008175e-06, + "loss": 0.3186, + "step": 28560 + }, + { + "epoch": 0.5302554924755777, + "grad_norm": 0.40571415424346924, + "learning_rate": 9.050741536357427e-06, + "loss": 0.1985, + "step": 28562 + }, + { + "epoch": 0.5302926226129963, + "grad_norm": 0.3221340477466583, + "learning_rate": 9.049580319623242e-06, + "loss": 0.4269, + "step": 28564 + }, + { + "epoch": 0.5303297527504149, + "grad_norm": 0.27010414004325867, + "learning_rate": 9.048419115821427e-06, + "loss": 0.2755, + "step": 28566 + }, + { + "epoch": 0.5303668828878335, + "grad_norm": 0.3577069640159607, + "learning_rate": 9.047257924967772e-06, + "loss": 0.173, + "step": 28568 + }, + { + "epoch": 0.5304040130252522, + "grad_norm": 0.4532218277454376, + "learning_rate": 9.046096747078083e-06, + "loss": 0.2328, + "step": 28570 + }, + { + "epoch": 0.5304411431626709, + "grad_norm": 0.27819734811782837, + "learning_rate": 9.044935582168162e-06, + "loss": 0.21, + "step": 28572 + }, + { + "epoch": 0.5304782733000895, + "grad_norm": 0.4359601140022278, + "learning_rate": 9.043774430253806e-06, + "loss": 0.2805, + "step": 28574 + }, + { + "epoch": 0.5305154034375081, + "grad_norm": 0.3099932372570038, + "learning_rate": 9.042613291350815e-06, + "loss": 0.1423, + "step": 28576 + }, + { + "epoch": 0.5305525335749267, + "grad_norm": 0.4820757806301117, + "learning_rate": 9.041452165474993e-06, + "loss": 0.3898, + "step": 28578 + }, + { + "epoch": 0.5305896637123454, + "grad_norm": 0.556048572063446, + "learning_rate": 9.040291052642131e-06, + "loss": 0.3458, + "step": 28580 + }, + { + "epoch": 0.5306267938497641, + "grad_norm": 0.3959524929523468, + "learning_rate": 9.039129952868037e-06, + "loss": 0.1553, + "step": 28582 + }, + { + "epoch": 0.5306639239871827, + "grad_norm": 0.3517950475215912, + "learning_rate": 9.037968866168504e-06, + "loss": 0.3405, + "step": 28584 + }, + { + "epoch": 0.5307010541246013, + "grad_norm": 0.32837456464767456, + "learning_rate": 9.036807792559334e-06, + "loss": 0.0859, + "step": 28586 + }, + { + "epoch": 0.5307381842620199, + "grad_norm": 0.41319870948791504, + "learning_rate": 9.035646732056322e-06, + "loss": 0.2147, + "step": 28588 + }, + { + "epoch": 0.5307753143994386, + "grad_norm": 0.33680450916290283, + "learning_rate": 9.034485684675273e-06, + "loss": 0.4008, + "step": 28590 + }, + { + "epoch": 0.5308124445368573, + "grad_norm": 0.4057881832122803, + "learning_rate": 9.033324650431983e-06, + "loss": 0.1335, + "step": 28592 + }, + { + "epoch": 0.5308495746742758, + "grad_norm": 0.27993640303611755, + "learning_rate": 9.032163629342248e-06, + "loss": 0.2056, + "step": 28594 + }, + { + "epoch": 0.5308867048116945, + "grad_norm": 0.32959744334220886, + "learning_rate": 9.031002621421864e-06, + "loss": 0.3711, + "step": 28596 + }, + { + "epoch": 0.5309238349491131, + "grad_norm": 0.5029144883155823, + "learning_rate": 9.029841626686633e-06, + "loss": 0.235, + "step": 28598 + }, + { + "epoch": 0.5309609650865318, + "grad_norm": 0.3615346848964691, + "learning_rate": 9.02868064515235e-06, + "loss": 0.2299, + "step": 28600 + }, + { + "epoch": 0.5309980952239505, + "grad_norm": 0.49010297656059265, + "learning_rate": 9.027519676834817e-06, + "loss": 0.1367, + "step": 28602 + }, + { + "epoch": 0.531035225361369, + "grad_norm": 0.43259549140930176, + "learning_rate": 9.026358721749831e-06, + "loss": 0.3543, + "step": 28604 + }, + { + "epoch": 0.5310723554987877, + "grad_norm": 0.5976904630661011, + "learning_rate": 9.025197779913182e-06, + "loss": 0.2803, + "step": 28606 + }, + { + "epoch": 0.5311094856362063, + "grad_norm": 0.44180819392204285, + "learning_rate": 9.024036851340671e-06, + "loss": 0.1787, + "step": 28608 + }, + { + "epoch": 0.531146615773625, + "grad_norm": 0.3821260929107666, + "learning_rate": 9.022875936048095e-06, + "loss": 0.3309, + "step": 28610 + }, + { + "epoch": 0.5311837459110437, + "grad_norm": 0.27863022685050964, + "learning_rate": 9.021715034051253e-06, + "loss": 0.2108, + "step": 28612 + }, + { + "epoch": 0.5312208760484622, + "grad_norm": 0.27270522713661194, + "learning_rate": 9.020554145365937e-06, + "loss": 0.3301, + "step": 28614 + }, + { + "epoch": 0.5312580061858809, + "grad_norm": 0.09505432844161987, + "learning_rate": 9.01939327000795e-06, + "loss": 0.1631, + "step": 28616 + }, + { + "epoch": 0.5312951363232995, + "grad_norm": 0.3926463723182678, + "learning_rate": 9.018232407993079e-06, + "loss": 0.4673, + "step": 28618 + }, + { + "epoch": 0.5313322664607182, + "grad_norm": 0.3896433115005493, + "learning_rate": 9.017071559337122e-06, + "loss": 0.3096, + "step": 28620 + }, + { + "epoch": 0.5313693965981368, + "grad_norm": 0.5015110373497009, + "learning_rate": 9.01591072405588e-06, + "loss": 0.2623, + "step": 28622 + }, + { + "epoch": 0.5314065267355554, + "grad_norm": 0.25698205828666687, + "learning_rate": 9.014749902165145e-06, + "loss": 0.2308, + "step": 28624 + }, + { + "epoch": 0.5314436568729741, + "grad_norm": 0.2424246072769165, + "learning_rate": 9.013589093680712e-06, + "loss": 0.1352, + "step": 28626 + }, + { + "epoch": 0.5314807870103927, + "grad_norm": 0.328136146068573, + "learning_rate": 9.012428298618381e-06, + "loss": 0.3104, + "step": 28628 + }, + { + "epoch": 0.5315179171478114, + "grad_norm": 0.29206541180610657, + "learning_rate": 9.011267516993938e-06, + "loss": 0.2803, + "step": 28630 + }, + { + "epoch": 0.53155504728523, + "grad_norm": 0.6134827136993408, + "learning_rate": 9.010106748823184e-06, + "loss": 0.3915, + "step": 28632 + }, + { + "epoch": 0.5315921774226486, + "grad_norm": 0.3014030158519745, + "learning_rate": 9.008945994121912e-06, + "loss": 0.2853, + "step": 28634 + }, + { + "epoch": 0.5316293075600673, + "grad_norm": 0.38580888509750366, + "learning_rate": 9.007785252905914e-06, + "loss": 0.1835, + "step": 28636 + }, + { + "epoch": 0.5316664376974859, + "grad_norm": 0.3474043309688568, + "learning_rate": 9.006624525190988e-06, + "loss": 0.2729, + "step": 28638 + }, + { + "epoch": 0.5317035678349046, + "grad_norm": 0.37960511445999146, + "learning_rate": 9.005463810992928e-06, + "loss": 0.193, + "step": 28640 + }, + { + "epoch": 0.5317406979723232, + "grad_norm": 0.29074230790138245, + "learning_rate": 9.00430311032753e-06, + "loss": 0.1112, + "step": 28642 + }, + { + "epoch": 0.5317778281097418, + "grad_norm": 0.2615014314651489, + "learning_rate": 9.003142423210578e-06, + "loss": 0.2699, + "step": 28644 + }, + { + "epoch": 0.5318149582471605, + "grad_norm": 0.47838708758354187, + "learning_rate": 9.001981749657876e-06, + "loss": 0.4673, + "step": 28646 + }, + { + "epoch": 0.5318520883845791, + "grad_norm": 0.5226272344589233, + "learning_rate": 9.00082108968521e-06, + "loss": 0.3158, + "step": 28648 + }, + { + "epoch": 0.5318892185219978, + "grad_norm": 0.3913807272911072, + "learning_rate": 8.999660443308376e-06, + "loss": 0.4194, + "step": 28650 + }, + { + "epoch": 0.5319263486594163, + "grad_norm": 0.31727370619773865, + "learning_rate": 8.998499810543167e-06, + "loss": 0.2641, + "step": 28652 + }, + { + "epoch": 0.531963478796835, + "grad_norm": 0.6132047772407532, + "learning_rate": 8.99733919140538e-06, + "loss": 0.2269, + "step": 28654 + }, + { + "epoch": 0.5320006089342537, + "grad_norm": 0.5233863592147827, + "learning_rate": 8.9961785859108e-06, + "loss": 0.3324, + "step": 28656 + }, + { + "epoch": 0.5320377390716723, + "grad_norm": 0.3149389326572418, + "learning_rate": 8.995017994075223e-06, + "loss": 0.4466, + "step": 28658 + }, + { + "epoch": 0.532074869209091, + "grad_norm": 0.5390313863754272, + "learning_rate": 8.99385741591444e-06, + "loss": 0.1472, + "step": 28660 + }, + { + "epoch": 0.5321119993465095, + "grad_norm": 0.5517196655273438, + "learning_rate": 8.992696851444243e-06, + "loss": 0.2837, + "step": 28662 + }, + { + "epoch": 0.5321491294839282, + "grad_norm": 0.5110749006271362, + "learning_rate": 8.991536300680427e-06, + "loss": 0.3229, + "step": 28664 + }, + { + "epoch": 0.5321862596213469, + "grad_norm": 0.2862986624240875, + "learning_rate": 8.99037576363878e-06, + "loss": 0.3643, + "step": 28666 + }, + { + "epoch": 0.5322233897587655, + "grad_norm": 0.30766740441322327, + "learning_rate": 8.989215240335099e-06, + "loss": 0.2406, + "step": 28668 + }, + { + "epoch": 0.5322605198961842, + "grad_norm": 0.5216443538665771, + "learning_rate": 8.988054730785165e-06, + "loss": 0.2923, + "step": 28670 + }, + { + "epoch": 0.5322976500336027, + "grad_norm": 0.4727714955806732, + "learning_rate": 8.986894235004775e-06, + "loss": 0.276, + "step": 28672 + }, + { + "epoch": 0.5323347801710214, + "grad_norm": 0.5130835771560669, + "learning_rate": 8.985733753009722e-06, + "loss": 0.0979, + "step": 28674 + }, + { + "epoch": 0.53237191030844, + "grad_norm": 0.5627540349960327, + "learning_rate": 8.984573284815796e-06, + "loss": 0.1872, + "step": 28676 + }, + { + "epoch": 0.5324090404458587, + "grad_norm": 0.4905412197113037, + "learning_rate": 8.983412830438783e-06, + "loss": 0.3372, + "step": 28678 + }, + { + "epoch": 0.5324461705832774, + "grad_norm": 0.45058009028434753, + "learning_rate": 8.98225238989448e-06, + "loss": 0.3828, + "step": 28680 + }, + { + "epoch": 0.5324833007206959, + "grad_norm": 0.4002496004104614, + "learning_rate": 8.98109196319867e-06, + "loss": 0.3114, + "step": 28682 + }, + { + "epoch": 0.5325204308581146, + "grad_norm": 0.3322855234146118, + "learning_rate": 8.979931550367147e-06, + "loss": 0.2278, + "step": 28684 + }, + { + "epoch": 0.5325575609955332, + "grad_norm": 0.2826186418533325, + "learning_rate": 8.9787711514157e-06, + "loss": 0.2277, + "step": 28686 + }, + { + "epoch": 0.5325946911329519, + "grad_norm": 0.5161633491516113, + "learning_rate": 8.97761076636012e-06, + "loss": 0.3175, + "step": 28688 + }, + { + "epoch": 0.5326318212703706, + "grad_norm": 0.3529422879219055, + "learning_rate": 8.976450395216194e-06, + "loss": 0.2363, + "step": 28690 + }, + { + "epoch": 0.5326689514077891, + "grad_norm": 0.29578402638435364, + "learning_rate": 8.975290037999715e-06, + "loss": 0.2273, + "step": 28692 + }, + { + "epoch": 0.5327060815452078, + "grad_norm": 0.4117681086063385, + "learning_rate": 8.974129694726466e-06, + "loss": 0.2895, + "step": 28694 + }, + { + "epoch": 0.5327432116826264, + "grad_norm": 0.39813968539237976, + "learning_rate": 8.97296936541224e-06, + "loss": 0.3968, + "step": 28696 + }, + { + "epoch": 0.5327803418200451, + "grad_norm": 0.2933594584465027, + "learning_rate": 8.971809050072824e-06, + "loss": 0.2852, + "step": 28698 + }, + { + "epoch": 0.5328174719574638, + "grad_norm": 0.5049290657043457, + "learning_rate": 8.970648748724008e-06, + "loss": 0.2552, + "step": 28700 + }, + { + "epoch": 0.5328546020948823, + "grad_norm": 0.687104344367981, + "learning_rate": 8.969488461381579e-06, + "loss": 0.2803, + "step": 28702 + }, + { + "epoch": 0.532891732232301, + "grad_norm": 0.6806204319000244, + "learning_rate": 8.968328188061326e-06, + "loss": 0.2774, + "step": 28704 + }, + { + "epoch": 0.5329288623697196, + "grad_norm": 0.5600220561027527, + "learning_rate": 8.967167928779036e-06, + "loss": 0.2249, + "step": 28706 + }, + { + "epoch": 0.5329659925071383, + "grad_norm": 0.2667781412601471, + "learning_rate": 8.966007683550495e-06, + "loss": 0.0993, + "step": 28708 + }, + { + "epoch": 0.533003122644557, + "grad_norm": 0.5475757718086243, + "learning_rate": 8.964847452391494e-06, + "loss": 0.1468, + "step": 28710 + }, + { + "epoch": 0.5330402527819755, + "grad_norm": 0.5000697374343872, + "learning_rate": 8.963687235317818e-06, + "loss": 0.3236, + "step": 28712 + }, + { + "epoch": 0.5330773829193942, + "grad_norm": 0.4197654128074646, + "learning_rate": 8.962527032345252e-06, + "loss": 0.2815, + "step": 28714 + }, + { + "epoch": 0.5331145130568128, + "grad_norm": 0.5457786917686462, + "learning_rate": 8.961366843489587e-06, + "loss": 0.2604, + "step": 28716 + }, + { + "epoch": 0.5331516431942315, + "grad_norm": 0.5328965187072754, + "learning_rate": 8.960206668766614e-06, + "loss": 0.3618, + "step": 28718 + }, + { + "epoch": 0.53318877333165, + "grad_norm": 0.2846589684486389, + "learning_rate": 8.95904650819211e-06, + "loss": 0.286, + "step": 28720 + }, + { + "epoch": 0.5332259034690687, + "grad_norm": 0.28273579478263855, + "learning_rate": 8.957886361781862e-06, + "loss": 0.3344, + "step": 28722 + }, + { + "epoch": 0.5332630336064874, + "grad_norm": 0.49682775139808655, + "learning_rate": 8.95672622955166e-06, + "loss": 0.2476, + "step": 28724 + }, + { + "epoch": 0.533300163743906, + "grad_norm": 0.5343006253242493, + "learning_rate": 8.955566111517288e-06, + "loss": 0.324, + "step": 28726 + }, + { + "epoch": 0.5333372938813247, + "grad_norm": 0.3209720551967621, + "learning_rate": 8.954406007694537e-06, + "loss": 0.3463, + "step": 28728 + }, + { + "epoch": 0.5333744240187432, + "grad_norm": 0.41841772198677063, + "learning_rate": 8.953245918099188e-06, + "loss": 0.1869, + "step": 28730 + }, + { + "epoch": 0.5334115541561619, + "grad_norm": 0.32088202238082886, + "learning_rate": 8.952085842747026e-06, + "loss": 0.4696, + "step": 28732 + }, + { + "epoch": 0.5334486842935806, + "grad_norm": 0.2829832136631012, + "learning_rate": 8.950925781653834e-06, + "loss": 0.3488, + "step": 28734 + }, + { + "epoch": 0.5334858144309992, + "grad_norm": 0.2852284908294678, + "learning_rate": 8.949765734835401e-06, + "loss": 0.2503, + "step": 28736 + }, + { + "epoch": 0.5335229445684179, + "grad_norm": 0.3803352415561676, + "learning_rate": 8.94860570230751e-06, + "loss": 0.4592, + "step": 28738 + }, + { + "epoch": 0.5335600747058364, + "grad_norm": 0.3303178548812866, + "learning_rate": 8.947445684085948e-06, + "loss": 0.2805, + "step": 28740 + }, + { + "epoch": 0.5335972048432551, + "grad_norm": 0.4031260907649994, + "learning_rate": 8.9462856801865e-06, + "loss": 0.4815, + "step": 28742 + }, + { + "epoch": 0.5336343349806738, + "grad_norm": 0.2629953920841217, + "learning_rate": 8.945125690624943e-06, + "loss": 0.2595, + "step": 28744 + }, + { + "epoch": 0.5336714651180924, + "grad_norm": 0.3868923485279083, + "learning_rate": 8.943965715417065e-06, + "loss": 0.2858, + "step": 28746 + }, + { + "epoch": 0.5337085952555111, + "grad_norm": 0.4546166658401489, + "learning_rate": 8.942805754578651e-06, + "loss": 0.175, + "step": 28748 + }, + { + "epoch": 0.5337457253929296, + "grad_norm": 0.3049778640270233, + "learning_rate": 8.941645808125487e-06, + "loss": 0.2588, + "step": 28750 + }, + { + "epoch": 0.5337828555303483, + "grad_norm": 0.36252841353416443, + "learning_rate": 8.94048587607335e-06, + "loss": 0.2864, + "step": 28752 + }, + { + "epoch": 0.533819985667767, + "grad_norm": 0.4173068702220917, + "learning_rate": 8.939325958438033e-06, + "loss": 0.3098, + "step": 28754 + }, + { + "epoch": 0.5338571158051856, + "grad_norm": 0.5044727921485901, + "learning_rate": 8.938166055235307e-06, + "loss": 0.254, + "step": 28756 + }, + { + "epoch": 0.5338942459426043, + "grad_norm": 0.21815672516822815, + "learning_rate": 8.93700616648096e-06, + "loss": 0.2525, + "step": 28758 + }, + { + "epoch": 0.5339313760800228, + "grad_norm": 0.2718968093395233, + "learning_rate": 8.935846292190775e-06, + "loss": 0.3024, + "step": 28760 + }, + { + "epoch": 0.5339685062174415, + "grad_norm": 0.38695719838142395, + "learning_rate": 8.934686432380537e-06, + "loss": 0.4865, + "step": 28762 + }, + { + "epoch": 0.5340056363548602, + "grad_norm": 0.5715163350105286, + "learning_rate": 8.933526587066023e-06, + "loss": 0.3981, + "step": 28764 + }, + { + "epoch": 0.5340427664922788, + "grad_norm": 0.49943724274635315, + "learning_rate": 8.93236675626302e-06, + "loss": 0.3209, + "step": 28766 + }, + { + "epoch": 0.5340798966296975, + "grad_norm": 0.2775653600692749, + "learning_rate": 8.931206939987308e-06, + "loss": 0.2458, + "step": 28768 + }, + { + "epoch": 0.534117026767116, + "grad_norm": 0.5197280645370483, + "learning_rate": 8.930047138254667e-06, + "loss": 0.2593, + "step": 28770 + }, + { + "epoch": 0.5341541569045347, + "grad_norm": 0.405407190322876, + "learning_rate": 8.92888735108088e-06, + "loss": 0.2056, + "step": 28772 + }, + { + "epoch": 0.5341912870419533, + "grad_norm": 0.2584144175052643, + "learning_rate": 8.927727578481727e-06, + "loss": 0.338, + "step": 28774 + }, + { + "epoch": 0.534228417179372, + "grad_norm": 0.23247696459293365, + "learning_rate": 8.926567820472989e-06, + "loss": 0.2809, + "step": 28776 + }, + { + "epoch": 0.5342655473167907, + "grad_norm": 0.1724630445241928, + "learning_rate": 8.925408077070448e-06, + "loss": 0.1288, + "step": 28778 + }, + { + "epoch": 0.5343026774542092, + "grad_norm": 0.5522811412811279, + "learning_rate": 8.924248348289888e-06, + "loss": 0.2924, + "step": 28780 + }, + { + "epoch": 0.5343398075916279, + "grad_norm": 0.5627481937408447, + "learning_rate": 8.92308863414708e-06, + "loss": 0.3517, + "step": 28782 + }, + { + "epoch": 0.5343769377290465, + "grad_norm": 0.43672919273376465, + "learning_rate": 8.921928934657814e-06, + "loss": 0.1331, + "step": 28784 + }, + { + "epoch": 0.5344140678664652, + "grad_norm": 0.28904980421066284, + "learning_rate": 8.920769249837865e-06, + "loss": 0.3959, + "step": 28786 + }, + { + "epoch": 0.5344511980038839, + "grad_norm": 0.3805067837238312, + "learning_rate": 8.919609579703013e-06, + "loss": 0.1751, + "step": 28788 + }, + { + "epoch": 0.5344883281413024, + "grad_norm": 0.41834399104118347, + "learning_rate": 8.918449924269038e-06, + "loss": 0.3534, + "step": 28790 + }, + { + "epoch": 0.5345254582787211, + "grad_norm": 0.3008694052696228, + "learning_rate": 8.917290283551724e-06, + "loss": 0.4662, + "step": 28792 + }, + { + "epoch": 0.5345625884161397, + "grad_norm": 0.35984915494918823, + "learning_rate": 8.916130657566844e-06, + "loss": 0.3961, + "step": 28794 + }, + { + "epoch": 0.5345997185535584, + "grad_norm": 0.26578521728515625, + "learning_rate": 8.914971046330177e-06, + "loss": 0.2273, + "step": 28796 + }, + { + "epoch": 0.534636848690977, + "grad_norm": 0.4290737807750702, + "learning_rate": 8.913811449857505e-06, + "loss": 0.3847, + "step": 28798 + }, + { + "epoch": 0.5346739788283956, + "grad_norm": 0.30025798082351685, + "learning_rate": 8.912651868164604e-06, + "loss": 0.1242, + "step": 28800 + }, + { + "epoch": 0.5347111089658143, + "grad_norm": 0.5194622278213501, + "learning_rate": 8.911492301267257e-06, + "loss": 0.2932, + "step": 28802 + }, + { + "epoch": 0.5347482391032329, + "grad_norm": 0.3497410714626312, + "learning_rate": 8.910332749181239e-06, + "loss": 0.1945, + "step": 28804 + }, + { + "epoch": 0.5347853692406516, + "grad_norm": 0.5166685581207275, + "learning_rate": 8.909173211922331e-06, + "loss": 0.3248, + "step": 28806 + }, + { + "epoch": 0.5348224993780702, + "grad_norm": 0.6599903106689453, + "learning_rate": 8.908013689506302e-06, + "loss": 0.2456, + "step": 28808 + }, + { + "epoch": 0.5348596295154888, + "grad_norm": 0.24852032959461212, + "learning_rate": 8.906854181948939e-06, + "loss": 0.1787, + "step": 28810 + }, + { + "epoch": 0.5348967596529075, + "grad_norm": 0.5869738459587097, + "learning_rate": 8.905694689266014e-06, + "loss": 0.3927, + "step": 28812 + }, + { + "epoch": 0.5349338897903261, + "grad_norm": 0.7700765132904053, + "learning_rate": 8.90453521147331e-06, + "loss": 0.3076, + "step": 28814 + }, + { + "epoch": 0.5349710199277448, + "grad_norm": 0.3175654113292694, + "learning_rate": 8.903375748586597e-06, + "loss": 0.2964, + "step": 28816 + }, + { + "epoch": 0.5350081500651634, + "grad_norm": 0.24191737174987793, + "learning_rate": 8.902216300621662e-06, + "loss": 0.452, + "step": 28818 + }, + { + "epoch": 0.535045280202582, + "grad_norm": 0.4792058765888214, + "learning_rate": 8.901056867594269e-06, + "loss": 0.1772, + "step": 28820 + }, + { + "epoch": 0.5350824103400007, + "grad_norm": 0.4530286192893982, + "learning_rate": 8.8998974495202e-06, + "loss": 0.2067, + "step": 28822 + }, + { + "epoch": 0.5351195404774193, + "grad_norm": 0.3312215209007263, + "learning_rate": 8.898738046415231e-06, + "loss": 0.2798, + "step": 28824 + }, + { + "epoch": 0.535156670614838, + "grad_norm": 0.4080938398838043, + "learning_rate": 8.897578658295142e-06, + "loss": 0.2407, + "step": 28826 + }, + { + "epoch": 0.5351938007522565, + "grad_norm": 0.34011387825012207, + "learning_rate": 8.896419285175704e-06, + "loss": 0.2968, + "step": 28828 + }, + { + "epoch": 0.5352309308896752, + "grad_norm": 0.345647394657135, + "learning_rate": 8.895259927072698e-06, + "loss": 0.3059, + "step": 28830 + }, + { + "epoch": 0.5352680610270939, + "grad_norm": 0.5208058953285217, + "learning_rate": 8.894100584001889e-06, + "loss": 0.32, + "step": 28832 + }, + { + "epoch": 0.5353051911645125, + "grad_norm": 0.3290981352329254, + "learning_rate": 8.89294125597906e-06, + "loss": 0.3565, + "step": 28834 + }, + { + "epoch": 0.5353423213019312, + "grad_norm": 0.2666882872581482, + "learning_rate": 8.891781943019986e-06, + "loss": 0.2079, + "step": 28836 + }, + { + "epoch": 0.5353794514393497, + "grad_norm": 0.35050931572914124, + "learning_rate": 8.890622645140438e-06, + "loss": 0.1663, + "step": 28838 + }, + { + "epoch": 0.5354165815767684, + "grad_norm": 0.41442951560020447, + "learning_rate": 8.889463362356193e-06, + "loss": 0.2077, + "step": 28840 + }, + { + "epoch": 0.5354537117141871, + "grad_norm": 0.3473535180091858, + "learning_rate": 8.888304094683025e-06, + "loss": 0.1456, + "step": 28842 + }, + { + "epoch": 0.5354908418516057, + "grad_norm": 0.392761766910553, + "learning_rate": 8.887144842136713e-06, + "loss": 0.2245, + "step": 28844 + }, + { + "epoch": 0.5355279719890244, + "grad_norm": 0.4616940915584564, + "learning_rate": 8.885985604733021e-06, + "loss": 0.348, + "step": 28846 + }, + { + "epoch": 0.5355651021264429, + "grad_norm": 0.3946167826652527, + "learning_rate": 8.884826382487732e-06, + "loss": 0.1951, + "step": 28848 + }, + { + "epoch": 0.5356022322638616, + "grad_norm": 0.3316513001918793, + "learning_rate": 8.883667175416613e-06, + "loss": 0.2824, + "step": 28850 + }, + { + "epoch": 0.5356393624012803, + "grad_norm": 0.3715413510799408, + "learning_rate": 8.882507983535438e-06, + "loss": 0.2765, + "step": 28852 + }, + { + "epoch": 0.5356764925386989, + "grad_norm": 0.33346349000930786, + "learning_rate": 8.881348806859984e-06, + "loss": 0.437, + "step": 28854 + }, + { + "epoch": 0.5357136226761176, + "grad_norm": 0.6776406764984131, + "learning_rate": 8.880189645406026e-06, + "loss": 0.4656, + "step": 28856 + }, + { + "epoch": 0.5357507528135361, + "grad_norm": 0.5393676161766052, + "learning_rate": 8.879030499189331e-06, + "loss": 0.3356, + "step": 28858 + }, + { + "epoch": 0.5357878829509548, + "grad_norm": 0.3181859254837036, + "learning_rate": 8.87787136822567e-06, + "loss": 0.3038, + "step": 28860 + }, + { + "epoch": 0.5358250130883735, + "grad_norm": 0.3831586539745331, + "learning_rate": 8.876712252530819e-06, + "loss": 0.3269, + "step": 28862 + }, + { + "epoch": 0.5358621432257921, + "grad_norm": 0.3261096775531769, + "learning_rate": 8.87555315212055e-06, + "loss": 0.3043, + "step": 28864 + }, + { + "epoch": 0.5358992733632107, + "grad_norm": 0.43998849391937256, + "learning_rate": 8.874394067010636e-06, + "loss": 0.3892, + "step": 28866 + }, + { + "epoch": 0.5359364035006293, + "grad_norm": 0.32244300842285156, + "learning_rate": 8.873234997216847e-06, + "loss": 0.3034, + "step": 28868 + }, + { + "epoch": 0.535973533638048, + "grad_norm": 0.670664370059967, + "learning_rate": 8.872075942754956e-06, + "loss": 0.4198, + "step": 28870 + }, + { + "epoch": 0.5360106637754666, + "grad_norm": 0.3537195324897766, + "learning_rate": 8.870916903640729e-06, + "loss": 0.2529, + "step": 28872 + }, + { + "epoch": 0.5360477939128853, + "grad_norm": 0.31270405650138855, + "learning_rate": 8.869757879889941e-06, + "loss": 0.1513, + "step": 28874 + }, + { + "epoch": 0.536084924050304, + "grad_norm": 0.5249885320663452, + "learning_rate": 8.868598871518365e-06, + "loss": 0.2813, + "step": 28876 + }, + { + "epoch": 0.5361220541877225, + "grad_norm": 0.2855727970600128, + "learning_rate": 8.86743987854177e-06, + "loss": 0.107, + "step": 28878 + }, + { + "epoch": 0.5361591843251412, + "grad_norm": 0.3103468120098114, + "learning_rate": 8.866280900975924e-06, + "loss": 0.3253, + "step": 28880 + }, + { + "epoch": 0.5361963144625598, + "grad_norm": 0.3224376440048218, + "learning_rate": 8.865121938836602e-06, + "loss": 0.2603, + "step": 28882 + }, + { + "epoch": 0.5362334445999785, + "grad_norm": 0.3249346613883972, + "learning_rate": 8.863962992139569e-06, + "loss": 0.1916, + "step": 28884 + }, + { + "epoch": 0.5362705747373971, + "grad_norm": 0.5392810702323914, + "learning_rate": 8.862804060900597e-06, + "loss": 0.2862, + "step": 28886 + }, + { + "epoch": 0.5363077048748157, + "grad_norm": 0.337587833404541, + "learning_rate": 8.861645145135456e-06, + "loss": 0.4876, + "step": 28888 + }, + { + "epoch": 0.5363448350122344, + "grad_norm": 0.24969573318958282, + "learning_rate": 8.860486244859911e-06, + "loss": 0.1564, + "step": 28890 + }, + { + "epoch": 0.536381965149653, + "grad_norm": 0.4109410047531128, + "learning_rate": 8.859327360089737e-06, + "loss": 0.4201, + "step": 28892 + }, + { + "epoch": 0.5364190952870717, + "grad_norm": 0.30118539929389954, + "learning_rate": 8.858168490840706e-06, + "loss": 0.3436, + "step": 28894 + }, + { + "epoch": 0.5364562254244903, + "grad_norm": 0.22410225868225098, + "learning_rate": 8.857009637128577e-06, + "loss": 0.3657, + "step": 28896 + }, + { + "epoch": 0.5364933555619089, + "grad_norm": 0.3994862139225006, + "learning_rate": 8.855850798969122e-06, + "loss": 0.4053, + "step": 28898 + }, + { + "epoch": 0.5365304856993276, + "grad_norm": 0.2635518014431, + "learning_rate": 8.854691976378114e-06, + "loss": 0.3909, + "step": 28900 + }, + { + "epoch": 0.5365676158367462, + "grad_norm": 0.4947318732738495, + "learning_rate": 8.853533169371315e-06, + "loss": 0.4727, + "step": 28902 + }, + { + "epoch": 0.5366047459741649, + "grad_norm": 0.3866179287433624, + "learning_rate": 8.852374377964496e-06, + "loss": 0.2078, + "step": 28904 + }, + { + "epoch": 0.5366418761115835, + "grad_norm": 0.3277641236782074, + "learning_rate": 8.851215602173427e-06, + "loss": 0.2229, + "step": 28906 + }, + { + "epoch": 0.5366790062490021, + "grad_norm": 0.39967960119247437, + "learning_rate": 8.85005684201387e-06, + "loss": 0.1722, + "step": 28908 + }, + { + "epoch": 0.5367161363864208, + "grad_norm": 0.3952726125717163, + "learning_rate": 8.848898097501594e-06, + "loss": 0.2345, + "step": 28910 + }, + { + "epoch": 0.5367532665238394, + "grad_norm": 0.8256970643997192, + "learning_rate": 8.847739368652368e-06, + "loss": 0.3172, + "step": 28912 + }, + { + "epoch": 0.536790396661258, + "grad_norm": 0.3216754198074341, + "learning_rate": 8.846580655481958e-06, + "loss": 0.4198, + "step": 28914 + }, + { + "epoch": 0.5368275267986767, + "grad_norm": 0.7213714718818665, + "learning_rate": 8.845421958006128e-06, + "loss": 0.4123, + "step": 28916 + }, + { + "epoch": 0.5368646569360953, + "grad_norm": 0.32979604601860046, + "learning_rate": 8.844263276240653e-06, + "loss": 0.1941, + "step": 28918 + }, + { + "epoch": 0.536901787073514, + "grad_norm": 0.19757139682769775, + "learning_rate": 8.843104610201288e-06, + "loss": 0.2762, + "step": 28920 + }, + { + "epoch": 0.5369389172109326, + "grad_norm": 0.44028541445732117, + "learning_rate": 8.841945959903807e-06, + "loss": 0.2592, + "step": 28922 + }, + { + "epoch": 0.5369760473483512, + "grad_norm": 0.2324734926223755, + "learning_rate": 8.840787325363969e-06, + "loss": 0.239, + "step": 28924 + }, + { + "epoch": 0.5370131774857698, + "grad_norm": 0.6083295941352844, + "learning_rate": 8.839628706597543e-06, + "loss": 0.1767, + "step": 28926 + }, + { + "epoch": 0.5370503076231885, + "grad_norm": 0.47903871536254883, + "learning_rate": 8.838470103620296e-06, + "loss": 0.2226, + "step": 28928 + }, + { + "epoch": 0.5370874377606072, + "grad_norm": 0.39700552821159363, + "learning_rate": 8.837311516447993e-06, + "loss": 0.2916, + "step": 28930 + }, + { + "epoch": 0.5371245678980258, + "grad_norm": 0.2975066304206848, + "learning_rate": 8.836152945096398e-06, + "loss": 0.3395, + "step": 28932 + }, + { + "epoch": 0.5371616980354444, + "grad_norm": 0.4705333411693573, + "learning_rate": 8.834994389581275e-06, + "loss": 0.3068, + "step": 28934 + }, + { + "epoch": 0.537198828172863, + "grad_norm": 0.5135672688484192, + "learning_rate": 8.833835849918388e-06, + "loss": 0.4573, + "step": 28936 + }, + { + "epoch": 0.5372359583102817, + "grad_norm": 0.2641277611255646, + "learning_rate": 8.8326773261235e-06, + "loss": 0.2299, + "step": 28938 + }, + { + "epoch": 0.5372730884477004, + "grad_norm": 0.4112910330295563, + "learning_rate": 8.831518818212378e-06, + "loss": 0.2287, + "step": 28940 + }, + { + "epoch": 0.537310218585119, + "grad_norm": 0.38870710134506226, + "learning_rate": 8.830360326200787e-06, + "loss": 0.5895, + "step": 28942 + }, + { + "epoch": 0.5373473487225376, + "grad_norm": 0.22866074740886688, + "learning_rate": 8.829201850104492e-06, + "loss": 0.3707, + "step": 28944 + }, + { + "epoch": 0.5373844788599562, + "grad_norm": 0.3247593343257904, + "learning_rate": 8.828043389939246e-06, + "loss": 0.4327, + "step": 28946 + }, + { + "epoch": 0.5374216089973749, + "grad_norm": 0.4620649516582489, + "learning_rate": 8.82688494572082e-06, + "loss": 0.1971, + "step": 28948 + }, + { + "epoch": 0.5374587391347936, + "grad_norm": 0.36376142501831055, + "learning_rate": 8.825726517464976e-06, + "loss": 0.4478, + "step": 28950 + }, + { + "epoch": 0.5374958692722122, + "grad_norm": 0.5271807312965393, + "learning_rate": 8.824568105187478e-06, + "loss": 0.3627, + "step": 28952 + }, + { + "epoch": 0.5375329994096308, + "grad_norm": 0.6430363655090332, + "learning_rate": 8.823409708904087e-06, + "loss": 0.2088, + "step": 28954 + }, + { + "epoch": 0.5375701295470494, + "grad_norm": 0.34231269359588623, + "learning_rate": 8.82225132863057e-06, + "loss": 0.1291, + "step": 28956 + }, + { + "epoch": 0.5376072596844681, + "grad_norm": 0.37590786814689636, + "learning_rate": 8.82109296438268e-06, + "loss": 0.3799, + "step": 28958 + }, + { + "epoch": 0.5376443898218868, + "grad_norm": 0.36601924896240234, + "learning_rate": 8.819934616176182e-06, + "loss": 0.3098, + "step": 28960 + }, + { + "epoch": 0.5376815199593054, + "grad_norm": 0.2647835910320282, + "learning_rate": 8.81877628402684e-06, + "loss": 0.3885, + "step": 28962 + }, + { + "epoch": 0.537718650096724, + "grad_norm": 0.4027288258075714, + "learning_rate": 8.817617967950417e-06, + "loss": 0.0979, + "step": 28964 + }, + { + "epoch": 0.5377557802341426, + "grad_norm": 0.3420749306678772, + "learning_rate": 8.81645966796267e-06, + "loss": 0.27, + "step": 28966 + }, + { + "epoch": 0.5377929103715613, + "grad_norm": 0.48124876618385315, + "learning_rate": 8.81530138407936e-06, + "loss": 0.3065, + "step": 28968 + }, + { + "epoch": 0.53783004050898, + "grad_norm": 0.42390236258506775, + "learning_rate": 8.814143116316257e-06, + "loss": 0.3103, + "step": 28970 + }, + { + "epoch": 0.5378671706463986, + "grad_norm": 0.2607991695404053, + "learning_rate": 8.812984864689107e-06, + "loss": 0.2639, + "step": 28972 + }, + { + "epoch": 0.5379043007838172, + "grad_norm": 0.3881067633628845, + "learning_rate": 8.81182662921368e-06, + "loss": 0.3044, + "step": 28974 + }, + { + "epoch": 0.5379414309212358, + "grad_norm": 0.36070653796195984, + "learning_rate": 8.810668409905733e-06, + "loss": 0.2133, + "step": 28976 + }, + { + "epoch": 0.5379785610586545, + "grad_norm": 0.3744252026081085, + "learning_rate": 8.809510206781025e-06, + "loss": 0.1141, + "step": 28978 + }, + { + "epoch": 0.5380156911960731, + "grad_norm": 0.42860186100006104, + "learning_rate": 8.808352019855317e-06, + "loss": 0.31, + "step": 28980 + }, + { + "epoch": 0.5380528213334917, + "grad_norm": 0.41899728775024414, + "learning_rate": 8.807193849144374e-06, + "loss": 0.1678, + "step": 28982 + }, + { + "epoch": 0.5380899514709104, + "grad_norm": 0.24706198275089264, + "learning_rate": 8.806035694663945e-06, + "loss": 0.3168, + "step": 28984 + }, + { + "epoch": 0.538127081608329, + "grad_norm": 0.34947726130485535, + "learning_rate": 8.804877556429797e-06, + "loss": 0.2965, + "step": 28986 + }, + { + "epoch": 0.5381642117457477, + "grad_norm": 0.3506985008716583, + "learning_rate": 8.803719434457683e-06, + "loss": 0.3101, + "step": 28988 + }, + { + "epoch": 0.5382013418831663, + "grad_norm": 0.38096871972084045, + "learning_rate": 8.802561328763364e-06, + "loss": 0.4228, + "step": 28990 + }, + { + "epoch": 0.538238472020585, + "grad_norm": 0.3246283531188965, + "learning_rate": 8.801403239362598e-06, + "loss": 0.2242, + "step": 28992 + }, + { + "epoch": 0.5382756021580036, + "grad_norm": 0.3986336588859558, + "learning_rate": 8.800245166271149e-06, + "loss": 0.2624, + "step": 28994 + }, + { + "epoch": 0.5383127322954222, + "grad_norm": 0.35336074233055115, + "learning_rate": 8.799087109504766e-06, + "loss": 0.1392, + "step": 28996 + }, + { + "epoch": 0.5383498624328409, + "grad_norm": 0.44116055965423584, + "learning_rate": 8.79792906907921e-06, + "loss": 0.147, + "step": 28998 + }, + { + "epoch": 0.5383869925702595, + "grad_norm": 0.33933666348457336, + "learning_rate": 8.796771045010237e-06, + "loss": 0.2637, + "step": 29000 + }, + { + "epoch": 0.5384241227076781, + "grad_norm": 0.3280552327632904, + "learning_rate": 8.795613037313607e-06, + "loss": 0.4042, + "step": 29002 + }, + { + "epoch": 0.5384612528450968, + "grad_norm": 0.4678749442100525, + "learning_rate": 8.794455046005079e-06, + "loss": 0.3733, + "step": 29004 + }, + { + "epoch": 0.5384983829825154, + "grad_norm": 0.23080037534236908, + "learning_rate": 8.793297071100402e-06, + "loss": 0.3163, + "step": 29006 + }, + { + "epoch": 0.5385355131199341, + "grad_norm": 0.41035544872283936, + "learning_rate": 8.792139112615345e-06, + "loss": 0.4399, + "step": 29008 + }, + { + "epoch": 0.5385726432573527, + "grad_norm": 0.38496923446655273, + "learning_rate": 8.79098117056565e-06, + "loss": 0.3785, + "step": 29010 + }, + { + "epoch": 0.5386097733947713, + "grad_norm": 0.4847329258918762, + "learning_rate": 8.789823244967081e-06, + "loss": 0.3321, + "step": 29012 + }, + { + "epoch": 0.53864690353219, + "grad_norm": 0.42741239070892334, + "learning_rate": 8.788665335835391e-06, + "loss": 0.2095, + "step": 29014 + }, + { + "epoch": 0.5386840336696086, + "grad_norm": 0.2723098397254944, + "learning_rate": 8.787507443186341e-06, + "loss": 0.1508, + "step": 29016 + }, + { + "epoch": 0.5387211638070273, + "grad_norm": 0.42734912037849426, + "learning_rate": 8.786349567035681e-06, + "loss": 0.3142, + "step": 29018 + }, + { + "epoch": 0.5387582939444459, + "grad_norm": 0.3509759306907654, + "learning_rate": 8.785191707399172e-06, + "loss": 0.2006, + "step": 29020 + }, + { + "epoch": 0.5387954240818645, + "grad_norm": 0.5491913557052612, + "learning_rate": 8.784033864292561e-06, + "loss": 0.3318, + "step": 29022 + }, + { + "epoch": 0.5388325542192831, + "grad_norm": 0.4884089231491089, + "learning_rate": 8.782876037731607e-06, + "loss": 0.1724, + "step": 29024 + }, + { + "epoch": 0.5388696843567018, + "grad_norm": 0.381264328956604, + "learning_rate": 8.781718227732065e-06, + "loss": 0.3064, + "step": 29026 + }, + { + "epoch": 0.5389068144941205, + "grad_norm": 0.515300452709198, + "learning_rate": 8.780560434309688e-06, + "loss": 0.2855, + "step": 29028 + }, + { + "epoch": 0.538943944631539, + "grad_norm": 0.19382821023464203, + "learning_rate": 8.779402657480231e-06, + "loss": 0.2689, + "step": 29030 + }, + { + "epoch": 0.5389810747689577, + "grad_norm": 0.45632404088974, + "learning_rate": 8.778244897259452e-06, + "loss": 0.271, + "step": 29032 + }, + { + "epoch": 0.5390182049063763, + "grad_norm": 0.8036473989486694, + "learning_rate": 8.777087153663095e-06, + "loss": 0.2716, + "step": 29034 + }, + { + "epoch": 0.539055335043795, + "grad_norm": 0.49166810512542725, + "learning_rate": 8.77592942670692e-06, + "loss": 0.3119, + "step": 29036 + }, + { + "epoch": 0.5390924651812137, + "grad_norm": 0.3464691936969757, + "learning_rate": 8.77477171640668e-06, + "loss": 0.2479, + "step": 29038 + }, + { + "epoch": 0.5391295953186322, + "grad_norm": 0.39825841784477234, + "learning_rate": 8.773614022778126e-06, + "loss": 0.1964, + "step": 29040 + }, + { + "epoch": 0.5391667254560509, + "grad_norm": 0.35477596521377563, + "learning_rate": 8.77245634583701e-06, + "loss": 0.5109, + "step": 29042 + }, + { + "epoch": 0.5392038555934695, + "grad_norm": 0.3489423096179962, + "learning_rate": 8.771298685599092e-06, + "loss": 0.3363, + "step": 29044 + }, + { + "epoch": 0.5392409857308882, + "grad_norm": 0.32201874256134033, + "learning_rate": 8.770141042080115e-06, + "loss": 0.3162, + "step": 29046 + }, + { + "epoch": 0.5392781158683069, + "grad_norm": 0.3752102553844452, + "learning_rate": 8.768983415295833e-06, + "loss": 0.2798, + "step": 29048 + }, + { + "epoch": 0.5393152460057254, + "grad_norm": 0.3636285066604614, + "learning_rate": 8.767825805262001e-06, + "loss": 0.2117, + "step": 29050 + }, + { + "epoch": 0.5393523761431441, + "grad_norm": 0.3099227249622345, + "learning_rate": 8.766668211994369e-06, + "loss": 0.1141, + "step": 29052 + }, + { + "epoch": 0.5393895062805627, + "grad_norm": 0.36934393644332886, + "learning_rate": 8.765510635508686e-06, + "loss": 0.3204, + "step": 29054 + }, + { + "epoch": 0.5394266364179814, + "grad_norm": 0.33397069573402405, + "learning_rate": 8.76435307582071e-06, + "loss": 0.3264, + "step": 29056 + }, + { + "epoch": 0.5394637665554001, + "grad_norm": 0.5294509530067444, + "learning_rate": 8.763195532946185e-06, + "loss": 0.2112, + "step": 29058 + }, + { + "epoch": 0.5395008966928186, + "grad_norm": 0.43548229336738586, + "learning_rate": 8.762038006900866e-06, + "loss": 0.1291, + "step": 29060 + }, + { + "epoch": 0.5395380268302373, + "grad_norm": 0.5218189358711243, + "learning_rate": 8.7608804977005e-06, + "loss": 0.3903, + "step": 29062 + }, + { + "epoch": 0.5395751569676559, + "grad_norm": 0.39514532685279846, + "learning_rate": 8.759723005360837e-06, + "loss": 0.1726, + "step": 29064 + }, + { + "epoch": 0.5396122871050746, + "grad_norm": 0.4090101718902588, + "learning_rate": 8.758565529897629e-06, + "loss": 0.3336, + "step": 29066 + }, + { + "epoch": 0.5396494172424933, + "grad_norm": 0.41136986017227173, + "learning_rate": 8.757408071326629e-06, + "loss": 0.2609, + "step": 29068 + }, + { + "epoch": 0.5396865473799118, + "grad_norm": 0.39628228545188904, + "learning_rate": 8.756250629663582e-06, + "loss": 0.1928, + "step": 29070 + }, + { + "epoch": 0.5397236775173305, + "grad_norm": 0.2596226930618286, + "learning_rate": 8.755093204924239e-06, + "loss": 0.186, + "step": 29072 + }, + { + "epoch": 0.5397608076547491, + "grad_norm": 0.412746787071228, + "learning_rate": 8.753935797124346e-06, + "loss": 0.2539, + "step": 29074 + }, + { + "epoch": 0.5397979377921678, + "grad_norm": 0.26234158873558044, + "learning_rate": 8.752778406279655e-06, + "loss": 0.3241, + "step": 29076 + }, + { + "epoch": 0.5398350679295864, + "grad_norm": 0.43193626403808594, + "learning_rate": 8.751621032405915e-06, + "loss": 0.3383, + "step": 29078 + }, + { + "epoch": 0.539872198067005, + "grad_norm": 0.23748880624771118, + "learning_rate": 8.750463675518873e-06, + "loss": 0.2931, + "step": 29080 + }, + { + "epoch": 0.5399093282044237, + "grad_norm": 0.274153470993042, + "learning_rate": 8.749306335634282e-06, + "loss": 0.3192, + "step": 29082 + }, + { + "epoch": 0.5399464583418423, + "grad_norm": 0.2875758111476898, + "learning_rate": 8.74814901276788e-06, + "loss": 0.4605, + "step": 29084 + }, + { + "epoch": 0.539983588479261, + "grad_norm": 0.5302455425262451, + "learning_rate": 8.74699170693542e-06, + "loss": 0.1804, + "step": 29086 + }, + { + "epoch": 0.5400207186166796, + "grad_norm": 0.4128270745277405, + "learning_rate": 8.745834418152651e-06, + "loss": 0.2892, + "step": 29088 + }, + { + "epoch": 0.5400578487540982, + "grad_norm": 0.44809195399284363, + "learning_rate": 8.74467714643532e-06, + "loss": 0.4741, + "step": 29090 + }, + { + "epoch": 0.5400949788915169, + "grad_norm": 0.44445866346359253, + "learning_rate": 8.743519891799171e-06, + "loss": 0.2123, + "step": 29092 + }, + { + "epoch": 0.5401321090289355, + "grad_norm": 0.48729515075683594, + "learning_rate": 8.742362654259953e-06, + "loss": 0.175, + "step": 29094 + }, + { + "epoch": 0.5401692391663542, + "grad_norm": 0.34059056639671326, + "learning_rate": 8.741205433833417e-06, + "loss": 0.3175, + "step": 29096 + }, + { + "epoch": 0.5402063693037727, + "grad_norm": 0.34828782081604004, + "learning_rate": 8.740048230535298e-06, + "loss": 0.2684, + "step": 29098 + }, + { + "epoch": 0.5402434994411914, + "grad_norm": 0.2956511080265045, + "learning_rate": 8.73889104438135e-06, + "loss": 0.2526, + "step": 29100 + }, + { + "epoch": 0.5402806295786101, + "grad_norm": 0.3331354856491089, + "learning_rate": 8.73773387538732e-06, + "loss": 0.1748, + "step": 29102 + }, + { + "epoch": 0.5403177597160287, + "grad_norm": 0.33080562949180603, + "learning_rate": 8.736576723568949e-06, + "loss": 0.2284, + "step": 29104 + }, + { + "epoch": 0.5403548898534474, + "grad_norm": 0.5027536153793335, + "learning_rate": 8.735419588941984e-06, + "loss": 0.4092, + "step": 29106 + }, + { + "epoch": 0.540392019990866, + "grad_norm": 0.458329975605011, + "learning_rate": 8.734262471522174e-06, + "loss": 0.4297, + "step": 29108 + }, + { + "epoch": 0.5404291501282846, + "grad_norm": 0.3305177092552185, + "learning_rate": 8.733105371325256e-06, + "loss": 0.2675, + "step": 29110 + }, + { + "epoch": 0.5404662802657033, + "grad_norm": 0.4668492376804352, + "learning_rate": 8.731948288366983e-06, + "loss": 0.3893, + "step": 29112 + }, + { + "epoch": 0.5405034104031219, + "grad_norm": 0.32844310998916626, + "learning_rate": 8.730791222663093e-06, + "loss": 0.1768, + "step": 29114 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.3423122763633728, + "learning_rate": 8.72963417422933e-06, + "loss": 0.2147, + "step": 29116 + }, + { + "epoch": 0.5405776706779591, + "grad_norm": 0.26020705699920654, + "learning_rate": 8.728477143081444e-06, + "loss": 0.2379, + "step": 29118 + }, + { + "epoch": 0.5406148008153778, + "grad_norm": 0.4827598035335541, + "learning_rate": 8.72732012923518e-06, + "loss": 0.2814, + "step": 29120 + }, + { + "epoch": 0.5406519309527965, + "grad_norm": 0.2928997576236725, + "learning_rate": 8.726163132706271e-06, + "loss": 0.3541, + "step": 29122 + }, + { + "epoch": 0.5406890610902151, + "grad_norm": 0.5309603214263916, + "learning_rate": 8.725006153510468e-06, + "loss": 0.489, + "step": 29124 + }, + { + "epoch": 0.5407261912276338, + "grad_norm": 0.4305154085159302, + "learning_rate": 8.723849191663512e-06, + "loss": 0.2963, + "step": 29126 + }, + { + "epoch": 0.5407633213650523, + "grad_norm": 0.2677434980869293, + "learning_rate": 8.722692247181143e-06, + "loss": 0.2165, + "step": 29128 + }, + { + "epoch": 0.540800451502471, + "grad_norm": 0.30531907081604004, + "learning_rate": 8.721535320079109e-06, + "loss": 0.2333, + "step": 29130 + }, + { + "epoch": 0.5408375816398896, + "grad_norm": 0.40117356181144714, + "learning_rate": 8.720378410373152e-06, + "loss": 0.2355, + "step": 29132 + }, + { + "epoch": 0.5408747117773083, + "grad_norm": 1.154480218887329, + "learning_rate": 8.719221518079012e-06, + "loss": 0.4209, + "step": 29134 + }, + { + "epoch": 0.540911841914727, + "grad_norm": 0.36175766587257385, + "learning_rate": 8.718064643212428e-06, + "loss": 0.1844, + "step": 29136 + }, + { + "epoch": 0.5409489720521455, + "grad_norm": 0.521953821182251, + "learning_rate": 8.716907785789143e-06, + "loss": 0.2785, + "step": 29138 + }, + { + "epoch": 0.5409861021895642, + "grad_norm": 0.7608530521392822, + "learning_rate": 8.7157509458249e-06, + "loss": 0.3054, + "step": 29140 + }, + { + "epoch": 0.5410232323269828, + "grad_norm": 0.32299312949180603, + "learning_rate": 8.714594123335444e-06, + "loss": 0.3696, + "step": 29142 + }, + { + "epoch": 0.5410603624644015, + "grad_norm": 0.4030701518058777, + "learning_rate": 8.71343731833651e-06, + "loss": 0.3391, + "step": 29144 + }, + { + "epoch": 0.5410974926018202, + "grad_norm": 0.4206581711769104, + "learning_rate": 8.712280530843842e-06, + "loss": 0.3013, + "step": 29146 + }, + { + "epoch": 0.5411346227392387, + "grad_norm": 0.27141445875167847, + "learning_rate": 8.711123760873176e-06, + "loss": 0.3519, + "step": 29148 + }, + { + "epoch": 0.5411717528766574, + "grad_norm": 0.33926326036453247, + "learning_rate": 8.709967008440254e-06, + "loss": 0.3592, + "step": 29150 + }, + { + "epoch": 0.541208883014076, + "grad_norm": 0.32513123750686646, + "learning_rate": 8.708810273560818e-06, + "loss": 0.2045, + "step": 29152 + }, + { + "epoch": 0.5412460131514947, + "grad_norm": 0.34170204401016235, + "learning_rate": 8.707653556250608e-06, + "loss": 0.3308, + "step": 29154 + }, + { + "epoch": 0.5412831432889134, + "grad_norm": 0.2505717873573303, + "learning_rate": 8.70649685652536e-06, + "loss": 0.2412, + "step": 29156 + }, + { + "epoch": 0.5413202734263319, + "grad_norm": 0.37244531512260437, + "learning_rate": 8.70534017440082e-06, + "loss": 0.2982, + "step": 29158 + }, + { + "epoch": 0.5413574035637506, + "grad_norm": 0.36878788471221924, + "learning_rate": 8.704183509892718e-06, + "loss": 0.4244, + "step": 29160 + }, + { + "epoch": 0.5413945337011692, + "grad_norm": 0.28823912143707275, + "learning_rate": 8.703026863016797e-06, + "loss": 0.2015, + "step": 29162 + }, + { + "epoch": 0.5414316638385879, + "grad_norm": 0.39584553241729736, + "learning_rate": 8.701870233788796e-06, + "loss": 0.267, + "step": 29164 + }, + { + "epoch": 0.5414687939760066, + "grad_norm": 0.31679457426071167, + "learning_rate": 8.700713622224454e-06, + "loss": 0.2615, + "step": 29166 + }, + { + "epoch": 0.5415059241134251, + "grad_norm": 0.2750388979911804, + "learning_rate": 8.699557028339504e-06, + "loss": 0.3156, + "step": 29168 + }, + { + "epoch": 0.5415430542508438, + "grad_norm": 0.358825147151947, + "learning_rate": 8.698400452149692e-06, + "loss": 0.5301, + "step": 29170 + }, + { + "epoch": 0.5415801843882624, + "grad_norm": 0.38050150871276855, + "learning_rate": 8.697243893670752e-06, + "loss": 0.2253, + "step": 29172 + }, + { + "epoch": 0.5416173145256811, + "grad_norm": 0.5608035922050476, + "learning_rate": 8.696087352918416e-06, + "loss": 0.2824, + "step": 29174 + }, + { + "epoch": 0.5416544446630996, + "grad_norm": 0.2506846785545349, + "learning_rate": 8.694930829908429e-06, + "loss": 0.3267, + "step": 29176 + }, + { + "epoch": 0.5416915748005183, + "grad_norm": 0.46021321415901184, + "learning_rate": 8.693774324656523e-06, + "loss": 0.2858, + "step": 29178 + }, + { + "epoch": 0.541728704937937, + "grad_norm": 0.2649284601211548, + "learning_rate": 8.692617837178432e-06, + "loss": 0.2535, + "step": 29180 + }, + { + "epoch": 0.5417658350753556, + "grad_norm": 0.4189710021018982, + "learning_rate": 8.691461367489899e-06, + "loss": 0.2883, + "step": 29182 + }, + { + "epoch": 0.5418029652127743, + "grad_norm": 0.3972805142402649, + "learning_rate": 8.690304915606663e-06, + "loss": 0.206, + "step": 29184 + }, + { + "epoch": 0.5418400953501928, + "grad_norm": 0.3949586749076843, + "learning_rate": 8.689148481544447e-06, + "loss": 0.2376, + "step": 29186 + }, + { + "epoch": 0.5418772254876115, + "grad_norm": 0.30991628766059875, + "learning_rate": 8.687992065318997e-06, + "loss": 0.1523, + "step": 29188 + }, + { + "epoch": 0.5419143556250302, + "grad_norm": 0.5625579357147217, + "learning_rate": 8.686835666946043e-06, + "loss": 0.4294, + "step": 29190 + }, + { + "epoch": 0.5419514857624488, + "grad_norm": 0.4907143712043762, + "learning_rate": 8.685679286441321e-06, + "loss": 0.363, + "step": 29192 + }, + { + "epoch": 0.5419886158998675, + "grad_norm": 0.3919946849346161, + "learning_rate": 8.684522923820571e-06, + "loss": 0.1879, + "step": 29194 + }, + { + "epoch": 0.542025746037286, + "grad_norm": 0.41752931475639343, + "learning_rate": 8.683366579099524e-06, + "loss": 0.2567, + "step": 29196 + }, + { + "epoch": 0.5420628761747047, + "grad_norm": 0.35558754205703735, + "learning_rate": 8.682210252293912e-06, + "loss": 0.1417, + "step": 29198 + }, + { + "epoch": 0.5421000063121234, + "grad_norm": 0.4281615912914276, + "learning_rate": 8.681053943419471e-06, + "loss": 0.2674, + "step": 29200 + }, + { + "epoch": 0.542137136449542, + "grad_norm": 0.29554855823516846, + "learning_rate": 8.679897652491934e-06, + "loss": 0.3941, + "step": 29202 + }, + { + "epoch": 0.5421742665869607, + "grad_norm": 0.3773784637451172, + "learning_rate": 8.678741379527036e-06, + "loss": 0.4441, + "step": 29204 + }, + { + "epoch": 0.5422113967243792, + "grad_norm": 0.2607143223285675, + "learning_rate": 8.677585124540512e-06, + "loss": 0.3643, + "step": 29206 + }, + { + "epoch": 0.5422485268617979, + "grad_norm": 0.5972867608070374, + "learning_rate": 8.676428887548094e-06, + "loss": 0.2166, + "step": 29208 + }, + { + "epoch": 0.5422856569992166, + "grad_norm": 0.37390974164009094, + "learning_rate": 8.675272668565514e-06, + "loss": 0.3442, + "step": 29210 + }, + { + "epoch": 0.5423227871366352, + "grad_norm": 0.4715038537979126, + "learning_rate": 8.674116467608502e-06, + "loss": 0.2342, + "step": 29212 + }, + { + "epoch": 0.5423599172740539, + "grad_norm": 0.3736487030982971, + "learning_rate": 8.672960284692795e-06, + "loss": 0.432, + "step": 29214 + }, + { + "epoch": 0.5423970474114724, + "grad_norm": 0.2512434422969818, + "learning_rate": 8.671804119834123e-06, + "loss": 0.2563, + "step": 29216 + }, + { + "epoch": 0.5424341775488911, + "grad_norm": 0.3261411190032959, + "learning_rate": 8.67064797304822e-06, + "loss": 0.4319, + "step": 29218 + }, + { + "epoch": 0.5424713076863098, + "grad_norm": 0.4159064292907715, + "learning_rate": 8.669491844350815e-06, + "loss": 0.3327, + "step": 29220 + }, + { + "epoch": 0.5425084378237284, + "grad_norm": 0.2847120463848114, + "learning_rate": 8.668335733757644e-06, + "loss": 0.2616, + "step": 29222 + }, + { + "epoch": 0.5425455679611471, + "grad_norm": 0.3157494068145752, + "learning_rate": 8.667179641284431e-06, + "loss": 0.184, + "step": 29224 + }, + { + "epoch": 0.5425826980985656, + "grad_norm": 0.30882105231285095, + "learning_rate": 8.66602356694691e-06, + "loss": 0.4316, + "step": 29226 + }, + { + "epoch": 0.5426198282359843, + "grad_norm": 0.5159033536911011, + "learning_rate": 8.664867510760817e-06, + "loss": 0.431, + "step": 29228 + }, + { + "epoch": 0.5426569583734029, + "grad_norm": 0.4435185194015503, + "learning_rate": 8.663711472741873e-06, + "loss": 0.2369, + "step": 29230 + }, + { + "epoch": 0.5426940885108216, + "grad_norm": 0.2777823805809021, + "learning_rate": 8.662555452905816e-06, + "loss": 0.2603, + "step": 29232 + }, + { + "epoch": 0.5427312186482403, + "grad_norm": 0.44630444049835205, + "learning_rate": 8.661399451268377e-06, + "loss": 0.4203, + "step": 29234 + }, + { + "epoch": 0.5427683487856588, + "grad_norm": 0.3093295991420746, + "learning_rate": 8.660243467845275e-06, + "loss": 0.1436, + "step": 29236 + }, + { + "epoch": 0.5428054789230775, + "grad_norm": 0.2716151475906372, + "learning_rate": 8.659087502652249e-06, + "loss": 0.1959, + "step": 29238 + }, + { + "epoch": 0.5428426090604961, + "grad_norm": 0.28655266761779785, + "learning_rate": 8.657931555705025e-06, + "loss": 0.2633, + "step": 29240 + }, + { + "epoch": 0.5428797391979148, + "grad_norm": 0.4227493107318878, + "learning_rate": 8.656775627019334e-06, + "loss": 0.2132, + "step": 29242 + }, + { + "epoch": 0.5429168693353335, + "grad_norm": 0.3981489837169647, + "learning_rate": 8.655619716610901e-06, + "loss": 0.4335, + "step": 29244 + }, + { + "epoch": 0.542953999472752, + "grad_norm": 0.2782026529312134, + "learning_rate": 8.654463824495461e-06, + "loss": 0.1388, + "step": 29246 + }, + { + "epoch": 0.5429911296101707, + "grad_norm": 0.553490400314331, + "learning_rate": 8.653307950688734e-06, + "loss": 0.4726, + "step": 29248 + }, + { + "epoch": 0.5430282597475893, + "grad_norm": 0.2811882793903351, + "learning_rate": 8.652152095206454e-06, + "loss": 0.1968, + "step": 29250 + }, + { + "epoch": 0.543065389885008, + "grad_norm": 0.39556363224983215, + "learning_rate": 8.650996258064345e-06, + "loss": 0.3158, + "step": 29252 + }, + { + "epoch": 0.5431025200224266, + "grad_norm": 0.3316425383090973, + "learning_rate": 8.649840439278138e-06, + "loss": 0.233, + "step": 29254 + }, + { + "epoch": 0.5431396501598452, + "grad_norm": 0.36390629410743713, + "learning_rate": 8.648684638863555e-06, + "loss": 0.2459, + "step": 29256 + }, + { + "epoch": 0.5431767802972639, + "grad_norm": 0.4033108055591583, + "learning_rate": 8.647528856836332e-06, + "loss": 0.3237, + "step": 29258 + }, + { + "epoch": 0.5432139104346825, + "grad_norm": 0.5155245661735535, + "learning_rate": 8.646373093212188e-06, + "loss": 0.3359, + "step": 29260 + }, + { + "epoch": 0.5432510405721012, + "grad_norm": 0.33235734701156616, + "learning_rate": 8.645217348006852e-06, + "loss": 0.3862, + "step": 29262 + }, + { + "epoch": 0.5432881707095198, + "grad_norm": 0.4683685004711151, + "learning_rate": 8.644061621236048e-06, + "loss": 0.27, + "step": 29264 + }, + { + "epoch": 0.5433253008469384, + "grad_norm": 0.3547971546649933, + "learning_rate": 8.642905912915502e-06, + "loss": 0.2596, + "step": 29266 + }, + { + "epoch": 0.5433624309843571, + "grad_norm": 0.47313353419303894, + "learning_rate": 8.641750223060944e-06, + "loss": 0.3379, + "step": 29268 + }, + { + "epoch": 0.5433995611217757, + "grad_norm": 0.20375724136829376, + "learning_rate": 8.640594551688097e-06, + "loss": 0.1693, + "step": 29270 + }, + { + "epoch": 0.5434366912591944, + "grad_norm": 0.3484920561313629, + "learning_rate": 8.639438898812691e-06, + "loss": 0.3134, + "step": 29272 + }, + { + "epoch": 0.543473821396613, + "grad_norm": 0.2577415108680725, + "learning_rate": 8.63828326445044e-06, + "loss": 0.2449, + "step": 29274 + }, + { + "epoch": 0.5435109515340316, + "grad_norm": 0.30464401841163635, + "learning_rate": 8.637127648617076e-06, + "loss": 0.2087, + "step": 29276 + }, + { + "epoch": 0.5435480816714503, + "grad_norm": 0.4083777368068695, + "learning_rate": 8.635972051328321e-06, + "loss": 0.3699, + "step": 29278 + }, + { + "epoch": 0.5435852118088689, + "grad_norm": 0.32922303676605225, + "learning_rate": 8.634816472599905e-06, + "loss": 0.2812, + "step": 29280 + }, + { + "epoch": 0.5436223419462876, + "grad_norm": 0.4418775141239166, + "learning_rate": 8.633660912447544e-06, + "loss": 0.2437, + "step": 29282 + }, + { + "epoch": 0.5436594720837061, + "grad_norm": 0.4124562442302704, + "learning_rate": 8.63250537088697e-06, + "loss": 0.3922, + "step": 29284 + }, + { + "epoch": 0.5436966022211248, + "grad_norm": 0.5106393098831177, + "learning_rate": 8.631349847933896e-06, + "loss": 0.3073, + "step": 29286 + }, + { + "epoch": 0.5437337323585435, + "grad_norm": 0.3068910539150238, + "learning_rate": 8.630194343604053e-06, + "loss": 0.3269, + "step": 29288 + }, + { + "epoch": 0.5437708624959621, + "grad_norm": 0.5096837282180786, + "learning_rate": 8.62903885791316e-06, + "loss": 0.3069, + "step": 29290 + }, + { + "epoch": 0.5438079926333808, + "grad_norm": 0.34762489795684814, + "learning_rate": 8.627883390876946e-06, + "loss": 0.2773, + "step": 29292 + }, + { + "epoch": 0.5438451227707993, + "grad_norm": 1.0538008213043213, + "learning_rate": 8.626727942511127e-06, + "loss": 0.3702, + "step": 29294 + }, + { + "epoch": 0.543882252908218, + "grad_norm": 0.5484046339988708, + "learning_rate": 8.625572512831425e-06, + "loss": 0.1806, + "step": 29296 + }, + { + "epoch": 0.5439193830456367, + "grad_norm": 0.30598756670951843, + "learning_rate": 8.62441710185357e-06, + "loss": 0.3188, + "step": 29298 + }, + { + "epoch": 0.5439565131830553, + "grad_norm": 0.4106521010398865, + "learning_rate": 8.623261709593274e-06, + "loss": 0.266, + "step": 29300 + }, + { + "epoch": 0.543993643320474, + "grad_norm": 0.5405019521713257, + "learning_rate": 8.622106336066263e-06, + "loss": 0.1675, + "step": 29302 + }, + { + "epoch": 0.5440307734578925, + "grad_norm": 0.571747899055481, + "learning_rate": 8.620950981288256e-06, + "loss": 0.3958, + "step": 29304 + }, + { + "epoch": 0.5440679035953112, + "grad_norm": 0.331687331199646, + "learning_rate": 8.619795645274976e-06, + "loss": 0.3091, + "step": 29306 + }, + { + "epoch": 0.5441050337327299, + "grad_norm": 0.43747061491012573, + "learning_rate": 8.618640328042142e-06, + "loss": 0.3649, + "step": 29308 + }, + { + "epoch": 0.5441421638701485, + "grad_norm": 0.4858039617538452, + "learning_rate": 8.617485029605481e-06, + "loss": 0.4178, + "step": 29310 + }, + { + "epoch": 0.5441792940075671, + "grad_norm": 0.7736327052116394, + "learning_rate": 8.616329749980703e-06, + "loss": 0.3369, + "step": 29312 + }, + { + "epoch": 0.5442164241449857, + "grad_norm": 0.49970731139183044, + "learning_rate": 8.615174489183534e-06, + "loss": 0.2086, + "step": 29314 + }, + { + "epoch": 0.5442535542824044, + "grad_norm": 0.34400734305381775, + "learning_rate": 8.61401924722969e-06, + "loss": 0.5262, + "step": 29316 + }, + { + "epoch": 0.5442906844198231, + "grad_norm": 0.36918163299560547, + "learning_rate": 8.612864024134893e-06, + "loss": 0.2009, + "step": 29318 + }, + { + "epoch": 0.5443278145572417, + "grad_norm": 0.34294623136520386, + "learning_rate": 8.611708819914862e-06, + "loss": 0.4299, + "step": 29320 + }, + { + "epoch": 0.5443649446946603, + "grad_norm": 0.6117779016494751, + "learning_rate": 8.610553634585319e-06, + "loss": 0.1814, + "step": 29322 + }, + { + "epoch": 0.5444020748320789, + "grad_norm": 0.26201289892196655, + "learning_rate": 8.609398468161976e-06, + "loss": 0.3145, + "step": 29324 + }, + { + "epoch": 0.5444392049694976, + "grad_norm": 0.37982288002967834, + "learning_rate": 8.608243320660556e-06, + "loss": 0.2892, + "step": 29326 + }, + { + "epoch": 0.5444763351069162, + "grad_norm": 0.28766703605651855, + "learning_rate": 8.607088192096772e-06, + "loss": 0.2514, + "step": 29328 + }, + { + "epoch": 0.5445134652443349, + "grad_norm": 0.42481979727745056, + "learning_rate": 8.605933082486349e-06, + "loss": 0.3786, + "step": 29330 + }, + { + "epoch": 0.5445505953817535, + "grad_norm": 0.2438274323940277, + "learning_rate": 8.604777991844998e-06, + "loss": 0.0748, + "step": 29332 + }, + { + "epoch": 0.5445877255191721, + "grad_norm": 0.43185245990753174, + "learning_rate": 8.603622920188446e-06, + "loss": 0.2974, + "step": 29334 + }, + { + "epoch": 0.5446248556565908, + "grad_norm": 0.4894871711730957, + "learning_rate": 8.602467867532399e-06, + "loss": 0.3289, + "step": 29336 + }, + { + "epoch": 0.5446619857940094, + "grad_norm": 0.3500831127166748, + "learning_rate": 8.601312833892577e-06, + "loss": 0.1911, + "step": 29338 + }, + { + "epoch": 0.5446991159314281, + "grad_norm": 0.3648889660835266, + "learning_rate": 8.600157819284699e-06, + "loss": 0.3523, + "step": 29340 + }, + { + "epoch": 0.5447362460688467, + "grad_norm": 0.5154054164886475, + "learning_rate": 8.599002823724478e-06, + "loss": 0.2669, + "step": 29342 + }, + { + "epoch": 0.5447733762062653, + "grad_norm": 0.3284898102283478, + "learning_rate": 8.597847847227636e-06, + "loss": 0.2639, + "step": 29344 + }, + { + "epoch": 0.544810506343684, + "grad_norm": 0.24722422659397125, + "learning_rate": 8.596692889809881e-06, + "loss": 0.2497, + "step": 29346 + }, + { + "epoch": 0.5448476364811026, + "grad_norm": 0.41449519991874695, + "learning_rate": 8.595537951486938e-06, + "loss": 0.3893, + "step": 29348 + }, + { + "epoch": 0.5448847666185213, + "grad_norm": 0.49547505378723145, + "learning_rate": 8.594383032274512e-06, + "loss": 0.2032, + "step": 29350 + }, + { + "epoch": 0.5449218967559399, + "grad_norm": 0.3524686396121979, + "learning_rate": 8.593228132188322e-06, + "loss": 0.1248, + "step": 29352 + }, + { + "epoch": 0.5449590268933585, + "grad_norm": 0.3160668611526489, + "learning_rate": 8.592073251244083e-06, + "loss": 0.2721, + "step": 29354 + }, + { + "epoch": 0.5449961570307772, + "grad_norm": 0.6410817503929138, + "learning_rate": 8.590918389457513e-06, + "loss": 0.4092, + "step": 29356 + }, + { + "epoch": 0.5450332871681958, + "grad_norm": 0.4196353256702423, + "learning_rate": 8.58976354684432e-06, + "loss": 0.2926, + "step": 29358 + }, + { + "epoch": 0.5450704173056145, + "grad_norm": 0.3630804419517517, + "learning_rate": 8.588608723420226e-06, + "loss": 0.277, + "step": 29360 + }, + { + "epoch": 0.5451075474430331, + "grad_norm": 0.38126394152641296, + "learning_rate": 8.587453919200937e-06, + "loss": 0.2567, + "step": 29362 + }, + { + "epoch": 0.5451446775804517, + "grad_norm": 0.47089827060699463, + "learning_rate": 8.586299134202165e-06, + "loss": 0.2469, + "step": 29364 + }, + { + "epoch": 0.5451818077178704, + "grad_norm": 0.4572823941707611, + "learning_rate": 8.585144368439632e-06, + "loss": 0.2357, + "step": 29366 + }, + { + "epoch": 0.545218937855289, + "grad_norm": 0.3201158046722412, + "learning_rate": 8.583989621929045e-06, + "loss": 0.516, + "step": 29368 + }, + { + "epoch": 0.5452560679927076, + "grad_norm": 0.25221237540245056, + "learning_rate": 8.582834894686116e-06, + "loss": 0.268, + "step": 29370 + }, + { + "epoch": 0.5452931981301263, + "grad_norm": 0.4691847264766693, + "learning_rate": 8.581680186726565e-06, + "loss": 0.1847, + "step": 29372 + }, + { + "epoch": 0.5453303282675449, + "grad_norm": 0.2895698547363281, + "learning_rate": 8.580525498066092e-06, + "loss": 0.3571, + "step": 29374 + }, + { + "epoch": 0.5453674584049636, + "grad_norm": 0.2594881057739258, + "learning_rate": 8.579370828720418e-06, + "loss": 0.2436, + "step": 29376 + }, + { + "epoch": 0.5454045885423822, + "grad_norm": 0.32218727469444275, + "learning_rate": 8.57821617870525e-06, + "loss": 0.3968, + "step": 29378 + }, + { + "epoch": 0.5454417186798008, + "grad_norm": 0.48512133955955505, + "learning_rate": 8.577061548036302e-06, + "loss": 0.3378, + "step": 29380 + }, + { + "epoch": 0.5454788488172194, + "grad_norm": 0.3078174591064453, + "learning_rate": 8.575906936729283e-06, + "loss": 0.1891, + "step": 29382 + }, + { + "epoch": 0.5455159789546381, + "grad_norm": 0.3876544237136841, + "learning_rate": 8.574752344799906e-06, + "loss": 0.4891, + "step": 29384 + }, + { + "epoch": 0.5455531090920568, + "grad_norm": 0.22093725204467773, + "learning_rate": 8.573597772263884e-06, + "loss": 0.1763, + "step": 29386 + }, + { + "epoch": 0.5455902392294754, + "grad_norm": 0.45386743545532227, + "learning_rate": 8.57244321913692e-06, + "loss": 0.3071, + "step": 29388 + }, + { + "epoch": 0.545627369366894, + "grad_norm": 0.4772425591945648, + "learning_rate": 8.571288685434727e-06, + "loss": 0.1918, + "step": 29390 + }, + { + "epoch": 0.5456644995043126, + "grad_norm": 0.530746340751648, + "learning_rate": 8.570134171173017e-06, + "loss": 0.2592, + "step": 29392 + }, + { + "epoch": 0.5457016296417313, + "grad_norm": 0.430203914642334, + "learning_rate": 8.568979676367495e-06, + "loss": 0.3222, + "step": 29394 + }, + { + "epoch": 0.54573875977915, + "grad_norm": 0.3285220265388489, + "learning_rate": 8.567825201033878e-06, + "loss": 0.2714, + "step": 29396 + }, + { + "epoch": 0.5457758899165686, + "grad_norm": 0.63829106092453, + "learning_rate": 8.566670745187869e-06, + "loss": 0.4461, + "step": 29398 + }, + { + "epoch": 0.5458130200539872, + "grad_norm": 0.4500024616718292, + "learning_rate": 8.565516308845179e-06, + "loss": 0.4219, + "step": 29400 + }, + { + "epoch": 0.5458501501914058, + "grad_norm": 0.5143587589263916, + "learning_rate": 8.56436189202151e-06, + "loss": 0.2334, + "step": 29402 + }, + { + "epoch": 0.5458872803288245, + "grad_norm": 0.6377624869346619, + "learning_rate": 8.56320749473258e-06, + "loss": 0.3156, + "step": 29404 + }, + { + "epoch": 0.5459244104662432, + "grad_norm": 0.25424185395240784, + "learning_rate": 8.562053116994088e-06, + "loss": 0.2873, + "step": 29406 + }, + { + "epoch": 0.5459615406036618, + "grad_norm": 0.2730737626552582, + "learning_rate": 8.560898758821751e-06, + "loss": 0.2995, + "step": 29408 + }, + { + "epoch": 0.5459986707410804, + "grad_norm": 0.4875289499759674, + "learning_rate": 8.559744420231272e-06, + "loss": 0.2175, + "step": 29410 + }, + { + "epoch": 0.546035800878499, + "grad_norm": 0.3255956172943115, + "learning_rate": 8.558590101238353e-06, + "loss": 0.3691, + "step": 29412 + }, + { + "epoch": 0.5460729310159177, + "grad_norm": 0.3296992778778076, + "learning_rate": 8.557435801858706e-06, + "loss": 0.1409, + "step": 29414 + }, + { + "epoch": 0.5461100611533364, + "grad_norm": 0.34794560074806213, + "learning_rate": 8.556281522108037e-06, + "loss": 0.3194, + "step": 29416 + }, + { + "epoch": 0.546147191290755, + "grad_norm": 0.4201442003250122, + "learning_rate": 8.555127262002054e-06, + "loss": 0.3529, + "step": 29418 + }, + { + "epoch": 0.5461843214281736, + "grad_norm": 0.3547205626964569, + "learning_rate": 8.553973021556457e-06, + "loss": 0.5032, + "step": 29420 + }, + { + "epoch": 0.5462214515655922, + "grad_norm": 0.3350699245929718, + "learning_rate": 8.552818800786957e-06, + "loss": 0.1418, + "step": 29422 + }, + { + "epoch": 0.5462585817030109, + "grad_norm": 0.43205714225769043, + "learning_rate": 8.551664599709264e-06, + "loss": 0.2846, + "step": 29424 + }, + { + "epoch": 0.5462957118404296, + "grad_norm": 0.26601508259773254, + "learning_rate": 8.550510418339072e-06, + "loss": 0.2907, + "step": 29426 + }, + { + "epoch": 0.5463328419778481, + "grad_norm": 0.47079217433929443, + "learning_rate": 8.549356256692092e-06, + "loss": 0.2974, + "step": 29428 + }, + { + "epoch": 0.5463699721152668, + "grad_norm": 0.3658428192138672, + "learning_rate": 8.54820211478403e-06, + "loss": 0.2326, + "step": 29430 + }, + { + "epoch": 0.5464071022526854, + "grad_norm": 0.2766571044921875, + "learning_rate": 8.547047992630586e-06, + "loss": 0.4282, + "step": 29432 + }, + { + "epoch": 0.5464442323901041, + "grad_norm": 0.4139076769351959, + "learning_rate": 8.545893890247467e-06, + "loss": 0.2459, + "step": 29434 + }, + { + "epoch": 0.5464813625275227, + "grad_norm": 0.408913254737854, + "learning_rate": 8.54473980765038e-06, + "loss": 0.3762, + "step": 29436 + }, + { + "epoch": 0.5465184926649413, + "grad_norm": 0.580410361289978, + "learning_rate": 8.543585744855024e-06, + "loss": 0.2703, + "step": 29438 + }, + { + "epoch": 0.54655562280236, + "grad_norm": 0.5737597942352295, + "learning_rate": 8.5424317018771e-06, + "loss": 0.2995, + "step": 29440 + }, + { + "epoch": 0.5465927529397786, + "grad_norm": 0.4350390136241913, + "learning_rate": 8.541277678732318e-06, + "loss": 0.2394, + "step": 29442 + }, + { + "epoch": 0.5466298830771973, + "grad_norm": 0.3459354341030121, + "learning_rate": 8.540123675436375e-06, + "loss": 0.4399, + "step": 29444 + }, + { + "epoch": 0.5466670132146159, + "grad_norm": 0.20533688366413116, + "learning_rate": 8.538969692004977e-06, + "loss": 0.2288, + "step": 29446 + }, + { + "epoch": 0.5467041433520345, + "grad_norm": 0.5552954077720642, + "learning_rate": 8.53781572845383e-06, + "loss": 0.3677, + "step": 29448 + }, + { + "epoch": 0.5467412734894532, + "grad_norm": 0.42814841866493225, + "learning_rate": 8.536661784798625e-06, + "loss": 0.2172, + "step": 29450 + }, + { + "epoch": 0.5467784036268718, + "grad_norm": 0.4103059768676758, + "learning_rate": 8.535507861055072e-06, + "loss": 0.2807, + "step": 29452 + }, + { + "epoch": 0.5468155337642905, + "grad_norm": 0.3233448266983032, + "learning_rate": 8.53435395723887e-06, + "loss": 0.1688, + "step": 29454 + }, + { + "epoch": 0.5468526639017091, + "grad_norm": 0.30224910378456116, + "learning_rate": 8.53320007336572e-06, + "loss": 0.3671, + "step": 29456 + }, + { + "epoch": 0.5468897940391277, + "grad_norm": 0.36225661635398865, + "learning_rate": 8.532046209451322e-06, + "loss": 0.2667, + "step": 29458 + }, + { + "epoch": 0.5469269241765464, + "grad_norm": 0.2656540274620056, + "learning_rate": 8.530892365511381e-06, + "loss": 0.2337, + "step": 29460 + }, + { + "epoch": 0.546964054313965, + "grad_norm": 0.6690658330917358, + "learning_rate": 8.529738541561595e-06, + "loss": 0.2246, + "step": 29462 + }, + { + "epoch": 0.5470011844513837, + "grad_norm": 0.38893425464630127, + "learning_rate": 8.528584737617664e-06, + "loss": 0.215, + "step": 29464 + }, + { + "epoch": 0.5470383145888023, + "grad_norm": 0.27084463834762573, + "learning_rate": 8.527430953695284e-06, + "loss": 0.307, + "step": 29466 + }, + { + "epoch": 0.5470754447262209, + "grad_norm": 0.332878977060318, + "learning_rate": 8.526277189810157e-06, + "loss": 0.2087, + "step": 29468 + }, + { + "epoch": 0.5471125748636396, + "grad_norm": 0.2772464454174042, + "learning_rate": 8.525123445977985e-06, + "loss": 0.1552, + "step": 29470 + }, + { + "epoch": 0.5471497050010582, + "grad_norm": 2.5414254665374756, + "learning_rate": 8.523969722214467e-06, + "loss": 0.2646, + "step": 29472 + }, + { + "epoch": 0.5471868351384769, + "grad_norm": 0.45054513216018677, + "learning_rate": 8.522816018535301e-06, + "loss": 0.2148, + "step": 29474 + }, + { + "epoch": 0.5472239652758955, + "grad_norm": 0.42739564180374146, + "learning_rate": 8.52166233495618e-06, + "loss": 0.3996, + "step": 29476 + }, + { + "epoch": 0.5472610954133141, + "grad_norm": 0.5109399557113647, + "learning_rate": 8.520508671492807e-06, + "loss": 0.1984, + "step": 29478 + }, + { + "epoch": 0.5472982255507327, + "grad_norm": 0.41114407777786255, + "learning_rate": 8.519355028160881e-06, + "loss": 0.2378, + "step": 29480 + }, + { + "epoch": 0.5473353556881514, + "grad_norm": 0.41081589460372925, + "learning_rate": 8.518201404976097e-06, + "loss": 0.2549, + "step": 29482 + }, + { + "epoch": 0.5473724858255701, + "grad_norm": 0.2856124937534332, + "learning_rate": 8.517047801954154e-06, + "loss": 0.3273, + "step": 29484 + }, + { + "epoch": 0.5474096159629886, + "grad_norm": 0.39343586564064026, + "learning_rate": 8.515894219110752e-06, + "loss": 0.2401, + "step": 29486 + }, + { + "epoch": 0.5474467461004073, + "grad_norm": 0.32764309644699097, + "learning_rate": 8.514740656461579e-06, + "loss": 0.2299, + "step": 29488 + }, + { + "epoch": 0.5474838762378259, + "grad_norm": 0.2899824380874634, + "learning_rate": 8.513587114022338e-06, + "loss": 0.1612, + "step": 29490 + }, + { + "epoch": 0.5475210063752446, + "grad_norm": 0.4124244749546051, + "learning_rate": 8.512433591808724e-06, + "loss": 0.3067, + "step": 29492 + }, + { + "epoch": 0.5475581365126633, + "grad_norm": 0.29867270588874817, + "learning_rate": 8.511280089836433e-06, + "loss": 0.1515, + "step": 29494 + }, + { + "epoch": 0.5475952666500818, + "grad_norm": 0.30294597148895264, + "learning_rate": 8.510126608121161e-06, + "loss": 0.4286, + "step": 29496 + }, + { + "epoch": 0.5476323967875005, + "grad_norm": 0.5003699064254761, + "learning_rate": 8.508973146678605e-06, + "loss": 0.1303, + "step": 29498 + }, + { + "epoch": 0.5476695269249191, + "grad_norm": 0.6814979314804077, + "learning_rate": 8.507819705524457e-06, + "loss": 0.3632, + "step": 29500 + }, + { + "epoch": 0.5477066570623378, + "grad_norm": 0.4204842746257782, + "learning_rate": 8.506666284674412e-06, + "loss": 0.3272, + "step": 29502 + }, + { + "epoch": 0.5477437871997565, + "grad_norm": 0.19407132267951965, + "learning_rate": 8.505512884144167e-06, + "loss": 0.2306, + "step": 29504 + }, + { + "epoch": 0.547780917337175, + "grad_norm": 0.5600400567054749, + "learning_rate": 8.504359503949415e-06, + "loss": 0.5105, + "step": 29506 + }, + { + "epoch": 0.5478180474745937, + "grad_norm": 0.27770617604255676, + "learning_rate": 8.503206144105849e-06, + "loss": 0.2642, + "step": 29508 + }, + { + "epoch": 0.5478551776120123, + "grad_norm": 0.39783868193626404, + "learning_rate": 8.502052804629163e-06, + "loss": 0.2691, + "step": 29510 + }, + { + "epoch": 0.547892307749431, + "grad_norm": 0.4914141595363617, + "learning_rate": 8.500899485535058e-06, + "loss": 0.3238, + "step": 29512 + }, + { + "epoch": 0.5479294378868497, + "grad_norm": 0.5469263195991516, + "learning_rate": 8.499746186839215e-06, + "loss": 0.3548, + "step": 29514 + }, + { + "epoch": 0.5479665680242682, + "grad_norm": 0.3003077507019043, + "learning_rate": 8.498592908557334e-06, + "loss": 0.1714, + "step": 29516 + }, + { + "epoch": 0.5480036981616869, + "grad_norm": 0.32447564601898193, + "learning_rate": 8.497439650705106e-06, + "loss": 0.2247, + "step": 29518 + }, + { + "epoch": 0.5480408282991055, + "grad_norm": 0.4069264829158783, + "learning_rate": 8.496286413298223e-06, + "loss": 0.2087, + "step": 29520 + }, + { + "epoch": 0.5480779584365242, + "grad_norm": 0.3681446611881256, + "learning_rate": 8.495133196352377e-06, + "loss": 0.1478, + "step": 29522 + }, + { + "epoch": 0.5481150885739429, + "grad_norm": 0.30589357018470764, + "learning_rate": 8.493979999883266e-06, + "loss": 0.3183, + "step": 29524 + }, + { + "epoch": 0.5481522187113614, + "grad_norm": 0.2754327356815338, + "learning_rate": 8.492826823906573e-06, + "loss": 0.3515, + "step": 29526 + }, + { + "epoch": 0.5481893488487801, + "grad_norm": 0.3715721666812897, + "learning_rate": 8.491673668437992e-06, + "loss": 0.2342, + "step": 29528 + }, + { + "epoch": 0.5482264789861987, + "grad_norm": 0.15907639265060425, + "learning_rate": 8.490520533493211e-06, + "loss": 0.0624, + "step": 29530 + }, + { + "epoch": 0.5482636091236174, + "grad_norm": 0.3349675238132477, + "learning_rate": 8.489367419087926e-06, + "loss": 0.3386, + "step": 29532 + }, + { + "epoch": 0.548300739261036, + "grad_norm": 0.5463942289352417, + "learning_rate": 8.488214325237829e-06, + "loss": 0.2161, + "step": 29534 + }, + { + "epoch": 0.5483378693984546, + "grad_norm": 0.3647940456867218, + "learning_rate": 8.487061251958606e-06, + "loss": 0.4235, + "step": 29536 + }, + { + "epoch": 0.5483749995358733, + "grad_norm": 0.23527753353118896, + "learning_rate": 8.485908199265947e-06, + "loss": 0.2033, + "step": 29538 + }, + { + "epoch": 0.5484121296732919, + "grad_norm": 0.3834899961948395, + "learning_rate": 8.484755167175541e-06, + "loss": 0.3793, + "step": 29540 + }, + { + "epoch": 0.5484492598107106, + "grad_norm": 0.39678052067756653, + "learning_rate": 8.483602155703079e-06, + "loss": 0.3552, + "step": 29542 + }, + { + "epoch": 0.5484863899481291, + "grad_norm": 0.37919747829437256, + "learning_rate": 8.482449164864248e-06, + "loss": 0.2535, + "step": 29544 + }, + { + "epoch": 0.5485235200855478, + "grad_norm": 0.5572365522384644, + "learning_rate": 8.48129619467474e-06, + "loss": 0.2346, + "step": 29546 + }, + { + "epoch": 0.5485606502229665, + "grad_norm": 0.3966858386993408, + "learning_rate": 8.48014324515024e-06, + "loss": 0.4016, + "step": 29548 + }, + { + "epoch": 0.5485977803603851, + "grad_norm": 0.26546618342399597, + "learning_rate": 8.478990316306442e-06, + "loss": 0.209, + "step": 29550 + }, + { + "epoch": 0.5486349104978038, + "grad_norm": 0.47832462191581726, + "learning_rate": 8.477837408159026e-06, + "loss": 0.3201, + "step": 29552 + }, + { + "epoch": 0.5486720406352223, + "grad_norm": 0.25968796014785767, + "learning_rate": 8.476684520723683e-06, + "loss": 0.2261, + "step": 29554 + }, + { + "epoch": 0.548709170772641, + "grad_norm": 0.4551369249820709, + "learning_rate": 8.475531654016104e-06, + "loss": 0.2591, + "step": 29556 + }, + { + "epoch": 0.5487463009100597, + "grad_norm": 0.2256239950656891, + "learning_rate": 8.47437880805197e-06, + "loss": 0.254, + "step": 29558 + }, + { + "epoch": 0.5487834310474783, + "grad_norm": 0.38865092396736145, + "learning_rate": 8.473225982846971e-06, + "loss": 0.471, + "step": 29560 + }, + { + "epoch": 0.548820561184897, + "grad_norm": 0.38139957189559937, + "learning_rate": 8.472073178416798e-06, + "loss": 0.2729, + "step": 29562 + }, + { + "epoch": 0.5488576913223155, + "grad_norm": 0.21085870265960693, + "learning_rate": 8.470920394777127e-06, + "loss": 0.1975, + "step": 29564 + }, + { + "epoch": 0.5488948214597342, + "grad_norm": 0.3681356906890869, + "learning_rate": 8.469767631943651e-06, + "loss": 0.3372, + "step": 29566 + }, + { + "epoch": 0.5489319515971529, + "grad_norm": 0.2004367709159851, + "learning_rate": 8.468614889932055e-06, + "loss": 0.391, + "step": 29568 + }, + { + "epoch": 0.5489690817345715, + "grad_norm": 0.3810936212539673, + "learning_rate": 8.46746216875802e-06, + "loss": 0.3781, + "step": 29570 + }, + { + "epoch": 0.5490062118719902, + "grad_norm": 0.5741381049156189, + "learning_rate": 8.466309468437235e-06, + "loss": 0.26, + "step": 29572 + }, + { + "epoch": 0.5490433420094087, + "grad_norm": 0.48123329877853394, + "learning_rate": 8.46515678898539e-06, + "loss": 0.3651, + "step": 29574 + }, + { + "epoch": 0.5490804721468274, + "grad_norm": 0.4998578429222107, + "learning_rate": 8.464004130418157e-06, + "loss": 0.2832, + "step": 29576 + }, + { + "epoch": 0.5491176022842461, + "grad_norm": 0.48362430930137634, + "learning_rate": 8.46285149275123e-06, + "loss": 0.2779, + "step": 29578 + }, + { + "epoch": 0.5491547324216647, + "grad_norm": 0.4045029878616333, + "learning_rate": 8.46169887600029e-06, + "loss": 0.3614, + "step": 29580 + }, + { + "epoch": 0.5491918625590834, + "grad_norm": 0.3947937488555908, + "learning_rate": 8.460546280181018e-06, + "loss": 0.28, + "step": 29582 + }, + { + "epoch": 0.5492289926965019, + "grad_norm": 0.3645005226135254, + "learning_rate": 8.4593937053091e-06, + "loss": 0.3471, + "step": 29584 + }, + { + "epoch": 0.5492661228339206, + "grad_norm": 0.6296367049217224, + "learning_rate": 8.458241151400223e-06, + "loss": 0.3591, + "step": 29586 + }, + { + "epoch": 0.5493032529713392, + "grad_norm": 0.34151867032051086, + "learning_rate": 8.457088618470064e-06, + "loss": 0.2352, + "step": 29588 + }, + { + "epoch": 0.5493403831087579, + "grad_norm": 0.4820873737335205, + "learning_rate": 8.455936106534308e-06, + "loss": 0.3794, + "step": 29590 + }, + { + "epoch": 0.5493775132461766, + "grad_norm": 0.4958608150482178, + "learning_rate": 8.454783615608634e-06, + "loss": 0.2119, + "step": 29592 + }, + { + "epoch": 0.5494146433835951, + "grad_norm": 0.46441322565078735, + "learning_rate": 8.453631145708726e-06, + "loss": 0.383, + "step": 29594 + }, + { + "epoch": 0.5494517735210138, + "grad_norm": 0.354354590177536, + "learning_rate": 8.452478696850268e-06, + "loss": 0.2379, + "step": 29596 + }, + { + "epoch": 0.5494889036584324, + "grad_norm": 0.410057932138443, + "learning_rate": 8.451326269048941e-06, + "loss": 0.2988, + "step": 29598 + }, + { + "epoch": 0.5495260337958511, + "grad_norm": 0.18379488587379456, + "learning_rate": 8.450173862320425e-06, + "loss": 0.396, + "step": 29600 + }, + { + "epoch": 0.5495631639332698, + "grad_norm": 0.34139513969421387, + "learning_rate": 8.4490214766804e-06, + "loss": 0.2417, + "step": 29602 + }, + { + "epoch": 0.5496002940706883, + "grad_norm": 0.535956084728241, + "learning_rate": 8.447869112144544e-06, + "loss": 0.3587, + "step": 29604 + }, + { + "epoch": 0.549637424208107, + "grad_norm": 0.45256224274635315, + "learning_rate": 8.446716768728543e-06, + "loss": 0.2962, + "step": 29606 + }, + { + "epoch": 0.5496745543455256, + "grad_norm": 0.4366180896759033, + "learning_rate": 8.445564446448072e-06, + "loss": 0.2863, + "step": 29608 + }, + { + "epoch": 0.5497116844829443, + "grad_norm": 0.4836982786655426, + "learning_rate": 8.444412145318815e-06, + "loss": 0.1852, + "step": 29610 + }, + { + "epoch": 0.549748814620363, + "grad_norm": 0.5191369652748108, + "learning_rate": 8.44325986535645e-06, + "loss": 0.3641, + "step": 29612 + }, + { + "epoch": 0.5497859447577815, + "grad_norm": 0.3350653350353241, + "learning_rate": 8.442107606576652e-06, + "loss": 0.4311, + "step": 29614 + }, + { + "epoch": 0.5498230748952002, + "grad_norm": 0.5967183709144592, + "learning_rate": 8.440955368995105e-06, + "loss": 0.3083, + "step": 29616 + }, + { + "epoch": 0.5498602050326188, + "grad_norm": 0.39591431617736816, + "learning_rate": 8.439803152627483e-06, + "loss": 0.3851, + "step": 29618 + }, + { + "epoch": 0.5498973351700375, + "grad_norm": 0.37901535630226135, + "learning_rate": 8.43865095748947e-06, + "loss": 0.345, + "step": 29620 + }, + { + "epoch": 0.5499344653074562, + "grad_norm": 0.3170936405658722, + "learning_rate": 8.437498783596739e-06, + "loss": 0.2525, + "step": 29622 + }, + { + "epoch": 0.5499715954448747, + "grad_norm": 0.33359119296073914, + "learning_rate": 8.436346630964973e-06, + "loss": 0.2222, + "step": 29624 + }, + { + "epoch": 0.5500087255822934, + "grad_norm": 0.39624372124671936, + "learning_rate": 8.435194499609841e-06, + "loss": 0.235, + "step": 29626 + }, + { + "epoch": 0.550045855719712, + "grad_norm": 0.29178348183631897, + "learning_rate": 8.434042389547026e-06, + "loss": 0.2949, + "step": 29628 + }, + { + "epoch": 0.5500829858571307, + "grad_norm": 0.4446309804916382, + "learning_rate": 8.432890300792202e-06, + "loss": 0.2422, + "step": 29630 + }, + { + "epoch": 0.5501201159945492, + "grad_norm": 0.34932273626327515, + "learning_rate": 8.431738233361051e-06, + "loss": 0.2506, + "step": 29632 + }, + { + "epoch": 0.5501572461319679, + "grad_norm": 0.30129197239875793, + "learning_rate": 8.43058618726924e-06, + "loss": 0.4619, + "step": 29634 + }, + { + "epoch": 0.5501943762693866, + "grad_norm": 0.4278201758861542, + "learning_rate": 8.429434162532454e-06, + "loss": 0.2816, + "step": 29636 + }, + { + "epoch": 0.5502315064068052, + "grad_norm": 0.5870686173439026, + "learning_rate": 8.428282159166365e-06, + "loss": 0.2263, + "step": 29638 + }, + { + "epoch": 0.5502686365442239, + "grad_norm": 0.4553792476654053, + "learning_rate": 8.427130177186646e-06, + "loss": 0.3765, + "step": 29640 + }, + { + "epoch": 0.5503057666816424, + "grad_norm": 0.37603169679641724, + "learning_rate": 8.425978216608976e-06, + "loss": 0.3398, + "step": 29642 + }, + { + "epoch": 0.5503428968190611, + "grad_norm": 0.474406898021698, + "learning_rate": 8.424826277449025e-06, + "loss": 0.2323, + "step": 29644 + }, + { + "epoch": 0.5503800269564798, + "grad_norm": 0.2842349708080292, + "learning_rate": 8.423674359722471e-06, + "loss": 0.1291, + "step": 29646 + }, + { + "epoch": 0.5504171570938984, + "grad_norm": 0.2941468358039856, + "learning_rate": 8.422522463444986e-06, + "loss": 0.2435, + "step": 29648 + }, + { + "epoch": 0.5504542872313171, + "grad_norm": 0.45396292209625244, + "learning_rate": 8.42137058863225e-06, + "loss": 0.3079, + "step": 29650 + }, + { + "epoch": 0.5504914173687356, + "grad_norm": 0.36189764738082886, + "learning_rate": 8.420218735299929e-06, + "loss": 0.3917, + "step": 29652 + }, + { + "epoch": 0.5505285475061543, + "grad_norm": 0.3167206346988678, + "learning_rate": 8.4190669034637e-06, + "loss": 0.1783, + "step": 29654 + }, + { + "epoch": 0.550565677643573, + "grad_norm": 0.40876254439353943, + "learning_rate": 8.417915093139232e-06, + "loss": 0.4082, + "step": 29656 + }, + { + "epoch": 0.5506028077809916, + "grad_norm": 0.7512531280517578, + "learning_rate": 8.416763304342202e-06, + "loss": 0.3855, + "step": 29658 + }, + { + "epoch": 0.5506399379184103, + "grad_norm": 0.40257060527801514, + "learning_rate": 8.415611537088279e-06, + "loss": 0.4172, + "step": 29660 + }, + { + "epoch": 0.5506770680558288, + "grad_norm": 0.35208678245544434, + "learning_rate": 8.414459791393144e-06, + "loss": 0.2877, + "step": 29662 + }, + { + "epoch": 0.5507141981932475, + "grad_norm": 0.4353802502155304, + "learning_rate": 8.41330806727246e-06, + "loss": 0.2873, + "step": 29664 + }, + { + "epoch": 0.5507513283306662, + "grad_norm": 0.5050287246704102, + "learning_rate": 8.412156364741896e-06, + "loss": 0.3522, + "step": 29666 + }, + { + "epoch": 0.5507884584680848, + "grad_norm": 0.2604857087135315, + "learning_rate": 8.411004683817129e-06, + "loss": 0.2177, + "step": 29668 + }, + { + "epoch": 0.5508255886055035, + "grad_norm": 0.3294623792171478, + "learning_rate": 8.409853024513828e-06, + "loss": 0.192, + "step": 29670 + }, + { + "epoch": 0.550862718742922, + "grad_norm": 0.5258513689041138, + "learning_rate": 8.408701386847668e-06, + "loss": 0.3332, + "step": 29672 + }, + { + "epoch": 0.5508998488803407, + "grad_norm": 0.2859926223754883, + "learning_rate": 8.407549770834312e-06, + "loss": 0.2652, + "step": 29674 + }, + { + "epoch": 0.5509369790177594, + "grad_norm": 0.4033631682395935, + "learning_rate": 8.40639817648944e-06, + "loss": 0.3011, + "step": 29676 + }, + { + "epoch": 0.550974109155178, + "grad_norm": 0.17057602107524872, + "learning_rate": 8.405246603828707e-06, + "loss": 0.2408, + "step": 29678 + }, + { + "epoch": 0.5510112392925967, + "grad_norm": 0.4241195321083069, + "learning_rate": 8.404095052867793e-06, + "loss": 0.2024, + "step": 29680 + }, + { + "epoch": 0.5510483694300152, + "grad_norm": 0.2661696970462799, + "learning_rate": 8.402943523622366e-06, + "loss": 0.1341, + "step": 29682 + }, + { + "epoch": 0.5510854995674339, + "grad_norm": 0.2898448705673218, + "learning_rate": 8.401792016108096e-06, + "loss": 0.2322, + "step": 29684 + }, + { + "epoch": 0.5511226297048525, + "grad_norm": 0.4003200829029083, + "learning_rate": 8.400640530340647e-06, + "loss": 0.2691, + "step": 29686 + }, + { + "epoch": 0.5511597598422712, + "grad_norm": 0.2662317156791687, + "learning_rate": 8.399489066335695e-06, + "loss": 0.173, + "step": 29688 + }, + { + "epoch": 0.5511968899796899, + "grad_norm": 0.45847034454345703, + "learning_rate": 8.398337624108897e-06, + "loss": 0.1946, + "step": 29690 + }, + { + "epoch": 0.5512340201171084, + "grad_norm": 0.49228665232658386, + "learning_rate": 8.397186203675926e-06, + "loss": 0.2572, + "step": 29692 + }, + { + "epoch": 0.5512711502545271, + "grad_norm": 0.37093785405158997, + "learning_rate": 8.396034805052453e-06, + "loss": 0.3547, + "step": 29694 + }, + { + "epoch": 0.5513082803919457, + "grad_norm": 0.4184906780719757, + "learning_rate": 8.394883428254139e-06, + "loss": 0.4533, + "step": 29696 + }, + { + "epoch": 0.5513454105293644, + "grad_norm": 0.5013591647148132, + "learning_rate": 8.393732073296654e-06, + "loss": 0.208, + "step": 29698 + }, + { + "epoch": 0.551382540666783, + "grad_norm": 0.4134382903575897, + "learning_rate": 8.392580740195669e-06, + "loss": 0.3003, + "step": 29700 + }, + { + "epoch": 0.5514196708042016, + "grad_norm": 0.46402209997177124, + "learning_rate": 8.39142942896684e-06, + "loss": 0.305, + "step": 29702 + }, + { + "epoch": 0.5514568009416203, + "grad_norm": 0.28141483664512634, + "learning_rate": 8.390278139625839e-06, + "loss": 0.2202, + "step": 29704 + }, + { + "epoch": 0.5514939310790389, + "grad_norm": 0.3504815995693207, + "learning_rate": 8.389126872188333e-06, + "loss": 0.1518, + "step": 29706 + }, + { + "epoch": 0.5515310612164576, + "grad_norm": 0.36789146065711975, + "learning_rate": 8.387975626669982e-06, + "loss": 0.2319, + "step": 29708 + }, + { + "epoch": 0.5515681913538762, + "grad_norm": 0.4471376836299896, + "learning_rate": 8.386824403086455e-06, + "loss": 0.4276, + "step": 29710 + }, + { + "epoch": 0.5516053214912948, + "grad_norm": 0.3521673083305359, + "learning_rate": 8.385673201453416e-06, + "loss": 0.2054, + "step": 29712 + }, + { + "epoch": 0.5516424516287135, + "grad_norm": 0.4252486824989319, + "learning_rate": 8.384522021786534e-06, + "loss": 0.2412, + "step": 29714 + }, + { + "epoch": 0.5516795817661321, + "grad_norm": 0.31602349877357483, + "learning_rate": 8.383370864101464e-06, + "loss": 0.1956, + "step": 29716 + }, + { + "epoch": 0.5517167119035508, + "grad_norm": 0.5742897391319275, + "learning_rate": 8.382219728413875e-06, + "loss": 0.3732, + "step": 29718 + }, + { + "epoch": 0.5517538420409694, + "grad_norm": 0.4237477481365204, + "learning_rate": 8.381068614739428e-06, + "loss": 0.3486, + "step": 29720 + }, + { + "epoch": 0.551790972178388, + "grad_norm": 0.34446054697036743, + "learning_rate": 8.379917523093788e-06, + "loss": 0.3259, + "step": 29722 + }, + { + "epoch": 0.5518281023158067, + "grad_norm": 0.42857325077056885, + "learning_rate": 8.378766453492621e-06, + "loss": 0.3333, + "step": 29724 + }, + { + "epoch": 0.5518652324532253, + "grad_norm": 0.20612427592277527, + "learning_rate": 8.377615405951584e-06, + "loss": 0.4361, + "step": 29726 + }, + { + "epoch": 0.551902362590644, + "grad_norm": 0.8472954034805298, + "learning_rate": 8.376464380486344e-06, + "loss": 0.343, + "step": 29728 + }, + { + "epoch": 0.5519394927280626, + "grad_norm": 0.25781169533729553, + "learning_rate": 8.375313377112558e-06, + "loss": 0.3917, + "step": 29730 + }, + { + "epoch": 0.5519766228654812, + "grad_norm": 0.3379178047180176, + "learning_rate": 8.37416239584589e-06, + "loss": 0.2115, + "step": 29732 + }, + { + "epoch": 0.5520137530028999, + "grad_norm": 0.28762543201446533, + "learning_rate": 8.373011436702003e-06, + "loss": 0.3057, + "step": 29734 + }, + { + "epoch": 0.5520508831403185, + "grad_norm": 0.2951710522174835, + "learning_rate": 8.371860499696558e-06, + "loss": 0.218, + "step": 29736 + }, + { + "epoch": 0.5520880132777372, + "grad_norm": 0.4116617739200592, + "learning_rate": 8.370709584845215e-06, + "loss": 0.306, + "step": 29738 + }, + { + "epoch": 0.5521251434151557, + "grad_norm": 0.4373941123485565, + "learning_rate": 8.369558692163634e-06, + "loss": 0.2075, + "step": 29740 + }, + { + "epoch": 0.5521622735525744, + "grad_norm": 0.3932211995124817, + "learning_rate": 8.368407821667473e-06, + "loss": 0.1079, + "step": 29742 + }, + { + "epoch": 0.5521994036899931, + "grad_norm": 0.4484252631664276, + "learning_rate": 8.367256973372396e-06, + "loss": 0.1403, + "step": 29744 + }, + { + "epoch": 0.5522365338274117, + "grad_norm": 0.348652720451355, + "learning_rate": 8.36610614729406e-06, + "loss": 0.2509, + "step": 29746 + }, + { + "epoch": 0.5522736639648304, + "grad_norm": 0.2790572941303253, + "learning_rate": 8.364955343448127e-06, + "loss": 0.2477, + "step": 29748 + }, + { + "epoch": 0.5523107941022489, + "grad_norm": 0.30164262652397156, + "learning_rate": 8.363804561850253e-06, + "loss": 0.3405, + "step": 29750 + }, + { + "epoch": 0.5523479242396676, + "grad_norm": 0.4022175967693329, + "learning_rate": 8.362653802516101e-06, + "loss": 0.3793, + "step": 29752 + }, + { + "epoch": 0.5523850543770863, + "grad_norm": 0.2693368196487427, + "learning_rate": 8.361503065461323e-06, + "loss": 0.2227, + "step": 29754 + }, + { + "epoch": 0.5524221845145049, + "grad_norm": 0.2327956259250641, + "learning_rate": 8.360352350701579e-06, + "loss": 0.3339, + "step": 29756 + }, + { + "epoch": 0.5524593146519235, + "grad_norm": 0.48128634691238403, + "learning_rate": 8.359201658252531e-06, + "loss": 0.3183, + "step": 29758 + }, + { + "epoch": 0.5524964447893421, + "grad_norm": 0.37836596369743347, + "learning_rate": 8.358050988129833e-06, + "loss": 0.2509, + "step": 29760 + }, + { + "epoch": 0.5525335749267608, + "grad_norm": 0.3320165276527405, + "learning_rate": 8.35690034034914e-06, + "loss": 0.2313, + "step": 29762 + }, + { + "epoch": 0.5525707050641795, + "grad_norm": 0.41630059480667114, + "learning_rate": 8.35574971492612e-06, + "loss": 0.2661, + "step": 29764 + }, + { + "epoch": 0.5526078352015981, + "grad_norm": 0.38498005270957947, + "learning_rate": 8.354599111876415e-06, + "loss": 0.2546, + "step": 29766 + }, + { + "epoch": 0.5526449653390167, + "grad_norm": 0.7790164351463318, + "learning_rate": 8.353448531215686e-06, + "loss": 0.1586, + "step": 29768 + }, + { + "epoch": 0.5526820954764353, + "grad_norm": 0.6608381271362305, + "learning_rate": 8.352297972959594e-06, + "loss": 0.3292, + "step": 29770 + }, + { + "epoch": 0.552719225613854, + "grad_norm": 0.5020132660865784, + "learning_rate": 8.35114743712379e-06, + "loss": 0.3759, + "step": 29772 + }, + { + "epoch": 0.5527563557512727, + "grad_norm": 0.5110713839530945, + "learning_rate": 8.349996923723929e-06, + "loss": 0.2945, + "step": 29774 + }, + { + "epoch": 0.5527934858886913, + "grad_norm": 0.26747772097587585, + "learning_rate": 8.348846432775672e-06, + "loss": 0.245, + "step": 29776 + }, + { + "epoch": 0.5528306160261099, + "grad_norm": 0.3154778480529785, + "learning_rate": 8.347695964294665e-06, + "loss": 0.3245, + "step": 29778 + }, + { + "epoch": 0.5528677461635285, + "grad_norm": 0.33503904938697815, + "learning_rate": 8.346545518296569e-06, + "loss": 0.2878, + "step": 29780 + }, + { + "epoch": 0.5529048763009472, + "grad_norm": 0.36724853515625, + "learning_rate": 8.345395094797035e-06, + "loss": 0.345, + "step": 29782 + }, + { + "epoch": 0.5529420064383658, + "grad_norm": 0.2365207076072693, + "learning_rate": 8.344244693811717e-06, + "loss": 0.3763, + "step": 29784 + }, + { + "epoch": 0.5529791365757845, + "grad_norm": 0.7945840954780579, + "learning_rate": 8.343094315356268e-06, + "loss": 0.4443, + "step": 29786 + }, + { + "epoch": 0.5530162667132031, + "grad_norm": 0.3459824025630951, + "learning_rate": 8.341943959446347e-06, + "loss": 0.4156, + "step": 29788 + }, + { + "epoch": 0.5530533968506217, + "grad_norm": 0.24934335052967072, + "learning_rate": 8.340793626097599e-06, + "loss": 0.2112, + "step": 29790 + }, + { + "epoch": 0.5530905269880404, + "grad_norm": 0.7116324305534363, + "learning_rate": 8.33964331532568e-06, + "loss": 0.3924, + "step": 29792 + }, + { + "epoch": 0.553127657125459, + "grad_norm": 0.442314475774765, + "learning_rate": 8.338493027146241e-06, + "loss": 0.2192, + "step": 29794 + }, + { + "epoch": 0.5531647872628777, + "grad_norm": 0.4371623694896698, + "learning_rate": 8.337342761574937e-06, + "loss": 0.4533, + "step": 29796 + }, + { + "epoch": 0.5532019174002963, + "grad_norm": 0.4227270185947418, + "learning_rate": 8.336192518627414e-06, + "loss": 0.201, + "step": 29798 + }, + { + "epoch": 0.5532390475377149, + "grad_norm": 0.36401867866516113, + "learning_rate": 8.335042298319333e-06, + "loss": 0.3025, + "step": 29800 + }, + { + "epoch": 0.5532761776751336, + "grad_norm": 0.296690434217453, + "learning_rate": 8.333892100666338e-06, + "loss": 0.4312, + "step": 29802 + }, + { + "epoch": 0.5533133078125522, + "grad_norm": 0.4659436047077179, + "learning_rate": 8.33274192568408e-06, + "loss": 0.3107, + "step": 29804 + }, + { + "epoch": 0.5533504379499709, + "grad_norm": 0.28338250517845154, + "learning_rate": 8.331591773388208e-06, + "loss": 0.2867, + "step": 29806 + }, + { + "epoch": 0.5533875680873895, + "grad_norm": 0.34068912267684937, + "learning_rate": 8.330441643794376e-06, + "loss": 0.5331, + "step": 29808 + }, + { + "epoch": 0.5534246982248081, + "grad_norm": 0.41322460770606995, + "learning_rate": 8.329291536918234e-06, + "loss": 0.2053, + "step": 29810 + }, + { + "epoch": 0.5534618283622268, + "grad_norm": 0.44815555214881897, + "learning_rate": 8.328141452775427e-06, + "loss": 0.2053, + "step": 29812 + }, + { + "epoch": 0.5534989584996454, + "grad_norm": 0.34072810411453247, + "learning_rate": 8.326991391381611e-06, + "loss": 0.3035, + "step": 29814 + }, + { + "epoch": 0.553536088637064, + "grad_norm": 0.3203069269657135, + "learning_rate": 8.325841352752427e-06, + "loss": 0.3049, + "step": 29816 + }, + { + "epoch": 0.5535732187744827, + "grad_norm": 0.27456966042518616, + "learning_rate": 8.324691336903528e-06, + "loss": 0.3151, + "step": 29818 + }, + { + "epoch": 0.5536103489119013, + "grad_norm": 0.3897068202495575, + "learning_rate": 8.323541343850561e-06, + "loss": 0.501, + "step": 29820 + }, + { + "epoch": 0.55364747904932, + "grad_norm": 0.41484394669532776, + "learning_rate": 8.32239137360918e-06, + "loss": 0.3531, + "step": 29822 + }, + { + "epoch": 0.5536846091867386, + "grad_norm": 0.40265825390815735, + "learning_rate": 8.321241426195022e-06, + "loss": 0.284, + "step": 29824 + }, + { + "epoch": 0.5537217393241572, + "grad_norm": 0.35671958327293396, + "learning_rate": 8.320091501623744e-06, + "loss": 0.1586, + "step": 29826 + }, + { + "epoch": 0.5537588694615759, + "grad_norm": 0.3852379322052002, + "learning_rate": 8.318941599910986e-06, + "loss": 0.2807, + "step": 29828 + }, + { + "epoch": 0.5537959995989945, + "grad_norm": 0.49228161573410034, + "learning_rate": 8.317791721072396e-06, + "loss": 0.1886, + "step": 29830 + }, + { + "epoch": 0.5538331297364132, + "grad_norm": 0.3709466755390167, + "learning_rate": 8.316641865123624e-06, + "loss": 0.391, + "step": 29832 + }, + { + "epoch": 0.5538702598738318, + "grad_norm": 0.29470640420913696, + "learning_rate": 8.315492032080314e-06, + "loss": 0.1621, + "step": 29834 + }, + { + "epoch": 0.5539073900112504, + "grad_norm": 0.4109002351760864, + "learning_rate": 8.314342221958109e-06, + "loss": 0.0796, + "step": 29836 + }, + { + "epoch": 0.553944520148669, + "grad_norm": 0.3403627574443817, + "learning_rate": 8.313192434772659e-06, + "loss": 0.3232, + "step": 29838 + }, + { + "epoch": 0.5539816502860877, + "grad_norm": 0.37002187967300415, + "learning_rate": 8.312042670539612e-06, + "loss": 0.2959, + "step": 29840 + }, + { + "epoch": 0.5540187804235064, + "grad_norm": 0.3811722695827484, + "learning_rate": 8.310892929274603e-06, + "loss": 0.4498, + "step": 29842 + }, + { + "epoch": 0.554055910560925, + "grad_norm": 0.2105754315853119, + "learning_rate": 8.309743210993283e-06, + "loss": 0.2821, + "step": 29844 + }, + { + "epoch": 0.5540930406983436, + "grad_norm": 0.34644967317581177, + "learning_rate": 8.308593515711293e-06, + "loss": 0.2091, + "step": 29846 + }, + { + "epoch": 0.5541301708357622, + "grad_norm": 0.32401999831199646, + "learning_rate": 8.30744384344428e-06, + "loss": 0.173, + "step": 29848 + }, + { + "epoch": 0.5541673009731809, + "grad_norm": 0.4948294460773468, + "learning_rate": 8.306294194207888e-06, + "loss": 0.2431, + "step": 29850 + }, + { + "epoch": 0.5542044311105996, + "grad_norm": 0.37393561005592346, + "learning_rate": 8.305144568017762e-06, + "loss": 0.2617, + "step": 29852 + }, + { + "epoch": 0.5542415612480182, + "grad_norm": 0.5742794275283813, + "learning_rate": 8.303994964889537e-06, + "loss": 0.1826, + "step": 29854 + }, + { + "epoch": 0.5542786913854368, + "grad_norm": 0.42632126808166504, + "learning_rate": 8.302845384838861e-06, + "loss": 0.2757, + "step": 29856 + }, + { + "epoch": 0.5543158215228554, + "grad_norm": 0.4956934154033661, + "learning_rate": 8.301695827881375e-06, + "loss": 0.1983, + "step": 29858 + }, + { + "epoch": 0.5543529516602741, + "grad_norm": 0.39284560084342957, + "learning_rate": 8.300546294032723e-06, + "loss": 0.2687, + "step": 29860 + }, + { + "epoch": 0.5543900817976928, + "grad_norm": 0.2390819936990738, + "learning_rate": 8.299396783308544e-06, + "loss": 0.1368, + "step": 29862 + }, + { + "epoch": 0.5544272119351114, + "grad_norm": 0.7485839128494263, + "learning_rate": 8.298247295724486e-06, + "loss": 0.3089, + "step": 29864 + }, + { + "epoch": 0.55446434207253, + "grad_norm": 0.3891410529613495, + "learning_rate": 8.297097831296182e-06, + "loss": 0.1834, + "step": 29866 + }, + { + "epoch": 0.5545014722099486, + "grad_norm": 0.3280470669269562, + "learning_rate": 8.295948390039273e-06, + "loss": 0.1663, + "step": 29868 + }, + { + "epoch": 0.5545386023473673, + "grad_norm": 0.3790913224220276, + "learning_rate": 8.294798971969402e-06, + "loss": 0.3071, + "step": 29870 + }, + { + "epoch": 0.554575732484786, + "grad_norm": 0.4453819990158081, + "learning_rate": 8.293649577102213e-06, + "loss": 0.2461, + "step": 29872 + }, + { + "epoch": 0.5546128626222045, + "grad_norm": 0.5803861618041992, + "learning_rate": 8.29250020545334e-06, + "loss": 0.4481, + "step": 29874 + }, + { + "epoch": 0.5546499927596232, + "grad_norm": 0.5808305740356445, + "learning_rate": 8.291350857038426e-06, + "loss": 0.2343, + "step": 29876 + }, + { + "epoch": 0.5546871228970418, + "grad_norm": 0.37444931268692017, + "learning_rate": 8.29020153187311e-06, + "loss": 0.4974, + "step": 29878 + }, + { + "epoch": 0.5547242530344605, + "grad_norm": 0.293048620223999, + "learning_rate": 8.289052229973027e-06, + "loss": 0.2064, + "step": 29880 + }, + { + "epoch": 0.5547613831718792, + "grad_norm": 0.5807592272758484, + "learning_rate": 8.287902951353818e-06, + "loss": 0.1936, + "step": 29882 + }, + { + "epoch": 0.5547985133092977, + "grad_norm": 0.3036838173866272, + "learning_rate": 8.286753696031121e-06, + "loss": 0.1635, + "step": 29884 + }, + { + "epoch": 0.5548356434467164, + "grad_norm": 0.865731418132782, + "learning_rate": 8.285604464020576e-06, + "loss": 0.2447, + "step": 29886 + }, + { + "epoch": 0.554872773584135, + "grad_norm": 0.24341550469398499, + "learning_rate": 8.284455255337818e-06, + "loss": 0.2342, + "step": 29888 + }, + { + "epoch": 0.5549099037215537, + "grad_norm": 0.552036702632904, + "learning_rate": 8.28330606999849e-06, + "loss": 0.2524, + "step": 29890 + }, + { + "epoch": 0.5549470338589723, + "grad_norm": 0.6531091928482056, + "learning_rate": 8.28215690801822e-06, + "loss": 0.4213, + "step": 29892 + }, + { + "epoch": 0.554984163996391, + "grad_norm": 0.32186159491539, + "learning_rate": 8.281007769412649e-06, + "loss": 0.1684, + "step": 29894 + }, + { + "epoch": 0.5550212941338096, + "grad_norm": 0.40910765528678894, + "learning_rate": 8.279858654197415e-06, + "loss": 0.3347, + "step": 29896 + }, + { + "epoch": 0.5550584242712282, + "grad_norm": 0.5070319771766663, + "learning_rate": 8.27870956238815e-06, + "loss": 0.3065, + "step": 29898 + }, + { + "epoch": 0.5550955544086469, + "grad_norm": 0.4051646590232849, + "learning_rate": 8.277560494000491e-06, + "loss": 0.3031, + "step": 29900 + }, + { + "epoch": 0.5551326845460655, + "grad_norm": 0.33359289169311523, + "learning_rate": 8.27641144905008e-06, + "loss": 0.3236, + "step": 29902 + }, + { + "epoch": 0.5551698146834841, + "grad_norm": 0.9940642714500427, + "learning_rate": 8.275262427552541e-06, + "loss": 0.3611, + "step": 29904 + }, + { + "epoch": 0.5552069448209028, + "grad_norm": 0.29814261198043823, + "learning_rate": 8.274113429523516e-06, + "loss": 0.3249, + "step": 29906 + }, + { + "epoch": 0.5552440749583214, + "grad_norm": 0.3672918677330017, + "learning_rate": 8.272964454978638e-06, + "loss": 0.4686, + "step": 29908 + }, + { + "epoch": 0.5552812050957401, + "grad_norm": 0.5019917488098145, + "learning_rate": 8.27181550393354e-06, + "loss": 0.3553, + "step": 29910 + }, + { + "epoch": 0.5553183352331587, + "grad_norm": 0.38007551431655884, + "learning_rate": 8.270666576403856e-06, + "loss": 0.153, + "step": 29912 + }, + { + "epoch": 0.5553554653705773, + "grad_norm": 0.4911530315876007, + "learning_rate": 8.269517672405226e-06, + "loss": 0.2503, + "step": 29914 + }, + { + "epoch": 0.555392595507996, + "grad_norm": 0.311686247587204, + "learning_rate": 8.268368791953269e-06, + "loss": 0.1289, + "step": 29916 + }, + { + "epoch": 0.5554297256454146, + "grad_norm": 0.33318284153938293, + "learning_rate": 8.267219935063631e-06, + "loss": 0.3504, + "step": 29918 + }, + { + "epoch": 0.5554668557828333, + "grad_norm": 0.561971127986908, + "learning_rate": 8.266071101751936e-06, + "loss": 0.4043, + "step": 29920 + }, + { + "epoch": 0.5555039859202519, + "grad_norm": 0.43648287653923035, + "learning_rate": 8.264922292033819e-06, + "loss": 0.1611, + "step": 29922 + }, + { + "epoch": 0.5555411160576705, + "grad_norm": 0.2933158874511719, + "learning_rate": 8.263773505924914e-06, + "loss": 0.2958, + "step": 29924 + }, + { + "epoch": 0.5555782461950892, + "grad_norm": 0.3781273365020752, + "learning_rate": 8.262624743440852e-06, + "loss": 0.1689, + "step": 29926 + }, + { + "epoch": 0.5556153763325078, + "grad_norm": 0.32228848338127136, + "learning_rate": 8.261476004597263e-06, + "loss": 0.4075, + "step": 29928 + }, + { + "epoch": 0.5556525064699265, + "grad_norm": 0.3182445466518402, + "learning_rate": 8.260327289409779e-06, + "loss": 0.4898, + "step": 29930 + }, + { + "epoch": 0.555689636607345, + "grad_norm": 0.2422630339860916, + "learning_rate": 8.259178597894027e-06, + "loss": 0.3613, + "step": 29932 + }, + { + "epoch": 0.5557267667447637, + "grad_norm": 0.4437832236289978, + "learning_rate": 8.258029930065641e-06, + "loss": 0.3255, + "step": 29934 + }, + { + "epoch": 0.5557638968821823, + "grad_norm": 0.34847161173820496, + "learning_rate": 8.256881285940248e-06, + "loss": 0.1816, + "step": 29936 + }, + { + "epoch": 0.555801027019601, + "grad_norm": 0.6126718521118164, + "learning_rate": 8.255732665533482e-06, + "loss": 0.388, + "step": 29938 + }, + { + "epoch": 0.5558381571570197, + "grad_norm": 1.5948585271835327, + "learning_rate": 8.254584068860973e-06, + "loss": 0.232, + "step": 29940 + }, + { + "epoch": 0.5558752872944382, + "grad_norm": 0.410330206155777, + "learning_rate": 8.253435495938342e-06, + "loss": 0.1965, + "step": 29942 + }, + { + "epoch": 0.5559124174318569, + "grad_norm": 0.4859998822212219, + "learning_rate": 8.252286946781221e-06, + "loss": 0.269, + "step": 29944 + }, + { + "epoch": 0.5559495475692755, + "grad_norm": 0.3334956467151642, + "learning_rate": 8.251138421405241e-06, + "loss": 0.3332, + "step": 29946 + }, + { + "epoch": 0.5559866777066942, + "grad_norm": 0.3331974744796753, + "learning_rate": 8.249989919826031e-06, + "loss": 0.1801, + "step": 29948 + }, + { + "epoch": 0.5560238078441129, + "grad_norm": 0.4292665719985962, + "learning_rate": 8.248841442059214e-06, + "loss": 0.2981, + "step": 29950 + }, + { + "epoch": 0.5560609379815314, + "grad_norm": 0.2759549021720886, + "learning_rate": 8.247692988120424e-06, + "loss": 0.1597, + "step": 29952 + }, + { + "epoch": 0.5560980681189501, + "grad_norm": 0.46558675169944763, + "learning_rate": 8.246544558025279e-06, + "loss": 0.2625, + "step": 29954 + }, + { + "epoch": 0.5561351982563687, + "grad_norm": 0.6499155759811401, + "learning_rate": 8.24539615178941e-06, + "loss": 0.2623, + "step": 29956 + }, + { + "epoch": 0.5561723283937874, + "grad_norm": 0.39783161878585815, + "learning_rate": 8.244247769428444e-06, + "loss": 0.3455, + "step": 29958 + }, + { + "epoch": 0.5562094585312061, + "grad_norm": 0.34358033537864685, + "learning_rate": 8.24309941095801e-06, + "loss": 0.3624, + "step": 29960 + }, + { + "epoch": 0.5562465886686246, + "grad_norm": 0.4012523889541626, + "learning_rate": 8.241951076393726e-06, + "loss": 0.2804, + "step": 29962 + }, + { + "epoch": 0.5562837188060433, + "grad_norm": 0.3931114077568054, + "learning_rate": 8.240802765751223e-06, + "loss": 0.3491, + "step": 29964 + }, + { + "epoch": 0.5563208489434619, + "grad_norm": 0.45764410495758057, + "learning_rate": 8.23965447904613e-06, + "loss": 0.3176, + "step": 29966 + }, + { + "epoch": 0.5563579790808806, + "grad_norm": 0.42533084750175476, + "learning_rate": 8.238506216294062e-06, + "loss": 0.0987, + "step": 29968 + }, + { + "epoch": 0.5563951092182993, + "grad_norm": 0.39778098464012146, + "learning_rate": 8.237357977510649e-06, + "loss": 0.1238, + "step": 29970 + }, + { + "epoch": 0.5564322393557178, + "grad_norm": 0.4398359954357147, + "learning_rate": 8.236209762711516e-06, + "loss": 0.476, + "step": 29972 + }, + { + "epoch": 0.5564693694931365, + "grad_norm": 0.37641459703445435, + "learning_rate": 8.235061571912282e-06, + "loss": 0.1082, + "step": 29974 + }, + { + "epoch": 0.5565064996305551, + "grad_norm": 0.2843545079231262, + "learning_rate": 8.233913405128572e-06, + "loss": 0.225, + "step": 29976 + }, + { + "epoch": 0.5565436297679738, + "grad_norm": 0.35863617062568665, + "learning_rate": 8.232765262376017e-06, + "loss": 0.1856, + "step": 29978 + }, + { + "epoch": 0.5565807599053925, + "grad_norm": 0.41007199883461, + "learning_rate": 8.23161714367023e-06, + "loss": 0.327, + "step": 29980 + }, + { + "epoch": 0.556617890042811, + "grad_norm": 0.47068873047828674, + "learning_rate": 8.230469049026835e-06, + "loss": 0.2472, + "step": 29982 + }, + { + "epoch": 0.5566550201802297, + "grad_norm": 0.5136558413505554, + "learning_rate": 8.229320978461457e-06, + "loss": 0.2835, + "step": 29984 + }, + { + "epoch": 0.5566921503176483, + "grad_norm": 0.8411747217178345, + "learning_rate": 8.228172931989715e-06, + "loss": 0.2659, + "step": 29986 + }, + { + "epoch": 0.556729280455067, + "grad_norm": 0.3823561668395996, + "learning_rate": 8.227024909627234e-06, + "loss": 0.1061, + "step": 29988 + }, + { + "epoch": 0.5567664105924855, + "grad_norm": 0.537619411945343, + "learning_rate": 8.225876911389636e-06, + "loss": 0.2512, + "step": 29990 + }, + { + "epoch": 0.5568035407299042, + "grad_norm": 0.4279894530773163, + "learning_rate": 8.224728937292535e-06, + "loss": 0.2858, + "step": 29992 + }, + { + "epoch": 0.5568406708673229, + "grad_norm": 0.45453375577926636, + "learning_rate": 8.223580987351559e-06, + "loss": 0.2551, + "step": 29994 + }, + { + "epoch": 0.5568778010047415, + "grad_norm": 0.3921010494232178, + "learning_rate": 8.22243306158232e-06, + "loss": 0.4282, + "step": 29996 + }, + { + "epoch": 0.5569149311421602, + "grad_norm": 0.6211047172546387, + "learning_rate": 8.221285160000445e-06, + "loss": 0.241, + "step": 29998 + }, + { + "epoch": 0.5569520612795787, + "grad_norm": 0.5025211572647095, + "learning_rate": 8.220137282621551e-06, + "loss": 0.2081, + "step": 30000 + }, + { + "epoch": 0.5569891914169974, + "grad_norm": 0.3004184663295746, + "learning_rate": 8.21898942946126e-06, + "loss": 0.4883, + "step": 30002 + }, + { + "epoch": 0.5570263215544161, + "grad_norm": 0.2746807336807251, + "learning_rate": 8.217841600535187e-06, + "loss": 0.2318, + "step": 30004 + }, + { + "epoch": 0.5570634516918347, + "grad_norm": 0.36326611042022705, + "learning_rate": 8.21669379585895e-06, + "loss": 0.3016, + "step": 30006 + }, + { + "epoch": 0.5571005818292534, + "grad_norm": 0.4255525767803192, + "learning_rate": 8.215546015448169e-06, + "loss": 0.4402, + "step": 30008 + }, + { + "epoch": 0.557137711966672, + "grad_norm": 0.2435346096754074, + "learning_rate": 8.214398259318461e-06, + "loss": 0.2103, + "step": 30010 + }, + { + "epoch": 0.5571748421040906, + "grad_norm": 0.2673097848892212, + "learning_rate": 8.213250527485446e-06, + "loss": 0.1927, + "step": 30012 + }, + { + "epoch": 0.5572119722415093, + "grad_norm": 0.5042145848274231, + "learning_rate": 8.212102819964738e-06, + "loss": 0.3071, + "step": 30014 + }, + { + "epoch": 0.5572491023789279, + "grad_norm": 0.38507479429244995, + "learning_rate": 8.210955136771958e-06, + "loss": 0.336, + "step": 30016 + }, + { + "epoch": 0.5572862325163466, + "grad_norm": 0.37809982895851135, + "learning_rate": 8.209807477922718e-06, + "loss": 0.1269, + "step": 30018 + }, + { + "epoch": 0.5573233626537651, + "grad_norm": 0.3661077916622162, + "learning_rate": 8.208659843432633e-06, + "loss": 0.1981, + "step": 30020 + }, + { + "epoch": 0.5573604927911838, + "grad_norm": 0.3845650553703308, + "learning_rate": 8.207512233317324e-06, + "loss": 0.2308, + "step": 30022 + }, + { + "epoch": 0.5573976229286025, + "grad_norm": 0.550507664680481, + "learning_rate": 8.206364647592406e-06, + "loss": 0.4359, + "step": 30024 + }, + { + "epoch": 0.5574347530660211, + "grad_norm": 0.24292199313640594, + "learning_rate": 8.205217086273491e-06, + "loss": 0.3145, + "step": 30026 + }, + { + "epoch": 0.5574718832034398, + "grad_norm": 0.32117652893066406, + "learning_rate": 8.204069549376198e-06, + "loss": 0.3141, + "step": 30028 + }, + { + "epoch": 0.5575090133408583, + "grad_norm": 0.4574264585971832, + "learning_rate": 8.202922036916136e-06, + "loss": 0.3445, + "step": 30030 + }, + { + "epoch": 0.557546143478277, + "grad_norm": 0.26199522614479065, + "learning_rate": 8.201774548908921e-06, + "loss": 0.1583, + "step": 30032 + }, + { + "epoch": 0.5575832736156957, + "grad_norm": 0.24008169770240784, + "learning_rate": 8.200627085370172e-06, + "loss": 0.3128, + "step": 30034 + }, + { + "epoch": 0.5576204037531143, + "grad_norm": 0.45081865787506104, + "learning_rate": 8.199479646315496e-06, + "loss": 0.3574, + "step": 30036 + }, + { + "epoch": 0.557657533890533, + "grad_norm": 0.4980800151824951, + "learning_rate": 8.198332231760508e-06, + "loss": 0.2122, + "step": 30038 + }, + { + "epoch": 0.5576946640279515, + "grad_norm": 0.36799395084381104, + "learning_rate": 8.197184841720822e-06, + "loss": 0.1784, + "step": 30040 + }, + { + "epoch": 0.5577317941653702, + "grad_norm": 0.40446123480796814, + "learning_rate": 8.196037476212056e-06, + "loss": 0.4325, + "step": 30042 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.38782864809036255, + "learning_rate": 8.19489013524981e-06, + "loss": 0.2808, + "step": 30044 + }, + { + "epoch": 0.5578060544402075, + "grad_norm": 0.3911173939704895, + "learning_rate": 8.193742818849705e-06, + "loss": 0.3478, + "step": 30046 + }, + { + "epoch": 0.5578431845776262, + "grad_norm": 0.3728505074977875, + "learning_rate": 8.192595527027349e-06, + "loss": 0.34, + "step": 30048 + }, + { + "epoch": 0.5578803147150447, + "grad_norm": 0.373843252658844, + "learning_rate": 8.191448259798353e-06, + "loss": 0.2082, + "step": 30050 + }, + { + "epoch": 0.5579174448524634, + "grad_norm": 0.2734020948410034, + "learning_rate": 8.19030101717833e-06, + "loss": 0.1867, + "step": 30052 + }, + { + "epoch": 0.557954574989882, + "grad_norm": 0.37600746750831604, + "learning_rate": 8.189153799182891e-06, + "loss": 0.2425, + "step": 30054 + }, + { + "epoch": 0.5579917051273007, + "grad_norm": 0.4295004606246948, + "learning_rate": 8.188006605827646e-06, + "loss": 0.2155, + "step": 30056 + }, + { + "epoch": 0.5580288352647194, + "grad_norm": 0.5866034626960754, + "learning_rate": 8.186859437128199e-06, + "loss": 0.2216, + "step": 30058 + }, + { + "epoch": 0.5580659654021379, + "grad_norm": 0.47462812066078186, + "learning_rate": 8.185712293100166e-06, + "loss": 0.3099, + "step": 30060 + }, + { + "epoch": 0.5581030955395566, + "grad_norm": 0.4157017767429352, + "learning_rate": 8.184565173759153e-06, + "loss": 0.2573, + "step": 30062 + }, + { + "epoch": 0.5581402256769752, + "grad_norm": 0.5625922083854675, + "learning_rate": 8.183418079120773e-06, + "loss": 0.2275, + "step": 30064 + }, + { + "epoch": 0.5581773558143939, + "grad_norm": 0.28348788619041443, + "learning_rate": 8.182271009200631e-06, + "loss": 0.2031, + "step": 30066 + }, + { + "epoch": 0.5582144859518126, + "grad_norm": 0.2383648008108139, + "learning_rate": 8.181123964014336e-06, + "loss": 0.1278, + "step": 30068 + }, + { + "epoch": 0.5582516160892311, + "grad_norm": 0.3857385814189911, + "learning_rate": 8.179976943577494e-06, + "loss": 0.2875, + "step": 30070 + }, + { + "epoch": 0.5582887462266498, + "grad_norm": 0.4877139627933502, + "learning_rate": 8.178829947905713e-06, + "loss": 0.2782, + "step": 30072 + }, + { + "epoch": 0.5583258763640684, + "grad_norm": 0.35712820291519165, + "learning_rate": 8.177682977014602e-06, + "loss": 0.2625, + "step": 30074 + }, + { + "epoch": 0.5583630065014871, + "grad_norm": 0.26694273948669434, + "learning_rate": 8.17653603091977e-06, + "loss": 0.3961, + "step": 30076 + }, + { + "epoch": 0.5584001366389058, + "grad_norm": 0.624577522277832, + "learning_rate": 8.17538910963682e-06, + "loss": 0.2291, + "step": 30078 + }, + { + "epoch": 0.5584372667763243, + "grad_norm": 0.2896917164325714, + "learning_rate": 8.174242213181358e-06, + "loss": 0.3155, + "step": 30080 + }, + { + "epoch": 0.558474396913743, + "grad_norm": 0.4383455216884613, + "learning_rate": 8.17309534156899e-06, + "loss": 0.2642, + "step": 30082 + }, + { + "epoch": 0.5585115270511616, + "grad_norm": 0.4391283094882965, + "learning_rate": 8.171948494815321e-06, + "loss": 0.4211, + "step": 30084 + }, + { + "epoch": 0.5585486571885803, + "grad_norm": 0.3569841980934143, + "learning_rate": 8.170801672935961e-06, + "loss": 0.2293, + "step": 30086 + }, + { + "epoch": 0.5585857873259988, + "grad_norm": 0.5980402231216431, + "learning_rate": 8.169654875946508e-06, + "loss": 0.3138, + "step": 30088 + }, + { + "epoch": 0.5586229174634175, + "grad_norm": 0.3345278203487396, + "learning_rate": 8.16850810386257e-06, + "loss": 0.1601, + "step": 30090 + }, + { + "epoch": 0.5586600476008362, + "grad_norm": 0.2378118485212326, + "learning_rate": 8.167361356699756e-06, + "loss": 0.1996, + "step": 30092 + }, + { + "epoch": 0.5586971777382548, + "grad_norm": 0.3939962685108185, + "learning_rate": 8.166214634473658e-06, + "loss": 0.3647, + "step": 30094 + }, + { + "epoch": 0.5587343078756735, + "grad_norm": 0.33015215396881104, + "learning_rate": 8.165067937199888e-06, + "loss": 0.1566, + "step": 30096 + }, + { + "epoch": 0.558771438013092, + "grad_norm": 0.3020341098308563, + "learning_rate": 8.163921264894047e-06, + "loss": 0.3261, + "step": 30098 + }, + { + "epoch": 0.5588085681505107, + "grad_norm": 0.3373798727989197, + "learning_rate": 8.162774617571739e-06, + "loss": 0.3478, + "step": 30100 + }, + { + "epoch": 0.5588456982879294, + "grad_norm": 0.5846282243728638, + "learning_rate": 8.161627995248562e-06, + "loss": 0.4073, + "step": 30102 + }, + { + "epoch": 0.558882828425348, + "grad_norm": 0.37299925088882446, + "learning_rate": 8.160481397940128e-06, + "loss": 0.2078, + "step": 30104 + }, + { + "epoch": 0.5589199585627667, + "grad_norm": 0.28174498677253723, + "learning_rate": 8.159334825662026e-06, + "loss": 0.1936, + "step": 30106 + }, + { + "epoch": 0.5589570887001852, + "grad_norm": 0.55593341588974, + "learning_rate": 8.158188278429867e-06, + "loss": 0.1707, + "step": 30108 + }, + { + "epoch": 0.5589942188376039, + "grad_norm": 0.4053742289543152, + "learning_rate": 8.157041756259247e-06, + "loss": 0.2618, + "step": 30110 + }, + { + "epoch": 0.5590313489750226, + "grad_norm": 0.29380252957344055, + "learning_rate": 8.155895259165769e-06, + "loss": 0.2076, + "step": 30112 + }, + { + "epoch": 0.5590684791124412, + "grad_norm": 0.35082343220710754, + "learning_rate": 8.15474878716503e-06, + "loss": 0.1715, + "step": 30114 + }, + { + "epoch": 0.5591056092498599, + "grad_norm": 0.6001505851745605, + "learning_rate": 8.15360234027264e-06, + "loss": 0.2689, + "step": 30116 + }, + { + "epoch": 0.5591427393872784, + "grad_norm": 0.402310848236084, + "learning_rate": 8.152455918504185e-06, + "loss": 0.2558, + "step": 30118 + }, + { + "epoch": 0.5591798695246971, + "grad_norm": 0.34992268681526184, + "learning_rate": 8.151309521875275e-06, + "loss": 0.4904, + "step": 30120 + }, + { + "epoch": 0.5592169996621158, + "grad_norm": 0.4056401550769806, + "learning_rate": 8.1501631504015e-06, + "loss": 0.3431, + "step": 30122 + }, + { + "epoch": 0.5592541297995344, + "grad_norm": 0.5071715116500854, + "learning_rate": 8.149016804098467e-06, + "loss": 0.2207, + "step": 30124 + }, + { + "epoch": 0.5592912599369531, + "grad_norm": 0.36764124035835266, + "learning_rate": 8.14787048298177e-06, + "loss": 0.4472, + "step": 30126 + }, + { + "epoch": 0.5593283900743716, + "grad_norm": 0.2616311013698578, + "learning_rate": 8.146724187067008e-06, + "loss": 0.2446, + "step": 30128 + }, + { + "epoch": 0.5593655202117903, + "grad_norm": 0.39513325691223145, + "learning_rate": 8.14557791636978e-06, + "loss": 0.388, + "step": 30130 + }, + { + "epoch": 0.559402650349209, + "grad_norm": 0.324196994304657, + "learning_rate": 8.144431670905683e-06, + "loss": 0.3151, + "step": 30132 + }, + { + "epoch": 0.5594397804866276, + "grad_norm": 0.4376422166824341, + "learning_rate": 8.14328545069031e-06, + "loss": 0.3717, + "step": 30134 + }, + { + "epoch": 0.5594769106240463, + "grad_norm": 0.6306324005126953, + "learning_rate": 8.14213925573926e-06, + "loss": 0.2731, + "step": 30136 + }, + { + "epoch": 0.5595140407614648, + "grad_norm": 0.3850463628768921, + "learning_rate": 8.14099308606813e-06, + "loss": 0.1809, + "step": 30138 + }, + { + "epoch": 0.5595511708988835, + "grad_norm": 0.4078371822834015, + "learning_rate": 8.139846941692517e-06, + "loss": 0.1432, + "step": 30140 + }, + { + "epoch": 0.5595883010363021, + "grad_norm": 0.47897517681121826, + "learning_rate": 8.138700822628018e-06, + "loss": 0.3573, + "step": 30142 + }, + { + "epoch": 0.5596254311737208, + "grad_norm": 0.4120645821094513, + "learning_rate": 8.137554728890222e-06, + "loss": 0.4687, + "step": 30144 + }, + { + "epoch": 0.5596625613111395, + "grad_norm": 0.307976096868515, + "learning_rate": 8.136408660494728e-06, + "loss": 0.3757, + "step": 30146 + }, + { + "epoch": 0.559699691448558, + "grad_norm": 0.3964973986148834, + "learning_rate": 8.13526261745713e-06, + "loss": 0.1903, + "step": 30148 + }, + { + "epoch": 0.5597368215859767, + "grad_norm": 0.36342376470565796, + "learning_rate": 8.134116599793023e-06, + "loss": 0.3501, + "step": 30150 + }, + { + "epoch": 0.5597739517233953, + "grad_norm": 0.42313218116760254, + "learning_rate": 8.132970607517998e-06, + "loss": 0.3628, + "step": 30152 + }, + { + "epoch": 0.559811081860814, + "grad_norm": 0.3811320662498474, + "learning_rate": 8.131824640647655e-06, + "loss": 0.1791, + "step": 30154 + }, + { + "epoch": 0.5598482119982326, + "grad_norm": 0.3339000344276428, + "learning_rate": 8.13067869919758e-06, + "loss": 0.0821, + "step": 30156 + }, + { + "epoch": 0.5598853421356512, + "grad_norm": 0.3232860565185547, + "learning_rate": 8.129532783183366e-06, + "loss": 0.4528, + "step": 30158 + }, + { + "epoch": 0.5599224722730699, + "grad_norm": 0.37227779626846313, + "learning_rate": 8.128386892620611e-06, + "loss": 0.2232, + "step": 30160 + }, + { + "epoch": 0.5599596024104885, + "grad_norm": 0.36323127150535583, + "learning_rate": 8.127241027524904e-06, + "loss": 0.1981, + "step": 30162 + }, + { + "epoch": 0.5599967325479072, + "grad_norm": 0.5447588562965393, + "learning_rate": 8.126095187911836e-06, + "loss": 0.1229, + "step": 30164 + }, + { + "epoch": 0.5600338626853258, + "grad_norm": 0.3681600093841553, + "learning_rate": 8.124949373797001e-06, + "loss": 0.3909, + "step": 30166 + }, + { + "epoch": 0.5600709928227444, + "grad_norm": 0.32206544280052185, + "learning_rate": 8.123803585195991e-06, + "loss": 0.217, + "step": 30168 + }, + { + "epoch": 0.5601081229601631, + "grad_norm": 0.46701258420944214, + "learning_rate": 8.12265782212439e-06, + "loss": 0.3525, + "step": 30170 + }, + { + "epoch": 0.5601452530975817, + "grad_norm": 0.7677059173583984, + "learning_rate": 8.121512084597796e-06, + "loss": 0.1731, + "step": 30172 + }, + { + "epoch": 0.5601823832350004, + "grad_norm": 0.6768728494644165, + "learning_rate": 8.120366372631792e-06, + "loss": 0.3812, + "step": 30174 + }, + { + "epoch": 0.560219513372419, + "grad_norm": 0.2740638852119446, + "learning_rate": 8.119220686241974e-06, + "loss": 0.2187, + "step": 30176 + }, + { + "epoch": 0.5602566435098376, + "grad_norm": 0.5069589614868164, + "learning_rate": 8.118075025443927e-06, + "loss": 0.3207, + "step": 30178 + }, + { + "epoch": 0.5602937736472563, + "grad_norm": 0.28309041261672974, + "learning_rate": 8.116929390253247e-06, + "loss": 0.2697, + "step": 30180 + }, + { + "epoch": 0.5603309037846749, + "grad_norm": 0.4566730260848999, + "learning_rate": 8.115783780685512e-06, + "loss": 0.2456, + "step": 30182 + }, + { + "epoch": 0.5603680339220936, + "grad_norm": 0.4592951834201813, + "learning_rate": 8.114638196756319e-06, + "loss": 0.1655, + "step": 30184 + }, + { + "epoch": 0.5604051640595122, + "grad_norm": 0.42102646827697754, + "learning_rate": 8.113492638481251e-06, + "loss": 0.2037, + "step": 30186 + }, + { + "epoch": 0.5604422941969308, + "grad_norm": 0.47339728474617004, + "learning_rate": 8.112347105875897e-06, + "loss": 0.1962, + "step": 30188 + }, + { + "epoch": 0.5604794243343495, + "grad_norm": 0.30878114700317383, + "learning_rate": 8.111201598955844e-06, + "loss": 0.3053, + "step": 30190 + }, + { + "epoch": 0.5605165544717681, + "grad_norm": 0.3250124156475067, + "learning_rate": 8.110056117736685e-06, + "loss": 0.4341, + "step": 30192 + }, + { + "epoch": 0.5605536846091868, + "grad_norm": 0.5107461214065552, + "learning_rate": 8.108910662233999e-06, + "loss": 0.1095, + "step": 30194 + }, + { + "epoch": 0.5605908147466053, + "grad_norm": 0.39063841104507446, + "learning_rate": 8.10776523246337e-06, + "loss": 0.3132, + "step": 30196 + }, + { + "epoch": 0.560627944884024, + "grad_norm": 0.3738793730735779, + "learning_rate": 8.106619828440391e-06, + "loss": 0.2627, + "step": 30198 + }, + { + "epoch": 0.5606650750214427, + "grad_norm": 0.6023288369178772, + "learning_rate": 8.105474450180645e-06, + "loss": 0.375, + "step": 30200 + }, + { + "epoch": 0.5607022051588613, + "grad_norm": 0.5444366931915283, + "learning_rate": 8.104329097699718e-06, + "loss": 0.3811, + "step": 30202 + }, + { + "epoch": 0.56073933529628, + "grad_norm": 0.256476491689682, + "learning_rate": 8.103183771013195e-06, + "loss": 0.4002, + "step": 30204 + }, + { + "epoch": 0.5607764654336985, + "grad_norm": 0.25194743275642395, + "learning_rate": 8.102038470136657e-06, + "loss": 0.2382, + "step": 30206 + }, + { + "epoch": 0.5608135955711172, + "grad_norm": 0.39973315596580505, + "learning_rate": 8.10089319508569e-06, + "loss": 0.3115, + "step": 30208 + }, + { + "epoch": 0.5608507257085359, + "grad_norm": 0.3081108331680298, + "learning_rate": 8.099747945875878e-06, + "loss": 0.0589, + "step": 30210 + }, + { + "epoch": 0.5608878558459545, + "grad_norm": 0.27437108755111694, + "learning_rate": 8.098602722522803e-06, + "loss": 0.2016, + "step": 30212 + }, + { + "epoch": 0.5609249859833731, + "grad_norm": 0.618203341960907, + "learning_rate": 8.097457525042053e-06, + "loss": 0.1776, + "step": 30214 + }, + { + "epoch": 0.5609621161207917, + "grad_norm": 0.45953524112701416, + "learning_rate": 8.096312353449205e-06, + "loss": 0.3395, + "step": 30216 + }, + { + "epoch": 0.5609992462582104, + "grad_norm": 0.3742344379425049, + "learning_rate": 8.095167207759848e-06, + "loss": 0.3659, + "step": 30218 + }, + { + "epoch": 0.5610363763956291, + "grad_norm": 0.5747145414352417, + "learning_rate": 8.094022087989557e-06, + "loss": 0.1863, + "step": 30220 + }, + { + "epoch": 0.5610735065330477, + "grad_norm": 0.2893756330013275, + "learning_rate": 8.092876994153913e-06, + "loss": 0.1415, + "step": 30222 + }, + { + "epoch": 0.5611106366704663, + "grad_norm": 0.38143131136894226, + "learning_rate": 8.091731926268504e-06, + "loss": 0.2908, + "step": 30224 + }, + { + "epoch": 0.5611477668078849, + "grad_norm": 0.27126309275627136, + "learning_rate": 8.090586884348906e-06, + "loss": 0.4208, + "step": 30226 + }, + { + "epoch": 0.5611848969453036, + "grad_norm": 0.3374236822128296, + "learning_rate": 8.089441868410702e-06, + "loss": 0.3781, + "step": 30228 + }, + { + "epoch": 0.5612220270827223, + "grad_norm": 0.2948911190032959, + "learning_rate": 8.088296878469475e-06, + "loss": 0.2502, + "step": 30230 + }, + { + "epoch": 0.5612591572201409, + "grad_norm": 0.42715033888816833, + "learning_rate": 8.087151914540796e-06, + "loss": 0.1814, + "step": 30232 + }, + { + "epoch": 0.5612962873575595, + "grad_norm": 0.6408441662788391, + "learning_rate": 8.08600697664025e-06, + "loss": 0.3599, + "step": 30234 + }, + { + "epoch": 0.5613334174949781, + "grad_norm": 0.38711637258529663, + "learning_rate": 8.084862064783418e-06, + "loss": 0.1933, + "step": 30236 + }, + { + "epoch": 0.5613705476323968, + "grad_norm": 0.38149070739746094, + "learning_rate": 8.083717178985877e-06, + "loss": 0.195, + "step": 30238 + }, + { + "epoch": 0.5614076777698154, + "grad_norm": 0.29618707299232483, + "learning_rate": 8.082572319263204e-06, + "loss": 0.1664, + "step": 30240 + }, + { + "epoch": 0.5614448079072341, + "grad_norm": 0.4671536087989807, + "learning_rate": 8.081427485630981e-06, + "loss": 0.2415, + "step": 30242 + }, + { + "epoch": 0.5614819380446527, + "grad_norm": 0.37555959820747375, + "learning_rate": 8.080282678104781e-06, + "loss": 0.3336, + "step": 30244 + }, + { + "epoch": 0.5615190681820713, + "grad_norm": 0.3112461566925049, + "learning_rate": 8.079137896700183e-06, + "loss": 0.1574, + "step": 30246 + }, + { + "epoch": 0.56155619831949, + "grad_norm": 0.568596601486206, + "learning_rate": 8.077993141432764e-06, + "loss": 0.2388, + "step": 30248 + }, + { + "epoch": 0.5615933284569086, + "grad_norm": 0.33753904700279236, + "learning_rate": 8.076848412318102e-06, + "loss": 0.3679, + "step": 30250 + }, + { + "epoch": 0.5616304585943273, + "grad_norm": 0.4225180745124817, + "learning_rate": 8.075703709371771e-06, + "loss": 0.2318, + "step": 30252 + }, + { + "epoch": 0.5616675887317459, + "grad_norm": 0.3956039547920227, + "learning_rate": 8.074559032609352e-06, + "loss": 0.2045, + "step": 30254 + }, + { + "epoch": 0.5617047188691645, + "grad_norm": 0.31032007932662964, + "learning_rate": 8.073414382046418e-06, + "loss": 0.3747, + "step": 30256 + }, + { + "epoch": 0.5617418490065832, + "grad_norm": 0.3781510889530182, + "learning_rate": 8.072269757698541e-06, + "loss": 0.4621, + "step": 30258 + }, + { + "epoch": 0.5617789791440018, + "grad_norm": 0.3770076632499695, + "learning_rate": 8.071125159581298e-06, + "loss": 0.2185, + "step": 30260 + }, + { + "epoch": 0.5618161092814205, + "grad_norm": 0.3826526701450348, + "learning_rate": 8.069980587710264e-06, + "loss": 0.3527, + "step": 30262 + }, + { + "epoch": 0.5618532394188391, + "grad_norm": 0.9721832275390625, + "learning_rate": 8.068836042101014e-06, + "loss": 0.2795, + "step": 30264 + }, + { + "epoch": 0.5618903695562577, + "grad_norm": 0.6243985891342163, + "learning_rate": 8.06769152276912e-06, + "loss": 0.4041, + "step": 30266 + }, + { + "epoch": 0.5619274996936764, + "grad_norm": 0.5163834691047668, + "learning_rate": 8.066547029730158e-06, + "loss": 0.237, + "step": 30268 + }, + { + "epoch": 0.561964629831095, + "grad_norm": 0.35948845744132996, + "learning_rate": 8.065402562999701e-06, + "loss": 0.2491, + "step": 30270 + }, + { + "epoch": 0.5620017599685136, + "grad_norm": 0.3760967552661896, + "learning_rate": 8.064258122593316e-06, + "loss": 0.1981, + "step": 30272 + }, + { + "epoch": 0.5620388901059323, + "grad_norm": 0.28214970231056213, + "learning_rate": 8.063113708526582e-06, + "loss": 0.2033, + "step": 30274 + }, + { + "epoch": 0.5620760202433509, + "grad_norm": 0.4050746560096741, + "learning_rate": 8.061969320815066e-06, + "loss": 0.2324, + "step": 30276 + }, + { + "epoch": 0.5621131503807696, + "grad_norm": 0.3127691149711609, + "learning_rate": 8.060824959474346e-06, + "loss": 0.1505, + "step": 30278 + }, + { + "epoch": 0.5621502805181882, + "grad_norm": 0.44916513562202454, + "learning_rate": 8.059680624519993e-06, + "loss": 0.2206, + "step": 30280 + }, + { + "epoch": 0.5621874106556068, + "grad_norm": 0.3895914852619171, + "learning_rate": 8.05853631596757e-06, + "loss": 0.3028, + "step": 30282 + }, + { + "epoch": 0.5622245407930255, + "grad_norm": 0.30895158648490906, + "learning_rate": 8.057392033832652e-06, + "loss": 0.2037, + "step": 30284 + }, + { + "epoch": 0.5622616709304441, + "grad_norm": 0.5032930970191956, + "learning_rate": 8.05624777813081e-06, + "loss": 0.3849, + "step": 30286 + }, + { + "epoch": 0.5622988010678628, + "grad_norm": 0.3455074429512024, + "learning_rate": 8.055103548877614e-06, + "loss": 0.2426, + "step": 30288 + }, + { + "epoch": 0.5623359312052814, + "grad_norm": 0.2796286940574646, + "learning_rate": 8.053959346088632e-06, + "loss": 0.4726, + "step": 30290 + }, + { + "epoch": 0.5623730613427, + "grad_norm": 0.3647927939891815, + "learning_rate": 8.052815169779434e-06, + "loss": 0.3179, + "step": 30292 + }, + { + "epoch": 0.5624101914801186, + "grad_norm": 0.5455312728881836, + "learning_rate": 8.051671019965595e-06, + "loss": 0.145, + "step": 30294 + }, + { + "epoch": 0.5624473216175373, + "grad_norm": 0.4319457709789276, + "learning_rate": 8.05052689666267e-06, + "loss": 0.3003, + "step": 30296 + }, + { + "epoch": 0.562484451754956, + "grad_norm": 0.4691339135169983, + "learning_rate": 8.049382799886237e-06, + "loss": 0.2039, + "step": 30298 + }, + { + "epoch": 0.5625215818923746, + "grad_norm": 0.2796179950237274, + "learning_rate": 8.048238729651864e-06, + "loss": 0.3252, + "step": 30300 + }, + { + "epoch": 0.5625587120297932, + "grad_norm": 0.2572115659713745, + "learning_rate": 8.047094685975112e-06, + "loss": 0.207, + "step": 30302 + }, + { + "epoch": 0.5625958421672118, + "grad_norm": 0.5285819172859192, + "learning_rate": 8.045950668871551e-06, + "loss": 0.2288, + "step": 30304 + }, + { + "epoch": 0.5626329723046305, + "grad_norm": 0.24390468001365662, + "learning_rate": 8.044806678356755e-06, + "loss": 0.1286, + "step": 30306 + }, + { + "epoch": 0.5626701024420492, + "grad_norm": 0.5649982690811157, + "learning_rate": 8.043662714446279e-06, + "loss": 0.4846, + "step": 30308 + }, + { + "epoch": 0.5627072325794678, + "grad_norm": 0.48388344049453735, + "learning_rate": 8.042518777155694e-06, + "loss": 0.313, + "step": 30310 + }, + { + "epoch": 0.5627443627168864, + "grad_norm": 0.26966148614883423, + "learning_rate": 8.041374866500564e-06, + "loss": 0.2665, + "step": 30312 + }, + { + "epoch": 0.562781492854305, + "grad_norm": 0.3024383783340454, + "learning_rate": 8.040230982496455e-06, + "loss": 0.1354, + "step": 30314 + }, + { + "epoch": 0.5628186229917237, + "grad_norm": 0.4717419445514679, + "learning_rate": 8.039087125158932e-06, + "loss": 0.2333, + "step": 30316 + }, + { + "epoch": 0.5628557531291424, + "grad_norm": 0.3630298376083374, + "learning_rate": 8.037943294503565e-06, + "loss": 0.4768, + "step": 30318 + }, + { + "epoch": 0.562892883266561, + "grad_norm": 0.2688644528388977, + "learning_rate": 8.036799490545907e-06, + "loss": 0.4132, + "step": 30320 + }, + { + "epoch": 0.5629300134039796, + "grad_norm": 0.41613903641700745, + "learning_rate": 8.03565571330153e-06, + "loss": 0.3567, + "step": 30322 + }, + { + "epoch": 0.5629671435413982, + "grad_norm": 0.5882170796394348, + "learning_rate": 8.034511962785994e-06, + "loss": 0.3226, + "step": 30324 + }, + { + "epoch": 0.5630042736788169, + "grad_norm": 0.45509546995162964, + "learning_rate": 8.03336823901486e-06, + "loss": 0.2357, + "step": 30326 + }, + { + "epoch": 0.5630414038162356, + "grad_norm": 0.19202972948551178, + "learning_rate": 8.032224542003696e-06, + "loss": 0.2367, + "step": 30328 + }, + { + "epoch": 0.5630785339536541, + "grad_norm": 0.42827412486076355, + "learning_rate": 8.031080871768063e-06, + "loss": 0.3667, + "step": 30330 + }, + { + "epoch": 0.5631156640910728, + "grad_norm": 0.3420211374759674, + "learning_rate": 8.029937228323525e-06, + "loss": 0.391, + "step": 30332 + }, + { + "epoch": 0.5631527942284914, + "grad_norm": 0.2570461630821228, + "learning_rate": 8.028793611685635e-06, + "loss": 0.1686, + "step": 30334 + }, + { + "epoch": 0.5631899243659101, + "grad_norm": 0.3125964403152466, + "learning_rate": 8.02765002186996e-06, + "loss": 0.1648, + "step": 30336 + }, + { + "epoch": 0.5632270545033288, + "grad_norm": 0.24024170637130737, + "learning_rate": 8.02650645889206e-06, + "loss": 0.2705, + "step": 30338 + }, + { + "epoch": 0.5632641846407473, + "grad_norm": 0.3164975643157959, + "learning_rate": 8.025362922767497e-06, + "loss": 0.272, + "step": 30340 + }, + { + "epoch": 0.563301314778166, + "grad_norm": 0.3534393608570099, + "learning_rate": 8.02421941351183e-06, + "loss": 0.3129, + "step": 30342 + }, + { + "epoch": 0.5633384449155846, + "grad_norm": 0.5638948082923889, + "learning_rate": 8.02307593114062e-06, + "loss": 0.2103, + "step": 30344 + }, + { + "epoch": 0.5633755750530033, + "grad_norm": 0.2293599545955658, + "learning_rate": 8.021932475669423e-06, + "loss": 0.1865, + "step": 30346 + }, + { + "epoch": 0.5634127051904219, + "grad_norm": 0.390081524848938, + "learning_rate": 8.0207890471138e-06, + "loss": 0.2021, + "step": 30348 + }, + { + "epoch": 0.5634498353278405, + "grad_norm": 0.4304048717021942, + "learning_rate": 8.019645645489308e-06, + "loss": 0.29, + "step": 30350 + }, + { + "epoch": 0.5634869654652592, + "grad_norm": 0.571884274482727, + "learning_rate": 8.01850227081151e-06, + "loss": 0.2853, + "step": 30352 + }, + { + "epoch": 0.5635240956026778, + "grad_norm": 0.4301969110965729, + "learning_rate": 8.01735892309596e-06, + "loss": 0.2862, + "step": 30354 + }, + { + "epoch": 0.5635612257400965, + "grad_norm": 0.2893680930137634, + "learning_rate": 8.016215602358218e-06, + "loss": 0.38, + "step": 30356 + }, + { + "epoch": 0.5635983558775151, + "grad_norm": 0.320781946182251, + "learning_rate": 8.015072308613836e-06, + "loss": 0.4167, + "step": 30358 + }, + { + "epoch": 0.5636354860149337, + "grad_norm": 0.5543734431266785, + "learning_rate": 8.013929041878375e-06, + "loss": 0.393, + "step": 30360 + }, + { + "epoch": 0.5636726161523524, + "grad_norm": 0.37209829688072205, + "learning_rate": 8.012785802167394e-06, + "loss": 0.2306, + "step": 30362 + }, + { + "epoch": 0.563709746289771, + "grad_norm": 0.3098314106464386, + "learning_rate": 8.011642589496442e-06, + "loss": 0.1432, + "step": 30364 + }, + { + "epoch": 0.5637468764271897, + "grad_norm": 0.4104865789413452, + "learning_rate": 8.010499403881079e-06, + "loss": 0.3695, + "step": 30366 + }, + { + "epoch": 0.5637840065646083, + "grad_norm": 0.3670860230922699, + "learning_rate": 8.009356245336865e-06, + "loss": 0.2937, + "step": 30368 + }, + { + "epoch": 0.5638211367020269, + "grad_norm": 0.335248202085495, + "learning_rate": 8.008213113879344e-06, + "loss": 0.0961, + "step": 30370 + }, + { + "epoch": 0.5638582668394456, + "grad_norm": 0.3985885977745056, + "learning_rate": 8.007070009524077e-06, + "loss": 0.2918, + "step": 30372 + }, + { + "epoch": 0.5638953969768642, + "grad_norm": 0.3513612151145935, + "learning_rate": 8.00592693228662e-06, + "loss": 0.285, + "step": 30374 + }, + { + "epoch": 0.5639325271142829, + "grad_norm": 0.4683316648006439, + "learning_rate": 8.004783882182523e-06, + "loss": 0.1828, + "step": 30376 + }, + { + "epoch": 0.5639696572517015, + "grad_norm": 0.19643741846084595, + "learning_rate": 8.00364085922734e-06, + "loss": 0.2809, + "step": 30378 + }, + { + "epoch": 0.5640067873891201, + "grad_norm": 0.5451693534851074, + "learning_rate": 8.002497863436625e-06, + "loss": 0.1489, + "step": 30380 + }, + { + "epoch": 0.5640439175265388, + "grad_norm": 0.3660389482975006, + "learning_rate": 8.001354894825936e-06, + "loss": 0.26, + "step": 30382 + }, + { + "epoch": 0.5640810476639574, + "grad_norm": 0.48008474707603455, + "learning_rate": 8.000211953410816e-06, + "loss": 0.2141, + "step": 30384 + }, + { + "epoch": 0.5641181778013761, + "grad_norm": 0.36755871772766113, + "learning_rate": 7.999069039206822e-06, + "loss": 0.2089, + "step": 30386 + }, + { + "epoch": 0.5641553079387946, + "grad_norm": 0.4247550070285797, + "learning_rate": 7.997926152229505e-06, + "loss": 0.2291, + "step": 30388 + }, + { + "epoch": 0.5641924380762133, + "grad_norm": 0.42695823311805725, + "learning_rate": 7.996783292494415e-06, + "loss": 0.2526, + "step": 30390 + }, + { + "epoch": 0.5642295682136319, + "grad_norm": 0.4390032887458801, + "learning_rate": 7.995640460017103e-06, + "loss": 0.3591, + "step": 30392 + }, + { + "epoch": 0.5642666983510506, + "grad_norm": 0.37121182680130005, + "learning_rate": 7.994497654813126e-06, + "loss": 0.4137, + "step": 30394 + }, + { + "epoch": 0.5643038284884693, + "grad_norm": 0.36116474866867065, + "learning_rate": 7.993354876898026e-06, + "loss": 0.2493, + "step": 30396 + }, + { + "epoch": 0.5643409586258878, + "grad_norm": 0.3571341037750244, + "learning_rate": 7.992212126287355e-06, + "loss": 0.0537, + "step": 30398 + }, + { + "epoch": 0.5643780887633065, + "grad_norm": 0.4802612066268921, + "learning_rate": 7.99106940299666e-06, + "loss": 0.3075, + "step": 30400 + }, + { + "epoch": 0.5644152189007251, + "grad_norm": 0.4084606468677521, + "learning_rate": 7.989926707041495e-06, + "loss": 0.3401, + "step": 30402 + }, + { + "epoch": 0.5644523490381438, + "grad_norm": 0.35473909974098206, + "learning_rate": 7.98878403843741e-06, + "loss": 0.3276, + "step": 30404 + }, + { + "epoch": 0.5644894791755625, + "grad_norm": 0.43000349402427673, + "learning_rate": 7.987641397199948e-06, + "loss": 0.4182, + "step": 30406 + }, + { + "epoch": 0.564526609312981, + "grad_norm": 0.26502829790115356, + "learning_rate": 7.98649878334466e-06, + "loss": 0.3769, + "step": 30408 + }, + { + "epoch": 0.5645637394503997, + "grad_norm": 0.24508610367774963, + "learning_rate": 7.985356196887089e-06, + "loss": 0.2474, + "step": 30410 + }, + { + "epoch": 0.5646008695878183, + "grad_norm": 0.31696271896362305, + "learning_rate": 7.984213637842787e-06, + "loss": 0.3023, + "step": 30412 + }, + { + "epoch": 0.564637999725237, + "grad_norm": 0.352560818195343, + "learning_rate": 7.983071106227299e-06, + "loss": 0.2853, + "step": 30414 + }, + { + "epoch": 0.5646751298626557, + "grad_norm": 0.2814945578575134, + "learning_rate": 7.981928602056173e-06, + "loss": 0.2863, + "step": 30416 + }, + { + "epoch": 0.5647122600000742, + "grad_norm": 0.5061913132667542, + "learning_rate": 7.980786125344952e-06, + "loss": 0.266, + "step": 30418 + }, + { + "epoch": 0.5647493901374929, + "grad_norm": 0.642584502696991, + "learning_rate": 7.979643676109188e-06, + "loss": 0.3441, + "step": 30420 + }, + { + "epoch": 0.5647865202749115, + "grad_norm": 0.3245680034160614, + "learning_rate": 7.978501254364419e-06, + "loss": 0.2654, + "step": 30422 + }, + { + "epoch": 0.5648236504123302, + "grad_norm": 0.4735945761203766, + "learning_rate": 7.97735886012619e-06, + "loss": 0.2697, + "step": 30424 + }, + { + "epoch": 0.5648607805497489, + "grad_norm": 0.5074973702430725, + "learning_rate": 7.976216493410053e-06, + "loss": 0.3992, + "step": 30426 + }, + { + "epoch": 0.5648979106871674, + "grad_norm": 0.7065256237983704, + "learning_rate": 7.975074154231545e-06, + "loss": 0.5596, + "step": 30428 + }, + { + "epoch": 0.5649350408245861, + "grad_norm": 0.6071568131446838, + "learning_rate": 7.973931842606212e-06, + "loss": 0.328, + "step": 30430 + }, + { + "epoch": 0.5649721709620047, + "grad_norm": 0.33255162835121155, + "learning_rate": 7.972789558549601e-06, + "loss": 0.4342, + "step": 30432 + }, + { + "epoch": 0.5650093010994234, + "grad_norm": 0.3089801073074341, + "learning_rate": 7.97164730207725e-06, + "loss": 0.1307, + "step": 30434 + }, + { + "epoch": 0.5650464312368421, + "grad_norm": 0.5067664384841919, + "learning_rate": 7.970505073204702e-06, + "loss": 0.3456, + "step": 30436 + }, + { + "epoch": 0.5650835613742606, + "grad_norm": 0.3979984223842621, + "learning_rate": 7.969362871947503e-06, + "loss": 0.2428, + "step": 30438 + }, + { + "epoch": 0.5651206915116793, + "grad_norm": 0.37151622772216797, + "learning_rate": 7.968220698321191e-06, + "loss": 0.2866, + "step": 30440 + }, + { + "epoch": 0.5651578216490979, + "grad_norm": 0.24620410799980164, + "learning_rate": 7.967078552341312e-06, + "loss": 0.3118, + "step": 30442 + }, + { + "epoch": 0.5651949517865166, + "grad_norm": 0.48776742815971375, + "learning_rate": 7.965936434023405e-06, + "loss": 0.2363, + "step": 30444 + }, + { + "epoch": 0.5652320819239351, + "grad_norm": 0.2626207172870636, + "learning_rate": 7.964794343383007e-06, + "loss": 0.3689, + "step": 30446 + }, + { + "epoch": 0.5652692120613538, + "grad_norm": 0.4751438498497009, + "learning_rate": 7.963652280435665e-06, + "loss": 0.3387, + "step": 30448 + }, + { + "epoch": 0.5653063421987725, + "grad_norm": 0.7058199644088745, + "learning_rate": 7.962510245196913e-06, + "loss": 0.357, + "step": 30450 + }, + { + "epoch": 0.5653434723361911, + "grad_norm": 0.43393072485923767, + "learning_rate": 7.961368237682294e-06, + "loss": 0.1695, + "step": 30452 + }, + { + "epoch": 0.5653806024736098, + "grad_norm": 0.3317243754863739, + "learning_rate": 7.960226257907348e-06, + "loss": 0.2018, + "step": 30454 + }, + { + "epoch": 0.5654177326110283, + "grad_norm": 0.534257173538208, + "learning_rate": 7.959084305887613e-06, + "loss": 0.1866, + "step": 30456 + }, + { + "epoch": 0.565454862748447, + "grad_norm": 0.43892350792884827, + "learning_rate": 7.957942381638628e-06, + "loss": 0.2552, + "step": 30458 + }, + { + "epoch": 0.5654919928858657, + "grad_norm": 0.9451227784156799, + "learning_rate": 7.95680048517593e-06, + "loss": 0.2506, + "step": 30460 + }, + { + "epoch": 0.5655291230232843, + "grad_norm": 0.6164782047271729, + "learning_rate": 7.955658616515058e-06, + "loss": 0.2304, + "step": 30462 + }, + { + "epoch": 0.565566253160703, + "grad_norm": 0.5831804275512695, + "learning_rate": 7.954516775671547e-06, + "loss": 0.2441, + "step": 30464 + }, + { + "epoch": 0.5656033832981215, + "grad_norm": 0.31520023941993713, + "learning_rate": 7.953374962660933e-06, + "loss": 0.2924, + "step": 30466 + }, + { + "epoch": 0.5656405134355402, + "grad_norm": 0.4159213900566101, + "learning_rate": 7.952233177498761e-06, + "loss": 0.3522, + "step": 30468 + }, + { + "epoch": 0.5656776435729589, + "grad_norm": 0.4521883726119995, + "learning_rate": 7.951091420200563e-06, + "loss": 0.3285, + "step": 30470 + }, + { + "epoch": 0.5657147737103775, + "grad_norm": 0.4095653295516968, + "learning_rate": 7.949949690781868e-06, + "loss": 0.3415, + "step": 30472 + }, + { + "epoch": 0.5657519038477962, + "grad_norm": 0.359004944562912, + "learning_rate": 7.948807989258219e-06, + "loss": 0.1731, + "step": 30474 + }, + { + "epoch": 0.5657890339852147, + "grad_norm": 0.5125638246536255, + "learning_rate": 7.947666315645148e-06, + "loss": 0.4333, + "step": 30476 + }, + { + "epoch": 0.5658261641226334, + "grad_norm": 0.2509605586528778, + "learning_rate": 7.946524669958194e-06, + "loss": 0.1732, + "step": 30478 + }, + { + "epoch": 0.5658632942600521, + "grad_norm": 0.45520615577697754, + "learning_rate": 7.945383052212885e-06, + "loss": 0.2306, + "step": 30480 + }, + { + "epoch": 0.5659004243974707, + "grad_norm": 0.45426324009895325, + "learning_rate": 7.944241462424762e-06, + "loss": 0.2353, + "step": 30482 + }, + { + "epoch": 0.5659375545348894, + "grad_norm": 0.4015585482120514, + "learning_rate": 7.943099900609352e-06, + "loss": 0.3062, + "step": 30484 + }, + { + "epoch": 0.5659746846723079, + "grad_norm": 0.5005658864974976, + "learning_rate": 7.941958366782191e-06, + "loss": 0.2741, + "step": 30486 + }, + { + "epoch": 0.5660118148097266, + "grad_norm": 0.5383375287055969, + "learning_rate": 7.940816860958813e-06, + "loss": 0.3576, + "step": 30488 + }, + { + "epoch": 0.5660489449471452, + "grad_norm": 0.3859419822692871, + "learning_rate": 7.939675383154752e-06, + "loss": 0.1459, + "step": 30490 + }, + { + "epoch": 0.5660860750845639, + "grad_norm": 0.5962996482849121, + "learning_rate": 7.938533933385534e-06, + "loss": 0.3759, + "step": 30492 + }, + { + "epoch": 0.5661232052219826, + "grad_norm": 0.3544987142086029, + "learning_rate": 7.937392511666699e-06, + "loss": 0.184, + "step": 30494 + }, + { + "epoch": 0.5661603353594011, + "grad_norm": 0.6087614893913269, + "learning_rate": 7.93625111801377e-06, + "loss": 0.3119, + "step": 30496 + }, + { + "epoch": 0.5661974654968198, + "grad_norm": 0.34794753789901733, + "learning_rate": 7.93510975244228e-06, + "loss": 0.1928, + "step": 30498 + }, + { + "epoch": 0.5662345956342384, + "grad_norm": 0.3439965844154358, + "learning_rate": 7.933968414967763e-06, + "loss": 0.2963, + "step": 30500 + }, + { + "epoch": 0.5662717257716571, + "grad_norm": 0.4580877721309662, + "learning_rate": 7.932827105605749e-06, + "loss": 0.2651, + "step": 30502 + }, + { + "epoch": 0.5663088559090758, + "grad_norm": 0.6592761874198914, + "learning_rate": 7.931685824371765e-06, + "loss": 0.4515, + "step": 30504 + }, + { + "epoch": 0.5663459860464943, + "grad_norm": 0.384457528591156, + "learning_rate": 7.930544571281341e-06, + "loss": 0.1629, + "step": 30506 + }, + { + "epoch": 0.566383116183913, + "grad_norm": 0.5042252540588379, + "learning_rate": 7.92940334635001e-06, + "loss": 0.2389, + "step": 30508 + }, + { + "epoch": 0.5664202463213316, + "grad_norm": 0.2814830541610718, + "learning_rate": 7.928262149593294e-06, + "loss": 0.2782, + "step": 30510 + }, + { + "epoch": 0.5664573764587503, + "grad_norm": 0.43002599477767944, + "learning_rate": 7.927120981026724e-06, + "loss": 0.1498, + "step": 30512 + }, + { + "epoch": 0.566494506596169, + "grad_norm": 0.3772023022174835, + "learning_rate": 7.92597984066583e-06, + "loss": 0.3443, + "step": 30514 + }, + { + "epoch": 0.5665316367335875, + "grad_norm": 0.2985260784626007, + "learning_rate": 7.924838728526136e-06, + "loss": 0.2348, + "step": 30516 + }, + { + "epoch": 0.5665687668710062, + "grad_norm": 0.4417247772216797, + "learning_rate": 7.923697644623171e-06, + "loss": 0.2961, + "step": 30518 + }, + { + "epoch": 0.5666058970084248, + "grad_norm": 0.6379358768463135, + "learning_rate": 7.922556588972468e-06, + "loss": 0.2178, + "step": 30520 + }, + { + "epoch": 0.5666430271458435, + "grad_norm": 0.5376560688018799, + "learning_rate": 7.92141556158954e-06, + "loss": 0.2804, + "step": 30522 + }, + { + "epoch": 0.5666801572832622, + "grad_norm": 0.4958012104034424, + "learning_rate": 7.920274562489925e-06, + "loss": 0.1379, + "step": 30524 + }, + { + "epoch": 0.5667172874206807, + "grad_norm": 0.6264126300811768, + "learning_rate": 7.919133591689141e-06, + "loss": 0.3168, + "step": 30526 + }, + { + "epoch": 0.5667544175580994, + "grad_norm": 0.27815523743629456, + "learning_rate": 7.917992649202715e-06, + "loss": 0.1476, + "step": 30528 + }, + { + "epoch": 0.566791547695518, + "grad_norm": 0.44340285658836365, + "learning_rate": 7.916851735046172e-06, + "loss": 0.2596, + "step": 30530 + }, + { + "epoch": 0.5668286778329367, + "grad_norm": 0.4616158902645111, + "learning_rate": 7.915710849235043e-06, + "loss": 0.3706, + "step": 30532 + }, + { + "epoch": 0.5668658079703554, + "grad_norm": 0.2583787739276886, + "learning_rate": 7.914569991784844e-06, + "loss": 0.214, + "step": 30534 + }, + { + "epoch": 0.5669029381077739, + "grad_norm": 0.40340855717658997, + "learning_rate": 7.913429162711098e-06, + "loss": 0.2585, + "step": 30536 + }, + { + "epoch": 0.5669400682451926, + "grad_norm": 0.5663079619407654, + "learning_rate": 7.912288362029331e-06, + "loss": 0.2748, + "step": 30538 + }, + { + "epoch": 0.5669771983826112, + "grad_norm": 0.5247089862823486, + "learning_rate": 7.911147589755066e-06, + "loss": 0.1839, + "step": 30540 + }, + { + "epoch": 0.5670143285200299, + "grad_norm": 0.3321037292480469, + "learning_rate": 7.910006845903829e-06, + "loss": 0.3876, + "step": 30542 + }, + { + "epoch": 0.5670514586574484, + "grad_norm": 0.45910540223121643, + "learning_rate": 7.908866130491135e-06, + "loss": 0.2434, + "step": 30544 + }, + { + "epoch": 0.5670885887948671, + "grad_norm": 0.3489314615726471, + "learning_rate": 7.907725443532513e-06, + "loss": 0.1678, + "step": 30546 + }, + { + "epoch": 0.5671257189322858, + "grad_norm": 0.40685340762138367, + "learning_rate": 7.906584785043477e-06, + "loss": 0.1489, + "step": 30548 + }, + { + "epoch": 0.5671628490697044, + "grad_norm": 0.3494434058666229, + "learning_rate": 7.905444155039553e-06, + "loss": 0.2871, + "step": 30550 + }, + { + "epoch": 0.5671999792071231, + "grad_norm": 0.3340587913990021, + "learning_rate": 7.904303553536258e-06, + "loss": 0.2398, + "step": 30552 + }, + { + "epoch": 0.5672371093445416, + "grad_norm": 0.3975366950035095, + "learning_rate": 7.903162980549118e-06, + "loss": 0.2688, + "step": 30554 + }, + { + "epoch": 0.5672742394819603, + "grad_norm": 0.4589778184890747, + "learning_rate": 7.902022436093646e-06, + "loss": 0.3695, + "step": 30556 + }, + { + "epoch": 0.567311369619379, + "grad_norm": 0.4233415722846985, + "learning_rate": 7.900881920185368e-06, + "loss": 0.2856, + "step": 30558 + }, + { + "epoch": 0.5673484997567976, + "grad_norm": 0.27418792247772217, + "learning_rate": 7.899741432839796e-06, + "loss": 0.3356, + "step": 30560 + }, + { + "epoch": 0.5673856298942163, + "grad_norm": 0.590097963809967, + "learning_rate": 7.898600974072454e-06, + "loss": 0.162, + "step": 30562 + }, + { + "epoch": 0.5674227600316348, + "grad_norm": 0.3199407160282135, + "learning_rate": 7.897460543898859e-06, + "loss": 0.3109, + "step": 30564 + }, + { + "epoch": 0.5674598901690535, + "grad_norm": 0.2804213762283325, + "learning_rate": 7.896320142334524e-06, + "loss": 0.2686, + "step": 30566 + }, + { + "epoch": 0.5674970203064722, + "grad_norm": 0.3871452510356903, + "learning_rate": 7.895179769394972e-06, + "loss": 0.2359, + "step": 30568 + }, + { + "epoch": 0.5675341504438908, + "grad_norm": 0.4456593692302704, + "learning_rate": 7.894039425095724e-06, + "loss": 0.261, + "step": 30570 + }, + { + "epoch": 0.5675712805813095, + "grad_norm": 0.3904845118522644, + "learning_rate": 7.892899109452287e-06, + "loss": 0.4949, + "step": 30572 + }, + { + "epoch": 0.567608410718728, + "grad_norm": 0.21100293099880219, + "learning_rate": 7.89175882248018e-06, + "loss": 0.0518, + "step": 30574 + }, + { + "epoch": 0.5676455408561467, + "grad_norm": 0.3031800389289856, + "learning_rate": 7.890618564194925e-06, + "loss": 0.1883, + "step": 30576 + }, + { + "epoch": 0.5676826709935654, + "grad_norm": 0.3527359664440155, + "learning_rate": 7.88947833461203e-06, + "loss": 0.3409, + "step": 30578 + }, + { + "epoch": 0.567719801130984, + "grad_norm": 0.45798730850219727, + "learning_rate": 7.888338133747012e-06, + "loss": 0.4397, + "step": 30580 + }, + { + "epoch": 0.5677569312684027, + "grad_norm": 0.327743798494339, + "learning_rate": 7.887197961615387e-06, + "loss": 0.2987, + "step": 30582 + }, + { + "epoch": 0.5677940614058212, + "grad_norm": 0.4482690691947937, + "learning_rate": 7.886057818232675e-06, + "loss": 0.4611, + "step": 30584 + }, + { + "epoch": 0.5678311915432399, + "grad_norm": 0.2777673304080963, + "learning_rate": 7.88491770361438e-06, + "loss": 0.2406, + "step": 30586 + }, + { + "epoch": 0.5678683216806586, + "grad_norm": 0.5364262461662292, + "learning_rate": 7.883777617776019e-06, + "loss": 0.1179, + "step": 30588 + }, + { + "epoch": 0.5679054518180772, + "grad_norm": 0.43481430411338806, + "learning_rate": 7.882637560733105e-06, + "loss": 0.2247, + "step": 30590 + }, + { + "epoch": 0.5679425819554959, + "grad_norm": 0.2687269449234009, + "learning_rate": 7.881497532501153e-06, + "loss": 0.2994, + "step": 30592 + }, + { + "epoch": 0.5679797120929144, + "grad_norm": 0.31993377208709717, + "learning_rate": 7.880357533095673e-06, + "loss": 0.2845, + "step": 30594 + }, + { + "epoch": 0.5680168422303331, + "grad_norm": 0.3220365643501282, + "learning_rate": 7.87921756253218e-06, + "loss": 0.2595, + "step": 30596 + }, + { + "epoch": 0.5680539723677517, + "grad_norm": 0.3849681615829468, + "learning_rate": 7.878077620826184e-06, + "loss": 0.2725, + "step": 30598 + }, + { + "epoch": 0.5680911025051704, + "grad_norm": 0.2802127003669739, + "learning_rate": 7.876937707993192e-06, + "loss": 0.2563, + "step": 30600 + }, + { + "epoch": 0.568128232642589, + "grad_norm": 0.3694424033164978, + "learning_rate": 7.87579782404872e-06, + "loss": 0.257, + "step": 30602 + }, + { + "epoch": 0.5681653627800076, + "grad_norm": 0.2947586178779602, + "learning_rate": 7.874657969008277e-06, + "loss": 0.3143, + "step": 30604 + }, + { + "epoch": 0.5682024929174263, + "grad_norm": 0.32467716932296753, + "learning_rate": 7.873518142887373e-06, + "loss": 0.3619, + "step": 30606 + }, + { + "epoch": 0.5682396230548449, + "grad_norm": 0.30035674571990967, + "learning_rate": 7.87237834570152e-06, + "loss": 0.1706, + "step": 30608 + }, + { + "epoch": 0.5682767531922636, + "grad_norm": 0.3676237165927887, + "learning_rate": 7.871238577466222e-06, + "loss": 0.2987, + "step": 30610 + }, + { + "epoch": 0.5683138833296822, + "grad_norm": 0.2749525308609009, + "learning_rate": 7.870098838196992e-06, + "loss": 0.3511, + "step": 30612 + }, + { + "epoch": 0.5683510134671008, + "grad_norm": 0.5221290588378906, + "learning_rate": 7.868959127909334e-06, + "loss": 0.3261, + "step": 30614 + }, + { + "epoch": 0.5683881436045195, + "grad_norm": 0.44048845767974854, + "learning_rate": 7.867819446618762e-06, + "loss": 0.3985, + "step": 30616 + }, + { + "epoch": 0.5684252737419381, + "grad_norm": 0.3135998249053955, + "learning_rate": 7.866679794340779e-06, + "loss": 0.3617, + "step": 30618 + }, + { + "epoch": 0.5684624038793568, + "grad_norm": 0.40057504177093506, + "learning_rate": 7.865540171090895e-06, + "loss": 0.3769, + "step": 30620 + }, + { + "epoch": 0.5684995340167754, + "grad_norm": 0.35424596071243286, + "learning_rate": 7.864400576884618e-06, + "loss": 0.2173, + "step": 30622 + }, + { + "epoch": 0.568536664154194, + "grad_norm": 0.4932957887649536, + "learning_rate": 7.863261011737449e-06, + "loss": 0.1415, + "step": 30624 + }, + { + "epoch": 0.5685737942916127, + "grad_norm": 0.3345850706100464, + "learning_rate": 7.862121475664897e-06, + "loss": 0.3222, + "step": 30626 + }, + { + "epoch": 0.5686109244290313, + "grad_norm": 0.32648009061813354, + "learning_rate": 7.86098196868247e-06, + "loss": 0.4924, + "step": 30628 + }, + { + "epoch": 0.56864805456645, + "grad_norm": 0.42507317662239075, + "learning_rate": 7.85984249080567e-06, + "loss": 0.2344, + "step": 30630 + }, + { + "epoch": 0.5686851847038686, + "grad_norm": 0.43238961696624756, + "learning_rate": 7.858703042050002e-06, + "loss": 0.2487, + "step": 30632 + }, + { + "epoch": 0.5687223148412872, + "grad_norm": 0.42522186040878296, + "learning_rate": 7.857563622430977e-06, + "loss": 0.2099, + "step": 30634 + }, + { + "epoch": 0.5687594449787059, + "grad_norm": 0.5822479128837585, + "learning_rate": 7.85642423196409e-06, + "loss": 0.3288, + "step": 30636 + }, + { + "epoch": 0.5687965751161245, + "grad_norm": 0.46770092844963074, + "learning_rate": 7.855284870664847e-06, + "loss": 0.4783, + "step": 30638 + }, + { + "epoch": 0.5688337052535432, + "grad_norm": 0.8809147477149963, + "learning_rate": 7.854145538548755e-06, + "loss": 0.1386, + "step": 30640 + }, + { + "epoch": 0.5688708353909617, + "grad_norm": 0.38927674293518066, + "learning_rate": 7.853006235631314e-06, + "loss": 0.327, + "step": 30642 + }, + { + "epoch": 0.5689079655283804, + "grad_norm": 0.34164538979530334, + "learning_rate": 7.851866961928025e-06, + "loss": 0.4552, + "step": 30644 + }, + { + "epoch": 0.5689450956657991, + "grad_norm": 0.27964499592781067, + "learning_rate": 7.850727717454399e-06, + "loss": 0.1415, + "step": 30646 + }, + { + "epoch": 0.5689822258032177, + "grad_norm": 0.35941168665885925, + "learning_rate": 7.849588502225925e-06, + "loss": 0.257, + "step": 30648 + }, + { + "epoch": 0.5690193559406364, + "grad_norm": 0.12937435507774353, + "learning_rate": 7.848449316258113e-06, + "loss": 0.2192, + "step": 30650 + }, + { + "epoch": 0.5690564860780549, + "grad_norm": 0.37457650899887085, + "learning_rate": 7.847310159566458e-06, + "loss": 0.2718, + "step": 30652 + }, + { + "epoch": 0.5690936162154736, + "grad_norm": 0.5210898518562317, + "learning_rate": 7.846171032166464e-06, + "loss": 0.142, + "step": 30654 + }, + { + "epoch": 0.5691307463528923, + "grad_norm": 0.3666442036628723, + "learning_rate": 7.845031934073631e-06, + "loss": 0.3833, + "step": 30656 + }, + { + "epoch": 0.5691678764903109, + "grad_norm": 0.3594425916671753, + "learning_rate": 7.843892865303464e-06, + "loss": 0.1405, + "step": 30658 + }, + { + "epoch": 0.5692050066277295, + "grad_norm": 0.24422810971736908, + "learning_rate": 7.84275382587145e-06, + "loss": 0.2049, + "step": 30660 + }, + { + "epoch": 0.5692421367651481, + "grad_norm": 0.31878241896629333, + "learning_rate": 7.8416148157931e-06, + "loss": 0.1513, + "step": 30662 + }, + { + "epoch": 0.5692792669025668, + "grad_norm": 0.31119266152381897, + "learning_rate": 7.840475835083903e-06, + "loss": 0.3645, + "step": 30664 + }, + { + "epoch": 0.5693163970399855, + "grad_norm": 0.24074499309062958, + "learning_rate": 7.839336883759362e-06, + "loss": 0.211, + "step": 30666 + }, + { + "epoch": 0.5693535271774041, + "grad_norm": 0.5032479763031006, + "learning_rate": 7.838197961834974e-06, + "loss": 0.2564, + "step": 30668 + }, + { + "epoch": 0.5693906573148227, + "grad_norm": 0.40660709142684937, + "learning_rate": 7.837059069326238e-06, + "loss": 0.3445, + "step": 30670 + }, + { + "epoch": 0.5694277874522413, + "grad_norm": 0.31265658140182495, + "learning_rate": 7.835920206248652e-06, + "loss": 0.3057, + "step": 30672 + }, + { + "epoch": 0.56946491758966, + "grad_norm": 0.4856959283351898, + "learning_rate": 7.834781372617706e-06, + "loss": 0.3463, + "step": 30674 + }, + { + "epoch": 0.5695020477270787, + "grad_norm": 0.49358445405960083, + "learning_rate": 7.833642568448899e-06, + "loss": 0.3, + "step": 30676 + }, + { + "epoch": 0.5695391778644973, + "grad_norm": 0.19963692128658295, + "learning_rate": 7.832503793757729e-06, + "loss": 0.2208, + "step": 30678 + }, + { + "epoch": 0.5695763080019159, + "grad_norm": 0.3275558650493622, + "learning_rate": 7.831365048559691e-06, + "loss": 0.0805, + "step": 30680 + }, + { + "epoch": 0.5696134381393345, + "grad_norm": 0.35259246826171875, + "learning_rate": 7.830226332870277e-06, + "loss": 0.2964, + "step": 30682 + }, + { + "epoch": 0.5696505682767532, + "grad_norm": 0.4486176073551178, + "learning_rate": 7.829087646704987e-06, + "loss": 0.18, + "step": 30684 + }, + { + "epoch": 0.5696876984141719, + "grad_norm": 0.4811548888683319, + "learning_rate": 7.827948990079309e-06, + "loss": 0.2291, + "step": 30686 + }, + { + "epoch": 0.5697248285515905, + "grad_norm": 0.731239914894104, + "learning_rate": 7.826810363008736e-06, + "loss": 0.313, + "step": 30688 + }, + { + "epoch": 0.5697619586890091, + "grad_norm": 0.2572113275527954, + "learning_rate": 7.825671765508766e-06, + "loss": 0.3548, + "step": 30690 + }, + { + "epoch": 0.5697990888264277, + "grad_norm": 0.30911195278167725, + "learning_rate": 7.824533197594895e-06, + "loss": 0.4283, + "step": 30692 + }, + { + "epoch": 0.5698362189638464, + "grad_norm": 0.2597065269947052, + "learning_rate": 7.823394659282606e-06, + "loss": 0.3744, + "step": 30694 + }, + { + "epoch": 0.569873349101265, + "grad_norm": 0.63627028465271, + "learning_rate": 7.8222561505874e-06, + "loss": 0.2022, + "step": 30696 + }, + { + "epoch": 0.5699104792386837, + "grad_norm": 0.3540831506252289, + "learning_rate": 7.821117671524763e-06, + "loss": 0.2381, + "step": 30698 + }, + { + "epoch": 0.5699476093761023, + "grad_norm": 0.4699212908744812, + "learning_rate": 7.819979222110186e-06, + "loss": 0.2419, + "step": 30700 + }, + { + "epoch": 0.5699847395135209, + "grad_norm": 0.33074215054512024, + "learning_rate": 7.818840802359164e-06, + "loss": 0.526, + "step": 30702 + }, + { + "epoch": 0.5700218696509396, + "grad_norm": 0.7920529842376709, + "learning_rate": 7.817702412287185e-06, + "loss": 0.4891, + "step": 30704 + }, + { + "epoch": 0.5700589997883582, + "grad_norm": 0.5426456332206726, + "learning_rate": 7.816564051909737e-06, + "loss": 0.2399, + "step": 30706 + }, + { + "epoch": 0.5700961299257769, + "grad_norm": 0.40793612599372864, + "learning_rate": 7.815425721242313e-06, + "loss": 0.2963, + "step": 30708 + }, + { + "epoch": 0.5701332600631955, + "grad_norm": 0.37187185883522034, + "learning_rate": 7.814287420300407e-06, + "loss": 0.3898, + "step": 30710 + }, + { + "epoch": 0.5701703902006141, + "grad_norm": 0.4627900719642639, + "learning_rate": 7.813149149099495e-06, + "loss": 0.223, + "step": 30712 + }, + { + "epoch": 0.5702075203380328, + "grad_norm": 0.40232518315315247, + "learning_rate": 7.812010907655078e-06, + "loss": 0.146, + "step": 30714 + }, + { + "epoch": 0.5702446504754514, + "grad_norm": 0.4002465009689331, + "learning_rate": 7.810872695982636e-06, + "loss": 0.2709, + "step": 30716 + }, + { + "epoch": 0.57028178061287, + "grad_norm": 0.2571702003479004, + "learning_rate": 7.809734514097657e-06, + "loss": 0.1335, + "step": 30718 + }, + { + "epoch": 0.5703189107502887, + "grad_norm": 0.2706353962421417, + "learning_rate": 7.808596362015633e-06, + "loss": 0.204, + "step": 30720 + }, + { + "epoch": 0.5703560408877073, + "grad_norm": 0.3100935220718384, + "learning_rate": 7.807458239752053e-06, + "loss": 0.2623, + "step": 30722 + }, + { + "epoch": 0.570393171025126, + "grad_norm": 0.3397640883922577, + "learning_rate": 7.806320147322396e-06, + "loss": 0.2088, + "step": 30724 + }, + { + "epoch": 0.5704303011625446, + "grad_norm": 0.26328396797180176, + "learning_rate": 7.805182084742148e-06, + "loss": 0.1558, + "step": 30726 + }, + { + "epoch": 0.5704674312999632, + "grad_norm": 0.4029276669025421, + "learning_rate": 7.804044052026799e-06, + "loss": 0.3402, + "step": 30728 + }, + { + "epoch": 0.5705045614373819, + "grad_norm": 0.286686509847641, + "learning_rate": 7.802906049191832e-06, + "loss": 0.3206, + "step": 30730 + }, + { + "epoch": 0.5705416915748005, + "grad_norm": 0.49031952023506165, + "learning_rate": 7.801768076252735e-06, + "loss": 0.233, + "step": 30732 + }, + { + "epoch": 0.5705788217122192, + "grad_norm": 0.7933528423309326, + "learning_rate": 7.80063013322499e-06, + "loss": 0.1482, + "step": 30734 + }, + { + "epoch": 0.5706159518496378, + "grad_norm": 0.4536634385585785, + "learning_rate": 7.799492220124082e-06, + "loss": 0.2074, + "step": 30736 + }, + { + "epoch": 0.5706530819870564, + "grad_norm": 0.24374347925186157, + "learning_rate": 7.798354336965489e-06, + "loss": 0.3752, + "step": 30738 + }, + { + "epoch": 0.5706902121244751, + "grad_norm": 0.517255961894989, + "learning_rate": 7.7972164837647e-06, + "loss": 0.2132, + "step": 30740 + }, + { + "epoch": 0.5707273422618937, + "grad_norm": 0.29550403356552124, + "learning_rate": 7.796078660537197e-06, + "loss": 0.1578, + "step": 30742 + }, + { + "epoch": 0.5707644723993124, + "grad_norm": 0.30857229232788086, + "learning_rate": 7.794940867298465e-06, + "loss": 0.3103, + "step": 30744 + }, + { + "epoch": 0.570801602536731, + "grad_norm": 0.383404403924942, + "learning_rate": 7.793803104063979e-06, + "loss": 0.2005, + "step": 30746 + }, + { + "epoch": 0.5708387326741496, + "grad_norm": 0.3568408191204071, + "learning_rate": 7.792665370849229e-06, + "loss": 0.2535, + "step": 30748 + }, + { + "epoch": 0.5708758628115682, + "grad_norm": 0.3433665633201599, + "learning_rate": 7.791527667669687e-06, + "loss": 0.2532, + "step": 30750 + }, + { + "epoch": 0.5709129929489869, + "grad_norm": 0.3389299809932709, + "learning_rate": 7.790389994540839e-06, + "loss": 0.2057, + "step": 30752 + }, + { + "epoch": 0.5709501230864056, + "grad_norm": 0.44569307565689087, + "learning_rate": 7.789252351478167e-06, + "loss": 0.1781, + "step": 30754 + }, + { + "epoch": 0.5709872532238242, + "grad_norm": 0.34246334433555603, + "learning_rate": 7.788114738497146e-06, + "loss": 0.3095, + "step": 30756 + }, + { + "epoch": 0.5710243833612428, + "grad_norm": 0.3768478333950043, + "learning_rate": 7.786977155613258e-06, + "loss": 0.2001, + "step": 30758 + }, + { + "epoch": 0.5710615134986614, + "grad_norm": 0.42460960149765015, + "learning_rate": 7.785839602841986e-06, + "loss": 0.2922, + "step": 30760 + }, + { + "epoch": 0.5710986436360801, + "grad_norm": 0.48252469301223755, + "learning_rate": 7.784702080198801e-06, + "loss": 0.4393, + "step": 30762 + }, + { + "epoch": 0.5711357737734988, + "grad_norm": 0.2758078873157501, + "learning_rate": 7.783564587699185e-06, + "loss": 0.3501, + "step": 30764 + }, + { + "epoch": 0.5711729039109174, + "grad_norm": 0.3760770559310913, + "learning_rate": 7.782427125358619e-06, + "loss": 0.4444, + "step": 30766 + }, + { + "epoch": 0.571210034048336, + "grad_norm": 0.28239133954048157, + "learning_rate": 7.781289693192575e-06, + "loss": 0.1436, + "step": 30768 + }, + { + "epoch": 0.5712471641857546, + "grad_norm": 0.3897975981235504, + "learning_rate": 7.78015229121653e-06, + "loss": 0.207, + "step": 30770 + }, + { + "epoch": 0.5712842943231733, + "grad_norm": 0.31077486276626587, + "learning_rate": 7.779014919445971e-06, + "loss": 0.2058, + "step": 30772 + }, + { + "epoch": 0.571321424460592, + "grad_norm": 0.4491395652294159, + "learning_rate": 7.77787757789636e-06, + "loss": 0.1487, + "step": 30774 + }, + { + "epoch": 0.5713585545980105, + "grad_norm": 0.3883741497993469, + "learning_rate": 7.776740266583179e-06, + "loss": 0.3554, + "step": 30776 + }, + { + "epoch": 0.5713956847354292, + "grad_norm": 0.3194393217563629, + "learning_rate": 7.775602985521907e-06, + "loss": 0.3588, + "step": 30778 + }, + { + "epoch": 0.5714328148728478, + "grad_norm": 0.3997102379798889, + "learning_rate": 7.774465734728012e-06, + "loss": 0.2643, + "step": 30780 + }, + { + "epoch": 0.5714699450102665, + "grad_norm": 0.3605523705482483, + "learning_rate": 7.773328514216972e-06, + "loss": 0.2143, + "step": 30782 + }, + { + "epoch": 0.5715070751476852, + "grad_norm": 0.467251718044281, + "learning_rate": 7.772191324004267e-06, + "loss": 0.2502, + "step": 30784 + }, + { + "epoch": 0.5715442052851037, + "grad_norm": 0.661156952381134, + "learning_rate": 7.771054164105361e-06, + "loss": 0.3988, + "step": 30786 + }, + { + "epoch": 0.5715813354225224, + "grad_norm": 0.37096482515335083, + "learning_rate": 7.769917034535732e-06, + "loss": 0.4458, + "step": 30788 + }, + { + "epoch": 0.571618465559941, + "grad_norm": 0.3853954076766968, + "learning_rate": 7.76877993531085e-06, + "loss": 0.1391, + "step": 30790 + }, + { + "epoch": 0.5716555956973597, + "grad_norm": 0.4599910378456116, + "learning_rate": 7.76764286644619e-06, + "loss": 0.1767, + "step": 30792 + }, + { + "epoch": 0.5716927258347783, + "grad_norm": 0.2952060401439667, + "learning_rate": 7.766505827957224e-06, + "loss": 0.277, + "step": 30794 + }, + { + "epoch": 0.5717298559721969, + "grad_norm": 0.4265202581882477, + "learning_rate": 7.765368819859427e-06, + "loss": 0.172, + "step": 30796 + }, + { + "epoch": 0.5717669861096156, + "grad_norm": 0.5990549921989441, + "learning_rate": 7.764231842168267e-06, + "loss": 0.4412, + "step": 30798 + }, + { + "epoch": 0.5718041162470342, + "grad_norm": 0.33042195439338684, + "learning_rate": 7.763094894899212e-06, + "loss": 0.4162, + "step": 30800 + }, + { + "epoch": 0.5718412463844529, + "grad_norm": 0.3797372579574585, + "learning_rate": 7.761957978067735e-06, + "loss": 0.2773, + "step": 30802 + }, + { + "epoch": 0.5718783765218715, + "grad_norm": 0.4291931390762329, + "learning_rate": 7.760821091689307e-06, + "loss": 0.4347, + "step": 30804 + }, + { + "epoch": 0.5719155066592901, + "grad_norm": 0.2676706910133362, + "learning_rate": 7.759684235779394e-06, + "loss": 0.2687, + "step": 30806 + }, + { + "epoch": 0.5719526367967088, + "grad_norm": 0.43203699588775635, + "learning_rate": 7.758547410353474e-06, + "loss": 0.4544, + "step": 30808 + }, + { + "epoch": 0.5719897669341274, + "grad_norm": 0.5756469368934631, + "learning_rate": 7.75741061542701e-06, + "loss": 0.0833, + "step": 30810 + }, + { + "epoch": 0.5720268970715461, + "grad_norm": 0.713797390460968, + "learning_rate": 7.756273851015464e-06, + "loss": 0.2146, + "step": 30812 + }, + { + "epoch": 0.5720640272089647, + "grad_norm": 0.2435319572687149, + "learning_rate": 7.755137117134312e-06, + "loss": 0.2397, + "step": 30814 + }, + { + "epoch": 0.5721011573463833, + "grad_norm": 0.49905359745025635, + "learning_rate": 7.75400041379902e-06, + "loss": 0.2173, + "step": 30816 + }, + { + "epoch": 0.572138287483802, + "grad_norm": 0.27993685007095337, + "learning_rate": 7.752863741025057e-06, + "loss": 0.3293, + "step": 30818 + }, + { + "epoch": 0.5721754176212206, + "grad_norm": 0.33293116092681885, + "learning_rate": 7.751727098827885e-06, + "loss": 0.1664, + "step": 30820 + }, + { + "epoch": 0.5722125477586393, + "grad_norm": 0.5058616995811462, + "learning_rate": 7.750590487222978e-06, + "loss": 0.3322, + "step": 30822 + }, + { + "epoch": 0.5722496778960579, + "grad_norm": 0.2833290696144104, + "learning_rate": 7.74945390622579e-06, + "loss": 0.2085, + "step": 30824 + }, + { + "epoch": 0.5722868080334765, + "grad_norm": 0.645182728767395, + "learning_rate": 7.748317355851795e-06, + "loss": 0.2779, + "step": 30826 + }, + { + "epoch": 0.5723239381708952, + "grad_norm": 0.4296180009841919, + "learning_rate": 7.747180836116456e-06, + "loss": 0.1074, + "step": 30828 + }, + { + "epoch": 0.5723610683083138, + "grad_norm": 0.677361011505127, + "learning_rate": 7.74604434703524e-06, + "loss": 0.4217, + "step": 30830 + }, + { + "epoch": 0.5723981984457325, + "grad_norm": 0.4936288893222809, + "learning_rate": 7.744907888623607e-06, + "loss": 0.1996, + "step": 30832 + }, + { + "epoch": 0.572435328583151, + "grad_norm": 0.5426415205001831, + "learning_rate": 7.743771460897023e-06, + "loss": 0.3049, + "step": 30834 + }, + { + "epoch": 0.5724724587205697, + "grad_norm": 0.3194630444049835, + "learning_rate": 7.742635063870955e-06, + "loss": 0.2884, + "step": 30836 + }, + { + "epoch": 0.5725095888579884, + "grad_norm": 0.33115580677986145, + "learning_rate": 7.741498697560858e-06, + "loss": 0.1595, + "step": 30838 + }, + { + "epoch": 0.572546718995407, + "grad_norm": 0.3711857497692108, + "learning_rate": 7.7403623619822e-06, + "loss": 0.177, + "step": 30840 + }, + { + "epoch": 0.5725838491328257, + "grad_norm": 0.5087011456489563, + "learning_rate": 7.739226057150441e-06, + "loss": 0.1781, + "step": 30842 + }, + { + "epoch": 0.5726209792702442, + "grad_norm": 0.3291945159435272, + "learning_rate": 7.738089783081045e-06, + "loss": 0.2766, + "step": 30844 + }, + { + "epoch": 0.5726581094076629, + "grad_norm": 0.5278339385986328, + "learning_rate": 7.73695353978947e-06, + "loss": 0.3735, + "step": 30846 + }, + { + "epoch": 0.5726952395450815, + "grad_norm": 0.34606456756591797, + "learning_rate": 7.735817327291182e-06, + "loss": 0.1887, + "step": 30848 + }, + { + "epoch": 0.5727323696825002, + "grad_norm": 0.35968929529190063, + "learning_rate": 7.734681145601635e-06, + "loss": 0.169, + "step": 30850 + }, + { + "epoch": 0.5727694998199189, + "grad_norm": 0.4294825792312622, + "learning_rate": 7.733544994736295e-06, + "loss": 0.3571, + "step": 30852 + }, + { + "epoch": 0.5728066299573374, + "grad_norm": 0.3126409649848938, + "learning_rate": 7.732408874710615e-06, + "loss": 0.0842, + "step": 30854 + }, + { + "epoch": 0.5728437600947561, + "grad_norm": 0.32681673765182495, + "learning_rate": 7.731272785540058e-06, + "loss": 0.286, + "step": 30856 + }, + { + "epoch": 0.5728808902321747, + "grad_norm": 0.40448275208473206, + "learning_rate": 7.730136727240085e-06, + "loss": 0.3281, + "step": 30858 + }, + { + "epoch": 0.5729180203695934, + "grad_norm": 0.8401007652282715, + "learning_rate": 7.729000699826155e-06, + "loss": 0.3482, + "step": 30860 + }, + { + "epoch": 0.5729551505070121, + "grad_norm": 0.32294583320617676, + "learning_rate": 7.72786470331372e-06, + "loss": 0.1319, + "step": 30862 + }, + { + "epoch": 0.5729922806444306, + "grad_norm": 0.4724728465080261, + "learning_rate": 7.72672873771824e-06, + "loss": 0.1826, + "step": 30864 + }, + { + "epoch": 0.5730294107818493, + "grad_norm": 0.38687512278556824, + "learning_rate": 7.725592803055172e-06, + "loss": 0.2544, + "step": 30866 + }, + { + "epoch": 0.5730665409192679, + "grad_norm": 0.3796022832393646, + "learning_rate": 7.724456899339974e-06, + "loss": 0.4137, + "step": 30868 + }, + { + "epoch": 0.5731036710566866, + "grad_norm": 0.29786425828933716, + "learning_rate": 7.723321026588102e-06, + "loss": 0.1833, + "step": 30870 + }, + { + "epoch": 0.5731408011941053, + "grad_norm": 0.663848340511322, + "learning_rate": 7.722185184815011e-06, + "loss": 0.0775, + "step": 30872 + }, + { + "epoch": 0.5731779313315238, + "grad_norm": 0.43506842851638794, + "learning_rate": 7.72104937403616e-06, + "loss": 0.2182, + "step": 30874 + }, + { + "epoch": 0.5732150614689425, + "grad_norm": 0.3086608052253723, + "learning_rate": 7.719913594266997e-06, + "loss": 0.3267, + "step": 30876 + }, + { + "epoch": 0.5732521916063611, + "grad_norm": 0.5191611647605896, + "learning_rate": 7.71877784552298e-06, + "loss": 0.1985, + "step": 30878 + }, + { + "epoch": 0.5732893217437798, + "grad_norm": 0.2562870681285858, + "learning_rate": 7.717642127819564e-06, + "loss": 0.5357, + "step": 30880 + }, + { + "epoch": 0.5733264518811985, + "grad_norm": 0.5959519743919373, + "learning_rate": 7.716506441172204e-06, + "loss": 0.2628, + "step": 30882 + }, + { + "epoch": 0.573363582018617, + "grad_norm": 0.8156997561454773, + "learning_rate": 7.71537078559635e-06, + "loss": 0.1958, + "step": 30884 + }, + { + "epoch": 0.5734007121560357, + "grad_norm": 0.5089747309684753, + "learning_rate": 7.71423516110746e-06, + "loss": 0.2883, + "step": 30886 + }, + { + "epoch": 0.5734378422934543, + "grad_norm": 0.5371935963630676, + "learning_rate": 7.713099567720978e-06, + "loss": 0.2678, + "step": 30888 + }, + { + "epoch": 0.573474972430873, + "grad_norm": 0.5082718729972839, + "learning_rate": 7.711964005452361e-06, + "loss": 0.3693, + "step": 30890 + }, + { + "epoch": 0.5735121025682917, + "grad_norm": 0.39806991815567017, + "learning_rate": 7.710828474317064e-06, + "loss": 0.3454, + "step": 30892 + }, + { + "epoch": 0.5735492327057102, + "grad_norm": 0.27226024866104126, + "learning_rate": 7.70969297433053e-06, + "loss": 0.2292, + "step": 30894 + }, + { + "epoch": 0.5735863628431289, + "grad_norm": 0.44892653822898865, + "learning_rate": 7.708557505508216e-06, + "loss": 0.3301, + "step": 30896 + }, + { + "epoch": 0.5736234929805475, + "grad_norm": 0.32208141684532166, + "learning_rate": 7.707422067865576e-06, + "loss": 0.235, + "step": 30898 + }, + { + "epoch": 0.5736606231179662, + "grad_norm": 0.39069029688835144, + "learning_rate": 7.706286661418048e-06, + "loss": 0.2074, + "step": 30900 + }, + { + "epoch": 0.5736977532553847, + "grad_norm": 0.4110138416290283, + "learning_rate": 7.705151286181089e-06, + "loss": 0.1847, + "step": 30902 + }, + { + "epoch": 0.5737348833928034, + "grad_norm": 0.4770754277706146, + "learning_rate": 7.704015942170148e-06, + "loss": 0.3258, + "step": 30904 + }, + { + "epoch": 0.5737720135302221, + "grad_norm": 0.6607776284217834, + "learning_rate": 7.702880629400673e-06, + "loss": 0.4048, + "step": 30906 + }, + { + "epoch": 0.5738091436676407, + "grad_norm": 0.4549581706523895, + "learning_rate": 7.701745347888111e-06, + "loss": 0.2049, + "step": 30908 + }, + { + "epoch": 0.5738462738050594, + "grad_norm": 0.28146395087242126, + "learning_rate": 7.700610097647911e-06, + "loss": 0.1322, + "step": 30910 + }, + { + "epoch": 0.5738834039424779, + "grad_norm": 0.4381413161754608, + "learning_rate": 7.699474878695524e-06, + "loss": 0.4039, + "step": 30912 + }, + { + "epoch": 0.5739205340798966, + "grad_norm": 0.35600990056991577, + "learning_rate": 7.698339691046387e-06, + "loss": 0.2214, + "step": 30914 + }, + { + "epoch": 0.5739576642173153, + "grad_norm": 0.3268638551235199, + "learning_rate": 7.69720453471596e-06, + "loss": 0.1793, + "step": 30916 + }, + { + "epoch": 0.5739947943547339, + "grad_norm": 0.5677059888839722, + "learning_rate": 7.696069409719675e-06, + "loss": 0.228, + "step": 30918 + }, + { + "epoch": 0.5740319244921526, + "grad_norm": 0.37476134300231934, + "learning_rate": 7.694934316072983e-06, + "loss": 0.2277, + "step": 30920 + }, + { + "epoch": 0.5740690546295711, + "grad_norm": 0.33506545424461365, + "learning_rate": 7.693799253791333e-06, + "loss": 0.2208, + "step": 30922 + }, + { + "epoch": 0.5741061847669898, + "grad_norm": 0.5173262357711792, + "learning_rate": 7.692664222890172e-06, + "loss": 0.289, + "step": 30924 + }, + { + "epoch": 0.5741433149044085, + "grad_norm": 0.34582948684692383, + "learning_rate": 7.691529223384936e-06, + "loss": 0.3179, + "step": 30926 + }, + { + "epoch": 0.5741804450418271, + "grad_norm": 0.559708833694458, + "learning_rate": 7.690394255291072e-06, + "loss": 0.3112, + "step": 30928 + }, + { + "epoch": 0.5742175751792458, + "grad_norm": 0.3752209544181824, + "learning_rate": 7.689259318624024e-06, + "loss": 0.2925, + "step": 30930 + }, + { + "epoch": 0.5742547053166643, + "grad_norm": 0.3502529263496399, + "learning_rate": 7.688124413399233e-06, + "loss": 0.2887, + "step": 30932 + }, + { + "epoch": 0.574291835454083, + "grad_norm": 0.47667697072029114, + "learning_rate": 7.686989539632149e-06, + "loss": 0.379, + "step": 30934 + }, + { + "epoch": 0.5743289655915017, + "grad_norm": 0.2457290142774582, + "learning_rate": 7.685854697338208e-06, + "loss": 0.2391, + "step": 30936 + }, + { + "epoch": 0.5743660957289203, + "grad_norm": 0.505111813545227, + "learning_rate": 7.684719886532851e-06, + "loss": 0.2741, + "step": 30938 + }, + { + "epoch": 0.574403225866339, + "grad_norm": 0.4584346115589142, + "learning_rate": 7.683585107231521e-06, + "loss": 0.5324, + "step": 30940 + }, + { + "epoch": 0.5744403560037575, + "grad_norm": 0.4836214482784271, + "learning_rate": 7.682450359449659e-06, + "loss": 0.3176, + "step": 30942 + }, + { + "epoch": 0.5744774861411762, + "grad_norm": 0.32196342945098877, + "learning_rate": 7.681315643202707e-06, + "loss": 0.2909, + "step": 30944 + }, + { + "epoch": 0.5745146162785948, + "grad_norm": 0.2770683765411377, + "learning_rate": 7.680180958506103e-06, + "loss": 0.3103, + "step": 30946 + }, + { + "epoch": 0.5745517464160135, + "grad_norm": 0.3949334919452667, + "learning_rate": 7.679046305375291e-06, + "loss": 0.539, + "step": 30948 + }, + { + "epoch": 0.5745888765534322, + "grad_norm": 0.29031673073768616, + "learning_rate": 7.677911683825704e-06, + "loss": 0.5197, + "step": 30950 + }, + { + "epoch": 0.5746260066908507, + "grad_norm": 0.2679411470890045, + "learning_rate": 7.676777093872783e-06, + "loss": 0.4677, + "step": 30952 + }, + { + "epoch": 0.5746631368282694, + "grad_norm": 0.4363188147544861, + "learning_rate": 7.675642535531966e-06, + "loss": 0.1272, + "step": 30954 + }, + { + "epoch": 0.574700266965688, + "grad_norm": 0.4277131259441376, + "learning_rate": 7.674508008818693e-06, + "loss": 0.2459, + "step": 30956 + }, + { + "epoch": 0.5747373971031067, + "grad_norm": 0.4207153916358948, + "learning_rate": 7.6733735137484e-06, + "loss": 0.3632, + "step": 30958 + }, + { + "epoch": 0.5747745272405254, + "grad_norm": 0.3746563196182251, + "learning_rate": 7.672239050336523e-06, + "loss": 0.2765, + "step": 30960 + }, + { + "epoch": 0.5748116573779439, + "grad_norm": 0.44773221015930176, + "learning_rate": 7.671104618598505e-06, + "loss": 0.274, + "step": 30962 + }, + { + "epoch": 0.5748487875153626, + "grad_norm": 3.3151469230651855, + "learning_rate": 7.669970218549773e-06, + "loss": 0.1843, + "step": 30964 + }, + { + "epoch": 0.5748859176527812, + "grad_norm": 0.3795017898082733, + "learning_rate": 7.668835850205768e-06, + "loss": 0.2415, + "step": 30966 + }, + { + "epoch": 0.5749230477901999, + "grad_norm": 0.29662495851516724, + "learning_rate": 7.667701513581926e-06, + "loss": 0.3192, + "step": 30968 + }, + { + "epoch": 0.5749601779276186, + "grad_norm": 0.397330641746521, + "learning_rate": 7.666567208693677e-06, + "loss": 0.3502, + "step": 30970 + }, + { + "epoch": 0.5749973080650371, + "grad_norm": 0.32202550768852234, + "learning_rate": 7.665432935556461e-06, + "loss": 0.0861, + "step": 30972 + }, + { + "epoch": 0.5750344382024558, + "grad_norm": 0.08321783691644669, + "learning_rate": 7.664298694185713e-06, + "loss": 0.0712, + "step": 30974 + }, + { + "epoch": 0.5750715683398744, + "grad_norm": 0.5714452862739563, + "learning_rate": 7.66316448459686e-06, + "loss": 0.4176, + "step": 30976 + }, + { + "epoch": 0.5751086984772931, + "grad_norm": 0.5573163032531738, + "learning_rate": 7.662030306805338e-06, + "loss": 0.5409, + "step": 30978 + }, + { + "epoch": 0.5751458286147118, + "grad_norm": 0.2605822682380676, + "learning_rate": 7.660896160826582e-06, + "loss": 0.237, + "step": 30980 + }, + { + "epoch": 0.5751829587521303, + "grad_norm": 0.4066370725631714, + "learning_rate": 7.65976204667602e-06, + "loss": 0.2691, + "step": 30982 + }, + { + "epoch": 0.575220088889549, + "grad_norm": 0.5078721046447754, + "learning_rate": 7.658627964369088e-06, + "loss": 0.3956, + "step": 30984 + }, + { + "epoch": 0.5752572190269676, + "grad_norm": 0.38002809882164, + "learning_rate": 7.657493913921221e-06, + "loss": 0.3838, + "step": 30986 + }, + { + "epoch": 0.5752943491643863, + "grad_norm": 0.38463693857192993, + "learning_rate": 7.65635989534784e-06, + "loss": 0.2424, + "step": 30988 + }, + { + "epoch": 0.575331479301805, + "grad_norm": 0.2529514729976654, + "learning_rate": 7.655225908664383e-06, + "loss": 0.361, + "step": 30990 + }, + { + "epoch": 0.5753686094392235, + "grad_norm": 0.38379237055778503, + "learning_rate": 7.654091953886276e-06, + "loss": 0.4205, + "step": 30992 + }, + { + "epoch": 0.5754057395766422, + "grad_norm": 0.6508237719535828, + "learning_rate": 7.652958031028953e-06, + "loss": 0.2591, + "step": 30994 + }, + { + "epoch": 0.5754428697140608, + "grad_norm": 0.5471687316894531, + "learning_rate": 7.651824140107838e-06, + "loss": 0.2737, + "step": 30996 + }, + { + "epoch": 0.5754799998514795, + "grad_norm": 0.37261924147605896, + "learning_rate": 7.650690281138365e-06, + "loss": 0.3347, + "step": 30998 + }, + { + "epoch": 0.575517129988898, + "grad_norm": 0.3480701446533203, + "learning_rate": 7.649556454135964e-06, + "loss": 0.0702, + "step": 31000 + }, + { + "epoch": 0.5755542601263167, + "grad_norm": 0.3818562626838684, + "learning_rate": 7.648422659116053e-06, + "loss": 0.3007, + "step": 31002 + }, + { + "epoch": 0.5755913902637354, + "grad_norm": 0.29535770416259766, + "learning_rate": 7.647288896094069e-06, + "loss": 0.2153, + "step": 31004 + }, + { + "epoch": 0.575628520401154, + "grad_norm": 0.4236251711845398, + "learning_rate": 7.646155165085434e-06, + "loss": 0.2775, + "step": 31006 + }, + { + "epoch": 0.5756656505385727, + "grad_norm": 0.25951558351516724, + "learning_rate": 7.645021466105578e-06, + "loss": 0.3522, + "step": 31008 + }, + { + "epoch": 0.5757027806759912, + "grad_norm": 0.35228031873703003, + "learning_rate": 7.643887799169925e-06, + "loss": 0.3039, + "step": 31010 + }, + { + "epoch": 0.5757399108134099, + "grad_norm": 0.22825314104557037, + "learning_rate": 7.642754164293905e-06, + "loss": 0.1563, + "step": 31012 + }, + { + "epoch": 0.5757770409508286, + "grad_norm": 0.35881125926971436, + "learning_rate": 7.641620561492936e-06, + "loss": 0.3959, + "step": 31014 + }, + { + "epoch": 0.5758141710882472, + "grad_norm": 0.265899121761322, + "learning_rate": 7.640486990782447e-06, + "loss": 0.2356, + "step": 31016 + }, + { + "epoch": 0.5758513012256659, + "grad_norm": 0.3160868287086487, + "learning_rate": 7.639353452177863e-06, + "loss": 0.0942, + "step": 31018 + }, + { + "epoch": 0.5758884313630844, + "grad_norm": 0.2538793385028839, + "learning_rate": 7.63821994569461e-06, + "loss": 0.2013, + "step": 31020 + }, + { + "epoch": 0.5759255615005031, + "grad_norm": 0.37353113293647766, + "learning_rate": 7.637086471348106e-06, + "loss": 0.4185, + "step": 31022 + }, + { + "epoch": 0.5759626916379218, + "grad_norm": 0.4472772777080536, + "learning_rate": 7.635953029153782e-06, + "loss": 0.2368, + "step": 31024 + }, + { + "epoch": 0.5759998217753404, + "grad_norm": 0.3807167708873749, + "learning_rate": 7.634819619127051e-06, + "loss": 0.1808, + "step": 31026 + }, + { + "epoch": 0.576036951912759, + "grad_norm": 0.3559841513633728, + "learning_rate": 7.633686241283342e-06, + "loss": 0.2481, + "step": 31028 + }, + { + "epoch": 0.5760740820501776, + "grad_norm": 0.35631564259529114, + "learning_rate": 7.632552895638075e-06, + "loss": 0.255, + "step": 31030 + }, + { + "epoch": 0.5761112121875963, + "grad_norm": 0.34104984998703003, + "learning_rate": 7.631419582206673e-06, + "loss": 0.3576, + "step": 31032 + }, + { + "epoch": 0.576148342325015, + "grad_norm": 0.4543199837207794, + "learning_rate": 7.630286301004556e-06, + "loss": 0.4909, + "step": 31034 + }, + { + "epoch": 0.5761854724624336, + "grad_norm": 0.353497177362442, + "learning_rate": 7.6291530520471426e-06, + "loss": 0.2809, + "step": 31036 + }, + { + "epoch": 0.5762226025998523, + "grad_norm": 0.22295664250850677, + "learning_rate": 7.628019835349859e-06, + "loss": 0.2394, + "step": 31038 + }, + { + "epoch": 0.5762597327372708, + "grad_norm": 0.2737695574760437, + "learning_rate": 7.626886650928116e-06, + "loss": 0.3897, + "step": 31040 + }, + { + "epoch": 0.5762968628746895, + "grad_norm": 0.3815363943576813, + "learning_rate": 7.625753498797339e-06, + "loss": 0.2597, + "step": 31042 + }, + { + "epoch": 0.5763339930121082, + "grad_norm": 0.24110160768032074, + "learning_rate": 7.624620378972944e-06, + "loss": 0.2332, + "step": 31044 + }, + { + "epoch": 0.5763711231495268, + "grad_norm": 0.5233474373817444, + "learning_rate": 7.62348729147035e-06, + "loss": 0.2005, + "step": 31046 + }, + { + "epoch": 0.5764082532869454, + "grad_norm": 0.36254993081092834, + "learning_rate": 7.622354236304976e-06, + "loss": 0.1988, + "step": 31048 + }, + { + "epoch": 0.576445383424364, + "grad_norm": 0.33645758032798767, + "learning_rate": 7.621221213492244e-06, + "loss": 0.228, + "step": 31050 + }, + { + "epoch": 0.5764825135617827, + "grad_norm": 0.2907795310020447, + "learning_rate": 7.62008822304756e-06, + "loss": 0.3067, + "step": 31052 + }, + { + "epoch": 0.5765196436992013, + "grad_norm": 0.44895225763320923, + "learning_rate": 7.618955264986351e-06, + "loss": 0.3484, + "step": 31054 + }, + { + "epoch": 0.57655677383662, + "grad_norm": 0.4202222526073456, + "learning_rate": 7.617822339324025e-06, + "loss": 0.2185, + "step": 31056 + }, + { + "epoch": 0.5765939039740386, + "grad_norm": 0.47238579392433167, + "learning_rate": 7.616689446076001e-06, + "loss": 0.3503, + "step": 31058 + }, + { + "epoch": 0.5766310341114572, + "grad_norm": 0.614136815071106, + "learning_rate": 7.615556585257696e-06, + "loss": 0.4441, + "step": 31060 + }, + { + "epoch": 0.5766681642488759, + "grad_norm": 0.358832985162735, + "learning_rate": 7.614423756884528e-06, + "loss": 0.1489, + "step": 31062 + }, + { + "epoch": 0.5767052943862945, + "grad_norm": 0.637636661529541, + "learning_rate": 7.613290960971904e-06, + "loss": 0.3262, + "step": 31064 + }, + { + "epoch": 0.5767424245237132, + "grad_norm": 0.35878950357437134, + "learning_rate": 7.612158197535239e-06, + "loss": 0.2943, + "step": 31066 + }, + { + "epoch": 0.5767795546611318, + "grad_norm": 0.195411816239357, + "learning_rate": 7.611025466589949e-06, + "loss": 0.0762, + "step": 31068 + }, + { + "epoch": 0.5768166847985504, + "grad_norm": 0.3723064064979553, + "learning_rate": 7.609892768151446e-06, + "loss": 0.2061, + "step": 31070 + }, + { + "epoch": 0.5768538149359691, + "grad_norm": 0.3227402865886688, + "learning_rate": 7.608760102235147e-06, + "loss": 0.0856, + "step": 31072 + }, + { + "epoch": 0.5768909450733877, + "grad_norm": 0.2868557870388031, + "learning_rate": 7.607627468856458e-06, + "loss": 0.2215, + "step": 31074 + }, + { + "epoch": 0.5769280752108064, + "grad_norm": 0.46220844984054565, + "learning_rate": 7.606494868030793e-06, + "loss": 0.2236, + "step": 31076 + }, + { + "epoch": 0.576965205348225, + "grad_norm": 0.44900795817375183, + "learning_rate": 7.6053622997735645e-06, + "loss": 0.5057, + "step": 31078 + }, + { + "epoch": 0.5770023354856436, + "grad_norm": 0.43503543734550476, + "learning_rate": 7.604229764100179e-06, + "loss": 0.2162, + "step": 31080 + }, + { + "epoch": 0.5770394656230623, + "grad_norm": 0.469188928604126, + "learning_rate": 7.603097261026053e-06, + "loss": 0.2356, + "step": 31082 + }, + { + "epoch": 0.5770765957604809, + "grad_norm": 0.37695544958114624, + "learning_rate": 7.601964790566594e-06, + "loss": 0.3335, + "step": 31084 + }, + { + "epoch": 0.5771137258978996, + "grad_norm": 0.4378848373889923, + "learning_rate": 7.60083235273721e-06, + "loss": 0.1724, + "step": 31086 + }, + { + "epoch": 0.5771508560353182, + "grad_norm": 0.38973599672317505, + "learning_rate": 7.599699947553315e-06, + "loss": 0.2511, + "step": 31088 + }, + { + "epoch": 0.5771879861727368, + "grad_norm": 0.3587445318698883, + "learning_rate": 7.59856757503031e-06, + "loss": 0.2637, + "step": 31090 + }, + { + "epoch": 0.5772251163101555, + "grad_norm": 0.2674536406993866, + "learning_rate": 7.597435235183607e-06, + "loss": 0.1289, + "step": 31092 + }, + { + "epoch": 0.5772622464475741, + "grad_norm": 0.3264833390712738, + "learning_rate": 7.596302928028617e-06, + "loss": 0.3112, + "step": 31094 + }, + { + "epoch": 0.5772993765849928, + "grad_norm": 0.27748262882232666, + "learning_rate": 7.595170653580741e-06, + "loss": 0.082, + "step": 31096 + }, + { + "epoch": 0.5773365067224113, + "grad_norm": 0.30836835503578186, + "learning_rate": 7.594038411855389e-06, + "loss": 0.1842, + "step": 31098 + }, + { + "epoch": 0.57737363685983, + "grad_norm": 0.3451334536075592, + "learning_rate": 7.592906202867973e-06, + "loss": 0.0799, + "step": 31100 + }, + { + "epoch": 0.5774107669972487, + "grad_norm": 0.5137263536453247, + "learning_rate": 7.591774026633888e-06, + "loss": 0.3753, + "step": 31102 + }, + { + "epoch": 0.5774478971346673, + "grad_norm": 0.4151761829853058, + "learning_rate": 7.590641883168545e-06, + "loss": 0.4445, + "step": 31104 + }, + { + "epoch": 0.577485027272086, + "grad_norm": 0.4870668053627014, + "learning_rate": 7.5895097724873525e-06, + "loss": 0.1672, + "step": 31106 + }, + { + "epoch": 0.5775221574095045, + "grad_norm": 0.4959615170955658, + "learning_rate": 7.588377694605709e-06, + "loss": 0.2958, + "step": 31108 + }, + { + "epoch": 0.5775592875469232, + "grad_norm": 0.3315300941467285, + "learning_rate": 7.587245649539021e-06, + "loss": 0.2315, + "step": 31110 + }, + { + "epoch": 0.5775964176843419, + "grad_norm": 0.34610649943351746, + "learning_rate": 7.586113637302698e-06, + "loss": 0.1462, + "step": 31112 + }, + { + "epoch": 0.5776335478217605, + "grad_norm": 0.34251880645751953, + "learning_rate": 7.584981657912133e-06, + "loss": 0.2781, + "step": 31114 + }, + { + "epoch": 0.5776706779591791, + "grad_norm": 0.3090687692165375, + "learning_rate": 7.583849711382736e-06, + "loss": 0.3889, + "step": 31116 + }, + { + "epoch": 0.5777078080965977, + "grad_norm": 0.5516473650932312, + "learning_rate": 7.582717797729904e-06, + "loss": 0.3875, + "step": 31118 + }, + { + "epoch": 0.5777449382340164, + "grad_norm": 0.34100228548049927, + "learning_rate": 7.5815859169690435e-06, + "loss": 0.3291, + "step": 31120 + }, + { + "epoch": 0.5777820683714351, + "grad_norm": 0.324738472700119, + "learning_rate": 7.580454069115555e-06, + "loss": 0.2212, + "step": 31122 + }, + { + "epoch": 0.5778191985088537, + "grad_norm": 0.5066614151000977, + "learning_rate": 7.57932225418484e-06, + "loss": 0.2936, + "step": 31124 + }, + { + "epoch": 0.5778563286462723, + "grad_norm": 0.3880600929260254, + "learning_rate": 7.578190472192299e-06, + "loss": 0.1724, + "step": 31126 + }, + { + "epoch": 0.5778934587836909, + "grad_norm": 0.3594605326652527, + "learning_rate": 7.577058723153332e-06, + "loss": 0.4552, + "step": 31128 + }, + { + "epoch": 0.5779305889211096, + "grad_norm": 0.5140810012817383, + "learning_rate": 7.575927007083335e-06, + "loss": 0.379, + "step": 31130 + }, + { + "epoch": 0.5779677190585283, + "grad_norm": 0.43927818536758423, + "learning_rate": 7.5747953239977124e-06, + "loss": 0.2491, + "step": 31132 + }, + { + "epoch": 0.5780048491959469, + "grad_norm": 0.25763311982154846, + "learning_rate": 7.57366367391186e-06, + "loss": 0.2964, + "step": 31134 + }, + { + "epoch": 0.5780419793333655, + "grad_norm": 0.31850776076316833, + "learning_rate": 7.572532056841178e-06, + "loss": 0.2405, + "step": 31136 + }, + { + "epoch": 0.5780791094707841, + "grad_norm": 0.36355075240135193, + "learning_rate": 7.5714004728010645e-06, + "loss": 0.1795, + "step": 31138 + }, + { + "epoch": 0.5781162396082028, + "grad_norm": 0.37581780552864075, + "learning_rate": 7.570268921806917e-06, + "loss": 0.3514, + "step": 31140 + }, + { + "epoch": 0.5781533697456215, + "grad_norm": 0.3243929147720337, + "learning_rate": 7.569137403874129e-06, + "loss": 0.187, + "step": 31142 + }, + { + "epoch": 0.57819049988304, + "grad_norm": 0.39333802461624146, + "learning_rate": 7.5680059190181e-06, + "loss": 0.2883, + "step": 31144 + }, + { + "epoch": 0.5782276300204587, + "grad_norm": 0.30685508251190186, + "learning_rate": 7.566874467254227e-06, + "loss": 0.3749, + "step": 31146 + }, + { + "epoch": 0.5782647601578773, + "grad_norm": 0.3579385280609131, + "learning_rate": 7.565743048597903e-06, + "loss": 0.5051, + "step": 31148 + }, + { + "epoch": 0.578301890295296, + "grad_norm": 0.3715967535972595, + "learning_rate": 7.5646116630645295e-06, + "loss": 0.3512, + "step": 31150 + }, + { + "epoch": 0.5783390204327146, + "grad_norm": 0.5116569399833679, + "learning_rate": 7.563480310669491e-06, + "loss": 0.3125, + "step": 31152 + }, + { + "epoch": 0.5783761505701333, + "grad_norm": 0.46344009041786194, + "learning_rate": 7.562348991428188e-06, + "loss": 0.2145, + "step": 31154 + }, + { + "epoch": 0.5784132807075519, + "grad_norm": 0.3946397006511688, + "learning_rate": 7.561217705356012e-06, + "loss": 0.2155, + "step": 31156 + }, + { + "epoch": 0.5784504108449705, + "grad_norm": 0.26402443647384644, + "learning_rate": 7.560086452468363e-06, + "loss": 0.3264, + "step": 31158 + }, + { + "epoch": 0.5784875409823892, + "grad_norm": 0.48576226830482483, + "learning_rate": 7.558955232780625e-06, + "loss": 0.4532, + "step": 31160 + }, + { + "epoch": 0.5785246711198078, + "grad_norm": 0.4014604389667511, + "learning_rate": 7.557824046308194e-06, + "loss": 0.2974, + "step": 31162 + }, + { + "epoch": 0.5785618012572264, + "grad_norm": 0.33381161093711853, + "learning_rate": 7.556692893066469e-06, + "loss": 0.154, + "step": 31164 + }, + { + "epoch": 0.5785989313946451, + "grad_norm": 0.2902061343193054, + "learning_rate": 7.555561773070828e-06, + "loss": 0.3285, + "step": 31166 + }, + { + "epoch": 0.5786360615320637, + "grad_norm": 0.4292329251766205, + "learning_rate": 7.554430686336671e-06, + "loss": 0.1008, + "step": 31168 + }, + { + "epoch": 0.5786731916694824, + "grad_norm": 0.3464864194393158, + "learning_rate": 7.553299632879389e-06, + "loss": 0.5245, + "step": 31170 + }, + { + "epoch": 0.578710321806901, + "grad_norm": 0.3979438841342926, + "learning_rate": 7.552168612714368e-06, + "loss": 0.3633, + "step": 31172 + }, + { + "epoch": 0.5787474519443196, + "grad_norm": 0.3536602556705475, + "learning_rate": 7.551037625857001e-06, + "loss": 0.3105, + "step": 31174 + }, + { + "epoch": 0.5787845820817383, + "grad_norm": 0.36892902851104736, + "learning_rate": 7.54990667232268e-06, + "loss": 0.253, + "step": 31176 + }, + { + "epoch": 0.5788217122191569, + "grad_norm": 0.39388081431388855, + "learning_rate": 7.548775752126785e-06, + "loss": 0.4155, + "step": 31178 + }, + { + "epoch": 0.5788588423565756, + "grad_norm": 0.6572017073631287, + "learning_rate": 7.547644865284713e-06, + "loss": 0.1942, + "step": 31180 + }, + { + "epoch": 0.5788959724939942, + "grad_norm": 0.31375494599342346, + "learning_rate": 7.546514011811848e-06, + "loss": 0.3266, + "step": 31182 + }, + { + "epoch": 0.5789331026314128, + "grad_norm": 0.31462037563323975, + "learning_rate": 7.545383191723576e-06, + "loss": 0.2088, + "step": 31184 + }, + { + "epoch": 0.5789702327688315, + "grad_norm": 0.34164267778396606, + "learning_rate": 7.5442524050352865e-06, + "loss": 0.4071, + "step": 31186 + }, + { + "epoch": 0.5790073629062501, + "grad_norm": 0.3621094822883606, + "learning_rate": 7.543121651762371e-06, + "loss": 0.2049, + "step": 31188 + }, + { + "epoch": 0.5790444930436688, + "grad_norm": 0.36736997961997986, + "learning_rate": 7.541990931920207e-06, + "loss": 0.3622, + "step": 31190 + }, + { + "epoch": 0.5790816231810874, + "grad_norm": 0.3473871648311615, + "learning_rate": 7.540860245524184e-06, + "loss": 0.2158, + "step": 31192 + }, + { + "epoch": 0.579118753318506, + "grad_norm": 0.2878754734992981, + "learning_rate": 7.539729592589686e-06, + "loss": 0.1341, + "step": 31194 + }, + { + "epoch": 0.5791558834559247, + "grad_norm": 0.4229128658771515, + "learning_rate": 7.538598973132099e-06, + "loss": 0.174, + "step": 31196 + }, + { + "epoch": 0.5791930135933433, + "grad_norm": 0.5712783932685852, + "learning_rate": 7.537468387166808e-06, + "loss": 0.1855, + "step": 31198 + }, + { + "epoch": 0.579230143730762, + "grad_norm": 0.25841474533081055, + "learning_rate": 7.536337834709196e-06, + "loss": 0.2232, + "step": 31200 + }, + { + "epoch": 0.5792672738681806, + "grad_norm": 0.2668760418891907, + "learning_rate": 7.53520731577465e-06, + "loss": 0.4626, + "step": 31202 + }, + { + "epoch": 0.5793044040055992, + "grad_norm": 0.6375037431716919, + "learning_rate": 7.534076830378546e-06, + "loss": 0.3463, + "step": 31204 + }, + { + "epoch": 0.5793415341430178, + "grad_norm": 0.6976446509361267, + "learning_rate": 7.53294637853627e-06, + "loss": 0.2774, + "step": 31206 + }, + { + "epoch": 0.5793786642804365, + "grad_norm": 0.2875373661518097, + "learning_rate": 7.531815960263203e-06, + "loss": 0.2808, + "step": 31208 + }, + { + "epoch": 0.5794157944178552, + "grad_norm": 0.4141579866409302, + "learning_rate": 7.53068557557473e-06, + "loss": 0.3061, + "step": 31210 + }, + { + "epoch": 0.5794529245552738, + "grad_norm": 0.4020611047744751, + "learning_rate": 7.529555224486228e-06, + "loss": 0.291, + "step": 31212 + }, + { + "epoch": 0.5794900546926924, + "grad_norm": 0.3870101273059845, + "learning_rate": 7.5284249070130846e-06, + "loss": 0.3585, + "step": 31214 + }, + { + "epoch": 0.579527184830111, + "grad_norm": 0.429636687040329, + "learning_rate": 7.527294623170669e-06, + "loss": 0.3685, + "step": 31216 + }, + { + "epoch": 0.5795643149675297, + "grad_norm": 0.47146692872047424, + "learning_rate": 7.526164372974368e-06, + "loss": 0.2863, + "step": 31218 + }, + { + "epoch": 0.5796014451049484, + "grad_norm": 0.30755946040153503, + "learning_rate": 7.52503415643956e-06, + "loss": 0.3691, + "step": 31220 + }, + { + "epoch": 0.579638575242367, + "grad_norm": 0.6568705439567566, + "learning_rate": 7.523903973581625e-06, + "loss": 0.1784, + "step": 31222 + }, + { + "epoch": 0.5796757053797856, + "grad_norm": 0.3821035325527191, + "learning_rate": 7.52277382441594e-06, + "loss": 0.15, + "step": 31224 + }, + { + "epoch": 0.5797128355172042, + "grad_norm": 0.3715713620185852, + "learning_rate": 7.5216437089578865e-06, + "loss": 0.3992, + "step": 31226 + }, + { + "epoch": 0.5797499656546229, + "grad_norm": 0.5227676033973694, + "learning_rate": 7.520513627222833e-06, + "loss": 0.2477, + "step": 31228 + }, + { + "epoch": 0.5797870957920416, + "grad_norm": 0.4146497845649719, + "learning_rate": 7.519383579226163e-06, + "loss": 0.3091, + "step": 31230 + }, + { + "epoch": 0.5798242259294601, + "grad_norm": 0.38209593296051025, + "learning_rate": 7.518253564983255e-06, + "loss": 0.1763, + "step": 31232 + }, + { + "epoch": 0.5798613560668788, + "grad_norm": 0.39699509739875793, + "learning_rate": 7.517123584509479e-06, + "loss": 0.5125, + "step": 31234 + }, + { + "epoch": 0.5798984862042974, + "grad_norm": 0.30754196643829346, + "learning_rate": 7.515993637820214e-06, + "loss": 0.3199, + "step": 31236 + }, + { + "epoch": 0.5799356163417161, + "grad_norm": 0.3600892722606659, + "learning_rate": 7.514863724930841e-06, + "loss": 0.3063, + "step": 31238 + }, + { + "epoch": 0.5799727464791348, + "grad_norm": 0.2616475224494934, + "learning_rate": 7.5137338458567234e-06, + "loss": 0.2201, + "step": 31240 + }, + { + "epoch": 0.5800098766165533, + "grad_norm": 0.2983090579509735, + "learning_rate": 7.512604000613241e-06, + "loss": 0.1526, + "step": 31242 + }, + { + "epoch": 0.580047006753972, + "grad_norm": 0.3064076602458954, + "learning_rate": 7.511474189215769e-06, + "loss": 0.2026, + "step": 31244 + }, + { + "epoch": 0.5800841368913906, + "grad_norm": 0.4513185918331146, + "learning_rate": 7.510344411679677e-06, + "loss": 0.1806, + "step": 31246 + }, + { + "epoch": 0.5801212670288093, + "grad_norm": 0.49970078468322754, + "learning_rate": 7.509214668020342e-06, + "loss": 0.2918, + "step": 31248 + }, + { + "epoch": 0.5801583971662279, + "grad_norm": 0.3199923038482666, + "learning_rate": 7.508084958253133e-06, + "loss": 0.224, + "step": 31250 + }, + { + "epoch": 0.5801955273036465, + "grad_norm": 0.4985957443714142, + "learning_rate": 7.506955282393429e-06, + "loss": 0.3188, + "step": 31252 + }, + { + "epoch": 0.5802326574410652, + "grad_norm": 0.4084486961364746, + "learning_rate": 7.505825640456593e-06, + "loss": 0.1931, + "step": 31254 + }, + { + "epoch": 0.5802697875784838, + "grad_norm": 0.3797971308231354, + "learning_rate": 7.504696032457998e-06, + "loss": 0.4373, + "step": 31256 + }, + { + "epoch": 0.5803069177159025, + "grad_norm": 1.5427873134613037, + "learning_rate": 7.503566458413014e-06, + "loss": 0.3382, + "step": 31258 + }, + { + "epoch": 0.580344047853321, + "grad_norm": 0.38955816626548767, + "learning_rate": 7.502436918337015e-06, + "loss": 0.3002, + "step": 31260 + }, + { + "epoch": 0.5803811779907397, + "grad_norm": 0.44622698426246643, + "learning_rate": 7.501307412245369e-06, + "loss": 0.2736, + "step": 31262 + }, + { + "epoch": 0.5804183081281584, + "grad_norm": 0.4852862060070038, + "learning_rate": 7.500177940153445e-06, + "loss": 0.1185, + "step": 31264 + }, + { + "epoch": 0.580455438265577, + "grad_norm": 0.6417048573493958, + "learning_rate": 7.499048502076611e-06, + "loss": 0.3548, + "step": 31266 + }, + { + "epoch": 0.5804925684029957, + "grad_norm": 0.4444785714149475, + "learning_rate": 7.497919098030235e-06, + "loss": 0.2291, + "step": 31268 + }, + { + "epoch": 0.5805296985404143, + "grad_norm": 0.2659972608089447, + "learning_rate": 7.496789728029684e-06, + "loss": 0.2728, + "step": 31270 + }, + { + "epoch": 0.5805668286778329, + "grad_norm": 0.6256941556930542, + "learning_rate": 7.495660392090327e-06, + "loss": 0.2474, + "step": 31272 + }, + { + "epoch": 0.5806039588152516, + "grad_norm": 0.41704824566841125, + "learning_rate": 7.494531090227533e-06, + "loss": 0.3616, + "step": 31274 + }, + { + "epoch": 0.5806410889526702, + "grad_norm": 0.1835106760263443, + "learning_rate": 7.493401822456667e-06, + "loss": 0.2539, + "step": 31276 + }, + { + "epoch": 0.5806782190900889, + "grad_norm": 0.3414742648601532, + "learning_rate": 7.492272588793093e-06, + "loss": 0.2318, + "step": 31278 + }, + { + "epoch": 0.5807153492275074, + "grad_norm": 0.5324370265007019, + "learning_rate": 7.491143389252176e-06, + "loss": 0.3272, + "step": 31280 + }, + { + "epoch": 0.5807524793649261, + "grad_norm": 0.388435035943985, + "learning_rate": 7.4900142238492826e-06, + "loss": 0.2675, + "step": 31282 + }, + { + "epoch": 0.5807896095023448, + "grad_norm": 0.21172888576984406, + "learning_rate": 7.488885092599779e-06, + "loss": 0.1862, + "step": 31284 + }, + { + "epoch": 0.5808267396397634, + "grad_norm": 0.2763361930847168, + "learning_rate": 7.487755995519026e-06, + "loss": 0.2257, + "step": 31286 + }, + { + "epoch": 0.5808638697771821, + "grad_norm": 0.27705878019332886, + "learning_rate": 7.486626932622388e-06, + "loss": 0.4772, + "step": 31288 + }, + { + "epoch": 0.5809009999146006, + "grad_norm": 0.40326008200645447, + "learning_rate": 7.485497903925234e-06, + "loss": 0.1828, + "step": 31290 + }, + { + "epoch": 0.5809381300520193, + "grad_norm": 0.8272413015365601, + "learning_rate": 7.484368909442918e-06, + "loss": 0.3258, + "step": 31292 + }, + { + "epoch": 0.580975260189438, + "grad_norm": 0.4801693558692932, + "learning_rate": 7.483239949190805e-06, + "loss": 0.1799, + "step": 31294 + }, + { + "epoch": 0.5810123903268566, + "grad_norm": 0.3801548182964325, + "learning_rate": 7.48211102318426e-06, + "loss": 0.2798, + "step": 31296 + }, + { + "epoch": 0.5810495204642753, + "grad_norm": 0.3058623969554901, + "learning_rate": 7.48098213143864e-06, + "loss": 0.3793, + "step": 31298 + }, + { + "epoch": 0.5810866506016938, + "grad_norm": 0.28413820266723633, + "learning_rate": 7.4798532739693085e-06, + "loss": 0.3551, + "step": 31300 + }, + { + "epoch": 0.5811237807391125, + "grad_norm": 0.18476887047290802, + "learning_rate": 7.478724450791628e-06, + "loss": 0.1957, + "step": 31302 + }, + { + "epoch": 0.5811609108765311, + "grad_norm": 0.4460327625274658, + "learning_rate": 7.477595661920953e-06, + "loss": 0.2001, + "step": 31304 + }, + { + "epoch": 0.5811980410139498, + "grad_norm": 0.542113721370697, + "learning_rate": 7.4764669073726435e-06, + "loss": 0.2242, + "step": 31306 + }, + { + "epoch": 0.5812351711513685, + "grad_norm": 0.37249520421028137, + "learning_rate": 7.4753381871620645e-06, + "loss": 0.3522, + "step": 31308 + }, + { + "epoch": 0.581272301288787, + "grad_norm": 0.4218714237213135, + "learning_rate": 7.474209501304568e-06, + "loss": 0.1671, + "step": 31310 + }, + { + "epoch": 0.5813094314262057, + "grad_norm": 0.3553682267665863, + "learning_rate": 7.473080849815515e-06, + "loss": 0.2259, + "step": 31312 + }, + { + "epoch": 0.5813465615636243, + "grad_norm": 0.5226209759712219, + "learning_rate": 7.471952232710267e-06, + "loss": 0.3675, + "step": 31314 + }, + { + "epoch": 0.581383691701043, + "grad_norm": 0.4288221001625061, + "learning_rate": 7.4708236500041705e-06, + "loss": 0.1673, + "step": 31316 + }, + { + "epoch": 0.5814208218384617, + "grad_norm": 0.36535248160362244, + "learning_rate": 7.469695101712593e-06, + "loss": 0.3972, + "step": 31318 + }, + { + "epoch": 0.5814579519758802, + "grad_norm": 0.4311227798461914, + "learning_rate": 7.468566587850882e-06, + "loss": 0.2337, + "step": 31320 + }, + { + "epoch": 0.5814950821132989, + "grad_norm": 0.3497351408004761, + "learning_rate": 7.467438108434397e-06, + "loss": 0.3453, + "step": 31322 + }, + { + "epoch": 0.5815322122507175, + "grad_norm": 0.44875404238700867, + "learning_rate": 7.466309663478495e-06, + "loss": 0.3234, + "step": 31324 + }, + { + "epoch": 0.5815693423881362, + "grad_norm": 0.22620448470115662, + "learning_rate": 7.465181252998531e-06, + "loss": 0.2973, + "step": 31326 + }, + { + "epoch": 0.5816064725255549, + "grad_norm": 0.33616337180137634, + "learning_rate": 7.464052877009857e-06, + "loss": 0.3884, + "step": 31328 + }, + { + "epoch": 0.5816436026629734, + "grad_norm": 0.36800092458724976, + "learning_rate": 7.462924535527828e-06, + "loss": 0.332, + "step": 31330 + }, + { + "epoch": 0.5816807328003921, + "grad_norm": 0.4493695795536041, + "learning_rate": 7.461796228567794e-06, + "loss": 0.2977, + "step": 31332 + }, + { + "epoch": 0.5817178629378107, + "grad_norm": 0.48661890625953674, + "learning_rate": 7.46066795614511e-06, + "loss": 0.3263, + "step": 31334 + }, + { + "epoch": 0.5817549930752294, + "grad_norm": 0.6547520756721497, + "learning_rate": 7.45953971827513e-06, + "loss": 0.3438, + "step": 31336 + }, + { + "epoch": 0.5817921232126481, + "grad_norm": 0.4534946382045746, + "learning_rate": 7.458411514973206e-06, + "loss": 0.1855, + "step": 31338 + }, + { + "epoch": 0.5818292533500666, + "grad_norm": 0.4844074845314026, + "learning_rate": 7.457283346254691e-06, + "loss": 0.3136, + "step": 31340 + }, + { + "epoch": 0.5818663834874853, + "grad_norm": 0.3138137459754944, + "learning_rate": 7.456155212134929e-06, + "loss": 0.2008, + "step": 31342 + }, + { + "epoch": 0.5819035136249039, + "grad_norm": 0.5542809367179871, + "learning_rate": 7.455027112629275e-06, + "loss": 0.1143, + "step": 31344 + }, + { + "epoch": 0.5819406437623226, + "grad_norm": 0.38744744658470154, + "learning_rate": 7.4538990477530796e-06, + "loss": 0.2222, + "step": 31346 + }, + { + "epoch": 0.5819777738997413, + "grad_norm": 0.22495150566101074, + "learning_rate": 7.452771017521691e-06, + "loss": 0.2341, + "step": 31348 + }, + { + "epoch": 0.5820149040371598, + "grad_norm": 0.3316761255264282, + "learning_rate": 7.451643021950461e-06, + "loss": 0.1663, + "step": 31350 + }, + { + "epoch": 0.5820520341745785, + "grad_norm": 0.42802131175994873, + "learning_rate": 7.450515061054737e-06, + "loss": 0.3642, + "step": 31352 + }, + { + "epoch": 0.5820891643119971, + "grad_norm": 0.2496299296617508, + "learning_rate": 7.449387134849864e-06, + "loss": 0.3945, + "step": 31354 + }, + { + "epoch": 0.5821262944494158, + "grad_norm": 0.2747473418712616, + "learning_rate": 7.4482592433511925e-06, + "loss": 0.3296, + "step": 31356 + }, + { + "epoch": 0.5821634245868343, + "grad_norm": 0.2841508686542511, + "learning_rate": 7.447131386574068e-06, + "loss": 0.1723, + "step": 31358 + }, + { + "epoch": 0.582200554724253, + "grad_norm": 0.624060869216919, + "learning_rate": 7.446003564533841e-06, + "loss": 0.4475, + "step": 31360 + }, + { + "epoch": 0.5822376848616717, + "grad_norm": 0.3869979977607727, + "learning_rate": 7.444875777245855e-06, + "loss": 0.3269, + "step": 31362 + }, + { + "epoch": 0.5822748149990903, + "grad_norm": 0.24494224786758423, + "learning_rate": 7.443748024725459e-06, + "loss": 0.3183, + "step": 31364 + }, + { + "epoch": 0.582311945136509, + "grad_norm": 0.42008814215660095, + "learning_rate": 7.442620306987991e-06, + "loss": 0.3629, + "step": 31366 + }, + { + "epoch": 0.5823490752739275, + "grad_norm": 0.2734806537628174, + "learning_rate": 7.4414926240488e-06, + "loss": 0.3491, + "step": 31368 + }, + { + "epoch": 0.5823862054113462, + "grad_norm": 0.3839620351791382, + "learning_rate": 7.440364975923233e-06, + "loss": 0.4229, + "step": 31370 + }, + { + "epoch": 0.5824233355487649, + "grad_norm": 0.422510027885437, + "learning_rate": 7.43923736262663e-06, + "loss": 0.3187, + "step": 31372 + }, + { + "epoch": 0.5824604656861835, + "grad_norm": 0.2778130769729614, + "learning_rate": 7.438109784174336e-06, + "loss": 0.3011, + "step": 31374 + }, + { + "epoch": 0.5824975958236022, + "grad_norm": 0.28930965065956116, + "learning_rate": 7.436982240581696e-06, + "loss": 0.4143, + "step": 31376 + }, + { + "epoch": 0.5825347259610207, + "grad_norm": 0.2550574541091919, + "learning_rate": 7.435854731864053e-06, + "loss": 0.1535, + "step": 31378 + }, + { + "epoch": 0.5825718560984394, + "grad_norm": 0.3449892997741699, + "learning_rate": 7.434727258036742e-06, + "loss": 0.2671, + "step": 31380 + }, + { + "epoch": 0.5826089862358581, + "grad_norm": 0.43867236375808716, + "learning_rate": 7.4335998191151115e-06, + "loss": 0.2301, + "step": 31382 + }, + { + "epoch": 0.5826461163732767, + "grad_norm": 0.2857857942581177, + "learning_rate": 7.4324724151144985e-06, + "loss": 0.193, + "step": 31384 + }, + { + "epoch": 0.5826832465106954, + "grad_norm": 0.35813453793525696, + "learning_rate": 7.431345046050246e-06, + "loss": 0.2036, + "step": 31386 + }, + { + "epoch": 0.5827203766481139, + "grad_norm": 0.40862342715263367, + "learning_rate": 7.430217711937692e-06, + "loss": 0.214, + "step": 31388 + }, + { + "epoch": 0.5827575067855326, + "grad_norm": 0.5152801871299744, + "learning_rate": 7.429090412792183e-06, + "loss": 0.2799, + "step": 31390 + }, + { + "epoch": 0.5827946369229513, + "grad_norm": 0.6497358083724976, + "learning_rate": 7.4279631486290495e-06, + "loss": 0.4325, + "step": 31392 + }, + { + "epoch": 0.5828317670603699, + "grad_norm": 0.36648866534233093, + "learning_rate": 7.42683591946363e-06, + "loss": 0.3002, + "step": 31394 + }, + { + "epoch": 0.5828688971977886, + "grad_norm": 0.797605574131012, + "learning_rate": 7.4257087253112695e-06, + "loss": 0.3095, + "step": 31396 + }, + { + "epoch": 0.5829060273352071, + "grad_norm": 0.31922563910484314, + "learning_rate": 7.424581566187301e-06, + "loss": 0.2466, + "step": 31398 + }, + { + "epoch": 0.5829431574726258, + "grad_norm": 0.2807650566101074, + "learning_rate": 7.423454442107066e-06, + "loss": 0.3748, + "step": 31400 + }, + { + "epoch": 0.5829802876100444, + "grad_norm": 0.33038461208343506, + "learning_rate": 7.4223273530858964e-06, + "loss": 0.3549, + "step": 31402 + }, + { + "epoch": 0.5830174177474631, + "grad_norm": 0.5780045986175537, + "learning_rate": 7.421200299139131e-06, + "loss": 0.3044, + "step": 31404 + }, + { + "epoch": 0.5830545478848818, + "grad_norm": 0.5376818180084229, + "learning_rate": 7.420073280282103e-06, + "loss": 0.1609, + "step": 31406 + }, + { + "epoch": 0.5830916780223003, + "grad_norm": 0.37683767080307007, + "learning_rate": 7.418946296530151e-06, + "loss": 0.2816, + "step": 31408 + }, + { + "epoch": 0.583128808159719, + "grad_norm": 0.31889238953590393, + "learning_rate": 7.4178193478986074e-06, + "loss": 0.2946, + "step": 31410 + }, + { + "epoch": 0.5831659382971376, + "grad_norm": 0.4798712134361267, + "learning_rate": 7.4166924344028115e-06, + "loss": 0.279, + "step": 31412 + }, + { + "epoch": 0.5832030684345563, + "grad_norm": 0.3796563446521759, + "learning_rate": 7.415565556058091e-06, + "loss": 0.3548, + "step": 31414 + }, + { + "epoch": 0.583240198571975, + "grad_norm": 0.4634820520877838, + "learning_rate": 7.4144387128797855e-06, + "loss": 0.3351, + "step": 31416 + }, + { + "epoch": 0.5832773287093935, + "grad_norm": 0.40738263726234436, + "learning_rate": 7.4133119048832205e-06, + "loss": 0.1525, + "step": 31418 + }, + { + "epoch": 0.5833144588468122, + "grad_norm": 0.3201143145561218, + "learning_rate": 7.412185132083733e-06, + "loss": 0.2776, + "step": 31420 + }, + { + "epoch": 0.5833515889842308, + "grad_norm": 0.25892800092697144, + "learning_rate": 7.411058394496657e-06, + "loss": 0.2675, + "step": 31422 + }, + { + "epoch": 0.5833887191216495, + "grad_norm": 0.2496744990348816, + "learning_rate": 7.409931692137318e-06, + "loss": 0.1254, + "step": 31424 + }, + { + "epoch": 0.5834258492590682, + "grad_norm": 0.5938456654548645, + "learning_rate": 7.4088050250210505e-06, + "loss": 0.3177, + "step": 31426 + }, + { + "epoch": 0.5834629793964867, + "grad_norm": 0.38729992508888245, + "learning_rate": 7.407678393163191e-06, + "loss": 0.381, + "step": 31428 + }, + { + "epoch": 0.5835001095339054, + "grad_norm": 0.5222066640853882, + "learning_rate": 7.4065517965790576e-06, + "loss": 0.3827, + "step": 31430 + }, + { + "epoch": 0.583537239671324, + "grad_norm": 0.3405328094959259, + "learning_rate": 7.405425235283987e-06, + "loss": 0.2475, + "step": 31432 + }, + { + "epoch": 0.5835743698087427, + "grad_norm": 0.4006451666355133, + "learning_rate": 7.40429870929331e-06, + "loss": 0.2578, + "step": 31434 + }, + { + "epoch": 0.5836114999461613, + "grad_norm": 0.27298107743263245, + "learning_rate": 7.4031722186223485e-06, + "loss": 0.2662, + "step": 31436 + }, + { + "epoch": 0.5836486300835799, + "grad_norm": 0.42411088943481445, + "learning_rate": 7.402045763286437e-06, + "loss": 0.3057, + "step": 31438 + }, + { + "epoch": 0.5836857602209986, + "grad_norm": 0.3684844672679901, + "learning_rate": 7.400919343300903e-06, + "loss": 0.244, + "step": 31440 + }, + { + "epoch": 0.5837228903584172, + "grad_norm": 0.2758946716785431, + "learning_rate": 7.399792958681069e-06, + "loss": 0.198, + "step": 31442 + }, + { + "epoch": 0.5837600204958359, + "grad_norm": 0.4888532757759094, + "learning_rate": 7.398666609442263e-06, + "loss": 0.275, + "step": 31444 + }, + { + "epoch": 0.5837971506332545, + "grad_norm": 0.472589373588562, + "learning_rate": 7.3975402955998154e-06, + "loss": 0.3844, + "step": 31446 + }, + { + "epoch": 0.5838342807706731, + "grad_norm": 0.25237753987312317, + "learning_rate": 7.396414017169047e-06, + "loss": 0.3549, + "step": 31448 + }, + { + "epoch": 0.5838714109080918, + "grad_norm": 0.3862515091896057, + "learning_rate": 7.395287774165285e-06, + "loss": 0.3569, + "step": 31450 + }, + { + "epoch": 0.5839085410455104, + "grad_norm": 0.3240642249584198, + "learning_rate": 7.394161566603855e-06, + "loss": 0.3013, + "step": 31452 + }, + { + "epoch": 0.5839456711829291, + "grad_norm": 0.3982669711112976, + "learning_rate": 7.393035394500084e-06, + "loss": 0.1658, + "step": 31454 + }, + { + "epoch": 0.5839828013203476, + "grad_norm": 0.3717283308506012, + "learning_rate": 7.39190925786929e-06, + "loss": 0.2017, + "step": 31456 + }, + { + "epoch": 0.5840199314577663, + "grad_norm": 0.34135472774505615, + "learning_rate": 7.390783156726797e-06, + "loss": 0.1955, + "step": 31458 + }, + { + "epoch": 0.584057061595185, + "grad_norm": 0.5701931715011597, + "learning_rate": 7.389657091087931e-06, + "loss": 0.2218, + "step": 31460 + }, + { + "epoch": 0.5840941917326036, + "grad_norm": 0.3414704501628876, + "learning_rate": 7.388531060968011e-06, + "loss": 0.3175, + "step": 31462 + }, + { + "epoch": 0.5841313218700223, + "grad_norm": 0.5569100975990295, + "learning_rate": 7.387405066382366e-06, + "loss": 0.2615, + "step": 31464 + }, + { + "epoch": 0.5841684520074408, + "grad_norm": 0.43285226821899414, + "learning_rate": 7.38627910734631e-06, + "loss": 0.4792, + "step": 31466 + }, + { + "epoch": 0.5842055821448595, + "grad_norm": 0.3917894959449768, + "learning_rate": 7.385153183875168e-06, + "loss": 0.1886, + "step": 31468 + }, + { + "epoch": 0.5842427122822782, + "grad_norm": 0.7385019063949585, + "learning_rate": 7.384027295984255e-06, + "loss": 0.2173, + "step": 31470 + }, + { + "epoch": 0.5842798424196968, + "grad_norm": 0.43126580119132996, + "learning_rate": 7.3829014436888965e-06, + "loss": 0.4084, + "step": 31472 + }, + { + "epoch": 0.5843169725571155, + "grad_norm": 0.522849440574646, + "learning_rate": 7.38177562700441e-06, + "loss": 0.5275, + "step": 31474 + }, + { + "epoch": 0.584354102694534, + "grad_norm": 0.38137561082839966, + "learning_rate": 7.380649845946115e-06, + "loss": 0.643, + "step": 31476 + }, + { + "epoch": 0.5843912328319527, + "grad_norm": 0.4658193290233612, + "learning_rate": 7.379524100529334e-06, + "loss": 0.1522, + "step": 31478 + }, + { + "epoch": 0.5844283629693714, + "grad_norm": 0.4742436110973358, + "learning_rate": 7.378398390769374e-06, + "loss": 0.491, + "step": 31480 + }, + { + "epoch": 0.58446549310679, + "grad_norm": 0.4431532621383667, + "learning_rate": 7.377272716681563e-06, + "loss": 0.1469, + "step": 31482 + }, + { + "epoch": 0.5845026232442087, + "grad_norm": 0.3973146975040436, + "learning_rate": 7.37614707828121e-06, + "loss": 0.3057, + "step": 31484 + }, + { + "epoch": 0.5845397533816272, + "grad_norm": 0.45231929421424866, + "learning_rate": 7.375021475583641e-06, + "loss": 0.3976, + "step": 31486 + }, + { + "epoch": 0.5845768835190459, + "grad_norm": 0.5320724248886108, + "learning_rate": 7.373895908604165e-06, + "loss": 0.3556, + "step": 31488 + }, + { + "epoch": 0.5846140136564646, + "grad_norm": 0.3774247467517853, + "learning_rate": 7.372770377358098e-06, + "loss": 0.2147, + "step": 31490 + }, + { + "epoch": 0.5846511437938832, + "grad_norm": 0.3362526595592499, + "learning_rate": 7.371644881860762e-06, + "loss": 0.1999, + "step": 31492 + }, + { + "epoch": 0.5846882739313018, + "grad_norm": 0.32823148369789124, + "learning_rate": 7.370519422127463e-06, + "loss": 0.2647, + "step": 31494 + }, + { + "epoch": 0.5847254040687204, + "grad_norm": 0.525265634059906, + "learning_rate": 7.369393998173516e-06, + "loss": 0.0913, + "step": 31496 + }, + { + "epoch": 0.5847625342061391, + "grad_norm": 0.288187712430954, + "learning_rate": 7.368268610014241e-06, + "loss": 0.3261, + "step": 31498 + }, + { + "epoch": 0.5847996643435578, + "grad_norm": 0.4077187478542328, + "learning_rate": 7.3671432576649436e-06, + "loss": 0.2703, + "step": 31500 + }, + { + "epoch": 0.5848367944809764, + "grad_norm": 0.25891801714897156, + "learning_rate": 7.366017941140941e-06, + "loss": 0.339, + "step": 31502 + }, + { + "epoch": 0.584873924618395, + "grad_norm": 0.2599584758281708, + "learning_rate": 7.36489266045755e-06, + "loss": 0.1865, + "step": 31504 + }, + { + "epoch": 0.5849110547558136, + "grad_norm": 0.3618934750556946, + "learning_rate": 7.363767415630072e-06, + "loss": 0.3521, + "step": 31506 + }, + { + "epoch": 0.5849481848932323, + "grad_norm": 0.30013081431388855, + "learning_rate": 7.3626422066738245e-06, + "loss": 0.1472, + "step": 31508 + }, + { + "epoch": 0.5849853150306509, + "grad_norm": 0.34922075271606445, + "learning_rate": 7.361517033604114e-06, + "loss": 0.2098, + "step": 31510 + }, + { + "epoch": 0.5850224451680696, + "grad_norm": 0.2749541997909546, + "learning_rate": 7.360391896436254e-06, + "loss": 0.3397, + "step": 31512 + }, + { + "epoch": 0.5850595753054882, + "grad_norm": 0.5792484283447266, + "learning_rate": 7.359266795185554e-06, + "loss": 0.3064, + "step": 31514 + }, + { + "epoch": 0.5850967054429068, + "grad_norm": 0.2625385820865631, + "learning_rate": 7.358141729867328e-06, + "loss": 0.1844, + "step": 31516 + }, + { + "epoch": 0.5851338355803255, + "grad_norm": 0.18893569707870483, + "learning_rate": 7.3570167004968755e-06, + "loss": 0.1497, + "step": 31518 + }, + { + "epoch": 0.5851709657177441, + "grad_norm": 0.5120677947998047, + "learning_rate": 7.35589170708951e-06, + "loss": 0.4884, + "step": 31520 + }, + { + "epoch": 0.5852080958551628, + "grad_norm": 0.34654709696769714, + "learning_rate": 7.3547667496605365e-06, + "loss": 0.338, + "step": 31522 + }, + { + "epoch": 0.5852452259925814, + "grad_norm": 0.3771117925643921, + "learning_rate": 7.353641828225264e-06, + "loss": 0.2927, + "step": 31524 + }, + { + "epoch": 0.58528235613, + "grad_norm": 0.3346656560897827, + "learning_rate": 7.352516942799e-06, + "loss": 0.1749, + "step": 31526 + }, + { + "epoch": 0.5853194862674187, + "grad_norm": 0.36570876836776733, + "learning_rate": 7.351392093397055e-06, + "loss": 0.1835, + "step": 31528 + }, + { + "epoch": 0.5853566164048373, + "grad_norm": 0.6522613763809204, + "learning_rate": 7.350267280034726e-06, + "loss": 0.225, + "step": 31530 + }, + { + "epoch": 0.585393746542256, + "grad_norm": 0.3438888192176819, + "learning_rate": 7.3491425027273225e-06, + "loss": 0.3963, + "step": 31532 + }, + { + "epoch": 0.5854308766796746, + "grad_norm": 0.3448486030101776, + "learning_rate": 7.348017761490149e-06, + "loss": 0.2492, + "step": 31534 + }, + { + "epoch": 0.5854680068170932, + "grad_norm": 0.5372067093849182, + "learning_rate": 7.346893056338508e-06, + "loss": 0.3701, + "step": 31536 + }, + { + "epoch": 0.5855051369545119, + "grad_norm": 0.39993029832839966, + "learning_rate": 7.345768387287711e-06, + "loss": 0.2831, + "step": 31538 + }, + { + "epoch": 0.5855422670919305, + "grad_norm": 0.4859073758125305, + "learning_rate": 7.344643754353051e-06, + "loss": 0.1724, + "step": 31540 + }, + { + "epoch": 0.5855793972293492, + "grad_norm": 0.32795417308807373, + "learning_rate": 7.343519157549842e-06, + "loss": 0.3751, + "step": 31542 + }, + { + "epoch": 0.5856165273667678, + "grad_norm": 0.32972195744514465, + "learning_rate": 7.3423945968933755e-06, + "loss": 0.2058, + "step": 31544 + }, + { + "epoch": 0.5856536575041864, + "grad_norm": 0.2241780012845993, + "learning_rate": 7.341270072398957e-06, + "loss": 0.17, + "step": 31546 + }, + { + "epoch": 0.5856907876416051, + "grad_norm": 0.6067708730697632, + "learning_rate": 7.34014558408189e-06, + "loss": 0.4838, + "step": 31548 + }, + { + "epoch": 0.5857279177790237, + "grad_norm": 0.5196802020072937, + "learning_rate": 7.339021131957476e-06, + "loss": 0.2343, + "step": 31550 + }, + { + "epoch": 0.5857650479164423, + "grad_norm": 0.2722427546977997, + "learning_rate": 7.337896716041012e-06, + "loss": 0.2526, + "step": 31552 + }, + { + "epoch": 0.5858021780538609, + "grad_norm": 0.4690447449684143, + "learning_rate": 7.336772336347804e-06, + "loss": 0.2416, + "step": 31554 + }, + { + "epoch": 0.5858393081912796, + "grad_norm": 0.6072005033493042, + "learning_rate": 7.335647992893142e-06, + "loss": 0.2913, + "step": 31556 + }, + { + "epoch": 0.5858764383286983, + "grad_norm": 0.341249018907547, + "learning_rate": 7.334523685692332e-06, + "loss": 0.1736, + "step": 31558 + }, + { + "epoch": 0.5859135684661169, + "grad_norm": 0.3197457194328308, + "learning_rate": 7.333399414760669e-06, + "loss": 0.2647, + "step": 31560 + }, + { + "epoch": 0.5859506986035355, + "grad_norm": 0.3608039617538452, + "learning_rate": 7.332275180113455e-06, + "loss": 0.1225, + "step": 31562 + }, + { + "epoch": 0.5859878287409541, + "grad_norm": 0.27338674664497375, + "learning_rate": 7.331150981765984e-06, + "loss": 0.3106, + "step": 31564 + }, + { + "epoch": 0.5860249588783728, + "grad_norm": 0.4414764642715454, + "learning_rate": 7.330026819733558e-06, + "loss": 0.3487, + "step": 31566 + }, + { + "epoch": 0.5860620890157915, + "grad_norm": 0.5151707530021667, + "learning_rate": 7.328902694031465e-06, + "loss": 0.3351, + "step": 31568 + }, + { + "epoch": 0.5860992191532101, + "grad_norm": 0.413105845451355, + "learning_rate": 7.327778604675004e-06, + "loss": 0.248, + "step": 31570 + }, + { + "epoch": 0.5861363492906287, + "grad_norm": 0.27820658683776855, + "learning_rate": 7.326654551679475e-06, + "loss": 0.3669, + "step": 31572 + }, + { + "epoch": 0.5861734794280473, + "grad_norm": 0.4441945552825928, + "learning_rate": 7.325530535060169e-06, + "loss": 0.5604, + "step": 31574 + }, + { + "epoch": 0.586210609565466, + "grad_norm": 0.39767172932624817, + "learning_rate": 7.3244065548323805e-06, + "loss": 0.2985, + "step": 31576 + }, + { + "epoch": 0.5862477397028847, + "grad_norm": 0.6023899912834167, + "learning_rate": 7.323282611011404e-06, + "loss": 0.1445, + "step": 31578 + }, + { + "epoch": 0.5862848698403033, + "grad_norm": 0.3366793692111969, + "learning_rate": 7.322158703612538e-06, + "loss": 0.2848, + "step": 31580 + }, + { + "epoch": 0.5863219999777219, + "grad_norm": 0.3359677195549011, + "learning_rate": 7.3210348326510685e-06, + "loss": 0.2832, + "step": 31582 + }, + { + "epoch": 0.5863591301151405, + "grad_norm": 0.4678323268890381, + "learning_rate": 7.31991099814229e-06, + "loss": 0.4158, + "step": 31584 + }, + { + "epoch": 0.5863962602525592, + "grad_norm": 0.2143193930387497, + "learning_rate": 7.318787200101495e-06, + "loss": 0.3868, + "step": 31586 + }, + { + "epoch": 0.5864333903899779, + "grad_norm": 0.39950650930404663, + "learning_rate": 7.317663438543974e-06, + "loss": 0.2197, + "step": 31588 + }, + { + "epoch": 0.5864705205273965, + "grad_norm": 0.3779835104942322, + "learning_rate": 7.316539713485018e-06, + "loss": 0.1988, + "step": 31590 + }, + { + "epoch": 0.5865076506648151, + "grad_norm": 0.6571510434150696, + "learning_rate": 7.315416024939925e-06, + "loss": 0.3316, + "step": 31592 + }, + { + "epoch": 0.5865447808022337, + "grad_norm": 0.37568551301956177, + "learning_rate": 7.314292372923974e-06, + "loss": 0.3847, + "step": 31594 + }, + { + "epoch": 0.5865819109396524, + "grad_norm": 0.32305559515953064, + "learning_rate": 7.31316875745246e-06, + "loss": 0.3251, + "step": 31596 + }, + { + "epoch": 0.5866190410770711, + "grad_norm": 0.275055468082428, + "learning_rate": 7.31204517854067e-06, + "loss": 0.2718, + "step": 31598 + }, + { + "epoch": 0.5866561712144897, + "grad_norm": 0.5199087262153625, + "learning_rate": 7.310921636203893e-06, + "loss": 0.1347, + "step": 31600 + }, + { + "epoch": 0.5866933013519083, + "grad_norm": 0.4450320601463318, + "learning_rate": 7.30979813045742e-06, + "loss": 0.3419, + "step": 31602 + }, + { + "epoch": 0.5867304314893269, + "grad_norm": 0.3427060544490814, + "learning_rate": 7.308674661316537e-06, + "loss": 0.3143, + "step": 31604 + }, + { + "epoch": 0.5867675616267456, + "grad_norm": 0.3739967942237854, + "learning_rate": 7.30755122879653e-06, + "loss": 0.3336, + "step": 31606 + }, + { + "epoch": 0.5868046917641642, + "grad_norm": 0.5261752009391785, + "learning_rate": 7.306427832912685e-06, + "loss": 0.2162, + "step": 31608 + }, + { + "epoch": 0.5868418219015828, + "grad_norm": 0.4019750952720642, + "learning_rate": 7.305304473680288e-06, + "loss": 0.3365, + "step": 31610 + }, + { + "epoch": 0.5868789520390015, + "grad_norm": 0.4957427382469177, + "learning_rate": 7.304181151114626e-06, + "loss": 0.4845, + "step": 31612 + }, + { + "epoch": 0.5869160821764201, + "grad_norm": 0.25053849816322327, + "learning_rate": 7.3030578652309846e-06, + "loss": 0.2467, + "step": 31614 + }, + { + "epoch": 0.5869532123138388, + "grad_norm": 0.32958361506462097, + "learning_rate": 7.301934616044646e-06, + "loss": 0.4048, + "step": 31616 + }, + { + "epoch": 0.5869903424512574, + "grad_norm": 0.5286376476287842, + "learning_rate": 7.3008114035709e-06, + "loss": 0.3093, + "step": 31618 + }, + { + "epoch": 0.587027472588676, + "grad_norm": 0.3356854319572449, + "learning_rate": 7.2996882278250215e-06, + "loss": 0.2803, + "step": 31620 + }, + { + "epoch": 0.5870646027260947, + "grad_norm": 0.3413086235523224, + "learning_rate": 7.298565088822298e-06, + "loss": 0.206, + "step": 31622 + }, + { + "epoch": 0.5871017328635133, + "grad_norm": 0.4413459300994873, + "learning_rate": 7.297441986578013e-06, + "loss": 0.3098, + "step": 31624 + }, + { + "epoch": 0.587138863000932, + "grad_norm": 0.2945999801158905, + "learning_rate": 7.296318921107448e-06, + "loss": 0.4644, + "step": 31626 + }, + { + "epoch": 0.5871759931383506, + "grad_norm": 0.42358773946762085, + "learning_rate": 7.295195892425881e-06, + "loss": 0.413, + "step": 31628 + }, + { + "epoch": 0.5872131232757692, + "grad_norm": 0.42129024863243103, + "learning_rate": 7.294072900548602e-06, + "loss": 0.3324, + "step": 31630 + }, + { + "epoch": 0.5872502534131879, + "grad_norm": 0.298801988363266, + "learning_rate": 7.292949945490881e-06, + "loss": 0.3104, + "step": 31632 + }, + { + "epoch": 0.5872873835506065, + "grad_norm": 0.5141480565071106, + "learning_rate": 7.291827027268002e-06, + "loss": 0.2152, + "step": 31634 + }, + { + "epoch": 0.5873245136880252, + "grad_norm": 0.2527773380279541, + "learning_rate": 7.290704145895249e-06, + "loss": 0.3855, + "step": 31636 + }, + { + "epoch": 0.5873616438254438, + "grad_norm": 0.7944961786270142, + "learning_rate": 7.289581301387894e-06, + "loss": 0.4471, + "step": 31638 + }, + { + "epoch": 0.5873987739628624, + "grad_norm": 0.4543292820453644, + "learning_rate": 7.288458493761219e-06, + "loss": 0.462, + "step": 31640 + }, + { + "epoch": 0.5874359041002811, + "grad_norm": 0.3127201795578003, + "learning_rate": 7.2873357230305065e-06, + "loss": 0.2639, + "step": 31642 + }, + { + "epoch": 0.5874730342376997, + "grad_norm": 0.3707166314125061, + "learning_rate": 7.286212989211024e-06, + "loss": 0.169, + "step": 31644 + }, + { + "epoch": 0.5875101643751184, + "grad_norm": 0.27145740389823914, + "learning_rate": 7.285090292318057e-06, + "loss": 0.2026, + "step": 31646 + }, + { + "epoch": 0.587547294512537, + "grad_norm": 0.2943972647190094, + "learning_rate": 7.2839676323668775e-06, + "loss": 0.1889, + "step": 31648 + }, + { + "epoch": 0.5875844246499556, + "grad_norm": 0.4220932126045227, + "learning_rate": 7.282845009372762e-06, + "loss": 0.3244, + "step": 31650 + }, + { + "epoch": 0.5876215547873743, + "grad_norm": 0.32255157828330994, + "learning_rate": 7.281722423350987e-06, + "loss": 0.2239, + "step": 31652 + }, + { + "epoch": 0.5876586849247929, + "grad_norm": 0.3327600359916687, + "learning_rate": 7.280599874316833e-06, + "loss": 0.4165, + "step": 31654 + }, + { + "epoch": 0.5876958150622116, + "grad_norm": 0.21703441441059113, + "learning_rate": 7.279477362285564e-06, + "loss": 0.1965, + "step": 31656 + }, + { + "epoch": 0.5877329451996302, + "grad_norm": 0.3859167695045471, + "learning_rate": 7.278354887272462e-06, + "loss": 0.3525, + "step": 31658 + }, + { + "epoch": 0.5877700753370488, + "grad_norm": 0.2846670150756836, + "learning_rate": 7.2772324492927946e-06, + "loss": 0.2374, + "step": 31660 + }, + { + "epoch": 0.5878072054744674, + "grad_norm": 0.698136031627655, + "learning_rate": 7.2761100483618395e-06, + "loss": 0.4735, + "step": 31662 + }, + { + "epoch": 0.5878443356118861, + "grad_norm": 0.31667712330818176, + "learning_rate": 7.274987684494867e-06, + "loss": 0.1797, + "step": 31664 + }, + { + "epoch": 0.5878814657493048, + "grad_norm": 0.28017285466194153, + "learning_rate": 7.273865357707151e-06, + "loss": 0.2274, + "step": 31666 + }, + { + "epoch": 0.5879185958867233, + "grad_norm": 0.40834394097328186, + "learning_rate": 7.272743068013962e-06, + "loss": 0.1312, + "step": 31668 + }, + { + "epoch": 0.587955726024142, + "grad_norm": 0.674379825592041, + "learning_rate": 7.271620815430571e-06, + "loss": 0.225, + "step": 31670 + }, + { + "epoch": 0.5879928561615606, + "grad_norm": 0.28663137555122375, + "learning_rate": 7.270498599972246e-06, + "loss": 0.1194, + "step": 31672 + }, + { + "epoch": 0.5880299862989793, + "grad_norm": 0.308267742395401, + "learning_rate": 7.269376421654259e-06, + "loss": 0.1885, + "step": 31674 + }, + { + "epoch": 0.588067116436398, + "grad_norm": 0.2910693883895874, + "learning_rate": 7.2682542804918836e-06, + "loss": 0.2459, + "step": 31676 + }, + { + "epoch": 0.5881042465738165, + "grad_norm": 0.29778817296028137, + "learning_rate": 7.267132176500381e-06, + "loss": 0.3687, + "step": 31678 + }, + { + "epoch": 0.5881413767112352, + "grad_norm": 0.4061230719089508, + "learning_rate": 7.266010109695028e-06, + "loss": 0.4514, + "step": 31680 + }, + { + "epoch": 0.5881785068486538, + "grad_norm": 0.6901503801345825, + "learning_rate": 7.2648880800910845e-06, + "loss": 0.3275, + "step": 31682 + }, + { + "epoch": 0.5882156369860725, + "grad_norm": 0.33507126569747925, + "learning_rate": 7.263766087703822e-06, + "loss": 0.3252, + "step": 31684 + }, + { + "epoch": 0.5882527671234912, + "grad_norm": 0.41105884313583374, + "learning_rate": 7.2626441325485065e-06, + "loss": 0.2266, + "step": 31686 + }, + { + "epoch": 0.5882898972609097, + "grad_norm": 0.3871004581451416, + "learning_rate": 7.261522214640406e-06, + "loss": 0.0916, + "step": 31688 + }, + { + "epoch": 0.5883270273983284, + "grad_norm": 0.4171714782714844, + "learning_rate": 7.260400333994786e-06, + "loss": 0.4098, + "step": 31690 + }, + { + "epoch": 0.588364157535747, + "grad_norm": 0.4369262158870697, + "learning_rate": 7.2592784906269134e-06, + "loss": 0.4129, + "step": 31692 + }, + { + "epoch": 0.5884012876731657, + "grad_norm": 0.47549691796302795, + "learning_rate": 7.258156684552048e-06, + "loss": 0.1689, + "step": 31694 + }, + { + "epoch": 0.5884384178105844, + "grad_norm": 0.28676649928092957, + "learning_rate": 7.257034915785457e-06, + "loss": 0.1609, + "step": 31696 + }, + { + "epoch": 0.5884755479480029, + "grad_norm": 0.4710913896560669, + "learning_rate": 7.255913184342404e-06, + "loss": 0.2452, + "step": 31698 + }, + { + "epoch": 0.5885126780854216, + "grad_norm": 0.5212896466255188, + "learning_rate": 7.254791490238156e-06, + "loss": 0.5185, + "step": 31700 + }, + { + "epoch": 0.5885498082228402, + "grad_norm": 0.7548796534538269, + "learning_rate": 7.2536698334879704e-06, + "loss": 0.3305, + "step": 31702 + }, + { + "epoch": 0.5885869383602589, + "grad_norm": 0.2140730321407318, + "learning_rate": 7.2525482141071135e-06, + "loss": 0.1599, + "step": 31704 + }, + { + "epoch": 0.5886240684976775, + "grad_norm": 0.4729933738708496, + "learning_rate": 7.2514266321108484e-06, + "loss": 0.3249, + "step": 31706 + }, + { + "epoch": 0.5886611986350961, + "grad_norm": 0.47176676988601685, + "learning_rate": 7.250305087514431e-06, + "loss": 0.2916, + "step": 31708 + }, + { + "epoch": 0.5886983287725148, + "grad_norm": 0.3367743492126465, + "learning_rate": 7.2491835803331255e-06, + "loss": 0.3558, + "step": 31710 + }, + { + "epoch": 0.5887354589099334, + "grad_norm": 0.340731680393219, + "learning_rate": 7.24806211058219e-06, + "loss": 0.2012, + "step": 31712 + }, + { + "epoch": 0.5887725890473521, + "grad_norm": 0.5731931924819946, + "learning_rate": 7.246940678276888e-06, + "loss": 0.2506, + "step": 31714 + }, + { + "epoch": 0.5888097191847707, + "grad_norm": 0.3500525653362274, + "learning_rate": 7.245819283432476e-06, + "loss": 0.4298, + "step": 31716 + }, + { + "epoch": 0.5888468493221893, + "grad_norm": 0.3710630238056183, + "learning_rate": 7.244697926064217e-06, + "loss": 0.3968, + "step": 31718 + }, + { + "epoch": 0.588883979459608, + "grad_norm": 0.4012805223464966, + "learning_rate": 7.243576606187363e-06, + "loss": 0.2583, + "step": 31720 + }, + { + "epoch": 0.5889211095970266, + "grad_norm": 0.8543648719787598, + "learning_rate": 7.242455323817177e-06, + "loss": 0.3058, + "step": 31722 + }, + { + "epoch": 0.5889582397344453, + "grad_norm": 0.22527900338172913, + "learning_rate": 7.2413340789689114e-06, + "loss": 0.0547, + "step": 31724 + }, + { + "epoch": 0.5889953698718638, + "grad_norm": 0.5282771587371826, + "learning_rate": 7.2402128716578265e-06, + "loss": 0.4259, + "step": 31726 + }, + { + "epoch": 0.5890325000092825, + "grad_norm": 0.2811391353607178, + "learning_rate": 7.239091701899179e-06, + "loss": 0.4504, + "step": 31728 + }, + { + "epoch": 0.5890696301467012, + "grad_norm": 0.3888522684574127, + "learning_rate": 7.237970569708226e-06, + "loss": 0.3129, + "step": 31730 + }, + { + "epoch": 0.5891067602841198, + "grad_norm": 0.2887738049030304, + "learning_rate": 7.236849475100218e-06, + "loss": 0.0957, + "step": 31732 + }, + { + "epoch": 0.5891438904215385, + "grad_norm": 0.4300808608531952, + "learning_rate": 7.235728418090411e-06, + "loss": 0.3205, + "step": 31734 + }, + { + "epoch": 0.589181020558957, + "grad_norm": 0.3292039632797241, + "learning_rate": 7.23460739869406e-06, + "loss": 0.4345, + "step": 31736 + }, + { + "epoch": 0.5892181506963757, + "grad_norm": 0.42816850543022156, + "learning_rate": 7.2334864169264196e-06, + "loss": 0.4088, + "step": 31738 + }, + { + "epoch": 0.5892552808337944, + "grad_norm": 0.5426253080368042, + "learning_rate": 7.232365472802744e-06, + "loss": 0.3536, + "step": 31740 + }, + { + "epoch": 0.589292410971213, + "grad_norm": 0.24948692321777344, + "learning_rate": 7.2312445663382825e-06, + "loss": 0.3238, + "step": 31742 + }, + { + "epoch": 0.5893295411086317, + "grad_norm": 0.4598807394504547, + "learning_rate": 7.230123697548292e-06, + "loss": 0.2677, + "step": 31744 + }, + { + "epoch": 0.5893666712460502, + "grad_norm": 0.33556583523750305, + "learning_rate": 7.229002866448018e-06, + "loss": 0.2597, + "step": 31746 + }, + { + "epoch": 0.5894038013834689, + "grad_norm": 0.4495238661766052, + "learning_rate": 7.227882073052715e-06, + "loss": 0.2128, + "step": 31748 + }, + { + "epoch": 0.5894409315208876, + "grad_norm": 0.45971202850341797, + "learning_rate": 7.226761317377633e-06, + "loss": 0.4362, + "step": 31750 + }, + { + "epoch": 0.5894780616583062, + "grad_norm": 0.5171971321105957, + "learning_rate": 7.225640599438026e-06, + "loss": 0.1982, + "step": 31752 + }, + { + "epoch": 0.5895151917957249, + "grad_norm": 0.30257871747016907, + "learning_rate": 7.224519919249137e-06, + "loss": 0.2379, + "step": 31754 + }, + { + "epoch": 0.5895523219331434, + "grad_norm": 0.3692000210285187, + "learning_rate": 7.223399276826222e-06, + "loss": 0.1298, + "step": 31756 + }, + { + "epoch": 0.5895894520705621, + "grad_norm": 0.5173700451850891, + "learning_rate": 7.222278672184523e-06, + "loss": 0.226, + "step": 31758 + }, + { + "epoch": 0.5896265822079807, + "grad_norm": 0.6413261294364929, + "learning_rate": 7.22115810533929e-06, + "loss": 0.2394, + "step": 31760 + }, + { + "epoch": 0.5896637123453994, + "grad_norm": 0.3544710874557495, + "learning_rate": 7.220037576305775e-06, + "loss": 0.2489, + "step": 31762 + }, + { + "epoch": 0.5897008424828181, + "grad_norm": 0.4805065095424652, + "learning_rate": 7.2189170850992185e-06, + "loss": 0.4889, + "step": 31764 + }, + { + "epoch": 0.5897379726202366, + "grad_norm": 0.5835082530975342, + "learning_rate": 7.2177966317348704e-06, + "loss": 0.4198, + "step": 31766 + }, + { + "epoch": 0.5897751027576553, + "grad_norm": 0.3729493021965027, + "learning_rate": 7.2166762162279814e-06, + "loss": 0.2666, + "step": 31768 + }, + { + "epoch": 0.5898122328950739, + "grad_norm": 0.1929493099451065, + "learning_rate": 7.215555838593786e-06, + "loss": 0.1567, + "step": 31770 + }, + { + "epoch": 0.5898493630324926, + "grad_norm": 0.3131501376628876, + "learning_rate": 7.2144354988475375e-06, + "loss": 0.2633, + "step": 31772 + }, + { + "epoch": 0.5898864931699113, + "grad_norm": 0.4871968924999237, + "learning_rate": 7.213315197004479e-06, + "loss": 0.3372, + "step": 31774 + }, + { + "epoch": 0.5899236233073298, + "grad_norm": 0.3772599399089813, + "learning_rate": 7.212194933079853e-06, + "loss": 0.4734, + "step": 31776 + }, + { + "epoch": 0.5899607534447485, + "grad_norm": 0.26847440004348755, + "learning_rate": 7.2110747070889005e-06, + "loss": 0.1963, + "step": 31778 + }, + { + "epoch": 0.5899978835821671, + "grad_norm": 0.30690836906433105, + "learning_rate": 7.20995451904687e-06, + "loss": 0.3431, + "step": 31780 + }, + { + "epoch": 0.5900350137195858, + "grad_norm": 0.32768118381500244, + "learning_rate": 7.2088343689690045e-06, + "loss": 0.4049, + "step": 31782 + }, + { + "epoch": 0.5900721438570045, + "grad_norm": 0.48004692792892456, + "learning_rate": 7.207714256870542e-06, + "loss": 0.3866, + "step": 31784 + }, + { + "epoch": 0.590109273994423, + "grad_norm": 0.37556806206703186, + "learning_rate": 7.20659418276672e-06, + "loss": 0.4233, + "step": 31786 + }, + { + "epoch": 0.5901464041318417, + "grad_norm": 0.3070814311504364, + "learning_rate": 7.205474146672786e-06, + "loss": 0.3252, + "step": 31788 + }, + { + "epoch": 0.5901835342692603, + "grad_norm": 0.22532576322555542, + "learning_rate": 7.204354148603977e-06, + "loss": 0.2487, + "step": 31790 + }, + { + "epoch": 0.590220664406679, + "grad_norm": 0.2723808288574219, + "learning_rate": 7.203234188575537e-06, + "loss": 0.3176, + "step": 31792 + }, + { + "epoch": 0.5902577945440977, + "grad_norm": 0.42170941829681396, + "learning_rate": 7.202114266602702e-06, + "loss": 0.2805, + "step": 31794 + }, + { + "epoch": 0.5902949246815162, + "grad_norm": 0.3466981053352356, + "learning_rate": 7.2009943827007126e-06, + "loss": 0.2003, + "step": 31796 + }, + { + "epoch": 0.5903320548189349, + "grad_norm": 0.48853299021720886, + "learning_rate": 7.199874536884801e-06, + "loss": 0.1956, + "step": 31798 + }, + { + "epoch": 0.5903691849563535, + "grad_norm": 0.35713034868240356, + "learning_rate": 7.198754729170212e-06, + "loss": 0.1366, + "step": 31800 + }, + { + "epoch": 0.5904063150937722, + "grad_norm": 0.525822103023529, + "learning_rate": 7.19763495957218e-06, + "loss": 0.3041, + "step": 31802 + }, + { + "epoch": 0.5904434452311909, + "grad_norm": 0.2929930090904236, + "learning_rate": 7.196515228105943e-06, + "loss": 0.211, + "step": 31804 + }, + { + "epoch": 0.5904805753686094, + "grad_norm": 0.4529709815979004, + "learning_rate": 7.195395534786737e-06, + "loss": 0.339, + "step": 31806 + }, + { + "epoch": 0.5905177055060281, + "grad_norm": 0.30272164940834045, + "learning_rate": 7.194275879629797e-06, + "loss": 0.1839, + "step": 31808 + }, + { + "epoch": 0.5905548356434467, + "grad_norm": 0.4891115427017212, + "learning_rate": 7.193156262650356e-06, + "loss": 0.3068, + "step": 31810 + }, + { + "epoch": 0.5905919657808654, + "grad_norm": 0.30060839653015137, + "learning_rate": 7.19203668386365e-06, + "loss": 0.2607, + "step": 31812 + }, + { + "epoch": 0.5906290959182839, + "grad_norm": 0.44671860337257385, + "learning_rate": 7.1909171432849166e-06, + "loss": 0.2802, + "step": 31814 + }, + { + "epoch": 0.5906662260557026, + "grad_norm": 0.6356915235519409, + "learning_rate": 7.189797640929386e-06, + "loss": 0.3191, + "step": 31816 + }, + { + "epoch": 0.5907033561931213, + "grad_norm": 0.3474804162979126, + "learning_rate": 7.188678176812295e-06, + "loss": 0.3198, + "step": 31818 + }, + { + "epoch": 0.5907404863305399, + "grad_norm": 0.2997127175331116, + "learning_rate": 7.187558750948868e-06, + "loss": 0.4235, + "step": 31820 + }, + { + "epoch": 0.5907776164679586, + "grad_norm": 0.4708705246448517, + "learning_rate": 7.186439363354344e-06, + "loss": 0.3383, + "step": 31822 + }, + { + "epoch": 0.5908147466053771, + "grad_norm": 0.5596514344215393, + "learning_rate": 7.185320014043951e-06, + "loss": 0.348, + "step": 31824 + }, + { + "epoch": 0.5908518767427958, + "grad_norm": 0.2771008610725403, + "learning_rate": 7.184200703032925e-06, + "loss": 0.2181, + "step": 31826 + }, + { + "epoch": 0.5908890068802145, + "grad_norm": 0.19368411600589752, + "learning_rate": 7.183081430336491e-06, + "loss": 0.2349, + "step": 31828 + }, + { + "epoch": 0.5909261370176331, + "grad_norm": 0.3257860541343689, + "learning_rate": 7.18196219596988e-06, + "loss": 0.067, + "step": 31830 + }, + { + "epoch": 0.5909632671550518, + "grad_norm": 0.4943435788154602, + "learning_rate": 7.180842999948328e-06, + "loss": 0.6303, + "step": 31832 + }, + { + "epoch": 0.5910003972924703, + "grad_norm": 0.33831918239593506, + "learning_rate": 7.179723842287054e-06, + "loss": 0.289, + "step": 31834 + }, + { + "epoch": 0.591037527429889, + "grad_norm": 0.24263888597488403, + "learning_rate": 7.17860472300129e-06, + "loss": 0.1256, + "step": 31836 + }, + { + "epoch": 0.5910746575673077, + "grad_norm": 0.4024035632610321, + "learning_rate": 7.177485642106268e-06, + "loss": 0.2259, + "step": 31838 + }, + { + "epoch": 0.5911117877047263, + "grad_norm": 0.35957103967666626, + "learning_rate": 7.1763665996172084e-06, + "loss": 0.1774, + "step": 31840 + }, + { + "epoch": 0.591148917842145, + "grad_norm": 0.3907424807548523, + "learning_rate": 7.175247595549342e-06, + "loss": 0.272, + "step": 31842 + }, + { + "epoch": 0.5911860479795635, + "grad_norm": 0.2327420562505722, + "learning_rate": 7.1741286299179e-06, + "loss": 0.3407, + "step": 31844 + }, + { + "epoch": 0.5912231781169822, + "grad_norm": 0.2771170139312744, + "learning_rate": 7.173009702738097e-06, + "loss": 0.2861, + "step": 31846 + }, + { + "epoch": 0.5912603082544009, + "grad_norm": 0.6723152995109558, + "learning_rate": 7.171890814025168e-06, + "loss": 0.2464, + "step": 31848 + }, + { + "epoch": 0.5912974383918195, + "grad_norm": 0.34809747338294983, + "learning_rate": 7.17077196379433e-06, + "loss": 0.3478, + "step": 31850 + }, + { + "epoch": 0.5913345685292382, + "grad_norm": 0.352260023355484, + "learning_rate": 7.169653152060812e-06, + "loss": 0.1227, + "step": 31852 + }, + { + "epoch": 0.5913716986666567, + "grad_norm": 0.3397088944911957, + "learning_rate": 7.1685343788398374e-06, + "loss": 0.2267, + "step": 31854 + }, + { + "epoch": 0.5914088288040754, + "grad_norm": 0.3706021308898926, + "learning_rate": 7.167415644146631e-06, + "loss": 0.2992, + "step": 31856 + }, + { + "epoch": 0.591445958941494, + "grad_norm": 0.2912646234035492, + "learning_rate": 7.166296947996411e-06, + "loss": 0.2023, + "step": 31858 + }, + { + "epoch": 0.5914830890789127, + "grad_norm": 0.3106619417667389, + "learning_rate": 7.165178290404402e-06, + "loss": 0.3426, + "step": 31860 + }, + { + "epoch": 0.5915202192163314, + "grad_norm": 0.24913892149925232, + "learning_rate": 7.164059671385825e-06, + "loss": 0.2955, + "step": 31862 + }, + { + "epoch": 0.5915573493537499, + "grad_norm": 0.34791985154151917, + "learning_rate": 7.1629410909559e-06, + "loss": 0.3267, + "step": 31864 + }, + { + "epoch": 0.5915944794911686, + "grad_norm": 0.22536355257034302, + "learning_rate": 7.161822549129849e-06, + "loss": 0.2333, + "step": 31866 + }, + { + "epoch": 0.5916316096285872, + "grad_norm": 0.33451002836227417, + "learning_rate": 7.1607040459228924e-06, + "loss": 0.4063, + "step": 31868 + }, + { + "epoch": 0.5916687397660059, + "grad_norm": 0.2724950611591339, + "learning_rate": 7.159585581350253e-06, + "loss": 0.3091, + "step": 31870 + }, + { + "epoch": 0.5917058699034246, + "grad_norm": 0.31680744886398315, + "learning_rate": 7.158467155427142e-06, + "loss": 0.3817, + "step": 31872 + }, + { + "epoch": 0.5917430000408431, + "grad_norm": 0.32738280296325684, + "learning_rate": 7.1573487681687806e-06, + "loss": 0.454, + "step": 31874 + }, + { + "epoch": 0.5917801301782618, + "grad_norm": 0.3219420611858368, + "learning_rate": 7.156230419590388e-06, + "loss": 0.1233, + "step": 31876 + }, + { + "epoch": 0.5918172603156804, + "grad_norm": 0.4125814735889435, + "learning_rate": 7.155112109707183e-06, + "loss": 0.2718, + "step": 31878 + }, + { + "epoch": 0.5918543904530991, + "grad_norm": 0.5096237659454346, + "learning_rate": 7.153993838534379e-06, + "loss": 0.3371, + "step": 31880 + }, + { + "epoch": 0.5918915205905178, + "grad_norm": 0.5044049024581909, + "learning_rate": 7.152875606087198e-06, + "loss": 0.3438, + "step": 31882 + }, + { + "epoch": 0.5919286507279363, + "grad_norm": 0.25340163707733154, + "learning_rate": 7.151757412380849e-06, + "loss": 0.1829, + "step": 31884 + }, + { + "epoch": 0.591965780865355, + "grad_norm": 0.4396693706512451, + "learning_rate": 7.150639257430549e-06, + "loss": 0.1754, + "step": 31886 + }, + { + "epoch": 0.5920029110027736, + "grad_norm": 0.3053189516067505, + "learning_rate": 7.149521141251514e-06, + "loss": 0.4049, + "step": 31888 + }, + { + "epoch": 0.5920400411401923, + "grad_norm": 0.4439174234867096, + "learning_rate": 7.14840306385896e-06, + "loss": 0.2853, + "step": 31890 + }, + { + "epoch": 0.592077171277611, + "grad_norm": 0.5069766640663147, + "learning_rate": 7.147285025268096e-06, + "loss": 0.4567, + "step": 31892 + }, + { + "epoch": 0.5921143014150295, + "grad_norm": 0.357025146484375, + "learning_rate": 7.146167025494144e-06, + "loss": 0.2694, + "step": 31894 + }, + { + "epoch": 0.5921514315524482, + "grad_norm": 0.6058133244514465, + "learning_rate": 7.145049064552306e-06, + "loss": 0.3387, + "step": 31896 + }, + { + "epoch": 0.5921885616898668, + "grad_norm": 0.5639526844024658, + "learning_rate": 7.143931142457796e-06, + "loss": 0.0967, + "step": 31898 + }, + { + "epoch": 0.5922256918272855, + "grad_norm": 0.21593736112117767, + "learning_rate": 7.142813259225834e-06, + "loss": 0.1455, + "step": 31900 + }, + { + "epoch": 0.5922628219647041, + "grad_norm": 0.6025198101997375, + "learning_rate": 7.141695414871621e-06, + "loss": 0.362, + "step": 31902 + }, + { + "epoch": 0.5922999521021227, + "grad_norm": 0.4275647699832916, + "learning_rate": 7.140577609410373e-06, + "loss": 0.2282, + "step": 31904 + }, + { + "epoch": 0.5923370822395414, + "grad_norm": 0.47565558552742004, + "learning_rate": 7.139459842857297e-06, + "loss": 0.2381, + "step": 31906 + }, + { + "epoch": 0.59237421237696, + "grad_norm": 0.48718932271003723, + "learning_rate": 7.13834211522761e-06, + "loss": 0.341, + "step": 31908 + }, + { + "epoch": 0.5924113425143787, + "grad_norm": 0.4600697159767151, + "learning_rate": 7.137224426536511e-06, + "loss": 0.3368, + "step": 31910 + }, + { + "epoch": 0.5924484726517972, + "grad_norm": 0.3298058807849884, + "learning_rate": 7.136106776799214e-06, + "loss": 0.1267, + "step": 31912 + }, + { + "epoch": 0.5924856027892159, + "grad_norm": 0.2750627398490906, + "learning_rate": 7.134989166030924e-06, + "loss": 0.0969, + "step": 31914 + }, + { + "epoch": 0.5925227329266346, + "grad_norm": 0.46612748503685, + "learning_rate": 7.133871594246849e-06, + "loss": 0.3137, + "step": 31916 + }, + { + "epoch": 0.5925598630640532, + "grad_norm": 0.36794906854629517, + "learning_rate": 7.132754061462196e-06, + "loss": 0.378, + "step": 31918 + }, + { + "epoch": 0.5925969932014719, + "grad_norm": 0.33430978655815125, + "learning_rate": 7.131636567692177e-06, + "loss": 0.253, + "step": 31920 + }, + { + "epoch": 0.5926341233388904, + "grad_norm": 0.3982916474342346, + "learning_rate": 7.13051911295199e-06, + "loss": 0.2414, + "step": 31922 + }, + { + "epoch": 0.5926712534763091, + "grad_norm": 0.658501923084259, + "learning_rate": 7.129401697256841e-06, + "loss": 0.188, + "step": 31924 + }, + { + "epoch": 0.5927083836137278, + "grad_norm": 0.3841792345046997, + "learning_rate": 7.128284320621936e-06, + "loss": 0.3973, + "step": 31926 + }, + { + "epoch": 0.5927455137511464, + "grad_norm": 0.3503793478012085, + "learning_rate": 7.12716698306248e-06, + "loss": 0.3195, + "step": 31928 + }, + { + "epoch": 0.592782643888565, + "grad_norm": 0.4302903115749359, + "learning_rate": 7.126049684593679e-06, + "loss": 0.1836, + "step": 31930 + }, + { + "epoch": 0.5928197740259836, + "grad_norm": 0.3483423590660095, + "learning_rate": 7.124932425230733e-06, + "loss": 0.2631, + "step": 31932 + }, + { + "epoch": 0.5928569041634023, + "grad_norm": 0.4045327603816986, + "learning_rate": 7.123815204988844e-06, + "loss": 0.3444, + "step": 31934 + }, + { + "epoch": 0.592894034300821, + "grad_norm": 0.6940419673919678, + "learning_rate": 7.122698023883214e-06, + "loss": 0.2876, + "step": 31936 + }, + { + "epoch": 0.5929311644382396, + "grad_norm": 1.8124761581420898, + "learning_rate": 7.1215808819290445e-06, + "loss": 0.4224, + "step": 31938 + }, + { + "epoch": 0.5929682945756583, + "grad_norm": 0.3656772971153259, + "learning_rate": 7.120463779141537e-06, + "loss": 0.2066, + "step": 31940 + }, + { + "epoch": 0.5930054247130768, + "grad_norm": 0.32985472679138184, + "learning_rate": 7.119346715535896e-06, + "loss": 0.2833, + "step": 31942 + }, + { + "epoch": 0.5930425548504955, + "grad_norm": 0.3622593581676483, + "learning_rate": 7.118229691127315e-06, + "loss": 0.2956, + "step": 31944 + }, + { + "epoch": 0.5930796849879142, + "grad_norm": 0.3249266445636749, + "learning_rate": 7.1171127059309975e-06, + "loss": 0.3316, + "step": 31946 + }, + { + "epoch": 0.5931168151253328, + "grad_norm": 0.44524556398391724, + "learning_rate": 7.1159957599621385e-06, + "loss": 0.1417, + "step": 31948 + }, + { + "epoch": 0.5931539452627514, + "grad_norm": 0.3817630410194397, + "learning_rate": 7.114878853235939e-06, + "loss": 0.2935, + "step": 31950 + }, + { + "epoch": 0.59319107540017, + "grad_norm": 0.21469825506210327, + "learning_rate": 7.113761985767599e-06, + "loss": 0.2504, + "step": 31952 + }, + { + "epoch": 0.5932282055375887, + "grad_norm": 0.41692715883255005, + "learning_rate": 7.11264515757231e-06, + "loss": 0.3634, + "step": 31954 + }, + { + "epoch": 0.5932653356750074, + "grad_norm": 0.65981525182724, + "learning_rate": 7.111528368665272e-06, + "loss": 0.3637, + "step": 31956 + }, + { + "epoch": 0.593302465812426, + "grad_norm": 1.1483608484268188, + "learning_rate": 7.110411619061686e-06, + "loss": 0.2631, + "step": 31958 + }, + { + "epoch": 0.5933395959498446, + "grad_norm": 0.2627449333667755, + "learning_rate": 7.109294908776737e-06, + "loss": 0.2614, + "step": 31960 + }, + { + "epoch": 0.5933767260872632, + "grad_norm": 0.35787415504455566, + "learning_rate": 7.108178237825627e-06, + "loss": 0.3767, + "step": 31962 + }, + { + "epoch": 0.5934138562246819, + "grad_norm": 0.6688876152038574, + "learning_rate": 7.10706160622355e-06, + "loss": 0.352, + "step": 31964 + }, + { + "epoch": 0.5934509863621005, + "grad_norm": 0.47806447744369507, + "learning_rate": 7.105945013985698e-06, + "loss": 0.2748, + "step": 31966 + }, + { + "epoch": 0.5934881164995192, + "grad_norm": 0.4578014016151428, + "learning_rate": 7.104828461127264e-06, + "loss": 0.498, + "step": 31968 + }, + { + "epoch": 0.5935252466369378, + "grad_norm": 0.28616851568222046, + "learning_rate": 7.103711947663448e-06, + "loss": 0.1701, + "step": 31970 + }, + { + "epoch": 0.5935623767743564, + "grad_norm": 0.4376542568206787, + "learning_rate": 7.102595473609433e-06, + "loss": 0.27, + "step": 31972 + }, + { + "epoch": 0.5935995069117751, + "grad_norm": 0.47028684616088867, + "learning_rate": 7.1014790389804145e-06, + "loss": 0.3354, + "step": 31974 + }, + { + "epoch": 0.5936366370491937, + "grad_norm": 0.22200638055801392, + "learning_rate": 7.100362643791587e-06, + "loss": 0.1626, + "step": 31976 + }, + { + "epoch": 0.5936737671866124, + "grad_norm": 0.24875736236572266, + "learning_rate": 7.099246288058136e-06, + "loss": 0.2794, + "step": 31978 + }, + { + "epoch": 0.593710897324031, + "grad_norm": 0.5217177271842957, + "learning_rate": 7.098129971795253e-06, + "loss": 0.4267, + "step": 31980 + }, + { + "epoch": 0.5937480274614496, + "grad_norm": 0.3891518712043762, + "learning_rate": 7.097013695018136e-06, + "loss": 0.3193, + "step": 31982 + }, + { + "epoch": 0.5937851575988683, + "grad_norm": 0.362983375787735, + "learning_rate": 7.095897457741961e-06, + "loss": 0.3013, + "step": 31984 + }, + { + "epoch": 0.5938222877362869, + "grad_norm": 0.44616296887397766, + "learning_rate": 7.094781259981925e-06, + "loss": 0.2531, + "step": 31986 + }, + { + "epoch": 0.5938594178737056, + "grad_norm": 0.36302801966667175, + "learning_rate": 7.093665101753212e-06, + "loss": 0.2163, + "step": 31988 + }, + { + "epoch": 0.5938965480111242, + "grad_norm": 0.6121655702590942, + "learning_rate": 7.092548983071012e-06, + "loss": 0.3208, + "step": 31990 + }, + { + "epoch": 0.5939336781485428, + "grad_norm": 0.4383487403392792, + "learning_rate": 7.09143290395051e-06, + "loss": 0.2518, + "step": 31992 + }, + { + "epoch": 0.5939708082859615, + "grad_norm": 0.40191033482551575, + "learning_rate": 7.0903168644068976e-06, + "loss": 0.2433, + "step": 31994 + }, + { + "epoch": 0.5940079384233801, + "grad_norm": 0.45267802476882935, + "learning_rate": 7.089200864455357e-06, + "loss": 0.2352, + "step": 31996 + }, + { + "epoch": 0.5940450685607988, + "grad_norm": 0.3091062605381012, + "learning_rate": 7.088084904111073e-06, + "loss": 0.2829, + "step": 31998 + }, + { + "epoch": 0.5940821986982174, + "grad_norm": 0.4867989122867584, + "learning_rate": 7.086968983389229e-06, + "loss": 0.2713, + "step": 32000 + }, + { + "epoch": 0.594119328835636, + "grad_norm": 0.4080210030078888, + "learning_rate": 7.085853102305014e-06, + "loss": 0.2724, + "step": 32002 + }, + { + "epoch": 0.5941564589730547, + "grad_norm": 0.39662978053092957, + "learning_rate": 7.084737260873606e-06, + "loss": 0.2898, + "step": 32004 + }, + { + "epoch": 0.5941935891104733, + "grad_norm": 0.43277570605278015, + "learning_rate": 7.083621459110196e-06, + "loss": 0.2786, + "step": 32006 + }, + { + "epoch": 0.594230719247892, + "grad_norm": 0.5412875413894653, + "learning_rate": 7.082505697029964e-06, + "loss": 0.243, + "step": 32008 + }, + { + "epoch": 0.5942678493853105, + "grad_norm": 0.5167099237442017, + "learning_rate": 7.081389974648086e-06, + "loss": 0.1281, + "step": 32010 + }, + { + "epoch": 0.5943049795227292, + "grad_norm": 0.44475677609443665, + "learning_rate": 7.080274291979748e-06, + "loss": 0.2883, + "step": 32012 + }, + { + "epoch": 0.5943421096601479, + "grad_norm": 0.32310712337493896, + "learning_rate": 7.079158649040132e-06, + "loss": 0.3817, + "step": 32014 + }, + { + "epoch": 0.5943792397975665, + "grad_norm": 0.35012197494506836, + "learning_rate": 7.078043045844421e-06, + "loss": 0.2649, + "step": 32016 + }, + { + "epoch": 0.5944163699349851, + "grad_norm": 0.5891090035438538, + "learning_rate": 7.076927482407787e-06, + "loss": 0.2092, + "step": 32018 + }, + { + "epoch": 0.5944535000724037, + "grad_norm": 0.466403067111969, + "learning_rate": 7.07581195874542e-06, + "loss": 0.2657, + "step": 32020 + }, + { + "epoch": 0.5944906302098224, + "grad_norm": 0.4455684721469879, + "learning_rate": 7.074696474872489e-06, + "loss": 0.2382, + "step": 32022 + }, + { + "epoch": 0.5945277603472411, + "grad_norm": 0.3129173517227173, + "learning_rate": 7.073581030804178e-06, + "loss": 0.2216, + "step": 32024 + }, + { + "epoch": 0.5945648904846597, + "grad_norm": 0.3589586317539215, + "learning_rate": 7.072465626555661e-06, + "loss": 0.1706, + "step": 32026 + }, + { + "epoch": 0.5946020206220783, + "grad_norm": 0.4635947048664093, + "learning_rate": 7.071350262142122e-06, + "loss": 0.2965, + "step": 32028 + }, + { + "epoch": 0.5946391507594969, + "grad_norm": 0.25965920090675354, + "learning_rate": 7.070234937578731e-06, + "loss": 0.3181, + "step": 32030 + }, + { + "epoch": 0.5946762808969156, + "grad_norm": 0.41095206141471863, + "learning_rate": 7.0691196528806664e-06, + "loss": 0.3135, + "step": 32032 + }, + { + "epoch": 0.5947134110343343, + "grad_norm": 0.3606607913970947, + "learning_rate": 7.068004408063108e-06, + "loss": 0.3218, + "step": 32034 + }, + { + "epoch": 0.5947505411717529, + "grad_norm": 0.4844478368759155, + "learning_rate": 7.066889203141224e-06, + "loss": 0.4048, + "step": 32036 + }, + { + "epoch": 0.5947876713091715, + "grad_norm": 0.2948613464832306, + "learning_rate": 7.065774038130194e-06, + "loss": 0.1446, + "step": 32038 + }, + { + "epoch": 0.5948248014465901, + "grad_norm": 0.4373939037322998, + "learning_rate": 7.064658913045188e-06, + "loss": 0.3039, + "step": 32040 + }, + { + "epoch": 0.5948619315840088, + "grad_norm": 0.3036930561065674, + "learning_rate": 7.063543827901382e-06, + "loss": 0.4233, + "step": 32042 + }, + { + "epoch": 0.5948990617214275, + "grad_norm": 0.589603841304779, + "learning_rate": 7.0624287827139505e-06, + "loss": 0.3637, + "step": 32044 + }, + { + "epoch": 0.594936191858846, + "grad_norm": 0.41544055938720703, + "learning_rate": 7.061313777498066e-06, + "loss": 0.3079, + "step": 32046 + }, + { + "epoch": 0.5949733219962647, + "grad_norm": 0.33734944462776184, + "learning_rate": 7.060198812268895e-06, + "loss": 0.1934, + "step": 32048 + }, + { + "epoch": 0.5950104521336833, + "grad_norm": 0.39886340498924255, + "learning_rate": 7.059083887041616e-06, + "loss": 0.2439, + "step": 32050 + }, + { + "epoch": 0.595047582271102, + "grad_norm": 0.4452390968799591, + "learning_rate": 7.057969001831393e-06, + "loss": 0.4094, + "step": 32052 + }, + { + "epoch": 0.5950847124085207, + "grad_norm": 0.36419904232025146, + "learning_rate": 7.056854156653399e-06, + "loss": 0.2322, + "step": 32054 + }, + { + "epoch": 0.5951218425459393, + "grad_norm": 0.7428411841392517, + "learning_rate": 7.055739351522803e-06, + "loss": 0.389, + "step": 32056 + }, + { + "epoch": 0.5951589726833579, + "grad_norm": 0.3975401222705841, + "learning_rate": 7.054624586454782e-06, + "loss": 0.1818, + "step": 32058 + }, + { + "epoch": 0.5951961028207765, + "grad_norm": 0.3636437654495239, + "learning_rate": 7.0535098614644955e-06, + "loss": 0.2116, + "step": 32060 + }, + { + "epoch": 0.5952332329581952, + "grad_norm": 0.5471928715705872, + "learning_rate": 7.0523951765671105e-06, + "loss": 0.2247, + "step": 32062 + }, + { + "epoch": 0.5952703630956138, + "grad_norm": 0.30913886427879333, + "learning_rate": 7.0512805317778e-06, + "loss": 0.276, + "step": 32064 + }, + { + "epoch": 0.5953074932330324, + "grad_norm": 0.26910400390625, + "learning_rate": 7.0501659271117275e-06, + "loss": 0.2427, + "step": 32066 + }, + { + "epoch": 0.5953446233704511, + "grad_norm": 0.3515623211860657, + "learning_rate": 7.049051362584064e-06, + "loss": 0.2653, + "step": 32068 + }, + { + "epoch": 0.5953817535078697, + "grad_norm": 0.487458735704422, + "learning_rate": 7.04793683820997e-06, + "loss": 0.3528, + "step": 32070 + }, + { + "epoch": 0.5954188836452884, + "grad_norm": 0.2506726086139679, + "learning_rate": 7.046822354004617e-06, + "loss": 0.2607, + "step": 32072 + }, + { + "epoch": 0.595456013782707, + "grad_norm": 0.2581019997596741, + "learning_rate": 7.045707909983161e-06, + "loss": 0.2905, + "step": 32074 + }, + { + "epoch": 0.5954931439201256, + "grad_norm": 0.24956727027893066, + "learning_rate": 7.044593506160773e-06, + "loss": 0.2039, + "step": 32076 + }, + { + "epoch": 0.5955302740575443, + "grad_norm": 0.3519080579280853, + "learning_rate": 7.043479142552614e-06, + "loss": 0.3707, + "step": 32078 + }, + { + "epoch": 0.5955674041949629, + "grad_norm": 0.501090407371521, + "learning_rate": 7.04236481917385e-06, + "loss": 0.1642, + "step": 32080 + }, + { + "epoch": 0.5956045343323816, + "grad_norm": 0.5079665780067444, + "learning_rate": 7.041250536039641e-06, + "loss": 0.3733, + "step": 32082 + }, + { + "epoch": 0.5956416644698002, + "grad_norm": 0.36113592982292175, + "learning_rate": 7.040136293165152e-06, + "loss": 0.2956, + "step": 32084 + }, + { + "epoch": 0.5956787946072188, + "grad_norm": 0.3640531599521637, + "learning_rate": 7.0390220905655395e-06, + "loss": 0.1521, + "step": 32086 + }, + { + "epoch": 0.5957159247446375, + "grad_norm": 0.32021060585975647, + "learning_rate": 7.037907928255966e-06, + "loss": 0.2449, + "step": 32088 + }, + { + "epoch": 0.5957530548820561, + "grad_norm": 0.20007668435573578, + "learning_rate": 7.036793806251594e-06, + "loss": 0.2787, + "step": 32090 + }, + { + "epoch": 0.5957901850194748, + "grad_norm": 0.2692842185497284, + "learning_rate": 7.035679724567583e-06, + "loss": 0.3178, + "step": 32092 + }, + { + "epoch": 0.5958273151568934, + "grad_norm": 0.2839367687702179, + "learning_rate": 7.034565683219092e-06, + "loss": 0.3523, + "step": 32094 + }, + { + "epoch": 0.595864445294312, + "grad_norm": 0.283980131149292, + "learning_rate": 7.033451682221282e-06, + "loss": 0.3268, + "step": 32096 + }, + { + "epoch": 0.5959015754317307, + "grad_norm": 0.2893757224082947, + "learning_rate": 7.032337721589305e-06, + "loss": 0.2633, + "step": 32098 + }, + { + "epoch": 0.5959387055691493, + "grad_norm": 0.6066461205482483, + "learning_rate": 7.031223801338323e-06, + "loss": 0.3882, + "step": 32100 + }, + { + "epoch": 0.595975835706568, + "grad_norm": 0.43130892515182495, + "learning_rate": 7.030109921483495e-06, + "loss": 0.1755, + "step": 32102 + }, + { + "epoch": 0.5960129658439866, + "grad_norm": 0.5042694807052612, + "learning_rate": 7.028996082039971e-06, + "loss": 0.3617, + "step": 32104 + }, + { + "epoch": 0.5960500959814052, + "grad_norm": 0.3423978388309479, + "learning_rate": 7.027882283022913e-06, + "loss": 0.2971, + "step": 32106 + }, + { + "epoch": 0.5960872261188239, + "grad_norm": 0.2865414023399353, + "learning_rate": 7.026768524447478e-06, + "loss": 0.2837, + "step": 32108 + }, + { + "epoch": 0.5961243562562425, + "grad_norm": 0.3856591582298279, + "learning_rate": 7.025654806328813e-06, + "loss": 0.498, + "step": 32110 + }, + { + "epoch": 0.5961614863936612, + "grad_norm": 0.3129841387271881, + "learning_rate": 7.024541128682079e-06, + "loss": 0.3237, + "step": 32112 + }, + { + "epoch": 0.5961986165310798, + "grad_norm": 0.3520040810108185, + "learning_rate": 7.023427491522427e-06, + "loss": 0.0968, + "step": 32114 + }, + { + "epoch": 0.5962357466684984, + "grad_norm": 0.19297181069850922, + "learning_rate": 7.022313894865009e-06, + "loss": 0.1976, + "step": 32116 + }, + { + "epoch": 0.596272876805917, + "grad_norm": 0.41210314631462097, + "learning_rate": 7.021200338724981e-06, + "loss": 0.1956, + "step": 32118 + }, + { + "epoch": 0.5963100069433357, + "grad_norm": 0.4435357451438904, + "learning_rate": 7.0200868231174946e-06, + "loss": 0.2347, + "step": 32120 + }, + { + "epoch": 0.5963471370807544, + "grad_norm": 0.3743399977684021, + "learning_rate": 7.018973348057704e-06, + "loss": 0.3986, + "step": 32122 + }, + { + "epoch": 0.596384267218173, + "grad_norm": 0.5370433926582336, + "learning_rate": 7.0178599135607535e-06, + "loss": 0.3055, + "step": 32124 + }, + { + "epoch": 0.5964213973555916, + "grad_norm": 0.591663122177124, + "learning_rate": 7.016746519641797e-06, + "loss": 0.3292, + "step": 32126 + }, + { + "epoch": 0.5964585274930102, + "grad_norm": 0.3489430546760559, + "learning_rate": 7.0156331663159836e-06, + "loss": 0.416, + "step": 32128 + }, + { + "epoch": 0.5964956576304289, + "grad_norm": 0.4691818356513977, + "learning_rate": 7.014519853598464e-06, + "loss": 0.2898, + "step": 32130 + }, + { + "epoch": 0.5965327877678476, + "grad_norm": 0.4787689745426178, + "learning_rate": 7.013406581504388e-06, + "loss": 0.2008, + "step": 32132 + }, + { + "epoch": 0.5965699179052661, + "grad_norm": 0.33664363622665405, + "learning_rate": 7.012293350048903e-06, + "loss": 0.3116, + "step": 32134 + }, + { + "epoch": 0.5966070480426848, + "grad_norm": 0.30854278802871704, + "learning_rate": 7.011180159247156e-06, + "loss": 0.4419, + "step": 32136 + }, + { + "epoch": 0.5966441781801034, + "grad_norm": 0.41785240173339844, + "learning_rate": 7.010067009114293e-06, + "loss": 0.2482, + "step": 32138 + }, + { + "epoch": 0.5966813083175221, + "grad_norm": 0.284810870885849, + "learning_rate": 7.008953899665461e-06, + "loss": 0.148, + "step": 32140 + }, + { + "epoch": 0.5967184384549408, + "grad_norm": 0.3168129026889801, + "learning_rate": 7.007840830915809e-06, + "loss": 0.2592, + "step": 32142 + }, + { + "epoch": 0.5967555685923593, + "grad_norm": 0.3707023859024048, + "learning_rate": 7.006727802880482e-06, + "loss": 0.4346, + "step": 32144 + }, + { + "epoch": 0.596792698729778, + "grad_norm": 0.5273804068565369, + "learning_rate": 7.005614815574624e-06, + "loss": 0.2916, + "step": 32146 + }, + { + "epoch": 0.5968298288671966, + "grad_norm": 0.3138352036476135, + "learning_rate": 7.004501869013377e-06, + "loss": 0.1423, + "step": 32148 + }, + { + "epoch": 0.5968669590046153, + "grad_norm": 0.3856821060180664, + "learning_rate": 7.003388963211887e-06, + "loss": 0.2476, + "step": 32150 + }, + { + "epoch": 0.596904089142034, + "grad_norm": 0.2853991985321045, + "learning_rate": 7.002276098185296e-06, + "loss": 0.2763, + "step": 32152 + }, + { + "epoch": 0.5969412192794525, + "grad_norm": 0.3874876797199249, + "learning_rate": 7.001163273948752e-06, + "loss": 0.2588, + "step": 32154 + }, + { + "epoch": 0.5969783494168712, + "grad_norm": 0.4165259003639221, + "learning_rate": 7.00005049051739e-06, + "loss": 0.1325, + "step": 32156 + }, + { + "epoch": 0.5970154795542898, + "grad_norm": 0.8187820911407471, + "learning_rate": 6.998937747906355e-06, + "loss": 0.3999, + "step": 32158 + }, + { + "epoch": 0.5970526096917085, + "grad_norm": 0.3733706474304199, + "learning_rate": 6.997825046130793e-06, + "loss": 0.2813, + "step": 32160 + }, + { + "epoch": 0.597089739829127, + "grad_norm": 0.8290793299674988, + "learning_rate": 6.996712385205834e-06, + "loss": 0.2395, + "step": 32162 + }, + { + "epoch": 0.5971268699665457, + "grad_norm": 0.47913289070129395, + "learning_rate": 6.995599765146624e-06, + "loss": 0.3609, + "step": 32164 + }, + { + "epoch": 0.5971640001039644, + "grad_norm": 0.3290073275566101, + "learning_rate": 6.994487185968304e-06, + "loss": 0.2946, + "step": 32166 + }, + { + "epoch": 0.597201130241383, + "grad_norm": 0.4065541923046112, + "learning_rate": 6.99337464768601e-06, + "loss": 0.3089, + "step": 32168 + }, + { + "epoch": 0.5972382603788017, + "grad_norm": 0.639032781124115, + "learning_rate": 6.99226215031488e-06, + "loss": 0.2243, + "step": 32170 + }, + { + "epoch": 0.5972753905162203, + "grad_norm": 0.5234506130218506, + "learning_rate": 6.9911496938700574e-06, + "loss": 0.2831, + "step": 32172 + }, + { + "epoch": 0.5973125206536389, + "grad_norm": 0.4382260739803314, + "learning_rate": 6.99003727836667e-06, + "loss": 0.167, + "step": 32174 + }, + { + "epoch": 0.5973496507910576, + "grad_norm": 0.5304849743843079, + "learning_rate": 6.988924903819862e-06, + "loss": 0.2818, + "step": 32176 + }, + { + "epoch": 0.5973867809284762, + "grad_norm": 0.2650459408760071, + "learning_rate": 6.987812570244765e-06, + "loss": 0.2675, + "step": 32178 + }, + { + "epoch": 0.5974239110658949, + "grad_norm": 0.43515804409980774, + "learning_rate": 6.986700277656517e-06, + "loss": 0.3576, + "step": 32180 + }, + { + "epoch": 0.5974610412033134, + "grad_norm": 0.2797057330608368, + "learning_rate": 6.985588026070251e-06, + "loss": 0.2644, + "step": 32182 + }, + { + "epoch": 0.5974981713407321, + "grad_norm": 0.32745155692100525, + "learning_rate": 6.984475815501108e-06, + "loss": 0.1974, + "step": 32184 + }, + { + "epoch": 0.5975353014781508, + "grad_norm": 0.7134277820587158, + "learning_rate": 6.9833636459642116e-06, + "loss": 0.409, + "step": 32186 + }, + { + "epoch": 0.5975724316155694, + "grad_norm": 0.21984697878360748, + "learning_rate": 6.982251517474703e-06, + "loss": 0.2418, + "step": 32188 + }, + { + "epoch": 0.5976095617529881, + "grad_norm": 0.4138549268245697, + "learning_rate": 6.98113943004771e-06, + "loss": 0.359, + "step": 32190 + }, + { + "epoch": 0.5976466918904066, + "grad_norm": 0.4999110996723175, + "learning_rate": 6.980027383698366e-06, + "loss": 0.1718, + "step": 32192 + }, + { + "epoch": 0.5976838220278253, + "grad_norm": 0.6186845302581787, + "learning_rate": 6.978915378441804e-06, + "loss": 0.3815, + "step": 32194 + }, + { + "epoch": 0.597720952165244, + "grad_norm": 0.4924602806568146, + "learning_rate": 6.977803414293156e-06, + "loss": 0.2831, + "step": 32196 + }, + { + "epoch": 0.5977580823026626, + "grad_norm": 0.5660045146942139, + "learning_rate": 6.976691491267551e-06, + "loss": 0.2748, + "step": 32198 + }, + { + "epoch": 0.5977952124400813, + "grad_norm": 0.38521608710289, + "learning_rate": 6.975579609380119e-06, + "loss": 0.2882, + "step": 32200 + }, + { + "epoch": 0.5978323425774998, + "grad_norm": 0.4365895390510559, + "learning_rate": 6.974467768645989e-06, + "loss": 0.1925, + "step": 32202 + }, + { + "epoch": 0.5978694727149185, + "grad_norm": 0.37729278206825256, + "learning_rate": 6.973355969080288e-06, + "loss": 0.3939, + "step": 32204 + }, + { + "epoch": 0.5979066028523372, + "grad_norm": 0.5381395816802979, + "learning_rate": 6.972244210698149e-06, + "loss": 0.3173, + "step": 32206 + }, + { + "epoch": 0.5979437329897558, + "grad_norm": 0.32999387383461, + "learning_rate": 6.971132493514696e-06, + "loss": 0.2296, + "step": 32208 + }, + { + "epoch": 0.5979808631271745, + "grad_norm": 0.4245683252811432, + "learning_rate": 6.97002081754506e-06, + "loss": 0.4875, + "step": 32210 + }, + { + "epoch": 0.598017993264593, + "grad_norm": 0.2963868975639343, + "learning_rate": 6.968909182804362e-06, + "loss": 0.2813, + "step": 32212 + }, + { + "epoch": 0.5980551234020117, + "grad_norm": 0.40088337659835815, + "learning_rate": 6.96779758930773e-06, + "loss": 0.245, + "step": 32214 + }, + { + "epoch": 0.5980922535394303, + "grad_norm": 0.27033665776252747, + "learning_rate": 6.966686037070291e-06, + "loss": 0.2149, + "step": 32216 + }, + { + "epoch": 0.598129383676849, + "grad_norm": 0.25479933619499207, + "learning_rate": 6.965574526107171e-06, + "loss": 0.2841, + "step": 32218 + }, + { + "epoch": 0.5981665138142677, + "grad_norm": 0.7035544514656067, + "learning_rate": 6.964463056433489e-06, + "loss": 0.1424, + "step": 32220 + }, + { + "epoch": 0.5982036439516862, + "grad_norm": 0.38962796330451965, + "learning_rate": 6.963351628064378e-06, + "loss": 0.1934, + "step": 32222 + }, + { + "epoch": 0.5982407740891049, + "grad_norm": 0.3113766014575958, + "learning_rate": 6.962240241014952e-06, + "loss": 0.3995, + "step": 32224 + }, + { + "epoch": 0.5982779042265235, + "grad_norm": 0.36976194381713867, + "learning_rate": 6.9611288953003355e-06, + "loss": 0.2408, + "step": 32226 + }, + { + "epoch": 0.5983150343639422, + "grad_norm": 0.2940559685230255, + "learning_rate": 6.960017590935653e-06, + "loss": 0.4856, + "step": 32228 + }, + { + "epoch": 0.5983521645013609, + "grad_norm": 0.22092384099960327, + "learning_rate": 6.958906327936028e-06, + "loss": 0.3329, + "step": 32230 + }, + { + "epoch": 0.5983892946387794, + "grad_norm": 0.6442883610725403, + "learning_rate": 6.957795106316576e-06, + "loss": 0.2353, + "step": 32232 + }, + { + "epoch": 0.5984264247761981, + "grad_norm": 0.40700411796569824, + "learning_rate": 6.956683926092425e-06, + "loss": 0.2563, + "step": 32234 + }, + { + "epoch": 0.5984635549136167, + "grad_norm": 0.43985840678215027, + "learning_rate": 6.955572787278684e-06, + "loss": 0.3136, + "step": 32236 + }, + { + "epoch": 0.5985006850510354, + "grad_norm": 0.3280580937862396, + "learning_rate": 6.954461689890479e-06, + "loss": 0.3723, + "step": 32238 + }, + { + "epoch": 0.5985378151884541, + "grad_norm": 0.2075342983007431, + "learning_rate": 6.953350633942932e-06, + "loss": 0.37, + "step": 32240 + }, + { + "epoch": 0.5985749453258726, + "grad_norm": 0.14586514234542847, + "learning_rate": 6.952239619451153e-06, + "loss": 0.0896, + "step": 32242 + }, + { + "epoch": 0.5986120754632913, + "grad_norm": 0.3405345380306244, + "learning_rate": 6.951128646430264e-06, + "loss": 0.4005, + "step": 32244 + }, + { + "epoch": 0.5986492056007099, + "grad_norm": 0.2208608239889145, + "learning_rate": 6.950017714895382e-06, + "loss": 0.2462, + "step": 32246 + }, + { + "epoch": 0.5986863357381286, + "grad_norm": 0.33383479714393616, + "learning_rate": 6.948906824861628e-06, + "loss": 0.3729, + "step": 32248 + }, + { + "epoch": 0.5987234658755473, + "grad_norm": 0.34075966477394104, + "learning_rate": 6.9477959763441076e-06, + "loss": 0.2483, + "step": 32250 + }, + { + "epoch": 0.5987605960129658, + "grad_norm": 0.3031713366508484, + "learning_rate": 6.946685169357943e-06, + "loss": 0.4888, + "step": 32252 + }, + { + "epoch": 0.5987977261503845, + "grad_norm": 0.3562350869178772, + "learning_rate": 6.945574403918247e-06, + "loss": 0.3611, + "step": 32254 + }, + { + "epoch": 0.5988348562878031, + "grad_norm": 0.2112765610218048, + "learning_rate": 6.944463680040135e-06, + "loss": 0.1677, + "step": 32256 + }, + { + "epoch": 0.5988719864252218, + "grad_norm": 0.4124261736869812, + "learning_rate": 6.943352997738717e-06, + "loss": 0.3567, + "step": 32258 + }, + { + "epoch": 0.5989091165626405, + "grad_norm": 0.2734866738319397, + "learning_rate": 6.9422423570291155e-06, + "loss": 0.2741, + "step": 32260 + }, + { + "epoch": 0.598946246700059, + "grad_norm": 0.2706665098667145, + "learning_rate": 6.941131757926434e-06, + "loss": 0.245, + "step": 32262 + }, + { + "epoch": 0.5989833768374777, + "grad_norm": 0.3233645260334015, + "learning_rate": 6.940021200445784e-06, + "loss": 0.3452, + "step": 32264 + }, + { + "epoch": 0.5990205069748963, + "grad_norm": 0.516941487789154, + "learning_rate": 6.93891068460228e-06, + "loss": 0.3022, + "step": 32266 + }, + { + "epoch": 0.599057637112315, + "grad_norm": 0.31668367981910706, + "learning_rate": 6.937800210411034e-06, + "loss": 0.2686, + "step": 32268 + }, + { + "epoch": 0.5990947672497335, + "grad_norm": 0.4152942895889282, + "learning_rate": 6.936689777887156e-06, + "loss": 0.3642, + "step": 32270 + }, + { + "epoch": 0.5991318973871522, + "grad_norm": 0.49711453914642334, + "learning_rate": 6.935579387045754e-06, + "loss": 0.2998, + "step": 32272 + }, + { + "epoch": 0.5991690275245709, + "grad_norm": 0.43986788392066956, + "learning_rate": 6.934469037901937e-06, + "loss": 0.3765, + "step": 32274 + }, + { + "epoch": 0.5992061576619895, + "grad_norm": 0.47349226474761963, + "learning_rate": 6.9333587304708135e-06, + "loss": 0.4092, + "step": 32276 + }, + { + "epoch": 0.5992432877994082, + "grad_norm": 0.37648308277130127, + "learning_rate": 6.9322484647674906e-06, + "loss": 0.1533, + "step": 32278 + }, + { + "epoch": 0.5992804179368267, + "grad_norm": 0.554593563079834, + "learning_rate": 6.931138240807078e-06, + "loss": 0.1686, + "step": 32280 + }, + { + "epoch": 0.5993175480742454, + "grad_norm": 0.5181983709335327, + "learning_rate": 6.930028058604683e-06, + "loss": 0.2623, + "step": 32282 + }, + { + "epoch": 0.5993546782116641, + "grad_norm": 0.28140196204185486, + "learning_rate": 6.928917918175409e-06, + "loss": 0.0696, + "step": 32284 + }, + { + "epoch": 0.5993918083490827, + "grad_norm": 0.5067324638366699, + "learning_rate": 6.927807819534368e-06, + "loss": 0.1395, + "step": 32286 + }, + { + "epoch": 0.5994289384865014, + "grad_norm": 0.5374001860618591, + "learning_rate": 6.926697762696654e-06, + "loss": 0.5165, + "step": 32288 + }, + { + "epoch": 0.5994660686239199, + "grad_norm": 0.40321382880210876, + "learning_rate": 6.925587747677381e-06, + "loss": 0.3077, + "step": 32290 + }, + { + "epoch": 0.5995031987613386, + "grad_norm": 0.5801188349723816, + "learning_rate": 6.924477774491649e-06, + "loss": 0.2256, + "step": 32292 + }, + { + "epoch": 0.5995403288987573, + "grad_norm": 0.41153010725975037, + "learning_rate": 6.923367843154562e-06, + "loss": 0.2536, + "step": 32294 + }, + { + "epoch": 0.5995774590361759, + "grad_norm": 0.40889444947242737, + "learning_rate": 6.922257953681222e-06, + "loss": 0.2995, + "step": 32296 + }, + { + "epoch": 0.5996145891735946, + "grad_norm": 0.2777296006679535, + "learning_rate": 6.9211481060867365e-06, + "loss": 0.2279, + "step": 32298 + }, + { + "epoch": 0.5996517193110131, + "grad_norm": 0.39582282304763794, + "learning_rate": 6.920038300386201e-06, + "loss": 0.1537, + "step": 32300 + }, + { + "epoch": 0.5996888494484318, + "grad_norm": 0.24680942296981812, + "learning_rate": 6.9189285365947155e-06, + "loss": 0.3372, + "step": 32302 + }, + { + "epoch": 0.5997259795858505, + "grad_norm": 0.2839396893978119, + "learning_rate": 6.917818814727389e-06, + "loss": 0.3197, + "step": 32304 + }, + { + "epoch": 0.5997631097232691, + "grad_norm": 0.33715304732322693, + "learning_rate": 6.916709134799311e-06, + "loss": 0.3246, + "step": 32306 + }, + { + "epoch": 0.5998002398606878, + "grad_norm": 0.3624929189682007, + "learning_rate": 6.915599496825588e-06, + "loss": 0.3152, + "step": 32308 + }, + { + "epoch": 0.5998373699981063, + "grad_norm": 0.4930504560470581, + "learning_rate": 6.91448990082132e-06, + "loss": 0.1805, + "step": 32310 + }, + { + "epoch": 0.599874500135525, + "grad_norm": 0.23059092462062836, + "learning_rate": 6.913380346801599e-06, + "loss": 0.3819, + "step": 32312 + }, + { + "epoch": 0.5999116302729436, + "grad_norm": 0.42047372460365295, + "learning_rate": 6.912270834781528e-06, + "loss": 0.3256, + "step": 32314 + }, + { + "epoch": 0.5999487604103623, + "grad_norm": 0.37749379873275757, + "learning_rate": 6.9111613647762e-06, + "loss": 0.2435, + "step": 32316 + }, + { + "epoch": 0.599985890547781, + "grad_norm": 0.7667651176452637, + "learning_rate": 6.910051936800715e-06, + "loss": 0.3469, + "step": 32318 + }, + { + "epoch": 0.6000230206851995, + "grad_norm": 0.5379275679588318, + "learning_rate": 6.908942550870166e-06, + "loss": 0.4098, + "step": 32320 + }, + { + "epoch": 0.6000601508226182, + "grad_norm": 0.46825674176216125, + "learning_rate": 6.907833206999652e-06, + "loss": 0.2924, + "step": 32322 + }, + { + "epoch": 0.6000972809600368, + "grad_norm": 0.40784215927124023, + "learning_rate": 6.906723905204266e-06, + "loss": 0.4538, + "step": 32324 + }, + { + "epoch": 0.6001344110974555, + "grad_norm": 0.39177078008651733, + "learning_rate": 6.905614645499103e-06, + "loss": 0.1647, + "step": 32326 + }, + { + "epoch": 0.6001715412348742, + "grad_norm": 0.2066248059272766, + "learning_rate": 6.9045054278992525e-06, + "loss": 0.2552, + "step": 32328 + }, + { + "epoch": 0.6002086713722927, + "grad_norm": 0.5254664421081543, + "learning_rate": 6.903396252419813e-06, + "loss": 0.2669, + "step": 32330 + }, + { + "epoch": 0.6002458015097114, + "grad_norm": 0.31111767888069153, + "learning_rate": 6.902287119075874e-06, + "loss": 0.1313, + "step": 32332 + }, + { + "epoch": 0.60028293164713, + "grad_norm": 0.3785063922405243, + "learning_rate": 6.901178027882531e-06, + "loss": 0.4086, + "step": 32334 + }, + { + "epoch": 0.6003200617845487, + "grad_norm": 0.3431715667247772, + "learning_rate": 6.900068978854872e-06, + "loss": 0.2089, + "step": 32336 + }, + { + "epoch": 0.6003571919219673, + "grad_norm": 0.49535998702049255, + "learning_rate": 6.8989599720079905e-06, + "loss": 0.303, + "step": 32338 + }, + { + "epoch": 0.6003943220593859, + "grad_norm": 0.33867746591567993, + "learning_rate": 6.897851007356973e-06, + "loss": 0.1183, + "step": 32340 + }, + { + "epoch": 0.6004314521968046, + "grad_norm": 0.8034718632698059, + "learning_rate": 6.89674208491691e-06, + "loss": 0.3682, + "step": 32342 + }, + { + "epoch": 0.6004685823342232, + "grad_norm": 0.3230651021003723, + "learning_rate": 6.895633204702894e-06, + "loss": 0.1903, + "step": 32344 + }, + { + "epoch": 0.6005057124716419, + "grad_norm": 0.3249034285545349, + "learning_rate": 6.894524366730009e-06, + "loss": 0.2176, + "step": 32346 + }, + { + "epoch": 0.6005428426090605, + "grad_norm": 0.48975130915641785, + "learning_rate": 6.893415571013351e-06, + "loss": 0.4711, + "step": 32348 + }, + { + "epoch": 0.6005799727464791, + "grad_norm": 0.6179929971694946, + "learning_rate": 6.892306817567996e-06, + "loss": 0.2394, + "step": 32350 + }, + { + "epoch": 0.6006171028838978, + "grad_norm": 0.2959885895252228, + "learning_rate": 6.891198106409038e-06, + "loss": 0.2593, + "step": 32352 + }, + { + "epoch": 0.6006542330213164, + "grad_norm": 0.4248366355895996, + "learning_rate": 6.890089437551562e-06, + "loss": 0.2814, + "step": 32354 + }, + { + "epoch": 0.6006913631587351, + "grad_norm": 0.33484944701194763, + "learning_rate": 6.888980811010655e-06, + "loss": 0.2525, + "step": 32356 + }, + { + "epoch": 0.6007284932961537, + "grad_norm": 0.41420888900756836, + "learning_rate": 6.887872226801398e-06, + "loss": 0.3303, + "step": 32358 + }, + { + "epoch": 0.6007656234335723, + "grad_norm": 0.36624428629875183, + "learning_rate": 6.886763684938877e-06, + "loss": 0.2047, + "step": 32360 + }, + { + "epoch": 0.600802753570991, + "grad_norm": 0.46592411398887634, + "learning_rate": 6.885655185438184e-06, + "loss": 0.2133, + "step": 32362 + }, + { + "epoch": 0.6008398837084096, + "grad_norm": 0.37529098987579346, + "learning_rate": 6.88454672831439e-06, + "loss": 0.2506, + "step": 32364 + }, + { + "epoch": 0.6008770138458283, + "grad_norm": 0.5220930576324463, + "learning_rate": 6.883438313582582e-06, + "loss": 0.3615, + "step": 32366 + }, + { + "epoch": 0.6009141439832468, + "grad_norm": 0.5191646814346313, + "learning_rate": 6.882329941257847e-06, + "loss": 0.2636, + "step": 32368 + }, + { + "epoch": 0.6009512741206655, + "grad_norm": 0.4602196514606476, + "learning_rate": 6.88122161135526e-06, + "loss": 0.2221, + "step": 32370 + }, + { + "epoch": 0.6009884042580842, + "grad_norm": 0.4736001789569855, + "learning_rate": 6.880113323889905e-06, + "loss": 0.3182, + "step": 32372 + }, + { + "epoch": 0.6010255343955028, + "grad_norm": 0.563730001449585, + "learning_rate": 6.879005078876868e-06, + "loss": 0.4232, + "step": 32374 + }, + { + "epoch": 0.6010626645329215, + "grad_norm": 0.4768717586994171, + "learning_rate": 6.877896876331218e-06, + "loss": 0.157, + "step": 32376 + }, + { + "epoch": 0.60109979467034, + "grad_norm": 0.26671531796455383, + "learning_rate": 6.876788716268044e-06, + "loss": 0.2049, + "step": 32378 + }, + { + "epoch": 0.6011369248077587, + "grad_norm": 0.40325430035591125, + "learning_rate": 6.875680598702416e-06, + "loss": 0.2696, + "step": 32380 + }, + { + "epoch": 0.6011740549451774, + "grad_norm": 0.3511100709438324, + "learning_rate": 6.8745725236494165e-06, + "loss": 0.2577, + "step": 32382 + }, + { + "epoch": 0.601211185082596, + "grad_norm": 0.319902241230011, + "learning_rate": 6.873464491124125e-06, + "loss": 0.1937, + "step": 32384 + }, + { + "epoch": 0.6012483152200147, + "grad_norm": 0.4421692490577698, + "learning_rate": 6.872356501141619e-06, + "loss": 0.1382, + "step": 32386 + }, + { + "epoch": 0.6012854453574332, + "grad_norm": 0.3024381101131439, + "learning_rate": 6.871248553716969e-06, + "loss": 0.1976, + "step": 32388 + }, + { + "epoch": 0.6013225754948519, + "grad_norm": 0.3544471263885498, + "learning_rate": 6.8701406488652574e-06, + "loss": 0.3233, + "step": 32390 + }, + { + "epoch": 0.6013597056322706, + "grad_norm": 0.303026020526886, + "learning_rate": 6.869032786601553e-06, + "loss": 0.4024, + "step": 32392 + }, + { + "epoch": 0.6013968357696892, + "grad_norm": 0.5401151180267334, + "learning_rate": 6.867924966940935e-06, + "loss": 0.3826, + "step": 32394 + }, + { + "epoch": 0.6014339659071078, + "grad_norm": 0.3572685420513153, + "learning_rate": 6.866817189898478e-06, + "loss": 0.3484, + "step": 32396 + }, + { + "epoch": 0.6014710960445264, + "grad_norm": 0.5170982480049133, + "learning_rate": 6.865709455489256e-06, + "loss": 0.3199, + "step": 32398 + }, + { + "epoch": 0.6015082261819451, + "grad_norm": 0.3436547815799713, + "learning_rate": 6.864601763728339e-06, + "loss": 0.3344, + "step": 32400 + }, + { + "epoch": 0.6015453563193638, + "grad_norm": 0.45781105756759644, + "learning_rate": 6.863494114630797e-06, + "loss": 0.4065, + "step": 32402 + }, + { + "epoch": 0.6015824864567824, + "grad_norm": 0.18804508447647095, + "learning_rate": 6.862386508211707e-06, + "loss": 0.1049, + "step": 32404 + }, + { + "epoch": 0.601619616594201, + "grad_norm": 0.4620898365974426, + "learning_rate": 6.861278944486138e-06, + "loss": 0.1148, + "step": 32406 + }, + { + "epoch": 0.6016567467316196, + "grad_norm": 0.3620264232158661, + "learning_rate": 6.8601714234691616e-06, + "loss": 0.249, + "step": 32408 + }, + { + "epoch": 0.6016938768690383, + "grad_norm": 0.29990071058273315, + "learning_rate": 6.8590639451758475e-06, + "loss": 0.3186, + "step": 32410 + }, + { + "epoch": 0.601731007006457, + "grad_norm": 0.3128824532032013, + "learning_rate": 6.857956509621267e-06, + "loss": 0.2539, + "step": 32412 + }, + { + "epoch": 0.6017681371438756, + "grad_norm": 0.34034353494644165, + "learning_rate": 6.856849116820484e-06, + "loss": 0.4939, + "step": 32414 + }, + { + "epoch": 0.6018052672812942, + "grad_norm": 0.4037137031555176, + "learning_rate": 6.855741766788569e-06, + "loss": 0.1733, + "step": 32416 + }, + { + "epoch": 0.6018423974187128, + "grad_norm": 0.33270686864852905, + "learning_rate": 6.854634459540591e-06, + "loss": 0.319, + "step": 32418 + }, + { + "epoch": 0.6018795275561315, + "grad_norm": 0.5087394714355469, + "learning_rate": 6.853527195091618e-06, + "loss": 0.2471, + "step": 32420 + }, + { + "epoch": 0.6019166576935501, + "grad_norm": 0.4133280813694, + "learning_rate": 6.852419973456714e-06, + "loss": 0.1008, + "step": 32422 + }, + { + "epoch": 0.6019537878309688, + "grad_norm": 0.4253655672073364, + "learning_rate": 6.8513127946509495e-06, + "loss": 0.1912, + "step": 32424 + }, + { + "epoch": 0.6019909179683874, + "grad_norm": 0.3918859362602234, + "learning_rate": 6.8502056586893815e-06, + "loss": 0.4206, + "step": 32426 + }, + { + "epoch": 0.602028048105806, + "grad_norm": 0.30853214859962463, + "learning_rate": 6.849098565587081e-06, + "loss": 0.1552, + "step": 32428 + }, + { + "epoch": 0.6020651782432247, + "grad_norm": 0.3706313371658325, + "learning_rate": 6.8479915153591125e-06, + "loss": 0.212, + "step": 32430 + }, + { + "epoch": 0.6021023083806433, + "grad_norm": 0.381692111492157, + "learning_rate": 6.846884508020537e-06, + "loss": 0.3, + "step": 32432 + }, + { + "epoch": 0.602139438518062, + "grad_norm": 0.3469703197479248, + "learning_rate": 6.845777543586417e-06, + "loss": 0.2524, + "step": 32434 + }, + { + "epoch": 0.6021765686554806, + "grad_norm": 0.3518145680427551, + "learning_rate": 6.844670622071823e-06, + "loss": 0.3685, + "step": 32436 + }, + { + "epoch": 0.6022136987928992, + "grad_norm": 0.4700168967247009, + "learning_rate": 6.843563743491804e-06, + "loss": 0.3805, + "step": 32438 + }, + { + "epoch": 0.6022508289303179, + "grad_norm": 0.45189833641052246, + "learning_rate": 6.842456907861429e-06, + "loss": 0.299, + "step": 32440 + }, + { + "epoch": 0.6022879590677365, + "grad_norm": 0.3574831485748291, + "learning_rate": 6.841350115195759e-06, + "loss": 0.2671, + "step": 32442 + }, + { + "epoch": 0.6023250892051552, + "grad_norm": 0.34665700793266296, + "learning_rate": 6.840243365509851e-06, + "loss": 0.295, + "step": 32444 + }, + { + "epoch": 0.6023622193425738, + "grad_norm": 0.27932488918304443, + "learning_rate": 6.839136658818767e-06, + "loss": 0.329, + "step": 32446 + }, + { + "epoch": 0.6023993494799924, + "grad_norm": 0.5271481871604919, + "learning_rate": 6.838029995137565e-06, + "loss": 0.2733, + "step": 32448 + }, + { + "epoch": 0.6024364796174111, + "grad_norm": 0.31140831112861633, + "learning_rate": 6.836923374481307e-06, + "loss": 0.299, + "step": 32450 + }, + { + "epoch": 0.6024736097548297, + "grad_norm": 0.296762615442276, + "learning_rate": 6.8358167968650445e-06, + "loss": 0.1889, + "step": 32452 + }, + { + "epoch": 0.6025107398922483, + "grad_norm": 0.5563852190971375, + "learning_rate": 6.834710262303837e-06, + "loss": 0.3739, + "step": 32454 + }, + { + "epoch": 0.602547870029667, + "grad_norm": 0.41838330030441284, + "learning_rate": 6.833603770812741e-06, + "loss": 0.509, + "step": 32456 + }, + { + "epoch": 0.6025850001670856, + "grad_norm": 0.17578670382499695, + "learning_rate": 6.8324973224068135e-06, + "loss": 0.2262, + "step": 32458 + }, + { + "epoch": 0.6026221303045043, + "grad_norm": 0.35536518692970276, + "learning_rate": 6.8313909171011115e-06, + "loss": 0.3172, + "step": 32460 + }, + { + "epoch": 0.6026592604419229, + "grad_norm": 0.38685643672943115, + "learning_rate": 6.8302845549106885e-06, + "loss": 0.3369, + "step": 32462 + }, + { + "epoch": 0.6026963905793415, + "grad_norm": 0.4160190224647522, + "learning_rate": 6.829178235850598e-06, + "loss": 0.2863, + "step": 32464 + }, + { + "epoch": 0.6027335207167601, + "grad_norm": 0.3141392767429352, + "learning_rate": 6.828071959935891e-06, + "loss": 0.3522, + "step": 32466 + }, + { + "epoch": 0.6027706508541788, + "grad_norm": 0.33029550313949585, + "learning_rate": 6.826965727181626e-06, + "loss": 0.292, + "step": 32468 + }, + { + "epoch": 0.6028077809915975, + "grad_norm": 0.2849985361099243, + "learning_rate": 6.825859537602851e-06, + "loss": 0.2381, + "step": 32470 + }, + { + "epoch": 0.6028449111290161, + "grad_norm": 0.29119938611984253, + "learning_rate": 6.824753391214622e-06, + "loss": 0.2892, + "step": 32472 + }, + { + "epoch": 0.6028820412664347, + "grad_norm": 0.34570741653442383, + "learning_rate": 6.8236472880319905e-06, + "loss": 0.2646, + "step": 32474 + }, + { + "epoch": 0.6029191714038533, + "grad_norm": 0.516773521900177, + "learning_rate": 6.822541228070003e-06, + "loss": 0.394, + "step": 32476 + }, + { + "epoch": 0.602956301541272, + "grad_norm": 0.3689149022102356, + "learning_rate": 6.821435211343711e-06, + "loss": 0.2546, + "step": 32478 + }, + { + "epoch": 0.6029934316786907, + "grad_norm": 0.35306376218795776, + "learning_rate": 6.820329237868164e-06, + "loss": 0.3622, + "step": 32480 + }, + { + "epoch": 0.6030305618161093, + "grad_norm": 0.4598345458507538, + "learning_rate": 6.819223307658415e-06, + "loss": 0.3642, + "step": 32482 + }, + { + "epoch": 0.6030676919535279, + "grad_norm": 0.34100160002708435, + "learning_rate": 6.818117420729506e-06, + "loss": 0.33, + "step": 32484 + }, + { + "epoch": 0.6031048220909465, + "grad_norm": 0.38707593083381653, + "learning_rate": 6.817011577096488e-06, + "loss": 0.3462, + "step": 32486 + }, + { + "epoch": 0.6031419522283652, + "grad_norm": 0.405487596988678, + "learning_rate": 6.815905776774414e-06, + "loss": 0.3477, + "step": 32488 + }, + { + "epoch": 0.6031790823657839, + "grad_norm": 0.4612721800804138, + "learning_rate": 6.8148000197783205e-06, + "loss": 0.236, + "step": 32490 + }, + { + "epoch": 0.6032162125032025, + "grad_norm": 0.3976864218711853, + "learning_rate": 6.813694306123256e-06, + "loss": 0.3506, + "step": 32492 + }, + { + "epoch": 0.6032533426406211, + "grad_norm": 0.27394425868988037, + "learning_rate": 6.812588635824271e-06, + "loss": 0.2758, + "step": 32494 + }, + { + "epoch": 0.6032904727780397, + "grad_norm": 0.24788857996463776, + "learning_rate": 6.811483008896406e-06, + "loss": 0.234, + "step": 32496 + }, + { + "epoch": 0.6033276029154584, + "grad_norm": 0.33730241656303406, + "learning_rate": 6.810377425354706e-06, + "loss": 0.2366, + "step": 32498 + }, + { + "epoch": 0.6033647330528771, + "grad_norm": 0.42708128690719604, + "learning_rate": 6.8092718852142194e-06, + "loss": 0.3964, + "step": 32500 + }, + { + "epoch": 0.6034018631902957, + "grad_norm": 0.34661149978637695, + "learning_rate": 6.8081663884899805e-06, + "loss": 0.1334, + "step": 32502 + }, + { + "epoch": 0.6034389933277143, + "grad_norm": 0.3175419270992279, + "learning_rate": 6.807060935197037e-06, + "loss": 0.2615, + "step": 32504 + }, + { + "epoch": 0.6034761234651329, + "grad_norm": 0.5078348517417908, + "learning_rate": 6.805955525350432e-06, + "loss": 0.2524, + "step": 32506 + }, + { + "epoch": 0.6035132536025516, + "grad_norm": 0.5579872727394104, + "learning_rate": 6.804850158965203e-06, + "loss": 0.5139, + "step": 32508 + }, + { + "epoch": 0.6035503837399703, + "grad_norm": 0.5155790448188782, + "learning_rate": 6.803744836056391e-06, + "loss": 0.3268, + "step": 32510 + }, + { + "epoch": 0.6035875138773888, + "grad_norm": 0.39170435070991516, + "learning_rate": 6.8026395566390455e-06, + "loss": 0.3054, + "step": 32512 + }, + { + "epoch": 0.6036246440148075, + "grad_norm": 0.3989746570587158, + "learning_rate": 6.801534320728192e-06, + "loss": 0.1169, + "step": 32514 + }, + { + "epoch": 0.6036617741522261, + "grad_norm": 0.5660011172294617, + "learning_rate": 6.800429128338879e-06, + "loss": 0.4021, + "step": 32516 + }, + { + "epoch": 0.6036989042896448, + "grad_norm": 0.35633864998817444, + "learning_rate": 6.799323979486139e-06, + "loss": 0.1759, + "step": 32518 + }, + { + "epoch": 0.6037360344270634, + "grad_norm": 0.31688812375068665, + "learning_rate": 6.7982188741850115e-06, + "loss": 0.2235, + "step": 32520 + }, + { + "epoch": 0.603773164564482, + "grad_norm": 0.2559809386730194, + "learning_rate": 6.797113812450538e-06, + "loss": 0.2623, + "step": 32522 + }, + { + "epoch": 0.6038102947019007, + "grad_norm": 0.30031338334083557, + "learning_rate": 6.7960087942977526e-06, + "loss": 0.3032, + "step": 32524 + }, + { + "epoch": 0.6038474248393193, + "grad_norm": 0.4881744086742401, + "learning_rate": 6.794903819741687e-06, + "loss": 0.1683, + "step": 32526 + }, + { + "epoch": 0.603884554976738, + "grad_norm": 0.2855229675769806, + "learning_rate": 6.793798888797383e-06, + "loss": 0.294, + "step": 32528 + }, + { + "epoch": 0.6039216851141566, + "grad_norm": 0.318218469619751, + "learning_rate": 6.7926940014798695e-06, + "loss": 0.3111, + "step": 32530 + }, + { + "epoch": 0.6039588152515752, + "grad_norm": 0.3056376874446869, + "learning_rate": 6.791589157804184e-06, + "loss": 0.2483, + "step": 32532 + }, + { + "epoch": 0.6039959453889939, + "grad_norm": 0.46332964301109314, + "learning_rate": 6.790484357785361e-06, + "loss": 0.4864, + "step": 32534 + }, + { + "epoch": 0.6040330755264125, + "grad_norm": 0.37451857328414917, + "learning_rate": 6.7893796014384325e-06, + "loss": 0.1973, + "step": 32536 + }, + { + "epoch": 0.6040702056638312, + "grad_norm": 0.3718571364879608, + "learning_rate": 6.788274888778434e-06, + "loss": 0.3319, + "step": 32538 + }, + { + "epoch": 0.6041073358012498, + "grad_norm": 0.3948982059955597, + "learning_rate": 6.787170219820389e-06, + "loss": 0.304, + "step": 32540 + }, + { + "epoch": 0.6041444659386684, + "grad_norm": 0.3447987735271454, + "learning_rate": 6.786065594579334e-06, + "loss": 0.2331, + "step": 32542 + }, + { + "epoch": 0.6041815960760871, + "grad_norm": 0.2462286502122879, + "learning_rate": 6.784961013070299e-06, + "loss": 0.1373, + "step": 32544 + }, + { + "epoch": 0.6042187262135057, + "grad_norm": 0.3173312842845917, + "learning_rate": 6.783856475308317e-06, + "loss": 0.2307, + "step": 32546 + }, + { + "epoch": 0.6042558563509244, + "grad_norm": 0.38091227412223816, + "learning_rate": 6.782751981308413e-06, + "loss": 0.3747, + "step": 32548 + }, + { + "epoch": 0.604292986488343, + "grad_norm": 0.3523854911327362, + "learning_rate": 6.7816475310856225e-06, + "loss": 0.2326, + "step": 32550 + }, + { + "epoch": 0.6043301166257616, + "grad_norm": 0.4816212058067322, + "learning_rate": 6.780543124654964e-06, + "loss": 0.2908, + "step": 32552 + }, + { + "epoch": 0.6043672467631803, + "grad_norm": 0.39369356632232666, + "learning_rate": 6.7794387620314694e-06, + "loss": 0.4322, + "step": 32554 + }, + { + "epoch": 0.6044043769005989, + "grad_norm": 0.34154409170150757, + "learning_rate": 6.778334443230168e-06, + "loss": 0.3477, + "step": 32556 + }, + { + "epoch": 0.6044415070380176, + "grad_norm": 0.3572297692298889, + "learning_rate": 6.777230168266087e-06, + "loss": 0.1544, + "step": 32558 + }, + { + "epoch": 0.6044786371754362, + "grad_norm": 0.3864055871963501, + "learning_rate": 6.776125937154248e-06, + "loss": 0.2449, + "step": 32560 + }, + { + "epoch": 0.6045157673128548, + "grad_norm": 1.211503267288208, + "learning_rate": 6.775021749909681e-06, + "loss": 0.2319, + "step": 32562 + }, + { + "epoch": 0.6045528974502735, + "grad_norm": 0.3568444848060608, + "learning_rate": 6.7739176065474045e-06, + "loss": 0.3187, + "step": 32564 + }, + { + "epoch": 0.6045900275876921, + "grad_norm": 0.3553500771522522, + "learning_rate": 6.772813507082447e-06, + "loss": 0.454, + "step": 32566 + }, + { + "epoch": 0.6046271577251108, + "grad_norm": 0.3139319121837616, + "learning_rate": 6.771709451529833e-06, + "loss": 0.4255, + "step": 32568 + }, + { + "epoch": 0.6046642878625293, + "grad_norm": 0.5245634317398071, + "learning_rate": 6.77060543990458e-06, + "loss": 0.2041, + "step": 32570 + }, + { + "epoch": 0.604701417999948, + "grad_norm": 0.39949071407318115, + "learning_rate": 6.7695014722217155e-06, + "loss": 0.2201, + "step": 32572 + }, + { + "epoch": 0.6047385481373666, + "grad_norm": 0.4564610719680786, + "learning_rate": 6.768397548496259e-06, + "loss": 0.2674, + "step": 32574 + }, + { + "epoch": 0.6047756782747853, + "grad_norm": 0.31416645646095276, + "learning_rate": 6.767293668743236e-06, + "loss": 0.1347, + "step": 32576 + }, + { + "epoch": 0.604812808412204, + "grad_norm": 0.2084096372127533, + "learning_rate": 6.766189832977659e-06, + "loss": 0.1212, + "step": 32578 + }, + { + "epoch": 0.6048499385496225, + "grad_norm": 0.3292437195777893, + "learning_rate": 6.765086041214555e-06, + "loss": 0.214, + "step": 32580 + }, + { + "epoch": 0.6048870686870412, + "grad_norm": 0.4148804247379303, + "learning_rate": 6.763982293468937e-06, + "loss": 0.2818, + "step": 32582 + }, + { + "epoch": 0.6049241988244598, + "grad_norm": 0.37251049280166626, + "learning_rate": 6.762878589755828e-06, + "loss": 0.2731, + "step": 32584 + }, + { + "epoch": 0.6049613289618785, + "grad_norm": 0.3387444317340851, + "learning_rate": 6.7617749300902434e-06, + "loss": 0.261, + "step": 32586 + }, + { + "epoch": 0.6049984590992972, + "grad_norm": 0.30089038610458374, + "learning_rate": 6.7606713144872085e-06, + "loss": 0.2494, + "step": 32588 + }, + { + "epoch": 0.6050355892367157, + "grad_norm": 0.3548854887485504, + "learning_rate": 6.75956774296173e-06, + "loss": 0.3065, + "step": 32590 + }, + { + "epoch": 0.6050727193741344, + "grad_norm": 0.4021078646183014, + "learning_rate": 6.758464215528828e-06, + "loss": 0.2859, + "step": 32592 + }, + { + "epoch": 0.605109849511553, + "grad_norm": 0.3577335774898529, + "learning_rate": 6.757360732203518e-06, + "loss": 0.2636, + "step": 32594 + }, + { + "epoch": 0.6051469796489717, + "grad_norm": 0.4125879406929016, + "learning_rate": 6.7562572930008165e-06, + "loss": 0.4176, + "step": 32596 + }, + { + "epoch": 0.6051841097863904, + "grad_norm": 0.5143601298332214, + "learning_rate": 6.755153897935738e-06, + "loss": 0.3234, + "step": 32598 + }, + { + "epoch": 0.6052212399238089, + "grad_norm": 0.26305949687957764, + "learning_rate": 6.754050547023294e-06, + "loss": 0.254, + "step": 32600 + }, + { + "epoch": 0.6052583700612276, + "grad_norm": 0.36468344926834106, + "learning_rate": 6.752947240278502e-06, + "loss": 0.3757, + "step": 32602 + }, + { + "epoch": 0.6052955001986462, + "grad_norm": 0.3187844753265381, + "learning_rate": 6.751843977716368e-06, + "loss": 0.1258, + "step": 32604 + }, + { + "epoch": 0.6053326303360649, + "grad_norm": 0.37027421593666077, + "learning_rate": 6.750740759351911e-06, + "loss": 0.4116, + "step": 32606 + }, + { + "epoch": 0.6053697604734836, + "grad_norm": 0.9331082701683044, + "learning_rate": 6.749637585200137e-06, + "loss": 0.2255, + "step": 32608 + }, + { + "epoch": 0.6054068906109021, + "grad_norm": 0.4842776358127594, + "learning_rate": 6.748534455276062e-06, + "loss": 0.3665, + "step": 32610 + }, + { + "epoch": 0.6054440207483208, + "grad_norm": 0.3069177567958832, + "learning_rate": 6.747431369594691e-06, + "loss": 0.2839, + "step": 32612 + }, + { + "epoch": 0.6054811508857394, + "grad_norm": 0.568530797958374, + "learning_rate": 6.746328328171043e-06, + "loss": 0.3872, + "step": 32614 + }, + { + "epoch": 0.6055182810231581, + "grad_norm": 0.4535745680332184, + "learning_rate": 6.745225331020114e-06, + "loss": 0.3662, + "step": 32616 + }, + { + "epoch": 0.6055554111605767, + "grad_norm": 0.46867963671684265, + "learning_rate": 6.744122378156921e-06, + "loss": 0.3565, + "step": 32618 + }, + { + "epoch": 0.6055925412979953, + "grad_norm": 0.3103230595588684, + "learning_rate": 6.743019469596468e-06, + "loss": 0.5443, + "step": 32620 + }, + { + "epoch": 0.605629671435414, + "grad_norm": 0.43798306584358215, + "learning_rate": 6.741916605353767e-06, + "loss": 0.4664, + "step": 32622 + }, + { + "epoch": 0.6056668015728326, + "grad_norm": 0.40012794733047485, + "learning_rate": 6.74081378544382e-06, + "loss": 0.1945, + "step": 32624 + }, + { + "epoch": 0.6057039317102513, + "grad_norm": 0.35662710666656494, + "learning_rate": 6.73971100988164e-06, + "loss": 0.3303, + "step": 32626 + }, + { + "epoch": 0.6057410618476698, + "grad_norm": 0.3911553919315338, + "learning_rate": 6.738608278682222e-06, + "loss": 0.1273, + "step": 32628 + }, + { + "epoch": 0.6057781919850885, + "grad_norm": 0.29097649455070496, + "learning_rate": 6.737505591860578e-06, + "loss": 0.2464, + "step": 32630 + }, + { + "epoch": 0.6058153221225072, + "grad_norm": 0.4287024140357971, + "learning_rate": 6.736402949431711e-06, + "loss": 0.3172, + "step": 32632 + }, + { + "epoch": 0.6058524522599258, + "grad_norm": 0.2854858636856079, + "learning_rate": 6.735300351410623e-06, + "loss": 0.2673, + "step": 32634 + }, + { + "epoch": 0.6058895823973445, + "grad_norm": 0.39085450768470764, + "learning_rate": 6.734197797812318e-06, + "loss": 0.1654, + "step": 32636 + }, + { + "epoch": 0.605926712534763, + "grad_norm": 0.3609722852706909, + "learning_rate": 6.733095288651803e-06, + "loss": 0.3038, + "step": 32638 + }, + { + "epoch": 0.6059638426721817, + "grad_norm": 0.3893957734107971, + "learning_rate": 6.731992823944072e-06, + "loss": 0.2682, + "step": 32640 + }, + { + "epoch": 0.6060009728096004, + "grad_norm": 0.5668826699256897, + "learning_rate": 6.7308904037041286e-06, + "loss": 0.276, + "step": 32642 + }, + { + "epoch": 0.606038102947019, + "grad_norm": 0.6270896792411804, + "learning_rate": 6.729788027946977e-06, + "loss": 0.1242, + "step": 32644 + }, + { + "epoch": 0.6060752330844377, + "grad_norm": 0.33200517296791077, + "learning_rate": 6.728685696687613e-06, + "loss": 0.2337, + "step": 32646 + }, + { + "epoch": 0.6061123632218562, + "grad_norm": 0.29668793082237244, + "learning_rate": 6.72758340994104e-06, + "loss": 0.1695, + "step": 32648 + }, + { + "epoch": 0.6061494933592749, + "grad_norm": 0.2584170997142792, + "learning_rate": 6.726481167722252e-06, + "loss": 0.1102, + "step": 32650 + }, + { + "epoch": 0.6061866234966936, + "grad_norm": 0.46197426319122314, + "learning_rate": 6.725378970046255e-06, + "loss": 0.4541, + "step": 32652 + }, + { + "epoch": 0.6062237536341122, + "grad_norm": 0.4598432183265686, + "learning_rate": 6.7242768169280405e-06, + "loss": 0.2369, + "step": 32654 + }, + { + "epoch": 0.6062608837715309, + "grad_norm": 0.796837329864502, + "learning_rate": 6.7231747083826035e-06, + "loss": 0.1361, + "step": 32656 + }, + { + "epoch": 0.6062980139089494, + "grad_norm": 0.31335270404815674, + "learning_rate": 6.722072644424944e-06, + "loss": 0.155, + "step": 32658 + }, + { + "epoch": 0.6063351440463681, + "grad_norm": 0.28925031423568726, + "learning_rate": 6.7209706250700566e-06, + "loss": 0.1149, + "step": 32660 + }, + { + "epoch": 0.6063722741837868, + "grad_norm": 0.2990482747554779, + "learning_rate": 6.719868650332939e-06, + "loss": 0.1779, + "step": 32662 + }, + { + "epoch": 0.6064094043212054, + "grad_norm": 0.2956528663635254, + "learning_rate": 6.718766720228586e-06, + "loss": 0.2705, + "step": 32664 + }, + { + "epoch": 0.6064465344586241, + "grad_norm": 0.41349929571151733, + "learning_rate": 6.717664834771988e-06, + "loss": 0.0894, + "step": 32666 + }, + { + "epoch": 0.6064836645960426, + "grad_norm": 0.2936456799507141, + "learning_rate": 6.716562993978138e-06, + "loss": 0.1351, + "step": 32668 + }, + { + "epoch": 0.6065207947334613, + "grad_norm": 0.3496336340904236, + "learning_rate": 6.71546119786203e-06, + "loss": 0.2723, + "step": 32670 + }, + { + "epoch": 0.6065579248708799, + "grad_norm": 0.4455249011516571, + "learning_rate": 6.7143594464386564e-06, + "loss": 0.4663, + "step": 32672 + }, + { + "epoch": 0.6065950550082986, + "grad_norm": 0.44359856843948364, + "learning_rate": 6.713257739723013e-06, + "loss": 0.2588, + "step": 32674 + }, + { + "epoch": 0.6066321851457173, + "grad_norm": 0.26801928877830505, + "learning_rate": 6.712156077730088e-06, + "loss": 0.1935, + "step": 32676 + }, + { + "epoch": 0.6066693152831358, + "grad_norm": 0.6040407419204712, + "learning_rate": 6.7110544604748666e-06, + "loss": 0.3325, + "step": 32678 + }, + { + "epoch": 0.6067064454205545, + "grad_norm": 1.4163864850997925, + "learning_rate": 6.709952887972342e-06, + "loss": 0.2145, + "step": 32680 + }, + { + "epoch": 0.6067435755579731, + "grad_norm": 0.3321932256221771, + "learning_rate": 6.708851360237503e-06, + "loss": 0.2204, + "step": 32682 + }, + { + "epoch": 0.6067807056953918, + "grad_norm": 0.389706552028656, + "learning_rate": 6.707749877285342e-06, + "loss": 0.3559, + "step": 32684 + }, + { + "epoch": 0.6068178358328105, + "grad_norm": 0.23836737871170044, + "learning_rate": 6.7066484391308415e-06, + "loss": 0.2591, + "step": 32686 + }, + { + "epoch": 0.606854965970229, + "grad_norm": 0.6039220690727234, + "learning_rate": 6.705547045788996e-06, + "loss": 0.2261, + "step": 32688 + }, + { + "epoch": 0.6068920961076477, + "grad_norm": 0.31422141194343567, + "learning_rate": 6.7044456972747815e-06, + "loss": 0.4185, + "step": 32690 + }, + { + "epoch": 0.6069292262450663, + "grad_norm": 0.45084986090660095, + "learning_rate": 6.70334439360319e-06, + "loss": 0.2918, + "step": 32692 + }, + { + "epoch": 0.606966356382485, + "grad_norm": 0.7564059495925903, + "learning_rate": 6.702243134789208e-06, + "loss": 0.363, + "step": 32694 + }, + { + "epoch": 0.6070034865199037, + "grad_norm": 0.6141263246536255, + "learning_rate": 6.70114192084782e-06, + "loss": 0.1423, + "step": 32696 + }, + { + "epoch": 0.6070406166573222, + "grad_norm": 0.4967813789844513, + "learning_rate": 6.700040751794008e-06, + "loss": 0.5046, + "step": 32698 + }, + { + "epoch": 0.6070777467947409, + "grad_norm": 0.4076906144618988, + "learning_rate": 6.698939627642755e-06, + "loss": 0.2435, + "step": 32700 + }, + { + "epoch": 0.6071148769321595, + "grad_norm": 0.5306950807571411, + "learning_rate": 6.697838548409053e-06, + "loss": 0.4185, + "step": 32702 + }, + { + "epoch": 0.6071520070695782, + "grad_norm": 0.42083579301834106, + "learning_rate": 6.696737514107871e-06, + "loss": 0.2023, + "step": 32704 + }, + { + "epoch": 0.6071891372069969, + "grad_norm": 0.47346773743629456, + "learning_rate": 6.695636524754199e-06, + "loss": 0.2756, + "step": 32706 + }, + { + "epoch": 0.6072262673444154, + "grad_norm": 0.3367878198623657, + "learning_rate": 6.694535580363014e-06, + "loss": 0.4296, + "step": 32708 + }, + { + "epoch": 0.6072633974818341, + "grad_norm": 0.5973307490348816, + "learning_rate": 6.6934346809493e-06, + "loss": 0.3116, + "step": 32710 + }, + { + "epoch": 0.6073005276192527, + "grad_norm": 0.3987002670764923, + "learning_rate": 6.692333826528034e-06, + "loss": 0.3019, + "step": 32712 + }, + { + "epoch": 0.6073376577566714, + "grad_norm": 0.47669774293899536, + "learning_rate": 6.691233017114202e-06, + "loss": 0.1935, + "step": 32714 + }, + { + "epoch": 0.60737478789409, + "grad_norm": 0.3510865271091461, + "learning_rate": 6.690132252722774e-06, + "loss": 0.4826, + "step": 32716 + }, + { + "epoch": 0.6074119180315086, + "grad_norm": 0.6552179455757141, + "learning_rate": 6.6890315333687335e-06, + "loss": 0.3308, + "step": 32718 + }, + { + "epoch": 0.6074490481689273, + "grad_norm": 0.3568669259548187, + "learning_rate": 6.6879308590670556e-06, + "loss": 0.3582, + "step": 32720 + }, + { + "epoch": 0.6074861783063459, + "grad_norm": 0.3111821413040161, + "learning_rate": 6.686830229832716e-06, + "loss": 0.244, + "step": 32722 + }, + { + "epoch": 0.6075233084437646, + "grad_norm": 0.5278318524360657, + "learning_rate": 6.685729645680694e-06, + "loss": 0.1474, + "step": 32724 + }, + { + "epoch": 0.6075604385811831, + "grad_norm": 0.26630595326423645, + "learning_rate": 6.684629106625967e-06, + "loss": 0.1953, + "step": 32726 + }, + { + "epoch": 0.6075975687186018, + "grad_norm": 0.3744412064552307, + "learning_rate": 6.683528612683504e-06, + "loss": 0.191, + "step": 32728 + }, + { + "epoch": 0.6076346988560205, + "grad_norm": 0.3799089789390564, + "learning_rate": 6.6824281638682845e-06, + "loss": 0.193, + "step": 32730 + }, + { + "epoch": 0.6076718289934391, + "grad_norm": 0.6737147569656372, + "learning_rate": 6.681327760195279e-06, + "loss": 0.3717, + "step": 32732 + }, + { + "epoch": 0.6077089591308578, + "grad_norm": 0.42276766896247864, + "learning_rate": 6.680227401679461e-06, + "loss": 0.4692, + "step": 32734 + }, + { + "epoch": 0.6077460892682763, + "grad_norm": 0.6538462042808533, + "learning_rate": 6.679127088335806e-06, + "loss": 0.2624, + "step": 32736 + }, + { + "epoch": 0.607783219405695, + "grad_norm": 0.32437923550605774, + "learning_rate": 6.678026820179284e-06, + "loss": 0.2722, + "step": 32738 + }, + { + "epoch": 0.6078203495431137, + "grad_norm": 0.22122645378112793, + "learning_rate": 6.676926597224869e-06, + "loss": 0.1708, + "step": 32740 + }, + { + "epoch": 0.6078574796805323, + "grad_norm": 0.4881673753261566, + "learning_rate": 6.675826419487526e-06, + "loss": 0.2128, + "step": 32742 + }, + { + "epoch": 0.607894609817951, + "grad_norm": 0.8026943206787109, + "learning_rate": 6.6747262869822275e-06, + "loss": 0.2166, + "step": 32744 + }, + { + "epoch": 0.6079317399553695, + "grad_norm": 0.21508045494556427, + "learning_rate": 6.673626199723944e-06, + "loss": 0.2798, + "step": 32746 + }, + { + "epoch": 0.6079688700927882, + "grad_norm": 0.28522974252700806, + "learning_rate": 6.672526157727648e-06, + "loss": 0.2709, + "step": 32748 + }, + { + "epoch": 0.6080060002302069, + "grad_norm": 0.46932491660118103, + "learning_rate": 6.6714261610083e-06, + "loss": 0.2783, + "step": 32750 + }, + { + "epoch": 0.6080431303676255, + "grad_norm": 0.4139155149459839, + "learning_rate": 6.6703262095808766e-06, + "loss": 0.369, + "step": 32752 + }, + { + "epoch": 0.6080802605050442, + "grad_norm": 0.46052414178848267, + "learning_rate": 6.669226303460335e-06, + "loss": 0.3554, + "step": 32754 + }, + { + "epoch": 0.6081173906424627, + "grad_norm": 0.3651711046695709, + "learning_rate": 6.668126442661648e-06, + "loss": 0.3992, + "step": 32756 + }, + { + "epoch": 0.6081545207798814, + "grad_norm": 0.42071935534477234, + "learning_rate": 6.6670266271997795e-06, + "loss": 0.2138, + "step": 32758 + }, + { + "epoch": 0.6081916509173001, + "grad_norm": 0.34669363498687744, + "learning_rate": 6.665926857089698e-06, + "loss": 0.1847, + "step": 32760 + }, + { + "epoch": 0.6082287810547187, + "grad_norm": 0.6089686751365662, + "learning_rate": 6.664827132346361e-06, + "loss": 0.25, + "step": 32762 + }, + { + "epoch": 0.6082659111921374, + "grad_norm": 0.3463898003101349, + "learning_rate": 6.663727452984743e-06, + "loss": 0.1514, + "step": 32764 + }, + { + "epoch": 0.6083030413295559, + "grad_norm": 0.3123079836368561, + "learning_rate": 6.662627819019796e-06, + "loss": 0.1732, + "step": 32766 + }, + { + "epoch": 0.6083401714669746, + "grad_norm": 0.33132031559944153, + "learning_rate": 6.661528230466487e-06, + "loss": 0.2335, + "step": 32768 + }, + { + "epoch": 0.6083773016043932, + "grad_norm": 0.5688385963439941, + "learning_rate": 6.6604286873397815e-06, + "loss": 0.292, + "step": 32770 + }, + { + "epoch": 0.6084144317418119, + "grad_norm": 0.3657228946685791, + "learning_rate": 6.659329189654638e-06, + "loss": 0.1794, + "step": 32772 + }, + { + "epoch": 0.6084515618792306, + "grad_norm": 0.43812522292137146, + "learning_rate": 6.658229737426016e-06, + "loss": 0.3285, + "step": 32774 + }, + { + "epoch": 0.6084886920166491, + "grad_norm": 0.3374134302139282, + "learning_rate": 6.657130330668877e-06, + "loss": 0.1921, + "step": 32776 + }, + { + "epoch": 0.6085258221540678, + "grad_norm": 0.3188070058822632, + "learning_rate": 6.656030969398187e-06, + "loss": 0.2265, + "step": 32778 + }, + { + "epoch": 0.6085629522914864, + "grad_norm": 0.4505956470966339, + "learning_rate": 6.654931653628894e-06, + "loss": 0.2337, + "step": 32780 + }, + { + "epoch": 0.6086000824289051, + "grad_norm": 0.2641972005367279, + "learning_rate": 6.653832383375964e-06, + "loss": 0.2661, + "step": 32782 + }, + { + "epoch": 0.6086372125663237, + "grad_norm": 0.3299517035484314, + "learning_rate": 6.652733158654351e-06, + "loss": 0.3367, + "step": 32784 + }, + { + "epoch": 0.6086743427037423, + "grad_norm": 0.2928280830383301, + "learning_rate": 6.651633979479013e-06, + "loss": 0.2291, + "step": 32786 + }, + { + "epoch": 0.608711472841161, + "grad_norm": 0.3623118996620178, + "learning_rate": 6.650534845864906e-06, + "loss": 0.4383, + "step": 32788 + }, + { + "epoch": 0.6087486029785796, + "grad_norm": 0.5305338501930237, + "learning_rate": 6.6494357578269915e-06, + "loss": 0.3214, + "step": 32790 + }, + { + "epoch": 0.6087857331159983, + "grad_norm": 0.27662909030914307, + "learning_rate": 6.648336715380219e-06, + "loss": 0.313, + "step": 32792 + }, + { + "epoch": 0.608822863253417, + "grad_norm": 0.4266732633113861, + "learning_rate": 6.647237718539541e-06, + "loss": 0.4248, + "step": 32794 + }, + { + "epoch": 0.6088599933908355, + "grad_norm": 0.31537142395973206, + "learning_rate": 6.646138767319916e-06, + "loss": 0.1635, + "step": 32796 + }, + { + "epoch": 0.6088971235282542, + "grad_norm": 0.6782666444778442, + "learning_rate": 6.6450398617362965e-06, + "loss": 0.2337, + "step": 32798 + }, + { + "epoch": 0.6089342536656728, + "grad_norm": 0.9137536883354187, + "learning_rate": 6.643941001803638e-06, + "loss": 0.3029, + "step": 32800 + }, + { + "epoch": 0.6089713838030915, + "grad_norm": 0.4484010934829712, + "learning_rate": 6.642842187536888e-06, + "loss": 0.4011, + "step": 32802 + }, + { + "epoch": 0.6090085139405101, + "grad_norm": 0.5610449314117432, + "learning_rate": 6.641743418951001e-06, + "loss": 0.3067, + "step": 32804 + }, + { + "epoch": 0.6090456440779287, + "grad_norm": 0.34543943405151367, + "learning_rate": 6.640644696060924e-06, + "loss": 0.2898, + "step": 32806 + }, + { + "epoch": 0.6090827742153474, + "grad_norm": 0.33914604783058167, + "learning_rate": 6.639546018881611e-06, + "loss": 0.3389, + "step": 32808 + }, + { + "epoch": 0.609119904352766, + "grad_norm": 0.3454466760158539, + "learning_rate": 6.638447387428011e-06, + "loss": 0.3109, + "step": 32810 + }, + { + "epoch": 0.6091570344901847, + "grad_norm": 0.5863184928894043, + "learning_rate": 6.637348801715076e-06, + "loss": 0.3368, + "step": 32812 + }, + { + "epoch": 0.6091941646276033, + "grad_norm": 0.3228932321071625, + "learning_rate": 6.636250261757751e-06, + "loss": 0.2055, + "step": 32814 + }, + { + "epoch": 0.6092312947650219, + "grad_norm": 0.4191569685935974, + "learning_rate": 6.635151767570982e-06, + "loss": 0.4926, + "step": 32816 + }, + { + "epoch": 0.6092684249024406, + "grad_norm": 0.3733617067337036, + "learning_rate": 6.634053319169719e-06, + "loss": 0.2847, + "step": 32818 + }, + { + "epoch": 0.6093055550398592, + "grad_norm": 0.37717553973197937, + "learning_rate": 6.6329549165689065e-06, + "loss": 0.2492, + "step": 32820 + }, + { + "epoch": 0.6093426851772779, + "grad_norm": 0.47194042801856995, + "learning_rate": 6.631856559783496e-06, + "loss": 0.5036, + "step": 32822 + }, + { + "epoch": 0.6093798153146964, + "grad_norm": 0.3301788568496704, + "learning_rate": 6.6307582488284264e-06, + "loss": 0.2047, + "step": 32824 + }, + { + "epoch": 0.6094169454521151, + "grad_norm": 0.495137482881546, + "learning_rate": 6.6296599837186435e-06, + "loss": 0.2615, + "step": 32826 + }, + { + "epoch": 0.6094540755895338, + "grad_norm": 0.7760566473007202, + "learning_rate": 6.628561764469099e-06, + "loss": 0.4259, + "step": 32828 + }, + { + "epoch": 0.6094912057269524, + "grad_norm": 0.4803955852985382, + "learning_rate": 6.627463591094725e-06, + "loss": 0.2227, + "step": 32830 + }, + { + "epoch": 0.609528335864371, + "grad_norm": 0.4168407618999481, + "learning_rate": 6.626365463610469e-06, + "loss": 0.1223, + "step": 32832 + }, + { + "epoch": 0.6095654660017896, + "grad_norm": 0.13965439796447754, + "learning_rate": 6.625267382031277e-06, + "loss": 0.2154, + "step": 32834 + }, + { + "epoch": 0.6096025961392083, + "grad_norm": 0.4420328736305237, + "learning_rate": 6.624169346372086e-06, + "loss": 0.2294, + "step": 32836 + }, + { + "epoch": 0.609639726276627, + "grad_norm": 0.3596246838569641, + "learning_rate": 6.623071356647836e-06, + "loss": 0.2922, + "step": 32838 + }, + { + "epoch": 0.6096768564140456, + "grad_norm": 0.401109904050827, + "learning_rate": 6.621973412873477e-06, + "loss": 0.3026, + "step": 32840 + }, + { + "epoch": 0.6097139865514642, + "grad_norm": 0.36242595314979553, + "learning_rate": 6.6208755150639355e-06, + "loss": 0.1687, + "step": 32842 + }, + { + "epoch": 0.6097511166888828, + "grad_norm": 0.5056469440460205, + "learning_rate": 6.619777663234159e-06, + "loss": 0.138, + "step": 32844 + }, + { + "epoch": 0.6097882468263015, + "grad_norm": 0.2522508203983307, + "learning_rate": 6.618679857399082e-06, + "loss": 0.3664, + "step": 32846 + }, + { + "epoch": 0.6098253769637202, + "grad_norm": 0.3038733899593353, + "learning_rate": 6.617582097573643e-06, + "loss": 0.334, + "step": 32848 + }, + { + "epoch": 0.6098625071011388, + "grad_norm": 0.23065631091594696, + "learning_rate": 6.61648438377278e-06, + "loss": 0.2753, + "step": 32850 + }, + { + "epoch": 0.6098996372385574, + "grad_norm": 0.3708191514015198, + "learning_rate": 6.615386716011434e-06, + "loss": 0.3781, + "step": 32852 + }, + { + "epoch": 0.609936767375976, + "grad_norm": 0.2590278387069702, + "learning_rate": 6.614289094304534e-06, + "loss": 0.305, + "step": 32854 + }, + { + "epoch": 0.6099738975133947, + "grad_norm": 0.586765468120575, + "learning_rate": 6.613191518667019e-06, + "loss": 0.2261, + "step": 32856 + }, + { + "epoch": 0.6100110276508134, + "grad_norm": 0.3995712399482727, + "learning_rate": 6.61209398911382e-06, + "loss": 0.0987, + "step": 32858 + }, + { + "epoch": 0.610048157788232, + "grad_norm": 0.31207749247550964, + "learning_rate": 6.6109965056598745e-06, + "loss": 0.2768, + "step": 32860 + }, + { + "epoch": 0.6100852879256506, + "grad_norm": 0.4302317500114441, + "learning_rate": 6.609899068320116e-06, + "loss": 0.4211, + "step": 32862 + }, + { + "epoch": 0.6101224180630692, + "grad_norm": 0.4288552403450012, + "learning_rate": 6.6088016771094775e-06, + "loss": 0.221, + "step": 32864 + }, + { + "epoch": 0.6101595482004879, + "grad_norm": 0.44692501425743103, + "learning_rate": 6.607704332042892e-06, + "loss": 0.2812, + "step": 32866 + }, + { + "epoch": 0.6101966783379066, + "grad_norm": 0.5933568477630615, + "learning_rate": 6.606607033135288e-06, + "loss": 0.2551, + "step": 32868 + }, + { + "epoch": 0.6102338084753252, + "grad_norm": 0.24897730350494385, + "learning_rate": 6.605509780401595e-06, + "loss": 0.2425, + "step": 32870 + }, + { + "epoch": 0.6102709386127438, + "grad_norm": 0.22982865571975708, + "learning_rate": 6.604412573856749e-06, + "loss": 0.5191, + "step": 32872 + }, + { + "epoch": 0.6103080687501624, + "grad_norm": 0.3344128131866455, + "learning_rate": 6.603315413515678e-06, + "loss": 0.2552, + "step": 32874 + }, + { + "epoch": 0.6103451988875811, + "grad_norm": 0.3237553834915161, + "learning_rate": 6.602218299393306e-06, + "loss": 0.2905, + "step": 32876 + }, + { + "epoch": 0.6103823290249997, + "grad_norm": 0.2504872977733612, + "learning_rate": 6.601121231504571e-06, + "loss": 0.179, + "step": 32878 + }, + { + "epoch": 0.6104194591624184, + "grad_norm": 0.5493221282958984, + "learning_rate": 6.6000242098643916e-06, + "loss": 0.4061, + "step": 32880 + }, + { + "epoch": 0.610456589299837, + "grad_norm": 0.3034321665763855, + "learning_rate": 6.598927234487698e-06, + "loss": 0.2555, + "step": 32882 + }, + { + "epoch": 0.6104937194372556, + "grad_norm": 0.3993164896965027, + "learning_rate": 6.5978303053894165e-06, + "loss": 0.2592, + "step": 32884 + }, + { + "epoch": 0.6105308495746743, + "grad_norm": 0.4795207977294922, + "learning_rate": 6.596733422584478e-06, + "loss": 0.3893, + "step": 32886 + }, + { + "epoch": 0.6105679797120929, + "grad_norm": 0.5287923812866211, + "learning_rate": 6.5956365860878e-06, + "loss": 0.2128, + "step": 32888 + }, + { + "epoch": 0.6106051098495116, + "grad_norm": 0.34218212962150574, + "learning_rate": 6.594539795914315e-06, + "loss": 0.309, + "step": 32890 + }, + { + "epoch": 0.6106422399869302, + "grad_norm": 0.4681243300437927, + "learning_rate": 6.593443052078939e-06, + "loss": 0.1562, + "step": 32892 + }, + { + "epoch": 0.6106793701243488, + "grad_norm": 0.2978992462158203, + "learning_rate": 6.592346354596599e-06, + "loss": 0.2366, + "step": 32894 + }, + { + "epoch": 0.6107165002617675, + "grad_norm": 0.2970438301563263, + "learning_rate": 6.5912497034822185e-06, + "loss": 0.2118, + "step": 32896 + }, + { + "epoch": 0.6107536303991861, + "grad_norm": 0.49563807249069214, + "learning_rate": 6.590153098750721e-06, + "loss": 0.2844, + "step": 32898 + }, + { + "epoch": 0.6107907605366047, + "grad_norm": 0.2288004755973816, + "learning_rate": 6.5890565404170244e-06, + "loss": 0.3342, + "step": 32900 + }, + { + "epoch": 0.6108278906740234, + "grad_norm": 0.3173462748527527, + "learning_rate": 6.587960028496051e-06, + "loss": 0.1512, + "step": 32902 + }, + { + "epoch": 0.610865020811442, + "grad_norm": 0.42793482542037964, + "learning_rate": 6.586863563002725e-06, + "loss": 0.3627, + "step": 32904 + }, + { + "epoch": 0.6109021509488607, + "grad_norm": 0.5063623189926147, + "learning_rate": 6.58576714395196e-06, + "loss": 0.2035, + "step": 32906 + }, + { + "epoch": 0.6109392810862793, + "grad_norm": 0.35794708132743835, + "learning_rate": 6.5846707713586776e-06, + "loss": 0.4158, + "step": 32908 + }, + { + "epoch": 0.610976411223698, + "grad_norm": 0.2992331087589264, + "learning_rate": 6.583574445237795e-06, + "loss": 0.4699, + "step": 32910 + }, + { + "epoch": 0.6110135413611166, + "grad_norm": 0.3020668923854828, + "learning_rate": 6.582478165604229e-06, + "loss": 0.2197, + "step": 32912 + }, + { + "epoch": 0.6110506714985352, + "grad_norm": 0.4621583819389343, + "learning_rate": 6.581381932472901e-06, + "loss": 0.2897, + "step": 32914 + }, + { + "epoch": 0.6110878016359539, + "grad_norm": 0.31691548228263855, + "learning_rate": 6.580285745858728e-06, + "loss": 0.2745, + "step": 32916 + }, + { + "epoch": 0.6111249317733725, + "grad_norm": 0.34798476099967957, + "learning_rate": 6.579189605776617e-06, + "loss": 0.2279, + "step": 32918 + }, + { + "epoch": 0.6111620619107911, + "grad_norm": 0.2948818504810333, + "learning_rate": 6.578093512241492e-06, + "loss": 0.1182, + "step": 32920 + }, + { + "epoch": 0.6111991920482097, + "grad_norm": 0.41257402300834656, + "learning_rate": 6.576997465268264e-06, + "loss": 0.242, + "step": 32922 + }, + { + "epoch": 0.6112363221856284, + "grad_norm": 0.4669298827648163, + "learning_rate": 6.575901464871845e-06, + "loss": 0.3101, + "step": 32924 + }, + { + "epoch": 0.6112734523230471, + "grad_norm": 0.5174317359924316, + "learning_rate": 6.574805511067153e-06, + "loss": 0.2405, + "step": 32926 + }, + { + "epoch": 0.6113105824604657, + "grad_norm": 0.34562817215919495, + "learning_rate": 6.573709603869102e-06, + "loss": 0.2381, + "step": 32928 + }, + { + "epoch": 0.6113477125978843, + "grad_norm": 0.2984899878501892, + "learning_rate": 6.572613743292597e-06, + "loss": 0.3387, + "step": 32930 + }, + { + "epoch": 0.6113848427353029, + "grad_norm": 0.5052001476287842, + "learning_rate": 6.571517929352552e-06, + "loss": 0.3186, + "step": 32932 + }, + { + "epoch": 0.6114219728727216, + "grad_norm": 0.18774987757205963, + "learning_rate": 6.5704221620638785e-06, + "loss": 0.0648, + "step": 32934 + }, + { + "epoch": 0.6114591030101403, + "grad_norm": 0.5624311566352844, + "learning_rate": 6.569326441441486e-06, + "loss": 0.4532, + "step": 32936 + }, + { + "epoch": 0.6114962331475589, + "grad_norm": 0.1844814568758011, + "learning_rate": 6.568230767500287e-06, + "loss": 0.3266, + "step": 32938 + }, + { + "epoch": 0.6115333632849775, + "grad_norm": 0.3739783465862274, + "learning_rate": 6.567135140255185e-06, + "loss": 0.2318, + "step": 32940 + }, + { + "epoch": 0.6115704934223961, + "grad_norm": 0.2932315170764923, + "learning_rate": 6.566039559721096e-06, + "loss": 0.1094, + "step": 32942 + }, + { + "epoch": 0.6116076235598148, + "grad_norm": 0.47917065024375916, + "learning_rate": 6.564944025912918e-06, + "loss": 0.2782, + "step": 32944 + }, + { + "epoch": 0.6116447536972335, + "grad_norm": 0.46618887782096863, + "learning_rate": 6.563848538845563e-06, + "loss": 0.4315, + "step": 32946 + }, + { + "epoch": 0.611681883834652, + "grad_norm": 0.36694973707199097, + "learning_rate": 6.562753098533937e-06, + "loss": 0.3107, + "step": 32948 + }, + { + "epoch": 0.6117190139720707, + "grad_norm": 0.32681792974472046, + "learning_rate": 6.561657704992947e-06, + "loss": 0.2528, + "step": 32950 + }, + { + "epoch": 0.6117561441094893, + "grad_norm": 0.25283411145210266, + "learning_rate": 6.5605623582374966e-06, + "loss": 0.3299, + "step": 32952 + }, + { + "epoch": 0.611793274246908, + "grad_norm": 0.2517596483230591, + "learning_rate": 6.559467058282492e-06, + "loss": 0.2835, + "step": 32954 + }, + { + "epoch": 0.6118304043843267, + "grad_norm": 0.2988564074039459, + "learning_rate": 6.558371805142832e-06, + "loss": 0.1676, + "step": 32956 + }, + { + "epoch": 0.6118675345217452, + "grad_norm": 0.27291765809059143, + "learning_rate": 6.557276598833422e-06, + "loss": 0.1826, + "step": 32958 + }, + { + "epoch": 0.6119046646591639, + "grad_norm": 0.47746115922927856, + "learning_rate": 6.556181439369169e-06, + "loss": 0.5474, + "step": 32960 + }, + { + "epoch": 0.6119417947965825, + "grad_norm": 0.49516817927360535, + "learning_rate": 6.555086326764968e-06, + "loss": 0.1944, + "step": 32962 + }, + { + "epoch": 0.6119789249340012, + "grad_norm": 0.33475977182388306, + "learning_rate": 6.553991261035723e-06, + "loss": 0.1576, + "step": 32964 + }, + { + "epoch": 0.6120160550714199, + "grad_norm": 0.32170072197914124, + "learning_rate": 6.552896242196341e-06, + "loss": 0.2266, + "step": 32966 + }, + { + "epoch": 0.6120531852088384, + "grad_norm": 0.5481547117233276, + "learning_rate": 6.5518012702617106e-06, + "loss": 0.2796, + "step": 32968 + }, + { + "epoch": 0.6120903153462571, + "grad_norm": 0.42201921343803406, + "learning_rate": 6.550706345246736e-06, + "loss": 0.2333, + "step": 32970 + }, + { + "epoch": 0.6121274454836757, + "grad_norm": 0.4481096565723419, + "learning_rate": 6.549611467166318e-06, + "loss": 0.341, + "step": 32972 + }, + { + "epoch": 0.6121645756210944, + "grad_norm": 0.4292342960834503, + "learning_rate": 6.548516636035352e-06, + "loss": 0.2189, + "step": 32974 + }, + { + "epoch": 0.612201705758513, + "grad_norm": 0.48943889141082764, + "learning_rate": 6.547421851868735e-06, + "loss": 0.4204, + "step": 32976 + }, + { + "epoch": 0.6122388358959316, + "grad_norm": 0.780896008014679, + "learning_rate": 6.546327114681369e-06, + "loss": 0.2284, + "step": 32978 + }, + { + "epoch": 0.6122759660333503, + "grad_norm": 0.39008629322052, + "learning_rate": 6.545232424488143e-06, + "loss": 0.3385, + "step": 32980 + }, + { + "epoch": 0.6123130961707689, + "grad_norm": 0.3087436854839325, + "learning_rate": 6.544137781303956e-06, + "loss": 0.4305, + "step": 32982 + }, + { + "epoch": 0.6123502263081876, + "grad_norm": 0.3258281648159027, + "learning_rate": 6.543043185143702e-06, + "loss": 0.2434, + "step": 32984 + }, + { + "epoch": 0.6123873564456062, + "grad_norm": 0.30154532194137573, + "learning_rate": 6.541948636022274e-06, + "loss": 0.2862, + "step": 32986 + }, + { + "epoch": 0.6124244865830248, + "grad_norm": 0.3135763704776764, + "learning_rate": 6.540854133954569e-06, + "loss": 0.3838, + "step": 32988 + }, + { + "epoch": 0.6124616167204435, + "grad_norm": 0.28022530674934387, + "learning_rate": 6.539759678955477e-06, + "loss": 0.4314, + "step": 32990 + }, + { + "epoch": 0.6124987468578621, + "grad_norm": 0.3986649513244629, + "learning_rate": 6.538665271039892e-06, + "loss": 0.1938, + "step": 32992 + }, + { + "epoch": 0.6125358769952808, + "grad_norm": 0.3138667941093445, + "learning_rate": 6.537570910222706e-06, + "loss": 0.1722, + "step": 32994 + }, + { + "epoch": 0.6125730071326994, + "grad_norm": 0.32664528489112854, + "learning_rate": 6.536476596518806e-06, + "loss": 0.3035, + "step": 32996 + }, + { + "epoch": 0.612610137270118, + "grad_norm": 0.4007793068885803, + "learning_rate": 6.5353823299430855e-06, + "loss": 0.4964, + "step": 32998 + }, + { + "epoch": 0.6126472674075367, + "grad_norm": 0.49423709511756897, + "learning_rate": 6.534288110510434e-06, + "loss": 0.2719, + "step": 33000 + }, + { + "epoch": 0.6126843975449553, + "grad_norm": 0.33546826243400574, + "learning_rate": 6.533193938235742e-06, + "loss": 0.2384, + "step": 33002 + }, + { + "epoch": 0.612721527682374, + "grad_norm": 0.44680553674697876, + "learning_rate": 6.532099813133896e-06, + "loss": 0.2929, + "step": 33004 + }, + { + "epoch": 0.6127586578197926, + "grad_norm": 0.3409819006919861, + "learning_rate": 6.531005735219784e-06, + "loss": 0.2278, + "step": 33006 + }, + { + "epoch": 0.6127957879572112, + "grad_norm": 0.8074988126754761, + "learning_rate": 6.529911704508292e-06, + "loss": 0.5943, + "step": 33008 + }, + { + "epoch": 0.6128329180946299, + "grad_norm": 0.35389649868011475, + "learning_rate": 6.528817721014306e-06, + "loss": 0.2371, + "step": 33010 + }, + { + "epoch": 0.6128700482320485, + "grad_norm": 0.23854900896549225, + "learning_rate": 6.527723784752715e-06, + "loss": 0.2657, + "step": 33012 + }, + { + "epoch": 0.6129071783694672, + "grad_norm": 0.3040628731250763, + "learning_rate": 6.5266298957384035e-06, + "loss": 0.3001, + "step": 33014 + }, + { + "epoch": 0.6129443085068857, + "grad_norm": 0.32979947328567505, + "learning_rate": 6.525536053986257e-06, + "loss": 0.378, + "step": 33016 + }, + { + "epoch": 0.6129814386443044, + "grad_norm": 0.2979823350906372, + "learning_rate": 6.524442259511154e-06, + "loss": 0.3029, + "step": 33018 + }, + { + "epoch": 0.6130185687817231, + "grad_norm": 0.37575721740722656, + "learning_rate": 6.52334851232798e-06, + "loss": 0.4912, + "step": 33020 + }, + { + "epoch": 0.6130556989191417, + "grad_norm": 0.39125654101371765, + "learning_rate": 6.52225481245162e-06, + "loss": 0.3983, + "step": 33022 + }, + { + "epoch": 0.6130928290565604, + "grad_norm": 0.3909904956817627, + "learning_rate": 6.5211611598969585e-06, + "loss": 0.2852, + "step": 33024 + }, + { + "epoch": 0.613129959193979, + "grad_norm": 0.35755226016044617, + "learning_rate": 6.520067554678871e-06, + "loss": 0.0824, + "step": 33026 + }, + { + "epoch": 0.6131670893313976, + "grad_norm": 0.38608142733573914, + "learning_rate": 6.518973996812239e-06, + "loss": 0.2652, + "step": 33028 + }, + { + "epoch": 0.6132042194688162, + "grad_norm": 0.2963379919528961, + "learning_rate": 6.517880486311948e-06, + "loss": 0.2369, + "step": 33030 + }, + { + "epoch": 0.6132413496062349, + "grad_norm": 0.6357234120368958, + "learning_rate": 6.5167870231928695e-06, + "loss": 0.2369, + "step": 33032 + }, + { + "epoch": 0.6132784797436536, + "grad_norm": 0.48091399669647217, + "learning_rate": 6.515693607469888e-06, + "loss": 0.3405, + "step": 33034 + }, + { + "epoch": 0.6133156098810721, + "grad_norm": 0.583322286605835, + "learning_rate": 6.51460023915788e-06, + "loss": 0.3737, + "step": 33036 + }, + { + "epoch": 0.6133527400184908, + "grad_norm": 0.4173484444618225, + "learning_rate": 6.513506918271722e-06, + "loss": 0.5614, + "step": 33038 + }, + { + "epoch": 0.6133898701559094, + "grad_norm": 0.4138701856136322, + "learning_rate": 6.51241364482629e-06, + "loss": 0.1895, + "step": 33040 + }, + { + "epoch": 0.6134270002933281, + "grad_norm": 0.3064877986907959, + "learning_rate": 6.511320418836466e-06, + "loss": 0.2161, + "step": 33042 + }, + { + "epoch": 0.6134641304307468, + "grad_norm": 0.28886619210243225, + "learning_rate": 6.510227240317118e-06, + "loss": 0.2918, + "step": 33044 + }, + { + "epoch": 0.6135012605681653, + "grad_norm": 0.5084324479103088, + "learning_rate": 6.509134109283126e-06, + "loss": 0.5349, + "step": 33046 + }, + { + "epoch": 0.613538390705584, + "grad_norm": 0.6784845590591431, + "learning_rate": 6.508041025749361e-06, + "loss": 0.3064, + "step": 33048 + }, + { + "epoch": 0.6135755208430026, + "grad_norm": 0.36370372772216797, + "learning_rate": 6.506947989730696e-06, + "loss": 0.3529, + "step": 33050 + }, + { + "epoch": 0.6136126509804213, + "grad_norm": 0.3345828056335449, + "learning_rate": 6.505855001242009e-06, + "loss": 0.2032, + "step": 33052 + }, + { + "epoch": 0.61364978111784, + "grad_norm": 0.5631812810897827, + "learning_rate": 6.504762060298171e-06, + "loss": 0.3461, + "step": 33054 + }, + { + "epoch": 0.6136869112552585, + "grad_norm": 0.3804876506328583, + "learning_rate": 6.503669166914049e-06, + "loss": 0.1982, + "step": 33056 + }, + { + "epoch": 0.6137240413926772, + "grad_norm": 0.7347245812416077, + "learning_rate": 6.502576321104519e-06, + "loss": 0.2427, + "step": 33058 + }, + { + "epoch": 0.6137611715300958, + "grad_norm": 0.33977210521698, + "learning_rate": 6.501483522884446e-06, + "loss": 0.4583, + "step": 33060 + }, + { + "epoch": 0.6137983016675145, + "grad_norm": 0.3442073464393616, + "learning_rate": 6.500390772268703e-06, + "loss": 0.2786, + "step": 33062 + }, + { + "epoch": 0.6138354318049332, + "grad_norm": 0.5103311538696289, + "learning_rate": 6.49929806927216e-06, + "loss": 0.3866, + "step": 33064 + }, + { + "epoch": 0.6138725619423517, + "grad_norm": 0.49007707834243774, + "learning_rate": 6.498205413909686e-06, + "loss": 0.1851, + "step": 33066 + }, + { + "epoch": 0.6139096920797704, + "grad_norm": 0.5131375193595886, + "learning_rate": 6.497112806196151e-06, + "loss": 0.4461, + "step": 33068 + }, + { + "epoch": 0.613946822217189, + "grad_norm": 2.0458126068115234, + "learning_rate": 6.496020246146413e-06, + "loss": 0.1861, + "step": 33070 + }, + { + "epoch": 0.6139839523546077, + "grad_norm": 0.18598277866840363, + "learning_rate": 6.494927733775343e-06, + "loss": 0.1727, + "step": 33072 + }, + { + "epoch": 0.6140210824920262, + "grad_norm": 0.5223251581192017, + "learning_rate": 6.493835269097808e-06, + "loss": 0.2294, + "step": 33074 + }, + { + "epoch": 0.6140582126294449, + "grad_norm": 0.31790274381637573, + "learning_rate": 6.492742852128675e-06, + "loss": 0.3094, + "step": 33076 + }, + { + "epoch": 0.6140953427668636, + "grad_norm": 0.29239508509635925, + "learning_rate": 6.4916504828828055e-06, + "loss": 0.1357, + "step": 33078 + }, + { + "epoch": 0.6141324729042822, + "grad_norm": 0.40649569034576416, + "learning_rate": 6.4905581613750665e-06, + "loss": 0.2772, + "step": 33080 + }, + { + "epoch": 0.6141696030417009, + "grad_norm": 0.3389107286930084, + "learning_rate": 6.489465887620315e-06, + "loss": 0.3091, + "step": 33082 + }, + { + "epoch": 0.6142067331791194, + "grad_norm": 0.524733304977417, + "learning_rate": 6.4883736616334185e-06, + "loss": 0.3464, + "step": 33084 + }, + { + "epoch": 0.6142438633165381, + "grad_norm": 0.4763987064361572, + "learning_rate": 6.487281483429238e-06, + "loss": 0.1915, + "step": 33086 + }, + { + "epoch": 0.6142809934539568, + "grad_norm": 0.3909938335418701, + "learning_rate": 6.486189353022636e-06, + "loss": 0.3777, + "step": 33088 + }, + { + "epoch": 0.6143181235913754, + "grad_norm": 0.44554078578948975, + "learning_rate": 6.4850972704284695e-06, + "loss": 0.229, + "step": 33090 + }, + { + "epoch": 0.6143552537287941, + "grad_norm": 0.36556869745254517, + "learning_rate": 6.484005235661605e-06, + "loss": 0.2317, + "step": 33092 + }, + { + "epoch": 0.6143923838662126, + "grad_norm": 0.4382587969303131, + "learning_rate": 6.482913248736895e-06, + "loss": 0.2382, + "step": 33094 + }, + { + "epoch": 0.6144295140036313, + "grad_norm": 0.31412291526794434, + "learning_rate": 6.481821309669199e-06, + "loss": 0.1976, + "step": 33096 + }, + { + "epoch": 0.61446664414105, + "grad_norm": 0.2784813344478607, + "learning_rate": 6.480729418473379e-06, + "loss": 0.1606, + "step": 33098 + }, + { + "epoch": 0.6145037742784686, + "grad_norm": 0.3418746590614319, + "learning_rate": 6.479637575164289e-06, + "loss": 0.2878, + "step": 33100 + }, + { + "epoch": 0.6145409044158873, + "grad_norm": 0.3554566204547882, + "learning_rate": 6.478545779756787e-06, + "loss": 0.3583, + "step": 33102 + }, + { + "epoch": 0.6145780345533058, + "grad_norm": 0.4605807065963745, + "learning_rate": 6.477454032265733e-06, + "loss": 0.4636, + "step": 33104 + }, + { + "epoch": 0.6146151646907245, + "grad_norm": 0.38364318013191223, + "learning_rate": 6.4763623327059745e-06, + "loss": 0.2091, + "step": 33106 + }, + { + "epoch": 0.6146522948281432, + "grad_norm": 0.19605998694896698, + "learning_rate": 6.475270681092369e-06, + "loss": 0.251, + "step": 33108 + }, + { + "epoch": 0.6146894249655618, + "grad_norm": 0.3493518531322479, + "learning_rate": 6.474179077439774e-06, + "loss": 0.1905, + "step": 33110 + }, + { + "epoch": 0.6147265551029805, + "grad_norm": 0.3489682674407959, + "learning_rate": 6.473087521763039e-06, + "loss": 0.2315, + "step": 33112 + }, + { + "epoch": 0.614763685240399, + "grad_norm": 0.27524271607398987, + "learning_rate": 6.4719960140770175e-06, + "loss": 0.3106, + "step": 33114 + }, + { + "epoch": 0.6148008153778177, + "grad_norm": 0.45186784863471985, + "learning_rate": 6.470904554396564e-06, + "loss": 0.2757, + "step": 33116 + }, + { + "epoch": 0.6148379455152364, + "grad_norm": 0.459164559841156, + "learning_rate": 6.469813142736533e-06, + "loss": 0.2602, + "step": 33118 + }, + { + "epoch": 0.614875075652655, + "grad_norm": 0.28344520926475525, + "learning_rate": 6.4687217791117685e-06, + "loss": 0.1638, + "step": 33120 + }, + { + "epoch": 0.6149122057900737, + "grad_norm": 0.40025684237480164, + "learning_rate": 6.467630463537121e-06, + "loss": 0.374, + "step": 33122 + }, + { + "epoch": 0.6149493359274922, + "grad_norm": 0.4474346935749054, + "learning_rate": 6.4665391960274415e-06, + "loss": 0.3748, + "step": 33124 + }, + { + "epoch": 0.6149864660649109, + "grad_norm": 0.27468186616897583, + "learning_rate": 6.465447976597581e-06, + "loss": 0.2126, + "step": 33126 + }, + { + "epoch": 0.6150235962023295, + "grad_norm": 0.4217972457408905, + "learning_rate": 6.464356805262388e-06, + "loss": 0.3537, + "step": 33128 + }, + { + "epoch": 0.6150607263397482, + "grad_norm": 0.4222817122936249, + "learning_rate": 6.463265682036708e-06, + "loss": 0.1881, + "step": 33130 + }, + { + "epoch": 0.6150978564771669, + "grad_norm": 0.3665860891342163, + "learning_rate": 6.462174606935387e-06, + "loss": 0.3712, + "step": 33132 + }, + { + "epoch": 0.6151349866145854, + "grad_norm": 0.3084348440170288, + "learning_rate": 6.461083579973273e-06, + "loss": 0.311, + "step": 33134 + }, + { + "epoch": 0.6151721167520041, + "grad_norm": 0.5085157155990601, + "learning_rate": 6.45999260116521e-06, + "loss": 0.429, + "step": 33136 + }, + { + "epoch": 0.6152092468894227, + "grad_norm": 0.47789356112480164, + "learning_rate": 6.458901670526044e-06, + "loss": 0.263, + "step": 33138 + }, + { + "epoch": 0.6152463770268414, + "grad_norm": 0.39849966764450073, + "learning_rate": 6.457810788070622e-06, + "loss": 0.3208, + "step": 33140 + }, + { + "epoch": 0.6152835071642601, + "grad_norm": 0.3974080979824066, + "learning_rate": 6.456719953813786e-06, + "loss": 0.336, + "step": 33142 + }, + { + "epoch": 0.6153206373016786, + "grad_norm": 0.4094145894050598, + "learning_rate": 6.4556291677703766e-06, + "loss": 0.1664, + "step": 33144 + }, + { + "epoch": 0.6153577674390973, + "grad_norm": 0.6352247595787048, + "learning_rate": 6.454538429955235e-06, + "loss": 0.4701, + "step": 33146 + }, + { + "epoch": 0.6153948975765159, + "grad_norm": 0.3249083161354065, + "learning_rate": 6.453447740383208e-06, + "loss": 0.2302, + "step": 33148 + }, + { + "epoch": 0.6154320277139346, + "grad_norm": 0.3730751574039459, + "learning_rate": 6.452357099069131e-06, + "loss": 0.2201, + "step": 33150 + }, + { + "epoch": 0.6154691578513533, + "grad_norm": 0.6651400923728943, + "learning_rate": 6.451266506027851e-06, + "loss": 0.2801, + "step": 33152 + }, + { + "epoch": 0.6155062879887718, + "grad_norm": 0.49591532349586487, + "learning_rate": 6.450175961274203e-06, + "loss": 0.3879, + "step": 33154 + }, + { + "epoch": 0.6155434181261905, + "grad_norm": 0.325879842042923, + "learning_rate": 6.44908546482303e-06, + "loss": 0.253, + "step": 33156 + }, + { + "epoch": 0.6155805482636091, + "grad_norm": 0.34087178111076355, + "learning_rate": 6.4479950166891634e-06, + "loss": 0.2821, + "step": 33158 + }, + { + "epoch": 0.6156176784010278, + "grad_norm": 0.40923139452934265, + "learning_rate": 6.4469046168874464e-06, + "loss": 0.2897, + "step": 33160 + }, + { + "epoch": 0.6156548085384465, + "grad_norm": 0.3220416009426117, + "learning_rate": 6.445814265432716e-06, + "loss": 0.322, + "step": 33162 + }, + { + "epoch": 0.615691938675865, + "grad_norm": 0.24226497113704681, + "learning_rate": 6.444723962339805e-06, + "loss": 0.1943, + "step": 33164 + }, + { + "epoch": 0.6157290688132837, + "grad_norm": 0.3367232382297516, + "learning_rate": 6.443633707623553e-06, + "loss": 0.308, + "step": 33166 + }, + { + "epoch": 0.6157661989507023, + "grad_norm": 0.3824969530105591, + "learning_rate": 6.442543501298797e-06, + "loss": 0.3017, + "step": 33168 + }, + { + "epoch": 0.615803329088121, + "grad_norm": 0.36098381876945496, + "learning_rate": 6.441453343380364e-06, + "loss": 0.3882, + "step": 33170 + }, + { + "epoch": 0.6158404592255395, + "grad_norm": 0.44608891010284424, + "learning_rate": 6.440363233883091e-06, + "loss": 0.4947, + "step": 33172 + }, + { + "epoch": 0.6158775893629582, + "grad_norm": 0.3605786859989166, + "learning_rate": 6.439273172821815e-06, + "loss": 0.317, + "step": 33174 + }, + { + "epoch": 0.6159147195003769, + "grad_norm": 0.454386830329895, + "learning_rate": 6.438183160211363e-06, + "loss": 0.2987, + "step": 33176 + }, + { + "epoch": 0.6159518496377955, + "grad_norm": 0.3336338400840759, + "learning_rate": 6.437093196066571e-06, + "loss": 0.1687, + "step": 33178 + }, + { + "epoch": 0.6159889797752142, + "grad_norm": 0.44107571244239807, + "learning_rate": 6.43600328040227e-06, + "loss": 0.2225, + "step": 33180 + }, + { + "epoch": 0.6160261099126327, + "grad_norm": 0.4113306701183319, + "learning_rate": 6.434913413233286e-06, + "loss": 0.2094, + "step": 33182 + }, + { + "epoch": 0.6160632400500514, + "grad_norm": 0.4570377469062805, + "learning_rate": 6.433823594574454e-06, + "loss": 0.0955, + "step": 33184 + }, + { + "epoch": 0.6161003701874701, + "grad_norm": 0.4180873930454254, + "learning_rate": 6.432733824440599e-06, + "loss": 0.3208, + "step": 33186 + }, + { + "epoch": 0.6161375003248887, + "grad_norm": 0.3868831396102905, + "learning_rate": 6.4316441028465515e-06, + "loss": 0.198, + "step": 33188 + }, + { + "epoch": 0.6161746304623074, + "grad_norm": 0.25433504581451416, + "learning_rate": 6.430554429807139e-06, + "loss": 0.3004, + "step": 33190 + }, + { + "epoch": 0.6162117605997259, + "grad_norm": 0.2533932030200958, + "learning_rate": 6.429464805337191e-06, + "loss": 0.4459, + "step": 33192 + }, + { + "epoch": 0.6162488907371446, + "grad_norm": 0.298885315656662, + "learning_rate": 6.428375229451532e-06, + "loss": 0.2569, + "step": 33194 + }, + { + "epoch": 0.6162860208745633, + "grad_norm": 0.27680450677871704, + "learning_rate": 6.427285702164988e-06, + "loss": 0.3591, + "step": 33196 + }, + { + "epoch": 0.6163231510119819, + "grad_norm": 0.268452912569046, + "learning_rate": 6.426196223492383e-06, + "loss": 0.4684, + "step": 33198 + }, + { + "epoch": 0.6163602811494006, + "grad_norm": 0.46640223264694214, + "learning_rate": 6.425106793448541e-06, + "loss": 0.4145, + "step": 33200 + }, + { + "epoch": 0.6163974112868191, + "grad_norm": 0.468563973903656, + "learning_rate": 6.4240174120482875e-06, + "loss": 0.3722, + "step": 33202 + }, + { + "epoch": 0.6164345414242378, + "grad_norm": 0.2872581481933594, + "learning_rate": 6.422928079306449e-06, + "loss": 0.2299, + "step": 33204 + }, + { + "epoch": 0.6164716715616565, + "grad_norm": 0.5119061470031738, + "learning_rate": 6.4218387952378455e-06, + "loss": 0.2804, + "step": 33206 + }, + { + "epoch": 0.6165088016990751, + "grad_norm": 0.41904041171073914, + "learning_rate": 6.420749559857296e-06, + "loss": 0.2831, + "step": 33208 + }, + { + "epoch": 0.6165459318364938, + "grad_norm": 0.389636367559433, + "learning_rate": 6.419660373179622e-06, + "loss": 0.2257, + "step": 33210 + }, + { + "epoch": 0.6165830619739123, + "grad_norm": 0.38800057768821716, + "learning_rate": 6.4185712352196464e-06, + "loss": 0.1826, + "step": 33212 + }, + { + "epoch": 0.616620192111331, + "grad_norm": 0.3436047434806824, + "learning_rate": 6.417482145992191e-06, + "loss": 0.2883, + "step": 33214 + }, + { + "epoch": 0.6166573222487497, + "grad_norm": 0.3250725269317627, + "learning_rate": 6.416393105512071e-06, + "loss": 0.2469, + "step": 33216 + }, + { + "epoch": 0.6166944523861683, + "grad_norm": 0.451680451631546, + "learning_rate": 6.41530411379411e-06, + "loss": 0.1499, + "step": 33218 + }, + { + "epoch": 0.616731582523587, + "grad_norm": 0.3136092722415924, + "learning_rate": 6.414215170853119e-06, + "loss": 0.2315, + "step": 33220 + }, + { + "epoch": 0.6167687126610055, + "grad_norm": 0.6403464078903198, + "learning_rate": 6.413126276703918e-06, + "loss": 0.2244, + "step": 33222 + }, + { + "epoch": 0.6168058427984242, + "grad_norm": 0.26836350560188293, + "learning_rate": 6.412037431361326e-06, + "loss": 0.3053, + "step": 33224 + }, + { + "epoch": 0.6168429729358428, + "grad_norm": 0.41522327065467834, + "learning_rate": 6.410948634840158e-06, + "loss": 0.4316, + "step": 33226 + }, + { + "epoch": 0.6168801030732615, + "grad_norm": 0.30513033270835876, + "learning_rate": 6.409859887155227e-06, + "loss": 0.2539, + "step": 33228 + }, + { + "epoch": 0.6169172332106801, + "grad_norm": 0.34272241592407227, + "learning_rate": 6.4087711883213495e-06, + "loss": 0.1907, + "step": 33230 + }, + { + "epoch": 0.6169543633480987, + "grad_norm": 0.30178695917129517, + "learning_rate": 6.407682538353344e-06, + "loss": 0.2186, + "step": 33232 + }, + { + "epoch": 0.6169914934855174, + "grad_norm": 0.29983285069465637, + "learning_rate": 6.406593937266013e-06, + "loss": 0.2296, + "step": 33234 + }, + { + "epoch": 0.617028623622936, + "grad_norm": 0.41090255975723267, + "learning_rate": 6.405505385074179e-06, + "loss": 0.4401, + "step": 33236 + }, + { + "epoch": 0.6170657537603547, + "grad_norm": 0.32922253012657166, + "learning_rate": 6.404416881792646e-06, + "loss": 0.3256, + "step": 33238 + }, + { + "epoch": 0.6171028838977733, + "grad_norm": 0.2647010087966919, + "learning_rate": 6.4033284274362305e-06, + "loss": 0.3941, + "step": 33240 + }, + { + "epoch": 0.6171400140351919, + "grad_norm": 0.5008987784385681, + "learning_rate": 6.402240022019741e-06, + "loss": 0.297, + "step": 33242 + }, + { + "epoch": 0.6171771441726106, + "grad_norm": 0.4774961471557617, + "learning_rate": 6.401151665557994e-06, + "loss": 0.2839, + "step": 33244 + }, + { + "epoch": 0.6172142743100292, + "grad_norm": 0.2755691409111023, + "learning_rate": 6.400063358065787e-06, + "loss": 0.2683, + "step": 33246 + }, + { + "epoch": 0.6172514044474479, + "grad_norm": 0.909270167350769, + "learning_rate": 6.398975099557938e-06, + "loss": 0.422, + "step": 33248 + }, + { + "epoch": 0.6172885345848665, + "grad_norm": 0.2864827513694763, + "learning_rate": 6.397886890049249e-06, + "loss": 0.1347, + "step": 33250 + }, + { + "epoch": 0.6173256647222851, + "grad_norm": 0.5350677371025085, + "learning_rate": 6.39679872955453e-06, + "loss": 0.2044, + "step": 33252 + }, + { + "epoch": 0.6173627948597038, + "grad_norm": 0.7250619530677795, + "learning_rate": 6.395710618088587e-06, + "loss": 0.3853, + "step": 33254 + }, + { + "epoch": 0.6173999249971224, + "grad_norm": 0.6137080192565918, + "learning_rate": 6.39462255566623e-06, + "loss": 0.3967, + "step": 33256 + }, + { + "epoch": 0.6174370551345411, + "grad_norm": 0.41511866450309753, + "learning_rate": 6.393534542302258e-06, + "loss": 0.4025, + "step": 33258 + }, + { + "epoch": 0.6174741852719597, + "grad_norm": 0.7183419466018677, + "learning_rate": 6.39244657801148e-06, + "loss": 0.3129, + "step": 33260 + }, + { + "epoch": 0.6175113154093783, + "grad_norm": 0.4176501929759979, + "learning_rate": 6.391358662808695e-06, + "loss": 0.137, + "step": 33262 + }, + { + "epoch": 0.617548445546797, + "grad_norm": 0.39454394578933716, + "learning_rate": 6.390270796708711e-06, + "loss": 0.2419, + "step": 33264 + }, + { + "epoch": 0.6175855756842156, + "grad_norm": 0.28319844603538513, + "learning_rate": 6.389182979726331e-06, + "loss": 0.1551, + "step": 33266 + }, + { + "epoch": 0.6176227058216343, + "grad_norm": 0.35768696665763855, + "learning_rate": 6.3880952118763525e-06, + "loss": 0.3303, + "step": 33268 + }, + { + "epoch": 0.6176598359590529, + "grad_norm": 0.3630194067955017, + "learning_rate": 6.3870074931735824e-06, + "loss": 0.2665, + "step": 33270 + }, + { + "epoch": 0.6176969660964715, + "grad_norm": 0.3873700797557831, + "learning_rate": 6.385919823632815e-06, + "loss": 0.2194, + "step": 33272 + }, + { + "epoch": 0.6177340962338902, + "grad_norm": 0.45491498708724976, + "learning_rate": 6.384832203268854e-06, + "loss": 0.2585, + "step": 33274 + }, + { + "epoch": 0.6177712263713088, + "grad_norm": 0.35012829303741455, + "learning_rate": 6.383744632096495e-06, + "loss": 0.379, + "step": 33276 + }, + { + "epoch": 0.6178083565087275, + "grad_norm": 0.3113013505935669, + "learning_rate": 6.382657110130546e-06, + "loss": 0.4105, + "step": 33278 + }, + { + "epoch": 0.617845486646146, + "grad_norm": 0.4309644103050232, + "learning_rate": 6.381569637385794e-06, + "loss": 0.2201, + "step": 33280 + }, + { + "epoch": 0.6178826167835647, + "grad_norm": 0.3338206112384796, + "learning_rate": 6.380482213877045e-06, + "loss": 0.3196, + "step": 33282 + }, + { + "epoch": 0.6179197469209834, + "grad_norm": 0.4329235255718231, + "learning_rate": 6.379394839619088e-06, + "loss": 0.1962, + "step": 33284 + }, + { + "epoch": 0.617956877058402, + "grad_norm": 0.3053397536277771, + "learning_rate": 6.378307514626722e-06, + "loss": 0.222, + "step": 33286 + }, + { + "epoch": 0.6179940071958206, + "grad_norm": 0.4653834104537964, + "learning_rate": 6.377220238914741e-06, + "loss": 0.274, + "step": 33288 + }, + { + "epoch": 0.6180311373332392, + "grad_norm": 0.3061346411705017, + "learning_rate": 6.376133012497945e-06, + "loss": 0.1059, + "step": 33290 + }, + { + "epoch": 0.6180682674706579, + "grad_norm": 0.39407241344451904, + "learning_rate": 6.375045835391121e-06, + "loss": 0.3046, + "step": 33292 + }, + { + "epoch": 0.6181053976080766, + "grad_norm": 0.20992477238178253, + "learning_rate": 6.373958707609069e-06, + "loss": 0.2434, + "step": 33294 + }, + { + "epoch": 0.6181425277454952, + "grad_norm": 0.37773194909095764, + "learning_rate": 6.372871629166575e-06, + "loss": 0.4344, + "step": 33296 + }, + { + "epoch": 0.6181796578829138, + "grad_norm": 0.34386521577835083, + "learning_rate": 6.371784600078433e-06, + "loss": 0.3349, + "step": 33298 + }, + { + "epoch": 0.6182167880203324, + "grad_norm": 0.1684887409210205, + "learning_rate": 6.370697620359436e-06, + "loss": 0.1522, + "step": 33300 + }, + { + "epoch": 0.6182539181577511, + "grad_norm": 0.43745386600494385, + "learning_rate": 6.369610690024373e-06, + "loss": 0.3218, + "step": 33302 + }, + { + "epoch": 0.6182910482951698, + "grad_norm": 0.5287793874740601, + "learning_rate": 6.368523809088034e-06, + "loss": 0.1382, + "step": 33304 + }, + { + "epoch": 0.6183281784325884, + "grad_norm": 0.35927218198776245, + "learning_rate": 6.3674369775652115e-06, + "loss": 0.4028, + "step": 33306 + }, + { + "epoch": 0.618365308570007, + "grad_norm": 0.4231165945529938, + "learning_rate": 6.366350195470687e-06, + "loss": 0.4429, + "step": 33308 + }, + { + "epoch": 0.6184024387074256, + "grad_norm": 0.39693304896354675, + "learning_rate": 6.3652634628192525e-06, + "loss": 0.2971, + "step": 33310 + }, + { + "epoch": 0.6184395688448443, + "grad_norm": 0.49616900086402893, + "learning_rate": 6.364176779625697e-06, + "loss": 0.2773, + "step": 33312 + }, + { + "epoch": 0.618476698982263, + "grad_norm": 0.36219197511672974, + "learning_rate": 6.3630901459048025e-06, + "loss": 0.1197, + "step": 33314 + }, + { + "epoch": 0.6185138291196816, + "grad_norm": 0.4326838552951813, + "learning_rate": 6.362003561671358e-06, + "loss": 0.2995, + "step": 33316 + }, + { + "epoch": 0.6185509592571002, + "grad_norm": 0.25405293703079224, + "learning_rate": 6.360917026940147e-06, + "loss": 0.2371, + "step": 33318 + }, + { + "epoch": 0.6185880893945188, + "grad_norm": 0.30564671754837036, + "learning_rate": 6.35983054172596e-06, + "loss": 0.1064, + "step": 33320 + }, + { + "epoch": 0.6186252195319375, + "grad_norm": 0.3000064194202423, + "learning_rate": 6.358744106043574e-06, + "loss": 0.3431, + "step": 33322 + }, + { + "epoch": 0.6186623496693561, + "grad_norm": 0.423822820186615, + "learning_rate": 6.357657719907772e-06, + "loss": 0.3845, + "step": 33324 + }, + { + "epoch": 0.6186994798067748, + "grad_norm": 0.3667752146720886, + "learning_rate": 6.356571383333337e-06, + "loss": 0.3177, + "step": 33326 + }, + { + "epoch": 0.6187366099441934, + "grad_norm": 0.2843489944934845, + "learning_rate": 6.355485096335052e-06, + "loss": 0.251, + "step": 33328 + }, + { + "epoch": 0.618773740081612, + "grad_norm": 0.8024854063987732, + "learning_rate": 6.354398858927701e-06, + "loss": 0.4505, + "step": 33330 + }, + { + "epoch": 0.6188108702190307, + "grad_norm": 0.31695571541786194, + "learning_rate": 6.353312671126063e-06, + "loss": 0.1438, + "step": 33332 + }, + { + "epoch": 0.6188480003564493, + "grad_norm": 0.32730311155319214, + "learning_rate": 6.352226532944915e-06, + "loss": 0.2741, + "step": 33334 + }, + { + "epoch": 0.618885130493868, + "grad_norm": 0.40460073947906494, + "learning_rate": 6.351140444399035e-06, + "loss": 0.3114, + "step": 33336 + }, + { + "epoch": 0.6189222606312866, + "grad_norm": 0.5215739607810974, + "learning_rate": 6.350054405503205e-06, + "loss": 0.3624, + "step": 33338 + }, + { + "epoch": 0.6189593907687052, + "grad_norm": 0.4731350243091583, + "learning_rate": 6.348968416272202e-06, + "loss": 0.4386, + "step": 33340 + }, + { + "epoch": 0.6189965209061239, + "grad_norm": 0.25077199935913086, + "learning_rate": 6.347882476720803e-06, + "loss": 0.1345, + "step": 33342 + }, + { + "epoch": 0.6190336510435425, + "grad_norm": 0.3972725570201874, + "learning_rate": 6.346796586863788e-06, + "loss": 0.3556, + "step": 33344 + }, + { + "epoch": 0.6190707811809611, + "grad_norm": 0.27672943472862244, + "learning_rate": 6.345710746715925e-06, + "loss": 0.3759, + "step": 33346 + }, + { + "epoch": 0.6191079113183798, + "grad_norm": 0.3596516251564026, + "learning_rate": 6.344624956291991e-06, + "loss": 0.4195, + "step": 33348 + }, + { + "epoch": 0.6191450414557984, + "grad_norm": 0.457146555185318, + "learning_rate": 6.343539215606764e-06, + "loss": 0.1052, + "step": 33350 + }, + { + "epoch": 0.6191821715932171, + "grad_norm": 0.3335888087749481, + "learning_rate": 6.342453524675016e-06, + "loss": 0.1799, + "step": 33352 + }, + { + "epoch": 0.6192193017306357, + "grad_norm": 0.3823656737804413, + "learning_rate": 6.34136788351152e-06, + "loss": 0.2373, + "step": 33354 + }, + { + "epoch": 0.6192564318680543, + "grad_norm": 0.3855670094490051, + "learning_rate": 6.340282292131048e-06, + "loss": 0.1443, + "step": 33356 + }, + { + "epoch": 0.619293562005473, + "grad_norm": 0.437018483877182, + "learning_rate": 6.3391967505483756e-06, + "loss": 0.3183, + "step": 33358 + }, + { + "epoch": 0.6193306921428916, + "grad_norm": 0.3893338739871979, + "learning_rate": 6.338111258778264e-06, + "loss": 0.137, + "step": 33360 + }, + { + "epoch": 0.6193678222803103, + "grad_norm": 0.30368420481681824, + "learning_rate": 6.337025816835491e-06, + "loss": 0.1675, + "step": 33362 + }, + { + "epoch": 0.6194049524177289, + "grad_norm": 0.3592333197593689, + "learning_rate": 6.335940424734828e-06, + "loss": 0.4007, + "step": 33364 + }, + { + "epoch": 0.6194420825551475, + "grad_norm": 0.21633483469486237, + "learning_rate": 6.334855082491037e-06, + "loss": 0.2542, + "step": 33366 + }, + { + "epoch": 0.6194792126925662, + "grad_norm": 0.5048404335975647, + "learning_rate": 6.333769790118891e-06, + "loss": 0.1114, + "step": 33368 + }, + { + "epoch": 0.6195163428299848, + "grad_norm": 0.39145511388778687, + "learning_rate": 6.33268454763316e-06, + "loss": 0.1339, + "step": 33370 + }, + { + "epoch": 0.6195534729674035, + "grad_norm": 0.4640060067176819, + "learning_rate": 6.3315993550486036e-06, + "loss": 0.3211, + "step": 33372 + }, + { + "epoch": 0.6195906031048221, + "grad_norm": 0.3121573030948639, + "learning_rate": 6.3305142123799935e-06, + "loss": 0.1447, + "step": 33374 + }, + { + "epoch": 0.6196277332422407, + "grad_norm": 0.43064436316490173, + "learning_rate": 6.329429119642092e-06, + "loss": 0.3384, + "step": 33376 + }, + { + "epoch": 0.6196648633796593, + "grad_norm": 0.3839457631111145, + "learning_rate": 6.328344076849665e-06, + "loss": 0.3374, + "step": 33378 + }, + { + "epoch": 0.619701993517078, + "grad_norm": 0.5601218342781067, + "learning_rate": 6.327259084017478e-06, + "loss": 0.4757, + "step": 33380 + }, + { + "epoch": 0.6197391236544967, + "grad_norm": 0.31021183729171753, + "learning_rate": 6.326174141160297e-06, + "loss": 0.2898, + "step": 33382 + }, + { + "epoch": 0.6197762537919153, + "grad_norm": 0.4099505543708801, + "learning_rate": 6.325089248292878e-06, + "loss": 0.3153, + "step": 33384 + }, + { + "epoch": 0.6198133839293339, + "grad_norm": 0.47506025433540344, + "learning_rate": 6.324004405429988e-06, + "loss": 0.1783, + "step": 33386 + }, + { + "epoch": 0.6198505140667525, + "grad_norm": 0.6071589589118958, + "learning_rate": 6.322919612586387e-06, + "loss": 0.2586, + "step": 33388 + }, + { + "epoch": 0.6198876442041712, + "grad_norm": 0.4597158133983612, + "learning_rate": 6.321834869776835e-06, + "loss": 0.3362, + "step": 33390 + }, + { + "epoch": 0.6199247743415899, + "grad_norm": 0.5954412221908569, + "learning_rate": 6.320750177016092e-06, + "loss": 0.242, + "step": 33392 + }, + { + "epoch": 0.6199619044790085, + "grad_norm": 0.3859167993068695, + "learning_rate": 6.3196655343189235e-06, + "loss": 0.2448, + "step": 33394 + }, + { + "epoch": 0.6199990346164271, + "grad_norm": 0.38169625401496887, + "learning_rate": 6.318580941700079e-06, + "loss": 0.2692, + "step": 33396 + }, + { + "epoch": 0.6200361647538457, + "grad_norm": 0.4628596901893616, + "learning_rate": 6.317496399174322e-06, + "loss": 0.2511, + "step": 33398 + }, + { + "epoch": 0.6200732948912644, + "grad_norm": 0.37985485792160034, + "learning_rate": 6.316411906756408e-06, + "loss": 0.1323, + "step": 33400 + }, + { + "epoch": 0.6201104250286831, + "grad_norm": 0.3160107135772705, + "learning_rate": 6.315327464461094e-06, + "loss": 0.3513, + "step": 33402 + }, + { + "epoch": 0.6201475551661016, + "grad_norm": 0.9894459843635559, + "learning_rate": 6.314243072303137e-06, + "loss": 0.3699, + "step": 33404 + }, + { + "epoch": 0.6201846853035203, + "grad_norm": 0.5605846047401428, + "learning_rate": 6.313158730297291e-06, + "loss": 0.3255, + "step": 33406 + }, + { + "epoch": 0.6202218154409389, + "grad_norm": 0.7524283528327942, + "learning_rate": 6.312074438458316e-06, + "loss": 0.4171, + "step": 33408 + }, + { + "epoch": 0.6202589455783576, + "grad_norm": 0.40950241684913635, + "learning_rate": 6.310990196800955e-06, + "loss": 0.2222, + "step": 33410 + }, + { + "epoch": 0.6202960757157763, + "grad_norm": 0.34139859676361084, + "learning_rate": 6.3099060053399685e-06, + "loss": 0.1857, + "step": 33412 + }, + { + "epoch": 0.6203332058531948, + "grad_norm": 0.3612729012966156, + "learning_rate": 6.308821864090109e-06, + "loss": 0.3563, + "step": 33414 + }, + { + "epoch": 0.6203703359906135, + "grad_norm": 0.28802332282066345, + "learning_rate": 6.307737773066129e-06, + "loss": 0.3761, + "step": 33416 + }, + { + "epoch": 0.6204074661280321, + "grad_norm": 0.45993295311927795, + "learning_rate": 6.306653732282776e-06, + "loss": 0.2985, + "step": 33418 + }, + { + "epoch": 0.6204445962654508, + "grad_norm": 0.3681545853614807, + "learning_rate": 6.305569741754807e-06, + "loss": 0.2873, + "step": 33420 + }, + { + "epoch": 0.6204817264028695, + "grad_norm": 0.28756701946258545, + "learning_rate": 6.304485801496964e-06, + "loss": 0.4095, + "step": 33422 + }, + { + "epoch": 0.620518856540288, + "grad_norm": 0.24331222474575043, + "learning_rate": 6.3034019115239995e-06, + "loss": 0.1753, + "step": 33424 + }, + { + "epoch": 0.6205559866777067, + "grad_norm": 0.5234178304672241, + "learning_rate": 6.302318071850664e-06, + "loss": 0.3179, + "step": 33426 + }, + { + "epoch": 0.6205931168151253, + "grad_norm": 0.3460370600223541, + "learning_rate": 6.301234282491704e-06, + "loss": 0.2699, + "step": 33428 + }, + { + "epoch": 0.620630246952544, + "grad_norm": 0.47693508863449097, + "learning_rate": 6.300150543461865e-06, + "loss": 0.24, + "step": 33430 + }, + { + "epoch": 0.6206673770899626, + "grad_norm": 0.5901803374290466, + "learning_rate": 6.299066854775897e-06, + "loss": 0.4031, + "step": 33432 + }, + { + "epoch": 0.6207045072273812, + "grad_norm": 0.4246162474155426, + "learning_rate": 6.297983216448542e-06, + "loss": 0.2395, + "step": 33434 + }, + { + "epoch": 0.6207416373647999, + "grad_norm": 0.5186436176300049, + "learning_rate": 6.296899628494545e-06, + "loss": 0.1754, + "step": 33436 + }, + { + "epoch": 0.6207787675022185, + "grad_norm": 0.35105907917022705, + "learning_rate": 6.295816090928654e-06, + "loss": 0.2276, + "step": 33438 + }, + { + "epoch": 0.6208158976396372, + "grad_norm": 0.4499373137950897, + "learning_rate": 6.29473260376561e-06, + "loss": 0.2084, + "step": 33440 + }, + { + "epoch": 0.6208530277770558, + "grad_norm": 0.30024397373199463, + "learning_rate": 6.293649167020156e-06, + "loss": 0.1673, + "step": 33442 + }, + { + "epoch": 0.6208901579144744, + "grad_norm": 0.23456653952598572, + "learning_rate": 6.292565780707035e-06, + "loss": 0.3916, + "step": 33444 + }, + { + "epoch": 0.6209272880518931, + "grad_norm": 0.3159496784210205, + "learning_rate": 6.291482444840993e-06, + "loss": 0.2663, + "step": 33446 + }, + { + "epoch": 0.6209644181893117, + "grad_norm": 0.4011788070201874, + "learning_rate": 6.290399159436762e-06, + "loss": 0.4351, + "step": 33448 + }, + { + "epoch": 0.6210015483267304, + "grad_norm": 0.3560031056404114, + "learning_rate": 6.2893159245090895e-06, + "loss": 0.2579, + "step": 33450 + }, + { + "epoch": 0.621038678464149, + "grad_norm": 0.3825017511844635, + "learning_rate": 6.288232740072711e-06, + "loss": 0.1405, + "step": 33452 + }, + { + "epoch": 0.6210758086015676, + "grad_norm": 0.4587399363517761, + "learning_rate": 6.287149606142364e-06, + "loss": 0.1194, + "step": 33454 + }, + { + "epoch": 0.6211129387389863, + "grad_norm": 0.40120425820350647, + "learning_rate": 6.286066522732792e-06, + "loss": 0.5247, + "step": 33456 + }, + { + "epoch": 0.6211500688764049, + "grad_norm": 0.41624021530151367, + "learning_rate": 6.2849834898587335e-06, + "loss": 0.3626, + "step": 33458 + }, + { + "epoch": 0.6211871990138236, + "grad_norm": 0.4103415012359619, + "learning_rate": 6.283900507534921e-06, + "loss": 0.2227, + "step": 33460 + }, + { + "epoch": 0.6212243291512421, + "grad_norm": 0.6267563700675964, + "learning_rate": 6.282817575776089e-06, + "loss": 0.2891, + "step": 33462 + }, + { + "epoch": 0.6212614592886608, + "grad_norm": 0.602595865726471, + "learning_rate": 6.281734694596975e-06, + "loss": 0.1885, + "step": 33464 + }, + { + "epoch": 0.6212985894260795, + "grad_norm": 0.2548821270465851, + "learning_rate": 6.280651864012315e-06, + "loss": 0.2907, + "step": 33466 + }, + { + "epoch": 0.6213357195634981, + "grad_norm": 0.4155820906162262, + "learning_rate": 6.279569084036844e-06, + "loss": 0.3823, + "step": 33468 + }, + { + "epoch": 0.6213728497009168, + "grad_norm": 0.4594273865222931, + "learning_rate": 6.278486354685294e-06, + "loss": 0.4306, + "step": 33470 + }, + { + "epoch": 0.6214099798383353, + "grad_norm": 0.4496306777000427, + "learning_rate": 6.277403675972397e-06, + "loss": 0.1366, + "step": 33472 + }, + { + "epoch": 0.621447109975754, + "grad_norm": 0.512714684009552, + "learning_rate": 6.276321047912883e-06, + "loss": 0.2604, + "step": 33474 + }, + { + "epoch": 0.6214842401131726, + "grad_norm": 0.5083045363426208, + "learning_rate": 6.275238470521487e-06, + "loss": 0.2115, + "step": 33476 + }, + { + "epoch": 0.6215213702505913, + "grad_norm": 0.26935267448425293, + "learning_rate": 6.274155943812938e-06, + "loss": 0.225, + "step": 33478 + }, + { + "epoch": 0.62155850038801, + "grad_norm": 0.32789263129234314, + "learning_rate": 6.273073467801969e-06, + "loss": 0.2408, + "step": 33480 + }, + { + "epoch": 0.6215956305254285, + "grad_norm": 0.2412440925836563, + "learning_rate": 6.271991042503305e-06, + "loss": 0.2569, + "step": 33482 + }, + { + "epoch": 0.6216327606628472, + "grad_norm": 0.3709932267665863, + "learning_rate": 6.270908667931679e-06, + "loss": 0.4383, + "step": 33484 + }, + { + "epoch": 0.6216698908002658, + "grad_norm": 0.3263992369174957, + "learning_rate": 6.269826344101813e-06, + "loss": 0.435, + "step": 33486 + }, + { + "epoch": 0.6217070209376845, + "grad_norm": 0.46520960330963135, + "learning_rate": 6.268744071028437e-06, + "loss": 0.3145, + "step": 33488 + }, + { + "epoch": 0.6217441510751032, + "grad_norm": 0.3823679983615875, + "learning_rate": 6.267661848726279e-06, + "loss": 0.3142, + "step": 33490 + }, + { + "epoch": 0.6217812812125217, + "grad_norm": 0.4349318742752075, + "learning_rate": 6.266579677210062e-06, + "loss": 0.4574, + "step": 33492 + }, + { + "epoch": 0.6218184113499404, + "grad_norm": 0.345052033662796, + "learning_rate": 6.265497556494513e-06, + "loss": 0.2733, + "step": 33494 + }, + { + "epoch": 0.621855541487359, + "grad_norm": 0.8973308801651001, + "learning_rate": 6.26441548659436e-06, + "loss": 0.33, + "step": 33496 + }, + { + "epoch": 0.6218926716247777, + "grad_norm": 0.38141483068466187, + "learning_rate": 6.263333467524319e-06, + "loss": 0.3403, + "step": 33498 + }, + { + "epoch": 0.6219298017621964, + "grad_norm": 0.27269378304481506, + "learning_rate": 6.262251499299117e-06, + "loss": 0.1462, + "step": 33500 + }, + { + "epoch": 0.6219669318996149, + "grad_norm": 0.6646750569343567, + "learning_rate": 6.261169581933477e-06, + "loss": 0.4609, + "step": 33502 + }, + { + "epoch": 0.6220040620370336, + "grad_norm": 0.49614548683166504, + "learning_rate": 6.260087715442119e-06, + "loss": 0.3653, + "step": 33504 + }, + { + "epoch": 0.6220411921744522, + "grad_norm": 0.4744861125946045, + "learning_rate": 6.2590058998397654e-06, + "loss": 0.3324, + "step": 33506 + }, + { + "epoch": 0.6220783223118709, + "grad_norm": 0.3603343069553375, + "learning_rate": 6.257924135141139e-06, + "loss": 0.4553, + "step": 33508 + }, + { + "epoch": 0.6221154524492896, + "grad_norm": 0.4117833375930786, + "learning_rate": 6.2568424213609525e-06, + "loss": 0.3819, + "step": 33510 + }, + { + "epoch": 0.6221525825867081, + "grad_norm": 0.3570064902305603, + "learning_rate": 6.255760758513931e-06, + "loss": 0.3176, + "step": 33512 + }, + { + "epoch": 0.6221897127241268, + "grad_norm": 0.5464138388633728, + "learning_rate": 6.254679146614788e-06, + "loss": 0.2544, + "step": 33514 + }, + { + "epoch": 0.6222268428615454, + "grad_norm": 0.2973516285419464, + "learning_rate": 6.253597585678243e-06, + "loss": 0.2901, + "step": 33516 + }, + { + "epoch": 0.6222639729989641, + "grad_norm": 0.38917866349220276, + "learning_rate": 6.252516075719013e-06, + "loss": 0.3537, + "step": 33518 + }, + { + "epoch": 0.6223011031363828, + "grad_norm": 0.2568664848804474, + "learning_rate": 6.251434616751817e-06, + "loss": 0.1946, + "step": 33520 + }, + { + "epoch": 0.6223382332738013, + "grad_norm": 0.8497270345687866, + "learning_rate": 6.250353208791367e-06, + "loss": 0.1758, + "step": 33522 + }, + { + "epoch": 0.62237536341122, + "grad_norm": 0.3346641957759857, + "learning_rate": 6.249271851852379e-06, + "loss": 0.2139, + "step": 33524 + }, + { + "epoch": 0.6224124935486386, + "grad_norm": 0.4493895471096039, + "learning_rate": 6.2481905459495625e-06, + "loss": 0.5815, + "step": 33526 + }, + { + "epoch": 0.6224496236860573, + "grad_norm": 0.31807294487953186, + "learning_rate": 6.247109291097637e-06, + "loss": 0.2525, + "step": 33528 + }, + { + "epoch": 0.6224867538234758, + "grad_norm": 0.7084963321685791, + "learning_rate": 6.246028087311311e-06, + "loss": 0.2961, + "step": 33530 + }, + { + "epoch": 0.6225238839608945, + "grad_norm": 0.49804481863975525, + "learning_rate": 6.244946934605302e-06, + "loss": 0.223, + "step": 33532 + }, + { + "epoch": 0.6225610140983132, + "grad_norm": 0.39025992155075073, + "learning_rate": 6.243865832994316e-06, + "loss": 0.3599, + "step": 33534 + }, + { + "epoch": 0.6225981442357318, + "grad_norm": 0.26710212230682373, + "learning_rate": 6.242784782493066e-06, + "loss": 0.2713, + "step": 33536 + }, + { + "epoch": 0.6226352743731505, + "grad_norm": 0.40024060010910034, + "learning_rate": 6.24170378311626e-06, + "loss": 0.2298, + "step": 33538 + }, + { + "epoch": 0.622672404510569, + "grad_norm": 0.3097241520881653, + "learning_rate": 6.240622834878606e-06, + "loss": 0.2941, + "step": 33540 + }, + { + "epoch": 0.6227095346479877, + "grad_norm": 0.5373071432113647, + "learning_rate": 6.239541937794818e-06, + "loss": 0.3938, + "step": 33542 + }, + { + "epoch": 0.6227466647854064, + "grad_norm": 0.2220676839351654, + "learning_rate": 6.238461091879597e-06, + "loss": 0.3212, + "step": 33544 + }, + { + "epoch": 0.622783794922825, + "grad_norm": 0.3487304151058197, + "learning_rate": 6.237380297147658e-06, + "loss": 0.2238, + "step": 33546 + }, + { + "epoch": 0.6228209250602437, + "grad_norm": 0.45740750432014465, + "learning_rate": 6.236299553613699e-06, + "loss": 0.386, + "step": 33548 + }, + { + "epoch": 0.6228580551976622, + "grad_norm": 0.3692704737186432, + "learning_rate": 6.235218861292429e-06, + "loss": 0.3395, + "step": 33550 + }, + { + "epoch": 0.6228951853350809, + "grad_norm": 0.5486975312232971, + "learning_rate": 6.234138220198554e-06, + "loss": 0.4075, + "step": 33552 + }, + { + "epoch": 0.6229323154724996, + "grad_norm": 0.3254355490207672, + "learning_rate": 6.233057630346781e-06, + "loss": 0.28, + "step": 33554 + }, + { + "epoch": 0.6229694456099182, + "grad_norm": 0.5588211417198181, + "learning_rate": 6.231977091751806e-06, + "loss": 0.3147, + "step": 33556 + }, + { + "epoch": 0.6230065757473369, + "grad_norm": 0.3213815689086914, + "learning_rate": 6.230896604428343e-06, + "loss": 0.4682, + "step": 33558 + }, + { + "epoch": 0.6230437058847554, + "grad_norm": 0.3725796639919281, + "learning_rate": 6.2298161683910805e-06, + "loss": 0.3156, + "step": 33560 + }, + { + "epoch": 0.6230808360221741, + "grad_norm": 0.31407278776168823, + "learning_rate": 6.2287357836547294e-06, + "loss": 0.3139, + "step": 33562 + }, + { + "epoch": 0.6231179661595928, + "grad_norm": 0.4046039283275604, + "learning_rate": 6.227655450233986e-06, + "loss": 0.1866, + "step": 33564 + }, + { + "epoch": 0.6231550962970114, + "grad_norm": 0.26877516508102417, + "learning_rate": 6.226575168143555e-06, + "loss": 0.2306, + "step": 33566 + }, + { + "epoch": 0.6231922264344301, + "grad_norm": 0.3913203775882721, + "learning_rate": 6.2254949373981314e-06, + "loss": 0.333, + "step": 33568 + }, + { + "epoch": 0.6232293565718486, + "grad_norm": 0.3008491098880768, + "learning_rate": 6.224414758012416e-06, + "loss": 0.1067, + "step": 33570 + }, + { + "epoch": 0.6232664867092673, + "grad_norm": 0.44140657782554626, + "learning_rate": 6.22333463000111e-06, + "loss": 0.2979, + "step": 33572 + }, + { + "epoch": 0.623303616846686, + "grad_norm": 0.4018612504005432, + "learning_rate": 6.222254553378904e-06, + "loss": 0.2325, + "step": 33574 + }, + { + "epoch": 0.6233407469841046, + "grad_norm": 0.4377041459083557, + "learning_rate": 6.2211745281605e-06, + "loss": 0.265, + "step": 33576 + }, + { + "epoch": 0.6233778771215233, + "grad_norm": 0.714357316493988, + "learning_rate": 6.22009455436059e-06, + "loss": 0.324, + "step": 33578 + }, + { + "epoch": 0.6234150072589418, + "grad_norm": 0.3865002691745758, + "learning_rate": 6.219014631993869e-06, + "loss": 0.335, + "step": 33580 + }, + { + "epoch": 0.6234521373963605, + "grad_norm": 0.5451263785362244, + "learning_rate": 6.217934761075035e-06, + "loss": 0.3478, + "step": 33582 + }, + { + "epoch": 0.6234892675337791, + "grad_norm": 0.5084574222564697, + "learning_rate": 6.216854941618784e-06, + "loss": 0.2694, + "step": 33584 + }, + { + "epoch": 0.6235263976711978, + "grad_norm": 0.2983416020870209, + "learning_rate": 6.2157751736398016e-06, + "loss": 0.2923, + "step": 33586 + }, + { + "epoch": 0.6235635278086165, + "grad_norm": 0.412888765335083, + "learning_rate": 6.214695457152786e-06, + "loss": 0.2773, + "step": 33588 + }, + { + "epoch": 0.623600657946035, + "grad_norm": 0.44216951727867126, + "learning_rate": 6.213615792172425e-06, + "loss": 0.3006, + "step": 33590 + }, + { + "epoch": 0.6236377880834537, + "grad_norm": 0.30473342537879944, + "learning_rate": 6.212536178713412e-06, + "loss": 0.1612, + "step": 33592 + }, + { + "epoch": 0.6236749182208723, + "grad_norm": 0.5691676139831543, + "learning_rate": 6.211456616790437e-06, + "loss": 0.3484, + "step": 33594 + }, + { + "epoch": 0.623712048358291, + "grad_norm": 0.46212685108184814, + "learning_rate": 6.210377106418192e-06, + "loss": 0.1805, + "step": 33596 + }, + { + "epoch": 0.6237491784957097, + "grad_norm": 0.7607839107513428, + "learning_rate": 6.209297647611362e-06, + "loss": 0.4388, + "step": 33598 + }, + { + "epoch": 0.6237863086331282, + "grad_norm": 0.42715758085250854, + "learning_rate": 6.2082182403846345e-06, + "loss": 0.2543, + "step": 33600 + }, + { + "epoch": 0.6238234387705469, + "grad_norm": 0.3998908996582031, + "learning_rate": 6.207138884752699e-06, + "loss": 0.3483, + "step": 33602 + }, + { + "epoch": 0.6238605689079655, + "grad_norm": 0.50295490026474, + "learning_rate": 6.2060595807302424e-06, + "loss": 0.3534, + "step": 33604 + }, + { + "epoch": 0.6238976990453842, + "grad_norm": 0.34141379594802856, + "learning_rate": 6.204980328331954e-06, + "loss": 0.1619, + "step": 33606 + }, + { + "epoch": 0.6239348291828029, + "grad_norm": 0.4538635313510895, + "learning_rate": 6.203901127572512e-06, + "loss": 0.2529, + "step": 33608 + }, + { + "epoch": 0.6239719593202214, + "grad_norm": 0.3219510614871979, + "learning_rate": 6.20282197846661e-06, + "loss": 0.4102, + "step": 33610 + }, + { + "epoch": 0.6240090894576401, + "grad_norm": 0.42428040504455566, + "learning_rate": 6.201742881028922e-06, + "loss": 0.4422, + "step": 33612 + }, + { + "epoch": 0.6240462195950587, + "grad_norm": 0.4472428262233734, + "learning_rate": 6.200663835274138e-06, + "loss": 0.1824, + "step": 33614 + }, + { + "epoch": 0.6240833497324774, + "grad_norm": 0.3867628276348114, + "learning_rate": 6.1995848412169364e-06, + "loss": 0.3032, + "step": 33616 + }, + { + "epoch": 0.624120479869896, + "grad_norm": 0.4629480242729187, + "learning_rate": 6.198505898872007e-06, + "loss": 0.2959, + "step": 33618 + }, + { + "epoch": 0.6241576100073146, + "grad_norm": 0.23004283010959625, + "learning_rate": 6.197427008254021e-06, + "loss": 0.1355, + "step": 33620 + }, + { + "epoch": 0.6241947401447333, + "grad_norm": 0.508292555809021, + "learning_rate": 6.196348169377668e-06, + "loss": 0.566, + "step": 33622 + }, + { + "epoch": 0.6242318702821519, + "grad_norm": 0.37736445665359497, + "learning_rate": 6.195269382257619e-06, + "loss": 0.3869, + "step": 33624 + }, + { + "epoch": 0.6242690004195706, + "grad_norm": 0.35757002234458923, + "learning_rate": 6.194190646908558e-06, + "loss": 0.4167, + "step": 33626 + }, + { + "epoch": 0.6243061305569891, + "grad_norm": 0.3544449210166931, + "learning_rate": 6.193111963345165e-06, + "loss": 0.2409, + "step": 33628 + }, + { + "epoch": 0.6243432606944078, + "grad_norm": 0.44588562846183777, + "learning_rate": 6.19203333158211e-06, + "loss": 0.1527, + "step": 33630 + }, + { + "epoch": 0.6243803908318265, + "grad_norm": 0.41675305366516113, + "learning_rate": 6.190954751634078e-06, + "loss": 0.2863, + "step": 33632 + }, + { + "epoch": 0.6244175209692451, + "grad_norm": 0.34612202644348145, + "learning_rate": 6.189876223515746e-06, + "loss": 0.4173, + "step": 33634 + }, + { + "epoch": 0.6244546511066638, + "grad_norm": 0.38405779004096985, + "learning_rate": 6.188797747241782e-06, + "loss": 0.2842, + "step": 33636 + }, + { + "epoch": 0.6244917812440823, + "grad_norm": 0.5298508405685425, + "learning_rate": 6.187719322826864e-06, + "loss": 0.2592, + "step": 33638 + }, + { + "epoch": 0.624528911381501, + "grad_norm": 0.28581956028938293, + "learning_rate": 6.186640950285669e-06, + "loss": 0.2859, + "step": 33640 + }, + { + "epoch": 0.6245660415189197, + "grad_norm": 0.31445547938346863, + "learning_rate": 6.185562629632869e-06, + "loss": 0.2421, + "step": 33642 + }, + { + "epoch": 0.6246031716563383, + "grad_norm": 0.3807367980480194, + "learning_rate": 6.1844843608831325e-06, + "loss": 0.386, + "step": 33644 + }, + { + "epoch": 0.624640301793757, + "grad_norm": 0.4329812526702881, + "learning_rate": 6.1834061440511364e-06, + "loss": 0.5375, + "step": 33646 + }, + { + "epoch": 0.6246774319311755, + "grad_norm": 0.5119138956069946, + "learning_rate": 6.1823279791515546e-06, + "loss": 0.2728, + "step": 33648 + }, + { + "epoch": 0.6247145620685942, + "grad_norm": 0.6869155168533325, + "learning_rate": 6.181249866199052e-06, + "loss": 0.2547, + "step": 33650 + }, + { + "epoch": 0.6247516922060129, + "grad_norm": 0.35880687832832336, + "learning_rate": 6.180171805208298e-06, + "loss": 0.6476, + "step": 33652 + }, + { + "epoch": 0.6247888223434315, + "grad_norm": 0.19963699579238892, + "learning_rate": 6.179093796193964e-06, + "loss": 0.2864, + "step": 33654 + }, + { + "epoch": 0.6248259524808502, + "grad_norm": 0.38334372639656067, + "learning_rate": 6.178015839170719e-06, + "loss": 0.2039, + "step": 33656 + }, + { + "epoch": 0.6248630826182687, + "grad_norm": 0.28877779841423035, + "learning_rate": 6.176937934153231e-06, + "loss": 0.2892, + "step": 33658 + }, + { + "epoch": 0.6249002127556874, + "grad_norm": 0.376616895198822, + "learning_rate": 6.175860081156168e-06, + "loss": 0.3712, + "step": 33660 + }, + { + "epoch": 0.6249373428931061, + "grad_norm": 0.4133601486682892, + "learning_rate": 6.174782280194194e-06, + "loss": 0.2686, + "step": 33662 + }, + { + "epoch": 0.6249744730305247, + "grad_norm": 0.34283655881881714, + "learning_rate": 6.1737045312819725e-06, + "loss": 0.4099, + "step": 33664 + }, + { + "epoch": 0.6250116031679434, + "grad_norm": 0.34196093678474426, + "learning_rate": 6.172626834434172e-06, + "loss": 0.1498, + "step": 33666 + }, + { + "epoch": 0.6250487333053619, + "grad_norm": 0.4400765597820282, + "learning_rate": 6.171549189665456e-06, + "loss": 0.3034, + "step": 33668 + }, + { + "epoch": 0.6250858634427806, + "grad_norm": 0.40945225954055786, + "learning_rate": 6.17047159699049e-06, + "loss": 0.181, + "step": 33670 + }, + { + "epoch": 0.6251229935801993, + "grad_norm": 0.3731880187988281, + "learning_rate": 6.169394056423934e-06, + "loss": 0.3556, + "step": 33672 + }, + { + "epoch": 0.6251601237176179, + "grad_norm": 0.2364620715379715, + "learning_rate": 6.168316567980452e-06, + "loss": 0.1699, + "step": 33674 + }, + { + "epoch": 0.6251972538550365, + "grad_norm": 0.485349178314209, + "learning_rate": 6.167239131674703e-06, + "loss": 0.2843, + "step": 33676 + }, + { + "epoch": 0.6252343839924551, + "grad_norm": 0.29831939935684204, + "learning_rate": 6.166161747521347e-06, + "loss": 0.22, + "step": 33678 + }, + { + "epoch": 0.6252715141298738, + "grad_norm": 0.27302294969558716, + "learning_rate": 6.1650844155350465e-06, + "loss": 0.2673, + "step": 33680 + }, + { + "epoch": 0.6253086442672924, + "grad_norm": 0.26489681005477905, + "learning_rate": 6.164007135730463e-06, + "loss": 0.2521, + "step": 33682 + }, + { + "epoch": 0.6253457744047111, + "grad_norm": 0.5267603993415833, + "learning_rate": 6.162929908122253e-06, + "loss": 0.5018, + "step": 33684 + }, + { + "epoch": 0.6253829045421297, + "grad_norm": 0.3156895041465759, + "learning_rate": 6.161852732725071e-06, + "loss": 0.2595, + "step": 33686 + }, + { + "epoch": 0.6254200346795483, + "grad_norm": 0.4034641683101654, + "learning_rate": 6.160775609553575e-06, + "loss": 0.1991, + "step": 33688 + }, + { + "epoch": 0.625457164816967, + "grad_norm": 0.39785236120224, + "learning_rate": 6.159698538622425e-06, + "loss": 0.2517, + "step": 33690 + }, + { + "epoch": 0.6254942949543856, + "grad_norm": 0.2915186285972595, + "learning_rate": 6.158621519946275e-06, + "loss": 0.1973, + "step": 33692 + }, + { + "epoch": 0.6255314250918043, + "grad_norm": 0.5866549015045166, + "learning_rate": 6.15754455353978e-06, + "loss": 0.3051, + "step": 33694 + }, + { + "epoch": 0.625568555229223, + "grad_norm": 0.37558498978614807, + "learning_rate": 6.156467639417593e-06, + "loss": 0.094, + "step": 33696 + }, + { + "epoch": 0.6256056853666415, + "grad_norm": 0.3335569500923157, + "learning_rate": 6.155390777594373e-06, + "loss": 0.2218, + "step": 33698 + }, + { + "epoch": 0.6256428155040602, + "grad_norm": 0.505942165851593, + "learning_rate": 6.154313968084764e-06, + "loss": 0.2639, + "step": 33700 + }, + { + "epoch": 0.6256799456414788, + "grad_norm": 0.45688754320144653, + "learning_rate": 6.153237210903422e-06, + "loss": 0.3439, + "step": 33702 + }, + { + "epoch": 0.6257170757788975, + "grad_norm": 0.5846219658851624, + "learning_rate": 6.152160506065004e-06, + "loss": 0.2327, + "step": 33704 + }, + { + "epoch": 0.6257542059163161, + "grad_norm": 0.49177420139312744, + "learning_rate": 6.151083853584151e-06, + "loss": 0.21, + "step": 33706 + }, + { + "epoch": 0.6257913360537347, + "grad_norm": 0.4829719662666321, + "learning_rate": 6.1500072534755196e-06, + "loss": 0.3661, + "step": 33708 + }, + { + "epoch": 0.6258284661911534, + "grad_norm": 0.6135827898979187, + "learning_rate": 6.148930705753761e-06, + "loss": 0.1218, + "step": 33710 + }, + { + "epoch": 0.625865596328572, + "grad_norm": 2.516249895095825, + "learning_rate": 6.147854210433515e-06, + "loss": 0.3147, + "step": 33712 + }, + { + "epoch": 0.6259027264659907, + "grad_norm": 0.2733874022960663, + "learning_rate": 6.1467777675294385e-06, + "loss": 0.304, + "step": 33714 + }, + { + "epoch": 0.6259398566034093, + "grad_norm": 0.3303776681423187, + "learning_rate": 6.145701377056172e-06, + "loss": 0.2423, + "step": 33716 + }, + { + "epoch": 0.6259769867408279, + "grad_norm": 0.4016251564025879, + "learning_rate": 6.144625039028365e-06, + "loss": 0.336, + "step": 33718 + }, + { + "epoch": 0.6260141168782466, + "grad_norm": 0.3686492145061493, + "learning_rate": 6.143548753460662e-06, + "loss": 0.279, + "step": 33720 + }, + { + "epoch": 0.6260512470156652, + "grad_norm": 0.3058331310749054, + "learning_rate": 6.142472520367715e-06, + "loss": 0.3619, + "step": 33722 + }, + { + "epoch": 0.6260883771530839, + "grad_norm": 0.5305180549621582, + "learning_rate": 6.141396339764156e-06, + "loss": 0.3481, + "step": 33724 + }, + { + "epoch": 0.6261255072905025, + "grad_norm": 0.3334653377532959, + "learning_rate": 6.140320211664639e-06, + "loss": 0.2598, + "step": 33726 + }, + { + "epoch": 0.6261626374279211, + "grad_norm": 0.47472646832466125, + "learning_rate": 6.139244136083801e-06, + "loss": 0.2044, + "step": 33728 + }, + { + "epoch": 0.6261997675653398, + "grad_norm": 0.46620553731918335, + "learning_rate": 6.138168113036285e-06, + "loss": 0.3054, + "step": 33730 + }, + { + "epoch": 0.6262368977027584, + "grad_norm": 0.3397664725780487, + "learning_rate": 6.137092142536733e-06, + "loss": 0.3721, + "step": 33732 + }, + { + "epoch": 0.626274027840177, + "grad_norm": 0.40037786960601807, + "learning_rate": 6.136016224599789e-06, + "loss": 0.3973, + "step": 33734 + }, + { + "epoch": 0.6263111579775956, + "grad_norm": 0.3438727855682373, + "learning_rate": 6.134940359240091e-06, + "loss": 0.3175, + "step": 33736 + }, + { + "epoch": 0.6263482881150143, + "grad_norm": 0.4470684230327606, + "learning_rate": 6.133864546472276e-06, + "loss": 0.1328, + "step": 33738 + }, + { + "epoch": 0.626385418252433, + "grad_norm": 0.3403429687023163, + "learning_rate": 6.132788786310983e-06, + "loss": 0.362, + "step": 33740 + }, + { + "epoch": 0.6264225483898516, + "grad_norm": 0.40784934163093567, + "learning_rate": 6.13171307877085e-06, + "loss": 0.1683, + "step": 33742 + }, + { + "epoch": 0.6264596785272702, + "grad_norm": 0.3788936138153076, + "learning_rate": 6.130637423866518e-06, + "loss": 0.4557, + "step": 33744 + }, + { + "epoch": 0.6264968086646888, + "grad_norm": 0.4682208001613617, + "learning_rate": 6.1295618216126175e-06, + "loss": 0.4296, + "step": 33746 + }, + { + "epoch": 0.6265339388021075, + "grad_norm": 0.1407458782196045, + "learning_rate": 6.128486272023792e-06, + "loss": 0.2423, + "step": 33748 + }, + { + "epoch": 0.6265710689395262, + "grad_norm": 0.9254353046417236, + "learning_rate": 6.1274107751146686e-06, + "loss": 0.2534, + "step": 33750 + }, + { + "epoch": 0.6266081990769448, + "grad_norm": 0.26007452607154846, + "learning_rate": 6.126335330899884e-06, + "loss": 0.1586, + "step": 33752 + }, + { + "epoch": 0.6266453292143634, + "grad_norm": 0.7198523283004761, + "learning_rate": 6.125259939394073e-06, + "loss": 0.2512, + "step": 33754 + }, + { + "epoch": 0.626682459351782, + "grad_norm": 0.6036335229873657, + "learning_rate": 6.1241846006118684e-06, + "loss": 0.2661, + "step": 33756 + }, + { + "epoch": 0.6267195894892007, + "grad_norm": 0.5589054822921753, + "learning_rate": 6.123109314567903e-06, + "loss": 0.2248, + "step": 33758 + }, + { + "epoch": 0.6267567196266194, + "grad_norm": 0.4162291884422302, + "learning_rate": 6.122034081276809e-06, + "loss": 0.3069, + "step": 33760 + }, + { + "epoch": 0.626793849764038, + "grad_norm": 0.3105548620223999, + "learning_rate": 6.120958900753211e-06, + "loss": 0.3566, + "step": 33762 + }, + { + "epoch": 0.6268309799014566, + "grad_norm": 0.38274091482162476, + "learning_rate": 6.119883773011746e-06, + "loss": 0.3503, + "step": 33764 + }, + { + "epoch": 0.6268681100388752, + "grad_norm": 0.43922746181488037, + "learning_rate": 6.11880869806704e-06, + "loss": 0.2227, + "step": 33766 + }, + { + "epoch": 0.6269052401762939, + "grad_norm": 0.3980352580547333, + "learning_rate": 6.11773367593372e-06, + "loss": 0.532, + "step": 33768 + }, + { + "epoch": 0.6269423703137126, + "grad_norm": 0.3464619219303131, + "learning_rate": 6.1166587066264174e-06, + "loss": 0.1799, + "step": 33770 + }, + { + "epoch": 0.6269795004511312, + "grad_norm": 0.3709046542644501, + "learning_rate": 6.115583790159757e-06, + "loss": 0.2985, + "step": 33772 + }, + { + "epoch": 0.6270166305885498, + "grad_norm": 0.4215565025806427, + "learning_rate": 6.11450892654837e-06, + "loss": 0.2319, + "step": 33774 + }, + { + "epoch": 0.6270537607259684, + "grad_norm": 0.35182565450668335, + "learning_rate": 6.113434115806874e-06, + "loss": 0.3307, + "step": 33776 + }, + { + "epoch": 0.6270908908633871, + "grad_norm": 0.5552123785018921, + "learning_rate": 6.112359357949901e-06, + "loss": 0.2357, + "step": 33778 + }, + { + "epoch": 0.6271280210008057, + "grad_norm": 0.3818530738353729, + "learning_rate": 6.111284652992069e-06, + "loss": 0.3848, + "step": 33780 + }, + { + "epoch": 0.6271651511382244, + "grad_norm": 0.44810959696769714, + "learning_rate": 6.110210000948006e-06, + "loss": 0.3052, + "step": 33782 + }, + { + "epoch": 0.627202281275643, + "grad_norm": 0.1763920933008194, + "learning_rate": 6.109135401832333e-06, + "loss": 0.2653, + "step": 33784 + }, + { + "epoch": 0.6272394114130616, + "grad_norm": 0.4581526219844818, + "learning_rate": 6.108060855659677e-06, + "loss": 0.2854, + "step": 33786 + }, + { + "epoch": 0.6272765415504803, + "grad_norm": 0.4450340270996094, + "learning_rate": 6.106986362444651e-06, + "loss": 0.2037, + "step": 33788 + }, + { + "epoch": 0.6273136716878989, + "grad_norm": 0.5797021985054016, + "learning_rate": 6.105911922201881e-06, + "loss": 0.3063, + "step": 33790 + }, + { + "epoch": 0.6273508018253175, + "grad_norm": 0.46969103813171387, + "learning_rate": 6.104837534945985e-06, + "loss": 0.1585, + "step": 33792 + }, + { + "epoch": 0.6273879319627362, + "grad_norm": 0.2981218993663788, + "learning_rate": 6.1037632006915815e-06, + "loss": 0.2523, + "step": 33794 + }, + { + "epoch": 0.6274250621001548, + "grad_norm": 0.2927955389022827, + "learning_rate": 6.102688919453292e-06, + "loss": 0.2513, + "step": 33796 + }, + { + "epoch": 0.6274621922375735, + "grad_norm": 0.4551640450954437, + "learning_rate": 6.101614691245734e-06, + "loss": 0.2243, + "step": 33798 + }, + { + "epoch": 0.6274993223749921, + "grad_norm": 0.41981008648872375, + "learning_rate": 6.100540516083522e-06, + "loss": 0.2888, + "step": 33800 + }, + { + "epoch": 0.6275364525124107, + "grad_norm": 0.3514082729816437, + "learning_rate": 6.099466393981273e-06, + "loss": 0.1425, + "step": 33802 + }, + { + "epoch": 0.6275735826498294, + "grad_norm": 0.35468536615371704, + "learning_rate": 6.0983923249536e-06, + "loss": 0.2403, + "step": 33804 + }, + { + "epoch": 0.627610712787248, + "grad_norm": 0.33821043372154236, + "learning_rate": 6.097318309015123e-06, + "loss": 0.3153, + "step": 33806 + }, + { + "epoch": 0.6276478429246667, + "grad_norm": 0.3924834430217743, + "learning_rate": 6.096244346180455e-06, + "loss": 0.2519, + "step": 33808 + }, + { + "epoch": 0.6276849730620853, + "grad_norm": 0.4211489260196686, + "learning_rate": 6.095170436464208e-06, + "loss": 0.2621, + "step": 33810 + }, + { + "epoch": 0.627722103199504, + "grad_norm": 0.3500491976737976, + "learning_rate": 6.094096579880996e-06, + "loss": 0.2091, + "step": 33812 + }, + { + "epoch": 0.6277592333369226, + "grad_norm": 0.30993348360061646, + "learning_rate": 6.093022776445428e-06, + "loss": 0.2823, + "step": 33814 + }, + { + "epoch": 0.6277963634743412, + "grad_norm": 0.3980141580104828, + "learning_rate": 6.091949026172117e-06, + "loss": 0.1791, + "step": 33816 + }, + { + "epoch": 0.6278334936117599, + "grad_norm": 0.5893418192863464, + "learning_rate": 6.090875329075674e-06, + "loss": 0.3796, + "step": 33818 + }, + { + "epoch": 0.6278706237491785, + "grad_norm": 0.5061236023902893, + "learning_rate": 6.089801685170709e-06, + "loss": 0.2261, + "step": 33820 + }, + { + "epoch": 0.6279077538865971, + "grad_norm": 0.3368014991283417, + "learning_rate": 6.08872809447183e-06, + "loss": 0.2813, + "step": 33822 + }, + { + "epoch": 0.6279448840240158, + "grad_norm": 0.45922359824180603, + "learning_rate": 6.087654556993649e-06, + "loss": 0.3107, + "step": 33824 + }, + { + "epoch": 0.6279820141614344, + "grad_norm": 0.4237291216850281, + "learning_rate": 6.086581072750769e-06, + "loss": 0.3051, + "step": 33826 + }, + { + "epoch": 0.6280191442988531, + "grad_norm": 0.37289169430732727, + "learning_rate": 6.085507641757798e-06, + "loss": 0.487, + "step": 33828 + }, + { + "epoch": 0.6280562744362717, + "grad_norm": 0.25098609924316406, + "learning_rate": 6.084434264029343e-06, + "loss": 0.3396, + "step": 33830 + }, + { + "epoch": 0.6280934045736903, + "grad_norm": 0.46209532022476196, + "learning_rate": 6.0833609395800074e-06, + "loss": 0.3498, + "step": 33832 + }, + { + "epoch": 0.6281305347111089, + "grad_norm": 0.46025192737579346, + "learning_rate": 6.082287668424399e-06, + "loss": 0.2618, + "step": 33834 + }, + { + "epoch": 0.6281676648485276, + "grad_norm": 0.3597796559333801, + "learning_rate": 6.081214450577124e-06, + "loss": 0.2214, + "step": 33836 + }, + { + "epoch": 0.6282047949859463, + "grad_norm": 0.4983012080192566, + "learning_rate": 6.080141286052778e-06, + "loss": 0.3155, + "step": 33838 + }, + { + "epoch": 0.6282419251233649, + "grad_norm": 0.3218439817428589, + "learning_rate": 6.0790681748659674e-06, + "loss": 0.1216, + "step": 33840 + }, + { + "epoch": 0.6282790552607835, + "grad_norm": 0.4289875328540802, + "learning_rate": 6.077995117031297e-06, + "loss": 0.2489, + "step": 33842 + }, + { + "epoch": 0.6283161853982021, + "grad_norm": 0.4476417899131775, + "learning_rate": 6.076922112563364e-06, + "loss": 0.2286, + "step": 33844 + }, + { + "epoch": 0.6283533155356208, + "grad_norm": 0.48402199149131775, + "learning_rate": 6.075849161476769e-06, + "loss": 0.5451, + "step": 33846 + }, + { + "epoch": 0.6283904456730395, + "grad_norm": 0.20987172424793243, + "learning_rate": 6.0747762637861175e-06, + "loss": 0.1435, + "step": 33848 + }, + { + "epoch": 0.628427575810458, + "grad_norm": 0.40642261505126953, + "learning_rate": 6.073703419505999e-06, + "loss": 0.1083, + "step": 33850 + }, + { + "epoch": 0.6284647059478767, + "grad_norm": 0.27885913848876953, + "learning_rate": 6.0726306286510175e-06, + "loss": 0.3879, + "step": 33852 + }, + { + "epoch": 0.6285018360852953, + "grad_norm": 0.43868330121040344, + "learning_rate": 6.0715578912357685e-06, + "loss": 0.3033, + "step": 33854 + }, + { + "epoch": 0.628538966222714, + "grad_norm": 0.37691131234169006, + "learning_rate": 6.07048520727485e-06, + "loss": 0.4333, + "step": 33856 + }, + { + "epoch": 0.6285760963601327, + "grad_norm": 0.406637042760849, + "learning_rate": 6.069412576782856e-06, + "loss": 0.3154, + "step": 33858 + }, + { + "epoch": 0.6286132264975512, + "grad_norm": 0.3655599057674408, + "learning_rate": 6.068339999774386e-06, + "loss": 0.2653, + "step": 33860 + }, + { + "epoch": 0.6286503566349699, + "grad_norm": 0.36459845304489136, + "learning_rate": 6.0672674762640325e-06, + "loss": 0.3079, + "step": 33862 + }, + { + "epoch": 0.6286874867723885, + "grad_norm": 0.26623696088790894, + "learning_rate": 6.066195006266389e-06, + "loss": 0.2303, + "step": 33864 + }, + { + "epoch": 0.6287246169098072, + "grad_norm": 0.3186604082584381, + "learning_rate": 6.065122589796045e-06, + "loss": 0.2062, + "step": 33866 + }, + { + "epoch": 0.6287617470472259, + "grad_norm": 0.3088248074054718, + "learning_rate": 6.064050226867597e-06, + "loss": 0.1533, + "step": 33868 + }, + { + "epoch": 0.6287988771846444, + "grad_norm": 0.3572561740875244, + "learning_rate": 6.062977917495636e-06, + "loss": 0.3008, + "step": 33870 + }, + { + "epoch": 0.6288360073220631, + "grad_norm": 0.3829580247402191, + "learning_rate": 6.061905661694755e-06, + "loss": 0.1458, + "step": 33872 + }, + { + "epoch": 0.6288731374594817, + "grad_norm": 0.319844514131546, + "learning_rate": 6.0608334594795435e-06, + "loss": 0.0907, + "step": 33874 + }, + { + "epoch": 0.6289102675969004, + "grad_norm": 0.4637920558452606, + "learning_rate": 6.059761310864586e-06, + "loss": 0.3299, + "step": 33876 + }, + { + "epoch": 0.6289473977343191, + "grad_norm": 0.23585030436515808, + "learning_rate": 6.058689215864474e-06, + "loss": 0.3233, + "step": 33878 + }, + { + "epoch": 0.6289845278717376, + "grad_norm": 0.354974627494812, + "learning_rate": 6.0576171744937966e-06, + "loss": 0.3089, + "step": 33880 + }, + { + "epoch": 0.6290216580091563, + "grad_norm": 0.429735392332077, + "learning_rate": 6.056545186767142e-06, + "loss": 0.2635, + "step": 33882 + }, + { + "epoch": 0.6290587881465749, + "grad_norm": 0.4359927177429199, + "learning_rate": 6.055473252699093e-06, + "loss": 0.2483, + "step": 33884 + }, + { + "epoch": 0.6290959182839936, + "grad_norm": 0.4963236451148987, + "learning_rate": 6.0544013723042435e-06, + "loss": 0.2013, + "step": 33886 + }, + { + "epoch": 0.6291330484214122, + "grad_norm": 0.38880491256713867, + "learning_rate": 6.0533295455971684e-06, + "loss": 0.1589, + "step": 33888 + }, + { + "epoch": 0.6291701785588308, + "grad_norm": 0.3633100986480713, + "learning_rate": 6.052257772592456e-06, + "loss": 0.3084, + "step": 33890 + }, + { + "epoch": 0.6292073086962495, + "grad_norm": 0.43028852343559265, + "learning_rate": 6.05118605330469e-06, + "loss": 0.3644, + "step": 33892 + }, + { + "epoch": 0.6292444388336681, + "grad_norm": 0.408188134431839, + "learning_rate": 6.050114387748458e-06, + "loss": 0.2321, + "step": 33894 + }, + { + "epoch": 0.6292815689710868, + "grad_norm": 0.32363826036453247, + "learning_rate": 6.0490427759383345e-06, + "loss": 0.4529, + "step": 33896 + }, + { + "epoch": 0.6293186991085054, + "grad_norm": 0.37696394324302673, + "learning_rate": 6.047971217888904e-06, + "loss": 0.2305, + "step": 33898 + }, + { + "epoch": 0.629355829245924, + "grad_norm": 0.6494757533073425, + "learning_rate": 6.046899713614751e-06, + "loss": 0.3198, + "step": 33900 + }, + { + "epoch": 0.6293929593833427, + "grad_norm": 0.2846950888633728, + "learning_rate": 6.0458282631304485e-06, + "loss": 0.1868, + "step": 33902 + }, + { + "epoch": 0.6294300895207613, + "grad_norm": 0.3400444984436035, + "learning_rate": 6.044756866450582e-06, + "loss": 0.3013, + "step": 33904 + }, + { + "epoch": 0.62946721965818, + "grad_norm": 0.6710299849510193, + "learning_rate": 6.043685523589724e-06, + "loss": 0.2046, + "step": 33906 + }, + { + "epoch": 0.6295043497955986, + "grad_norm": 0.31077876687049866, + "learning_rate": 6.042614234562456e-06, + "loss": 0.3936, + "step": 33908 + }, + { + "epoch": 0.6295414799330172, + "grad_norm": 0.9020906686782837, + "learning_rate": 6.041542999383356e-06, + "loss": 0.3091, + "step": 33910 + }, + { + "epoch": 0.6295786100704359, + "grad_norm": 0.6193994879722595, + "learning_rate": 6.040471818067e-06, + "loss": 0.4458, + "step": 33912 + }, + { + "epoch": 0.6296157402078545, + "grad_norm": 0.29391154646873474, + "learning_rate": 6.039400690627961e-06, + "loss": 0.208, + "step": 33914 + }, + { + "epoch": 0.6296528703452732, + "grad_norm": 0.3856953978538513, + "learning_rate": 6.038329617080816e-06, + "loss": 0.2033, + "step": 33916 + }, + { + "epoch": 0.6296900004826917, + "grad_norm": 0.5634004473686218, + "learning_rate": 6.037258597440136e-06, + "loss": 0.2772, + "step": 33918 + }, + { + "epoch": 0.6297271306201104, + "grad_norm": 0.4434337019920349, + "learning_rate": 6.036187631720497e-06, + "loss": 0.274, + "step": 33920 + }, + { + "epoch": 0.6297642607575291, + "grad_norm": 0.3064804673194885, + "learning_rate": 6.035116719936471e-06, + "loss": 0.2363, + "step": 33922 + }, + { + "epoch": 0.6298013908949477, + "grad_norm": 0.36120733618736267, + "learning_rate": 6.034045862102636e-06, + "loss": 0.2195, + "step": 33924 + }, + { + "epoch": 0.6298385210323664, + "grad_norm": 0.37281864881515503, + "learning_rate": 6.032975058233552e-06, + "loss": 0.4467, + "step": 33926 + }, + { + "epoch": 0.629875651169785, + "grad_norm": 0.1743318736553192, + "learning_rate": 6.031904308343797e-06, + "loss": 0.257, + "step": 33928 + }, + { + "epoch": 0.6299127813072036, + "grad_norm": 0.43869540095329285, + "learning_rate": 6.030833612447936e-06, + "loss": 0.4597, + "step": 33930 + }, + { + "epoch": 0.6299499114446222, + "grad_norm": 0.3877910077571869, + "learning_rate": 6.0297629705605406e-06, + "loss": 0.3908, + "step": 33932 + }, + { + "epoch": 0.6299870415820409, + "grad_norm": 0.34378233551979065, + "learning_rate": 6.0286923826961815e-06, + "loss": 0.2751, + "step": 33934 + }, + { + "epoch": 0.6300241717194596, + "grad_norm": 0.3581828474998474, + "learning_rate": 6.027621848869422e-06, + "loss": 0.1359, + "step": 33936 + }, + { + "epoch": 0.6300613018568781, + "grad_norm": 0.33425086736679077, + "learning_rate": 6.026551369094833e-06, + "loss": 0.1825, + "step": 33938 + }, + { + "epoch": 0.6300984319942968, + "grad_norm": 0.38773342967033386, + "learning_rate": 6.025480943386976e-06, + "loss": 0.3022, + "step": 33940 + }, + { + "epoch": 0.6301355621317154, + "grad_norm": 0.36916765570640564, + "learning_rate": 6.024410571760418e-06, + "loss": 0.2681, + "step": 33942 + }, + { + "epoch": 0.6301726922691341, + "grad_norm": 0.5057629942893982, + "learning_rate": 6.023340254229721e-06, + "loss": 0.2142, + "step": 33944 + }, + { + "epoch": 0.6302098224065528, + "grad_norm": 0.4288654923439026, + "learning_rate": 6.022269990809457e-06, + "loss": 0.3665, + "step": 33946 + }, + { + "epoch": 0.6302469525439713, + "grad_norm": 0.4674459993839264, + "learning_rate": 6.0211997815141795e-06, + "loss": 0.433, + "step": 33948 + }, + { + "epoch": 0.63028408268139, + "grad_norm": 1.163735270500183, + "learning_rate": 6.020129626358462e-06, + "loss": 0.3013, + "step": 33950 + }, + { + "epoch": 0.6303212128188086, + "grad_norm": 0.2307201474905014, + "learning_rate": 6.019059525356852e-06, + "loss": 0.3706, + "step": 33952 + }, + { + "epoch": 0.6303583429562273, + "grad_norm": 0.2800556421279907, + "learning_rate": 6.017989478523919e-06, + "loss": 0.214, + "step": 33954 + }, + { + "epoch": 0.630395473093646, + "grad_norm": 0.46497732400894165, + "learning_rate": 6.016919485874222e-06, + "loss": 0.1697, + "step": 33956 + }, + { + "epoch": 0.6304326032310645, + "grad_norm": 0.3138503432273865, + "learning_rate": 6.015849547422321e-06, + "loss": 0.1554, + "step": 33958 + }, + { + "epoch": 0.6304697333684832, + "grad_norm": 0.19327868521213531, + "learning_rate": 6.014779663182773e-06, + "loss": 0.2308, + "step": 33960 + }, + { + "epoch": 0.6305068635059018, + "grad_norm": 0.2802954614162445, + "learning_rate": 6.01370983317014e-06, + "loss": 0.1992, + "step": 33962 + }, + { + "epoch": 0.6305439936433205, + "grad_norm": 0.5347206592559814, + "learning_rate": 6.012640057398972e-06, + "loss": 0.321, + "step": 33964 + }, + { + "epoch": 0.6305811237807392, + "grad_norm": 0.267868310213089, + "learning_rate": 6.0115703358838296e-06, + "loss": 0.3132, + "step": 33966 + }, + { + "epoch": 0.6306182539181577, + "grad_norm": 0.35025647282600403, + "learning_rate": 6.01050066863927e-06, + "loss": 0.2858, + "step": 33968 + }, + { + "epoch": 0.6306553840555764, + "grad_norm": 0.3864188492298126, + "learning_rate": 6.009431055679844e-06, + "loss": 0.1714, + "step": 33970 + }, + { + "epoch": 0.630692514192995, + "grad_norm": 0.4265131950378418, + "learning_rate": 6.008361497020107e-06, + "loss": 0.2996, + "step": 33972 + }, + { + "epoch": 0.6307296443304137, + "grad_norm": 0.5354328155517578, + "learning_rate": 6.00729199267462e-06, + "loss": 0.2899, + "step": 33974 + }, + { + "epoch": 0.6307667744678324, + "grad_norm": 0.7740700840950012, + "learning_rate": 6.006222542657924e-06, + "loss": 0.3264, + "step": 33976 + }, + { + "epoch": 0.6308039046052509, + "grad_norm": 0.42751383781433105, + "learning_rate": 6.005153146984577e-06, + "loss": 0.3641, + "step": 33978 + }, + { + "epoch": 0.6308410347426696, + "grad_norm": 0.41626062989234924, + "learning_rate": 6.004083805669132e-06, + "loss": 0.4913, + "step": 33980 + }, + { + "epoch": 0.6308781648800882, + "grad_norm": 0.32094380259513855, + "learning_rate": 6.003014518726135e-06, + "loss": 0.2442, + "step": 33982 + }, + { + "epoch": 0.6309152950175069, + "grad_norm": 0.8709374666213989, + "learning_rate": 6.001945286170138e-06, + "loss": 0.1776, + "step": 33984 + }, + { + "epoch": 0.6309524251549254, + "grad_norm": 0.2721101641654968, + "learning_rate": 6.000876108015689e-06, + "loss": 0.0814, + "step": 33986 + }, + { + "epoch": 0.6309895552923441, + "grad_norm": 0.5212908387184143, + "learning_rate": 5.999806984277343e-06, + "loss": 0.2076, + "step": 33988 + }, + { + "epoch": 0.6310266854297628, + "grad_norm": 0.3964730501174927, + "learning_rate": 5.99873791496964e-06, + "loss": 0.1625, + "step": 33990 + }, + { + "epoch": 0.6310638155671814, + "grad_norm": 0.4831129312515259, + "learning_rate": 5.9976689001071256e-06, + "loss": 0.2669, + "step": 33992 + }, + { + "epoch": 0.6311009457046001, + "grad_norm": 0.3244108259677887, + "learning_rate": 5.9965999397043505e-06, + "loss": 0.305, + "step": 33994 + }, + { + "epoch": 0.6311380758420186, + "grad_norm": 0.6332389712333679, + "learning_rate": 5.9955310337758575e-06, + "loss": 0.3103, + "step": 33996 + }, + { + "epoch": 0.6311752059794373, + "grad_norm": 0.3181458115577698, + "learning_rate": 5.994462182336195e-06, + "loss": 0.3471, + "step": 33998 + }, + { + "epoch": 0.631212336116856, + "grad_norm": 0.3949708938598633, + "learning_rate": 5.993393385399904e-06, + "loss": 0.2314, + "step": 34000 + }, + { + "epoch": 0.6312494662542746, + "grad_norm": 0.4090232849121094, + "learning_rate": 5.992324642981529e-06, + "loss": 0.421, + "step": 34002 + }, + { + "epoch": 0.6312865963916933, + "grad_norm": 0.3519652783870697, + "learning_rate": 5.991255955095607e-06, + "loss": 0.2891, + "step": 34004 + }, + { + "epoch": 0.6313237265291118, + "grad_norm": 0.4112909436225891, + "learning_rate": 5.990187321756684e-06, + "loss": 0.2938, + "step": 34006 + }, + { + "epoch": 0.6313608566665305, + "grad_norm": 0.26916155219078064, + "learning_rate": 5.989118742979303e-06, + "loss": 0.2704, + "step": 34008 + }, + { + "epoch": 0.6313979868039492, + "grad_norm": 0.39649736881256104, + "learning_rate": 5.988050218778002e-06, + "loss": 0.2307, + "step": 34010 + }, + { + "epoch": 0.6314351169413678, + "grad_norm": 0.5152195692062378, + "learning_rate": 5.986981749167323e-06, + "loss": 0.2929, + "step": 34012 + }, + { + "epoch": 0.6314722470787865, + "grad_norm": 0.496154248714447, + "learning_rate": 5.985913334161798e-06, + "loss": 0.2532, + "step": 34014 + }, + { + "epoch": 0.631509377216205, + "grad_norm": 0.39036989212036133, + "learning_rate": 5.98484497377597e-06, + "loss": 0.2554, + "step": 34016 + }, + { + "epoch": 0.6315465073536237, + "grad_norm": 0.20826849341392517, + "learning_rate": 5.983776668024372e-06, + "loss": 0.4023, + "step": 34018 + }, + { + "epoch": 0.6315836374910424, + "grad_norm": 0.3139849305152893, + "learning_rate": 5.9827084169215485e-06, + "loss": 0.3754, + "step": 34020 + }, + { + "epoch": 0.631620767628461, + "grad_norm": 0.605755627155304, + "learning_rate": 5.981640220482028e-06, + "loss": 0.1711, + "step": 34022 + }, + { + "epoch": 0.6316578977658797, + "grad_norm": 0.2503967583179474, + "learning_rate": 5.980572078720346e-06, + "loss": 0.2507, + "step": 34024 + }, + { + "epoch": 0.6316950279032982, + "grad_norm": 0.40294012427330017, + "learning_rate": 5.979503991651043e-06, + "loss": 0.503, + "step": 34026 + }, + { + "epoch": 0.6317321580407169, + "grad_norm": 0.274328351020813, + "learning_rate": 5.978435959288645e-06, + "loss": 0.2053, + "step": 34028 + }, + { + "epoch": 0.6317692881781356, + "grad_norm": 0.46464401483535767, + "learning_rate": 5.977367981647688e-06, + "loss": 0.3877, + "step": 34030 + }, + { + "epoch": 0.6318064183155542, + "grad_norm": 0.4474378228187561, + "learning_rate": 5.976300058742704e-06, + "loss": 0.2505, + "step": 34032 + }, + { + "epoch": 0.6318435484529729, + "grad_norm": 0.39054033160209656, + "learning_rate": 5.975232190588223e-06, + "loss": 0.3399, + "step": 34034 + }, + { + "epoch": 0.6318806785903914, + "grad_norm": 0.33258041739463806, + "learning_rate": 5.974164377198774e-06, + "loss": 0.1325, + "step": 34036 + }, + { + "epoch": 0.6319178087278101, + "grad_norm": 0.4047335386276245, + "learning_rate": 5.973096618588896e-06, + "loss": 0.2571, + "step": 34038 + }, + { + "epoch": 0.6319549388652287, + "grad_norm": 0.32413750886917114, + "learning_rate": 5.972028914773106e-06, + "loss": 0.3846, + "step": 34040 + }, + { + "epoch": 0.6319920690026474, + "grad_norm": 0.27093204855918884, + "learning_rate": 5.97096126576594e-06, + "loss": 0.254, + "step": 34042 + }, + { + "epoch": 0.6320291991400661, + "grad_norm": 0.5951005220413208, + "learning_rate": 5.969893671581919e-06, + "loss": 0.2038, + "step": 34044 + }, + { + "epoch": 0.6320663292774846, + "grad_norm": 0.4287288188934326, + "learning_rate": 5.968826132235574e-06, + "loss": 0.2659, + "step": 34046 + }, + { + "epoch": 0.6321034594149033, + "grad_norm": 0.4471907615661621, + "learning_rate": 5.967758647741432e-06, + "loss": 0.4219, + "step": 34048 + }, + { + "epoch": 0.6321405895523219, + "grad_norm": 0.48721233010292053, + "learning_rate": 5.96669121811402e-06, + "loss": 0.4639, + "step": 34050 + }, + { + "epoch": 0.6321777196897406, + "grad_norm": 0.5401844382286072, + "learning_rate": 5.965623843367855e-06, + "loss": 0.208, + "step": 34052 + }, + { + "epoch": 0.6322148498271593, + "grad_norm": 0.3319588005542755, + "learning_rate": 5.964556523517467e-06, + "loss": 0.3834, + "step": 34054 + }, + { + "epoch": 0.6322519799645778, + "grad_norm": 0.4348486065864563, + "learning_rate": 5.963489258577376e-06, + "loss": 0.243, + "step": 34056 + }, + { + "epoch": 0.6322891101019965, + "grad_norm": 0.4106428027153015, + "learning_rate": 5.962422048562106e-06, + "loss": 0.3039, + "step": 34058 + }, + { + "epoch": 0.6323262402394151, + "grad_norm": 0.21634675562381744, + "learning_rate": 5.961354893486176e-06, + "loss": 0.1485, + "step": 34060 + }, + { + "epoch": 0.6323633703768338, + "grad_norm": 0.48626741766929626, + "learning_rate": 5.960287793364112e-06, + "loss": 0.3464, + "step": 34062 + }, + { + "epoch": 0.6324005005142525, + "grad_norm": 0.5003921985626221, + "learning_rate": 5.95922074821043e-06, + "loss": 0.3613, + "step": 34064 + }, + { + "epoch": 0.632437630651671, + "grad_norm": 0.7658032178878784, + "learning_rate": 5.958153758039651e-06, + "loss": 0.3402, + "step": 34066 + }, + { + "epoch": 0.6324747607890897, + "grad_norm": 0.4114118814468384, + "learning_rate": 5.95708682286629e-06, + "loss": 0.1472, + "step": 34068 + }, + { + "epoch": 0.6325118909265083, + "grad_norm": 0.3192443549633026, + "learning_rate": 5.9560199427048686e-06, + "loss": 0.4329, + "step": 34070 + }, + { + "epoch": 0.632549021063927, + "grad_norm": 0.5751197934150696, + "learning_rate": 5.954953117569904e-06, + "loss": 0.1912, + "step": 34072 + }, + { + "epoch": 0.6325861512013456, + "grad_norm": 0.6894327402114868, + "learning_rate": 5.9538863474759076e-06, + "loss": 0.1483, + "step": 34074 + }, + { + "epoch": 0.6326232813387642, + "grad_norm": 0.3841079771518707, + "learning_rate": 5.9528196324374045e-06, + "loss": 0.1184, + "step": 34076 + }, + { + "epoch": 0.6326604114761829, + "grad_norm": 0.49154019355773926, + "learning_rate": 5.951752972468898e-06, + "loss": 0.1801, + "step": 34078 + }, + { + "epoch": 0.6326975416136015, + "grad_norm": 0.41750723123550415, + "learning_rate": 5.950686367584909e-06, + "loss": 0.5239, + "step": 34080 + }, + { + "epoch": 0.6327346717510202, + "grad_norm": 0.47046223282814026, + "learning_rate": 5.949619817799949e-06, + "loss": 0.3747, + "step": 34082 + }, + { + "epoch": 0.6327718018884387, + "grad_norm": 0.3947194516658783, + "learning_rate": 5.948553323128533e-06, + "loss": 0.3942, + "step": 34084 + }, + { + "epoch": 0.6328089320258574, + "grad_norm": 0.7540957927703857, + "learning_rate": 5.947486883585169e-06, + "loss": 0.2485, + "step": 34086 + }, + { + "epoch": 0.6328460621632761, + "grad_norm": 0.7194357514381409, + "learning_rate": 5.946420499184373e-06, + "loss": 0.1445, + "step": 34088 + }, + { + "epoch": 0.6328831923006947, + "grad_norm": 0.5120323896408081, + "learning_rate": 5.9453541699406495e-06, + "loss": 0.2514, + "step": 34090 + }, + { + "epoch": 0.6329203224381134, + "grad_norm": 0.26570555567741394, + "learning_rate": 5.944287895868509e-06, + "loss": 0.3247, + "step": 34092 + }, + { + "epoch": 0.6329574525755319, + "grad_norm": 0.4399605393409729, + "learning_rate": 5.943221676982462e-06, + "loss": 0.2062, + "step": 34094 + }, + { + "epoch": 0.6329945827129506, + "grad_norm": 0.5909283757209778, + "learning_rate": 5.94215551329702e-06, + "loss": 0.2199, + "step": 34096 + }, + { + "epoch": 0.6330317128503693, + "grad_norm": 0.5592047572135925, + "learning_rate": 5.941089404826683e-06, + "loss": 0.2705, + "step": 34098 + }, + { + "epoch": 0.6330688429877879, + "grad_norm": 0.31185683608055115, + "learning_rate": 5.940023351585968e-06, + "loss": 0.1997, + "step": 34100 + }, + { + "epoch": 0.6331059731252066, + "grad_norm": 0.5447320342063904, + "learning_rate": 5.938957353589367e-06, + "loss": 0.2916, + "step": 34102 + }, + { + "epoch": 0.6331431032626251, + "grad_norm": 0.49067893624305725, + "learning_rate": 5.9378914108513955e-06, + "loss": 0.2305, + "step": 34104 + }, + { + "epoch": 0.6331802334000438, + "grad_norm": 0.3497582674026489, + "learning_rate": 5.936825523386554e-06, + "loss": 0.3374, + "step": 34106 + }, + { + "epoch": 0.6332173635374625, + "grad_norm": 0.412686824798584, + "learning_rate": 5.9357596912093454e-06, + "loss": 0.2023, + "step": 34108 + }, + { + "epoch": 0.6332544936748811, + "grad_norm": 0.44322842359542847, + "learning_rate": 5.934693914334273e-06, + "loss": 0.3528, + "step": 34110 + }, + { + "epoch": 0.6332916238122998, + "grad_norm": 0.3058278262615204, + "learning_rate": 5.93362819277584e-06, + "loss": 0.0725, + "step": 34112 + }, + { + "epoch": 0.6333287539497183, + "grad_norm": 0.27452242374420166, + "learning_rate": 5.932562526548551e-06, + "loss": 0.261, + "step": 34114 + }, + { + "epoch": 0.633365884087137, + "grad_norm": 0.5095474720001221, + "learning_rate": 5.9314969156668985e-06, + "loss": 0.2483, + "step": 34116 + }, + { + "epoch": 0.6334030142245557, + "grad_norm": 0.5862179398536682, + "learning_rate": 5.930431360145389e-06, + "loss": 0.1846, + "step": 34118 + }, + { + "epoch": 0.6334401443619743, + "grad_norm": 0.44797593355178833, + "learning_rate": 5.929365859998516e-06, + "loss": 0.1571, + "step": 34120 + }, + { + "epoch": 0.633477274499393, + "grad_norm": 0.5446195006370544, + "learning_rate": 5.928300415240782e-06, + "loss": 0.2858, + "step": 34122 + }, + { + "epoch": 0.6335144046368115, + "grad_norm": 0.31766778230667114, + "learning_rate": 5.927235025886682e-06, + "loss": 0.2677, + "step": 34124 + }, + { + "epoch": 0.6335515347742302, + "grad_norm": 0.3274802565574646, + "learning_rate": 5.926169691950719e-06, + "loss": 0.244, + "step": 34126 + }, + { + "epoch": 0.6335886649116489, + "grad_norm": 0.38085150718688965, + "learning_rate": 5.92510441344738e-06, + "loss": 0.3819, + "step": 34128 + }, + { + "epoch": 0.6336257950490675, + "grad_norm": 0.38478055596351624, + "learning_rate": 5.9240391903911645e-06, + "loss": 0.2507, + "step": 34130 + }, + { + "epoch": 0.6336629251864861, + "grad_norm": 0.30696019530296326, + "learning_rate": 5.922974022796565e-06, + "loss": 0.3612, + "step": 34132 + }, + { + "epoch": 0.6337000553239047, + "grad_norm": 0.36365044116973877, + "learning_rate": 5.921908910678077e-06, + "loss": 0.5358, + "step": 34134 + }, + { + "epoch": 0.6337371854613234, + "grad_norm": 0.3938376009464264, + "learning_rate": 5.920843854050195e-06, + "loss": 0.5446, + "step": 34136 + }, + { + "epoch": 0.633774315598742, + "grad_norm": 0.4423855245113373, + "learning_rate": 5.919778852927412e-06, + "loss": 0.1972, + "step": 34138 + }, + { + "epoch": 0.6338114457361607, + "grad_norm": 0.25173941254615784, + "learning_rate": 5.918713907324216e-06, + "loss": 0.1984, + "step": 34140 + }, + { + "epoch": 0.6338485758735793, + "grad_norm": 0.5239980220794678, + "learning_rate": 5.917649017255096e-06, + "loss": 0.4124, + "step": 34142 + }, + { + "epoch": 0.6338857060109979, + "grad_norm": 0.2828787565231323, + "learning_rate": 5.916584182734546e-06, + "loss": 0.2561, + "step": 34144 + }, + { + "epoch": 0.6339228361484166, + "grad_norm": 0.4133024513721466, + "learning_rate": 5.9155194037770525e-06, + "loss": 0.3307, + "step": 34146 + }, + { + "epoch": 0.6339599662858352, + "grad_norm": 1.0381852388381958, + "learning_rate": 5.914454680397109e-06, + "loss": 0.2988, + "step": 34148 + }, + { + "epoch": 0.6339970964232539, + "grad_norm": 0.42105981707572937, + "learning_rate": 5.913390012609197e-06, + "loss": 0.1443, + "step": 34150 + }, + { + "epoch": 0.6340342265606725, + "grad_norm": 0.5514183044433594, + "learning_rate": 5.912325400427811e-06, + "loss": 0.1539, + "step": 34152 + }, + { + "epoch": 0.6340713566980911, + "grad_norm": 0.36305615305900574, + "learning_rate": 5.911260843867428e-06, + "loss": 0.3981, + "step": 34154 + }, + { + "epoch": 0.6341084868355098, + "grad_norm": 0.43454042077064514, + "learning_rate": 5.910196342942536e-06, + "loss": 0.3702, + "step": 34156 + }, + { + "epoch": 0.6341456169729284, + "grad_norm": 0.4109431505203247, + "learning_rate": 5.909131897667626e-06, + "loss": 0.2979, + "step": 34158 + }, + { + "epoch": 0.6341827471103471, + "grad_norm": 0.4199132025241852, + "learning_rate": 5.9080675080571736e-06, + "loss": 0.2667, + "step": 34160 + }, + { + "epoch": 0.6342198772477657, + "grad_norm": 0.40708449482917786, + "learning_rate": 5.907003174125665e-06, + "loss": 0.3371, + "step": 34162 + }, + { + "epoch": 0.6342570073851843, + "grad_norm": 0.3604401648044586, + "learning_rate": 5.905938895887589e-06, + "loss": 0.295, + "step": 34164 + }, + { + "epoch": 0.634294137522603, + "grad_norm": 0.2783866822719574, + "learning_rate": 5.904874673357417e-06, + "loss": 0.2155, + "step": 34166 + }, + { + "epoch": 0.6343312676600216, + "grad_norm": 0.6536400318145752, + "learning_rate": 5.9038105065496345e-06, + "loss": 0.2037, + "step": 34168 + }, + { + "epoch": 0.6343683977974403, + "grad_norm": 0.24478089809417725, + "learning_rate": 5.902746395478722e-06, + "loss": 0.4082, + "step": 34170 + }, + { + "epoch": 0.6344055279348589, + "grad_norm": 0.6131709814071655, + "learning_rate": 5.901682340159158e-06, + "loss": 0.2629, + "step": 34172 + }, + { + "epoch": 0.6344426580722775, + "grad_norm": 0.44498828053474426, + "learning_rate": 5.900618340605422e-06, + "loss": 0.5501, + "step": 34174 + }, + { + "epoch": 0.6344797882096962, + "grad_norm": 0.3928348124027252, + "learning_rate": 5.8995543968319934e-06, + "loss": 0.4021, + "step": 34176 + }, + { + "epoch": 0.6345169183471148, + "grad_norm": 0.6353996992111206, + "learning_rate": 5.898490508853344e-06, + "loss": 0.2929, + "step": 34178 + }, + { + "epoch": 0.6345540484845335, + "grad_norm": 0.3568483889102936, + "learning_rate": 5.897426676683955e-06, + "loss": 0.2086, + "step": 34180 + }, + { + "epoch": 0.6345911786219521, + "grad_norm": 0.34923484921455383, + "learning_rate": 5.8963629003382995e-06, + "loss": 0.3246, + "step": 34182 + }, + { + "epoch": 0.6346283087593707, + "grad_norm": 0.5015848875045776, + "learning_rate": 5.895299179830853e-06, + "loss": 0.4258, + "step": 34184 + }, + { + "epoch": 0.6346654388967894, + "grad_norm": 0.5905255079269409, + "learning_rate": 5.8942355151760876e-06, + "loss": 0.2958, + "step": 34186 + }, + { + "epoch": 0.634702569034208, + "grad_norm": 0.3284199833869934, + "learning_rate": 5.893171906388482e-06, + "loss": 0.2813, + "step": 34188 + }, + { + "epoch": 0.6347396991716266, + "grad_norm": 0.39212074875831604, + "learning_rate": 5.892108353482506e-06, + "loss": 0.3243, + "step": 34190 + }, + { + "epoch": 0.6347768293090452, + "grad_norm": 0.4372919499874115, + "learning_rate": 5.89104485647263e-06, + "loss": 0.2959, + "step": 34192 + }, + { + "epoch": 0.6348139594464639, + "grad_norm": 0.6657776832580566, + "learning_rate": 5.889981415373325e-06, + "loss": 0.3296, + "step": 34194 + }, + { + "epoch": 0.6348510895838826, + "grad_norm": 0.658766508102417, + "learning_rate": 5.88891803019906e-06, + "loss": 0.3948, + "step": 34196 + }, + { + "epoch": 0.6348882197213012, + "grad_norm": 0.24199338257312775, + "learning_rate": 5.887854700964308e-06, + "loss": 0.152, + "step": 34198 + }, + { + "epoch": 0.6349253498587198, + "grad_norm": 0.4012165367603302, + "learning_rate": 5.8867914276835366e-06, + "loss": 0.3221, + "step": 34200 + }, + { + "epoch": 0.6349624799961384, + "grad_norm": 0.26037243008613586, + "learning_rate": 5.8857282103712155e-06, + "loss": 0.177, + "step": 34202 + }, + { + "epoch": 0.6349996101335571, + "grad_norm": 0.2393687516450882, + "learning_rate": 5.884665049041809e-06, + "loss": 0.2401, + "step": 34204 + }, + { + "epoch": 0.6350367402709758, + "grad_norm": 0.5441455841064453, + "learning_rate": 5.883601943709781e-06, + "loss": 0.2458, + "step": 34206 + }, + { + "epoch": 0.6350738704083944, + "grad_norm": 0.40732839703559875, + "learning_rate": 5.882538894389602e-06, + "loss": 0.3388, + "step": 34208 + }, + { + "epoch": 0.635111000545813, + "grad_norm": 0.3955431282520294, + "learning_rate": 5.881475901095734e-06, + "loss": 0.256, + "step": 34210 + }, + { + "epoch": 0.6351481306832316, + "grad_norm": 0.48650291562080383, + "learning_rate": 5.880412963842646e-06, + "loss": 0.2512, + "step": 34212 + }, + { + "epoch": 0.6351852608206503, + "grad_norm": 1.178161859512329, + "learning_rate": 5.8793500826448e-06, + "loss": 0.3336, + "step": 34214 + }, + { + "epoch": 0.635222390958069, + "grad_norm": 0.26588088274002075, + "learning_rate": 5.878287257516653e-06, + "loss": 0.2701, + "step": 34216 + }, + { + "epoch": 0.6352595210954876, + "grad_norm": 0.3551124930381775, + "learning_rate": 5.877224488472668e-06, + "loss": 0.187, + "step": 34218 + }, + { + "epoch": 0.6352966512329062, + "grad_norm": 0.396115779876709, + "learning_rate": 5.8761617755273116e-06, + "loss": 0.1815, + "step": 34220 + }, + { + "epoch": 0.6353337813703248, + "grad_norm": 0.25871387124061584, + "learning_rate": 5.875099118695042e-06, + "loss": 0.1591, + "step": 34222 + }, + { + "epoch": 0.6353709115077435, + "grad_norm": 0.2751860022544861, + "learning_rate": 5.874036517990315e-06, + "loss": 0.2249, + "step": 34224 + }, + { + "epoch": 0.6354080416451622, + "grad_norm": 0.3980858623981476, + "learning_rate": 5.872973973427592e-06, + "loss": 0.5431, + "step": 34226 + }, + { + "epoch": 0.6354451717825808, + "grad_norm": 0.47329890727996826, + "learning_rate": 5.8719114850213364e-06, + "loss": 0.3597, + "step": 34228 + }, + { + "epoch": 0.6354823019199994, + "grad_norm": 0.3624727427959442, + "learning_rate": 5.870849052785996e-06, + "loss": 0.2387, + "step": 34230 + }, + { + "epoch": 0.635519432057418, + "grad_norm": 0.2326180636882782, + "learning_rate": 5.869786676736032e-06, + "loss": 0.2911, + "step": 34232 + }, + { + "epoch": 0.6355565621948367, + "grad_norm": 0.3721226453781128, + "learning_rate": 5.868724356885902e-06, + "loss": 0.2463, + "step": 34234 + }, + { + "epoch": 0.6355936923322553, + "grad_norm": 0.38494521379470825, + "learning_rate": 5.867662093250057e-06, + "loss": 0.3036, + "step": 34236 + }, + { + "epoch": 0.635630822469674, + "grad_norm": 0.43942970037460327, + "learning_rate": 5.866599885842953e-06, + "loss": 0.1773, + "step": 34238 + }, + { + "epoch": 0.6356679526070926, + "grad_norm": 0.3306193947792053, + "learning_rate": 5.8655377346790476e-06, + "loss": 0.2219, + "step": 34240 + }, + { + "epoch": 0.6357050827445112, + "grad_norm": 0.5862135887145996, + "learning_rate": 5.864475639772785e-06, + "loss": 0.247, + "step": 34242 + }, + { + "epoch": 0.6357422128819299, + "grad_norm": 0.5253994464874268, + "learning_rate": 5.863413601138625e-06, + "loss": 0.2449, + "step": 34244 + }, + { + "epoch": 0.6357793430193485, + "grad_norm": 0.39305558800697327, + "learning_rate": 5.8623516187910115e-06, + "loss": 0.282, + "step": 34246 + }, + { + "epoch": 0.6358164731567671, + "grad_norm": 0.58650803565979, + "learning_rate": 5.861289692744401e-06, + "loss": 0.4139, + "step": 34248 + }, + { + "epoch": 0.6358536032941858, + "grad_norm": 0.34237655997276306, + "learning_rate": 5.860227823013238e-06, + "loss": 0.2476, + "step": 34250 + }, + { + "epoch": 0.6358907334316044, + "grad_norm": 0.5999776124954224, + "learning_rate": 5.859166009611981e-06, + "loss": 0.2873, + "step": 34252 + }, + { + "epoch": 0.6359278635690231, + "grad_norm": 0.3404444456100464, + "learning_rate": 5.858104252555065e-06, + "loss": 0.523, + "step": 34254 + }, + { + "epoch": 0.6359649937064417, + "grad_norm": 0.29434454441070557, + "learning_rate": 5.857042551856947e-06, + "loss": 0.2707, + "step": 34256 + }, + { + "epoch": 0.6360021238438603, + "grad_norm": 0.3198910355567932, + "learning_rate": 5.855980907532069e-06, + "loss": 0.3045, + "step": 34258 + }, + { + "epoch": 0.636039253981279, + "grad_norm": 0.27349331974983215, + "learning_rate": 5.854919319594877e-06, + "loss": 0.2615, + "step": 34260 + }, + { + "epoch": 0.6360763841186976, + "grad_norm": 0.25971391797065735, + "learning_rate": 5.853857788059818e-06, + "loss": 0.1395, + "step": 34262 + }, + { + "epoch": 0.6361135142561163, + "grad_norm": 0.5142319202423096, + "learning_rate": 5.852796312941338e-06, + "loss": 0.2637, + "step": 34264 + }, + { + "epoch": 0.6361506443935349, + "grad_norm": 1.3276456594467163, + "learning_rate": 5.851734894253878e-06, + "loss": 0.39, + "step": 34266 + }, + { + "epoch": 0.6361877745309535, + "grad_norm": 0.2671741545200348, + "learning_rate": 5.850673532011877e-06, + "loss": 0.2268, + "step": 34268 + }, + { + "epoch": 0.6362249046683722, + "grad_norm": 0.38655802607536316, + "learning_rate": 5.84961222622978e-06, + "loss": 0.3734, + "step": 34270 + }, + { + "epoch": 0.6362620348057908, + "grad_norm": 0.3436509966850281, + "learning_rate": 5.8485509769220295e-06, + "loss": 0.2277, + "step": 34272 + }, + { + "epoch": 0.6362991649432095, + "grad_norm": 0.49369919300079346, + "learning_rate": 5.847489784103067e-06, + "loss": 0.122, + "step": 34274 + }, + { + "epoch": 0.6363362950806281, + "grad_norm": 0.2864399552345276, + "learning_rate": 5.846428647787329e-06, + "loss": 0.1757, + "step": 34276 + }, + { + "epoch": 0.6363734252180467, + "grad_norm": 0.28743574023246765, + "learning_rate": 5.84536756798926e-06, + "loss": 0.2245, + "step": 34278 + }, + { + "epoch": 0.6364105553554654, + "grad_norm": 0.262789785861969, + "learning_rate": 5.844306544723288e-06, + "loss": 0.1327, + "step": 34280 + }, + { + "epoch": 0.636447685492884, + "grad_norm": 0.3452463448047638, + "learning_rate": 5.8432455780038576e-06, + "loss": 0.3296, + "step": 34282 + }, + { + "epoch": 0.6364848156303027, + "grad_norm": 0.4265664517879486, + "learning_rate": 5.842184667845403e-06, + "loss": 0.3715, + "step": 34284 + }, + { + "epoch": 0.6365219457677213, + "grad_norm": 0.44878312945365906, + "learning_rate": 5.8411238142623634e-06, + "loss": 0.2373, + "step": 34286 + }, + { + "epoch": 0.6365590759051399, + "grad_norm": 0.5777428150177002, + "learning_rate": 5.840063017269171e-06, + "loss": 0.3037, + "step": 34288 + }, + { + "epoch": 0.6365962060425585, + "grad_norm": 0.4439290761947632, + "learning_rate": 5.839002276880263e-06, + "loss": 0.5705, + "step": 34290 + }, + { + "epoch": 0.6366333361799772, + "grad_norm": 0.2997719645500183, + "learning_rate": 5.837941593110066e-06, + "loss": 0.2721, + "step": 34292 + }, + { + "epoch": 0.6366704663173959, + "grad_norm": 0.4777323305606842, + "learning_rate": 5.8368809659730175e-06, + "loss": 0.1673, + "step": 34294 + }, + { + "epoch": 0.6367075964548145, + "grad_norm": 0.5824756622314453, + "learning_rate": 5.835820395483549e-06, + "loss": 0.2587, + "step": 34296 + }, + { + "epoch": 0.6367447265922331, + "grad_norm": 0.42135584354400635, + "learning_rate": 5.8347598816560915e-06, + "loss": 0.2556, + "step": 34298 + }, + { + "epoch": 0.6367818567296517, + "grad_norm": 0.43942365050315857, + "learning_rate": 5.833699424505081e-06, + "loss": 0.474, + "step": 34300 + }, + { + "epoch": 0.6368189868670704, + "grad_norm": 0.33204424381256104, + "learning_rate": 5.832639024044937e-06, + "loss": 0.1395, + "step": 34302 + }, + { + "epoch": 0.6368561170044891, + "grad_norm": 0.4456065595149994, + "learning_rate": 5.831578680290096e-06, + "loss": 0.305, + "step": 34304 + }, + { + "epoch": 0.6368932471419076, + "grad_norm": 0.3564998507499695, + "learning_rate": 5.83051839325498e-06, + "loss": 0.4715, + "step": 34306 + }, + { + "epoch": 0.6369303772793263, + "grad_norm": 0.509390652179718, + "learning_rate": 5.82945816295402e-06, + "loss": 0.2404, + "step": 34308 + }, + { + "epoch": 0.6369675074167449, + "grad_norm": 0.3275364637374878, + "learning_rate": 5.828397989401644e-06, + "loss": 0.3322, + "step": 34310 + }, + { + "epoch": 0.6370046375541636, + "grad_norm": 0.316283255815506, + "learning_rate": 5.827337872612273e-06, + "loss": 0.3737, + "step": 34312 + }, + { + "epoch": 0.6370417676915823, + "grad_norm": 0.23323306441307068, + "learning_rate": 5.826277812600336e-06, + "loss": 0.0852, + "step": 34314 + }, + { + "epoch": 0.6370788978290008, + "grad_norm": 0.46385693550109863, + "learning_rate": 5.825217809380261e-06, + "loss": 0.2175, + "step": 34316 + }, + { + "epoch": 0.6371160279664195, + "grad_norm": 0.39158040285110474, + "learning_rate": 5.824157862966462e-06, + "loss": 0.2945, + "step": 34318 + }, + { + "epoch": 0.6371531581038381, + "grad_norm": 0.34611204266548157, + "learning_rate": 5.823097973373366e-06, + "loss": 0.2303, + "step": 34320 + }, + { + "epoch": 0.6371902882412568, + "grad_norm": 0.7379342913627625, + "learning_rate": 5.8220381406154e-06, + "loss": 0.2894, + "step": 34322 + }, + { + "epoch": 0.6372274183786755, + "grad_norm": 0.3979783356189728, + "learning_rate": 5.8209783647069774e-06, + "loss": 0.2937, + "step": 34324 + }, + { + "epoch": 0.637264548516094, + "grad_norm": 0.3409028947353363, + "learning_rate": 5.819918645662519e-06, + "loss": 0.2459, + "step": 34326 + }, + { + "epoch": 0.6373016786535127, + "grad_norm": 0.301815390586853, + "learning_rate": 5.818858983496454e-06, + "loss": 0.2811, + "step": 34328 + }, + { + "epoch": 0.6373388087909313, + "grad_norm": 0.29546594619750977, + "learning_rate": 5.817799378223188e-06, + "loss": 0.3983, + "step": 34330 + }, + { + "epoch": 0.63737593892835, + "grad_norm": 0.39727792143821716, + "learning_rate": 5.816739829857146e-06, + "loss": 0.3101, + "step": 34332 + }, + { + "epoch": 0.6374130690657687, + "grad_norm": 0.5162875652313232, + "learning_rate": 5.8156803384127444e-06, + "loss": 0.3013, + "step": 34334 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.33304065465927124, + "learning_rate": 5.8146209039044e-06, + "loss": 0.2581, + "step": 34336 + }, + { + "epoch": 0.6374873293406059, + "grad_norm": 0.4690551459789276, + "learning_rate": 5.81356152634653e-06, + "loss": 0.2116, + "step": 34338 + }, + { + "epoch": 0.6375244594780245, + "grad_norm": 0.4315544366836548, + "learning_rate": 5.812502205753549e-06, + "loss": 0.2398, + "step": 34340 + }, + { + "epoch": 0.6375615896154432, + "grad_norm": 0.23634302616119385, + "learning_rate": 5.811442942139868e-06, + "loss": 0.1212, + "step": 34342 + }, + { + "epoch": 0.6375987197528618, + "grad_norm": 0.3305998742580414, + "learning_rate": 5.810383735519902e-06, + "loss": 0.1571, + "step": 34344 + }, + { + "epoch": 0.6376358498902804, + "grad_norm": 0.39975443482398987, + "learning_rate": 5.809324585908067e-06, + "loss": 0.1924, + "step": 34346 + }, + { + "epoch": 0.6376729800276991, + "grad_norm": 0.393121600151062, + "learning_rate": 5.80826549331877e-06, + "loss": 0.3276, + "step": 34348 + }, + { + "epoch": 0.6377101101651177, + "grad_norm": 0.3924253284931183, + "learning_rate": 5.807206457766421e-06, + "loss": 0.3868, + "step": 34350 + }, + { + "epoch": 0.6377472403025364, + "grad_norm": 0.4458809494972229, + "learning_rate": 5.806147479265436e-06, + "loss": 0.1365, + "step": 34352 + }, + { + "epoch": 0.637784370439955, + "grad_norm": 0.21137550473213196, + "learning_rate": 5.805088557830224e-06, + "loss": 0.273, + "step": 34354 + }, + { + "epoch": 0.6378215005773736, + "grad_norm": 0.4154908359050751, + "learning_rate": 5.804029693475188e-06, + "loss": 0.3271, + "step": 34356 + }, + { + "epoch": 0.6378586307147923, + "grad_norm": 0.34669429063796997, + "learning_rate": 5.8029708862147404e-06, + "loss": 0.2253, + "step": 34358 + }, + { + "epoch": 0.6378957608522109, + "grad_norm": 0.3073492646217346, + "learning_rate": 5.8019121360632855e-06, + "loss": 0.316, + "step": 34360 + }, + { + "epoch": 0.6379328909896296, + "grad_norm": 0.41762781143188477, + "learning_rate": 5.800853443035234e-06, + "loss": 0.1745, + "step": 34362 + }, + { + "epoch": 0.6379700211270481, + "grad_norm": 0.41482043266296387, + "learning_rate": 5.799794807144992e-06, + "loss": 0.1274, + "step": 34364 + }, + { + "epoch": 0.6380071512644668, + "grad_norm": 0.4193533658981323, + "learning_rate": 5.7987362284069584e-06, + "loss": 0.3086, + "step": 34366 + }, + { + "epoch": 0.6380442814018855, + "grad_norm": 0.4279491603374481, + "learning_rate": 5.797677706835543e-06, + "loss": 0.2947, + "step": 34368 + }, + { + "epoch": 0.6380814115393041, + "grad_norm": 0.33530905842781067, + "learning_rate": 5.796619242445143e-06, + "loss": 0.303, + "step": 34370 + }, + { + "epoch": 0.6381185416767228, + "grad_norm": 0.5515667200088501, + "learning_rate": 5.795560835250165e-06, + "loss": 0.0954, + "step": 34372 + }, + { + "epoch": 0.6381556718141413, + "grad_norm": 0.35331907868385315, + "learning_rate": 5.79450248526501e-06, + "loss": 0.1266, + "step": 34374 + }, + { + "epoch": 0.63819280195156, + "grad_norm": 2.5497946739196777, + "learning_rate": 5.79344419250408e-06, + "loss": 0.1536, + "step": 34376 + }, + { + "epoch": 0.6382299320889787, + "grad_norm": 0.41464558243751526, + "learning_rate": 5.792385956981777e-06, + "loss": 0.2204, + "step": 34378 + }, + { + "epoch": 0.6382670622263973, + "grad_norm": 0.5481444597244263, + "learning_rate": 5.791327778712496e-06, + "loss": 0.3593, + "step": 34380 + }, + { + "epoch": 0.638304192363816, + "grad_norm": 0.1794361174106598, + "learning_rate": 5.790269657710635e-06, + "loss": 0.1642, + "step": 34382 + }, + { + "epoch": 0.6383413225012345, + "grad_norm": 0.2744278013706207, + "learning_rate": 5.789211593990596e-06, + "loss": 0.19, + "step": 34384 + }, + { + "epoch": 0.6383784526386532, + "grad_norm": 0.35223159193992615, + "learning_rate": 5.788153587566778e-06, + "loss": 0.1695, + "step": 34386 + }, + { + "epoch": 0.6384155827760718, + "grad_norm": 0.4164327085018158, + "learning_rate": 5.7870956384535706e-06, + "loss": 0.317, + "step": 34388 + }, + { + "epoch": 0.6384527129134905, + "grad_norm": 0.2784217894077301, + "learning_rate": 5.786037746665375e-06, + "loss": 0.1223, + "step": 34390 + }, + { + "epoch": 0.6384898430509092, + "grad_norm": 0.42970484495162964, + "learning_rate": 5.784979912216579e-06, + "loss": 0.3063, + "step": 34392 + }, + { + "epoch": 0.6385269731883277, + "grad_norm": 0.4044070541858673, + "learning_rate": 5.783922135121582e-06, + "loss": 0.2385, + "step": 34394 + }, + { + "epoch": 0.6385641033257464, + "grad_norm": 0.32016560435295105, + "learning_rate": 5.782864415394778e-06, + "loss": 0.2341, + "step": 34396 + }, + { + "epoch": 0.638601233463165, + "grad_norm": 0.38111764192581177, + "learning_rate": 5.781806753050555e-06, + "loss": 0.288, + "step": 34398 + }, + { + "epoch": 0.6386383636005837, + "grad_norm": 0.17499271035194397, + "learning_rate": 5.780749148103309e-06, + "loss": 0.3033, + "step": 34400 + }, + { + "epoch": 0.6386754937380024, + "grad_norm": 0.7759330868721008, + "learning_rate": 5.7796916005674265e-06, + "loss": 0.2888, + "step": 34402 + }, + { + "epoch": 0.6387126238754209, + "grad_norm": 0.3772846460342407, + "learning_rate": 5.778634110457305e-06, + "loss": 0.3881, + "step": 34404 + }, + { + "epoch": 0.6387497540128396, + "grad_norm": 0.38556066155433655, + "learning_rate": 5.777576677787325e-06, + "loss": 0.2721, + "step": 34406 + }, + { + "epoch": 0.6387868841502582, + "grad_norm": 0.39567604660987854, + "learning_rate": 5.776519302571883e-06, + "loss": 0.2709, + "step": 34408 + }, + { + "epoch": 0.6388240142876769, + "grad_norm": 0.37514883279800415, + "learning_rate": 5.775461984825359e-06, + "loss": 0.2264, + "step": 34410 + }, + { + "epoch": 0.6388611444250956, + "grad_norm": 0.39258813858032227, + "learning_rate": 5.774404724562142e-06, + "loss": 0.3255, + "step": 34412 + }, + { + "epoch": 0.6388982745625141, + "grad_norm": 0.45563045144081116, + "learning_rate": 5.77334752179662e-06, + "loss": 0.1687, + "step": 34414 + }, + { + "epoch": 0.6389354046999328, + "grad_norm": 0.3534729778766632, + "learning_rate": 5.772290376543184e-06, + "loss": 0.4204, + "step": 34416 + }, + { + "epoch": 0.6389725348373514, + "grad_norm": 0.3805806338787079, + "learning_rate": 5.7712332888162056e-06, + "loss": 0.4312, + "step": 34418 + }, + { + "epoch": 0.6390096649747701, + "grad_norm": 0.45399031043052673, + "learning_rate": 5.770176258630077e-06, + "loss": 0.4205, + "step": 34420 + }, + { + "epoch": 0.6390467951121888, + "grad_norm": 0.3785521984100342, + "learning_rate": 5.7691192859991795e-06, + "loss": 0.3137, + "step": 34422 + }, + { + "epoch": 0.6390839252496073, + "grad_norm": 0.3475157916545868, + "learning_rate": 5.7680623709378946e-06, + "loss": 0.2657, + "step": 34424 + }, + { + "epoch": 0.639121055387026, + "grad_norm": 0.3515818417072296, + "learning_rate": 5.767005513460606e-06, + "loss": 0.3331, + "step": 34426 + }, + { + "epoch": 0.6391581855244446, + "grad_norm": 0.32471948862075806, + "learning_rate": 5.7659487135816975e-06, + "loss": 0.353, + "step": 34428 + }, + { + "epoch": 0.6391953156618633, + "grad_norm": 0.3636452555656433, + "learning_rate": 5.764891971315544e-06, + "loss": 0.3381, + "step": 34430 + }, + { + "epoch": 0.639232445799282, + "grad_norm": 0.35272589325904846, + "learning_rate": 5.76383528667652e-06, + "loss": 0.4716, + "step": 34432 + }, + { + "epoch": 0.6392695759367005, + "grad_norm": 0.4739702641963959, + "learning_rate": 5.76277865967901e-06, + "loss": 0.2831, + "step": 34434 + }, + { + "epoch": 0.6393067060741192, + "grad_norm": 0.33771243691444397, + "learning_rate": 5.7617220903373916e-06, + "loss": 0.071, + "step": 34436 + }, + { + "epoch": 0.6393438362115378, + "grad_norm": 0.3048953413963318, + "learning_rate": 5.760665578666038e-06, + "loss": 0.221, + "step": 34438 + }, + { + "epoch": 0.6393809663489565, + "grad_norm": 0.5305869579315186, + "learning_rate": 5.759609124679329e-06, + "loss": 0.332, + "step": 34440 + }, + { + "epoch": 0.639418096486375, + "grad_norm": 0.5023661255836487, + "learning_rate": 5.758552728391642e-06, + "loss": 0.1984, + "step": 34442 + }, + { + "epoch": 0.6394552266237937, + "grad_norm": 0.6118751168251038, + "learning_rate": 5.757496389817345e-06, + "loss": 0.164, + "step": 34444 + }, + { + "epoch": 0.6394923567612124, + "grad_norm": 0.27222010493278503, + "learning_rate": 5.756440108970813e-06, + "loss": 0.3025, + "step": 34446 + }, + { + "epoch": 0.639529486898631, + "grad_norm": 0.38678622245788574, + "learning_rate": 5.7553838858664206e-06, + "loss": 0.2028, + "step": 34448 + }, + { + "epoch": 0.6395666170360497, + "grad_norm": 0.4148137867450714, + "learning_rate": 5.754327720518543e-06, + "loss": 0.1497, + "step": 34450 + }, + { + "epoch": 0.6396037471734682, + "grad_norm": 0.377964586019516, + "learning_rate": 5.7532716129415444e-06, + "loss": 0.2453, + "step": 34452 + }, + { + "epoch": 0.6396408773108869, + "grad_norm": 0.3893478512763977, + "learning_rate": 5.752215563149802e-06, + "loss": 0.2501, + "step": 34454 + }, + { + "epoch": 0.6396780074483056, + "grad_norm": 0.32430949807167053, + "learning_rate": 5.751159571157679e-06, + "loss": 0.3333, + "step": 34456 + }, + { + "epoch": 0.6397151375857242, + "grad_norm": 0.35612526535987854, + "learning_rate": 5.750103636979547e-06, + "loss": 0.4336, + "step": 34458 + }, + { + "epoch": 0.6397522677231429, + "grad_norm": 0.31727901101112366, + "learning_rate": 5.749047760629776e-06, + "loss": 0.2042, + "step": 34460 + }, + { + "epoch": 0.6397893978605614, + "grad_norm": 0.37208810448646545, + "learning_rate": 5.7479919421227305e-06, + "loss": 0.3188, + "step": 34462 + }, + { + "epoch": 0.6398265279979801, + "grad_norm": 0.5261711478233337, + "learning_rate": 5.746936181472777e-06, + "loss": 0.2907, + "step": 34464 + }, + { + "epoch": 0.6398636581353988, + "grad_norm": 0.2947002053260803, + "learning_rate": 5.745880478694289e-06, + "loss": 0.3634, + "step": 34466 + }, + { + "epoch": 0.6399007882728174, + "grad_norm": 0.2871014475822449, + "learning_rate": 5.744824833801621e-06, + "loss": 0.2795, + "step": 34468 + }, + { + "epoch": 0.6399379184102361, + "grad_norm": 0.19240085780620575, + "learning_rate": 5.743769246809141e-06, + "loss": 0.3702, + "step": 34470 + }, + { + "epoch": 0.6399750485476546, + "grad_norm": 0.4328199028968811, + "learning_rate": 5.742713717731216e-06, + "loss": 0.1297, + "step": 34472 + }, + { + "epoch": 0.6400121786850733, + "grad_norm": 0.8787009716033936, + "learning_rate": 5.741658246582201e-06, + "loss": 0.1305, + "step": 34474 + }, + { + "epoch": 0.640049308822492, + "grad_norm": 0.33513548970222473, + "learning_rate": 5.740602833376463e-06, + "loss": 0.3488, + "step": 34476 + }, + { + "epoch": 0.6400864389599106, + "grad_norm": 0.305398166179657, + "learning_rate": 5.739547478128362e-06, + "loss": 0.3491, + "step": 34478 + }, + { + "epoch": 0.6401235690973293, + "grad_norm": 0.5140475630760193, + "learning_rate": 5.738492180852262e-06, + "loss": 0.351, + "step": 34480 + }, + { + "epoch": 0.6401606992347478, + "grad_norm": 0.3031245768070221, + "learning_rate": 5.737436941562514e-06, + "loss": 0.4482, + "step": 34482 + }, + { + "epoch": 0.6401978293721665, + "grad_norm": 0.48454996943473816, + "learning_rate": 5.736381760273482e-06, + "loss": 0.1801, + "step": 34484 + }, + { + "epoch": 0.6402349595095852, + "grad_norm": 0.4383023679256439, + "learning_rate": 5.735326636999523e-06, + "loss": 0.2544, + "step": 34486 + }, + { + "epoch": 0.6402720896470038, + "grad_norm": 0.44977495074272156, + "learning_rate": 5.734271571754996e-06, + "loss": 0.2795, + "step": 34488 + }, + { + "epoch": 0.6403092197844225, + "grad_norm": 0.3827721178531647, + "learning_rate": 5.7332165645542534e-06, + "loss": 0.2078, + "step": 34490 + }, + { + "epoch": 0.640346349921841, + "grad_norm": 0.3157905638217926, + "learning_rate": 5.732161615411657e-06, + "loss": 0.3334, + "step": 34492 + }, + { + "epoch": 0.6403834800592597, + "grad_norm": 0.32621458172798157, + "learning_rate": 5.73110672434156e-06, + "loss": 0.201, + "step": 34494 + }, + { + "epoch": 0.6404206101966783, + "grad_norm": 0.271375447511673, + "learning_rate": 5.730051891358307e-06, + "loss": 0.3317, + "step": 34496 + }, + { + "epoch": 0.640457740334097, + "grad_norm": 0.3328658938407898, + "learning_rate": 5.7289971164762585e-06, + "loss": 0.3082, + "step": 34498 + }, + { + "epoch": 0.6404948704715157, + "grad_norm": 0.3727584183216095, + "learning_rate": 5.727942399709766e-06, + "loss": 0.2013, + "step": 34500 + }, + { + "epoch": 0.6405320006089342, + "grad_norm": 0.6493826508522034, + "learning_rate": 5.726887741073182e-06, + "loss": 0.2665, + "step": 34502 + }, + { + "epoch": 0.6405691307463529, + "grad_norm": 0.2889195382595062, + "learning_rate": 5.725833140580859e-06, + "loss": 0.2028, + "step": 34504 + }, + { + "epoch": 0.6406062608837715, + "grad_norm": 0.46604329347610474, + "learning_rate": 5.724778598247141e-06, + "loss": 0.1538, + "step": 34506 + }, + { + "epoch": 0.6406433910211902, + "grad_norm": 0.28957000374794006, + "learning_rate": 5.7237241140863795e-06, + "loss": 0.2749, + "step": 34508 + }, + { + "epoch": 0.6406805211586089, + "grad_norm": 0.4267987906932831, + "learning_rate": 5.722669688112925e-06, + "loss": 0.3562, + "step": 34510 + }, + { + "epoch": 0.6407176512960274, + "grad_norm": 0.43735820055007935, + "learning_rate": 5.721615320341125e-06, + "loss": 0.3494, + "step": 34512 + }, + { + "epoch": 0.6407547814334461, + "grad_norm": 0.5622162818908691, + "learning_rate": 5.720561010785327e-06, + "loss": 0.361, + "step": 34514 + }, + { + "epoch": 0.6407919115708647, + "grad_norm": 0.3882259428501129, + "learning_rate": 5.719506759459872e-06, + "loss": 0.392, + "step": 34516 + }, + { + "epoch": 0.6408290417082834, + "grad_norm": 0.2676790952682495, + "learning_rate": 5.718452566379114e-06, + "loss": 0.1966, + "step": 34518 + }, + { + "epoch": 0.640866171845702, + "grad_norm": 0.4147421717643738, + "learning_rate": 5.717398431557386e-06, + "loss": 0.2436, + "step": 34520 + }, + { + "epoch": 0.6409033019831206, + "grad_norm": 0.7291442155838013, + "learning_rate": 5.716344355009038e-06, + "loss": 0.3866, + "step": 34522 + }, + { + "epoch": 0.6409404321205393, + "grad_norm": 0.4977591335773468, + "learning_rate": 5.715290336748413e-06, + "loss": 0.2817, + "step": 34524 + }, + { + "epoch": 0.6409775622579579, + "grad_norm": 0.47533005475997925, + "learning_rate": 5.7142363767898515e-06, + "loss": 0.1583, + "step": 34526 + }, + { + "epoch": 0.6410146923953766, + "grad_norm": 0.3429177403450012, + "learning_rate": 5.713182475147696e-06, + "loss": 0.2045, + "step": 34528 + }, + { + "epoch": 0.6410518225327952, + "grad_norm": 0.30088746547698975, + "learning_rate": 5.712128631836289e-06, + "loss": 0.2331, + "step": 34530 + }, + { + "epoch": 0.6410889526702138, + "grad_norm": 0.4164296090602875, + "learning_rate": 5.711074846869966e-06, + "loss": 0.28, + "step": 34532 + }, + { + "epoch": 0.6411260828076325, + "grad_norm": 0.515152633190155, + "learning_rate": 5.710021120263066e-06, + "loss": 0.223, + "step": 34534 + }, + { + "epoch": 0.6411632129450511, + "grad_norm": 0.3355543911457062, + "learning_rate": 5.708967452029933e-06, + "loss": 0.2847, + "step": 34536 + }, + { + "epoch": 0.6412003430824698, + "grad_norm": 0.39738187193870544, + "learning_rate": 5.7079138421848955e-06, + "loss": 0.393, + "step": 34538 + }, + { + "epoch": 0.6412374732198883, + "grad_norm": 0.560077428817749, + "learning_rate": 5.706860290742296e-06, + "loss": 0.1241, + "step": 34540 + }, + { + "epoch": 0.641274603357307, + "grad_norm": 0.4178966283798218, + "learning_rate": 5.70580679771647e-06, + "loss": 0.2877, + "step": 34542 + }, + { + "epoch": 0.6413117334947257, + "grad_norm": 0.4897531270980835, + "learning_rate": 5.704753363121749e-06, + "loss": 0.2128, + "step": 34544 + }, + { + "epoch": 0.6413488636321443, + "grad_norm": 0.3979819715023041, + "learning_rate": 5.703699986972467e-06, + "loss": 0.2481, + "step": 34546 + }, + { + "epoch": 0.641385993769563, + "grad_norm": 0.41877618432044983, + "learning_rate": 5.702646669282961e-06, + "loss": 0.3074, + "step": 34548 + }, + { + "epoch": 0.6414231239069815, + "grad_norm": 0.43111488223075867, + "learning_rate": 5.70159341006756e-06, + "loss": 0.2319, + "step": 34550 + }, + { + "epoch": 0.6414602540444002, + "grad_norm": 0.2287842035293579, + "learning_rate": 5.700540209340599e-06, + "loss": 0.2187, + "step": 34552 + }, + { + "epoch": 0.6414973841818189, + "grad_norm": 0.46635547280311584, + "learning_rate": 5.699487067116411e-06, + "loss": 0.2224, + "step": 34554 + }, + { + "epoch": 0.6415345143192375, + "grad_norm": 0.5059830546379089, + "learning_rate": 5.698433983409318e-06, + "loss": 0.1089, + "step": 34556 + }, + { + "epoch": 0.6415716444566562, + "grad_norm": 0.28047865629196167, + "learning_rate": 5.697380958233658e-06, + "loss": 0.1653, + "step": 34558 + }, + { + "epoch": 0.6416087745940747, + "grad_norm": 0.33498844504356384, + "learning_rate": 5.696327991603752e-06, + "loss": 0.1747, + "step": 34560 + }, + { + "epoch": 0.6416459047314934, + "grad_norm": 0.39726707339286804, + "learning_rate": 5.695275083533931e-06, + "loss": 0.3921, + "step": 34562 + }, + { + "epoch": 0.6416830348689121, + "grad_norm": 0.5147075653076172, + "learning_rate": 5.694222234038523e-06, + "loss": 0.4023, + "step": 34564 + }, + { + "epoch": 0.6417201650063307, + "grad_norm": 0.3728301227092743, + "learning_rate": 5.693169443131849e-06, + "loss": 0.4022, + "step": 34566 + }, + { + "epoch": 0.6417572951437494, + "grad_norm": 0.4728337228298187, + "learning_rate": 5.692116710828246e-06, + "loss": 0.1201, + "step": 34568 + }, + { + "epoch": 0.6417944252811679, + "grad_norm": 0.30708497762680054, + "learning_rate": 5.691064037142027e-06, + "loss": 0.2886, + "step": 34570 + }, + { + "epoch": 0.6418315554185866, + "grad_norm": 0.4832508862018585, + "learning_rate": 5.690011422087518e-06, + "loss": 0.2247, + "step": 34572 + }, + { + "epoch": 0.6418686855560053, + "grad_norm": 0.4336124062538147, + "learning_rate": 5.688958865679044e-06, + "loss": 0.3838, + "step": 34574 + }, + { + "epoch": 0.6419058156934239, + "grad_norm": 0.43545395135879517, + "learning_rate": 5.687906367930931e-06, + "loss": 0.2008, + "step": 34576 + }, + { + "epoch": 0.6419429458308425, + "grad_norm": 0.3618486225605011, + "learning_rate": 5.686853928857492e-06, + "loss": 0.1908, + "step": 34578 + }, + { + "epoch": 0.6419800759682611, + "grad_norm": 0.343269407749176, + "learning_rate": 5.685801548473057e-06, + "loss": 0.18, + "step": 34580 + }, + { + "epoch": 0.6420172061056798, + "grad_norm": 0.33832189440727234, + "learning_rate": 5.684749226791935e-06, + "loss": 0.2567, + "step": 34582 + }, + { + "epoch": 0.6420543362430985, + "grad_norm": 0.37028977274894714, + "learning_rate": 5.683696963828451e-06, + "loss": 0.3512, + "step": 34584 + }, + { + "epoch": 0.6420914663805171, + "grad_norm": 0.3490130603313446, + "learning_rate": 5.682644759596923e-06, + "loss": 0.4354, + "step": 34586 + }, + { + "epoch": 0.6421285965179357, + "grad_norm": 0.4102526307106018, + "learning_rate": 5.6815926141116665e-06, + "loss": 0.2374, + "step": 34588 + }, + { + "epoch": 0.6421657266553543, + "grad_norm": 0.3075184226036072, + "learning_rate": 5.680540527386999e-06, + "loss": 0.3928, + "step": 34590 + }, + { + "epoch": 0.642202856792773, + "grad_norm": 0.30693307518959045, + "learning_rate": 5.679488499437242e-06, + "loss": 0.303, + "step": 34592 + }, + { + "epoch": 0.6422399869301916, + "grad_norm": 0.28893569111824036, + "learning_rate": 5.678436530276701e-06, + "loss": 0.4322, + "step": 34594 + }, + { + "epoch": 0.6422771170676103, + "grad_norm": 0.40623289346694946, + "learning_rate": 5.677384619919693e-06, + "loss": 0.3072, + "step": 34596 + }, + { + "epoch": 0.6423142472050289, + "grad_norm": 0.5637116432189941, + "learning_rate": 5.676332768380535e-06, + "loss": 0.5271, + "step": 34598 + }, + { + "epoch": 0.6423513773424475, + "grad_norm": 0.8443187475204468, + "learning_rate": 5.675280975673538e-06, + "loss": 0.3471, + "step": 34600 + }, + { + "epoch": 0.6423885074798662, + "grad_norm": 0.6141651272773743, + "learning_rate": 5.674229241813012e-06, + "loss": 0.3658, + "step": 34602 + }, + { + "epoch": 0.6424256376172848, + "grad_norm": 0.6757946610450745, + "learning_rate": 5.673177566813266e-06, + "loss": 0.4089, + "step": 34604 + }, + { + "epoch": 0.6424627677547035, + "grad_norm": 0.35423508286476135, + "learning_rate": 5.672125950688618e-06, + "loss": 0.3595, + "step": 34606 + }, + { + "epoch": 0.6424998978921221, + "grad_norm": 0.2510984241962433, + "learning_rate": 5.67107439345337e-06, + "loss": 0.2174, + "step": 34608 + }, + { + "epoch": 0.6425370280295407, + "grad_norm": 0.43001770973205566, + "learning_rate": 5.670022895121832e-06, + "loss": 0.3229, + "step": 34610 + }, + { + "epoch": 0.6425741581669594, + "grad_norm": 0.48932796716690063, + "learning_rate": 5.668971455708311e-06, + "loss": 0.4381, + "step": 34612 + }, + { + "epoch": 0.642611288304378, + "grad_norm": 0.3985104262828827, + "learning_rate": 5.667920075227117e-06, + "loss": 0.1877, + "step": 34614 + }, + { + "epoch": 0.6426484184417967, + "grad_norm": 0.34801462292671204, + "learning_rate": 5.666868753692554e-06, + "loss": 0.311, + "step": 34616 + }, + { + "epoch": 0.6426855485792153, + "grad_norm": 0.36197179555892944, + "learning_rate": 5.665817491118932e-06, + "loss": 0.3285, + "step": 34618 + }, + { + "epoch": 0.6427226787166339, + "grad_norm": 0.3744148313999176, + "learning_rate": 5.664766287520549e-06, + "loss": 0.2925, + "step": 34620 + }, + { + "epoch": 0.6427598088540526, + "grad_norm": 0.38207340240478516, + "learning_rate": 5.663715142911715e-06, + "loss": 0.0593, + "step": 34622 + }, + { + "epoch": 0.6427969389914712, + "grad_norm": 0.1998310089111328, + "learning_rate": 5.662664057306724e-06, + "loss": 0.2616, + "step": 34624 + }, + { + "epoch": 0.6428340691288899, + "grad_norm": 0.2842331528663635, + "learning_rate": 5.661613030719883e-06, + "loss": 0.3475, + "step": 34626 + }, + { + "epoch": 0.6428711992663085, + "grad_norm": 0.3710392117500305, + "learning_rate": 5.660562063165495e-06, + "loss": 0.4012, + "step": 34628 + }, + { + "epoch": 0.6429083294037271, + "grad_norm": 0.3332996368408203, + "learning_rate": 5.659511154657862e-06, + "loss": 0.4786, + "step": 34630 + }, + { + "epoch": 0.6429454595411458, + "grad_norm": 0.37276142835617065, + "learning_rate": 5.6584603052112776e-06, + "loss": 0.1717, + "step": 34632 + }, + { + "epoch": 0.6429825896785644, + "grad_norm": 0.4062551259994507, + "learning_rate": 5.657409514840044e-06, + "loss": 0.1661, + "step": 34634 + }, + { + "epoch": 0.643019719815983, + "grad_norm": 0.3422892391681671, + "learning_rate": 5.65635878355846e-06, + "loss": 0.2376, + "step": 34636 + }, + { + "epoch": 0.6430568499534017, + "grad_norm": 0.3834654688835144, + "learning_rate": 5.655308111380822e-06, + "loss": 0.16, + "step": 34638 + }, + { + "epoch": 0.6430939800908203, + "grad_norm": 0.3469104766845703, + "learning_rate": 5.65425749832143e-06, + "loss": 0.2378, + "step": 34640 + }, + { + "epoch": 0.643131110228239, + "grad_norm": 0.39868664741516113, + "learning_rate": 5.6532069443945735e-06, + "loss": 0.1257, + "step": 34642 + }, + { + "epoch": 0.6431682403656576, + "grad_norm": 0.4539540112018585, + "learning_rate": 5.6521564496145535e-06, + "loss": 0.2833, + "step": 34644 + }, + { + "epoch": 0.6432053705030762, + "grad_norm": 0.4189632833003998, + "learning_rate": 5.651106013995657e-06, + "loss": 0.2584, + "step": 34646 + }, + { + "epoch": 0.6432425006404948, + "grad_norm": 0.34033602476119995, + "learning_rate": 5.650055637552181e-06, + "loss": 0.4255, + "step": 34648 + }, + { + "epoch": 0.6432796307779135, + "grad_norm": 0.42274802923202515, + "learning_rate": 5.6490053202984185e-06, + "loss": 0.4432, + "step": 34650 + }, + { + "epoch": 0.6433167609153322, + "grad_norm": 0.5104333758354187, + "learning_rate": 5.6479550622486616e-06, + "loss": 0.3391, + "step": 34652 + }, + { + "epoch": 0.6433538910527508, + "grad_norm": 0.6274007558822632, + "learning_rate": 5.6469048634172e-06, + "loss": 0.4583, + "step": 34654 + }, + { + "epoch": 0.6433910211901694, + "grad_norm": 0.40335410833358765, + "learning_rate": 5.645854723818327e-06, + "loss": 0.2771, + "step": 34656 + }, + { + "epoch": 0.643428151327588, + "grad_norm": 0.33272022008895874, + "learning_rate": 5.644804643466326e-06, + "loss": 0.2386, + "step": 34658 + }, + { + "epoch": 0.6434652814650067, + "grad_norm": 0.4339945614337921, + "learning_rate": 5.643754622375488e-06, + "loss": 0.3071, + "step": 34660 + }, + { + "epoch": 0.6435024116024254, + "grad_norm": 0.34428122639656067, + "learning_rate": 5.642704660560105e-06, + "loss": 0.2874, + "step": 34662 + }, + { + "epoch": 0.643539541739844, + "grad_norm": 0.6480374336242676, + "learning_rate": 5.6416547580344574e-06, + "loss": 0.1827, + "step": 34664 + }, + { + "epoch": 0.6435766718772626, + "grad_norm": 0.2720595896244049, + "learning_rate": 5.640604914812833e-06, + "loss": 0.1776, + "step": 34666 + }, + { + "epoch": 0.6436138020146812, + "grad_norm": 0.28396087884902954, + "learning_rate": 5.639555130909522e-06, + "loss": 0.1213, + "step": 34668 + }, + { + "epoch": 0.6436509321520999, + "grad_norm": 0.4466366171836853, + "learning_rate": 5.638505406338799e-06, + "loss": 0.3292, + "step": 34670 + }, + { + "epoch": 0.6436880622895186, + "grad_norm": 0.31564563512802124, + "learning_rate": 5.637455741114954e-06, + "loss": 0.2693, + "step": 34672 + }, + { + "epoch": 0.6437251924269372, + "grad_norm": 0.2586180865764618, + "learning_rate": 5.63640613525227e-06, + "loss": 0.0887, + "step": 34674 + }, + { + "epoch": 0.6437623225643558, + "grad_norm": 0.3365249037742615, + "learning_rate": 5.635356588765028e-06, + "loss": 0.2423, + "step": 34676 + }, + { + "epoch": 0.6437994527017744, + "grad_norm": 0.4494837820529938, + "learning_rate": 5.634307101667509e-06, + "loss": 0.2423, + "step": 34678 + }, + { + "epoch": 0.6438365828391931, + "grad_norm": 0.34529224038124084, + "learning_rate": 5.633257673973997e-06, + "loss": 0.2712, + "step": 34680 + }, + { + "epoch": 0.6438737129766118, + "grad_norm": 0.4213443994522095, + "learning_rate": 5.632208305698765e-06, + "loss": 0.2152, + "step": 34682 + }, + { + "epoch": 0.6439108431140304, + "grad_norm": 0.437482088804245, + "learning_rate": 5.631158996856099e-06, + "loss": 0.2496, + "step": 34684 + }, + { + "epoch": 0.643947973251449, + "grad_norm": 0.3434986472129822, + "learning_rate": 5.630109747460267e-06, + "loss": 0.2901, + "step": 34686 + }, + { + "epoch": 0.6439851033888676, + "grad_norm": 0.4669639468193054, + "learning_rate": 5.629060557525554e-06, + "loss": 0.336, + "step": 34688 + }, + { + "epoch": 0.6440222335262863, + "grad_norm": 0.2946740686893463, + "learning_rate": 5.628011427066233e-06, + "loss": 0.221, + "step": 34690 + }, + { + "epoch": 0.6440593636637049, + "grad_norm": 0.31371450424194336, + "learning_rate": 5.626962356096581e-06, + "loss": 0.3982, + "step": 34692 + }, + { + "epoch": 0.6440964938011235, + "grad_norm": 0.2911532521247864, + "learning_rate": 5.625913344630878e-06, + "loss": 0.4586, + "step": 34694 + }, + { + "epoch": 0.6441336239385422, + "grad_norm": 0.35006454586982727, + "learning_rate": 5.624864392683387e-06, + "loss": 0.4806, + "step": 34696 + }, + { + "epoch": 0.6441707540759608, + "grad_norm": 0.38948938250541687, + "learning_rate": 5.6238155002683885e-06, + "loss": 0.3223, + "step": 34698 + }, + { + "epoch": 0.6442078842133795, + "grad_norm": 0.44755303859710693, + "learning_rate": 5.62276666740015e-06, + "loss": 0.2799, + "step": 34700 + }, + { + "epoch": 0.6442450143507981, + "grad_norm": 0.400484174489975, + "learning_rate": 5.621717894092947e-06, + "loss": 0.1782, + "step": 34702 + }, + { + "epoch": 0.6442821444882167, + "grad_norm": 0.6038129925727844, + "learning_rate": 5.6206691803610536e-06, + "loss": 0.4655, + "step": 34704 + }, + { + "epoch": 0.6443192746256354, + "grad_norm": 0.3647526204586029, + "learning_rate": 5.619620526218731e-06, + "loss": 0.3203, + "step": 34706 + }, + { + "epoch": 0.644356404763054, + "grad_norm": 0.36976975202560425, + "learning_rate": 5.618571931680255e-06, + "loss": 0.3028, + "step": 34708 + }, + { + "epoch": 0.6443935349004727, + "grad_norm": 0.42161667346954346, + "learning_rate": 5.61752339675989e-06, + "loss": 0.3183, + "step": 34710 + }, + { + "epoch": 0.6444306650378913, + "grad_norm": 0.3405196964740753, + "learning_rate": 5.616474921471902e-06, + "loss": 0.4521, + "step": 34712 + }, + { + "epoch": 0.6444677951753099, + "grad_norm": 0.4106855094432831, + "learning_rate": 5.61542650583056e-06, + "loss": 0.3542, + "step": 34714 + }, + { + "epoch": 0.6445049253127286, + "grad_norm": 0.4027274549007416, + "learning_rate": 5.614378149850131e-06, + "loss": 0.2645, + "step": 34716 + }, + { + "epoch": 0.6445420554501472, + "grad_norm": 0.3040961027145386, + "learning_rate": 5.613329853544882e-06, + "loss": 0.2581, + "step": 34718 + }, + { + "epoch": 0.6445791855875659, + "grad_norm": 0.2999259829521179, + "learning_rate": 5.612281616929071e-06, + "loss": 0.2352, + "step": 34720 + }, + { + "epoch": 0.6446163157249845, + "grad_norm": 0.34112080931663513, + "learning_rate": 5.611233440016964e-06, + "loss": 0.2061, + "step": 34722 + }, + { + "epoch": 0.6446534458624031, + "grad_norm": 0.27394694089889526, + "learning_rate": 5.610185322822824e-06, + "loss": 0.2321, + "step": 34724 + }, + { + "epoch": 0.6446905759998218, + "grad_norm": 0.1619529277086258, + "learning_rate": 5.6091372653609175e-06, + "loss": 0.2491, + "step": 34726 + }, + { + "epoch": 0.6447277061372404, + "grad_norm": 0.3059213161468506, + "learning_rate": 5.608089267645496e-06, + "loss": 0.4792, + "step": 34728 + }, + { + "epoch": 0.6447648362746591, + "grad_norm": 0.3793026804924011, + "learning_rate": 5.607041329690824e-06, + "loss": 0.3044, + "step": 34730 + }, + { + "epoch": 0.6448019664120777, + "grad_norm": 0.4118514955043793, + "learning_rate": 5.605993451511166e-06, + "loss": 0.3038, + "step": 34732 + }, + { + "epoch": 0.6448390965494963, + "grad_norm": 0.4746505618095398, + "learning_rate": 5.604945633120771e-06, + "loss": 0.2842, + "step": 34734 + }, + { + "epoch": 0.644876226686915, + "grad_norm": 0.3615133762359619, + "learning_rate": 5.603897874533901e-06, + "loss": 0.4499, + "step": 34736 + }, + { + "epoch": 0.6449133568243336, + "grad_norm": 0.4404677748680115, + "learning_rate": 5.6028501757648135e-06, + "loss": 0.2867, + "step": 34738 + }, + { + "epoch": 0.6449504869617523, + "grad_norm": 0.474431574344635, + "learning_rate": 5.601802536827763e-06, + "loss": 0.4728, + "step": 34740 + }, + { + "epoch": 0.6449876170991709, + "grad_norm": 0.46012353897094727, + "learning_rate": 5.6007549577370065e-06, + "loss": 0.3034, + "step": 34742 + }, + { + "epoch": 0.6450247472365895, + "grad_norm": 0.3508109152317047, + "learning_rate": 5.599707438506803e-06, + "loss": 0.2042, + "step": 34744 + }, + { + "epoch": 0.6450618773740081, + "grad_norm": 0.3310129940509796, + "learning_rate": 5.598659979151395e-06, + "loss": 0.1549, + "step": 34746 + }, + { + "epoch": 0.6450990075114268, + "grad_norm": 0.24820493161678314, + "learning_rate": 5.597612579685046e-06, + "loss": 0.1644, + "step": 34748 + }, + { + "epoch": 0.6451361376488455, + "grad_norm": 0.7365579009056091, + "learning_rate": 5.596565240121999e-06, + "loss": 0.2377, + "step": 34750 + }, + { + "epoch": 0.645173267786264, + "grad_norm": 0.3869255781173706, + "learning_rate": 5.595517960476509e-06, + "loss": 0.149, + "step": 34752 + }, + { + "epoch": 0.6452103979236827, + "grad_norm": 0.4437241554260254, + "learning_rate": 5.594470740762827e-06, + "loss": 0.1664, + "step": 34754 + }, + { + "epoch": 0.6452475280611013, + "grad_norm": 0.3068319261074066, + "learning_rate": 5.593423580995208e-06, + "loss": 0.2778, + "step": 34756 + }, + { + "epoch": 0.64528465819852, + "grad_norm": 0.4013836681842804, + "learning_rate": 5.592376481187888e-06, + "loss": 0.4642, + "step": 34758 + }, + { + "epoch": 0.6453217883359387, + "grad_norm": 0.45737430453300476, + "learning_rate": 5.5913294413551225e-06, + "loss": 0.2801, + "step": 34760 + }, + { + "epoch": 0.6453589184733572, + "grad_norm": 0.42493894696235657, + "learning_rate": 5.5902824615111584e-06, + "loss": 0.2295, + "step": 34762 + }, + { + "epoch": 0.6453960486107759, + "grad_norm": 0.47333043813705444, + "learning_rate": 5.58923554167024e-06, + "loss": 0.2335, + "step": 34764 + }, + { + "epoch": 0.6454331787481945, + "grad_norm": 0.41420790553092957, + "learning_rate": 5.588188681846615e-06, + "loss": 0.4207, + "step": 34766 + }, + { + "epoch": 0.6454703088856132, + "grad_norm": 0.3941937983036041, + "learning_rate": 5.587141882054532e-06, + "loss": 0.2703, + "step": 34768 + }, + { + "epoch": 0.6455074390230319, + "grad_norm": 0.36116212606430054, + "learning_rate": 5.586095142308229e-06, + "loss": 0.3403, + "step": 34770 + }, + { + "epoch": 0.6455445691604504, + "grad_norm": 0.30828458070755005, + "learning_rate": 5.5850484626219455e-06, + "loss": 0.4133, + "step": 34772 + }, + { + "epoch": 0.6455816992978691, + "grad_norm": 0.34383338689804077, + "learning_rate": 5.58400184300993e-06, + "loss": 0.2437, + "step": 34774 + }, + { + "epoch": 0.6456188294352877, + "grad_norm": 0.3185109794139862, + "learning_rate": 5.58295528348642e-06, + "loss": 0.2417, + "step": 34776 + }, + { + "epoch": 0.6456559595727064, + "grad_norm": 0.3361605107784271, + "learning_rate": 5.581908784065658e-06, + "loss": 0.4926, + "step": 34778 + }, + { + "epoch": 0.6456930897101251, + "grad_norm": 0.37308335304260254, + "learning_rate": 5.580862344761885e-06, + "loss": 0.3176, + "step": 34780 + }, + { + "epoch": 0.6457302198475436, + "grad_norm": 0.4026464521884918, + "learning_rate": 5.5798159655893415e-06, + "loss": 0.3016, + "step": 34782 + }, + { + "epoch": 0.6457673499849623, + "grad_norm": 0.47163859009742737, + "learning_rate": 5.578769646562259e-06, + "loss": 0.2099, + "step": 34784 + }, + { + "epoch": 0.6458044801223809, + "grad_norm": 0.5056636929512024, + "learning_rate": 5.57772338769488e-06, + "loss": 0.2569, + "step": 34786 + }, + { + "epoch": 0.6458416102597996, + "grad_norm": 0.35792338848114014, + "learning_rate": 5.576677189001438e-06, + "loss": 0.4732, + "step": 34788 + }, + { + "epoch": 0.6458787403972183, + "grad_norm": 0.3689761459827423, + "learning_rate": 5.575631050496173e-06, + "loss": 0.3575, + "step": 34790 + }, + { + "epoch": 0.6459158705346368, + "grad_norm": 0.34515824913978577, + "learning_rate": 5.574584972193315e-06, + "loss": 0.2438, + "step": 34792 + }, + { + "epoch": 0.6459530006720555, + "grad_norm": 0.3479342460632324, + "learning_rate": 5.573538954107103e-06, + "loss": 0.1738, + "step": 34794 + }, + { + "epoch": 0.6459901308094741, + "grad_norm": 0.33143702149391174, + "learning_rate": 5.572492996251763e-06, + "loss": 0.3422, + "step": 34796 + }, + { + "epoch": 0.6460272609468928, + "grad_norm": 0.35069042444229126, + "learning_rate": 5.571447098641532e-06, + "loss": 0.2284, + "step": 34798 + }, + { + "epoch": 0.6460643910843114, + "grad_norm": 0.37591543793678284, + "learning_rate": 5.57040126129064e-06, + "loss": 0.204, + "step": 34800 + }, + { + "epoch": 0.64610152122173, + "grad_norm": 0.33767032623291016, + "learning_rate": 5.569355484213319e-06, + "loss": 0.2309, + "step": 34802 + }, + { + "epoch": 0.6461386513591487, + "grad_norm": 0.28123676776885986, + "learning_rate": 5.568309767423799e-06, + "loss": 0.2502, + "step": 34804 + }, + { + "epoch": 0.6461757814965673, + "grad_norm": 0.3622833490371704, + "learning_rate": 5.567264110936308e-06, + "loss": 0.2284, + "step": 34806 + }, + { + "epoch": 0.646212911633986, + "grad_norm": 0.45907846093177795, + "learning_rate": 5.5662185147650785e-06, + "loss": 0.3109, + "step": 34808 + }, + { + "epoch": 0.6462500417714045, + "grad_norm": 0.41510286927223206, + "learning_rate": 5.565172978924331e-06, + "loss": 0.3457, + "step": 34810 + }, + { + "epoch": 0.6462871719088232, + "grad_norm": 0.32006219029426575, + "learning_rate": 5.564127503428299e-06, + "loss": 0.4219, + "step": 34812 + }, + { + "epoch": 0.6463243020462419, + "grad_norm": 0.3592986762523651, + "learning_rate": 5.563082088291201e-06, + "loss": 0.2188, + "step": 34814 + }, + { + "epoch": 0.6463614321836605, + "grad_norm": 0.5203841924667358, + "learning_rate": 5.562036733527265e-06, + "loss": 0.2687, + "step": 34816 + }, + { + "epoch": 0.6463985623210792, + "grad_norm": 0.6168431639671326, + "learning_rate": 5.560991439150718e-06, + "loss": 0.2128, + "step": 34818 + }, + { + "epoch": 0.6464356924584977, + "grad_norm": 0.3143469989299774, + "learning_rate": 5.559946205175784e-06, + "loss": 0.3074, + "step": 34820 + }, + { + "epoch": 0.6464728225959164, + "grad_norm": 0.8062521815299988, + "learning_rate": 5.558901031616677e-06, + "loss": 0.3613, + "step": 34822 + }, + { + "epoch": 0.6465099527333351, + "grad_norm": 0.6121131777763367, + "learning_rate": 5.557855918487626e-06, + "loss": 0.275, + "step": 34824 + }, + { + "epoch": 0.6465470828707537, + "grad_norm": 0.40222230553627014, + "learning_rate": 5.55681086580285e-06, + "loss": 0.1617, + "step": 34826 + }, + { + "epoch": 0.6465842130081724, + "grad_norm": 0.5299124121665955, + "learning_rate": 5.555765873576568e-06, + "loss": 0.4369, + "step": 34828 + }, + { + "epoch": 0.6466213431455909, + "grad_norm": 0.48029881715774536, + "learning_rate": 5.554720941823006e-06, + "loss": 0.3489, + "step": 34830 + }, + { + "epoch": 0.6466584732830096, + "grad_norm": 0.3887456953525543, + "learning_rate": 5.553676070556373e-06, + "loss": 0.215, + "step": 34832 + }, + { + "epoch": 0.6466956034204283, + "grad_norm": 0.43169882893562317, + "learning_rate": 5.5526312597908924e-06, + "loss": 0.2094, + "step": 34834 + }, + { + "epoch": 0.6467327335578469, + "grad_norm": 0.5147672295570374, + "learning_rate": 5.551586509540776e-06, + "loss": 0.5572, + "step": 34836 + }, + { + "epoch": 0.6467698636952656, + "grad_norm": 0.4072299003601074, + "learning_rate": 5.550541819820243e-06, + "loss": 0.1439, + "step": 34838 + }, + { + "epoch": 0.6468069938326841, + "grad_norm": 0.2971310019493103, + "learning_rate": 5.549497190643509e-06, + "loss": 0.3772, + "step": 34840 + }, + { + "epoch": 0.6468441239701028, + "grad_norm": 0.1707666665315628, + "learning_rate": 5.548452622024787e-06, + "loss": 0.1632, + "step": 34842 + }, + { + "epoch": 0.6468812541075214, + "grad_norm": 0.3800630271434784, + "learning_rate": 5.547408113978294e-06, + "loss": 0.1908, + "step": 34844 + }, + { + "epoch": 0.6469183842449401, + "grad_norm": 0.2611372768878937, + "learning_rate": 5.546363666518235e-06, + "loss": 0.2418, + "step": 34846 + }, + { + "epoch": 0.6469555143823588, + "grad_norm": 0.31096985936164856, + "learning_rate": 5.5453192796588276e-06, + "loss": 0.339, + "step": 34848 + }, + { + "epoch": 0.6469926445197773, + "grad_norm": 0.43462124466896057, + "learning_rate": 5.544274953414282e-06, + "loss": 0.453, + "step": 34850 + }, + { + "epoch": 0.647029774657196, + "grad_norm": 0.7805233597755432, + "learning_rate": 5.54323068779881e-06, + "loss": 0.3628, + "step": 34852 + }, + { + "epoch": 0.6470669047946146, + "grad_norm": 0.40374529361724854, + "learning_rate": 5.542186482826616e-06, + "loss": 0.1895, + "step": 34854 + }, + { + "epoch": 0.6471040349320333, + "grad_norm": 0.4321712255477905, + "learning_rate": 5.541142338511911e-06, + "loss": 0.2566, + "step": 34856 + }, + { + "epoch": 0.647141165069452, + "grad_norm": 0.48027125000953674, + "learning_rate": 5.540098254868906e-06, + "loss": 0.2572, + "step": 34858 + }, + { + "epoch": 0.6471782952068705, + "grad_norm": 0.24633851647377014, + "learning_rate": 5.539054231911803e-06, + "loss": 0.1767, + "step": 34860 + }, + { + "epoch": 0.6472154253442892, + "grad_norm": 0.3651930093765259, + "learning_rate": 5.538010269654807e-06, + "loss": 0.2137, + "step": 34862 + }, + { + "epoch": 0.6472525554817078, + "grad_norm": 0.26462599635124207, + "learning_rate": 5.536966368112129e-06, + "loss": 0.0579, + "step": 34864 + }, + { + "epoch": 0.6472896856191265, + "grad_norm": 0.2882705628871918, + "learning_rate": 5.5359225272979674e-06, + "loss": 0.2956, + "step": 34866 + }, + { + "epoch": 0.6473268157565452, + "grad_norm": 0.4609827399253845, + "learning_rate": 5.534878747226531e-06, + "loss": 0.1048, + "step": 34868 + }, + { + "epoch": 0.6473639458939637, + "grad_norm": 0.3137326240539551, + "learning_rate": 5.5338350279120245e-06, + "loss": 0.3685, + "step": 34870 + }, + { + "epoch": 0.6474010760313824, + "grad_norm": 0.2897990345954895, + "learning_rate": 5.53279136936864e-06, + "loss": 0.2807, + "step": 34872 + }, + { + "epoch": 0.647438206168801, + "grad_norm": 0.39075005054473877, + "learning_rate": 5.531747771610585e-06, + "loss": 0.2715, + "step": 34874 + }, + { + "epoch": 0.6474753363062197, + "grad_norm": 0.32559868693351746, + "learning_rate": 5.530704234652062e-06, + "loss": 0.215, + "step": 34876 + }, + { + "epoch": 0.6475124664436384, + "grad_norm": 0.5592748522758484, + "learning_rate": 5.529660758507265e-06, + "loss": 0.2455, + "step": 34878 + }, + { + "epoch": 0.6475495965810569, + "grad_norm": 0.37045055627822876, + "learning_rate": 5.528617343190393e-06, + "loss": 0.3569, + "step": 34880 + }, + { + "epoch": 0.6475867267184756, + "grad_norm": 0.4509209394454956, + "learning_rate": 5.527573988715651e-06, + "loss": 0.137, + "step": 34882 + }, + { + "epoch": 0.6476238568558942, + "grad_norm": 0.3632057309150696, + "learning_rate": 5.526530695097225e-06, + "loss": 0.1768, + "step": 34884 + }, + { + "epoch": 0.6476609869933129, + "grad_norm": 0.44686976075172424, + "learning_rate": 5.525487462349318e-06, + "loss": 0.3729, + "step": 34886 + }, + { + "epoch": 0.6476981171307316, + "grad_norm": 0.37398457527160645, + "learning_rate": 5.524444290486124e-06, + "loss": 0.2536, + "step": 34888 + }, + { + "epoch": 0.6477352472681501, + "grad_norm": 0.314525306224823, + "learning_rate": 5.523401179521837e-06, + "loss": 0.2924, + "step": 34890 + }, + { + "epoch": 0.6477723774055688, + "grad_norm": 0.24351750314235687, + "learning_rate": 5.52235812947065e-06, + "loss": 0.2495, + "step": 34892 + }, + { + "epoch": 0.6478095075429874, + "grad_norm": 0.3813991844654083, + "learning_rate": 5.521315140346761e-06, + "loss": 0.2684, + "step": 34894 + }, + { + "epoch": 0.6478466376804061, + "grad_norm": 0.252829909324646, + "learning_rate": 5.520272212164355e-06, + "loss": 0.2235, + "step": 34896 + }, + { + "epoch": 0.6478837678178246, + "grad_norm": 0.45496848225593567, + "learning_rate": 5.519229344937629e-06, + "loss": 0.297, + "step": 34898 + }, + { + "epoch": 0.6479208979552433, + "grad_norm": 0.32488980889320374, + "learning_rate": 5.518186538680766e-06, + "loss": 0.2563, + "step": 34900 + }, + { + "epoch": 0.647958028092662, + "grad_norm": 0.4425840973854065, + "learning_rate": 5.51714379340796e-06, + "loss": 0.4079, + "step": 34902 + }, + { + "epoch": 0.6479951582300806, + "grad_norm": 0.32276326417922974, + "learning_rate": 5.5161011091334e-06, + "loss": 0.3124, + "step": 34904 + }, + { + "epoch": 0.6480322883674993, + "grad_norm": 0.24568946659564972, + "learning_rate": 5.515058485871272e-06, + "loss": 0.3151, + "step": 34906 + }, + { + "epoch": 0.6480694185049178, + "grad_norm": 0.4530656337738037, + "learning_rate": 5.514015923635768e-06, + "loss": 0.4042, + "step": 34908 + }, + { + "epoch": 0.6481065486423365, + "grad_norm": 0.37952136993408203, + "learning_rate": 5.512973422441067e-06, + "loss": 0.4066, + "step": 34910 + }, + { + "epoch": 0.6481436787797552, + "grad_norm": 0.21045108139514923, + "learning_rate": 5.511930982301357e-06, + "loss": 0.3288, + "step": 34912 + }, + { + "epoch": 0.6481808089171738, + "grad_norm": 0.24382953345775604, + "learning_rate": 5.510888603230823e-06, + "loss": 0.2468, + "step": 34914 + }, + { + "epoch": 0.6482179390545925, + "grad_norm": 0.38174641132354736, + "learning_rate": 5.509846285243653e-06, + "loss": 0.3297, + "step": 34916 + }, + { + "epoch": 0.648255069192011, + "grad_norm": 0.195305734872818, + "learning_rate": 5.508804028354021e-06, + "loss": 0.1979, + "step": 34918 + }, + { + "epoch": 0.6482921993294297, + "grad_norm": 0.4923331141471863, + "learning_rate": 5.5077618325761176e-06, + "loss": 0.2503, + "step": 34920 + }, + { + "epoch": 0.6483293294668484, + "grad_norm": 0.3371700942516327, + "learning_rate": 5.506719697924116e-06, + "loss": 0.1046, + "step": 34922 + }, + { + "epoch": 0.648366459604267, + "grad_norm": 0.4205317795276642, + "learning_rate": 5.5056776244122e-06, + "loss": 0.1732, + "step": 34924 + }, + { + "epoch": 0.6484035897416857, + "grad_norm": 0.4075887203216553, + "learning_rate": 5.5046356120545496e-06, + "loss": 0.0798, + "step": 34926 + }, + { + "epoch": 0.6484407198791042, + "grad_norm": 0.9094563126564026, + "learning_rate": 5.503593660865344e-06, + "loss": 0.144, + "step": 34928 + }, + { + "epoch": 0.6484778500165229, + "grad_norm": 0.3482798933982849, + "learning_rate": 5.502551770858759e-06, + "loss": 0.2812, + "step": 34930 + }, + { + "epoch": 0.6485149801539416, + "grad_norm": 0.4022442400455475, + "learning_rate": 5.501509942048975e-06, + "loss": 0.1589, + "step": 34932 + }, + { + "epoch": 0.6485521102913602, + "grad_norm": 0.31005892157554626, + "learning_rate": 5.5004681744501684e-06, + "loss": 0.3069, + "step": 34934 + }, + { + "epoch": 0.6485892404287789, + "grad_norm": 0.42672133445739746, + "learning_rate": 5.49942646807651e-06, + "loss": 0.2984, + "step": 34936 + }, + { + "epoch": 0.6486263705661974, + "grad_norm": 0.3502918779850006, + "learning_rate": 5.498384822942178e-06, + "loss": 0.4202, + "step": 34938 + }, + { + "epoch": 0.6486635007036161, + "grad_norm": 0.486399382352829, + "learning_rate": 5.4973432390613435e-06, + "loss": 0.3509, + "step": 34940 + }, + { + "epoch": 0.6487006308410348, + "grad_norm": 0.318162739276886, + "learning_rate": 5.496301716448179e-06, + "loss": 0.225, + "step": 34942 + }, + { + "epoch": 0.6487377609784534, + "grad_norm": 0.22610719501972198, + "learning_rate": 5.4952602551168584e-06, + "loss": 0.2994, + "step": 34944 + }, + { + "epoch": 0.648774891115872, + "grad_norm": 0.21827496588230133, + "learning_rate": 5.494218855081557e-06, + "loss": 0.402, + "step": 34946 + }, + { + "epoch": 0.6488120212532906, + "grad_norm": 0.29414430260658264, + "learning_rate": 5.493177516356435e-06, + "loss": 0.2127, + "step": 34948 + }, + { + "epoch": 0.6488491513907093, + "grad_norm": 0.269583523273468, + "learning_rate": 5.492136238955667e-06, + "loss": 0.3591, + "step": 34950 + }, + { + "epoch": 0.6488862815281279, + "grad_norm": 0.5178949236869812, + "learning_rate": 5.491095022893425e-06, + "loss": 0.3144, + "step": 34952 + }, + { + "epoch": 0.6489234116655466, + "grad_norm": 0.4178057312965393, + "learning_rate": 5.490053868183872e-06, + "loss": 0.4341, + "step": 34954 + }, + { + "epoch": 0.6489605418029653, + "grad_norm": 0.4074004888534546, + "learning_rate": 5.4890127748411785e-06, + "loss": 0.2682, + "step": 34956 + }, + { + "epoch": 0.6489976719403838, + "grad_norm": 0.2792261242866516, + "learning_rate": 5.4879717428795124e-06, + "loss": 0.1902, + "step": 34958 + }, + { + "epoch": 0.6490348020778025, + "grad_norm": 0.5006771087646484, + "learning_rate": 5.486930772313035e-06, + "loss": 0.3362, + "step": 34960 + }, + { + "epoch": 0.6490719322152211, + "grad_norm": 0.2366732954978943, + "learning_rate": 5.485889863155909e-06, + "loss": 0.196, + "step": 34962 + }, + { + "epoch": 0.6491090623526398, + "grad_norm": 0.3755991756916046, + "learning_rate": 5.4848490154222985e-06, + "loss": 0.2582, + "step": 34964 + }, + { + "epoch": 0.6491461924900584, + "grad_norm": 0.3734539747238159, + "learning_rate": 5.4838082291263705e-06, + "loss": 0.2461, + "step": 34966 + }, + { + "epoch": 0.649183322627477, + "grad_norm": 0.609535276889801, + "learning_rate": 5.482767504282285e-06, + "loss": 0.246, + "step": 34968 + }, + { + "epoch": 0.6492204527648957, + "grad_norm": 0.464240163564682, + "learning_rate": 5.481726840904205e-06, + "loss": 0.2901, + "step": 34970 + }, + { + "epoch": 0.6492575829023143, + "grad_norm": 0.42593303322792053, + "learning_rate": 5.480686239006286e-06, + "loss": 0.3872, + "step": 34972 + }, + { + "epoch": 0.649294713039733, + "grad_norm": 0.30449581146240234, + "learning_rate": 5.4796456986026915e-06, + "loss": 0.3897, + "step": 34974 + }, + { + "epoch": 0.6493318431771516, + "grad_norm": 0.2090335190296173, + "learning_rate": 5.478605219707578e-06, + "loss": 0.3146, + "step": 34976 + }, + { + "epoch": 0.6493689733145702, + "grad_norm": 0.2927362024784088, + "learning_rate": 5.477564802335105e-06, + "loss": 0.1149, + "step": 34978 + }, + { + "epoch": 0.6494061034519889, + "grad_norm": 0.2984867990016937, + "learning_rate": 5.476524446499432e-06, + "loss": 0.312, + "step": 34980 + }, + { + "epoch": 0.6494432335894075, + "grad_norm": 0.3318144679069519, + "learning_rate": 5.475484152214707e-06, + "loss": 0.4555, + "step": 34982 + }, + { + "epoch": 0.6494803637268262, + "grad_norm": 0.41366684436798096, + "learning_rate": 5.474443919495096e-06, + "loss": 0.2546, + "step": 34984 + }, + { + "epoch": 0.6495174938642448, + "grad_norm": 0.2889772653579712, + "learning_rate": 5.473403748354742e-06, + "loss": 0.3224, + "step": 34986 + }, + { + "epoch": 0.6495546240016634, + "grad_norm": 0.4579371511936188, + "learning_rate": 5.472363638807806e-06, + "loss": 0.1557, + "step": 34988 + }, + { + "epoch": 0.6495917541390821, + "grad_norm": 0.5011560320854187, + "learning_rate": 5.471323590868438e-06, + "loss": 0.0507, + "step": 34990 + }, + { + "epoch": 0.6496288842765007, + "grad_norm": 0.2969849407672882, + "learning_rate": 5.470283604550791e-06, + "loss": 0.245, + "step": 34992 + }, + { + "epoch": 0.6496660144139194, + "grad_norm": 0.4154781699180603, + "learning_rate": 5.469243679869017e-06, + "loss": 0.2046, + "step": 34994 + }, + { + "epoch": 0.6497031445513379, + "grad_norm": 0.5398894548416138, + "learning_rate": 5.468203816837267e-06, + "loss": 0.5276, + "step": 34996 + }, + { + "epoch": 0.6497402746887566, + "grad_norm": 0.30955785512924194, + "learning_rate": 5.467164015469688e-06, + "loss": 0.3282, + "step": 34998 + }, + { + "epoch": 0.6497774048261753, + "grad_norm": 0.25512146949768066, + "learning_rate": 5.466124275780427e-06, + "loss": 0.1096, + "step": 35000 + }, + { + "epoch": 0.6498145349635939, + "grad_norm": 0.3012755811214447, + "learning_rate": 5.465084597783639e-06, + "loss": 0.4655, + "step": 35002 + }, + { + "epoch": 0.6498516651010126, + "grad_norm": 0.48194220662117004, + "learning_rate": 5.464044981493461e-06, + "loss": 0.2883, + "step": 35004 + }, + { + "epoch": 0.6498887952384311, + "grad_norm": 0.412505567073822, + "learning_rate": 5.463005426924046e-06, + "loss": 0.1957, + "step": 35006 + }, + { + "epoch": 0.6499259253758498, + "grad_norm": 0.482642263174057, + "learning_rate": 5.461965934089539e-06, + "loss": 0.1745, + "step": 35008 + }, + { + "epoch": 0.6499630555132685, + "grad_norm": 0.5232049226760864, + "learning_rate": 5.46092650300408e-06, + "loss": 0.2497, + "step": 35010 + }, + { + "epoch": 0.6500001856506871, + "grad_norm": 0.5267934799194336, + "learning_rate": 5.459887133681816e-06, + "loss": 0.231, + "step": 35012 + }, + { + "epoch": 0.6500373157881058, + "grad_norm": 0.4003421366214752, + "learning_rate": 5.458847826136887e-06, + "loss": 0.2798, + "step": 35014 + }, + { + "epoch": 0.6500744459255243, + "grad_norm": 0.33312806487083435, + "learning_rate": 5.457808580383438e-06, + "loss": 0.3884, + "step": 35016 + }, + { + "epoch": 0.650111576062943, + "grad_norm": 0.3849543631076813, + "learning_rate": 5.456769396435608e-06, + "loss": 0.3571, + "step": 35018 + }, + { + "epoch": 0.6501487062003617, + "grad_norm": 0.2600293755531311, + "learning_rate": 5.455730274307538e-06, + "loss": 0.1945, + "step": 35020 + }, + { + "epoch": 0.6501858363377803, + "grad_norm": 0.525417149066925, + "learning_rate": 5.454691214013372e-06, + "loss": 0.3436, + "step": 35022 + }, + { + "epoch": 0.650222966475199, + "grad_norm": 0.25680115818977356, + "learning_rate": 5.4536522155672435e-06, + "loss": 0.2754, + "step": 35024 + }, + { + "epoch": 0.6502600966126175, + "grad_norm": 0.37854015827178955, + "learning_rate": 5.452613278983286e-06, + "loss": 0.2252, + "step": 35026 + }, + { + "epoch": 0.6502972267500362, + "grad_norm": 0.35359838604927063, + "learning_rate": 5.451574404275641e-06, + "loss": 0.3363, + "step": 35028 + }, + { + "epoch": 0.6503343568874549, + "grad_norm": 0.1946440190076828, + "learning_rate": 5.450535591458445e-06, + "loss": 0.2218, + "step": 35030 + }, + { + "epoch": 0.6503714870248735, + "grad_norm": 0.36694175004959106, + "learning_rate": 5.449496840545832e-06, + "loss": 0.5371, + "step": 35032 + }, + { + "epoch": 0.6504086171622921, + "grad_norm": 0.3081398606300354, + "learning_rate": 5.448458151551941e-06, + "loss": 0.3088, + "step": 35034 + }, + { + "epoch": 0.6504457472997107, + "grad_norm": 0.45935556292533875, + "learning_rate": 5.447419524490895e-06, + "loss": 0.3653, + "step": 35036 + }, + { + "epoch": 0.6504828774371294, + "grad_norm": 0.3218877911567688, + "learning_rate": 5.446380959376836e-06, + "loss": 0.2441, + "step": 35038 + }, + { + "epoch": 0.6505200075745481, + "grad_norm": 0.440879762172699, + "learning_rate": 5.4453424562238895e-06, + "loss": 0.1952, + "step": 35040 + }, + { + "epoch": 0.6505571377119667, + "grad_norm": 0.3222860097885132, + "learning_rate": 5.444304015046192e-06, + "loss": 0.1587, + "step": 35042 + }, + { + "epoch": 0.6505942678493853, + "grad_norm": 0.30197104811668396, + "learning_rate": 5.4432656358578724e-06, + "loss": 0.2205, + "step": 35044 + }, + { + "epoch": 0.6506313979868039, + "grad_norm": 0.32392019033432007, + "learning_rate": 5.442227318673059e-06, + "loss": 0.2343, + "step": 35046 + }, + { + "epoch": 0.6506685281242226, + "grad_norm": 0.6413139700889587, + "learning_rate": 5.441189063505877e-06, + "loss": 0.301, + "step": 35048 + }, + { + "epoch": 0.6507056582616412, + "grad_norm": 0.23965413868427277, + "learning_rate": 5.4401508703704565e-06, + "loss": 0.362, + "step": 35050 + }, + { + "epoch": 0.6507427883990599, + "grad_norm": 0.3463403880596161, + "learning_rate": 5.439112739280923e-06, + "loss": 0.2613, + "step": 35052 + }, + { + "epoch": 0.6507799185364785, + "grad_norm": 0.40303468704223633, + "learning_rate": 5.438074670251406e-06, + "loss": 0.3733, + "step": 35054 + }, + { + "epoch": 0.6508170486738971, + "grad_norm": 0.36131739616394043, + "learning_rate": 5.437036663296026e-06, + "loss": 0.1621, + "step": 35056 + }, + { + "epoch": 0.6508541788113158, + "grad_norm": 0.35674044489860535, + "learning_rate": 5.4359987184289095e-06, + "loss": 0.253, + "step": 35058 + }, + { + "epoch": 0.6508913089487344, + "grad_norm": 0.6778879165649414, + "learning_rate": 5.434960835664185e-06, + "loss": 0.2688, + "step": 35060 + }, + { + "epoch": 0.650928439086153, + "grad_norm": 0.42753756046295166, + "learning_rate": 5.433923015015965e-06, + "loss": 0.2942, + "step": 35062 + }, + { + "epoch": 0.6509655692235717, + "grad_norm": 0.2883737087249756, + "learning_rate": 5.432885256498375e-06, + "loss": 0.2028, + "step": 35064 + }, + { + "epoch": 0.6510026993609903, + "grad_norm": 1.1320891380310059, + "learning_rate": 5.43184756012554e-06, + "loss": 0.3115, + "step": 35066 + }, + { + "epoch": 0.651039829498409, + "grad_norm": 0.3906393051147461, + "learning_rate": 5.430809925911575e-06, + "loss": 0.5193, + "step": 35068 + }, + { + "epoch": 0.6510769596358276, + "grad_norm": 0.4349411725997925, + "learning_rate": 5.429772353870599e-06, + "loss": 0.1746, + "step": 35070 + }, + { + "epoch": 0.6511140897732463, + "grad_norm": 1.0571234226226807, + "learning_rate": 5.4287348440167365e-06, + "loss": 0.2444, + "step": 35072 + }, + { + "epoch": 0.6511512199106649, + "grad_norm": 0.6107184886932373, + "learning_rate": 5.4276973963640954e-06, + "loss": 0.3767, + "step": 35074 + }, + { + "epoch": 0.6511883500480835, + "grad_norm": 0.2539249062538147, + "learning_rate": 5.4266600109267985e-06, + "loss": 0.1762, + "step": 35076 + }, + { + "epoch": 0.6512254801855022, + "grad_norm": 1.2727235555648804, + "learning_rate": 5.425622687718959e-06, + "loss": 0.3137, + "step": 35078 + }, + { + "epoch": 0.6512626103229208, + "grad_norm": 0.3641892373561859, + "learning_rate": 5.424585426754694e-06, + "loss": 0.3174, + "step": 35080 + }, + { + "epoch": 0.6512997404603394, + "grad_norm": 0.2908001244068146, + "learning_rate": 5.423548228048115e-06, + "loss": 0.3235, + "step": 35082 + }, + { + "epoch": 0.6513368705977581, + "grad_norm": 0.3206023573875427, + "learning_rate": 5.422511091613341e-06, + "loss": 0.1874, + "step": 35084 + }, + { + "epoch": 0.6513740007351767, + "grad_norm": 0.4240186810493469, + "learning_rate": 5.421474017464476e-06, + "loss": 0.2083, + "step": 35086 + }, + { + "epoch": 0.6514111308725954, + "grad_norm": 0.3637961447238922, + "learning_rate": 5.42043700561564e-06, + "loss": 0.3179, + "step": 35088 + }, + { + "epoch": 0.651448261010014, + "grad_norm": 0.49588721990585327, + "learning_rate": 5.419400056080933e-06, + "loss": 0.3298, + "step": 35090 + }, + { + "epoch": 0.6514853911474326, + "grad_norm": 0.40184250473976135, + "learning_rate": 5.418363168874472e-06, + "loss": 0.1944, + "step": 35092 + }, + { + "epoch": 0.6515225212848513, + "grad_norm": 0.8556508421897888, + "learning_rate": 5.417326344010365e-06, + "loss": 0.4215, + "step": 35094 + }, + { + "epoch": 0.6515596514222699, + "grad_norm": 0.3922984302043915, + "learning_rate": 5.4162895815027195e-06, + "loss": 0.2975, + "step": 35096 + }, + { + "epoch": 0.6515967815596886, + "grad_norm": 0.4298214316368103, + "learning_rate": 5.4152528813656465e-06, + "loss": 0.4013, + "step": 35098 + }, + { + "epoch": 0.6516339116971072, + "grad_norm": 0.49251043796539307, + "learning_rate": 5.414216243613246e-06, + "loss": 0.3218, + "step": 35100 + }, + { + "epoch": 0.6516710418345258, + "grad_norm": 0.31654471158981323, + "learning_rate": 5.4131796682596245e-06, + "loss": 0.2833, + "step": 35102 + }, + { + "epoch": 0.6517081719719444, + "grad_norm": 0.24148577451705933, + "learning_rate": 5.412143155318891e-06, + "loss": 0.2764, + "step": 35104 + }, + { + "epoch": 0.6517453021093631, + "grad_norm": 0.19002054631710052, + "learning_rate": 5.411106704805149e-06, + "loss": 0.208, + "step": 35106 + }, + { + "epoch": 0.6517824322467818, + "grad_norm": 0.42621880769729614, + "learning_rate": 5.410070316732497e-06, + "loss": 0.2284, + "step": 35108 + }, + { + "epoch": 0.6518195623842004, + "grad_norm": 0.9535357356071472, + "learning_rate": 5.409033991115043e-06, + "loss": 0.272, + "step": 35110 + }, + { + "epoch": 0.651856692521619, + "grad_norm": 0.3512863218784332, + "learning_rate": 5.40799772796688e-06, + "loss": 0.2321, + "step": 35112 + }, + { + "epoch": 0.6518938226590376, + "grad_norm": 0.2934827506542206, + "learning_rate": 5.406961527302114e-06, + "loss": 0.1707, + "step": 35114 + }, + { + "epoch": 0.6519309527964563, + "grad_norm": 0.5100598335266113, + "learning_rate": 5.405925389134843e-06, + "loss": 0.3555, + "step": 35116 + }, + { + "epoch": 0.651968082933875, + "grad_norm": 0.2400893121957779, + "learning_rate": 5.404889313479168e-06, + "loss": 0.2267, + "step": 35118 + }, + { + "epoch": 0.6520052130712936, + "grad_norm": 0.3128950595855713, + "learning_rate": 5.403853300349185e-06, + "loss": 0.2256, + "step": 35120 + }, + { + "epoch": 0.6520423432087122, + "grad_norm": 0.35240858793258667, + "learning_rate": 5.402817349758995e-06, + "loss": 0.5031, + "step": 35122 + }, + { + "epoch": 0.6520794733461308, + "grad_norm": 0.6541539430618286, + "learning_rate": 5.401781461722687e-06, + "loss": 0.3501, + "step": 35124 + }, + { + "epoch": 0.6521166034835495, + "grad_norm": 0.2375851422548294, + "learning_rate": 5.400745636254361e-06, + "loss": 0.2076, + "step": 35126 + }, + { + "epoch": 0.6521537336209682, + "grad_norm": 0.5388584733009338, + "learning_rate": 5.3997098733681085e-06, + "loss": 0.3356, + "step": 35128 + }, + { + "epoch": 0.6521908637583868, + "grad_norm": 0.3617672920227051, + "learning_rate": 5.398674173078031e-06, + "loss": 0.1341, + "step": 35130 + }, + { + "epoch": 0.6522279938958054, + "grad_norm": 0.30219653248786926, + "learning_rate": 5.397638535398211e-06, + "loss": 0.1757, + "step": 35132 + }, + { + "epoch": 0.652265124033224, + "grad_norm": 0.40171170234680176, + "learning_rate": 5.396602960342748e-06, + "loss": 0.2869, + "step": 35134 + }, + { + "epoch": 0.6523022541706427, + "grad_norm": 0.3007016181945801, + "learning_rate": 5.395567447925727e-06, + "loss": 0.3751, + "step": 35136 + }, + { + "epoch": 0.6523393843080614, + "grad_norm": 0.5373406410217285, + "learning_rate": 5.394531998161241e-06, + "loss": 0.3456, + "step": 35138 + }, + { + "epoch": 0.65237651444548, + "grad_norm": 0.29361391067504883, + "learning_rate": 5.393496611063379e-06, + "loss": 0.3678, + "step": 35140 + }, + { + "epoch": 0.6524136445828986, + "grad_norm": 0.8628339767456055, + "learning_rate": 5.39246128664623e-06, + "loss": 0.2087, + "step": 35142 + }, + { + "epoch": 0.6524507747203172, + "grad_norm": 0.3301093578338623, + "learning_rate": 5.391426024923883e-06, + "loss": 0.3099, + "step": 35144 + }, + { + "epoch": 0.6524879048577359, + "grad_norm": 0.41500940918922424, + "learning_rate": 5.390390825910422e-06, + "loss": 0.2702, + "step": 35146 + }, + { + "epoch": 0.6525250349951545, + "grad_norm": 0.4091249704360962, + "learning_rate": 5.389355689619939e-06, + "loss": 0.4012, + "step": 35148 + }, + { + "epoch": 0.6525621651325731, + "grad_norm": 0.42479413747787476, + "learning_rate": 5.38832061606651e-06, + "loss": 0.2417, + "step": 35150 + }, + { + "epoch": 0.6525992952699918, + "grad_norm": 0.19798994064331055, + "learning_rate": 5.38728560526423e-06, + "loss": 0.1942, + "step": 35152 + }, + { + "epoch": 0.6526364254074104, + "grad_norm": 0.3743791878223419, + "learning_rate": 5.38625065722717e-06, + "loss": 0.2146, + "step": 35154 + }, + { + "epoch": 0.6526735555448291, + "grad_norm": 0.34228742122650146, + "learning_rate": 5.3852157719694185e-06, + "loss": 0.3625, + "step": 35156 + }, + { + "epoch": 0.6527106856822477, + "grad_norm": 0.305728942155838, + "learning_rate": 5.384180949505059e-06, + "loss": 0.4026, + "step": 35158 + }, + { + "epoch": 0.6527478158196663, + "grad_norm": 0.28715023398399353, + "learning_rate": 5.383146189848174e-06, + "loss": 0.2366, + "step": 35160 + }, + { + "epoch": 0.652784945957085, + "grad_norm": 0.483659952878952, + "learning_rate": 5.382111493012836e-06, + "loss": 0.2486, + "step": 35162 + }, + { + "epoch": 0.6528220760945036, + "grad_norm": 0.2783755362033844, + "learning_rate": 5.38107685901313e-06, + "loss": 0.295, + "step": 35164 + }, + { + "epoch": 0.6528592062319223, + "grad_norm": 0.3489808440208435, + "learning_rate": 5.380042287863132e-06, + "loss": 0.1985, + "step": 35166 + }, + { + "epoch": 0.6528963363693409, + "grad_norm": 0.34966909885406494, + "learning_rate": 5.379007779576919e-06, + "loss": 0.3544, + "step": 35168 + }, + { + "epoch": 0.6529334665067595, + "grad_norm": 0.4507407546043396, + "learning_rate": 5.377973334168574e-06, + "loss": 0.3785, + "step": 35170 + }, + { + "epoch": 0.6529705966441782, + "grad_norm": 0.2911249101161957, + "learning_rate": 5.376938951652162e-06, + "loss": 0.0946, + "step": 35172 + }, + { + "epoch": 0.6530077267815968, + "grad_norm": 0.23742179572582245, + "learning_rate": 5.375904632041768e-06, + "loss": 0.2405, + "step": 35174 + }, + { + "epoch": 0.6530448569190155, + "grad_norm": 0.2796684801578522, + "learning_rate": 5.3748703753514575e-06, + "loss": 0.2928, + "step": 35176 + }, + { + "epoch": 0.653081987056434, + "grad_norm": 0.33510711789131165, + "learning_rate": 5.373836181595307e-06, + "loss": 0.249, + "step": 35178 + }, + { + "epoch": 0.6531191171938527, + "grad_norm": 0.3652302622795105, + "learning_rate": 5.372802050787388e-06, + "loss": 0.333, + "step": 35180 + }, + { + "epoch": 0.6531562473312714, + "grad_norm": 0.6881268620491028, + "learning_rate": 5.371767982941774e-06, + "loss": 0.1914, + "step": 35182 + }, + { + "epoch": 0.65319337746869, + "grad_norm": 0.4327228367328644, + "learning_rate": 5.370733978072535e-06, + "loss": 0.3138, + "step": 35184 + }, + { + "epoch": 0.6532305076061087, + "grad_norm": 0.5186774134635925, + "learning_rate": 5.369700036193744e-06, + "loss": 0.2238, + "step": 35186 + }, + { + "epoch": 0.6532676377435273, + "grad_norm": 0.36401671171188354, + "learning_rate": 5.368666157319462e-06, + "loss": 0.4068, + "step": 35188 + }, + { + "epoch": 0.6533047678809459, + "grad_norm": 0.5229595899581909, + "learning_rate": 5.367632341463761e-06, + "loss": 0.2368, + "step": 35190 + }, + { + "epoch": 0.6533418980183646, + "grad_norm": 0.361337274312973, + "learning_rate": 5.366598588640713e-06, + "loss": 0.2177, + "step": 35192 + }, + { + "epoch": 0.6533790281557832, + "grad_norm": 0.30592697858810425, + "learning_rate": 5.3655648988643745e-06, + "loss": 0.2882, + "step": 35194 + }, + { + "epoch": 0.6534161582932019, + "grad_norm": 0.45658719539642334, + "learning_rate": 5.364531272148816e-06, + "loss": 0.2335, + "step": 35196 + }, + { + "epoch": 0.6534532884306204, + "grad_norm": 0.567110538482666, + "learning_rate": 5.363497708508107e-06, + "loss": 0.2479, + "step": 35198 + }, + { + "epoch": 0.6534904185680391, + "grad_norm": 0.4558965265750885, + "learning_rate": 5.362464207956301e-06, + "loss": 0.4069, + "step": 35200 + }, + { + "epoch": 0.6535275487054577, + "grad_norm": 0.44819381833076477, + "learning_rate": 5.361430770507467e-06, + "loss": 0.2461, + "step": 35202 + }, + { + "epoch": 0.6535646788428764, + "grad_norm": 0.5538884401321411, + "learning_rate": 5.360397396175667e-06, + "loss": 0.397, + "step": 35204 + }, + { + "epoch": 0.6536018089802951, + "grad_norm": 0.3635983467102051, + "learning_rate": 5.359364084974961e-06, + "loss": 0.2973, + "step": 35206 + }, + { + "epoch": 0.6536389391177136, + "grad_norm": 0.4137502908706665, + "learning_rate": 5.358330836919407e-06, + "loss": 0.42, + "step": 35208 + }, + { + "epoch": 0.6536760692551323, + "grad_norm": 0.2761116027832031, + "learning_rate": 5.357297652023074e-06, + "loss": 0.1263, + "step": 35210 + }, + { + "epoch": 0.6537131993925509, + "grad_norm": 0.6658402681350708, + "learning_rate": 5.356264530300009e-06, + "loss": 0.1947, + "step": 35212 + }, + { + "epoch": 0.6537503295299696, + "grad_norm": 0.3985794186592102, + "learning_rate": 5.355231471764275e-06, + "loss": 0.3537, + "step": 35214 + }, + { + "epoch": 0.6537874596673883, + "grad_norm": 0.27038154006004333, + "learning_rate": 5.354198476429927e-06, + "loss": 0.2353, + "step": 35216 + }, + { + "epoch": 0.6538245898048068, + "grad_norm": 0.24203923344612122, + "learning_rate": 5.3531655443110205e-06, + "loss": 0.1921, + "step": 35218 + }, + { + "epoch": 0.6538617199422255, + "grad_norm": 0.22974804043769836, + "learning_rate": 5.352132675421613e-06, + "loss": 0.1954, + "step": 35220 + }, + { + "epoch": 0.6538988500796441, + "grad_norm": 0.36859554052352905, + "learning_rate": 5.351099869775757e-06, + "loss": 0.2946, + "step": 35222 + }, + { + "epoch": 0.6539359802170628, + "grad_norm": 0.4116674065589905, + "learning_rate": 5.350067127387513e-06, + "loss": 0.2603, + "step": 35224 + }, + { + "epoch": 0.6539731103544815, + "grad_norm": 0.36322322487831116, + "learning_rate": 5.34903444827092e-06, + "loss": 0.1331, + "step": 35226 + }, + { + "epoch": 0.6540102404919, + "grad_norm": 0.3429400622844696, + "learning_rate": 5.34800183244004e-06, + "loss": 0.1253, + "step": 35228 + }, + { + "epoch": 0.6540473706293187, + "grad_norm": 0.4147558808326721, + "learning_rate": 5.346969279908918e-06, + "loss": 0.2276, + "step": 35230 + }, + { + "epoch": 0.6540845007667373, + "grad_norm": 0.36440587043762207, + "learning_rate": 5.345936790691608e-06, + "loss": 0.4382, + "step": 35232 + }, + { + "epoch": 0.654121630904156, + "grad_norm": 0.4868938624858856, + "learning_rate": 5.344904364802162e-06, + "loss": 0.2929, + "step": 35234 + }, + { + "epoch": 0.6541587610415747, + "grad_norm": 0.3093056380748749, + "learning_rate": 5.343872002254619e-06, + "loss": 0.1217, + "step": 35236 + }, + { + "epoch": 0.6541958911789932, + "grad_norm": 0.34889519214630127, + "learning_rate": 5.342839703063035e-06, + "loss": 0.1853, + "step": 35238 + }, + { + "epoch": 0.6542330213164119, + "grad_norm": 0.4011906385421753, + "learning_rate": 5.34180746724145e-06, + "loss": 0.2749, + "step": 35240 + }, + { + "epoch": 0.6542701514538305, + "grad_norm": 0.47876298427581787, + "learning_rate": 5.340775294803912e-06, + "loss": 0.2958, + "step": 35242 + }, + { + "epoch": 0.6543072815912492, + "grad_norm": 0.5504870414733887, + "learning_rate": 5.339743185764467e-06, + "loss": 0.2764, + "step": 35244 + }, + { + "epoch": 0.6543444117286679, + "grad_norm": 0.7109227776527405, + "learning_rate": 5.338711140137157e-06, + "loss": 0.3367, + "step": 35246 + }, + { + "epoch": 0.6543815418660864, + "grad_norm": 0.5481871366500854, + "learning_rate": 5.337679157936031e-06, + "loss": 0.4409, + "step": 35248 + }, + { + "epoch": 0.6544186720035051, + "grad_norm": 0.431727796792984, + "learning_rate": 5.336647239175121e-06, + "loss": 0.32, + "step": 35250 + }, + { + "epoch": 0.6544558021409237, + "grad_norm": 0.35014235973358154, + "learning_rate": 5.335615383868475e-06, + "loss": 0.4015, + "step": 35252 + }, + { + "epoch": 0.6544929322783424, + "grad_norm": 0.4208662211894989, + "learning_rate": 5.334583592030133e-06, + "loss": 0.2796, + "step": 35254 + }, + { + "epoch": 0.654530062415761, + "grad_norm": 0.5008327960968018, + "learning_rate": 5.333551863674135e-06, + "loss": 0.3238, + "step": 35256 + }, + { + "epoch": 0.6545671925531796, + "grad_norm": 0.3998279869556427, + "learning_rate": 5.332520198814517e-06, + "loss": 0.1599, + "step": 35258 + }, + { + "epoch": 0.6546043226905983, + "grad_norm": 0.4773446321487427, + "learning_rate": 5.3314885974653215e-06, + "loss": 0.4229, + "step": 35260 + }, + { + "epoch": 0.6546414528280169, + "grad_norm": 0.40094083547592163, + "learning_rate": 5.330457059640579e-06, + "loss": 0.2248, + "step": 35262 + }, + { + "epoch": 0.6546785829654356, + "grad_norm": 0.4393852949142456, + "learning_rate": 5.329425585354328e-06, + "loss": 0.2608, + "step": 35264 + }, + { + "epoch": 0.6547157131028541, + "grad_norm": 0.3545730412006378, + "learning_rate": 5.328394174620606e-06, + "loss": 0.3012, + "step": 35266 + }, + { + "epoch": 0.6547528432402728, + "grad_norm": 0.4005003273487091, + "learning_rate": 5.327362827453446e-06, + "loss": 0.2422, + "step": 35268 + }, + { + "epoch": 0.6547899733776915, + "grad_norm": 0.31729578971862793, + "learning_rate": 5.3263315438668805e-06, + "loss": 0.1843, + "step": 35270 + }, + { + "epoch": 0.6548271035151101, + "grad_norm": 0.3156442642211914, + "learning_rate": 5.3253003238749445e-06, + "loss": 0.1221, + "step": 35272 + }, + { + "epoch": 0.6548642336525288, + "grad_norm": 0.25421348214149475, + "learning_rate": 5.324269167491673e-06, + "loss": 0.1928, + "step": 35274 + }, + { + "epoch": 0.6549013637899473, + "grad_norm": 0.4177688658237457, + "learning_rate": 5.323238074731088e-06, + "loss": 0.2804, + "step": 35276 + }, + { + "epoch": 0.654938493927366, + "grad_norm": 0.4600485563278198, + "learning_rate": 5.322207045607228e-06, + "loss": 0.0798, + "step": 35278 + }, + { + "epoch": 0.6549756240647847, + "grad_norm": 0.648827075958252, + "learning_rate": 5.321176080134115e-06, + "loss": 0.2939, + "step": 35280 + }, + { + "epoch": 0.6550127542022033, + "grad_norm": 0.3129625618457794, + "learning_rate": 5.320145178325781e-06, + "loss": 0.2818, + "step": 35282 + }, + { + "epoch": 0.655049884339622, + "grad_norm": 0.4650766849517822, + "learning_rate": 5.319114340196253e-06, + "loss": 0.1527, + "step": 35284 + }, + { + "epoch": 0.6550870144770405, + "grad_norm": 0.528578519821167, + "learning_rate": 5.318083565759562e-06, + "loss": 0.3859, + "step": 35286 + }, + { + "epoch": 0.6551241446144592, + "grad_norm": 0.503020167350769, + "learning_rate": 5.317052855029725e-06, + "loss": 0.2798, + "step": 35288 + }, + { + "epoch": 0.6551612747518779, + "grad_norm": 0.43730732798576355, + "learning_rate": 5.316022208020772e-06, + "loss": 0.4014, + "step": 35290 + }, + { + "epoch": 0.6551984048892965, + "grad_norm": 0.5357770323753357, + "learning_rate": 5.314991624746728e-06, + "loss": 0.367, + "step": 35292 + }, + { + "epoch": 0.6552355350267152, + "grad_norm": 0.5192011594772339, + "learning_rate": 5.3139611052216144e-06, + "loss": 0.1268, + "step": 35294 + }, + { + "epoch": 0.6552726651641337, + "grad_norm": 0.4778147339820862, + "learning_rate": 5.312930649459454e-06, + "loss": 0.2959, + "step": 35296 + }, + { + "epoch": 0.6553097953015524, + "grad_norm": 0.38792353868484497, + "learning_rate": 5.311900257474269e-06, + "loss": 0.2345, + "step": 35298 + }, + { + "epoch": 0.655346925438971, + "grad_norm": 0.6582024693489075, + "learning_rate": 5.310869929280082e-06, + "loss": 0.3735, + "step": 35300 + }, + { + "epoch": 0.6553840555763897, + "grad_norm": 0.29808539152145386, + "learning_rate": 5.309839664890905e-06, + "loss": 0.2849, + "step": 35302 + }, + { + "epoch": 0.6554211857138084, + "grad_norm": 0.37377986311912537, + "learning_rate": 5.308809464320761e-06, + "loss": 0.4981, + "step": 35304 + }, + { + "epoch": 0.6554583158512269, + "grad_norm": 0.6447781920433044, + "learning_rate": 5.3077793275836695e-06, + "loss": 0.2857, + "step": 35306 + }, + { + "epoch": 0.6554954459886456, + "grad_norm": 0.3082626461982727, + "learning_rate": 5.306749254693646e-06, + "loss": 0.201, + "step": 35308 + }, + { + "epoch": 0.6555325761260642, + "grad_norm": 0.3443126678466797, + "learning_rate": 5.305719245664707e-06, + "loss": 0.1756, + "step": 35310 + }, + { + "epoch": 0.6555697062634829, + "grad_norm": 0.419045090675354, + "learning_rate": 5.304689300510873e-06, + "loss": 0.1877, + "step": 35312 + }, + { + "epoch": 0.6556068364009016, + "grad_norm": 0.3775356411933899, + "learning_rate": 5.303659419246148e-06, + "loss": 0.1651, + "step": 35314 + }, + { + "epoch": 0.6556439665383201, + "grad_norm": 0.41566628217697144, + "learning_rate": 5.302629601884552e-06, + "loss": 0.316, + "step": 35316 + }, + { + "epoch": 0.6556810966757388, + "grad_norm": 0.5356295704841614, + "learning_rate": 5.301599848440097e-06, + "loss": 0.2757, + "step": 35318 + }, + { + "epoch": 0.6557182268131574, + "grad_norm": 0.31234022974967957, + "learning_rate": 5.300570158926799e-06, + "loss": 0.3421, + "step": 35320 + }, + { + "epoch": 0.6557553569505761, + "grad_norm": 0.35381388664245605, + "learning_rate": 5.299540533358659e-06, + "loss": 0.1255, + "step": 35322 + }, + { + "epoch": 0.6557924870879948, + "grad_norm": 0.35568520426750183, + "learning_rate": 5.298510971749698e-06, + "loss": 0.1036, + "step": 35324 + }, + { + "epoch": 0.6558296172254133, + "grad_norm": 0.47837957739830017, + "learning_rate": 5.297481474113917e-06, + "loss": 0.2808, + "step": 35326 + }, + { + "epoch": 0.655866747362832, + "grad_norm": 0.3509726822376251, + "learning_rate": 5.2964520404653266e-06, + "loss": 0.4463, + "step": 35328 + }, + { + "epoch": 0.6559038775002506, + "grad_norm": 0.3945107161998749, + "learning_rate": 5.295422670817934e-06, + "loss": 0.4627, + "step": 35330 + }, + { + "epoch": 0.6559410076376693, + "grad_norm": 0.39911481738090515, + "learning_rate": 5.2943933651857484e-06, + "loss": 0.2445, + "step": 35332 + }, + { + "epoch": 0.655978137775088, + "grad_norm": 0.30275222659111023, + "learning_rate": 5.2933641235827735e-06, + "loss": 0.2083, + "step": 35334 + }, + { + "epoch": 0.6560152679125065, + "grad_norm": 0.45349594950675964, + "learning_rate": 5.29233494602302e-06, + "loss": 0.3519, + "step": 35336 + }, + { + "epoch": 0.6560523980499252, + "grad_norm": 0.465545117855072, + "learning_rate": 5.291305832520483e-06, + "loss": 0.2576, + "step": 35338 + }, + { + "epoch": 0.6560895281873438, + "grad_norm": 0.4254024028778076, + "learning_rate": 5.290276783089167e-06, + "loss": 0.2364, + "step": 35340 + }, + { + "epoch": 0.6561266583247625, + "grad_norm": 0.3732546269893646, + "learning_rate": 5.289247797743083e-06, + "loss": 0.2143, + "step": 35342 + }, + { + "epoch": 0.6561637884621812, + "grad_norm": 0.5355303287506104, + "learning_rate": 5.2882188764962214e-06, + "loss": 0.403, + "step": 35344 + }, + { + "epoch": 0.6562009185995997, + "grad_norm": 0.43809232115745544, + "learning_rate": 5.287190019362587e-06, + "loss": 0.3841, + "step": 35346 + }, + { + "epoch": 0.6562380487370184, + "grad_norm": 0.2559813857078552, + "learning_rate": 5.286161226356182e-06, + "loss": 0.1878, + "step": 35348 + }, + { + "epoch": 0.656275178874437, + "grad_norm": 0.2984473705291748, + "learning_rate": 5.285132497491005e-06, + "loss": 0.1147, + "step": 35350 + }, + { + "epoch": 0.6563123090118557, + "grad_norm": 0.37955009937286377, + "learning_rate": 5.284103832781049e-06, + "loss": 0.2031, + "step": 35352 + }, + { + "epoch": 0.6563494391492742, + "grad_norm": 0.5334699749946594, + "learning_rate": 5.283075232240314e-06, + "loss": 0.1036, + "step": 35354 + }, + { + "epoch": 0.6563865692866929, + "grad_norm": 0.4127434194087982, + "learning_rate": 5.282046695882794e-06, + "loss": 0.1393, + "step": 35356 + }, + { + "epoch": 0.6564236994241116, + "grad_norm": 0.3763151466846466, + "learning_rate": 5.281018223722489e-06, + "loss": 0.1747, + "step": 35358 + }, + { + "epoch": 0.6564608295615302, + "grad_norm": 0.3438388407230377, + "learning_rate": 5.279989815773394e-06, + "loss": 0.3463, + "step": 35360 + }, + { + "epoch": 0.6564979596989489, + "grad_norm": 0.465952068567276, + "learning_rate": 5.278961472049498e-06, + "loss": 0.1803, + "step": 35362 + }, + { + "epoch": 0.6565350898363674, + "grad_norm": 0.558900773525238, + "learning_rate": 5.277933192564797e-06, + "loss": 0.4426, + "step": 35364 + }, + { + "epoch": 0.6565722199737861, + "grad_norm": 0.26000380516052246, + "learning_rate": 5.276904977333277e-06, + "loss": 0.1395, + "step": 35366 + }, + { + "epoch": 0.6566093501112048, + "grad_norm": 0.5756716132164001, + "learning_rate": 5.275876826368933e-06, + "loss": 0.22, + "step": 35368 + }, + { + "epoch": 0.6566464802486234, + "grad_norm": 0.31303897500038147, + "learning_rate": 5.274848739685755e-06, + "loss": 0.1896, + "step": 35370 + }, + { + "epoch": 0.6566836103860421, + "grad_norm": 0.17878375947475433, + "learning_rate": 5.273820717297733e-06, + "loss": 0.1589, + "step": 35372 + }, + { + "epoch": 0.6567207405234606, + "grad_norm": 0.37714269757270813, + "learning_rate": 5.272792759218857e-06, + "loss": 0.1716, + "step": 35374 + }, + { + "epoch": 0.6567578706608793, + "grad_norm": 0.631506621837616, + "learning_rate": 5.271764865463109e-06, + "loss": 0.283, + "step": 35376 + }, + { + "epoch": 0.656795000798298, + "grad_norm": 0.3017447888851166, + "learning_rate": 5.270737036044479e-06, + "loss": 0.3847, + "step": 35378 + }, + { + "epoch": 0.6568321309357166, + "grad_norm": 0.3440400958061218, + "learning_rate": 5.2697092709769505e-06, + "loss": 0.3009, + "step": 35380 + }, + { + "epoch": 0.6568692610731353, + "grad_norm": 0.40685343742370605, + "learning_rate": 5.268681570274516e-06, + "loss": 0.2495, + "step": 35382 + }, + { + "epoch": 0.6569063912105538, + "grad_norm": 0.34019628167152405, + "learning_rate": 5.267653933951148e-06, + "loss": 0.1331, + "step": 35384 + }, + { + "epoch": 0.6569435213479725, + "grad_norm": 0.34905216097831726, + "learning_rate": 5.266626362020835e-06, + "loss": 0.2358, + "step": 35386 + }, + { + "epoch": 0.6569806514853912, + "grad_norm": 0.3839201033115387, + "learning_rate": 5.2655988544975636e-06, + "loss": 0.2242, + "step": 35388 + }, + { + "epoch": 0.6570177816228098, + "grad_norm": 0.47935885190963745, + "learning_rate": 5.264571411395306e-06, + "loss": 0.2663, + "step": 35390 + }, + { + "epoch": 0.6570549117602285, + "grad_norm": 0.7915021181106567, + "learning_rate": 5.263544032728047e-06, + "loss": 0.243, + "step": 35392 + }, + { + "epoch": 0.657092041897647, + "grad_norm": 0.6132286190986633, + "learning_rate": 5.262516718509768e-06, + "loss": 0.2173, + "step": 35394 + }, + { + "epoch": 0.6571291720350657, + "grad_norm": 0.35200098156929016, + "learning_rate": 5.2614894687544445e-06, + "loss": 0.266, + "step": 35396 + }, + { + "epoch": 0.6571663021724844, + "grad_norm": 0.28726890683174133, + "learning_rate": 5.2604622834760575e-06, + "loss": 0.3896, + "step": 35398 + }, + { + "epoch": 0.657203432309903, + "grad_norm": 0.41205185651779175, + "learning_rate": 5.259435162688584e-06, + "loss": 0.3514, + "step": 35400 + }, + { + "epoch": 0.6572405624473217, + "grad_norm": 0.830986738204956, + "learning_rate": 5.258408106405996e-06, + "loss": 0.188, + "step": 35402 + }, + { + "epoch": 0.6572776925847402, + "grad_norm": 0.45158135890960693, + "learning_rate": 5.25738111464227e-06, + "loss": 0.2007, + "step": 35404 + }, + { + "epoch": 0.6573148227221589, + "grad_norm": 0.32918915152549744, + "learning_rate": 5.256354187411385e-06, + "loss": 0.3151, + "step": 35406 + }, + { + "epoch": 0.6573519528595775, + "grad_norm": 0.5441567301750183, + "learning_rate": 5.255327324727308e-06, + "loss": 0.2152, + "step": 35408 + }, + { + "epoch": 0.6573890829969962, + "grad_norm": 1.4596136808395386, + "learning_rate": 5.254300526604015e-06, + "loss": 0.2519, + "step": 35410 + }, + { + "epoch": 0.6574262131344148, + "grad_norm": 0.37669867277145386, + "learning_rate": 5.2532737930554776e-06, + "loss": 0.404, + "step": 35412 + }, + { + "epoch": 0.6574633432718334, + "grad_norm": 0.5424704551696777, + "learning_rate": 5.2522471240956645e-06, + "loss": 0.3582, + "step": 35414 + }, + { + "epoch": 0.6575004734092521, + "grad_norm": 0.45767876505851746, + "learning_rate": 5.251220519738546e-06, + "loss": 0.1537, + "step": 35416 + }, + { + "epoch": 0.6575376035466707, + "grad_norm": 0.40727460384368896, + "learning_rate": 5.2501939799980915e-06, + "loss": 0.1774, + "step": 35418 + }, + { + "epoch": 0.6575747336840894, + "grad_norm": 0.40965506434440613, + "learning_rate": 5.24916750488827e-06, + "loss": 0.3089, + "step": 35420 + }, + { + "epoch": 0.657611863821508, + "grad_norm": 0.3258717656135559, + "learning_rate": 5.248141094423049e-06, + "loss": 0.1492, + "step": 35422 + }, + { + "epoch": 0.6576489939589266, + "grad_norm": 0.46241846680641174, + "learning_rate": 5.2471147486163976e-06, + "loss": 0.3617, + "step": 35424 + }, + { + "epoch": 0.6576861240963453, + "grad_norm": 0.28546419739723206, + "learning_rate": 5.246088467482273e-06, + "loss": 0.4672, + "step": 35426 + }, + { + "epoch": 0.6577232542337639, + "grad_norm": 1.5007548332214355, + "learning_rate": 5.245062251034649e-06, + "loss": 0.2412, + "step": 35428 + }, + { + "epoch": 0.6577603843711826, + "grad_norm": 0.41270267963409424, + "learning_rate": 5.244036099287483e-06, + "loss": 0.2759, + "step": 35430 + }, + { + "epoch": 0.6577975145086012, + "grad_norm": 0.23636583983898163, + "learning_rate": 5.243010012254739e-06, + "loss": 0.1365, + "step": 35432 + }, + { + "epoch": 0.6578346446460198, + "grad_norm": 0.44029700756073, + "learning_rate": 5.241983989950379e-06, + "loss": 0.1614, + "step": 35434 + }, + { + "epoch": 0.6578717747834385, + "grad_norm": 0.2560267746448517, + "learning_rate": 5.240958032388366e-06, + "loss": 0.3045, + "step": 35436 + }, + { + "epoch": 0.6579089049208571, + "grad_norm": 0.32967299222946167, + "learning_rate": 5.2399321395826615e-06, + "loss": 0.2045, + "step": 35438 + }, + { + "epoch": 0.6579460350582758, + "grad_norm": 0.39231032133102417, + "learning_rate": 5.23890631154722e-06, + "loss": 0.2917, + "step": 35440 + }, + { + "epoch": 0.6579831651956944, + "grad_norm": 0.3208622634410858, + "learning_rate": 5.237880548296004e-06, + "loss": 0.1383, + "step": 35442 + }, + { + "epoch": 0.658020295333113, + "grad_norm": 0.3608616590499878, + "learning_rate": 5.236854849842967e-06, + "loss": 0.2382, + "step": 35444 + }, + { + "epoch": 0.6580574254705317, + "grad_norm": 0.3875238597393036, + "learning_rate": 5.235829216202073e-06, + "loss": 0.4459, + "step": 35446 + }, + { + "epoch": 0.6580945556079503, + "grad_norm": 0.5234822034835815, + "learning_rate": 5.234803647387269e-06, + "loss": 0.2614, + "step": 35448 + }, + { + "epoch": 0.658131685745369, + "grad_norm": 0.4111554026603699, + "learning_rate": 5.2337781434125175e-06, + "loss": 0.3282, + "step": 35450 + }, + { + "epoch": 0.6581688158827875, + "grad_norm": 0.2506873309612274, + "learning_rate": 5.232752704291766e-06, + "loss": 0.1384, + "step": 35452 + }, + { + "epoch": 0.6582059460202062, + "grad_norm": 0.40214842557907104, + "learning_rate": 5.2317273300389695e-06, + "loss": 0.4022, + "step": 35454 + }, + { + "epoch": 0.6582430761576249, + "grad_norm": 0.5302115082740784, + "learning_rate": 5.230702020668083e-06, + "loss": 0.2811, + "step": 35456 + }, + { + "epoch": 0.6582802062950435, + "grad_norm": 0.5096840262413025, + "learning_rate": 5.2296767761930555e-06, + "loss": 0.4807, + "step": 35458 + }, + { + "epoch": 0.6583173364324622, + "grad_norm": 0.25991812348365784, + "learning_rate": 5.2286515966278375e-06, + "loss": 0.1888, + "step": 35460 + }, + { + "epoch": 0.6583544665698807, + "grad_norm": 0.25767040252685547, + "learning_rate": 5.227626481986383e-06, + "loss": 0.2201, + "step": 35462 + }, + { + "epoch": 0.6583915967072994, + "grad_norm": 0.39043304324150085, + "learning_rate": 5.226601432282636e-06, + "loss": 0.1861, + "step": 35464 + }, + { + "epoch": 0.6584287268447181, + "grad_norm": 0.34052613377571106, + "learning_rate": 5.225576447530543e-06, + "loss": 0.2574, + "step": 35466 + }, + { + "epoch": 0.6584658569821367, + "grad_norm": 0.30887866020202637, + "learning_rate": 5.224551527744057e-06, + "loss": 0.3945, + "step": 35468 + }, + { + "epoch": 0.6585029871195553, + "grad_norm": 0.4929267466068268, + "learning_rate": 5.2235266729371185e-06, + "loss": 0.2169, + "step": 35470 + }, + { + "epoch": 0.6585401172569739, + "grad_norm": 0.46489089727401733, + "learning_rate": 5.222501883123674e-06, + "loss": 0.2385, + "step": 35472 + }, + { + "epoch": 0.6585772473943926, + "grad_norm": 0.3887680470943451, + "learning_rate": 5.221477158317669e-06, + "loss": 0.2796, + "step": 35474 + }, + { + "epoch": 0.6586143775318113, + "grad_norm": 0.5122246146202087, + "learning_rate": 5.220452498533051e-06, + "loss": 0.2728, + "step": 35476 + }, + { + "epoch": 0.6586515076692299, + "grad_norm": 0.2843627333641052, + "learning_rate": 5.2194279037837524e-06, + "loss": 0.2369, + "step": 35478 + }, + { + "epoch": 0.6586886378066485, + "grad_norm": 0.3789721131324768, + "learning_rate": 5.218403374083723e-06, + "loss": 0.3219, + "step": 35480 + }, + { + "epoch": 0.6587257679440671, + "grad_norm": 0.19454063475131989, + "learning_rate": 5.217378909446899e-06, + "loss": 0.3091, + "step": 35482 + }, + { + "epoch": 0.6587628980814858, + "grad_norm": 0.3968220353126526, + "learning_rate": 5.216354509887223e-06, + "loss": 0.3076, + "step": 35484 + }, + { + "epoch": 0.6588000282189045, + "grad_norm": 0.4575442969799042, + "learning_rate": 5.215330175418634e-06, + "loss": 0.6014, + "step": 35486 + }, + { + "epoch": 0.6588371583563231, + "grad_norm": 0.672534167766571, + "learning_rate": 5.214305906055073e-06, + "loss": 0.4092, + "step": 35488 + }, + { + "epoch": 0.6588742884937417, + "grad_norm": 0.3155250549316406, + "learning_rate": 5.2132817018104734e-06, + "loss": 0.2757, + "step": 35490 + }, + { + "epoch": 0.6589114186311603, + "grad_norm": 0.4715367257595062, + "learning_rate": 5.212257562698768e-06, + "loss": 0.1655, + "step": 35492 + }, + { + "epoch": 0.658948548768579, + "grad_norm": 1.056399941444397, + "learning_rate": 5.211233488733897e-06, + "loss": 0.3875, + "step": 35494 + }, + { + "epoch": 0.6589856789059977, + "grad_norm": 0.4571651220321655, + "learning_rate": 5.210209479929793e-06, + "loss": 0.2939, + "step": 35496 + }, + { + "epoch": 0.6590228090434163, + "grad_norm": 0.5640197396278381, + "learning_rate": 5.209185536300392e-06, + "loss": 0.3932, + "step": 35498 + }, + { + "epoch": 0.6590599391808349, + "grad_norm": 0.3530406951904297, + "learning_rate": 5.20816165785963e-06, + "loss": 0.3019, + "step": 35500 + }, + { + "epoch": 0.6590970693182535, + "grad_norm": 0.488593190908432, + "learning_rate": 5.20713784462143e-06, + "loss": 0.296, + "step": 35502 + }, + { + "epoch": 0.6591341994556722, + "grad_norm": 0.2664813995361328, + "learning_rate": 5.2061140965997284e-06, + "loss": 0.176, + "step": 35504 + }, + { + "epoch": 0.6591713295930908, + "grad_norm": 0.3595189154148102, + "learning_rate": 5.205090413808453e-06, + "loss": 0.3127, + "step": 35506 + }, + { + "epoch": 0.6592084597305095, + "grad_norm": 0.42755237221717834, + "learning_rate": 5.204066796261535e-06, + "loss": 0.2745, + "step": 35508 + }, + { + "epoch": 0.6592455898679281, + "grad_norm": 0.5254095792770386, + "learning_rate": 5.203043243972907e-06, + "loss": 0.19, + "step": 35510 + }, + { + "epoch": 0.6592827200053467, + "grad_norm": 0.28370365500450134, + "learning_rate": 5.202019756956489e-06, + "loss": 0.2566, + "step": 35512 + }, + { + "epoch": 0.6593198501427654, + "grad_norm": 0.38276195526123047, + "learning_rate": 5.2009963352262135e-06, + "loss": 0.2152, + "step": 35514 + }, + { + "epoch": 0.659356980280184, + "grad_norm": 0.4603082537651062, + "learning_rate": 5.1999729787959986e-06, + "loss": 0.1963, + "step": 35516 + }, + { + "epoch": 0.6593941104176027, + "grad_norm": 0.44894811511039734, + "learning_rate": 5.198949687679774e-06, + "loss": 0.3753, + "step": 35518 + }, + { + "epoch": 0.6594312405550213, + "grad_norm": 0.43216943740844727, + "learning_rate": 5.197926461891464e-06, + "loss": 0.3356, + "step": 35520 + }, + { + "epoch": 0.6594683706924399, + "grad_norm": 0.3915995955467224, + "learning_rate": 5.196903301444991e-06, + "loss": 0.1585, + "step": 35522 + }, + { + "epoch": 0.6595055008298586, + "grad_norm": 0.35694894194602966, + "learning_rate": 5.195880206354276e-06, + "loss": 0.2996, + "step": 35524 + }, + { + "epoch": 0.6595426309672772, + "grad_norm": 0.41175082325935364, + "learning_rate": 5.194857176633246e-06, + "loss": 0.3438, + "step": 35526 + }, + { + "epoch": 0.6595797611046958, + "grad_norm": 0.3641592860221863, + "learning_rate": 5.193834212295813e-06, + "loss": 0.3659, + "step": 35528 + }, + { + "epoch": 0.6596168912421145, + "grad_norm": 0.32085898518562317, + "learning_rate": 5.1928113133559e-06, + "loss": 0.3078, + "step": 35530 + }, + { + "epoch": 0.6596540213795331, + "grad_norm": 0.3446044325828552, + "learning_rate": 5.1917884798274295e-06, + "loss": 0.2144, + "step": 35532 + }, + { + "epoch": 0.6596911515169518, + "grad_norm": 0.4545195400714874, + "learning_rate": 5.1907657117243124e-06, + "loss": 0.243, + "step": 35534 + }, + { + "epoch": 0.6597282816543704, + "grad_norm": 0.6099935173988342, + "learning_rate": 5.189743009060468e-06, + "loss": 0.4023, + "step": 35536 + }, + { + "epoch": 0.659765411791789, + "grad_norm": 0.31657537817955017, + "learning_rate": 5.188720371849817e-06, + "loss": 0.3495, + "step": 35538 + }, + { + "epoch": 0.6598025419292077, + "grad_norm": 0.5445834994316101, + "learning_rate": 5.187697800106266e-06, + "loss": 0.2018, + "step": 35540 + }, + { + "epoch": 0.6598396720666263, + "grad_norm": 0.36553066968917847, + "learning_rate": 5.186675293843734e-06, + "loss": 0.2207, + "step": 35542 + }, + { + "epoch": 0.659876802204045, + "grad_norm": 0.43657222390174866, + "learning_rate": 5.185652853076133e-06, + "loss": 0.4178, + "step": 35544 + }, + { + "epoch": 0.6599139323414636, + "grad_norm": 0.33869799971580505, + "learning_rate": 5.184630477817376e-06, + "loss": 0.2608, + "step": 35546 + }, + { + "epoch": 0.6599510624788822, + "grad_norm": 0.36574652791023254, + "learning_rate": 5.183608168081375e-06, + "loss": 0.4497, + "step": 35548 + }, + { + "epoch": 0.6599881926163009, + "grad_norm": 0.47389522194862366, + "learning_rate": 5.182585923882044e-06, + "loss": 0.1713, + "step": 35550 + }, + { + "epoch": 0.6600253227537195, + "grad_norm": 0.2933224141597748, + "learning_rate": 5.1815637452332845e-06, + "loss": 0.3438, + "step": 35552 + }, + { + "epoch": 0.6600624528911382, + "grad_norm": 0.5897746682167053, + "learning_rate": 5.180541632149014e-06, + "loss": 0.2287, + "step": 35554 + }, + { + "epoch": 0.6600995830285568, + "grad_norm": 0.3783877193927765, + "learning_rate": 5.179519584643131e-06, + "loss": 0.2508, + "step": 35556 + }, + { + "epoch": 0.6601367131659754, + "grad_norm": 0.3197145462036133, + "learning_rate": 5.178497602729549e-06, + "loss": 0.4348, + "step": 35558 + }, + { + "epoch": 0.660173843303394, + "grad_norm": 0.36324411630630493, + "learning_rate": 5.177475686422172e-06, + "loss": 0.2722, + "step": 35560 + }, + { + "epoch": 0.6602109734408127, + "grad_norm": 0.573016345500946, + "learning_rate": 5.176453835734905e-06, + "loss": 0.5282, + "step": 35562 + }, + { + "epoch": 0.6602481035782314, + "grad_norm": 0.3848489224910736, + "learning_rate": 5.175432050681658e-06, + "loss": 0.3175, + "step": 35564 + }, + { + "epoch": 0.66028523371565, + "grad_norm": 0.51182621717453, + "learning_rate": 5.174410331276325e-06, + "loss": 0.2433, + "step": 35566 + }, + { + "epoch": 0.6603223638530686, + "grad_norm": 0.44513410329818726, + "learning_rate": 5.173388677532813e-06, + "loss": 0.1991, + "step": 35568 + }, + { + "epoch": 0.6603594939904872, + "grad_norm": 0.5000667572021484, + "learning_rate": 5.172367089465025e-06, + "loss": 0.154, + "step": 35570 + }, + { + "epoch": 0.6603966241279059, + "grad_norm": 0.43416404724121094, + "learning_rate": 5.171345567086859e-06, + "loss": 0.1413, + "step": 35572 + }, + { + "epoch": 0.6604337542653246, + "grad_norm": 0.6004547476768494, + "learning_rate": 5.17032411041222e-06, + "loss": 0.4135, + "step": 35574 + }, + { + "epoch": 0.6604708844027432, + "grad_norm": 0.38738057017326355, + "learning_rate": 5.169302719455005e-06, + "loss": 0.4395, + "step": 35576 + }, + { + "epoch": 0.6605080145401618, + "grad_norm": 0.3397410809993744, + "learning_rate": 5.168281394229104e-06, + "loss": 0.1427, + "step": 35578 + }, + { + "epoch": 0.6605451446775804, + "grad_norm": 0.707030177116394, + "learning_rate": 5.167260134748422e-06, + "loss": 0.1641, + "step": 35580 + }, + { + "epoch": 0.6605822748149991, + "grad_norm": 0.3504965007305145, + "learning_rate": 5.166238941026855e-06, + "loss": 0.3173, + "step": 35582 + }, + { + "epoch": 0.6606194049524178, + "grad_norm": 0.4131953716278076, + "learning_rate": 5.165217813078296e-06, + "loss": 0.2435, + "step": 35584 + }, + { + "epoch": 0.6606565350898363, + "grad_norm": 0.3229162395000458, + "learning_rate": 5.16419675091664e-06, + "loss": 0.3132, + "step": 35586 + }, + { + "epoch": 0.660693665227255, + "grad_norm": 0.525147020816803, + "learning_rate": 5.163175754555786e-06, + "loss": 0.2187, + "step": 35588 + }, + { + "epoch": 0.6607307953646736, + "grad_norm": 0.37702298164367676, + "learning_rate": 5.162154824009617e-06, + "loss": 0.3664, + "step": 35590 + }, + { + "epoch": 0.6607679255020923, + "grad_norm": 0.3066100776195526, + "learning_rate": 5.161133959292033e-06, + "loss": 0.1916, + "step": 35592 + }, + { + "epoch": 0.660805055639511, + "grad_norm": 0.315158486366272, + "learning_rate": 5.160113160416918e-06, + "loss": 0.3567, + "step": 35594 + }, + { + "epoch": 0.6608421857769295, + "grad_norm": 0.5466799139976501, + "learning_rate": 5.159092427398172e-06, + "loss": 0.3157, + "step": 35596 + }, + { + "epoch": 0.6608793159143482, + "grad_norm": 0.23543672263622284, + "learning_rate": 5.158071760249673e-06, + "loss": 0.2462, + "step": 35598 + }, + { + "epoch": 0.6609164460517668, + "grad_norm": 0.6227414608001709, + "learning_rate": 5.157051158985315e-06, + "loss": 0.2832, + "step": 35600 + }, + { + "epoch": 0.6609535761891855, + "grad_norm": 0.2572740912437439, + "learning_rate": 5.156030623618987e-06, + "loss": 0.3225, + "step": 35602 + }, + { + "epoch": 0.6609907063266041, + "grad_norm": 0.3736858069896698, + "learning_rate": 5.15501015416457e-06, + "loss": 0.1234, + "step": 35604 + }, + { + "epoch": 0.6610278364640227, + "grad_norm": 0.5968903303146362, + "learning_rate": 5.153989750635952e-06, + "loss": 0.354, + "step": 35606 + }, + { + "epoch": 0.6610649666014414, + "grad_norm": 0.4293021261692047, + "learning_rate": 5.1529694130470175e-06, + "loss": 0.4129, + "step": 35608 + }, + { + "epoch": 0.66110209673886, + "grad_norm": 0.33938416838645935, + "learning_rate": 5.151949141411652e-06, + "loss": 0.2606, + "step": 35610 + }, + { + "epoch": 0.6611392268762787, + "grad_norm": 0.4143626093864441, + "learning_rate": 5.150928935743735e-06, + "loss": 0.2341, + "step": 35612 + }, + { + "epoch": 0.6611763570136973, + "grad_norm": 0.3429454267024994, + "learning_rate": 5.149908796057157e-06, + "loss": 0.3199, + "step": 35614 + }, + { + "epoch": 0.6612134871511159, + "grad_norm": 0.37814095616340637, + "learning_rate": 5.148888722365787e-06, + "loss": 0.2741, + "step": 35616 + }, + { + "epoch": 0.6612506172885346, + "grad_norm": 0.3880716562271118, + "learning_rate": 5.147868714683515e-06, + "loss": 0.2515, + "step": 35618 + }, + { + "epoch": 0.6612877474259532, + "grad_norm": 0.22556045651435852, + "learning_rate": 5.146848773024213e-06, + "loss": 0.3194, + "step": 35620 + }, + { + "epoch": 0.6613248775633719, + "grad_norm": 0.3572297692298889, + "learning_rate": 5.145828897401761e-06, + "loss": 0.3897, + "step": 35622 + }, + { + "epoch": 0.6613620077007905, + "grad_norm": 0.24831829965114594, + "learning_rate": 5.144809087830038e-06, + "loss": 0.1197, + "step": 35624 + }, + { + "epoch": 0.6613991378382091, + "grad_norm": 0.3366273045539856, + "learning_rate": 5.143789344322925e-06, + "loss": 0.3036, + "step": 35626 + }, + { + "epoch": 0.6614362679756278, + "grad_norm": 0.31361690163612366, + "learning_rate": 5.142769666894287e-06, + "loss": 0.1911, + "step": 35628 + }, + { + "epoch": 0.6614733981130464, + "grad_norm": 0.5385131239891052, + "learning_rate": 5.141750055558008e-06, + "loss": 0.2099, + "step": 35630 + }, + { + "epoch": 0.6615105282504651, + "grad_norm": 0.5193853974342346, + "learning_rate": 5.140730510327956e-06, + "loss": 0.2996, + "step": 35632 + }, + { + "epoch": 0.6615476583878837, + "grad_norm": 0.5467257499694824, + "learning_rate": 5.139711031218008e-06, + "loss": 0.4593, + "step": 35634 + }, + { + "epoch": 0.6615847885253023, + "grad_norm": 0.24626703560352325, + "learning_rate": 5.1386916182420375e-06, + "loss": 0.4301, + "step": 35636 + }, + { + "epoch": 0.661621918662721, + "grad_norm": 0.41291165351867676, + "learning_rate": 5.137672271413909e-06, + "loss": 0.3254, + "step": 35638 + }, + { + "epoch": 0.6616590488001396, + "grad_norm": 0.5426741242408752, + "learning_rate": 5.136652990747502e-06, + "loss": 0.3513, + "step": 35640 + }, + { + "epoch": 0.6616961789375583, + "grad_norm": 0.6074190735816956, + "learning_rate": 5.135633776256674e-06, + "loss": 0.3691, + "step": 35642 + }, + { + "epoch": 0.6617333090749768, + "grad_norm": 0.28804171085357666, + "learning_rate": 5.134614627955301e-06, + "loss": 0.2254, + "step": 35644 + }, + { + "epoch": 0.6617704392123955, + "grad_norm": 0.4080308973789215, + "learning_rate": 5.13359554585725e-06, + "loss": 0.2475, + "step": 35646 + }, + { + "epoch": 0.6618075693498142, + "grad_norm": 0.3483557403087616, + "learning_rate": 5.132576529976387e-06, + "loss": 0.3901, + "step": 35648 + }, + { + "epoch": 0.6618446994872328, + "grad_norm": 0.42413410544395447, + "learning_rate": 5.131557580326577e-06, + "loss": 0.3416, + "step": 35650 + }, + { + "epoch": 0.6618818296246515, + "grad_norm": 0.3393104672431946, + "learning_rate": 5.13053869692169e-06, + "loss": 0.2569, + "step": 35652 + }, + { + "epoch": 0.66191895976207, + "grad_norm": 0.43552491068840027, + "learning_rate": 5.129519879775582e-06, + "loss": 0.4348, + "step": 35654 + }, + { + "epoch": 0.6619560898994887, + "grad_norm": 0.1484507918357849, + "learning_rate": 5.128501128902119e-06, + "loss": 0.2366, + "step": 35656 + }, + { + "epoch": 0.6619932200369073, + "grad_norm": 0.2583087980747223, + "learning_rate": 5.127482444315164e-06, + "loss": 0.1096, + "step": 35658 + }, + { + "epoch": 0.662030350174326, + "grad_norm": 0.414387047290802, + "learning_rate": 5.126463826028583e-06, + "loss": 0.4068, + "step": 35660 + }, + { + "epoch": 0.6620674803117447, + "grad_norm": 0.533430278301239, + "learning_rate": 5.125445274056226e-06, + "loss": 0.1305, + "step": 35662 + }, + { + "epoch": 0.6621046104491632, + "grad_norm": 0.25655046105384827, + "learning_rate": 5.124426788411963e-06, + "loss": 0.2066, + "step": 35664 + }, + { + "epoch": 0.6621417405865819, + "grad_norm": 0.39234763383865356, + "learning_rate": 5.123408369109642e-06, + "loss": 0.3543, + "step": 35666 + }, + { + "epoch": 0.6621788707240005, + "grad_norm": 0.43655967712402344, + "learning_rate": 5.122390016163127e-06, + "loss": 0.1866, + "step": 35668 + }, + { + "epoch": 0.6622160008614192, + "grad_norm": 0.6385837197303772, + "learning_rate": 5.121371729586273e-06, + "loss": 0.2823, + "step": 35670 + }, + { + "epoch": 0.6622531309988379, + "grad_norm": 0.31867802143096924, + "learning_rate": 5.120353509392937e-06, + "loss": 0.3769, + "step": 35672 + }, + { + "epoch": 0.6622902611362564, + "grad_norm": 0.30754727125167847, + "learning_rate": 5.1193353555969734e-06, + "loss": 0.2686, + "step": 35674 + }, + { + "epoch": 0.6623273912736751, + "grad_norm": 0.2654276490211487, + "learning_rate": 5.118317268212236e-06, + "loss": 0.1817, + "step": 35676 + }, + { + "epoch": 0.6623645214110937, + "grad_norm": 0.4747851490974426, + "learning_rate": 5.117299247252583e-06, + "loss": 0.4362, + "step": 35678 + }, + { + "epoch": 0.6624016515485124, + "grad_norm": 0.40849828720092773, + "learning_rate": 5.1162812927318584e-06, + "loss": 0.3287, + "step": 35680 + }, + { + "epoch": 0.6624387816859311, + "grad_norm": 0.3860376179218292, + "learning_rate": 5.1152634046639194e-06, + "loss": 0.2716, + "step": 35682 + }, + { + "epoch": 0.6624759118233496, + "grad_norm": 0.6082983016967773, + "learning_rate": 5.114245583062612e-06, + "loss": 0.3003, + "step": 35684 + }, + { + "epoch": 0.6625130419607683, + "grad_norm": 0.4881953001022339, + "learning_rate": 5.113227827941786e-06, + "loss": 0.2605, + "step": 35686 + }, + { + "epoch": 0.6625501720981869, + "grad_norm": 0.26633453369140625, + "learning_rate": 5.112210139315292e-06, + "loss": 0.3383, + "step": 35688 + }, + { + "epoch": 0.6625873022356056, + "grad_norm": 0.30107632279396057, + "learning_rate": 5.111192517196981e-06, + "loss": 0.3308, + "step": 35690 + }, + { + "epoch": 0.6626244323730243, + "grad_norm": 0.4587944746017456, + "learning_rate": 5.110174961600694e-06, + "loss": 0.394, + "step": 35692 + }, + { + "epoch": 0.6626615625104428, + "grad_norm": 0.3761554956436157, + "learning_rate": 5.1091574725402775e-06, + "loss": 0.1361, + "step": 35694 + }, + { + "epoch": 0.6626986926478615, + "grad_norm": 0.49689170718193054, + "learning_rate": 5.108140050029577e-06, + "loss": 0.1904, + "step": 35696 + }, + { + "epoch": 0.6627358227852801, + "grad_norm": 0.3448061943054199, + "learning_rate": 5.10712269408244e-06, + "loss": 0.2597, + "step": 35698 + }, + { + "epoch": 0.6627729529226988, + "grad_norm": 0.2589368522167206, + "learning_rate": 5.10610540471271e-06, + "loss": 0.1484, + "step": 35700 + }, + { + "epoch": 0.6628100830601175, + "grad_norm": 0.3849527835845947, + "learning_rate": 5.105088181934222e-06, + "loss": 0.2872, + "step": 35702 + }, + { + "epoch": 0.662847213197536, + "grad_norm": 0.4294360280036926, + "learning_rate": 5.104071025760827e-06, + "loss": 0.2916, + "step": 35704 + }, + { + "epoch": 0.6628843433349547, + "grad_norm": 0.4944005012512207, + "learning_rate": 5.103053936206356e-06, + "loss": 0.19, + "step": 35706 + }, + { + "epoch": 0.6629214734723733, + "grad_norm": 0.3311578333377838, + "learning_rate": 5.102036913284652e-06, + "loss": 0.2588, + "step": 35708 + }, + { + "epoch": 0.662958603609792, + "grad_norm": 0.2882724702358246, + "learning_rate": 5.1010199570095565e-06, + "loss": 0.1761, + "step": 35710 + }, + { + "epoch": 0.6629957337472105, + "grad_norm": 0.27424556016921997, + "learning_rate": 5.100003067394903e-06, + "loss": 0.3292, + "step": 35712 + }, + { + "epoch": 0.6630328638846292, + "grad_norm": 0.33726057410240173, + "learning_rate": 5.098986244454536e-06, + "loss": 0.2198, + "step": 35714 + }, + { + "epoch": 0.6630699940220479, + "grad_norm": 0.3712612986564636, + "learning_rate": 5.097969488202281e-06, + "loss": 0.1616, + "step": 35716 + }, + { + "epoch": 0.6631071241594665, + "grad_norm": 0.4006690979003906, + "learning_rate": 5.09695279865198e-06, + "loss": 0.2564, + "step": 35718 + }, + { + "epoch": 0.6631442542968852, + "grad_norm": 0.32374322414398193, + "learning_rate": 5.095936175817463e-06, + "loss": 0.2432, + "step": 35720 + }, + { + "epoch": 0.6631813844343037, + "grad_norm": 0.25246912240982056, + "learning_rate": 5.094919619712571e-06, + "loss": 0.1877, + "step": 35722 + }, + { + "epoch": 0.6632185145717224, + "grad_norm": 0.4558045268058777, + "learning_rate": 5.0939031303511246e-06, + "loss": 0.3896, + "step": 35724 + }, + { + "epoch": 0.6632556447091411, + "grad_norm": 0.6932918429374695, + "learning_rate": 5.092886707746962e-06, + "loss": 0.4129, + "step": 35726 + }, + { + "epoch": 0.6632927748465597, + "grad_norm": 0.4563109874725342, + "learning_rate": 5.091870351913916e-06, + "loss": 0.2993, + "step": 35728 + }, + { + "epoch": 0.6633299049839784, + "grad_norm": 0.2583703398704529, + "learning_rate": 5.090854062865811e-06, + "loss": 0.3683, + "step": 35730 + }, + { + "epoch": 0.6633670351213969, + "grad_norm": 0.45667240023612976, + "learning_rate": 5.0898378406164765e-06, + "loss": 0.2917, + "step": 35732 + }, + { + "epoch": 0.6634041652588156, + "grad_norm": 0.5004454255104065, + "learning_rate": 5.088821685179741e-06, + "loss": 0.2969, + "step": 35734 + }, + { + "epoch": 0.6634412953962343, + "grad_norm": 0.40400418639183044, + "learning_rate": 5.087805596569431e-06, + "loss": 0.4543, + "step": 35736 + }, + { + "epoch": 0.6634784255336529, + "grad_norm": 0.3969423174858093, + "learning_rate": 5.0867895747993745e-06, + "loss": 0.2197, + "step": 35738 + }, + { + "epoch": 0.6635155556710716, + "grad_norm": 0.4083348214626312, + "learning_rate": 5.085773619883398e-06, + "loss": 0.3656, + "step": 35740 + }, + { + "epoch": 0.6635526858084901, + "grad_norm": 0.44520920515060425, + "learning_rate": 5.084757731835319e-06, + "loss": 0.1845, + "step": 35742 + }, + { + "epoch": 0.6635898159459088, + "grad_norm": 0.3800777196884155, + "learning_rate": 5.083741910668969e-06, + "loss": 0.3937, + "step": 35744 + }, + { + "epoch": 0.6636269460833275, + "grad_norm": 0.3157062530517578, + "learning_rate": 5.082726156398162e-06, + "loss": 0.087, + "step": 35746 + }, + { + "epoch": 0.6636640762207461, + "grad_norm": 0.5410152673721313, + "learning_rate": 5.081710469036723e-06, + "loss": 0.2215, + "step": 35748 + }, + { + "epoch": 0.6637012063581648, + "grad_norm": 0.4531360864639282, + "learning_rate": 5.080694848598472e-06, + "loss": 0.3463, + "step": 35750 + }, + { + "epoch": 0.6637383364955833, + "grad_norm": 0.45560604333877563, + "learning_rate": 5.079679295097233e-06, + "loss": 0.4001, + "step": 35752 + }, + { + "epoch": 0.663775466633002, + "grad_norm": 0.464783638715744, + "learning_rate": 5.078663808546817e-06, + "loss": 0.2929, + "step": 35754 + }, + { + "epoch": 0.6638125967704206, + "grad_norm": 0.42269477248191833, + "learning_rate": 5.077648388961045e-06, + "loss": 0.4259, + "step": 35756 + }, + { + "epoch": 0.6638497269078393, + "grad_norm": 0.37835606932640076, + "learning_rate": 5.076633036353735e-06, + "loss": 0.1601, + "step": 35758 + }, + { + "epoch": 0.663886857045258, + "grad_norm": 0.37240105867385864, + "learning_rate": 5.075617750738702e-06, + "loss": 0.3516, + "step": 35760 + }, + { + "epoch": 0.6639239871826765, + "grad_norm": 0.27211758494377136, + "learning_rate": 5.0746025321297595e-06, + "loss": 0.2881, + "step": 35762 + }, + { + "epoch": 0.6639611173200952, + "grad_norm": 0.623069703578949, + "learning_rate": 5.07358738054073e-06, + "loss": 0.2386, + "step": 35764 + }, + { + "epoch": 0.6639982474575138, + "grad_norm": 0.27582481503486633, + "learning_rate": 5.072572295985414e-06, + "loss": 0.1577, + "step": 35766 + }, + { + "epoch": 0.6640353775949325, + "grad_norm": 0.30054011940956116, + "learning_rate": 5.071557278477634e-06, + "loss": 0.336, + "step": 35768 + }, + { + "epoch": 0.6640725077323512, + "grad_norm": 0.386131227016449, + "learning_rate": 5.070542328031194e-06, + "loss": 0.1935, + "step": 35770 + }, + { + "epoch": 0.6641096378697697, + "grad_norm": 0.3890044689178467, + "learning_rate": 5.069527444659908e-06, + "loss": 0.4169, + "step": 35772 + }, + { + "epoch": 0.6641467680071884, + "grad_norm": 0.2515128254890442, + "learning_rate": 5.068512628377583e-06, + "loss": 0.2272, + "step": 35774 + }, + { + "epoch": 0.664183898144607, + "grad_norm": 0.35916200280189514, + "learning_rate": 5.067497879198031e-06, + "loss": 0.0774, + "step": 35776 + }, + { + "epoch": 0.6642210282820257, + "grad_norm": 0.5421344041824341, + "learning_rate": 5.0664831971350616e-06, + "loss": 0.2332, + "step": 35778 + }, + { + "epoch": 0.6642581584194444, + "grad_norm": 0.34166672825813293, + "learning_rate": 5.065468582202474e-06, + "loss": 0.3609, + "step": 35780 + }, + { + "epoch": 0.6642952885568629, + "grad_norm": 0.3095262348651886, + "learning_rate": 5.064454034414079e-06, + "loss": 0.1724, + "step": 35782 + }, + { + "epoch": 0.6643324186942816, + "grad_norm": 0.382196307182312, + "learning_rate": 5.063439553783681e-06, + "loss": 0.3086, + "step": 35784 + }, + { + "epoch": 0.6643695488317002, + "grad_norm": 0.3494543433189392, + "learning_rate": 5.062425140325088e-06, + "loss": 0.1381, + "step": 35786 + }, + { + "epoch": 0.6644066789691189, + "grad_norm": 0.3344893753528595, + "learning_rate": 5.061410794052095e-06, + "loss": 0.2822, + "step": 35788 + }, + { + "epoch": 0.6644438091065376, + "grad_norm": 0.3500637114048004, + "learning_rate": 5.060396514978512e-06, + "loss": 0.1001, + "step": 35790 + }, + { + "epoch": 0.6644809392439561, + "grad_norm": 0.30417224764823914, + "learning_rate": 5.059382303118132e-06, + "loss": 0.271, + "step": 35792 + }, + { + "epoch": 0.6645180693813748, + "grad_norm": 0.5784962773323059, + "learning_rate": 5.058368158484761e-06, + "loss": 0.3286, + "step": 35794 + }, + { + "epoch": 0.6645551995187934, + "grad_norm": 0.3390215337276459, + "learning_rate": 5.057354081092198e-06, + "loss": 0.3508, + "step": 35796 + }, + { + "epoch": 0.6645923296562121, + "grad_norm": 0.7221940755844116, + "learning_rate": 5.0563400709542394e-06, + "loss": 0.4065, + "step": 35798 + }, + { + "epoch": 0.6646294597936308, + "grad_norm": 0.31197357177734375, + "learning_rate": 5.055326128084685e-06, + "loss": 0.1717, + "step": 35800 + }, + { + "epoch": 0.6646665899310493, + "grad_norm": 0.25203120708465576, + "learning_rate": 5.054312252497332e-06, + "loss": 0.3156, + "step": 35802 + }, + { + "epoch": 0.664703720068468, + "grad_norm": 0.5010393857955933, + "learning_rate": 5.053298444205978e-06, + "loss": 0.1339, + "step": 35804 + }, + { + "epoch": 0.6647408502058866, + "grad_norm": 0.5132566094398499, + "learning_rate": 5.052284703224413e-06, + "loss": 0.3982, + "step": 35806 + }, + { + "epoch": 0.6647779803433053, + "grad_norm": 0.3334343135356903, + "learning_rate": 5.051271029566435e-06, + "loss": 0.4459, + "step": 35808 + }, + { + "epoch": 0.6648151104807238, + "grad_norm": 0.595818281173706, + "learning_rate": 5.050257423245831e-06, + "loss": 0.2544, + "step": 35810 + }, + { + "epoch": 0.6648522406181425, + "grad_norm": 0.27838897705078125, + "learning_rate": 5.049243884276398e-06, + "loss": 0.4134, + "step": 35812 + }, + { + "epoch": 0.6648893707555612, + "grad_norm": 0.328463077545166, + "learning_rate": 5.048230412671926e-06, + "loss": 0.3704, + "step": 35814 + }, + { + "epoch": 0.6649265008929798, + "grad_norm": 0.3384462594985962, + "learning_rate": 5.0472170084462115e-06, + "loss": 0.1662, + "step": 35816 + }, + { + "epoch": 0.6649636310303985, + "grad_norm": 0.2465285211801529, + "learning_rate": 5.046203671613033e-06, + "loss": 0.1677, + "step": 35818 + }, + { + "epoch": 0.665000761167817, + "grad_norm": 0.2547358572483063, + "learning_rate": 5.045190402186184e-06, + "loss": 0.3787, + "step": 35820 + }, + { + "epoch": 0.6650378913052357, + "grad_norm": 0.3993379473686218, + "learning_rate": 5.044177200179452e-06, + "loss": 0.157, + "step": 35822 + }, + { + "epoch": 0.6650750214426544, + "grad_norm": 0.5293014049530029, + "learning_rate": 5.043164065606625e-06, + "loss": 0.2889, + "step": 35824 + }, + { + "epoch": 0.665112151580073, + "grad_norm": 0.3078661561012268, + "learning_rate": 5.0421509984814875e-06, + "loss": 0.2747, + "step": 35826 + }, + { + "epoch": 0.6651492817174917, + "grad_norm": 0.4443161189556122, + "learning_rate": 5.041137998817828e-06, + "loss": 0.043, + "step": 35828 + }, + { + "epoch": 0.6651864118549102, + "grad_norm": 0.2378215342760086, + "learning_rate": 5.040125066629426e-06, + "loss": 0.3533, + "step": 35830 + }, + { + "epoch": 0.6652235419923289, + "grad_norm": 0.3818085491657257, + "learning_rate": 5.039112201930063e-06, + "loss": 0.1341, + "step": 35832 + }, + { + "epoch": 0.6652606721297476, + "grad_norm": 0.4161611795425415, + "learning_rate": 5.038099404733522e-06, + "loss": 0.3693, + "step": 35834 + }, + { + "epoch": 0.6652978022671662, + "grad_norm": 0.5004638433456421, + "learning_rate": 5.037086675053586e-06, + "loss": 0.3019, + "step": 35836 + }, + { + "epoch": 0.6653349324045849, + "grad_norm": 0.2755444049835205, + "learning_rate": 5.036074012904034e-06, + "loss": 0.2303, + "step": 35838 + }, + { + "epoch": 0.6653720625420034, + "grad_norm": 0.6116583943367004, + "learning_rate": 5.035061418298651e-06, + "loss": 0.5848, + "step": 35840 + }, + { + "epoch": 0.6654091926794221, + "grad_norm": 0.34827920794487, + "learning_rate": 5.034048891251206e-06, + "loss": 0.1568, + "step": 35842 + }, + { + "epoch": 0.6654463228168408, + "grad_norm": 0.544321596622467, + "learning_rate": 5.033036431775481e-06, + "loss": 0.1322, + "step": 35844 + }, + { + "epoch": 0.6654834529542594, + "grad_norm": 0.3289828598499298, + "learning_rate": 5.032024039885253e-06, + "loss": 0.2501, + "step": 35846 + }, + { + "epoch": 0.665520583091678, + "grad_norm": 0.5619469881057739, + "learning_rate": 5.031011715594296e-06, + "loss": 0.2011, + "step": 35848 + }, + { + "epoch": 0.6655577132290966, + "grad_norm": 0.3207385241985321, + "learning_rate": 5.029999458916389e-06, + "loss": 0.2082, + "step": 35850 + }, + { + "epoch": 0.6655948433665153, + "grad_norm": 0.5099183320999146, + "learning_rate": 5.0289872698653e-06, + "loss": 0.2355, + "step": 35852 + }, + { + "epoch": 0.665631973503934, + "grad_norm": 0.4620128273963928, + "learning_rate": 5.027975148454808e-06, + "loss": 0.2548, + "step": 35854 + }, + { + "epoch": 0.6656691036413526, + "grad_norm": 0.4863109886646271, + "learning_rate": 5.026963094698677e-06, + "loss": 0.3173, + "step": 35856 + }, + { + "epoch": 0.6657062337787713, + "grad_norm": 0.3203144669532776, + "learning_rate": 5.025951108610683e-06, + "loss": 0.2457, + "step": 35858 + }, + { + "epoch": 0.6657433639161898, + "grad_norm": 0.28783664107322693, + "learning_rate": 5.024939190204594e-06, + "loss": 0.1186, + "step": 35860 + }, + { + "epoch": 0.6657804940536085, + "grad_norm": 0.30248650908470154, + "learning_rate": 5.0239273394941815e-06, + "loss": 0.0793, + "step": 35862 + }, + { + "epoch": 0.6658176241910271, + "grad_norm": 0.22050417959690094, + "learning_rate": 5.022915556493213e-06, + "loss": 0.2458, + "step": 35864 + }, + { + "epoch": 0.6658547543284458, + "grad_norm": 0.3294491171836853, + "learning_rate": 5.021903841215459e-06, + "loss": 0.4251, + "step": 35866 + }, + { + "epoch": 0.6658918844658644, + "grad_norm": 0.396463543176651, + "learning_rate": 5.020892193674679e-06, + "loss": 0.2561, + "step": 35868 + }, + { + "epoch": 0.665929014603283, + "grad_norm": 0.32678523659706116, + "learning_rate": 5.019880613884642e-06, + "loss": 0.3692, + "step": 35870 + }, + { + "epoch": 0.6659661447407017, + "grad_norm": 0.7674769163131714, + "learning_rate": 5.018869101859116e-06, + "loss": 0.2979, + "step": 35872 + }, + { + "epoch": 0.6660032748781203, + "grad_norm": 0.498659610748291, + "learning_rate": 5.01785765761186e-06, + "loss": 0.2214, + "step": 35874 + }, + { + "epoch": 0.666040405015539, + "grad_norm": 0.3917543888092041, + "learning_rate": 5.0168462811566355e-06, + "loss": 0.2721, + "step": 35876 + }, + { + "epoch": 0.6660775351529576, + "grad_norm": 0.31248512864112854, + "learning_rate": 5.015834972507212e-06, + "loss": 0.3128, + "step": 35878 + }, + { + "epoch": 0.6661146652903762, + "grad_norm": 0.5265284180641174, + "learning_rate": 5.0148237316773405e-06, + "loss": 0.2835, + "step": 35880 + }, + { + "epoch": 0.6661517954277949, + "grad_norm": 0.20794647932052612, + "learning_rate": 5.013812558680785e-06, + "loss": 0.0209, + "step": 35882 + }, + { + "epoch": 0.6661889255652135, + "grad_norm": 0.5394459366798401, + "learning_rate": 5.012801453531306e-06, + "loss": 0.2839, + "step": 35884 + }, + { + "epoch": 0.6662260557026322, + "grad_norm": 0.3130069971084595, + "learning_rate": 5.011790416242661e-06, + "loss": 0.2207, + "step": 35886 + }, + { + "epoch": 0.6662631858400508, + "grad_norm": 2.21299147605896, + "learning_rate": 5.010779446828607e-06, + "loss": 0.1895, + "step": 35888 + }, + { + "epoch": 0.6663003159774694, + "grad_norm": 0.5102929472923279, + "learning_rate": 5.0097685453029045e-06, + "loss": 0.3709, + "step": 35890 + }, + { + "epoch": 0.6663374461148881, + "grad_norm": 0.30325862765312195, + "learning_rate": 5.0087577116793e-06, + "loss": 0.2482, + "step": 35892 + }, + { + "epoch": 0.6663745762523067, + "grad_norm": 0.39358147978782654, + "learning_rate": 5.007746945971557e-06, + "loss": 0.3156, + "step": 35894 + }, + { + "epoch": 0.6664117063897254, + "grad_norm": 0.45120489597320557, + "learning_rate": 5.00673624819342e-06, + "loss": 0.1997, + "step": 35896 + }, + { + "epoch": 0.666448836527144, + "grad_norm": 0.23626482486724854, + "learning_rate": 5.005725618358648e-06, + "loss": 0.2561, + "step": 35898 + }, + { + "epoch": 0.6664859666645626, + "grad_norm": 0.5352849364280701, + "learning_rate": 5.004715056480989e-06, + "loss": 0.2989, + "step": 35900 + }, + { + "epoch": 0.6665230968019813, + "grad_norm": 0.3275187313556671, + "learning_rate": 5.0037045625741955e-06, + "loss": 0.1789, + "step": 35902 + }, + { + "epoch": 0.6665602269393999, + "grad_norm": 0.3067677617073059, + "learning_rate": 5.002694136652021e-06, + "loss": 0.2387, + "step": 35904 + }, + { + "epoch": 0.6665973570768186, + "grad_norm": 0.2859973907470703, + "learning_rate": 5.001683778728208e-06, + "loss": 0.2383, + "step": 35906 + }, + { + "epoch": 0.6666344872142371, + "grad_norm": 0.36118507385253906, + "learning_rate": 5.000673488816506e-06, + "loss": 0.229, + "step": 35908 + }, + { + "epoch": 0.6666716173516558, + "grad_norm": 0.7263296246528625, + "learning_rate": 4.999663266930663e-06, + "loss": 0.37, + "step": 35910 + }, + { + "epoch": 0.6667087474890745, + "grad_norm": 0.35135626792907715, + "learning_rate": 4.9986531130844294e-06, + "loss": 0.1112, + "step": 35912 + }, + { + "epoch": 0.6667458776264931, + "grad_norm": 0.2783950865268707, + "learning_rate": 4.9976430272915425e-06, + "loss": 0.2994, + "step": 35914 + }, + { + "epoch": 0.6667830077639118, + "grad_norm": 0.24082109332084656, + "learning_rate": 4.996633009565753e-06, + "loss": 0.229, + "step": 35916 + }, + { + "epoch": 0.6668201379013303, + "grad_norm": 0.4215172529220581, + "learning_rate": 4.9956230599207985e-06, + "loss": 0.2137, + "step": 35918 + }, + { + "epoch": 0.666857268038749, + "grad_norm": 0.5319555401802063, + "learning_rate": 4.994613178370424e-06, + "loss": 0.2399, + "step": 35920 + }, + { + "epoch": 0.6668943981761677, + "grad_norm": 0.6436839699745178, + "learning_rate": 4.9936033649283725e-06, + "loss": 0.5121, + "step": 35922 + }, + { + "epoch": 0.6669315283135863, + "grad_norm": 0.5118966698646545, + "learning_rate": 4.992593619608382e-06, + "loss": 0.3657, + "step": 35924 + }, + { + "epoch": 0.666968658451005, + "grad_norm": 0.23507390916347504, + "learning_rate": 4.991583942424194e-06, + "loss": 0.0808, + "step": 35926 + }, + { + "epoch": 0.6670057885884235, + "grad_norm": 0.29171034693717957, + "learning_rate": 4.990574333389546e-06, + "loss": 0.1848, + "step": 35928 + }, + { + "epoch": 0.6670429187258422, + "grad_norm": 0.268824964761734, + "learning_rate": 4.9895647925181815e-06, + "loss": 0.2019, + "step": 35930 + }, + { + "epoch": 0.6670800488632609, + "grad_norm": 0.5149436593055725, + "learning_rate": 4.988555319823827e-06, + "loss": 0.1829, + "step": 35932 + }, + { + "epoch": 0.6671171790006795, + "grad_norm": 0.35516873002052307, + "learning_rate": 4.987545915320224e-06, + "loss": 0.2225, + "step": 35934 + }, + { + "epoch": 0.6671543091380981, + "grad_norm": 0.5018120408058167, + "learning_rate": 4.986536579021111e-06, + "loss": 0.4053, + "step": 35936 + }, + { + "epoch": 0.6671914392755167, + "grad_norm": 0.3599507808685303, + "learning_rate": 4.9855273109402135e-06, + "loss": 0.3057, + "step": 35938 + }, + { + "epoch": 0.6672285694129354, + "grad_norm": 0.44632601737976074, + "learning_rate": 4.98451811109127e-06, + "loss": 0.1872, + "step": 35940 + }, + { + "epoch": 0.6672656995503541, + "grad_norm": 0.3860042691230774, + "learning_rate": 4.983508979488016e-06, + "loss": 0.3172, + "step": 35942 + }, + { + "epoch": 0.6673028296877727, + "grad_norm": 0.4096100926399231, + "learning_rate": 4.982499916144175e-06, + "loss": 0.3765, + "step": 35944 + }, + { + "epoch": 0.6673399598251913, + "grad_norm": 0.439312219619751, + "learning_rate": 4.981490921073479e-06, + "loss": 0.1419, + "step": 35946 + }, + { + "epoch": 0.6673770899626099, + "grad_norm": 0.22904759645462036, + "learning_rate": 4.980481994289661e-06, + "loss": 0.2394, + "step": 35948 + }, + { + "epoch": 0.6674142201000286, + "grad_norm": 0.2927766442298889, + "learning_rate": 4.979473135806448e-06, + "loss": 0.307, + "step": 35950 + }, + { + "epoch": 0.6674513502374473, + "grad_norm": 0.5472964644432068, + "learning_rate": 4.978464345637567e-06, + "loss": 0.4367, + "step": 35952 + }, + { + "epoch": 0.6674884803748659, + "grad_norm": 0.4023457467556, + "learning_rate": 4.977455623796749e-06, + "loss": 0.269, + "step": 35954 + }, + { + "epoch": 0.6675256105122845, + "grad_norm": 0.2702849507331848, + "learning_rate": 4.976446970297711e-06, + "loss": 0.3737, + "step": 35956 + }, + { + "epoch": 0.6675627406497031, + "grad_norm": 0.6069836616516113, + "learning_rate": 4.975438385154188e-06, + "loss": 0.3533, + "step": 35958 + }, + { + "epoch": 0.6675998707871218, + "grad_norm": 0.26242130994796753, + "learning_rate": 4.974429868379893e-06, + "loss": 0.3339, + "step": 35960 + }, + { + "epoch": 0.6676370009245404, + "grad_norm": 0.5177432298660278, + "learning_rate": 4.973421419988555e-06, + "loss": 0.3557, + "step": 35962 + }, + { + "epoch": 0.667674131061959, + "grad_norm": 0.3699071407318115, + "learning_rate": 4.972413039993895e-06, + "loss": 0.2702, + "step": 35964 + }, + { + "epoch": 0.6677112611993777, + "grad_norm": 0.35056543350219727, + "learning_rate": 4.971404728409633e-06, + "loss": 0.2996, + "step": 35966 + }, + { + "epoch": 0.6677483913367963, + "grad_norm": 0.3380669355392456, + "learning_rate": 4.970396485249496e-06, + "loss": 0.1783, + "step": 35968 + }, + { + "epoch": 0.667785521474215, + "grad_norm": 0.39701929688453674, + "learning_rate": 4.969388310527192e-06, + "loss": 0.4205, + "step": 35970 + }, + { + "epoch": 0.6678226516116336, + "grad_norm": 0.43648093938827515, + "learning_rate": 4.9683802042564455e-06, + "loss": 0.3462, + "step": 35972 + }, + { + "epoch": 0.6678597817490523, + "grad_norm": 0.42095333337783813, + "learning_rate": 4.967372166450973e-06, + "loss": 0.3145, + "step": 35974 + }, + { + "epoch": 0.6678969118864709, + "grad_norm": 0.4743790030479431, + "learning_rate": 4.966364197124494e-06, + "loss": 0.1249, + "step": 35976 + }, + { + "epoch": 0.6679340420238895, + "grad_norm": 0.5376197695732117, + "learning_rate": 4.965356296290718e-06, + "loss": 0.2832, + "step": 35978 + }, + { + "epoch": 0.6679711721613082, + "grad_norm": 0.48700010776519775, + "learning_rate": 4.9643484639633655e-06, + "loss": 0.0408, + "step": 35980 + }, + { + "epoch": 0.6680083022987268, + "grad_norm": 0.47989457845687866, + "learning_rate": 4.963340700156143e-06, + "loss": 0.1935, + "step": 35982 + }, + { + "epoch": 0.6680454324361454, + "grad_norm": 0.5127720832824707, + "learning_rate": 4.9623330048827665e-06, + "loss": 0.6321, + "step": 35984 + }, + { + "epoch": 0.6680825625735641, + "grad_norm": 0.5668118596076965, + "learning_rate": 4.961325378156949e-06, + "loss": 0.2296, + "step": 35986 + }, + { + "epoch": 0.6681196927109827, + "grad_norm": 0.33296075463294983, + "learning_rate": 4.960317819992401e-06, + "loss": 0.2721, + "step": 35988 + }, + { + "epoch": 0.6681568228484014, + "grad_norm": 0.4003893733024597, + "learning_rate": 4.959310330402831e-06, + "loss": 0.3534, + "step": 35990 + }, + { + "epoch": 0.66819395298582, + "grad_norm": 0.5721896886825562, + "learning_rate": 4.9583029094019534e-06, + "loss": 0.2316, + "step": 35992 + }, + { + "epoch": 0.6682310831232386, + "grad_norm": 0.6361413598060608, + "learning_rate": 4.957295557003467e-06, + "loss": 0.4689, + "step": 35994 + }, + { + "epoch": 0.6682682132606573, + "grad_norm": 0.23571045696735382, + "learning_rate": 4.956288273221084e-06, + "loss": 0.2726, + "step": 35996 + }, + { + "epoch": 0.6683053433980759, + "grad_norm": 0.3729521334171295, + "learning_rate": 4.955281058068513e-06, + "loss": 0.2721, + "step": 35998 + }, + { + "epoch": 0.6683424735354946, + "grad_norm": 0.34197014570236206, + "learning_rate": 4.954273911559453e-06, + "loss": 0.2081, + "step": 36000 + }, + { + "epoch": 0.6683796036729132, + "grad_norm": 0.41357097029685974, + "learning_rate": 4.95326683370761e-06, + "loss": 0.269, + "step": 36002 + }, + { + "epoch": 0.6684167338103318, + "grad_norm": 0.5535535216331482, + "learning_rate": 4.952259824526694e-06, + "loss": 0.3703, + "step": 36004 + }, + { + "epoch": 0.6684538639477504, + "grad_norm": 0.427957147359848, + "learning_rate": 4.9512528840303984e-06, + "loss": 0.23, + "step": 36006 + }, + { + "epoch": 0.6684909940851691, + "grad_norm": 0.2387382835149765, + "learning_rate": 4.950246012232427e-06, + "loss": 0.165, + "step": 36008 + }, + { + "epoch": 0.6685281242225878, + "grad_norm": 0.19637629389762878, + "learning_rate": 4.949239209146483e-06, + "loss": 0.2396, + "step": 36010 + }, + { + "epoch": 0.6685652543600064, + "grad_norm": 0.3589199483394623, + "learning_rate": 4.948232474786263e-06, + "loss": 0.395, + "step": 36012 + }, + { + "epoch": 0.668602384497425, + "grad_norm": 0.2703711688518524, + "learning_rate": 4.947225809165469e-06, + "loss": 0.2584, + "step": 36014 + }, + { + "epoch": 0.6686395146348436, + "grad_norm": 0.5052081942558289, + "learning_rate": 4.946219212297796e-06, + "loss": 0.192, + "step": 36016 + }, + { + "epoch": 0.6686766447722623, + "grad_norm": 0.39090025424957275, + "learning_rate": 4.945212684196945e-06, + "loss": 0.3031, + "step": 36018 + }, + { + "epoch": 0.668713774909681, + "grad_norm": 0.37972021102905273, + "learning_rate": 4.94420622487661e-06, + "loss": 0.2339, + "step": 36020 + }, + { + "epoch": 0.6687509050470996, + "grad_norm": 0.28349170088768005, + "learning_rate": 4.94319983435048e-06, + "loss": 0.1602, + "step": 36022 + }, + { + "epoch": 0.6687880351845182, + "grad_norm": 0.5201715230941772, + "learning_rate": 4.942193512632252e-06, + "loss": 0.2667, + "step": 36024 + }, + { + "epoch": 0.6688251653219368, + "grad_norm": 0.31312376260757446, + "learning_rate": 4.941187259735622e-06, + "loss": 0.3804, + "step": 36026 + }, + { + "epoch": 0.6688622954593555, + "grad_norm": 0.4552910029888153, + "learning_rate": 4.9401810756742795e-06, + "loss": 0.4987, + "step": 36028 + }, + { + "epoch": 0.6688994255967742, + "grad_norm": 0.3306925296783447, + "learning_rate": 4.939174960461921e-06, + "loss": 0.1924, + "step": 36030 + }, + { + "epoch": 0.6689365557341928, + "grad_norm": 0.3504699170589447, + "learning_rate": 4.9381689141122294e-06, + "loss": 0.3078, + "step": 36032 + }, + { + "epoch": 0.6689736858716114, + "grad_norm": 0.25688788294792175, + "learning_rate": 4.937162936638895e-06, + "loss": 0.1717, + "step": 36034 + }, + { + "epoch": 0.66901081600903, + "grad_norm": 0.4880479872226715, + "learning_rate": 4.936157028055609e-06, + "loss": 0.1504, + "step": 36036 + }, + { + "epoch": 0.6690479461464487, + "grad_norm": 0.383085161447525, + "learning_rate": 4.935151188376058e-06, + "loss": 0.4494, + "step": 36038 + }, + { + "epoch": 0.6690850762838674, + "grad_norm": 0.4531101882457733, + "learning_rate": 4.934145417613933e-06, + "loss": 0.2313, + "step": 36040 + }, + { + "epoch": 0.669122206421286, + "grad_norm": 0.3811333477497101, + "learning_rate": 4.93313971578291e-06, + "loss": 0.3018, + "step": 36042 + }, + { + "epoch": 0.6691593365587046, + "grad_norm": 0.9566578269004822, + "learning_rate": 4.932134082896682e-06, + "loss": 0.2951, + "step": 36044 + }, + { + "epoch": 0.6691964666961232, + "grad_norm": 0.5044624209403992, + "learning_rate": 4.9311285189689265e-06, + "loss": 0.221, + "step": 36046 + }, + { + "epoch": 0.6692335968335419, + "grad_norm": 0.2554212212562561, + "learning_rate": 4.930123024013328e-06, + "loss": 0.2427, + "step": 36048 + }, + { + "epoch": 0.6692707269709606, + "grad_norm": 0.1995251178741455, + "learning_rate": 4.929117598043569e-06, + "loss": 0.1658, + "step": 36050 + }, + { + "epoch": 0.6693078571083791, + "grad_norm": 0.2876911163330078, + "learning_rate": 4.92811224107333e-06, + "loss": 0.1636, + "step": 36052 + }, + { + "epoch": 0.6693449872457978, + "grad_norm": 0.3296375572681427, + "learning_rate": 4.927106953116293e-06, + "loss": 0.3489, + "step": 36054 + }, + { + "epoch": 0.6693821173832164, + "grad_norm": 0.3932287096977234, + "learning_rate": 4.926101734186139e-06, + "loss": 0.4067, + "step": 36056 + }, + { + "epoch": 0.6694192475206351, + "grad_norm": 0.3363531529903412, + "learning_rate": 4.925096584296536e-06, + "loss": 0.1688, + "step": 36058 + }, + { + "epoch": 0.6694563776580537, + "grad_norm": 0.37586840987205505, + "learning_rate": 4.924091503461169e-06, + "loss": 0.2341, + "step": 36060 + }, + { + "epoch": 0.6694935077954723, + "grad_norm": 0.4679376184940338, + "learning_rate": 4.923086491693717e-06, + "loss": 0.2759, + "step": 36062 + }, + { + "epoch": 0.669530637932891, + "grad_norm": 0.49608898162841797, + "learning_rate": 4.922081549007847e-06, + "loss": 0.2325, + "step": 36064 + }, + { + "epoch": 0.6695677680703096, + "grad_norm": 0.5918713212013245, + "learning_rate": 4.921076675417235e-06, + "loss": 0.2805, + "step": 36066 + }, + { + "epoch": 0.6696048982077283, + "grad_norm": 0.2984234690666199, + "learning_rate": 4.920071870935562e-06, + "loss": 0.1844, + "step": 36068 + }, + { + "epoch": 0.6696420283451469, + "grad_norm": 0.402296781539917, + "learning_rate": 4.91906713557649e-06, + "loss": 0.2175, + "step": 36070 + }, + { + "epoch": 0.6696791584825655, + "grad_norm": 0.4155738353729248, + "learning_rate": 4.918062469353695e-06, + "loss": 0.2917, + "step": 36072 + }, + { + "epoch": 0.6697162886199842, + "grad_norm": 0.2432553470134735, + "learning_rate": 4.9170578722808486e-06, + "loss": 0.2065, + "step": 36074 + }, + { + "epoch": 0.6697534187574028, + "grad_norm": 0.6407560706138611, + "learning_rate": 4.916053344371618e-06, + "loss": 0.3182, + "step": 36076 + }, + { + "epoch": 0.6697905488948215, + "grad_norm": 0.3637886047363281, + "learning_rate": 4.915048885639675e-06, + "loss": 0.2615, + "step": 36078 + }, + { + "epoch": 0.66982767903224, + "grad_norm": 0.25983428955078125, + "learning_rate": 4.914044496098687e-06, + "loss": 0.2402, + "step": 36080 + }, + { + "epoch": 0.6698648091696587, + "grad_norm": 0.46867635846138, + "learning_rate": 4.913040175762318e-06, + "loss": 0.3428, + "step": 36082 + }, + { + "epoch": 0.6699019393070774, + "grad_norm": 0.3947322964668274, + "learning_rate": 4.912035924644237e-06, + "loss": 0.3696, + "step": 36084 + }, + { + "epoch": 0.669939069444496, + "grad_norm": 0.46786293387413025, + "learning_rate": 4.911031742758103e-06, + "loss": 0.2109, + "step": 36086 + }, + { + "epoch": 0.6699761995819147, + "grad_norm": 0.3154354393482208, + "learning_rate": 4.910027630117585e-06, + "loss": 0.3597, + "step": 36088 + }, + { + "epoch": 0.6700133297193333, + "grad_norm": 0.4669903516769409, + "learning_rate": 4.909023586736344e-06, + "loss": 0.2733, + "step": 36090 + }, + { + "epoch": 0.6700504598567519, + "grad_norm": 0.5217243432998657, + "learning_rate": 4.908019612628043e-06, + "loss": 0.2234, + "step": 36092 + }, + { + "epoch": 0.6700875899941706, + "grad_norm": 0.25635436177253723, + "learning_rate": 4.907015707806347e-06, + "loss": 0.3151, + "step": 36094 + }, + { + "epoch": 0.6701247201315892, + "grad_norm": 0.32063689827919006, + "learning_rate": 4.906011872284907e-06, + "loss": 0.1628, + "step": 36096 + }, + { + "epoch": 0.6701618502690079, + "grad_norm": 0.49992769956588745, + "learning_rate": 4.905008106077387e-06, + "loss": 0.392, + "step": 36098 + }, + { + "epoch": 0.6701989804064264, + "grad_norm": 0.3122595250606537, + "learning_rate": 4.904004409197446e-06, + "loss": 0.3194, + "step": 36100 + }, + { + "epoch": 0.6702361105438451, + "grad_norm": 0.4619857966899872, + "learning_rate": 4.903000781658741e-06, + "loss": 0.2745, + "step": 36102 + }, + { + "epoch": 0.6702732406812638, + "grad_norm": 0.35484379529953003, + "learning_rate": 4.901997223474933e-06, + "loss": 0.3481, + "step": 36104 + }, + { + "epoch": 0.6703103708186824, + "grad_norm": 0.6152006387710571, + "learning_rate": 4.900993734659671e-06, + "loss": 0.2162, + "step": 36106 + }, + { + "epoch": 0.6703475009561011, + "grad_norm": 0.39619210362434387, + "learning_rate": 4.899990315226607e-06, + "loss": 0.4153, + "step": 36108 + }, + { + "epoch": 0.6703846310935196, + "grad_norm": 0.3276269733905792, + "learning_rate": 4.898986965189398e-06, + "loss": 0.2386, + "step": 36110 + }, + { + "epoch": 0.6704217612309383, + "grad_norm": 0.44733211398124695, + "learning_rate": 4.897983684561699e-06, + "loss": 0.1921, + "step": 36112 + }, + { + "epoch": 0.6704588913683569, + "grad_norm": 0.4223423898220062, + "learning_rate": 4.896980473357158e-06, + "loss": 0.3669, + "step": 36114 + }, + { + "epoch": 0.6704960215057756, + "grad_norm": 0.33408883213996887, + "learning_rate": 4.8959773315894285e-06, + "loss": 0.2087, + "step": 36116 + }, + { + "epoch": 0.6705331516431943, + "grad_norm": 0.2544315457344055, + "learning_rate": 4.894974259272162e-06, + "loss": 0.3602, + "step": 36118 + }, + { + "epoch": 0.6705702817806128, + "grad_norm": 0.3726828098297119, + "learning_rate": 4.893971256419003e-06, + "loss": 0.186, + "step": 36120 + }, + { + "epoch": 0.6706074119180315, + "grad_norm": 0.33832740783691406, + "learning_rate": 4.8929683230435984e-06, + "loss": 0.4682, + "step": 36122 + }, + { + "epoch": 0.6706445420554501, + "grad_norm": 0.3523814380168915, + "learning_rate": 4.891965459159599e-06, + "loss": 0.1802, + "step": 36124 + }, + { + "epoch": 0.6706816721928688, + "grad_norm": 0.438400000333786, + "learning_rate": 4.890962664780652e-06, + "loss": 0.3291, + "step": 36126 + }, + { + "epoch": 0.6707188023302875, + "grad_norm": 0.3655209243297577, + "learning_rate": 4.889959939920396e-06, + "loss": 0.1971, + "step": 36128 + }, + { + "epoch": 0.670755932467706, + "grad_norm": 0.32059258222579956, + "learning_rate": 4.888957284592484e-06, + "loss": 0.266, + "step": 36130 + }, + { + "epoch": 0.6707930626051247, + "grad_norm": 2.950822591781616, + "learning_rate": 4.88795469881055e-06, + "loss": 0.2509, + "step": 36132 + }, + { + "epoch": 0.6708301927425433, + "grad_norm": 0.4147107005119324, + "learning_rate": 4.886952182588239e-06, + "loss": 0.3286, + "step": 36134 + }, + { + "epoch": 0.670867322879962, + "grad_norm": 0.38981154561042786, + "learning_rate": 4.885949735939195e-06, + "loss": 0.2042, + "step": 36136 + }, + { + "epoch": 0.6709044530173807, + "grad_norm": 0.41154104471206665, + "learning_rate": 4.884947358877056e-06, + "loss": 0.2709, + "step": 36138 + }, + { + "epoch": 0.6709415831547992, + "grad_norm": 0.3203439712524414, + "learning_rate": 4.883945051415462e-06, + "loss": 0.3776, + "step": 36140 + }, + { + "epoch": 0.6709787132922179, + "grad_norm": 0.5013834834098816, + "learning_rate": 4.882942813568051e-06, + "loss": 0.2415, + "step": 36142 + }, + { + "epoch": 0.6710158434296365, + "grad_norm": 0.25282618403434753, + "learning_rate": 4.881940645348465e-06, + "loss": 0.2675, + "step": 36144 + }, + { + "epoch": 0.6710529735670552, + "grad_norm": 0.3100430965423584, + "learning_rate": 4.880938546770333e-06, + "loss": 0.3344, + "step": 36146 + }, + { + "epoch": 0.6710901037044739, + "grad_norm": 0.4388672709465027, + "learning_rate": 4.879936517847298e-06, + "loss": 0.2189, + "step": 36148 + }, + { + "epoch": 0.6711272338418924, + "grad_norm": 0.5125184655189514, + "learning_rate": 4.878934558592986e-06, + "loss": 0.2662, + "step": 36150 + }, + { + "epoch": 0.6711643639793111, + "grad_norm": 0.47947296500205994, + "learning_rate": 4.877932669021036e-06, + "loss": 0.1638, + "step": 36152 + }, + { + "epoch": 0.6712014941167297, + "grad_norm": 0.34604835510253906, + "learning_rate": 4.876930849145079e-06, + "loss": 0.2757, + "step": 36154 + }, + { + "epoch": 0.6712386242541484, + "grad_norm": 0.32241278886795044, + "learning_rate": 4.875929098978753e-06, + "loss": 0.2125, + "step": 36156 + }, + { + "epoch": 0.671275754391567, + "grad_norm": 0.4844227731227875, + "learning_rate": 4.8749274185356796e-06, + "loss": 0.4033, + "step": 36158 + }, + { + "epoch": 0.6713128845289856, + "grad_norm": 0.3899403512477875, + "learning_rate": 4.873925807829492e-06, + "loss": 0.3905, + "step": 36160 + }, + { + "epoch": 0.6713500146664043, + "grad_norm": 0.26708874106407166, + "learning_rate": 4.87292426687382e-06, + "loss": 0.2811, + "step": 36162 + }, + { + "epoch": 0.6713871448038229, + "grad_norm": 0.38783735036849976, + "learning_rate": 4.8719227956822915e-06, + "loss": 0.343, + "step": 36164 + }, + { + "epoch": 0.6714242749412416, + "grad_norm": 0.4256456196308136, + "learning_rate": 4.870921394268537e-06, + "loss": 0.3944, + "step": 36166 + }, + { + "epoch": 0.6714614050786601, + "grad_norm": 0.4007189869880676, + "learning_rate": 4.869920062646175e-06, + "loss": 0.3168, + "step": 36168 + }, + { + "epoch": 0.6714985352160788, + "grad_norm": 0.48081111907958984, + "learning_rate": 4.868918800828839e-06, + "loss": 0.3104, + "step": 36170 + }, + { + "epoch": 0.6715356653534975, + "grad_norm": 0.22749769687652588, + "learning_rate": 4.867917608830145e-06, + "loss": 0.2267, + "step": 36172 + }, + { + "epoch": 0.6715727954909161, + "grad_norm": 0.3982134163379669, + "learning_rate": 4.866916486663719e-06, + "loss": 0.2069, + "step": 36174 + }, + { + "epoch": 0.6716099256283348, + "grad_norm": 0.3250977098941803, + "learning_rate": 4.865915434343185e-06, + "loss": 0.2538, + "step": 36176 + }, + { + "epoch": 0.6716470557657533, + "grad_norm": 0.6489495038986206, + "learning_rate": 4.8649144518821634e-06, + "loss": 0.3035, + "step": 36178 + }, + { + "epoch": 0.671684185903172, + "grad_norm": 0.30447709560394287, + "learning_rate": 4.863913539294275e-06, + "loss": 0.3421, + "step": 36180 + }, + { + "epoch": 0.6717213160405907, + "grad_norm": 0.4187319278717041, + "learning_rate": 4.8629126965931416e-06, + "loss": 0.2431, + "step": 36182 + }, + { + "epoch": 0.6717584461780093, + "grad_norm": 0.24698862433433533, + "learning_rate": 4.861911923792377e-06, + "loss": 0.2386, + "step": 36184 + }, + { + "epoch": 0.671795576315428, + "grad_norm": 0.3094417154788971, + "learning_rate": 4.8609112209056e-06, + "loss": 0.4169, + "step": 36186 + }, + { + "epoch": 0.6718327064528465, + "grad_norm": 0.18941430747509003, + "learning_rate": 4.859910587946427e-06, + "loss": 0.2524, + "step": 36188 + }, + { + "epoch": 0.6718698365902652, + "grad_norm": 0.36202120780944824, + "learning_rate": 4.8589100249284795e-06, + "loss": 0.1894, + "step": 36190 + }, + { + "epoch": 0.6719069667276839, + "grad_norm": 0.4387712776660919, + "learning_rate": 4.857909531865362e-06, + "loss": 0.2049, + "step": 36192 + }, + { + "epoch": 0.6719440968651025, + "grad_norm": 0.29462796449661255, + "learning_rate": 4.856909108770699e-06, + "loss": 0.2764, + "step": 36194 + }, + { + "epoch": 0.6719812270025212, + "grad_norm": 0.44651541113853455, + "learning_rate": 4.855908755658093e-06, + "loss": 0.2126, + "step": 36196 + }, + { + "epoch": 0.6720183571399397, + "grad_norm": 0.4596565067768097, + "learning_rate": 4.854908472541161e-06, + "loss": 0.3989, + "step": 36198 + }, + { + "epoch": 0.6720554872773584, + "grad_norm": 0.284769743680954, + "learning_rate": 4.853908259433513e-06, + "loss": 0.1811, + "step": 36200 + }, + { + "epoch": 0.6720926174147771, + "grad_norm": 0.36276155710220337, + "learning_rate": 4.852908116348759e-06, + "loss": 0.5226, + "step": 36202 + }, + { + "epoch": 0.6721297475521957, + "grad_norm": 0.524876058101654, + "learning_rate": 4.851908043300509e-06, + "loss": 0.3271, + "step": 36204 + }, + { + "epoch": 0.6721668776896144, + "grad_norm": 0.2024860978126526, + "learning_rate": 4.850908040302374e-06, + "loss": 0.1087, + "step": 36206 + }, + { + "epoch": 0.6722040078270329, + "grad_norm": 0.29798269271850586, + "learning_rate": 4.849908107367952e-06, + "loss": 0.2927, + "step": 36208 + }, + { + "epoch": 0.6722411379644516, + "grad_norm": 0.3565357029438019, + "learning_rate": 4.848908244510856e-06, + "loss": 0.2246, + "step": 36210 + }, + { + "epoch": 0.6722782681018702, + "grad_norm": 0.38925671577453613, + "learning_rate": 4.847908451744693e-06, + "loss": 0.2996, + "step": 36212 + }, + { + "epoch": 0.6723153982392889, + "grad_norm": 0.26704326272010803, + "learning_rate": 4.846908729083058e-06, + "loss": 0.218, + "step": 36214 + }, + { + "epoch": 0.6723525283767076, + "grad_norm": 0.30685800313949585, + "learning_rate": 4.8459090765395625e-06, + "loss": 0.1871, + "step": 36216 + }, + { + "epoch": 0.6723896585141261, + "grad_norm": 0.31705355644226074, + "learning_rate": 4.844909494127805e-06, + "loss": 0.2916, + "step": 36218 + }, + { + "epoch": 0.6724267886515448, + "grad_norm": 0.22528082132339478, + "learning_rate": 4.843909981861392e-06, + "loss": 0.1959, + "step": 36220 + }, + { + "epoch": 0.6724639187889634, + "grad_norm": 0.44637659192085266, + "learning_rate": 4.842910539753915e-06, + "loss": 0.2734, + "step": 36222 + }, + { + "epoch": 0.6725010489263821, + "grad_norm": 0.3254113793373108, + "learning_rate": 4.84191116781898e-06, + "loss": 0.1706, + "step": 36224 + }, + { + "epoch": 0.6725381790638008, + "grad_norm": 0.30111074447631836, + "learning_rate": 4.840911866070183e-06, + "loss": 0.4161, + "step": 36226 + }, + { + "epoch": 0.6725753092012193, + "grad_norm": 0.4336724281311035, + "learning_rate": 4.8399126345211225e-06, + "loss": 0.1988, + "step": 36228 + }, + { + "epoch": 0.672612439338638, + "grad_norm": 0.41153383255004883, + "learning_rate": 4.838913473185398e-06, + "loss": 0.1378, + "step": 36230 + }, + { + "epoch": 0.6726495694760566, + "grad_norm": 0.3022269904613495, + "learning_rate": 4.837914382076599e-06, + "loss": 0.1664, + "step": 36232 + }, + { + "epoch": 0.6726866996134753, + "grad_norm": 0.4415663480758667, + "learning_rate": 4.8369153612083256e-06, + "loss": 0.2237, + "step": 36234 + }, + { + "epoch": 0.672723829750894, + "grad_norm": 0.4283555746078491, + "learning_rate": 4.835916410594165e-06, + "loss": 0.2907, + "step": 36236 + }, + { + "epoch": 0.6727609598883125, + "grad_norm": 0.38964778184890747, + "learning_rate": 4.8349175302477156e-06, + "loss": 0.3674, + "step": 36238 + }, + { + "epoch": 0.6727980900257312, + "grad_norm": 0.3656388521194458, + "learning_rate": 4.833918720182567e-06, + "loss": 0.129, + "step": 36240 + }, + { + "epoch": 0.6728352201631498, + "grad_norm": 0.8354846835136414, + "learning_rate": 4.8329199804123085e-06, + "loss": 0.1754, + "step": 36242 + }, + { + "epoch": 0.6728723503005685, + "grad_norm": 0.47813042998313904, + "learning_rate": 4.831921310950537e-06, + "loss": 0.4138, + "step": 36244 + }, + { + "epoch": 0.6729094804379872, + "grad_norm": 0.6088079214096069, + "learning_rate": 4.830922711810833e-06, + "loss": 0.1842, + "step": 36246 + }, + { + "epoch": 0.6729466105754057, + "grad_norm": 0.3779643476009369, + "learning_rate": 4.829924183006787e-06, + "loss": 0.361, + "step": 36248 + }, + { + "epoch": 0.6729837407128244, + "grad_norm": 0.3980613946914673, + "learning_rate": 4.828925724551986e-06, + "loss": 0.3115, + "step": 36250 + }, + { + "epoch": 0.673020870850243, + "grad_norm": 0.28450995683670044, + "learning_rate": 4.827927336460022e-06, + "loss": 0.3824, + "step": 36252 + }, + { + "epoch": 0.6730580009876617, + "grad_norm": 0.19442516565322876, + "learning_rate": 4.82692901874447e-06, + "loss": 0.2221, + "step": 36254 + }, + { + "epoch": 0.6730951311250803, + "grad_norm": 0.3275705575942993, + "learning_rate": 4.825930771418919e-06, + "loss": 0.2303, + "step": 36256 + }, + { + "epoch": 0.6731322612624989, + "grad_norm": 0.4632571041584015, + "learning_rate": 4.824932594496957e-06, + "loss": 0.495, + "step": 36258 + }, + { + "epoch": 0.6731693913999176, + "grad_norm": 0.5370644330978394, + "learning_rate": 4.823934487992157e-06, + "loss": 0.2445, + "step": 36260 + }, + { + "epoch": 0.6732065215373362, + "grad_norm": 0.44724059104919434, + "learning_rate": 4.8229364519181035e-06, + "loss": 0.2034, + "step": 36262 + }, + { + "epoch": 0.6732436516747549, + "grad_norm": 0.3070735037326813, + "learning_rate": 4.821938486288378e-06, + "loss": 0.2268, + "step": 36264 + }, + { + "epoch": 0.6732807818121734, + "grad_norm": 0.4787311851978302, + "learning_rate": 4.8209405911165605e-06, + "loss": 0.3295, + "step": 36266 + }, + { + "epoch": 0.6733179119495921, + "grad_norm": 0.4329741895198822, + "learning_rate": 4.819942766416228e-06, + "loss": 0.215, + "step": 36268 + }, + { + "epoch": 0.6733550420870108, + "grad_norm": 0.4220558702945709, + "learning_rate": 4.818945012200962e-06, + "loss": 0.1188, + "step": 36270 + }, + { + "epoch": 0.6733921722244294, + "grad_norm": 0.4629891514778137, + "learning_rate": 4.8179473284843315e-06, + "loss": 0.3295, + "step": 36272 + }, + { + "epoch": 0.6734293023618481, + "grad_norm": 0.5784451365470886, + "learning_rate": 4.8169497152799194e-06, + "loss": 0.3255, + "step": 36274 + }, + { + "epoch": 0.6734664324992666, + "grad_norm": 0.5005277991294861, + "learning_rate": 4.815952172601293e-06, + "loss": 0.3624, + "step": 36276 + }, + { + "epoch": 0.6735035626366853, + "grad_norm": 0.2573402225971222, + "learning_rate": 4.81495470046203e-06, + "loss": 0.3372, + "step": 36278 + }, + { + "epoch": 0.673540692774104, + "grad_norm": 0.4015173614025116, + "learning_rate": 4.813957298875703e-06, + "loss": 0.1793, + "step": 36280 + }, + { + "epoch": 0.6735778229115226, + "grad_norm": 0.2875911295413971, + "learning_rate": 4.812959967855886e-06, + "loss": 0.3429, + "step": 36282 + }, + { + "epoch": 0.6736149530489413, + "grad_norm": 0.4089427590370178, + "learning_rate": 4.811962707416143e-06, + "loss": 0.3275, + "step": 36284 + }, + { + "epoch": 0.6736520831863598, + "grad_norm": 0.3341543972492218, + "learning_rate": 4.810965517570048e-06, + "loss": 0.1792, + "step": 36286 + }, + { + "epoch": 0.6736892133237785, + "grad_norm": 0.34848302602767944, + "learning_rate": 4.809968398331169e-06, + "loss": 0.2209, + "step": 36288 + }, + { + "epoch": 0.6737263434611972, + "grad_norm": 0.3384447693824768, + "learning_rate": 4.808971349713075e-06, + "loss": 0.2138, + "step": 36290 + }, + { + "epoch": 0.6737634735986158, + "grad_norm": 0.24984313547611237, + "learning_rate": 4.807974371729332e-06, + "loss": 0.2054, + "step": 36292 + }, + { + "epoch": 0.6738006037360345, + "grad_norm": 0.3199433982372284, + "learning_rate": 4.806977464393509e-06, + "loss": 0.1436, + "step": 36294 + }, + { + "epoch": 0.673837733873453, + "grad_norm": 0.5323359370231628, + "learning_rate": 4.8059806277191635e-06, + "loss": 0.4386, + "step": 36296 + }, + { + "epoch": 0.6738748640108717, + "grad_norm": 0.4026868939399719, + "learning_rate": 4.804983861719867e-06, + "loss": 0.2113, + "step": 36298 + }, + { + "epoch": 0.6739119941482904, + "grad_norm": 0.4514008164405823, + "learning_rate": 4.803987166409177e-06, + "loss": 0.1291, + "step": 36300 + }, + { + "epoch": 0.673949124285709, + "grad_norm": 0.8926435112953186, + "learning_rate": 4.802990541800659e-06, + "loss": 0.286, + "step": 36302 + }, + { + "epoch": 0.6739862544231277, + "grad_norm": 0.48347288370132446, + "learning_rate": 4.8019939879078705e-06, + "loss": 0.3151, + "step": 36304 + }, + { + "epoch": 0.6740233845605462, + "grad_norm": 0.3353980779647827, + "learning_rate": 4.800997504744375e-06, + "loss": 0.45, + "step": 36306 + }, + { + "epoch": 0.6740605146979649, + "grad_norm": 0.3935904800891876, + "learning_rate": 4.800001092323734e-06, + "loss": 0.262, + "step": 36308 + }, + { + "epoch": 0.6740976448353835, + "grad_norm": 0.4170452952384949, + "learning_rate": 4.799004750659498e-06, + "loss": 0.1914, + "step": 36310 + }, + { + "epoch": 0.6741347749728022, + "grad_norm": 0.3674319386482239, + "learning_rate": 4.7980084797652295e-06, + "loss": 0.1943, + "step": 36312 + }, + { + "epoch": 0.6741719051102208, + "grad_norm": 0.43882086873054504, + "learning_rate": 4.797012279654484e-06, + "loss": 0.2734, + "step": 36314 + }, + { + "epoch": 0.6742090352476394, + "grad_norm": 0.48057156801223755, + "learning_rate": 4.79601615034082e-06, + "loss": 0.3316, + "step": 36316 + }, + { + "epoch": 0.6742461653850581, + "grad_norm": 0.3032284379005432, + "learning_rate": 4.795020091837784e-06, + "loss": 0.3245, + "step": 36318 + }, + { + "epoch": 0.6742832955224767, + "grad_norm": 0.32553064823150635, + "learning_rate": 4.7940241041589385e-06, + "loss": 0.3336, + "step": 36320 + }, + { + "epoch": 0.6743204256598954, + "grad_norm": 0.37098920345306396, + "learning_rate": 4.7930281873178275e-06, + "loss": 0.2863, + "step": 36322 + }, + { + "epoch": 0.674357555797314, + "grad_norm": 0.3477371633052826, + "learning_rate": 4.792032341328007e-06, + "loss": 0.1795, + "step": 36324 + }, + { + "epoch": 0.6743946859347326, + "grad_norm": 0.5911028981208801, + "learning_rate": 4.791036566203025e-06, + "loss": 0.428, + "step": 36326 + }, + { + "epoch": 0.6744318160721513, + "grad_norm": 0.2589866518974304, + "learning_rate": 4.790040861956433e-06, + "loss": 0.3601, + "step": 36328 + }, + { + "epoch": 0.6744689462095699, + "grad_norm": 0.49530717730522156, + "learning_rate": 4.78904522860178e-06, + "loss": 0.1938, + "step": 36330 + }, + { + "epoch": 0.6745060763469886, + "grad_norm": 0.5417235493659973, + "learning_rate": 4.788049666152615e-06, + "loss": 0.182, + "step": 36332 + }, + { + "epoch": 0.6745432064844072, + "grad_norm": 0.26481157541275024, + "learning_rate": 4.7870541746224806e-06, + "loss": 0.1308, + "step": 36334 + }, + { + "epoch": 0.6745803366218258, + "grad_norm": 0.3770453631877899, + "learning_rate": 4.786058754024924e-06, + "loss": 0.4614, + "step": 36336 + }, + { + "epoch": 0.6746174667592445, + "grad_norm": 0.37398794293403625, + "learning_rate": 4.785063404373493e-06, + "loss": 0.3655, + "step": 36338 + }, + { + "epoch": 0.6746545968966631, + "grad_norm": 0.4845532476902008, + "learning_rate": 4.784068125681726e-06, + "loss": 0.1364, + "step": 36340 + }, + { + "epoch": 0.6746917270340818, + "grad_norm": 0.2908177673816681, + "learning_rate": 4.783072917963167e-06, + "loss": 0.2524, + "step": 36342 + }, + { + "epoch": 0.6747288571715004, + "grad_norm": 0.3771679401397705, + "learning_rate": 4.782077781231359e-06, + "loss": 0.1327, + "step": 36344 + }, + { + "epoch": 0.674765987308919, + "grad_norm": 0.3853102922439575, + "learning_rate": 4.781082715499848e-06, + "loss": 0.3071, + "step": 36346 + }, + { + "epoch": 0.6748031174463377, + "grad_norm": 0.47328341007232666, + "learning_rate": 4.7800877207821635e-06, + "loss": 0.1731, + "step": 36348 + }, + { + "epoch": 0.6748402475837563, + "grad_norm": 0.44574710726737976, + "learning_rate": 4.779092797091851e-06, + "loss": 0.228, + "step": 36350 + }, + { + "epoch": 0.674877377721175, + "grad_norm": 0.3613787889480591, + "learning_rate": 4.778097944442447e-06, + "loss": 0.3166, + "step": 36352 + }, + { + "epoch": 0.6749145078585936, + "grad_norm": 0.2751176357269287, + "learning_rate": 4.777103162847488e-06, + "loss": 0.4194, + "step": 36354 + }, + { + "epoch": 0.6749516379960122, + "grad_norm": 0.4206976890563965, + "learning_rate": 4.77610845232051e-06, + "loss": 0.1989, + "step": 36356 + }, + { + "epoch": 0.6749887681334309, + "grad_norm": 0.5810465812683105, + "learning_rate": 4.7751138128750544e-06, + "loss": 0.2345, + "step": 36358 + }, + { + "epoch": 0.6750258982708495, + "grad_norm": 0.49143537878990173, + "learning_rate": 4.774119244524648e-06, + "loss": 0.2226, + "step": 36360 + }, + { + "epoch": 0.6750630284082682, + "grad_norm": 0.44942373037338257, + "learning_rate": 4.773124747282821e-06, + "loss": 0.3655, + "step": 36362 + }, + { + "epoch": 0.6751001585456867, + "grad_norm": 0.47464489936828613, + "learning_rate": 4.772130321163111e-06, + "loss": 0.3644, + "step": 36364 + }, + { + "epoch": 0.6751372886831054, + "grad_norm": 0.4693332612514496, + "learning_rate": 4.771135966179047e-06, + "loss": 0.3215, + "step": 36366 + }, + { + "epoch": 0.6751744188205241, + "grad_norm": 0.22810512781143188, + "learning_rate": 4.770141682344162e-06, + "loss": 0.176, + "step": 36368 + }, + { + "epoch": 0.6752115489579427, + "grad_norm": 0.4643705189228058, + "learning_rate": 4.769147469671985e-06, + "loss": 0.3775, + "step": 36370 + }, + { + "epoch": 0.6752486790953613, + "grad_norm": 0.5585604906082153, + "learning_rate": 4.768153328176041e-06, + "loss": 0.3275, + "step": 36372 + }, + { + "epoch": 0.6752858092327799, + "grad_norm": 0.5852408409118652, + "learning_rate": 4.767159257869858e-06, + "loss": 0.3544, + "step": 36374 + }, + { + "epoch": 0.6753229393701986, + "grad_norm": 0.4132533371448517, + "learning_rate": 4.766165258766962e-06, + "loss": 0.2187, + "step": 36376 + }, + { + "epoch": 0.6753600695076173, + "grad_norm": 0.3609318435192108, + "learning_rate": 4.765171330880883e-06, + "loss": 0.1367, + "step": 36378 + }, + { + "epoch": 0.6753971996450359, + "grad_norm": 0.5181414484977722, + "learning_rate": 4.764177474225143e-06, + "loss": 0.5101, + "step": 36380 + }, + { + "epoch": 0.6754343297824545, + "grad_norm": 0.6194131374359131, + "learning_rate": 4.763183688813262e-06, + "loss": 0.1535, + "step": 36382 + }, + { + "epoch": 0.6754714599198731, + "grad_norm": 0.3785829544067383, + "learning_rate": 4.762189974658768e-06, + "loss": 0.2727, + "step": 36384 + }, + { + "epoch": 0.6755085900572918, + "grad_norm": 0.33141621947288513, + "learning_rate": 4.761196331775176e-06, + "loss": 0.3645, + "step": 36386 + }, + { + "epoch": 0.6755457201947105, + "grad_norm": 0.2935234308242798, + "learning_rate": 4.76020276017601e-06, + "loss": 0.273, + "step": 36388 + }, + { + "epoch": 0.6755828503321291, + "grad_norm": 0.24872499704360962, + "learning_rate": 4.7592092598747884e-06, + "loss": 0.2151, + "step": 36390 + }, + { + "epoch": 0.6756199804695477, + "grad_norm": 0.31837916374206543, + "learning_rate": 4.758215830885032e-06, + "loss": 0.393, + "step": 36392 + }, + { + "epoch": 0.6756571106069663, + "grad_norm": 0.4265042245388031, + "learning_rate": 4.757222473220256e-06, + "loss": 0.3093, + "step": 36394 + }, + { + "epoch": 0.675694240744385, + "grad_norm": 0.36889320611953735, + "learning_rate": 4.756229186893983e-06, + "loss": 0.4381, + "step": 36396 + }, + { + "epoch": 0.6757313708818037, + "grad_norm": 0.2660238742828369, + "learning_rate": 4.75523597191972e-06, + "loss": 0.2768, + "step": 36398 + }, + { + "epoch": 0.6757685010192223, + "grad_norm": 0.3256976306438446, + "learning_rate": 4.7542428283109865e-06, + "loss": 0.2292, + "step": 36400 + }, + { + "epoch": 0.6758056311566409, + "grad_norm": 0.39191898703575134, + "learning_rate": 4.7532497560812975e-06, + "loss": 0.3031, + "step": 36402 + }, + { + "epoch": 0.6758427612940595, + "grad_norm": 0.24308307468891144, + "learning_rate": 4.75225675524416e-06, + "loss": 0.1941, + "step": 36404 + }, + { + "epoch": 0.6758798914314782, + "grad_norm": 0.4531537592411041, + "learning_rate": 4.751263825813089e-06, + "loss": 0.3853, + "step": 36406 + }, + { + "epoch": 0.6759170215688969, + "grad_norm": 0.3413432240486145, + "learning_rate": 4.7502709678016e-06, + "loss": 0.1684, + "step": 36408 + }, + { + "epoch": 0.6759541517063155, + "grad_norm": 0.3316570520401001, + "learning_rate": 4.749278181223195e-06, + "loss": 0.1494, + "step": 36410 + }, + { + "epoch": 0.6759912818437341, + "grad_norm": 0.48200929164886475, + "learning_rate": 4.748285466091385e-06, + "loss": 0.415, + "step": 36412 + }, + { + "epoch": 0.6760284119811527, + "grad_norm": 0.3232986330986023, + "learning_rate": 4.74729282241968e-06, + "loss": 0.2833, + "step": 36414 + }, + { + "epoch": 0.6760655421185714, + "grad_norm": 0.3097541630268097, + "learning_rate": 4.746300250221585e-06, + "loss": 0.31, + "step": 36416 + }, + { + "epoch": 0.67610267225599, + "grad_norm": 0.37366917729377747, + "learning_rate": 4.745307749510608e-06, + "loss": 0.3019, + "step": 36418 + }, + { + "epoch": 0.6761398023934087, + "grad_norm": 0.4714035987854004, + "learning_rate": 4.7443153203002555e-06, + "loss": 0.3812, + "step": 36420 + }, + { + "epoch": 0.6761769325308273, + "grad_norm": 0.6430938839912415, + "learning_rate": 4.743322962604027e-06, + "loss": 0.3415, + "step": 36422 + }, + { + "epoch": 0.6762140626682459, + "grad_norm": 0.45992711186408997, + "learning_rate": 4.742330676435429e-06, + "loss": 0.3542, + "step": 36424 + }, + { + "epoch": 0.6762511928056646, + "grad_norm": 0.4914569854736328, + "learning_rate": 4.74133846180796e-06, + "loss": 0.3372, + "step": 36426 + }, + { + "epoch": 0.6762883229430832, + "grad_norm": 0.31977933645248413, + "learning_rate": 4.740346318735122e-06, + "loss": 0.3087, + "step": 36428 + }, + { + "epoch": 0.6763254530805018, + "grad_norm": 0.29822492599487305, + "learning_rate": 4.739354247230416e-06, + "loss": 0.278, + "step": 36430 + }, + { + "epoch": 0.6763625832179205, + "grad_norm": 0.39992520213127136, + "learning_rate": 4.738362247307342e-06, + "loss": 0.3478, + "step": 36432 + }, + { + "epoch": 0.6763997133553391, + "grad_norm": 0.35032975673675537, + "learning_rate": 4.7373703189794e-06, + "loss": 0.2341, + "step": 36434 + }, + { + "epoch": 0.6764368434927578, + "grad_norm": 0.4662838578224182, + "learning_rate": 4.7363784622600816e-06, + "loss": 0.3096, + "step": 36436 + }, + { + "epoch": 0.6764739736301764, + "grad_norm": 0.4886572062969208, + "learning_rate": 4.735386677162886e-06, + "loss": 0.1326, + "step": 36438 + }, + { + "epoch": 0.676511103767595, + "grad_norm": 0.473810076713562, + "learning_rate": 4.734394963701309e-06, + "loss": 0.3492, + "step": 36440 + }, + { + "epoch": 0.6765482339050137, + "grad_norm": 0.3377487361431122, + "learning_rate": 4.7334033218888475e-06, + "loss": 0.1672, + "step": 36442 + }, + { + "epoch": 0.6765853640424323, + "grad_norm": 0.3469337522983551, + "learning_rate": 4.732411751738987e-06, + "loss": 0.2293, + "step": 36444 + }, + { + "epoch": 0.676622494179851, + "grad_norm": 0.4482578635215759, + "learning_rate": 4.731420253265229e-06, + "loss": 0.322, + "step": 36446 + }, + { + "epoch": 0.6766596243172696, + "grad_norm": 0.3213288486003876, + "learning_rate": 4.730428826481056e-06, + "loss": 0.0477, + "step": 36448 + }, + { + "epoch": 0.6766967544546882, + "grad_norm": 0.34195560216903687, + "learning_rate": 4.729437471399962e-06, + "loss": 0.2181, + "step": 36450 + }, + { + "epoch": 0.6767338845921069, + "grad_norm": 0.36801910400390625, + "learning_rate": 4.728446188035437e-06, + "loss": 0.2719, + "step": 36452 + }, + { + "epoch": 0.6767710147295255, + "grad_norm": 0.550866425037384, + "learning_rate": 4.72745497640097e-06, + "loss": 0.3028, + "step": 36454 + }, + { + "epoch": 0.6768081448669442, + "grad_norm": 0.288461297750473, + "learning_rate": 4.726463836510047e-06, + "loss": 0.3096, + "step": 36456 + }, + { + "epoch": 0.6768452750043628, + "grad_norm": 0.35113444924354553, + "learning_rate": 4.725472768376159e-06, + "loss": 0.1405, + "step": 36458 + }, + { + "epoch": 0.6768824051417814, + "grad_norm": 0.3291110396385193, + "learning_rate": 4.724481772012784e-06, + "loss": 0.2513, + "step": 36460 + }, + { + "epoch": 0.6769195352792, + "grad_norm": 0.4129972755908966, + "learning_rate": 4.72349084743341e-06, + "loss": 0.3026, + "step": 36462 + }, + { + "epoch": 0.6769566654166187, + "grad_norm": 0.5406935811042786, + "learning_rate": 4.722499994651521e-06, + "loss": 0.3771, + "step": 36464 + }, + { + "epoch": 0.6769937955540374, + "grad_norm": 0.5709956884384155, + "learning_rate": 4.721509213680602e-06, + "loss": 0.2094, + "step": 36466 + }, + { + "epoch": 0.677030925691456, + "grad_norm": 0.44346678256988525, + "learning_rate": 4.720518504534128e-06, + "loss": 0.3095, + "step": 36468 + }, + { + "epoch": 0.6770680558288746, + "grad_norm": 0.4805660843849182, + "learning_rate": 4.719527867225584e-06, + "loss": 0.0944, + "step": 36470 + }, + { + "epoch": 0.6771051859662932, + "grad_norm": 0.3394569754600525, + "learning_rate": 4.718537301768452e-06, + "loss": 0.3496, + "step": 36472 + }, + { + "epoch": 0.6771423161037119, + "grad_norm": 0.31394901871681213, + "learning_rate": 4.7175468081762046e-06, + "loss": 0.2273, + "step": 36474 + }, + { + "epoch": 0.6771794462411306, + "grad_norm": 0.385687917470932, + "learning_rate": 4.716556386462322e-06, + "loss": 0.4899, + "step": 36476 + }, + { + "epoch": 0.6772165763785492, + "grad_norm": 0.30165600776672363, + "learning_rate": 4.715566036640281e-06, + "loss": 0.2596, + "step": 36478 + }, + { + "epoch": 0.6772537065159678, + "grad_norm": 0.36663302779197693, + "learning_rate": 4.714575758723558e-06, + "loss": 0.2198, + "step": 36480 + }, + { + "epoch": 0.6772908366533864, + "grad_norm": 0.43976643681526184, + "learning_rate": 4.713585552725626e-06, + "loss": 0.1707, + "step": 36482 + }, + { + "epoch": 0.6773279667908051, + "grad_norm": 0.3960498571395874, + "learning_rate": 4.7125954186599646e-06, + "loss": 0.2511, + "step": 36484 + }, + { + "epoch": 0.6773650969282238, + "grad_norm": 0.3838033974170685, + "learning_rate": 4.711605356540038e-06, + "loss": 0.3024, + "step": 36486 + }, + { + "epoch": 0.6774022270656423, + "grad_norm": 0.3273037075996399, + "learning_rate": 4.710615366379326e-06, + "loss": 0.222, + "step": 36488 + }, + { + "epoch": 0.677439357203061, + "grad_norm": 0.3275817632675171, + "learning_rate": 4.709625448191292e-06, + "loss": 0.1962, + "step": 36490 + }, + { + "epoch": 0.6774764873404796, + "grad_norm": 0.36478307843208313, + "learning_rate": 4.708635601989407e-06, + "loss": 0.2729, + "step": 36492 + }, + { + "epoch": 0.6775136174778983, + "grad_norm": 0.600856602191925, + "learning_rate": 4.707645827787144e-06, + "loss": 0.3337, + "step": 36494 + }, + { + "epoch": 0.677550747615317, + "grad_norm": 0.2982170879840851, + "learning_rate": 4.706656125597972e-06, + "loss": 0.2396, + "step": 36496 + }, + { + "epoch": 0.6775878777527355, + "grad_norm": 0.4709659516811371, + "learning_rate": 4.705666495435352e-06, + "loss": 0.2135, + "step": 36498 + }, + { + "epoch": 0.6776250078901542, + "grad_norm": 0.4827239513397217, + "learning_rate": 4.704676937312751e-06, + "loss": 0.3319, + "step": 36500 + }, + { + "epoch": 0.6776621380275728, + "grad_norm": 0.2548995018005371, + "learning_rate": 4.703687451243637e-06, + "loss": 0.1706, + "step": 36502 + }, + { + "epoch": 0.6776992681649915, + "grad_norm": 0.42466509342193604, + "learning_rate": 4.702698037241471e-06, + "loss": 0.4044, + "step": 36504 + }, + { + "epoch": 0.6777363983024102, + "grad_norm": 0.4611983001232147, + "learning_rate": 4.701708695319722e-06, + "loss": 0.2571, + "step": 36506 + }, + { + "epoch": 0.6777735284398287, + "grad_norm": 0.26731839776039124, + "learning_rate": 4.700719425491844e-06, + "loss": 0.2425, + "step": 36508 + }, + { + "epoch": 0.6778106585772474, + "grad_norm": 0.6715711355209351, + "learning_rate": 4.699730227771305e-06, + "loss": 0.2327, + "step": 36510 + }, + { + "epoch": 0.677847788714666, + "grad_norm": 0.4574154019355774, + "learning_rate": 4.698741102171558e-06, + "loss": 0.4333, + "step": 36512 + }, + { + "epoch": 0.6778849188520847, + "grad_norm": 0.3367522656917572, + "learning_rate": 4.6977520487060645e-06, + "loss": 0.3052, + "step": 36514 + }, + { + "epoch": 0.6779220489895033, + "grad_norm": 0.38283029198646545, + "learning_rate": 4.696763067388285e-06, + "loss": 0.2659, + "step": 36516 + }, + { + "epoch": 0.6779591791269219, + "grad_norm": 0.32643836736679077, + "learning_rate": 4.695774158231674e-06, + "loss": 0.3212, + "step": 36518 + }, + { + "epoch": 0.6779963092643406, + "grad_norm": 0.6314180493354797, + "learning_rate": 4.694785321249688e-06, + "loss": 0.2763, + "step": 36520 + }, + { + "epoch": 0.6780334394017592, + "grad_norm": 0.18016217648983002, + "learning_rate": 4.693796556455787e-06, + "loss": 0.2662, + "step": 36522 + }, + { + "epoch": 0.6780705695391779, + "grad_norm": 0.3607742190361023, + "learning_rate": 4.692807863863419e-06, + "loss": 0.3643, + "step": 36524 + }, + { + "epoch": 0.6781076996765965, + "grad_norm": 0.3754550814628601, + "learning_rate": 4.691819243486038e-06, + "loss": 0.169, + "step": 36526 + }, + { + "epoch": 0.6781448298140151, + "grad_norm": 0.158907949924469, + "learning_rate": 4.690830695337101e-06, + "loss": 0.3108, + "step": 36528 + }, + { + "epoch": 0.6781819599514338, + "grad_norm": 0.8036734461784363, + "learning_rate": 4.689842219430051e-06, + "loss": 0.2687, + "step": 36530 + }, + { + "epoch": 0.6782190900888524, + "grad_norm": 0.28285539150238037, + "learning_rate": 4.688853815778343e-06, + "loss": 0.1788, + "step": 36532 + }, + { + "epoch": 0.6782562202262711, + "grad_norm": 0.5729122757911682, + "learning_rate": 4.68786548439543e-06, + "loss": 0.2115, + "step": 36534 + }, + { + "epoch": 0.6782933503636897, + "grad_norm": 0.3527403175830841, + "learning_rate": 4.686877225294751e-06, + "loss": 0.4524, + "step": 36536 + }, + { + "epoch": 0.6783304805011083, + "grad_norm": 0.6176527142524719, + "learning_rate": 4.68588903848976e-06, + "loss": 0.1819, + "step": 36538 + }, + { + "epoch": 0.678367610638527, + "grad_norm": 0.4776153266429901, + "learning_rate": 4.6849009239939016e-06, + "loss": 0.3631, + "step": 36540 + }, + { + "epoch": 0.6784047407759456, + "grad_norm": 0.5935525298118591, + "learning_rate": 4.683912881820619e-06, + "loss": 0.2165, + "step": 36542 + }, + { + "epoch": 0.6784418709133643, + "grad_norm": 0.31611618399620056, + "learning_rate": 4.682924911983361e-06, + "loss": 0.2183, + "step": 36544 + }, + { + "epoch": 0.6784790010507828, + "grad_norm": 0.27530214190483093, + "learning_rate": 4.681937014495567e-06, + "loss": 0.3206, + "step": 36546 + }, + { + "epoch": 0.6785161311882015, + "grad_norm": 0.355207234621048, + "learning_rate": 4.6809491893706845e-06, + "loss": 0.134, + "step": 36548 + }, + { + "epoch": 0.6785532613256202, + "grad_norm": 0.2882908582687378, + "learning_rate": 4.679961436622152e-06, + "loss": 0.2756, + "step": 36550 + }, + { + "epoch": 0.6785903914630388, + "grad_norm": 0.36620956659317017, + "learning_rate": 4.678973756263405e-06, + "loss": 0.3932, + "step": 36552 + }, + { + "epoch": 0.6786275216004575, + "grad_norm": 0.17871969938278198, + "learning_rate": 4.677986148307887e-06, + "loss": 0.2058, + "step": 36554 + }, + { + "epoch": 0.678664651737876, + "grad_norm": 0.42708820104599, + "learning_rate": 4.6769986127690355e-06, + "loss": 0.3689, + "step": 36556 + }, + { + "epoch": 0.6787017818752947, + "grad_norm": 0.36767616868019104, + "learning_rate": 4.676011149660289e-06, + "loss": 0.2729, + "step": 36558 + }, + { + "epoch": 0.6787389120127134, + "grad_norm": 0.3953140079975128, + "learning_rate": 4.675023758995088e-06, + "loss": 0.2241, + "step": 36560 + }, + { + "epoch": 0.678776042150132, + "grad_norm": 0.249820277094841, + "learning_rate": 4.6740364407868586e-06, + "loss": 0.2534, + "step": 36562 + }, + { + "epoch": 0.6788131722875507, + "grad_norm": 0.37099185585975647, + "learning_rate": 4.6730491950490405e-06, + "loss": 0.4403, + "step": 36564 + }, + { + "epoch": 0.6788503024249692, + "grad_norm": 0.383452832698822, + "learning_rate": 4.672062021795067e-06, + "loss": 0.0932, + "step": 36566 + }, + { + "epoch": 0.6788874325623879, + "grad_norm": 0.3339192867279053, + "learning_rate": 4.67107492103837e-06, + "loss": 0.2741, + "step": 36568 + }, + { + "epoch": 0.6789245626998065, + "grad_norm": 0.39190760254859924, + "learning_rate": 4.670087892792385e-06, + "loss": 0.2541, + "step": 36570 + }, + { + "epoch": 0.6789616928372252, + "grad_norm": 0.4393065869808197, + "learning_rate": 4.669100937070536e-06, + "loss": 0.5056, + "step": 36572 + }, + { + "epoch": 0.6789988229746439, + "grad_norm": 0.3446149230003357, + "learning_rate": 4.668114053886259e-06, + "loss": 0.1979, + "step": 36574 + }, + { + "epoch": 0.6790359531120624, + "grad_norm": 0.3102419972419739, + "learning_rate": 4.6671272432529744e-06, + "loss": 0.1823, + "step": 36576 + }, + { + "epoch": 0.6790730832494811, + "grad_norm": 0.5714578032493591, + "learning_rate": 4.6661405051841144e-06, + "loss": 0.2381, + "step": 36578 + }, + { + "epoch": 0.6791102133868997, + "grad_norm": 0.6529115438461304, + "learning_rate": 4.6651538396931065e-06, + "loss": 0.2378, + "step": 36580 + }, + { + "epoch": 0.6791473435243184, + "grad_norm": 0.38438543677330017, + "learning_rate": 4.664167246793373e-06, + "loss": 0.2658, + "step": 36582 + }, + { + "epoch": 0.6791844736617371, + "grad_norm": 0.4142589271068573, + "learning_rate": 4.663180726498345e-06, + "loss": 0.2232, + "step": 36584 + }, + { + "epoch": 0.6792216037991556, + "grad_norm": 0.2852151393890381, + "learning_rate": 4.662194278821439e-06, + "loss": 0.3353, + "step": 36586 + }, + { + "epoch": 0.6792587339365743, + "grad_norm": 0.38932931423187256, + "learning_rate": 4.66120790377608e-06, + "loss": 0.3419, + "step": 36588 + }, + { + "epoch": 0.6792958640739929, + "grad_norm": 0.257129967212677, + "learning_rate": 4.660221601375689e-06, + "loss": 0.3094, + "step": 36590 + }, + { + "epoch": 0.6793329942114116, + "grad_norm": 0.34265750646591187, + "learning_rate": 4.659235371633692e-06, + "loss": 0.399, + "step": 36592 + }, + { + "epoch": 0.6793701243488303, + "grad_norm": 0.4926569163799286, + "learning_rate": 4.6582492145635e-06, + "loss": 0.285, + "step": 36594 + }, + { + "epoch": 0.6794072544862488, + "grad_norm": 0.4248081147670746, + "learning_rate": 4.657263130178536e-06, + "loss": 0.295, + "step": 36596 + }, + { + "epoch": 0.6794443846236675, + "grad_norm": 0.3272993862628937, + "learning_rate": 4.656277118492222e-06, + "loss": 0.2525, + "step": 36598 + }, + { + "epoch": 0.6794815147610861, + "grad_norm": 0.5105322003364563, + "learning_rate": 4.655291179517965e-06, + "loss": 0.2202, + "step": 36600 + }, + { + "epoch": 0.6795186448985048, + "grad_norm": 0.39101994037628174, + "learning_rate": 4.654305313269188e-06, + "loss": 0.3339, + "step": 36602 + }, + { + "epoch": 0.6795557750359235, + "grad_norm": 0.3296178877353668, + "learning_rate": 4.653319519759301e-06, + "loss": 0.5378, + "step": 36604 + }, + { + "epoch": 0.679592905173342, + "grad_norm": 0.2997519373893738, + "learning_rate": 4.652333799001722e-06, + "loss": 0.2724, + "step": 36606 + }, + { + "epoch": 0.6796300353107607, + "grad_norm": 0.34020131826400757, + "learning_rate": 4.65134815100986e-06, + "loss": 0.3855, + "step": 36608 + }, + { + "epoch": 0.6796671654481793, + "grad_norm": 0.2601917088031769, + "learning_rate": 4.650362575797132e-06, + "loss": 0.2613, + "step": 36610 + }, + { + "epoch": 0.679704295585598, + "grad_norm": 0.297384649515152, + "learning_rate": 4.649377073376944e-06, + "loss": 0.1312, + "step": 36612 + }, + { + "epoch": 0.6797414257230165, + "grad_norm": 0.35620924830436707, + "learning_rate": 4.64839164376271e-06, + "loss": 0.3187, + "step": 36614 + }, + { + "epoch": 0.6797785558604352, + "grad_norm": 0.4011893570423126, + "learning_rate": 4.647406286967832e-06, + "loss": 0.1707, + "step": 36616 + }, + { + "epoch": 0.6798156859978539, + "grad_norm": 0.3564300835132599, + "learning_rate": 4.646421003005721e-06, + "loss": 0.3464, + "step": 36618 + }, + { + "epoch": 0.6798528161352725, + "grad_norm": 0.40349623560905457, + "learning_rate": 4.645435791889784e-06, + "loss": 0.3768, + "step": 36620 + }, + { + "epoch": 0.6798899462726912, + "grad_norm": 0.7479465007781982, + "learning_rate": 4.644450653633432e-06, + "loss": 0.3091, + "step": 36622 + }, + { + "epoch": 0.6799270764101097, + "grad_norm": 0.38486817479133606, + "learning_rate": 4.64346558825006e-06, + "loss": 0.1475, + "step": 36624 + }, + { + "epoch": 0.6799642065475284, + "grad_norm": 0.45041412115097046, + "learning_rate": 4.6424805957530775e-06, + "loss": 0.1883, + "step": 36626 + }, + { + "epoch": 0.6800013366849471, + "grad_norm": 0.26288503408432007, + "learning_rate": 4.641495676155886e-06, + "loss": 0.2934, + "step": 36628 + }, + { + "epoch": 0.6800384668223657, + "grad_norm": 0.4546377956867218, + "learning_rate": 4.640510829471888e-06, + "loss": 0.2305, + "step": 36630 + }, + { + "epoch": 0.6800755969597844, + "grad_norm": 0.3075890839099884, + "learning_rate": 4.639526055714485e-06, + "loss": 0.2574, + "step": 36632 + }, + { + "epoch": 0.6801127270972029, + "grad_norm": 0.5221927762031555, + "learning_rate": 4.638541354897078e-06, + "loss": 0.3573, + "step": 36634 + }, + { + "epoch": 0.6801498572346216, + "grad_norm": 0.30191871523857117, + "learning_rate": 4.6375567270330645e-06, + "loss": 0.2914, + "step": 36636 + }, + { + "epoch": 0.6801869873720403, + "grad_norm": 0.4841984212398529, + "learning_rate": 4.636572172135838e-06, + "loss": 0.2589, + "step": 36638 + }, + { + "epoch": 0.6802241175094589, + "grad_norm": 0.33180081844329834, + "learning_rate": 4.635587690218799e-06, + "loss": 0.3972, + "step": 36640 + }, + { + "epoch": 0.6802612476468776, + "grad_norm": 0.8104206919670105, + "learning_rate": 4.634603281295343e-06, + "loss": 0.3596, + "step": 36642 + }, + { + "epoch": 0.6802983777842961, + "grad_norm": 0.3464994728565216, + "learning_rate": 4.633618945378865e-06, + "loss": 0.2209, + "step": 36644 + }, + { + "epoch": 0.6803355079217148, + "grad_norm": 0.41266077756881714, + "learning_rate": 4.6326346824827585e-06, + "loss": 0.1782, + "step": 36646 + }, + { + "epoch": 0.6803726380591335, + "grad_norm": 0.2535708248615265, + "learning_rate": 4.631650492620421e-06, + "loss": 0.1666, + "step": 36648 + }, + { + "epoch": 0.6804097681965521, + "grad_norm": 0.31775355339050293, + "learning_rate": 4.630666375805237e-06, + "loss": 0.3488, + "step": 36650 + }, + { + "epoch": 0.6804468983339708, + "grad_norm": 0.3390483856201172, + "learning_rate": 4.6296823320506e-06, + "loss": 0.3456, + "step": 36652 + }, + { + "epoch": 0.6804840284713893, + "grad_norm": 0.4517935514450073, + "learning_rate": 4.628698361369901e-06, + "loss": 0.3289, + "step": 36654 + }, + { + "epoch": 0.680521158608808, + "grad_norm": 0.47368916869163513, + "learning_rate": 4.627714463776531e-06, + "loss": 0.2156, + "step": 36656 + }, + { + "epoch": 0.6805582887462267, + "grad_norm": 0.46265238523483276, + "learning_rate": 4.626730639283873e-06, + "loss": 0.2898, + "step": 36658 + }, + { + "epoch": 0.6805954188836453, + "grad_norm": 0.361904114484787, + "learning_rate": 4.625746887905319e-06, + "loss": 0.3942, + "step": 36660 + }, + { + "epoch": 0.680632549021064, + "grad_norm": 0.43007397651672363, + "learning_rate": 4.624763209654249e-06, + "loss": 0.3897, + "step": 36662 + }, + { + "epoch": 0.6806696791584825, + "grad_norm": 0.5235523581504822, + "learning_rate": 4.623779604544051e-06, + "loss": 0.2264, + "step": 36664 + }, + { + "epoch": 0.6807068092959012, + "grad_norm": 0.3871137797832489, + "learning_rate": 4.622796072588108e-06, + "loss": 0.2331, + "step": 36666 + }, + { + "epoch": 0.6807439394333198, + "grad_norm": 0.4467279314994812, + "learning_rate": 4.621812613799805e-06, + "loss": 0.2465, + "step": 36668 + }, + { + "epoch": 0.6807810695707385, + "grad_norm": 0.29052039980888367, + "learning_rate": 4.6208292281925226e-06, + "loss": 0.2362, + "step": 36670 + }, + { + "epoch": 0.6808181997081572, + "grad_norm": 0.5195879340171814, + "learning_rate": 4.619845915779641e-06, + "loss": 0.2717, + "step": 36672 + }, + { + "epoch": 0.6808553298455757, + "grad_norm": 0.5728261470794678, + "learning_rate": 4.6188626765745455e-06, + "loss": 0.2502, + "step": 36674 + }, + { + "epoch": 0.6808924599829944, + "grad_norm": 0.4399751126766205, + "learning_rate": 4.617879510590607e-06, + "loss": 0.1459, + "step": 36676 + }, + { + "epoch": 0.680929590120413, + "grad_norm": 0.31272971630096436, + "learning_rate": 4.616896417841211e-06, + "loss": 0.3102, + "step": 36678 + }, + { + "epoch": 0.6809667202578317, + "grad_norm": 0.4640543758869171, + "learning_rate": 4.615913398339726e-06, + "loss": 0.2266, + "step": 36680 + }, + { + "epoch": 0.6810038503952504, + "grad_norm": 0.4285161793231964, + "learning_rate": 4.614930452099533e-06, + "loss": 0.2428, + "step": 36682 + }, + { + "epoch": 0.6810409805326689, + "grad_norm": 0.633770763874054, + "learning_rate": 4.6139475791340064e-06, + "loss": 0.2424, + "step": 36684 + }, + { + "epoch": 0.6810781106700876, + "grad_norm": 0.5195276141166687, + "learning_rate": 4.612964779456524e-06, + "loss": 0.3027, + "step": 36686 + }, + { + "epoch": 0.6811152408075062, + "grad_norm": 0.4167497158050537, + "learning_rate": 4.61198205308045e-06, + "loss": 0.4612, + "step": 36688 + }, + { + "epoch": 0.6811523709449249, + "grad_norm": 0.2877090871334076, + "learning_rate": 4.610999400019162e-06, + "loss": 0.3466, + "step": 36690 + }, + { + "epoch": 0.6811895010823436, + "grad_norm": 0.5440207719802856, + "learning_rate": 4.610016820286031e-06, + "loss": 0.2927, + "step": 36692 + }, + { + "epoch": 0.6812266312197621, + "grad_norm": 0.2668544352054596, + "learning_rate": 4.609034313894425e-06, + "loss": 0.3608, + "step": 36694 + }, + { + "epoch": 0.6812637613571808, + "grad_norm": 0.5148725509643555, + "learning_rate": 4.608051880857719e-06, + "loss": 0.2335, + "step": 36696 + }, + { + "epoch": 0.6813008914945994, + "grad_norm": 0.4221431016921997, + "learning_rate": 4.607069521189271e-06, + "loss": 0.3397, + "step": 36698 + }, + { + "epoch": 0.6813380216320181, + "grad_norm": 0.24876393377780914, + "learning_rate": 4.6060872349024584e-06, + "loss": 0.1289, + "step": 36700 + }, + { + "epoch": 0.6813751517694367, + "grad_norm": 0.6464248299598694, + "learning_rate": 4.605105022010638e-06, + "loss": 0.2708, + "step": 36702 + }, + { + "epoch": 0.6814122819068553, + "grad_norm": 0.32391157746315, + "learning_rate": 4.6041228825271775e-06, + "loss": 0.213, + "step": 36704 + }, + { + "epoch": 0.681449412044274, + "grad_norm": 0.47570061683654785, + "learning_rate": 4.603140816465442e-06, + "loss": 0.2451, + "step": 36706 + }, + { + "epoch": 0.6814865421816926, + "grad_norm": 0.45762503147125244, + "learning_rate": 4.602158823838796e-06, + "loss": 0.2294, + "step": 36708 + }, + { + "epoch": 0.6815236723191113, + "grad_norm": 0.4113583266735077, + "learning_rate": 4.601176904660603e-06, + "loss": 0.2511, + "step": 36710 + }, + { + "epoch": 0.68156080245653, + "grad_norm": 0.3370051681995392, + "learning_rate": 4.6001950589442165e-06, + "loss": 0.3217, + "step": 36712 + }, + { + "epoch": 0.6815979325939485, + "grad_norm": 0.3481823205947876, + "learning_rate": 4.599213286703003e-06, + "loss": 0.2484, + "step": 36714 + }, + { + "epoch": 0.6816350627313672, + "grad_norm": 0.47473111748695374, + "learning_rate": 4.598231587950317e-06, + "loss": 0.1396, + "step": 36716 + }, + { + "epoch": 0.6816721928687858, + "grad_norm": 0.5065504312515259, + "learning_rate": 4.59724996269952e-06, + "loss": 0.2837, + "step": 36718 + }, + { + "epoch": 0.6817093230062045, + "grad_norm": 0.4569677710533142, + "learning_rate": 4.5962684109639725e-06, + "loss": 0.1816, + "step": 36720 + }, + { + "epoch": 0.681746453143623, + "grad_norm": 0.44991540908813477, + "learning_rate": 4.595286932757022e-06, + "loss": 0.1659, + "step": 36722 + }, + { + "epoch": 0.6817835832810417, + "grad_norm": 0.46073397994041443, + "learning_rate": 4.594305528092029e-06, + "loss": 0.3097, + "step": 36724 + }, + { + "epoch": 0.6818207134184604, + "grad_norm": 0.4513939321041107, + "learning_rate": 4.593324196982344e-06, + "loss": 0.626, + "step": 36726 + }, + { + "epoch": 0.681857843555879, + "grad_norm": 0.43300861120224, + "learning_rate": 4.592342939441321e-06, + "loss": 0.3946, + "step": 36728 + }, + { + "epoch": 0.6818949736932977, + "grad_norm": 0.32495608925819397, + "learning_rate": 4.591361755482313e-06, + "loss": 0.2713, + "step": 36730 + }, + { + "epoch": 0.6819321038307162, + "grad_norm": 0.24395209550857544, + "learning_rate": 4.590380645118672e-06, + "loss": 0.394, + "step": 36732 + }, + { + "epoch": 0.6819692339681349, + "grad_norm": 0.31837767362594604, + "learning_rate": 4.589399608363745e-06, + "loss": 0.2079, + "step": 36734 + }, + { + "epoch": 0.6820063641055536, + "grad_norm": 0.33453434705734253, + "learning_rate": 4.5884186452308865e-06, + "loss": 0.1781, + "step": 36736 + }, + { + "epoch": 0.6820434942429722, + "grad_norm": 0.29054832458496094, + "learning_rate": 4.587437755733438e-06, + "loss": 0.2355, + "step": 36738 + }, + { + "epoch": 0.6820806243803909, + "grad_norm": 0.35468605160713196, + "learning_rate": 4.586456939884748e-06, + "loss": 0.2837, + "step": 36740 + }, + { + "epoch": 0.6821177545178094, + "grad_norm": 0.2772906720638275, + "learning_rate": 4.585476197698169e-06, + "loss": 0.1072, + "step": 36742 + }, + { + "epoch": 0.6821548846552281, + "grad_norm": 0.45635318756103516, + "learning_rate": 4.584495529187035e-06, + "loss": 0.3416, + "step": 36744 + }, + { + "epoch": 0.6821920147926468, + "grad_norm": 0.3737885355949402, + "learning_rate": 4.583514934364696e-06, + "loss": 0.228, + "step": 36746 + }, + { + "epoch": 0.6822291449300654, + "grad_norm": 0.4764951765537262, + "learning_rate": 4.582534413244499e-06, + "loss": 0.3466, + "step": 36748 + }, + { + "epoch": 0.682266275067484, + "grad_norm": 0.4684264063835144, + "learning_rate": 4.5815539658397765e-06, + "loss": 0.3251, + "step": 36750 + }, + { + "epoch": 0.6823034052049026, + "grad_norm": 0.2644585967063904, + "learning_rate": 4.580573592163876e-06, + "loss": 0.2619, + "step": 36752 + }, + { + "epoch": 0.6823405353423213, + "grad_norm": 0.3051812946796417, + "learning_rate": 4.579593292230135e-06, + "loss": 0.2618, + "step": 36754 + }, + { + "epoch": 0.68237766547974, + "grad_norm": 0.3321031928062439, + "learning_rate": 4.578613066051894e-06, + "loss": 0.2601, + "step": 36756 + }, + { + "epoch": 0.6824147956171586, + "grad_norm": 0.29542508721351624, + "learning_rate": 4.577632913642489e-06, + "loss": 0.2886, + "step": 36758 + }, + { + "epoch": 0.6824519257545772, + "grad_norm": 0.6704930663108826, + "learning_rate": 4.5766528350152625e-06, + "loss": 0.2636, + "step": 36760 + }, + { + "epoch": 0.6824890558919958, + "grad_norm": 0.3154360353946686, + "learning_rate": 4.575672830183542e-06, + "loss": 0.2514, + "step": 36762 + }, + { + "epoch": 0.6825261860294145, + "grad_norm": 0.5168063044548035, + "learning_rate": 4.574692899160671e-06, + "loss": 0.319, + "step": 36764 + }, + { + "epoch": 0.6825633161668331, + "grad_norm": 0.3341941833496094, + "learning_rate": 4.573713041959975e-06, + "loss": 0.4922, + "step": 36766 + }, + { + "epoch": 0.6826004463042518, + "grad_norm": 0.48993566632270813, + "learning_rate": 4.57273325859479e-06, + "loss": 0.2922, + "step": 36768 + }, + { + "epoch": 0.6826375764416704, + "grad_norm": 0.22482529282569885, + "learning_rate": 4.5717535490784505e-06, + "loss": 0.2624, + "step": 36770 + }, + { + "epoch": 0.682674706579089, + "grad_norm": 0.5108135342597961, + "learning_rate": 4.570773913424285e-06, + "loss": 0.2064, + "step": 36772 + }, + { + "epoch": 0.6827118367165077, + "grad_norm": 0.6251305341720581, + "learning_rate": 4.5697943516456265e-06, + "loss": 0.3208, + "step": 36774 + }, + { + "epoch": 0.6827489668539263, + "grad_norm": 0.3603210151195526, + "learning_rate": 4.5688148637557995e-06, + "loss": 0.2277, + "step": 36776 + }, + { + "epoch": 0.682786096991345, + "grad_norm": 0.38427647948265076, + "learning_rate": 4.567835449768133e-06, + "loss": 0.4129, + "step": 36778 + }, + { + "epoch": 0.6828232271287636, + "grad_norm": 0.2830137312412262, + "learning_rate": 4.5668561096959555e-06, + "loss": 0.2281, + "step": 36780 + }, + { + "epoch": 0.6828603572661822, + "grad_norm": 0.40049147605895996, + "learning_rate": 4.565876843552595e-06, + "loss": 0.4461, + "step": 36782 + }, + { + "epoch": 0.6828974874036009, + "grad_norm": 0.2534545660018921, + "learning_rate": 4.564897651351371e-06, + "loss": 0.3604, + "step": 36784 + }, + { + "epoch": 0.6829346175410195, + "grad_norm": 0.25542131066322327, + "learning_rate": 4.563918533105612e-06, + "loss": 0.3382, + "step": 36786 + }, + { + "epoch": 0.6829717476784382, + "grad_norm": 0.427701473236084, + "learning_rate": 4.562939488828637e-06, + "loss": 0.185, + "step": 36788 + }, + { + "epoch": 0.6830088778158568, + "grad_norm": 0.3095892071723938, + "learning_rate": 4.561960518533769e-06, + "loss": 0.198, + "step": 36790 + }, + { + "epoch": 0.6830460079532754, + "grad_norm": 0.2594239413738251, + "learning_rate": 4.56098162223433e-06, + "loss": 0.2297, + "step": 36792 + }, + { + "epoch": 0.6830831380906941, + "grad_norm": 0.31070205569267273, + "learning_rate": 4.560002799943638e-06, + "loss": 0.328, + "step": 36794 + }, + { + "epoch": 0.6831202682281127, + "grad_norm": 0.35155758261680603, + "learning_rate": 4.559024051675014e-06, + "loss": 0.271, + "step": 36796 + }, + { + "epoch": 0.6831573983655314, + "grad_norm": 0.3239842653274536, + "learning_rate": 4.558045377441774e-06, + "loss": 0.3507, + "step": 36798 + }, + { + "epoch": 0.68319452850295, + "grad_norm": 0.5962363481521606, + "learning_rate": 4.557066777257242e-06, + "loss": 0.1329, + "step": 36800 + }, + { + "epoch": 0.6832316586403686, + "grad_norm": 0.3601365089416504, + "learning_rate": 4.556088251134722e-06, + "loss": 0.3814, + "step": 36802 + }, + { + "epoch": 0.6832687887777873, + "grad_norm": 0.4445032775402069, + "learning_rate": 4.5551097990875395e-06, + "loss": 0.2941, + "step": 36804 + }, + { + "epoch": 0.6833059189152059, + "grad_norm": 0.360127329826355, + "learning_rate": 4.554131421128998e-06, + "loss": 0.2534, + "step": 36806 + }, + { + "epoch": 0.6833430490526246, + "grad_norm": 0.5423081517219543, + "learning_rate": 4.553153117272417e-06, + "loss": 0.2391, + "step": 36808 + }, + { + "epoch": 0.6833801791900432, + "grad_norm": 0.33139416575431824, + "learning_rate": 4.552174887531105e-06, + "loss": 0.2029, + "step": 36810 + }, + { + "epoch": 0.6834173093274618, + "grad_norm": 0.31729045510292053, + "learning_rate": 4.55119673191838e-06, + "loss": 0.3296, + "step": 36812 + }, + { + "epoch": 0.6834544394648805, + "grad_norm": 0.3390863537788391, + "learning_rate": 4.550218650447543e-06, + "loss": 0.3025, + "step": 36814 + }, + { + "epoch": 0.6834915696022991, + "grad_norm": 0.6686185002326965, + "learning_rate": 4.549240643131906e-06, + "loss": 0.327, + "step": 36816 + }, + { + "epoch": 0.6835286997397177, + "grad_norm": 0.29599666595458984, + "learning_rate": 4.548262709984777e-06, + "loss": 0.2591, + "step": 36818 + }, + { + "epoch": 0.6835658298771363, + "grad_norm": 0.5164443850517273, + "learning_rate": 4.547284851019461e-06, + "loss": 0.1666, + "step": 36820 + }, + { + "epoch": 0.683602960014555, + "grad_norm": 0.28643471002578735, + "learning_rate": 4.546307066249267e-06, + "loss": 0.284, + "step": 36822 + }, + { + "epoch": 0.6836400901519737, + "grad_norm": 0.3506264090538025, + "learning_rate": 4.545329355687501e-06, + "loss": 0.3737, + "step": 36824 + }, + { + "epoch": 0.6836772202893923, + "grad_norm": 0.3941076397895813, + "learning_rate": 4.5443517193474605e-06, + "loss": 0.2529, + "step": 36826 + }, + { + "epoch": 0.683714350426811, + "grad_norm": 0.37025681138038635, + "learning_rate": 4.543374157242456e-06, + "loss": 0.2857, + "step": 36828 + }, + { + "epoch": 0.6837514805642295, + "grad_norm": 0.3864564895629883, + "learning_rate": 4.54239666938578e-06, + "loss": 0.3116, + "step": 36830 + }, + { + "epoch": 0.6837886107016482, + "grad_norm": 0.844323992729187, + "learning_rate": 4.541419255790739e-06, + "loss": 0.5784, + "step": 36832 + }, + { + "epoch": 0.6838257408390669, + "grad_norm": 0.5115868449211121, + "learning_rate": 4.540441916470631e-06, + "loss": 0.2487, + "step": 36834 + }, + { + "epoch": 0.6838628709764855, + "grad_norm": 0.17546138167381287, + "learning_rate": 4.539464651438754e-06, + "loss": 0.088, + "step": 36836 + }, + { + "epoch": 0.6839000011139041, + "grad_norm": 0.38474082946777344, + "learning_rate": 4.5384874607084114e-06, + "loss": 0.1687, + "step": 36838 + }, + { + "epoch": 0.6839371312513227, + "grad_norm": 0.6710985898971558, + "learning_rate": 4.5375103442928926e-06, + "loss": 0.3016, + "step": 36840 + }, + { + "epoch": 0.6839742613887414, + "grad_norm": 0.4241836965084076, + "learning_rate": 4.536533302205496e-06, + "loss": 0.2908, + "step": 36842 + }, + { + "epoch": 0.6840113915261601, + "grad_norm": 0.49030038714408875, + "learning_rate": 4.535556334459516e-06, + "loss": 0.116, + "step": 36844 + }, + { + "epoch": 0.6840485216635787, + "grad_norm": 0.33132204413414, + "learning_rate": 4.534579441068249e-06, + "loss": 0.0841, + "step": 36846 + }, + { + "epoch": 0.6840856518009973, + "grad_norm": 0.5838752388954163, + "learning_rate": 4.533602622044981e-06, + "loss": 0.2382, + "step": 36848 + }, + { + "epoch": 0.6841227819384159, + "grad_norm": 0.2662561237812042, + "learning_rate": 4.532625877403013e-06, + "loss": 0.2316, + "step": 36850 + }, + { + "epoch": 0.6841599120758346, + "grad_norm": 0.3253823220729828, + "learning_rate": 4.531649207155625e-06, + "loss": 0.2387, + "step": 36852 + }, + { + "epoch": 0.6841970422132533, + "grad_norm": 0.4482451379299164, + "learning_rate": 4.530672611316112e-06, + "loss": 0.2095, + "step": 36854 + }, + { + "epoch": 0.6842341723506719, + "grad_norm": 0.4497491121292114, + "learning_rate": 4.529696089897762e-06, + "loss": 0.3685, + "step": 36856 + }, + { + "epoch": 0.6842713024880905, + "grad_norm": 0.31457987427711487, + "learning_rate": 4.528719642913863e-06, + "loss": 0.3079, + "step": 36858 + }, + { + "epoch": 0.6843084326255091, + "grad_norm": 0.40683087706565857, + "learning_rate": 4.5277432703777025e-06, + "loss": 0.1596, + "step": 36860 + }, + { + "epoch": 0.6843455627629278, + "grad_norm": 0.5581992864608765, + "learning_rate": 4.5267669723025674e-06, + "loss": 0.3008, + "step": 36862 + }, + { + "epoch": 0.6843826929003465, + "grad_norm": 0.3747551441192627, + "learning_rate": 4.525790748701736e-06, + "loss": 0.3854, + "step": 36864 + }, + { + "epoch": 0.684419823037765, + "grad_norm": 0.18551938235759735, + "learning_rate": 4.524814599588495e-06, + "loss": 0.1623, + "step": 36866 + }, + { + "epoch": 0.6844569531751837, + "grad_norm": 0.3616929054260254, + "learning_rate": 4.523838524976132e-06, + "loss": 0.3776, + "step": 36868 + }, + { + "epoch": 0.6844940833126023, + "grad_norm": 0.3146587908267975, + "learning_rate": 4.522862524877919e-06, + "loss": 0.3231, + "step": 36870 + }, + { + "epoch": 0.684531213450021, + "grad_norm": 0.28498008847236633, + "learning_rate": 4.52188659930714e-06, + "loss": 0.2179, + "step": 36872 + }, + { + "epoch": 0.6845683435874396, + "grad_norm": 0.6642209887504578, + "learning_rate": 4.520910748277081e-06, + "loss": 0.3281, + "step": 36874 + }, + { + "epoch": 0.6846054737248582, + "grad_norm": 0.2693619430065155, + "learning_rate": 4.519934971801011e-06, + "loss": 0.2749, + "step": 36876 + }, + { + "epoch": 0.6846426038622769, + "grad_norm": 0.3265398144721985, + "learning_rate": 4.518959269892211e-06, + "loss": 0.3605, + "step": 36878 + }, + { + "epoch": 0.6846797339996955, + "grad_norm": 0.3953653573989868, + "learning_rate": 4.5179836425639574e-06, + "loss": 0.3328, + "step": 36880 + }, + { + "epoch": 0.6847168641371142, + "grad_norm": 0.41256698966026306, + "learning_rate": 4.517008089829527e-06, + "loss": 0.4427, + "step": 36882 + }, + { + "epoch": 0.6847539942745328, + "grad_norm": 0.4884564280509949, + "learning_rate": 4.516032611702192e-06, + "loss": 0.3233, + "step": 36884 + }, + { + "epoch": 0.6847911244119514, + "grad_norm": 0.30782806873321533, + "learning_rate": 4.515057208195227e-06, + "loss": 0.3393, + "step": 36886 + }, + { + "epoch": 0.6848282545493701, + "grad_norm": 0.4034641981124878, + "learning_rate": 4.514081879321909e-06, + "loss": 0.1036, + "step": 36888 + }, + { + "epoch": 0.6848653846867887, + "grad_norm": 0.24981912970542908, + "learning_rate": 4.513106625095503e-06, + "loss": 0.036, + "step": 36890 + }, + { + "epoch": 0.6849025148242074, + "grad_norm": 0.46677884459495544, + "learning_rate": 4.512131445529278e-06, + "loss": 0.2502, + "step": 36892 + }, + { + "epoch": 0.684939644961626, + "grad_norm": 0.26096880435943604, + "learning_rate": 4.511156340636504e-06, + "loss": 0.3246, + "step": 36894 + }, + { + "epoch": 0.6849767750990446, + "grad_norm": 0.44398149847984314, + "learning_rate": 4.510181310430453e-06, + "loss": 0.519, + "step": 36896 + }, + { + "epoch": 0.6850139052364633, + "grad_norm": 0.44387316703796387, + "learning_rate": 4.509206354924389e-06, + "loss": 0.3591, + "step": 36898 + }, + { + "epoch": 0.6850510353738819, + "grad_norm": 0.25748056173324585, + "learning_rate": 4.508231474131585e-06, + "loss": 0.1555, + "step": 36900 + }, + { + "epoch": 0.6850881655113006, + "grad_norm": 0.3737777769565582, + "learning_rate": 4.507256668065296e-06, + "loss": 0.377, + "step": 36902 + }, + { + "epoch": 0.6851252956487192, + "grad_norm": 0.3655168414115906, + "learning_rate": 4.506281936738792e-06, + "loss": 0.3959, + "step": 36904 + }, + { + "epoch": 0.6851624257861378, + "grad_norm": 0.5662994980812073, + "learning_rate": 4.505307280165335e-06, + "loss": 0.2217, + "step": 36906 + }, + { + "epoch": 0.6851995559235565, + "grad_norm": 0.48609688878059387, + "learning_rate": 4.504332698358187e-06, + "loss": 0.4649, + "step": 36908 + }, + { + "epoch": 0.6852366860609751, + "grad_norm": 0.33742737770080566, + "learning_rate": 4.5033581913306135e-06, + "loss": 0.2685, + "step": 36910 + }, + { + "epoch": 0.6852738161983938, + "grad_norm": 0.3411497175693512, + "learning_rate": 4.502383759095871e-06, + "loss": 0.3153, + "step": 36912 + }, + { + "epoch": 0.6853109463358124, + "grad_norm": 0.564539909362793, + "learning_rate": 4.501409401667214e-06, + "loss": 0.2923, + "step": 36914 + }, + { + "epoch": 0.685348076473231, + "grad_norm": 0.31454044580459595, + "learning_rate": 4.500435119057905e-06, + "loss": 0.4349, + "step": 36916 + }, + { + "epoch": 0.6853852066106496, + "grad_norm": 0.49167755246162415, + "learning_rate": 4.499460911281201e-06, + "loss": 0.2544, + "step": 36918 + }, + { + "epoch": 0.6854223367480683, + "grad_norm": 0.4777070879936218, + "learning_rate": 4.4984867783503586e-06, + "loss": 0.1857, + "step": 36920 + }, + { + "epoch": 0.685459466885487, + "grad_norm": 0.48895907402038574, + "learning_rate": 4.497512720278631e-06, + "loss": 0.3379, + "step": 36922 + }, + { + "epoch": 0.6854965970229056, + "grad_norm": 0.28372129797935486, + "learning_rate": 4.496538737079273e-06, + "loss": 0.3508, + "step": 36924 + }, + { + "epoch": 0.6855337271603242, + "grad_norm": 0.19507038593292236, + "learning_rate": 4.495564828765544e-06, + "loss": 0.2041, + "step": 36926 + }, + { + "epoch": 0.6855708572977428, + "grad_norm": 0.43964684009552, + "learning_rate": 4.494590995350685e-06, + "loss": 0.2629, + "step": 36928 + }, + { + "epoch": 0.6856079874351615, + "grad_norm": 0.2153591513633728, + "learning_rate": 4.493617236847951e-06, + "loss": 0.0253, + "step": 36930 + }, + { + "epoch": 0.6856451175725802, + "grad_norm": 0.3572542369365692, + "learning_rate": 4.492643553270598e-06, + "loss": 0.1542, + "step": 36932 + }, + { + "epoch": 0.6856822477099987, + "grad_norm": 0.5211487412452698, + "learning_rate": 4.491669944631865e-06, + "loss": 0.1944, + "step": 36934 + }, + { + "epoch": 0.6857193778474174, + "grad_norm": 0.22446173429489136, + "learning_rate": 4.490696410945005e-06, + "loss": 0.1699, + "step": 36936 + }, + { + "epoch": 0.685756507984836, + "grad_norm": 0.48938754200935364, + "learning_rate": 4.48972295222327e-06, + "loss": 0.3704, + "step": 36938 + }, + { + "epoch": 0.6857936381222547, + "grad_norm": 0.3289289176464081, + "learning_rate": 4.488749568479894e-06, + "loss": 0.2067, + "step": 36940 + }, + { + "epoch": 0.6858307682596734, + "grad_norm": 0.2854820489883423, + "learning_rate": 4.487776259728131e-06, + "loss": 0.1188, + "step": 36942 + }, + { + "epoch": 0.685867898397092, + "grad_norm": 0.34336429834365845, + "learning_rate": 4.48680302598122e-06, + "loss": 0.3619, + "step": 36944 + }, + { + "epoch": 0.6859050285345106, + "grad_norm": 0.43612515926361084, + "learning_rate": 4.485829867252407e-06, + "loss": 0.4179, + "step": 36946 + }, + { + "epoch": 0.6859421586719292, + "grad_norm": 0.5785712599754333, + "learning_rate": 4.484856783554932e-06, + "loss": 0.1893, + "step": 36948 + }, + { + "epoch": 0.6859792888093479, + "grad_norm": 0.4804275631904602, + "learning_rate": 4.48388377490204e-06, + "loss": 0.2878, + "step": 36950 + }, + { + "epoch": 0.6860164189467666, + "grad_norm": 0.4517059922218323, + "learning_rate": 4.482910841306964e-06, + "loss": 0.2752, + "step": 36952 + }, + { + "epoch": 0.6860535490841851, + "grad_norm": 0.47712597250938416, + "learning_rate": 4.481937982782949e-06, + "loss": 0.3416, + "step": 36954 + }, + { + "epoch": 0.6860906792216038, + "grad_norm": 0.5473611354827881, + "learning_rate": 4.480965199343226e-06, + "loss": 0.263, + "step": 36956 + }, + { + "epoch": 0.6861278093590224, + "grad_norm": 0.4020240306854248, + "learning_rate": 4.479992491001035e-06, + "loss": 0.1193, + "step": 36958 + }, + { + "epoch": 0.6861649394964411, + "grad_norm": 0.48071375489234924, + "learning_rate": 4.4790198577696134e-06, + "loss": 0.3832, + "step": 36960 + }, + { + "epoch": 0.6862020696338598, + "grad_norm": 0.3266705572605133, + "learning_rate": 4.4780472996621925e-06, + "loss": 0.3906, + "step": 36962 + }, + { + "epoch": 0.6862391997712783, + "grad_norm": 0.4404544234275818, + "learning_rate": 4.4770748166920115e-06, + "loss": 0.3278, + "step": 36964 + }, + { + "epoch": 0.686276329908697, + "grad_norm": 0.19739089906215668, + "learning_rate": 4.476102408872296e-06, + "loss": 0.2208, + "step": 36966 + }, + { + "epoch": 0.6863134600461156, + "grad_norm": 0.3867587447166443, + "learning_rate": 4.4751300762162805e-06, + "loss": 0.2956, + "step": 36968 + }, + { + "epoch": 0.6863505901835343, + "grad_norm": 0.2025323510169983, + "learning_rate": 4.474157818737197e-06, + "loss": 0.2535, + "step": 36970 + }, + { + "epoch": 0.6863877203209529, + "grad_norm": 0.42796045541763306, + "learning_rate": 4.473185636448276e-06, + "loss": 0.0804, + "step": 36972 + }, + { + "epoch": 0.6864248504583715, + "grad_norm": 0.4834670126438141, + "learning_rate": 4.47221352936274e-06, + "loss": 0.4108, + "step": 36974 + }, + { + "epoch": 0.6864619805957902, + "grad_norm": 0.7377327084541321, + "learning_rate": 4.471241497493824e-06, + "loss": 0.1303, + "step": 36976 + }, + { + "epoch": 0.6864991107332088, + "grad_norm": 0.3907416760921478, + "learning_rate": 4.470269540854748e-06, + "loss": 0.2058, + "step": 36978 + }, + { + "epoch": 0.6865362408706275, + "grad_norm": 0.2603280544281006, + "learning_rate": 4.46929765945874e-06, + "loss": 0.2996, + "step": 36980 + }, + { + "epoch": 0.686573371008046, + "grad_norm": 0.5800443291664124, + "learning_rate": 4.468325853319024e-06, + "loss": 0.3388, + "step": 36982 + }, + { + "epoch": 0.6866105011454647, + "grad_norm": 0.5566034913063049, + "learning_rate": 4.467354122448824e-06, + "loss": 0.2598, + "step": 36984 + }, + { + "epoch": 0.6866476312828834, + "grad_norm": 0.39501988887786865, + "learning_rate": 4.466382466861362e-06, + "loss": 0.2347, + "step": 36986 + }, + { + "epoch": 0.686684761420302, + "grad_norm": 0.5609928369522095, + "learning_rate": 4.465410886569863e-06, + "loss": 0.2663, + "step": 36988 + }, + { + "epoch": 0.6867218915577207, + "grad_norm": 0.3192872107028961, + "learning_rate": 4.46443938158754e-06, + "loss": 0.3682, + "step": 36990 + }, + { + "epoch": 0.6867590216951392, + "grad_norm": 0.27095746994018555, + "learning_rate": 4.463467951927617e-06, + "loss": 0.2927, + "step": 36992 + }, + { + "epoch": 0.6867961518325579, + "grad_norm": 0.5867243409156799, + "learning_rate": 4.4624965976033105e-06, + "loss": 0.1791, + "step": 36994 + }, + { + "epoch": 0.6868332819699766, + "grad_norm": 0.1843239665031433, + "learning_rate": 4.461525318627843e-06, + "loss": 0.2847, + "step": 36996 + }, + { + "epoch": 0.6868704121073952, + "grad_norm": 0.39618611335754395, + "learning_rate": 4.460554115014421e-06, + "loss": 0.2212, + "step": 36998 + }, + { + "epoch": 0.6869075422448139, + "grad_norm": 0.4728761911392212, + "learning_rate": 4.4595829867762705e-06, + "loss": 0.2755, + "step": 37000 + }, + { + "epoch": 0.6869446723822324, + "grad_norm": 0.39870527386665344, + "learning_rate": 4.4586119339265955e-06, + "loss": 0.3097, + "step": 37002 + }, + { + "epoch": 0.6869818025196511, + "grad_norm": 0.2768265902996063, + "learning_rate": 4.457640956478614e-06, + "loss": 0.1313, + "step": 37004 + }, + { + "epoch": 0.6870189326570698, + "grad_norm": 0.44751498103141785, + "learning_rate": 4.456670054445538e-06, + "loss": 0.2012, + "step": 37006 + }, + { + "epoch": 0.6870560627944884, + "grad_norm": 0.7260276079177856, + "learning_rate": 4.455699227840577e-06, + "loss": 0.3053, + "step": 37008 + }, + { + "epoch": 0.6870931929319071, + "grad_norm": 0.3138602077960968, + "learning_rate": 4.454728476676944e-06, + "loss": 0.1375, + "step": 37010 + }, + { + "epoch": 0.6871303230693256, + "grad_norm": 0.35546284914016724, + "learning_rate": 4.453757800967845e-06, + "loss": 0.2777, + "step": 37012 + }, + { + "epoch": 0.6871674532067443, + "grad_norm": 0.2587350308895111, + "learning_rate": 4.452787200726495e-06, + "loss": 0.2965, + "step": 37014 + }, + { + "epoch": 0.687204583344163, + "grad_norm": 0.35835838317871094, + "learning_rate": 4.451816675966089e-06, + "loss": 0.2871, + "step": 37016 + }, + { + "epoch": 0.6872417134815816, + "grad_norm": 0.3472989499568939, + "learning_rate": 4.450846226699845e-06, + "loss": 0.2521, + "step": 37018 + }, + { + "epoch": 0.6872788436190003, + "grad_norm": 0.4447984993457794, + "learning_rate": 4.449875852940959e-06, + "loss": 0.1473, + "step": 37020 + }, + { + "epoch": 0.6873159737564188, + "grad_norm": 0.3388555347919464, + "learning_rate": 4.448905554702637e-06, + "loss": 0.1882, + "step": 37022 + }, + { + "epoch": 0.6873531038938375, + "grad_norm": 0.27160486578941345, + "learning_rate": 4.447935331998083e-06, + "loss": 0.3541, + "step": 37024 + }, + { + "epoch": 0.6873902340312561, + "grad_norm": 0.35241034626960754, + "learning_rate": 4.446965184840503e-06, + "loss": 0.2386, + "step": 37026 + }, + { + "epoch": 0.6874273641686748, + "grad_norm": 0.29288753867149353, + "learning_rate": 4.445995113243089e-06, + "loss": 0.1509, + "step": 37028 + }, + { + "epoch": 0.6874644943060935, + "grad_norm": 0.3887053430080414, + "learning_rate": 4.445025117219046e-06, + "loss": 0.3515, + "step": 37030 + }, + { + "epoch": 0.687501624443512, + "grad_norm": 0.34022843837738037, + "learning_rate": 4.444055196781572e-06, + "loss": 0.2832, + "step": 37032 + }, + { + "epoch": 0.6875387545809307, + "grad_norm": 0.4085644483566284, + "learning_rate": 4.443085351943865e-06, + "loss": 0.359, + "step": 37034 + }, + { + "epoch": 0.6875758847183493, + "grad_norm": 0.44787487387657166, + "learning_rate": 4.442115582719124e-06, + "loss": 0.3389, + "step": 37036 + }, + { + "epoch": 0.687613014855768, + "grad_norm": 0.2517201602458954, + "learning_rate": 4.441145889120539e-06, + "loss": 0.216, + "step": 37038 + }, + { + "epoch": 0.6876501449931867, + "grad_norm": 0.3492571711540222, + "learning_rate": 4.440176271161313e-06, + "loss": 0.222, + "step": 37040 + }, + { + "epoch": 0.6876872751306052, + "grad_norm": 0.6720255613327026, + "learning_rate": 4.43920672885463e-06, + "loss": 0.3845, + "step": 37042 + }, + { + "epoch": 0.6877244052680239, + "grad_norm": 0.36674150824546814, + "learning_rate": 4.438237262213685e-06, + "loss": 0.2585, + "step": 37044 + }, + { + "epoch": 0.6877615354054425, + "grad_norm": 0.31624552607536316, + "learning_rate": 4.437267871251674e-06, + "loss": 0.2443, + "step": 37046 + }, + { + "epoch": 0.6877986655428612, + "grad_norm": 0.37729647755622864, + "learning_rate": 4.4362985559817825e-06, + "loss": 0.2184, + "step": 37048 + }, + { + "epoch": 0.6878357956802799, + "grad_norm": 0.32645153999328613, + "learning_rate": 4.435329316417204e-06, + "loss": 0.1479, + "step": 37050 + }, + { + "epoch": 0.6878729258176984, + "grad_norm": 0.37583091855049133, + "learning_rate": 4.434360152571129e-06, + "loss": 0.2287, + "step": 37052 + }, + { + "epoch": 0.6879100559551171, + "grad_norm": 0.5464087724685669, + "learning_rate": 4.433391064456737e-06, + "loss": 0.2527, + "step": 37054 + }, + { + "epoch": 0.6879471860925357, + "grad_norm": 0.5519949793815613, + "learning_rate": 4.43242205208722e-06, + "loss": 0.2344, + "step": 37056 + }, + { + "epoch": 0.6879843162299544, + "grad_norm": 0.506227433681488, + "learning_rate": 4.431453115475764e-06, + "loss": 0.3044, + "step": 37058 + }, + { + "epoch": 0.6880214463673731, + "grad_norm": 0.43618252873420715, + "learning_rate": 4.430484254635548e-06, + "loss": 0.2465, + "step": 37060 + }, + { + "epoch": 0.6880585765047916, + "grad_norm": 0.45950615406036377, + "learning_rate": 4.429515469579759e-06, + "loss": 0.3313, + "step": 37062 + }, + { + "epoch": 0.6880957066422103, + "grad_norm": 0.6889962553977966, + "learning_rate": 4.428546760321584e-06, + "loss": 0.3683, + "step": 37064 + }, + { + "epoch": 0.6881328367796289, + "grad_norm": 0.4205383360385895, + "learning_rate": 4.427578126874193e-06, + "loss": 0.3777, + "step": 37066 + }, + { + "epoch": 0.6881699669170476, + "grad_norm": 0.32074326276779175, + "learning_rate": 4.426609569250774e-06, + "loss": 0.4353, + "step": 37068 + }, + { + "epoch": 0.6882070970544661, + "grad_norm": 0.2948293387889862, + "learning_rate": 4.425641087464504e-06, + "loss": 0.3363, + "step": 37070 + }, + { + "epoch": 0.6882442271918848, + "grad_norm": 0.25266048312187195, + "learning_rate": 4.424672681528563e-06, + "loss": 0.2227, + "step": 37072 + }, + { + "epoch": 0.6882813573293035, + "grad_norm": 0.47830379009246826, + "learning_rate": 4.4237043514561254e-06, + "loss": 0.2452, + "step": 37074 + }, + { + "epoch": 0.6883184874667221, + "grad_norm": 0.3104912042617798, + "learning_rate": 4.422736097260372e-06, + "loss": 0.2647, + "step": 37076 + }, + { + "epoch": 0.6883556176041408, + "grad_norm": 0.400849312543869, + "learning_rate": 4.42176791895447e-06, + "loss": 0.2601, + "step": 37078 + }, + { + "epoch": 0.6883927477415593, + "grad_norm": 0.3141663670539856, + "learning_rate": 4.420799816551604e-06, + "loss": 0.2259, + "step": 37080 + }, + { + "epoch": 0.688429877878978, + "grad_norm": 0.3414446711540222, + "learning_rate": 4.419831790064936e-06, + "loss": 0.095, + "step": 37082 + }, + { + "epoch": 0.6884670080163967, + "grad_norm": 0.4722117483615875, + "learning_rate": 4.418863839507642e-06, + "loss": 0.2639, + "step": 37084 + }, + { + "epoch": 0.6885041381538153, + "grad_norm": 0.29620495438575745, + "learning_rate": 4.417895964892895e-06, + "loss": 0.335, + "step": 37086 + }, + { + "epoch": 0.688541268291234, + "grad_norm": 0.6854696273803711, + "learning_rate": 4.416928166233862e-06, + "loss": 0.1591, + "step": 37088 + }, + { + "epoch": 0.6885783984286525, + "grad_norm": 0.40646234154701233, + "learning_rate": 4.415960443543718e-06, + "loss": 0.4555, + "step": 37090 + }, + { + "epoch": 0.6886155285660712, + "grad_norm": 0.32580870389938354, + "learning_rate": 4.414992796835624e-06, + "loss": 0.158, + "step": 37092 + }, + { + "epoch": 0.6886526587034899, + "grad_norm": 0.25465288758277893, + "learning_rate": 4.414025226122748e-06, + "loss": 0.3046, + "step": 37094 + }, + { + "epoch": 0.6886897888409085, + "grad_norm": 0.6603150963783264, + "learning_rate": 4.413057731418257e-06, + "loss": 0.1845, + "step": 37096 + }, + { + "epoch": 0.6887269189783272, + "grad_norm": 0.24706041812896729, + "learning_rate": 4.4120903127353145e-06, + "loss": 0.3081, + "step": 37098 + }, + { + "epoch": 0.6887640491157457, + "grad_norm": 0.2862301170825958, + "learning_rate": 4.411122970087089e-06, + "loss": 0.1914, + "step": 37100 + }, + { + "epoch": 0.6888011792531644, + "grad_norm": 0.47546929121017456, + "learning_rate": 4.410155703486736e-06, + "loss": 0.3477, + "step": 37102 + }, + { + "epoch": 0.6888383093905831, + "grad_norm": 0.36997124552726746, + "learning_rate": 4.409188512947423e-06, + "loss": 0.5391, + "step": 37104 + }, + { + "epoch": 0.6888754395280017, + "grad_norm": 0.2447068989276886, + "learning_rate": 4.408221398482305e-06, + "loss": 0.235, + "step": 37106 + }, + { + "epoch": 0.6889125696654204, + "grad_norm": 0.2989845871925354, + "learning_rate": 4.407254360104545e-06, + "loss": 0.1913, + "step": 37108 + }, + { + "epoch": 0.6889496998028389, + "grad_norm": 0.45974108576774597, + "learning_rate": 4.4062873978273e-06, + "loss": 0.3624, + "step": 37110 + }, + { + "epoch": 0.6889868299402576, + "grad_norm": 0.3065503239631653, + "learning_rate": 4.405320511663729e-06, + "loss": 0.2184, + "step": 37112 + }, + { + "epoch": 0.6890239600776763, + "grad_norm": 0.36659955978393555, + "learning_rate": 4.404353701626991e-06, + "loss": 0.0586, + "step": 37114 + }, + { + "epoch": 0.6890610902150949, + "grad_norm": 0.536537766456604, + "learning_rate": 4.4033869677302355e-06, + "loss": 0.3229, + "step": 37116 + }, + { + "epoch": 0.6890982203525136, + "grad_norm": 0.689934253692627, + "learning_rate": 4.402420309986619e-06, + "loss": 0.2022, + "step": 37118 + }, + { + "epoch": 0.6891353504899321, + "grad_norm": 0.29262688755989075, + "learning_rate": 4.401453728409295e-06, + "loss": 0.1958, + "step": 37120 + }, + { + "epoch": 0.6891724806273508, + "grad_norm": 0.3789560794830322, + "learning_rate": 4.40048722301142e-06, + "loss": 0.418, + "step": 37122 + }, + { + "epoch": 0.6892096107647694, + "grad_norm": 0.26866623759269714, + "learning_rate": 4.399520793806138e-06, + "loss": 0.1477, + "step": 37124 + }, + { + "epoch": 0.6892467409021881, + "grad_norm": 0.2915765941143036, + "learning_rate": 4.398554440806603e-06, + "loss": 0.2777, + "step": 37126 + }, + { + "epoch": 0.6892838710396068, + "grad_norm": 0.527785062789917, + "learning_rate": 4.3975881640259665e-06, + "loss": 0.1827, + "step": 37128 + }, + { + "epoch": 0.6893210011770253, + "grad_norm": 0.336327463388443, + "learning_rate": 4.396621963477371e-06, + "loss": 0.3295, + "step": 37130 + }, + { + "epoch": 0.689358131314444, + "grad_norm": 0.2956906259059906, + "learning_rate": 4.395655839173967e-06, + "loss": 0.289, + "step": 37132 + }, + { + "epoch": 0.6893952614518626, + "grad_norm": 0.33871522545814514, + "learning_rate": 4.3946897911288985e-06, + "loss": 0.4447, + "step": 37134 + }, + { + "epoch": 0.6894323915892813, + "grad_norm": 0.5412474870681763, + "learning_rate": 4.3937238193553135e-06, + "loss": 0.1601, + "step": 37136 + }, + { + "epoch": 0.6894695217267, + "grad_norm": 0.557263195514679, + "learning_rate": 4.392757923866354e-06, + "loss": 0.2278, + "step": 37138 + }, + { + "epoch": 0.6895066518641185, + "grad_norm": 0.4089566171169281, + "learning_rate": 4.391792104675167e-06, + "loss": 0.382, + "step": 37140 + }, + { + "epoch": 0.6895437820015372, + "grad_norm": 0.4293087422847748, + "learning_rate": 4.390826361794888e-06, + "loss": 0.2292, + "step": 37142 + }, + { + "epoch": 0.6895809121389558, + "grad_norm": 0.4252699911594391, + "learning_rate": 4.389860695238664e-06, + "loss": 0.2873, + "step": 37144 + }, + { + "epoch": 0.6896180422763745, + "grad_norm": 0.2242163121700287, + "learning_rate": 4.388895105019627e-06, + "loss": 0.1375, + "step": 37146 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.6254360675811768, + "learning_rate": 4.387929591150922e-06, + "loss": 0.2736, + "step": 37148 + }, + { + "epoch": 0.6896923025512117, + "grad_norm": 0.3510236442089081, + "learning_rate": 4.386964153645684e-06, + "loss": 0.2894, + "step": 37150 + }, + { + "epoch": 0.6897294326886304, + "grad_norm": 0.48587706685066223, + "learning_rate": 4.385998792517056e-06, + "loss": 0.3058, + "step": 37152 + }, + { + "epoch": 0.689766562826049, + "grad_norm": 0.45913079380989075, + "learning_rate": 4.385033507778163e-06, + "loss": 0.2292, + "step": 37154 + }, + { + "epoch": 0.6898036929634677, + "grad_norm": 0.33906930685043335, + "learning_rate": 4.3840682994421465e-06, + "loss": 0.1824, + "step": 37156 + }, + { + "epoch": 0.6898408231008863, + "grad_norm": 0.41159459948539734, + "learning_rate": 4.383103167522138e-06, + "loss": 0.346, + "step": 37158 + }, + { + "epoch": 0.6898779532383049, + "grad_norm": 0.4278871417045593, + "learning_rate": 4.382138112031271e-06, + "loss": 0.2318, + "step": 37160 + }, + { + "epoch": 0.6899150833757236, + "grad_norm": 0.39938104152679443, + "learning_rate": 4.381173132982678e-06, + "loss": 0.3812, + "step": 37162 + }, + { + "epoch": 0.6899522135131422, + "grad_norm": 0.3679214119911194, + "learning_rate": 4.380208230389491e-06, + "loss": 0.0884, + "step": 37164 + }, + { + "epoch": 0.6899893436505609, + "grad_norm": 0.35381996631622314, + "learning_rate": 4.3792434042648365e-06, + "loss": 0.2069, + "step": 37166 + }, + { + "epoch": 0.6900264737879795, + "grad_norm": 0.24392426013946533, + "learning_rate": 4.378278654621841e-06, + "loss": 0.1493, + "step": 37168 + }, + { + "epoch": 0.6900636039253981, + "grad_norm": 0.4757721722126007, + "learning_rate": 4.377313981473632e-06, + "loss": 0.0844, + "step": 37170 + }, + { + "epoch": 0.6901007340628168, + "grad_norm": 0.4118558168411255, + "learning_rate": 4.3763493848333385e-06, + "loss": 0.4053, + "step": 37172 + }, + { + "epoch": 0.6901378642002354, + "grad_norm": 0.24378761649131775, + "learning_rate": 4.375384864714085e-06, + "loss": 0.3587, + "step": 37174 + }, + { + "epoch": 0.6901749943376541, + "grad_norm": 0.46597692370414734, + "learning_rate": 4.374420421128997e-06, + "loss": 0.2149, + "step": 37176 + }, + { + "epoch": 0.6902121244750726, + "grad_norm": 0.5553494095802307, + "learning_rate": 4.373456054091199e-06, + "loss": 0.1589, + "step": 37178 + }, + { + "epoch": 0.6902492546124913, + "grad_norm": 0.2945960760116577, + "learning_rate": 4.372491763613807e-06, + "loss": 0.3541, + "step": 37180 + }, + { + "epoch": 0.69028638474991, + "grad_norm": 0.40553995966911316, + "learning_rate": 4.371527549709945e-06, + "loss": 0.3902, + "step": 37182 + }, + { + "epoch": 0.6903235148873286, + "grad_norm": 0.35749179124832153, + "learning_rate": 4.370563412392735e-06, + "loss": 0.1027, + "step": 37184 + }, + { + "epoch": 0.6903606450247473, + "grad_norm": 0.5515786409378052, + "learning_rate": 4.369599351675299e-06, + "loss": 0.1691, + "step": 37186 + }, + { + "epoch": 0.6903977751621658, + "grad_norm": 0.5842152833938599, + "learning_rate": 4.368635367570746e-06, + "loss": 0.3875, + "step": 37188 + }, + { + "epoch": 0.6904349052995845, + "grad_norm": 0.41106826066970825, + "learning_rate": 4.367671460092202e-06, + "loss": 0.2702, + "step": 37190 + }, + { + "epoch": 0.6904720354370032, + "grad_norm": 0.33555296063423157, + "learning_rate": 4.366707629252775e-06, + "loss": 0.2354, + "step": 37192 + }, + { + "epoch": 0.6905091655744218, + "grad_norm": 0.2797924876213074, + "learning_rate": 4.365743875065584e-06, + "loss": 0.1641, + "step": 37194 + }, + { + "epoch": 0.6905462957118405, + "grad_norm": 0.6033901572227478, + "learning_rate": 4.364780197543741e-06, + "loss": 0.3995, + "step": 37196 + }, + { + "epoch": 0.690583425849259, + "grad_norm": 0.5848039984703064, + "learning_rate": 4.363816596700361e-06, + "loss": 0.3406, + "step": 37198 + }, + { + "epoch": 0.6906205559866777, + "grad_norm": 0.5522786974906921, + "learning_rate": 4.362853072548554e-06, + "loss": 0.2111, + "step": 37200 + }, + { + "epoch": 0.6906576861240964, + "grad_norm": 0.5165181159973145, + "learning_rate": 4.361889625101435e-06, + "loss": 0.2292, + "step": 37202 + }, + { + "epoch": 0.690694816261515, + "grad_norm": 0.44205108284950256, + "learning_rate": 4.360926254372107e-06, + "loss": 0.2782, + "step": 37204 + }, + { + "epoch": 0.6907319463989336, + "grad_norm": 0.29006579518318176, + "learning_rate": 4.359962960373682e-06, + "loss": 0.2812, + "step": 37206 + }, + { + "epoch": 0.6907690765363522, + "grad_norm": 0.46617984771728516, + "learning_rate": 4.358999743119271e-06, + "loss": 0.2751, + "step": 37208 + }, + { + "epoch": 0.6908062066737709, + "grad_norm": 0.5370455980300903, + "learning_rate": 4.358036602621974e-06, + "loss": 0.3863, + "step": 37210 + }, + { + "epoch": 0.6908433368111896, + "grad_norm": 0.6497635841369629, + "learning_rate": 4.357073538894897e-06, + "loss": 0.2368, + "step": 37212 + }, + { + "epoch": 0.6908804669486082, + "grad_norm": 0.5513054728507996, + "learning_rate": 4.356110551951148e-06, + "loss": 0.1863, + "step": 37214 + }, + { + "epoch": 0.6909175970860268, + "grad_norm": 0.4180803596973419, + "learning_rate": 4.355147641803832e-06, + "loss": 0.3554, + "step": 37216 + }, + { + "epoch": 0.6909547272234454, + "grad_norm": 0.41346290707588196, + "learning_rate": 4.354184808466046e-06, + "loss": 0.2747, + "step": 37218 + }, + { + "epoch": 0.6909918573608641, + "grad_norm": 0.3466801643371582, + "learning_rate": 4.353222051950892e-06, + "loss": 0.1473, + "step": 37220 + }, + { + "epoch": 0.6910289874982827, + "grad_norm": 0.29186904430389404, + "learning_rate": 4.352259372271473e-06, + "loss": 0.2756, + "step": 37222 + }, + { + "epoch": 0.6910661176357014, + "grad_norm": 0.39038529992103577, + "learning_rate": 4.351296769440885e-06, + "loss": 0.409, + "step": 37224 + }, + { + "epoch": 0.69110324777312, + "grad_norm": 0.48075252771377563, + "learning_rate": 4.350334243472233e-06, + "loss": 0.1707, + "step": 37226 + }, + { + "epoch": 0.6911403779105386, + "grad_norm": 0.43311798572540283, + "learning_rate": 4.349371794378606e-06, + "loss": 0.1274, + "step": 37228 + }, + { + "epoch": 0.6911775080479573, + "grad_norm": 0.3471451997756958, + "learning_rate": 4.348409422173107e-06, + "loss": 0.2148, + "step": 37230 + }, + { + "epoch": 0.6912146381853759, + "grad_norm": 2.039639949798584, + "learning_rate": 4.347447126868822e-06, + "loss": 0.1433, + "step": 37232 + }, + { + "epoch": 0.6912517683227946, + "grad_norm": 0.42357128858566284, + "learning_rate": 4.346484908478852e-06, + "loss": 0.2106, + "step": 37234 + }, + { + "epoch": 0.6912888984602132, + "grad_norm": 0.4580634832382202, + "learning_rate": 4.345522767016288e-06, + "loss": 0.3317, + "step": 37236 + }, + { + "epoch": 0.6913260285976318, + "grad_norm": 0.2992669939994812, + "learning_rate": 4.3445607024942215e-06, + "loss": 0.3064, + "step": 37238 + }, + { + "epoch": 0.6913631587350505, + "grad_norm": 0.42594772577285767, + "learning_rate": 4.343598714925747e-06, + "loss": 0.1817, + "step": 37240 + }, + { + "epoch": 0.6914002888724691, + "grad_norm": 0.42388495802879333, + "learning_rate": 4.342636804323948e-06, + "loss": 0.2835, + "step": 37242 + }, + { + "epoch": 0.6914374190098878, + "grad_norm": 0.2760024666786194, + "learning_rate": 4.3416749707019175e-06, + "loss": 0.3488, + "step": 37244 + }, + { + "epoch": 0.6914745491473064, + "grad_norm": 0.4250372648239136, + "learning_rate": 4.340713214072741e-06, + "loss": 0.4974, + "step": 37246 + }, + { + "epoch": 0.691511679284725, + "grad_norm": 0.9084572196006775, + "learning_rate": 4.339751534449505e-06, + "loss": 0.1477, + "step": 37248 + }, + { + "epoch": 0.6915488094221437, + "grad_norm": 0.47688087821006775, + "learning_rate": 4.338789931845302e-06, + "loss": 0.1786, + "step": 37250 + }, + { + "epoch": 0.6915859395595623, + "grad_norm": 0.3504839241504669, + "learning_rate": 4.3378284062732055e-06, + "loss": 0.2847, + "step": 37252 + }, + { + "epoch": 0.691623069696981, + "grad_norm": 0.3641328513622284, + "learning_rate": 4.336866957746308e-06, + "loss": 0.2366, + "step": 37254 + }, + { + "epoch": 0.6916601998343996, + "grad_norm": 0.43362095952033997, + "learning_rate": 4.335905586277685e-06, + "loss": 0.3013, + "step": 37256 + }, + { + "epoch": 0.6916973299718182, + "grad_norm": 0.21120557188987732, + "learning_rate": 4.334944291880422e-06, + "loss": 0.1342, + "step": 37258 + }, + { + "epoch": 0.6917344601092369, + "grad_norm": 0.5634955763816833, + "learning_rate": 4.333983074567597e-06, + "loss": 0.1717, + "step": 37260 + }, + { + "epoch": 0.6917715902466555, + "grad_norm": 0.46009761095046997, + "learning_rate": 4.333021934352292e-06, + "loss": 0.4335, + "step": 37262 + }, + { + "epoch": 0.6918087203840741, + "grad_norm": 2.8798201084136963, + "learning_rate": 4.332060871247582e-06, + "loss": 0.1577, + "step": 37264 + }, + { + "epoch": 0.6918458505214928, + "grad_norm": 0.40647372603416443, + "learning_rate": 4.3310998852665505e-06, + "loss": 0.2408, + "step": 37266 + }, + { + "epoch": 0.6918829806589114, + "grad_norm": 0.4678947627544403, + "learning_rate": 4.330138976422266e-06, + "loss": 0.3207, + "step": 37268 + }, + { + "epoch": 0.6919201107963301, + "grad_norm": 0.3865966200828552, + "learning_rate": 4.329178144727807e-06, + "loss": 0.4162, + "step": 37270 + }, + { + "epoch": 0.6919572409337487, + "grad_norm": 0.3192349970340729, + "learning_rate": 4.32821739019625e-06, + "loss": 0.2819, + "step": 37272 + }, + { + "epoch": 0.6919943710711673, + "grad_norm": 0.32833749055862427, + "learning_rate": 4.327256712840663e-06, + "loss": 0.2574, + "step": 37274 + }, + { + "epoch": 0.6920315012085859, + "grad_norm": 0.5903516411781311, + "learning_rate": 4.3262961126741195e-06, + "loss": 0.3459, + "step": 37276 + }, + { + "epoch": 0.6920686313460046, + "grad_norm": 0.2598225176334381, + "learning_rate": 4.325335589709695e-06, + "loss": 0.301, + "step": 37278 + }, + { + "epoch": 0.6921057614834233, + "grad_norm": 0.5802574157714844, + "learning_rate": 4.324375143960452e-06, + "loss": 0.2321, + "step": 37280 + }, + { + "epoch": 0.6921428916208419, + "grad_norm": 0.36529380083084106, + "learning_rate": 4.323414775439463e-06, + "loss": 0.3606, + "step": 37282 + }, + { + "epoch": 0.6921800217582605, + "grad_norm": 0.34091514348983765, + "learning_rate": 4.322454484159795e-06, + "loss": 0.2104, + "step": 37284 + }, + { + "epoch": 0.6922171518956791, + "grad_norm": 0.3278229534626007, + "learning_rate": 4.321494270134515e-06, + "loss": 0.2301, + "step": 37286 + }, + { + "epoch": 0.6922542820330978, + "grad_norm": 0.4791397154331207, + "learning_rate": 4.3205341333766895e-06, + "loss": 0.4197, + "step": 37288 + }, + { + "epoch": 0.6922914121705165, + "grad_norm": 0.5173566937446594, + "learning_rate": 4.319574073899386e-06, + "loss": 0.1477, + "step": 37290 + }, + { + "epoch": 0.6923285423079351, + "grad_norm": 0.5060773491859436, + "learning_rate": 4.3186140917156615e-06, + "loss": 0.4774, + "step": 37292 + }, + { + "epoch": 0.6923656724453537, + "grad_norm": 0.39532729983329773, + "learning_rate": 4.3176541868385844e-06, + "loss": 0.2268, + "step": 37294 + }, + { + "epoch": 0.6924028025827723, + "grad_norm": 0.2902069389820099, + "learning_rate": 4.316694359281209e-06, + "loss": 0.145, + "step": 37296 + }, + { + "epoch": 0.692439932720191, + "grad_norm": 0.4081248641014099, + "learning_rate": 4.315734609056601e-06, + "loss": 0.1907, + "step": 37298 + }, + { + "epoch": 0.6924770628576097, + "grad_norm": 0.3958992660045624, + "learning_rate": 4.314774936177818e-06, + "loss": 0.3141, + "step": 37300 + }, + { + "epoch": 0.6925141929950283, + "grad_norm": 0.3282458484172821, + "learning_rate": 4.3138153406579196e-06, + "loss": 0.3317, + "step": 37302 + }, + { + "epoch": 0.6925513231324469, + "grad_norm": 0.5511623620986938, + "learning_rate": 4.3128558225099645e-06, + "loss": 0.2571, + "step": 37304 + }, + { + "epoch": 0.6925884532698655, + "grad_norm": 0.3292279541492462, + "learning_rate": 4.311896381747005e-06, + "loss": 0.2361, + "step": 37306 + }, + { + "epoch": 0.6926255834072842, + "grad_norm": 0.3538127541542053, + "learning_rate": 4.3109370183820965e-06, + "loss": 0.178, + "step": 37308 + }, + { + "epoch": 0.6926627135447029, + "grad_norm": 0.4217687249183655, + "learning_rate": 4.309977732428296e-06, + "loss": 0.3557, + "step": 37310 + }, + { + "epoch": 0.6926998436821215, + "grad_norm": 0.3295333683490753, + "learning_rate": 4.309018523898657e-06, + "loss": 0.3609, + "step": 37312 + }, + { + "epoch": 0.6927369738195401, + "grad_norm": 0.47251802682876587, + "learning_rate": 4.308059392806227e-06, + "loss": 0.2639, + "step": 37314 + }, + { + "epoch": 0.6927741039569587, + "grad_norm": 0.34967535734176636, + "learning_rate": 4.307100339164061e-06, + "loss": 0.3354, + "step": 37316 + }, + { + "epoch": 0.6928112340943774, + "grad_norm": 0.5792964100837708, + "learning_rate": 4.306141362985205e-06, + "loss": 0.2415, + "step": 37318 + }, + { + "epoch": 0.6928483642317961, + "grad_norm": 0.3297637104988098, + "learning_rate": 4.30518246428271e-06, + "loss": 0.2382, + "step": 37320 + }, + { + "epoch": 0.6928854943692146, + "grad_norm": 0.435459703207016, + "learning_rate": 4.304223643069624e-06, + "loss": 0.1386, + "step": 37322 + }, + { + "epoch": 0.6929226245066333, + "grad_norm": 0.3952932357788086, + "learning_rate": 4.303264899358992e-06, + "loss": 0.4376, + "step": 37324 + }, + { + "epoch": 0.6929597546440519, + "grad_norm": 0.377327561378479, + "learning_rate": 4.302306233163862e-06, + "loss": 0.2409, + "step": 37326 + }, + { + "epoch": 0.6929968847814706, + "grad_norm": 0.3206210434436798, + "learning_rate": 4.3013476444972814e-06, + "loss": 0.4144, + "step": 37328 + }, + { + "epoch": 0.6930340149188892, + "grad_norm": 0.29780298471450806, + "learning_rate": 4.300389133372286e-06, + "loss": 0.2489, + "step": 37330 + }, + { + "epoch": 0.6930711450563078, + "grad_norm": 0.26677048206329346, + "learning_rate": 4.299430699801922e-06, + "loss": 0.3057, + "step": 37332 + }, + { + "epoch": 0.6931082751937265, + "grad_norm": 0.43046334385871887, + "learning_rate": 4.298472343799235e-06, + "loss": 0.1996, + "step": 37334 + }, + { + "epoch": 0.6931454053311451, + "grad_norm": 0.4612695574760437, + "learning_rate": 4.297514065377257e-06, + "loss": 0.3163, + "step": 37336 + }, + { + "epoch": 0.6931825354685638, + "grad_norm": 0.3922799825668335, + "learning_rate": 4.296555864549032e-06, + "loss": 0.2494, + "step": 37338 + }, + { + "epoch": 0.6932196656059824, + "grad_norm": 0.3093006908893585, + "learning_rate": 4.295597741327599e-06, + "loss": 0.2086, + "step": 37340 + }, + { + "epoch": 0.693256795743401, + "grad_norm": 0.44677606225013733, + "learning_rate": 4.294639695725996e-06, + "loss": 0.2079, + "step": 37342 + }, + { + "epoch": 0.6932939258808197, + "grad_norm": 0.3983103930950165, + "learning_rate": 4.2936817277572555e-06, + "loss": 0.3056, + "step": 37344 + }, + { + "epoch": 0.6933310560182383, + "grad_norm": 0.2540654242038727, + "learning_rate": 4.292723837434413e-06, + "loss": 0.3015, + "step": 37346 + }, + { + "epoch": 0.693368186155657, + "grad_norm": 0.18011514842510223, + "learning_rate": 4.291766024770503e-06, + "loss": 0.2771, + "step": 37348 + }, + { + "epoch": 0.6934053162930756, + "grad_norm": 0.5991212129592896, + "learning_rate": 4.290808289778561e-06, + "loss": 0.2694, + "step": 37350 + }, + { + "epoch": 0.6934424464304942, + "grad_norm": 0.2610912024974823, + "learning_rate": 4.289850632471616e-06, + "loss": 0.2559, + "step": 37352 + }, + { + "epoch": 0.6934795765679129, + "grad_norm": 0.4103946089744568, + "learning_rate": 4.288893052862703e-06, + "loss": 0.2397, + "step": 37354 + }, + { + "epoch": 0.6935167067053315, + "grad_norm": 0.5175892114639282, + "learning_rate": 4.287935550964846e-06, + "loss": 0.2825, + "step": 37356 + }, + { + "epoch": 0.6935538368427502, + "grad_norm": 0.43431276082992554, + "learning_rate": 4.286978126791082e-06, + "loss": 0.2989, + "step": 37358 + }, + { + "epoch": 0.6935909669801688, + "grad_norm": 0.3124162256717682, + "learning_rate": 4.286020780354427e-06, + "loss": 0.1383, + "step": 37360 + }, + { + "epoch": 0.6936280971175874, + "grad_norm": 0.2724204957485199, + "learning_rate": 4.285063511667915e-06, + "loss": 0.2985, + "step": 37362 + }, + { + "epoch": 0.6936652272550061, + "grad_norm": 0.47156503796577454, + "learning_rate": 4.28410632074457e-06, + "loss": 0.5, + "step": 37364 + }, + { + "epoch": 0.6937023573924247, + "grad_norm": 0.3313979208469391, + "learning_rate": 4.283149207597422e-06, + "loss": 0.1364, + "step": 37366 + }, + { + "epoch": 0.6937394875298434, + "grad_norm": 0.46410733461380005, + "learning_rate": 4.2821921722394836e-06, + "loss": 0.3799, + "step": 37368 + }, + { + "epoch": 0.693776617667262, + "grad_norm": 0.4512125849723816, + "learning_rate": 4.281235214683783e-06, + "loss": 0.1625, + "step": 37370 + }, + { + "epoch": 0.6938137478046806, + "grad_norm": 0.33146151900291443, + "learning_rate": 4.280278334943343e-06, + "loss": 0.235, + "step": 37372 + }, + { + "epoch": 0.6938508779420992, + "grad_norm": 0.39799147844314575, + "learning_rate": 4.279321533031181e-06, + "loss": 0.4697, + "step": 37374 + }, + { + "epoch": 0.6938880080795179, + "grad_norm": 0.473401814699173, + "learning_rate": 4.278364808960322e-06, + "loss": 0.2879, + "step": 37376 + }, + { + "epoch": 0.6939251382169366, + "grad_norm": 0.42851999402046204, + "learning_rate": 4.277408162743776e-06, + "loss": 0.0827, + "step": 37378 + }, + { + "epoch": 0.6939622683543551, + "grad_norm": 0.3129371404647827, + "learning_rate": 4.276451594394566e-06, + "loss": 0.4156, + "step": 37380 + }, + { + "epoch": 0.6939993984917738, + "grad_norm": 0.26752379536628723, + "learning_rate": 4.275495103925704e-06, + "loss": 0.2589, + "step": 37382 + }, + { + "epoch": 0.6940365286291924, + "grad_norm": 0.404712975025177, + "learning_rate": 4.274538691350205e-06, + "loss": 0.2574, + "step": 37384 + }, + { + "epoch": 0.6940736587666111, + "grad_norm": 0.2926155626773834, + "learning_rate": 4.2735823566810865e-06, + "loss": 0.1791, + "step": 37386 + }, + { + "epoch": 0.6941107889040298, + "grad_norm": 0.34501489996910095, + "learning_rate": 4.272626099931359e-06, + "loss": 0.1423, + "step": 37388 + }, + { + "epoch": 0.6941479190414483, + "grad_norm": 0.31262749433517456, + "learning_rate": 4.271669921114033e-06, + "loss": 0.294, + "step": 37390 + }, + { + "epoch": 0.694185049178867, + "grad_norm": 0.2422153502702713, + "learning_rate": 4.270713820242127e-06, + "loss": 0.3373, + "step": 37392 + }, + { + "epoch": 0.6942221793162856, + "grad_norm": 0.3597490191459656, + "learning_rate": 4.26975779732864e-06, + "loss": 0.335, + "step": 37394 + }, + { + "epoch": 0.6942593094537043, + "grad_norm": 0.4977995455265045, + "learning_rate": 4.268801852386586e-06, + "loss": 0.1368, + "step": 37396 + }, + { + "epoch": 0.694296439591123, + "grad_norm": 1.1158075332641602, + "learning_rate": 4.267845985428974e-06, + "loss": 0.3053, + "step": 37398 + }, + { + "epoch": 0.6943335697285415, + "grad_norm": 0.612968385219574, + "learning_rate": 4.266890196468806e-06, + "loss": 0.2965, + "step": 37400 + }, + { + "epoch": 0.6943706998659602, + "grad_norm": 0.3525151014328003, + "learning_rate": 4.2659344855190895e-06, + "loss": 0.3093, + "step": 37402 + }, + { + "epoch": 0.6944078300033788, + "grad_norm": 0.446323424577713, + "learning_rate": 4.264978852592834e-06, + "loss": 0.3158, + "step": 37404 + }, + { + "epoch": 0.6944449601407975, + "grad_norm": 0.3727822005748749, + "learning_rate": 4.2640232977030326e-06, + "loss": 0.4399, + "step": 37406 + }, + { + "epoch": 0.6944820902782162, + "grad_norm": 0.2541791796684265, + "learning_rate": 4.263067820862693e-06, + "loss": 0.1576, + "step": 37408 + }, + { + "epoch": 0.6945192204156347, + "grad_norm": 0.4023820459842682, + "learning_rate": 4.2621124220848166e-06, + "loss": 0.1899, + "step": 37410 + }, + { + "epoch": 0.6945563505530534, + "grad_norm": 0.4519880712032318, + "learning_rate": 4.2611571013824024e-06, + "loss": 0.3036, + "step": 37412 + }, + { + "epoch": 0.694593480690472, + "grad_norm": 0.2853188216686249, + "learning_rate": 4.260201858768451e-06, + "loss": 0.3678, + "step": 37414 + }, + { + "epoch": 0.6946306108278907, + "grad_norm": 0.4123152196407318, + "learning_rate": 4.259246694255961e-06, + "loss": 0.2446, + "step": 37416 + }, + { + "epoch": 0.6946677409653094, + "grad_norm": 0.5391072034835815, + "learning_rate": 4.258291607857929e-06, + "loss": 0.3781, + "step": 37418 + }, + { + "epoch": 0.6947048711027279, + "grad_norm": 0.3825768828392029, + "learning_rate": 4.2573365995873515e-06, + "loss": 0.2631, + "step": 37420 + }, + { + "epoch": 0.6947420012401466, + "grad_norm": 0.350849986076355, + "learning_rate": 4.256381669457218e-06, + "loss": 0.2957, + "step": 37422 + }, + { + "epoch": 0.6947791313775652, + "grad_norm": 0.3619406223297119, + "learning_rate": 4.2554268174805245e-06, + "loss": 0.2819, + "step": 37424 + }, + { + "epoch": 0.6948162615149839, + "grad_norm": 0.3528803884983063, + "learning_rate": 4.2544720436702665e-06, + "loss": 0.3143, + "step": 37426 + }, + { + "epoch": 0.6948533916524025, + "grad_norm": 0.2968522608280182, + "learning_rate": 4.2535173480394335e-06, + "loss": 0.1522, + "step": 37428 + }, + { + "epoch": 0.6948905217898211, + "grad_norm": 0.7459464073181152, + "learning_rate": 4.25256273060102e-06, + "loss": 0.4226, + "step": 37430 + }, + { + "epoch": 0.6949276519272398, + "grad_norm": 0.368559330701828, + "learning_rate": 4.2516081913680085e-06, + "loss": 0.1973, + "step": 37432 + }, + { + "epoch": 0.6949647820646584, + "grad_norm": 0.39647242426872253, + "learning_rate": 4.250653730353393e-06, + "loss": 0.3448, + "step": 37434 + }, + { + "epoch": 0.6950019122020771, + "grad_norm": 0.6954098343849182, + "learning_rate": 4.249699347570156e-06, + "loss": 0.4064, + "step": 37436 + }, + { + "epoch": 0.6950390423394956, + "grad_norm": 0.5225175619125366, + "learning_rate": 4.248745043031288e-06, + "loss": 0.2281, + "step": 37438 + }, + { + "epoch": 0.6950761724769143, + "grad_norm": 0.23723793029785156, + "learning_rate": 4.247790816749777e-06, + "loss": 0.3442, + "step": 37440 + }, + { + "epoch": 0.695113302614333, + "grad_norm": 0.48138993978500366, + "learning_rate": 4.246836668738603e-06, + "loss": 0.1428, + "step": 37442 + }, + { + "epoch": 0.6951504327517516, + "grad_norm": 0.6554580926895142, + "learning_rate": 4.245882599010746e-06, + "loss": 0.3525, + "step": 37444 + }, + { + "epoch": 0.6951875628891703, + "grad_norm": 0.3643503785133362, + "learning_rate": 4.244928607579191e-06, + "loss": 0.3659, + "step": 37446 + }, + { + "epoch": 0.6952246930265888, + "grad_norm": 0.6216050386428833, + "learning_rate": 4.243974694456919e-06, + "loss": 0.381, + "step": 37448 + }, + { + "epoch": 0.6952618231640075, + "grad_norm": 0.49983587861061096, + "learning_rate": 4.243020859656909e-06, + "loss": 0.4454, + "step": 37450 + }, + { + "epoch": 0.6952989533014262, + "grad_norm": 0.3337538242340088, + "learning_rate": 4.242067103192142e-06, + "loss": 0.0458, + "step": 37452 + }, + { + "epoch": 0.6953360834388448, + "grad_norm": 0.38248777389526367, + "learning_rate": 4.241113425075598e-06, + "loss": 0.1986, + "step": 37454 + }, + { + "epoch": 0.6953732135762635, + "grad_norm": 0.2901327311992645, + "learning_rate": 4.240159825320247e-06, + "loss": 0.2572, + "step": 37456 + }, + { + "epoch": 0.695410343713682, + "grad_norm": 0.28926882147789, + "learning_rate": 4.239206303939069e-06, + "loss": 0.2684, + "step": 37458 + }, + { + "epoch": 0.6954474738511007, + "grad_norm": 0.5033291578292847, + "learning_rate": 4.238252860945037e-06, + "loss": 0.4442, + "step": 37460 + }, + { + "epoch": 0.6954846039885194, + "grad_norm": 0.632887601852417, + "learning_rate": 4.237299496351128e-06, + "loss": 0.2549, + "step": 37462 + }, + { + "epoch": 0.695521734125938, + "grad_norm": 0.38352036476135254, + "learning_rate": 4.236346210170309e-06, + "loss": 0.4555, + "step": 37464 + }, + { + "epoch": 0.6955588642633567, + "grad_norm": 0.46811455488204956, + "learning_rate": 4.2353930024155535e-06, + "loss": 0.4381, + "step": 37466 + }, + { + "epoch": 0.6955959944007752, + "grad_norm": 0.38507992029190063, + "learning_rate": 4.2344398730998344e-06, + "loss": 0.307, + "step": 37468 + }, + { + "epoch": 0.6956331245381939, + "grad_norm": 0.42499494552612305, + "learning_rate": 4.233486822236117e-06, + "loss": 0.2722, + "step": 37470 + }, + { + "epoch": 0.6956702546756126, + "grad_norm": 0.47811153531074524, + "learning_rate": 4.232533849837372e-06, + "loss": 0.153, + "step": 37472 + }, + { + "epoch": 0.6957073848130312, + "grad_norm": 0.4635964334011078, + "learning_rate": 4.231580955916563e-06, + "loss": 0.2245, + "step": 37474 + }, + { + "epoch": 0.6957445149504499, + "grad_norm": 0.4068347215652466, + "learning_rate": 4.230628140486661e-06, + "loss": 0.3483, + "step": 37476 + }, + { + "epoch": 0.6957816450878684, + "grad_norm": 0.42280450463294983, + "learning_rate": 4.229675403560628e-06, + "loss": 0.1681, + "step": 37478 + }, + { + "epoch": 0.6958187752252871, + "grad_norm": 0.6900840401649475, + "learning_rate": 4.228722745151432e-06, + "loss": 0.2912, + "step": 37480 + }, + { + "epoch": 0.6958559053627057, + "grad_norm": 0.5205751657485962, + "learning_rate": 4.227770165272029e-06, + "loss": 0.4093, + "step": 37482 + }, + { + "epoch": 0.6958930355001244, + "grad_norm": 0.3506011366844177, + "learning_rate": 4.226817663935389e-06, + "loss": 0.3064, + "step": 37484 + }, + { + "epoch": 0.6959301656375431, + "grad_norm": 0.29959818720817566, + "learning_rate": 4.225865241154463e-06, + "loss": 0.4677, + "step": 37486 + }, + { + "epoch": 0.6959672957749616, + "grad_norm": 0.4022466242313385, + "learning_rate": 4.224912896942216e-06, + "loss": 0.1247, + "step": 37488 + }, + { + "epoch": 0.6960044259123803, + "grad_norm": 0.3680751621723175, + "learning_rate": 4.223960631311605e-06, + "loss": 0.1713, + "step": 37490 + }, + { + "epoch": 0.6960415560497989, + "grad_norm": 0.4107588827610016, + "learning_rate": 4.223008444275594e-06, + "loss": 0.3979, + "step": 37492 + }, + { + "epoch": 0.6960786861872176, + "grad_norm": 0.2906644344329834, + "learning_rate": 4.222056335847129e-06, + "loss": 0.2652, + "step": 37494 + }, + { + "epoch": 0.6961158163246363, + "grad_norm": 0.48166534304618835, + "learning_rate": 4.221104306039171e-06, + "loss": 0.2049, + "step": 37496 + }, + { + "epoch": 0.6961529464620548, + "grad_norm": 0.34526917338371277, + "learning_rate": 4.220152354864673e-06, + "loss": 0.2235, + "step": 37498 + }, + { + "epoch": 0.6961900765994735, + "grad_norm": 0.4836229085922241, + "learning_rate": 4.21920048233659e-06, + "loss": 0.3955, + "step": 37500 + }, + { + "epoch": 0.6962272067368921, + "grad_norm": 0.26777270436286926, + "learning_rate": 4.218248688467875e-06, + "loss": 0.4313, + "step": 37502 + }, + { + "epoch": 0.6962643368743108, + "grad_norm": 0.3902604281902313, + "learning_rate": 4.217296973271474e-06, + "loss": 0.3593, + "step": 37504 + }, + { + "epoch": 0.6963014670117295, + "grad_norm": 0.36201363801956177, + "learning_rate": 4.216345336760343e-06, + "loss": 0.2395, + "step": 37506 + }, + { + "epoch": 0.696338597149148, + "grad_norm": 0.38509005308151245, + "learning_rate": 4.215393778947425e-06, + "loss": 0.2041, + "step": 37508 + }, + { + "epoch": 0.6963757272865667, + "grad_norm": 0.45131295919418335, + "learning_rate": 4.214442299845671e-06, + "loss": 0.3577, + "step": 37510 + }, + { + "epoch": 0.6964128574239853, + "grad_norm": 0.37255147099494934, + "learning_rate": 4.213490899468028e-06, + "loss": 0.3707, + "step": 37512 + }, + { + "epoch": 0.696449987561404, + "grad_norm": 0.29529792070388794, + "learning_rate": 4.21253957782744e-06, + "loss": 0.4726, + "step": 37514 + }, + { + "epoch": 0.6964871176988227, + "grad_norm": 0.38325929641723633, + "learning_rate": 4.211588334936854e-06, + "loss": 0.3211, + "step": 37516 + }, + { + "epoch": 0.6965242478362412, + "grad_norm": 0.27217838168144226, + "learning_rate": 4.210637170809215e-06, + "loss": 0.3198, + "step": 37518 + }, + { + "epoch": 0.6965613779736599, + "grad_norm": 0.23869448900222778, + "learning_rate": 4.20968608545746e-06, + "loss": 0.3588, + "step": 37520 + }, + { + "epoch": 0.6965985081110785, + "grad_norm": 0.4707834720611572, + "learning_rate": 4.208735078894533e-06, + "loss": 0.2953, + "step": 37522 + }, + { + "epoch": 0.6966356382484972, + "grad_norm": 0.3063238859176636, + "learning_rate": 4.207784151133376e-06, + "loss": 0.1503, + "step": 37524 + }, + { + "epoch": 0.6966727683859157, + "grad_norm": 0.342106431722641, + "learning_rate": 4.206833302186929e-06, + "loss": 0.1926, + "step": 37526 + }, + { + "epoch": 0.6967098985233344, + "grad_norm": 0.4294862151145935, + "learning_rate": 4.205882532068125e-06, + "loss": 0.3563, + "step": 37528 + }, + { + "epoch": 0.6967470286607531, + "grad_norm": 0.44247958064079285, + "learning_rate": 4.204931840789909e-06, + "loss": 0.3288, + "step": 37530 + }, + { + "epoch": 0.6967841587981717, + "grad_norm": 0.6680423021316528, + "learning_rate": 4.203981228365207e-06, + "loss": 0.3553, + "step": 37532 + }, + { + "epoch": 0.6968212889355904, + "grad_norm": 0.4630979895591736, + "learning_rate": 4.203030694806961e-06, + "loss": 0.2738, + "step": 37534 + }, + { + "epoch": 0.6968584190730089, + "grad_norm": 0.4503055214881897, + "learning_rate": 4.202080240128102e-06, + "loss": 0.2674, + "step": 37536 + }, + { + "epoch": 0.6968955492104276, + "grad_norm": 0.28805026412010193, + "learning_rate": 4.201129864341565e-06, + "loss": 0.2418, + "step": 37538 + }, + { + "epoch": 0.6969326793478463, + "grad_norm": 0.408663809299469, + "learning_rate": 4.20017956746028e-06, + "loss": 0.3822, + "step": 37540 + }, + { + "epoch": 0.6969698094852649, + "grad_norm": 0.4131579101085663, + "learning_rate": 4.19922934949718e-06, + "loss": 0.1813, + "step": 37542 + }, + { + "epoch": 0.6970069396226836, + "grad_norm": 0.5442785024642944, + "learning_rate": 4.198279210465197e-06, + "loss": 0.3224, + "step": 37544 + }, + { + "epoch": 0.6970440697601021, + "grad_norm": 0.31905093789100647, + "learning_rate": 4.19732915037725e-06, + "loss": 0.4477, + "step": 37546 + }, + { + "epoch": 0.6970811998975208, + "grad_norm": 0.5105418562889099, + "learning_rate": 4.1963791692462785e-06, + "loss": 0.3242, + "step": 37548 + }, + { + "epoch": 0.6971183300349395, + "grad_norm": 0.16466456651687622, + "learning_rate": 4.195429267085198e-06, + "loss": 0.1953, + "step": 37550 + }, + { + "epoch": 0.6971554601723581, + "grad_norm": 0.4903988540172577, + "learning_rate": 4.194479443906938e-06, + "loss": 0.2386, + "step": 37552 + }, + { + "epoch": 0.6971925903097768, + "grad_norm": 0.26322484016418457, + "learning_rate": 4.193529699724423e-06, + "loss": 0.296, + "step": 37554 + }, + { + "epoch": 0.6972297204471953, + "grad_norm": 0.30767589807510376, + "learning_rate": 4.19258003455058e-06, + "loss": 0.2821, + "step": 37556 + }, + { + "epoch": 0.697266850584614, + "grad_norm": 0.4158708155155182, + "learning_rate": 4.191630448398325e-06, + "loss": 0.1884, + "step": 37558 + }, + { + "epoch": 0.6973039807220327, + "grad_norm": 0.3561565577983856, + "learning_rate": 4.190680941280582e-06, + "loss": 0.1984, + "step": 37560 + }, + { + "epoch": 0.6973411108594513, + "grad_norm": 0.2912541329860687, + "learning_rate": 4.189731513210269e-06, + "loss": 0.3358, + "step": 37562 + }, + { + "epoch": 0.69737824099687, + "grad_norm": 0.21195116639137268, + "learning_rate": 4.188782164200306e-06, + "loss": 0.3805, + "step": 37564 + }, + { + "epoch": 0.6974153711342885, + "grad_norm": 0.5318635106086731, + "learning_rate": 4.187832894263616e-06, + "loss": 0.3543, + "step": 37566 + }, + { + "epoch": 0.6974525012717072, + "grad_norm": 0.43506497144699097, + "learning_rate": 4.186883703413106e-06, + "loss": 0.0685, + "step": 37568 + }, + { + "epoch": 0.6974896314091259, + "grad_norm": 0.4608241319656372, + "learning_rate": 4.185934591661701e-06, + "loss": 0.1482, + "step": 37570 + }, + { + "epoch": 0.6975267615465445, + "grad_norm": 0.6568871736526489, + "learning_rate": 4.184985559022308e-06, + "loss": 0.2944, + "step": 37572 + }, + { + "epoch": 0.6975638916839632, + "grad_norm": 0.43620240688323975, + "learning_rate": 4.184036605507842e-06, + "loss": 0.122, + "step": 37574 + }, + { + "epoch": 0.6976010218213817, + "grad_norm": 0.43029579520225525, + "learning_rate": 4.1830877311312176e-06, + "loss": 0.3049, + "step": 37576 + }, + { + "epoch": 0.6976381519588004, + "grad_norm": 0.35107287764549255, + "learning_rate": 4.182138935905346e-06, + "loss": 0.042, + "step": 37578 + }, + { + "epoch": 0.697675282096219, + "grad_norm": 0.5365400314331055, + "learning_rate": 4.18119021984314e-06, + "loss": 0.1935, + "step": 37580 + }, + { + "epoch": 0.6977124122336377, + "grad_norm": 0.265237957239151, + "learning_rate": 4.180241582957503e-06, + "loss": 0.3054, + "step": 37582 + }, + { + "epoch": 0.6977495423710564, + "grad_norm": 0.3441588878631592, + "learning_rate": 4.179293025261345e-06, + "loss": 0.1817, + "step": 37584 + }, + { + "epoch": 0.6977866725084749, + "grad_norm": 0.3767799735069275, + "learning_rate": 4.178344546767574e-06, + "loss": 0.3375, + "step": 37586 + }, + { + "epoch": 0.6978238026458936, + "grad_norm": 0.3517741858959198, + "learning_rate": 4.1773961474891e-06, + "loss": 0.3532, + "step": 37588 + }, + { + "epoch": 0.6978609327833122, + "grad_norm": 0.47272446751594543, + "learning_rate": 4.1764478274388184e-06, + "loss": 0.283, + "step": 37590 + }, + { + "epoch": 0.6978980629207309, + "grad_norm": 0.38956931233406067, + "learning_rate": 4.17549958662964e-06, + "loss": 0.2559, + "step": 37592 + }, + { + "epoch": 0.6979351930581496, + "grad_norm": 0.3741835951805115, + "learning_rate": 4.174551425074469e-06, + "loss": 0.1902, + "step": 37594 + }, + { + "epoch": 0.6979723231955681, + "grad_norm": 0.20991989970207214, + "learning_rate": 4.173603342786201e-06, + "loss": 0.1295, + "step": 37596 + }, + { + "epoch": 0.6980094533329868, + "grad_norm": 0.5151546597480774, + "learning_rate": 4.172655339777738e-06, + "loss": 0.1201, + "step": 37598 + }, + { + "epoch": 0.6980465834704054, + "grad_norm": 0.5855364799499512, + "learning_rate": 4.171707416061982e-06, + "loss": 0.2507, + "step": 37600 + }, + { + "epoch": 0.6980837136078241, + "grad_norm": 0.4099443554878235, + "learning_rate": 4.17075957165183e-06, + "loss": 0.2664, + "step": 37602 + }, + { + "epoch": 0.6981208437452427, + "grad_norm": 0.3729911148548126, + "learning_rate": 4.16981180656018e-06, + "loss": 0.2503, + "step": 37604 + }, + { + "epoch": 0.6981579738826613, + "grad_norm": 0.3064371943473816, + "learning_rate": 4.168864120799931e-06, + "loss": 0.2979, + "step": 37606 + }, + { + "epoch": 0.69819510402008, + "grad_norm": 0.24513548612594604, + "learning_rate": 4.167916514383972e-06, + "loss": 0.1514, + "step": 37608 + }, + { + "epoch": 0.6982322341574986, + "grad_norm": 0.4624926447868347, + "learning_rate": 4.1669689873252026e-06, + "loss": 0.3508, + "step": 37610 + }, + { + "epoch": 0.6982693642949173, + "grad_norm": 0.413936972618103, + "learning_rate": 4.1660215396365104e-06, + "loss": 0.1642, + "step": 37612 + }, + { + "epoch": 0.698306494432336, + "grad_norm": 0.5020574927330017, + "learning_rate": 4.16507417133079e-06, + "loss": 0.119, + "step": 37614 + }, + { + "epoch": 0.6983436245697545, + "grad_norm": 0.30622127652168274, + "learning_rate": 4.1641268824209326e-06, + "loss": 0.3045, + "step": 37616 + }, + { + "epoch": 0.6983807547071732, + "grad_norm": 0.24131809175014496, + "learning_rate": 4.163179672919831e-06, + "loss": 0.3791, + "step": 37618 + }, + { + "epoch": 0.6984178848445918, + "grad_norm": 0.3619144856929779, + "learning_rate": 4.162232542840369e-06, + "loss": 0.227, + "step": 37620 + }, + { + "epoch": 0.6984550149820105, + "grad_norm": 0.4356173276901245, + "learning_rate": 4.161285492195434e-06, + "loss": 0.3776, + "step": 37622 + }, + { + "epoch": 0.6984921451194291, + "grad_norm": 0.34050577878952026, + "learning_rate": 4.1603385209979155e-06, + "loss": 0.3184, + "step": 37624 + }, + { + "epoch": 0.6985292752568477, + "grad_norm": 0.43384894728660583, + "learning_rate": 4.159391629260698e-06, + "loss": 0.3066, + "step": 37626 + }, + { + "epoch": 0.6985664053942664, + "grad_norm": 0.5321500897407532, + "learning_rate": 4.158444816996665e-06, + "loss": 0.3077, + "step": 37628 + }, + { + "epoch": 0.698603535531685, + "grad_norm": 0.3725312352180481, + "learning_rate": 4.157498084218704e-06, + "loss": 0.3614, + "step": 37630 + }, + { + "epoch": 0.6986406656691037, + "grad_norm": 0.3035672903060913, + "learning_rate": 4.156551430939691e-06, + "loss": 0.2768, + "step": 37632 + }, + { + "epoch": 0.6986777958065222, + "grad_norm": 0.3667818605899811, + "learning_rate": 4.1556048571725125e-06, + "loss": 0.4527, + "step": 37634 + }, + { + "epoch": 0.6987149259439409, + "grad_norm": 0.314712792634964, + "learning_rate": 4.1546583629300445e-06, + "loss": 0.271, + "step": 37636 + }, + { + "epoch": 0.6987520560813596, + "grad_norm": 0.44323572516441345, + "learning_rate": 4.153711948225165e-06, + "loss": 0.2772, + "step": 37638 + }, + { + "epoch": 0.6987891862187782, + "grad_norm": 0.2460128366947174, + "learning_rate": 4.152765613070755e-06, + "loss": 0.1947, + "step": 37640 + }, + { + "epoch": 0.6988263163561969, + "grad_norm": 0.41295182704925537, + "learning_rate": 4.1518193574796904e-06, + "loss": 0.352, + "step": 37642 + }, + { + "epoch": 0.6988634464936154, + "grad_norm": 0.4137716591358185, + "learning_rate": 4.15087318146485e-06, + "loss": 0.1422, + "step": 37644 + }, + { + "epoch": 0.6989005766310341, + "grad_norm": 0.31835871934890747, + "learning_rate": 4.149927085039104e-06, + "loss": 0.1627, + "step": 37646 + }, + { + "epoch": 0.6989377067684528, + "grad_norm": 0.5508441925048828, + "learning_rate": 4.148981068215325e-06, + "loss": 0.3109, + "step": 37648 + }, + { + "epoch": 0.6989748369058714, + "grad_norm": 0.32172903418540955, + "learning_rate": 4.14803513100639e-06, + "loss": 0.184, + "step": 37650 + }, + { + "epoch": 0.69901196704329, + "grad_norm": 0.140688955783844, + "learning_rate": 4.14708927342517e-06, + "loss": 0.2795, + "step": 37652 + }, + { + "epoch": 0.6990490971807086, + "grad_norm": 0.2567431330680847, + "learning_rate": 4.146143495484531e-06, + "loss": 0.3537, + "step": 37654 + }, + { + "epoch": 0.6990862273181273, + "grad_norm": 0.3427571952342987, + "learning_rate": 4.145197797197347e-06, + "loss": 0.2126, + "step": 37656 + }, + { + "epoch": 0.699123357455546, + "grad_norm": 0.29616907238960266, + "learning_rate": 4.144252178576481e-06, + "loss": 0.0955, + "step": 37658 + }, + { + "epoch": 0.6991604875929646, + "grad_norm": 0.4767168462276459, + "learning_rate": 4.143306639634804e-06, + "loss": 0.4462, + "step": 37660 + }, + { + "epoch": 0.6991976177303832, + "grad_norm": 0.3542271554470062, + "learning_rate": 4.142361180385178e-06, + "loss": 0.4139, + "step": 37662 + }, + { + "epoch": 0.6992347478678018, + "grad_norm": 0.346851646900177, + "learning_rate": 4.141415800840472e-06, + "loss": 0.3809, + "step": 37664 + }, + { + "epoch": 0.6992718780052205, + "grad_norm": 0.43628594279289246, + "learning_rate": 4.140470501013549e-06, + "loss": 0.3494, + "step": 37666 + }, + { + "epoch": 0.6993090081426392, + "grad_norm": 0.41150835156440735, + "learning_rate": 4.139525280917272e-06, + "loss": 0.4228, + "step": 37668 + }, + { + "epoch": 0.6993461382800578, + "grad_norm": 0.2593783736228943, + "learning_rate": 4.138580140564504e-06, + "loss": 0.1452, + "step": 37670 + }, + { + "epoch": 0.6993832684174764, + "grad_norm": 0.6857787370681763, + "learning_rate": 4.1376350799681e-06, + "loss": 0.2792, + "step": 37672 + }, + { + "epoch": 0.699420398554895, + "grad_norm": 0.4030986726284027, + "learning_rate": 4.136690099140926e-06, + "loss": 0.2579, + "step": 37674 + }, + { + "epoch": 0.6994575286923137, + "grad_norm": 0.45294061303138733, + "learning_rate": 4.135745198095834e-06, + "loss": 0.2109, + "step": 37676 + }, + { + "epoch": 0.6994946588297323, + "grad_norm": 0.33016330003738403, + "learning_rate": 4.134800376845685e-06, + "loss": 0.2874, + "step": 37678 + }, + { + "epoch": 0.699531788967151, + "grad_norm": 0.32950305938720703, + "learning_rate": 4.133855635403333e-06, + "loss": 0.1439, + "step": 37680 + }, + { + "epoch": 0.6995689191045696, + "grad_norm": 0.3747512400150299, + "learning_rate": 4.13291097378164e-06, + "loss": 0.2902, + "step": 37682 + }, + { + "epoch": 0.6996060492419882, + "grad_norm": 0.3436223864555359, + "learning_rate": 4.1319663919934495e-06, + "loss": 0.0931, + "step": 37684 + }, + { + "epoch": 0.6996431793794069, + "grad_norm": 0.47288450598716736, + "learning_rate": 4.131021890051621e-06, + "loss": 0.3023, + "step": 37686 + }, + { + "epoch": 0.6996803095168255, + "grad_norm": 0.5227087140083313, + "learning_rate": 4.130077467969003e-06, + "loss": 0.2947, + "step": 37688 + }, + { + "epoch": 0.6997174396542442, + "grad_norm": 0.3664332330226898, + "learning_rate": 4.12913312575845e-06, + "loss": 0.3465, + "step": 37690 + }, + { + "epoch": 0.6997545697916628, + "grad_norm": 0.48409759998321533, + "learning_rate": 4.128188863432809e-06, + "loss": 0.2001, + "step": 37692 + }, + { + "epoch": 0.6997916999290814, + "grad_norm": 0.5607906579971313, + "learning_rate": 4.127244681004934e-06, + "loss": 0.3164, + "step": 37694 + }, + { + "epoch": 0.6998288300665001, + "grad_norm": 0.4359683096408844, + "learning_rate": 4.126300578487667e-06, + "loss": 0.452, + "step": 37696 + }, + { + "epoch": 0.6998659602039187, + "grad_norm": 0.3591301441192627, + "learning_rate": 4.125356555893852e-06, + "loss": 0.2975, + "step": 37698 + }, + { + "epoch": 0.6999030903413374, + "grad_norm": 0.3066635727882385, + "learning_rate": 4.124412613236338e-06, + "loss": 0.3708, + "step": 37700 + }, + { + "epoch": 0.699940220478756, + "grad_norm": 0.3267544209957123, + "learning_rate": 4.123468750527969e-06, + "loss": 0.2164, + "step": 37702 + }, + { + "epoch": 0.6999773506161746, + "grad_norm": 0.459110289812088, + "learning_rate": 4.122524967781587e-06, + "loss": 0.2629, + "step": 37704 + }, + { + "epoch": 0.7000144807535933, + "grad_norm": 0.46801772713661194, + "learning_rate": 4.121581265010036e-06, + "loss": 0.3632, + "step": 37706 + }, + { + "epoch": 0.7000516108910119, + "grad_norm": 0.4044855833053589, + "learning_rate": 4.1206376422261594e-06, + "loss": 0.2612, + "step": 37708 + }, + { + "epoch": 0.7000887410284306, + "grad_norm": 0.2556871175765991, + "learning_rate": 4.11969409944279e-06, + "loss": 0.1196, + "step": 37710 + }, + { + "epoch": 0.7001258711658492, + "grad_norm": 0.2033730447292328, + "learning_rate": 4.1187506366727715e-06, + "loss": 0.1117, + "step": 37712 + }, + { + "epoch": 0.7001630013032678, + "grad_norm": 0.4286501705646515, + "learning_rate": 4.117807253928939e-06, + "loss": 0.308, + "step": 37714 + }, + { + "epoch": 0.7002001314406865, + "grad_norm": 0.5378456115722656, + "learning_rate": 4.116863951224135e-06, + "loss": 0.3368, + "step": 37716 + }, + { + "epoch": 0.7002372615781051, + "grad_norm": 0.31801366806030273, + "learning_rate": 4.115920728571187e-06, + "loss": 0.2687, + "step": 37718 + }, + { + "epoch": 0.7002743917155237, + "grad_norm": 0.4315456449985504, + "learning_rate": 4.114977585982937e-06, + "loss": 0.2596, + "step": 37720 + }, + { + "epoch": 0.7003115218529424, + "grad_norm": 0.3213304281234741, + "learning_rate": 4.114034523472209e-06, + "loss": 0.1378, + "step": 37722 + }, + { + "epoch": 0.700348651990361, + "grad_norm": 0.47819802165031433, + "learning_rate": 4.113091541051842e-06, + "loss": 0.3502, + "step": 37724 + }, + { + "epoch": 0.7003857821277797, + "grad_norm": 0.43013232946395874, + "learning_rate": 4.112148638734665e-06, + "loss": 0.4163, + "step": 37726 + }, + { + "epoch": 0.7004229122651983, + "grad_norm": 0.29877662658691406, + "learning_rate": 4.11120581653351e-06, + "loss": 0.3496, + "step": 37728 + }, + { + "epoch": 0.700460042402617, + "grad_norm": 0.4377192258834839, + "learning_rate": 4.110263074461205e-06, + "loss": 0.3214, + "step": 37730 + }, + { + "epoch": 0.7004971725400355, + "grad_norm": 0.23702198266983032, + "learning_rate": 4.109320412530581e-06, + "loss": 0.2188, + "step": 37732 + }, + { + "epoch": 0.7005343026774542, + "grad_norm": 0.4501229524612427, + "learning_rate": 4.108377830754456e-06, + "loss": 0.2696, + "step": 37734 + }, + { + "epoch": 0.7005714328148729, + "grad_norm": 0.41851940751075745, + "learning_rate": 4.107435329145664e-06, + "loss": 0.3485, + "step": 37736 + }, + { + "epoch": 0.7006085629522915, + "grad_norm": 0.22722987830638885, + "learning_rate": 4.106492907717029e-06, + "loss": 0.2295, + "step": 37738 + }, + { + "epoch": 0.7006456930897101, + "grad_norm": 0.4519592225551605, + "learning_rate": 4.1055505664813686e-06, + "loss": 0.4492, + "step": 37740 + }, + { + "epoch": 0.7006828232271287, + "grad_norm": 0.3220359981060028, + "learning_rate": 4.1046083054515105e-06, + "loss": 0.1947, + "step": 37742 + }, + { + "epoch": 0.7007199533645474, + "grad_norm": 0.5905147790908813, + "learning_rate": 4.103666124640278e-06, + "loss": 0.2133, + "step": 37744 + }, + { + "epoch": 0.7007570835019661, + "grad_norm": 0.5934521555900574, + "learning_rate": 4.102724024060483e-06, + "loss": 0.2781, + "step": 37746 + }, + { + "epoch": 0.7007942136393847, + "grad_norm": 0.3777603805065155, + "learning_rate": 4.10178200372495e-06, + "loss": 0.4022, + "step": 37748 + }, + { + "epoch": 0.7008313437768033, + "grad_norm": 0.500083863735199, + "learning_rate": 4.100840063646497e-06, + "loss": 0.2882, + "step": 37750 + }, + { + "epoch": 0.7008684739142219, + "grad_norm": 0.5655398368835449, + "learning_rate": 4.099898203837942e-06, + "loss": 0.3197, + "step": 37752 + }, + { + "epoch": 0.7009056040516406, + "grad_norm": 0.3618127405643463, + "learning_rate": 4.098956424312098e-06, + "loss": 0.2952, + "step": 37754 + }, + { + "epoch": 0.7009427341890593, + "grad_norm": 0.45220354199409485, + "learning_rate": 4.098014725081786e-06, + "loss": 0.4423, + "step": 37756 + }, + { + "epoch": 0.7009798643264779, + "grad_norm": 0.33686986565589905, + "learning_rate": 4.09707310615981e-06, + "loss": 0.1224, + "step": 37758 + }, + { + "epoch": 0.7010169944638965, + "grad_norm": 0.4365827143192291, + "learning_rate": 4.096131567558993e-06, + "loss": 0.1186, + "step": 37760 + }, + { + "epoch": 0.7010541246013151, + "grad_norm": 0.5030139684677124, + "learning_rate": 4.095190109292136e-06, + "loss": 0.1427, + "step": 37762 + }, + { + "epoch": 0.7010912547387338, + "grad_norm": 0.23562489449977875, + "learning_rate": 4.094248731372056e-06, + "loss": 0.1305, + "step": 37764 + }, + { + "epoch": 0.7011283848761525, + "grad_norm": 0.27677369117736816, + "learning_rate": 4.0933074338115605e-06, + "loss": 0.1817, + "step": 37766 + }, + { + "epoch": 0.701165515013571, + "grad_norm": 0.3920479118824005, + "learning_rate": 4.092366216623458e-06, + "loss": 0.2752, + "step": 37768 + }, + { + "epoch": 0.7012026451509897, + "grad_norm": 0.34162870049476624, + "learning_rate": 4.09142507982056e-06, + "loss": 0.2336, + "step": 37770 + }, + { + "epoch": 0.7012397752884083, + "grad_norm": 0.3959967792034149, + "learning_rate": 4.090484023415665e-06, + "loss": 0.2677, + "step": 37772 + }, + { + "epoch": 0.701276905425827, + "grad_norm": 0.6603890657424927, + "learning_rate": 4.0895430474215805e-06, + "loss": 0.1808, + "step": 37774 + }, + { + "epoch": 0.7013140355632457, + "grad_norm": 0.293867290019989, + "learning_rate": 4.088602151851112e-06, + "loss": 0.4088, + "step": 37776 + }, + { + "epoch": 0.7013511657006642, + "grad_norm": 0.32179710268974304, + "learning_rate": 4.087661336717062e-06, + "loss": 0.2979, + "step": 37778 + }, + { + "epoch": 0.7013882958380829, + "grad_norm": 0.502496600151062, + "learning_rate": 4.086720602032234e-06, + "loss": 0.3175, + "step": 37780 + }, + { + "epoch": 0.7014254259755015, + "grad_norm": 0.29389771819114685, + "learning_rate": 4.0857799478094265e-06, + "loss": 0.401, + "step": 37782 + }, + { + "epoch": 0.7014625561129202, + "grad_norm": 0.33370691537857056, + "learning_rate": 4.0848393740614355e-06, + "loss": 0.4136, + "step": 37784 + }, + { + "epoch": 0.7014996862503388, + "grad_norm": 0.3592650592327118, + "learning_rate": 4.083898880801063e-06, + "loss": 0.1797, + "step": 37786 + }, + { + "epoch": 0.7015368163877574, + "grad_norm": 0.39016252756118774, + "learning_rate": 4.082958468041105e-06, + "loss": 0.4524, + "step": 37788 + }, + { + "epoch": 0.7015739465251761, + "grad_norm": 0.3908602297306061, + "learning_rate": 4.08201813579436e-06, + "loss": 0.2031, + "step": 37790 + }, + { + "epoch": 0.7016110766625947, + "grad_norm": 0.46403294801712036, + "learning_rate": 4.081077884073621e-06, + "loss": 0.3684, + "step": 37792 + }, + { + "epoch": 0.7016482068000134, + "grad_norm": 0.3583206236362457, + "learning_rate": 4.080137712891682e-06, + "loss": 0.1656, + "step": 37794 + }, + { + "epoch": 0.701685336937432, + "grad_norm": 0.4870445430278778, + "learning_rate": 4.079197622261342e-06, + "loss": 0.3314, + "step": 37796 + }, + { + "epoch": 0.7017224670748506, + "grad_norm": 0.5426719784736633, + "learning_rate": 4.078257612195382e-06, + "loss": 0.6092, + "step": 37798 + }, + { + "epoch": 0.7017595972122693, + "grad_norm": 0.5088409781455994, + "learning_rate": 4.0773176827065994e-06, + "loss": 0.3242, + "step": 37800 + }, + { + "epoch": 0.7017967273496879, + "grad_norm": 0.3242870569229126, + "learning_rate": 4.0763778338077865e-06, + "loss": 0.3396, + "step": 37802 + }, + { + "epoch": 0.7018338574871066, + "grad_norm": 0.34176456928253174, + "learning_rate": 4.075438065511724e-06, + "loss": 0.3384, + "step": 37804 + }, + { + "epoch": 0.7018709876245252, + "grad_norm": 0.4830511808395386, + "learning_rate": 4.074498377831203e-06, + "loss": 0.1871, + "step": 37806 + }, + { + "epoch": 0.7019081177619438, + "grad_norm": 0.3466936945915222, + "learning_rate": 4.073558770779015e-06, + "loss": 0.1705, + "step": 37808 + }, + { + "epoch": 0.7019452478993625, + "grad_norm": 0.3258918225765228, + "learning_rate": 4.072619244367935e-06, + "loss": 0.2298, + "step": 37810 + }, + { + "epoch": 0.7019823780367811, + "grad_norm": 0.4227856993675232, + "learning_rate": 4.071679798610754e-06, + "loss": 0.4509, + "step": 37812 + }, + { + "epoch": 0.7020195081741998, + "grad_norm": 0.39261776208877563, + "learning_rate": 4.070740433520253e-06, + "loss": 0.2549, + "step": 37814 + }, + { + "epoch": 0.7020566383116184, + "grad_norm": 0.3777517080307007, + "learning_rate": 4.069801149109214e-06, + "loss": 0.2878, + "step": 37816 + }, + { + "epoch": 0.702093768449037, + "grad_norm": 0.29705309867858887, + "learning_rate": 4.068861945390419e-06, + "loss": 0.3676, + "step": 37818 + }, + { + "epoch": 0.7021308985864557, + "grad_norm": 0.4161626994609833, + "learning_rate": 4.06792282237665e-06, + "loss": 0.3213, + "step": 37820 + }, + { + "epoch": 0.7021680287238743, + "grad_norm": 0.42853784561157227, + "learning_rate": 4.06698378008068e-06, + "loss": 0.3133, + "step": 37822 + }, + { + "epoch": 0.702205158861293, + "grad_norm": 0.35260993242263794, + "learning_rate": 4.066044818515292e-06, + "loss": 0.2353, + "step": 37824 + }, + { + "epoch": 0.7022422889987116, + "grad_norm": 0.34866058826446533, + "learning_rate": 4.065105937693257e-06, + "loss": 0.3114, + "step": 37826 + }, + { + "epoch": 0.7022794191361302, + "grad_norm": 0.27555426955223083, + "learning_rate": 4.064167137627353e-06, + "loss": 0.106, + "step": 37828 + }, + { + "epoch": 0.7023165492735488, + "grad_norm": 0.2515064477920532, + "learning_rate": 4.063228418330354e-06, + "loss": 0.2234, + "step": 37830 + }, + { + "epoch": 0.7023536794109675, + "grad_norm": 0.49192655086517334, + "learning_rate": 4.062289779815034e-06, + "loss": 0.3641, + "step": 37832 + }, + { + "epoch": 0.7023908095483862, + "grad_norm": 0.3787946105003357, + "learning_rate": 4.061351222094167e-06, + "loss": 0.214, + "step": 37834 + }, + { + "epoch": 0.7024279396858047, + "grad_norm": 0.43031445145606995, + "learning_rate": 4.06041274518052e-06, + "loss": 0.357, + "step": 37836 + }, + { + "epoch": 0.7024650698232234, + "grad_norm": 0.6578899621963501, + "learning_rate": 4.059474349086862e-06, + "loss": 0.5055, + "step": 37838 + }, + { + "epoch": 0.702502199960642, + "grad_norm": 0.381585031747818, + "learning_rate": 4.058536033825966e-06, + "loss": 0.2847, + "step": 37840 + }, + { + "epoch": 0.7025393300980607, + "grad_norm": 0.2762907147407532, + "learning_rate": 4.057597799410601e-06, + "loss": 0.1899, + "step": 37842 + }, + { + "epoch": 0.7025764602354794, + "grad_norm": 0.5346901416778564, + "learning_rate": 4.056659645853527e-06, + "loss": 0.3735, + "step": 37844 + }, + { + "epoch": 0.702613590372898, + "grad_norm": 0.41578084230422974, + "learning_rate": 4.055721573167516e-06, + "loss": 0.3694, + "step": 37846 + }, + { + "epoch": 0.7026507205103166, + "grad_norm": 1.122694969177246, + "learning_rate": 4.0547835813653255e-06, + "loss": 0.3383, + "step": 37848 + }, + { + "epoch": 0.7026878506477352, + "grad_norm": 0.43183985352516174, + "learning_rate": 4.053845670459723e-06, + "loss": 0.5496, + "step": 37850 + }, + { + "epoch": 0.7027249807851539, + "grad_norm": 0.49359989166259766, + "learning_rate": 4.0529078404634704e-06, + "loss": 0.2695, + "step": 37852 + }, + { + "epoch": 0.7027621109225726, + "grad_norm": 0.3461620807647705, + "learning_rate": 4.051970091389328e-06, + "loss": 0.4237, + "step": 37854 + }, + { + "epoch": 0.7027992410599911, + "grad_norm": 0.3669116795063019, + "learning_rate": 4.051032423250056e-06, + "loss": 0.2242, + "step": 37856 + }, + { + "epoch": 0.7028363711974098, + "grad_norm": 0.32042476534843445, + "learning_rate": 4.050094836058417e-06, + "loss": 0.3721, + "step": 37858 + }, + { + "epoch": 0.7028735013348284, + "grad_norm": 0.41872918605804443, + "learning_rate": 4.049157329827162e-06, + "loss": 0.2149, + "step": 37860 + }, + { + "epoch": 0.7029106314722471, + "grad_norm": 0.5100609064102173, + "learning_rate": 4.048219904569051e-06, + "loss": 0.2104, + "step": 37862 + }, + { + "epoch": 0.7029477616096658, + "grad_norm": 0.31945064663887024, + "learning_rate": 4.047282560296842e-06, + "loss": 0.1569, + "step": 37864 + }, + { + "epoch": 0.7029848917470843, + "grad_norm": 0.39309704303741455, + "learning_rate": 4.046345297023285e-06, + "loss": 0.1939, + "step": 37866 + }, + { + "epoch": 0.703022021884503, + "grad_norm": 0.4301413297653198, + "learning_rate": 4.045408114761134e-06, + "loss": 0.2032, + "step": 37868 + }, + { + "epoch": 0.7030591520219216, + "grad_norm": 0.2673805356025696, + "learning_rate": 4.044471013523147e-06, + "loss": 0.3107, + "step": 37870 + }, + { + "epoch": 0.7030962821593403, + "grad_norm": 0.3798066973686218, + "learning_rate": 4.043533993322066e-06, + "loss": 0.4007, + "step": 37872 + }, + { + "epoch": 0.703133412296759, + "grad_norm": 0.3704039454460144, + "learning_rate": 4.042597054170647e-06, + "loss": 0.2488, + "step": 37874 + }, + { + "epoch": 0.7031705424341775, + "grad_norm": 0.3890950083732605, + "learning_rate": 4.041660196081637e-06, + "loss": 0.3598, + "step": 37876 + }, + { + "epoch": 0.7032076725715962, + "grad_norm": 0.24388466775417328, + "learning_rate": 4.040723419067784e-06, + "loss": 0.1541, + "step": 37878 + }, + { + "epoch": 0.7032448027090148, + "grad_norm": 0.47143498063087463, + "learning_rate": 4.039786723141835e-06, + "loss": 0.3801, + "step": 37880 + }, + { + "epoch": 0.7032819328464335, + "grad_norm": 0.5811455845832825, + "learning_rate": 4.0388501083165365e-06, + "loss": 0.2348, + "step": 37882 + }, + { + "epoch": 0.703319062983852, + "grad_norm": 0.21302053332328796, + "learning_rate": 4.037913574604636e-06, + "loss": 0.2076, + "step": 37884 + }, + { + "epoch": 0.7033561931212707, + "grad_norm": 0.2585172951221466, + "learning_rate": 4.036977122018869e-06, + "loss": 0.3615, + "step": 37886 + }, + { + "epoch": 0.7033933232586894, + "grad_norm": 0.4588501751422882, + "learning_rate": 4.036040750571987e-06, + "loss": 0.3933, + "step": 37888 + }, + { + "epoch": 0.703430453396108, + "grad_norm": 0.3539140820503235, + "learning_rate": 4.035104460276721e-06, + "loss": 0.3266, + "step": 37890 + }, + { + "epoch": 0.7034675835335267, + "grad_norm": 0.2962561845779419, + "learning_rate": 4.034168251145817e-06, + "loss": 0.3606, + "step": 37892 + }, + { + "epoch": 0.7035047136709452, + "grad_norm": 0.37991786003112793, + "learning_rate": 4.033232123192013e-06, + "loss": 0.4681, + "step": 37894 + }, + { + "epoch": 0.7035418438083639, + "grad_norm": 0.37174397706985474, + "learning_rate": 4.0322960764280516e-06, + "loss": 0.3439, + "step": 37896 + }, + { + "epoch": 0.7035789739457826, + "grad_norm": 0.2964187562465668, + "learning_rate": 4.031360110866661e-06, + "loss": 0.2725, + "step": 37898 + }, + { + "epoch": 0.7036161040832012, + "grad_norm": 0.33882254362106323, + "learning_rate": 4.030424226520581e-06, + "loss": 0.4148, + "step": 37900 + }, + { + "epoch": 0.7036532342206199, + "grad_norm": 0.39383015036582947, + "learning_rate": 4.0294884234025475e-06, + "loss": 0.2916, + "step": 37902 + }, + { + "epoch": 0.7036903643580384, + "grad_norm": 0.3852511942386627, + "learning_rate": 4.028552701525293e-06, + "loss": 0.177, + "step": 37904 + }, + { + "epoch": 0.7037274944954571, + "grad_norm": 0.3373967707157135, + "learning_rate": 4.027617060901552e-06, + "loss": 0.2399, + "step": 37906 + }, + { + "epoch": 0.7037646246328758, + "grad_norm": 0.3069818317890167, + "learning_rate": 4.02668150154405e-06, + "loss": 0.2286, + "step": 37908 + }, + { + "epoch": 0.7038017547702944, + "grad_norm": 0.5309270620346069, + "learning_rate": 4.025746023465524e-06, + "loss": 0.2515, + "step": 37910 + }, + { + "epoch": 0.7038388849077131, + "grad_norm": 0.2947355806827545, + "learning_rate": 4.024810626678697e-06, + "loss": 0.169, + "step": 37912 + }, + { + "epoch": 0.7038760150451316, + "grad_norm": 0.3046303689479828, + "learning_rate": 4.023875311196298e-06, + "loss": 0.1359, + "step": 37914 + }, + { + "epoch": 0.7039131451825503, + "grad_norm": 0.42236244678497314, + "learning_rate": 4.022940077031057e-06, + "loss": 0.2365, + "step": 37916 + }, + { + "epoch": 0.703950275319969, + "grad_norm": 0.44367754459381104, + "learning_rate": 4.022004924195698e-06, + "loss": 0.3935, + "step": 37918 + }, + { + "epoch": 0.7039874054573876, + "grad_norm": 0.5501275658607483, + "learning_rate": 4.021069852702946e-06, + "loss": 0.3923, + "step": 37920 + }, + { + "epoch": 0.7040245355948063, + "grad_norm": 0.2703684866428375, + "learning_rate": 4.020134862565527e-06, + "loss": 0.1829, + "step": 37922 + }, + { + "epoch": 0.7040616657322248, + "grad_norm": 0.29451727867126465, + "learning_rate": 4.0191999537961565e-06, + "loss": 0.1554, + "step": 37924 + }, + { + "epoch": 0.7040987958696435, + "grad_norm": 0.301200807094574, + "learning_rate": 4.018265126407563e-06, + "loss": 0.2478, + "step": 37926 + }, + { + "epoch": 0.7041359260070622, + "grad_norm": 0.5121234655380249, + "learning_rate": 4.0173303804124645e-06, + "loss": 0.2792, + "step": 37928 + }, + { + "epoch": 0.7041730561444808, + "grad_norm": 0.4367184042930603, + "learning_rate": 4.016395715823577e-06, + "loss": 0.3085, + "step": 37930 + }, + { + "epoch": 0.7042101862818995, + "grad_norm": 0.3541370630264282, + "learning_rate": 4.01546113265362e-06, + "loss": 0.3836, + "step": 37932 + }, + { + "epoch": 0.704247316419318, + "grad_norm": 0.45512792468070984, + "learning_rate": 4.014526630915315e-06, + "loss": 0.2901, + "step": 37934 + }, + { + "epoch": 0.7042844465567367, + "grad_norm": 0.4876466989517212, + "learning_rate": 4.013592210621371e-06, + "loss": 0.3508, + "step": 37936 + }, + { + "epoch": 0.7043215766941553, + "grad_norm": 0.4165050983428955, + "learning_rate": 4.012657871784506e-06, + "loss": 0.2464, + "step": 37938 + }, + { + "epoch": 0.704358706831574, + "grad_norm": 0.3527783751487732, + "learning_rate": 4.011723614417432e-06, + "loss": 0.3196, + "step": 37940 + }, + { + "epoch": 0.7043958369689927, + "grad_norm": 1.5926580429077148, + "learning_rate": 4.010789438532864e-06, + "loss": 0.3011, + "step": 37942 + }, + { + "epoch": 0.7044329671064112, + "grad_norm": 0.49057716131210327, + "learning_rate": 4.00985534414351e-06, + "loss": 0.4677, + "step": 37944 + }, + { + "epoch": 0.7044700972438299, + "grad_norm": 0.3688822388648987, + "learning_rate": 4.0089213312620875e-06, + "loss": 0.3078, + "step": 37946 + }, + { + "epoch": 0.7045072273812485, + "grad_norm": 0.2785656750202179, + "learning_rate": 4.0079873999012966e-06, + "loss": 0.1786, + "step": 37948 + }, + { + "epoch": 0.7045443575186672, + "grad_norm": 0.40423738956451416, + "learning_rate": 4.007053550073852e-06, + "loss": 0.254, + "step": 37950 + }, + { + "epoch": 0.7045814876560859, + "grad_norm": 0.698407769203186, + "learning_rate": 4.0061197817924545e-06, + "loss": 0.272, + "step": 37952 + }, + { + "epoch": 0.7046186177935044, + "grad_norm": 0.4202597439289093, + "learning_rate": 4.005186095069813e-06, + "loss": 0.3807, + "step": 37954 + }, + { + "epoch": 0.7046557479309231, + "grad_norm": 0.34914085268974304, + "learning_rate": 4.004252489918632e-06, + "loss": 0.313, + "step": 37956 + }, + { + "epoch": 0.7046928780683417, + "grad_norm": 0.5189908146858215, + "learning_rate": 4.003318966351616e-06, + "loss": 0.2182, + "step": 37958 + }, + { + "epoch": 0.7047300082057604, + "grad_norm": 0.45974013209342957, + "learning_rate": 4.002385524381469e-06, + "loss": 0.2361, + "step": 37960 + }, + { + "epoch": 0.7047671383431791, + "grad_norm": 0.525264322757721, + "learning_rate": 4.001452164020887e-06, + "loss": 0.4253, + "step": 37962 + }, + { + "epoch": 0.7048042684805976, + "grad_norm": 0.3296203315258026, + "learning_rate": 4.0005188852825734e-06, + "loss": 0.2243, + "step": 37964 + }, + { + "epoch": 0.7048413986180163, + "grad_norm": 0.47202837467193604, + "learning_rate": 3.999585688179228e-06, + "loss": 0.4431, + "step": 37966 + }, + { + "epoch": 0.7048785287554349, + "grad_norm": 0.27697524428367615, + "learning_rate": 3.998652572723547e-06, + "loss": 0.1147, + "step": 37968 + }, + { + "epoch": 0.7049156588928536, + "grad_norm": 0.4982176721096039, + "learning_rate": 3.997719538928233e-06, + "loss": 0.2125, + "step": 37970 + }, + { + "epoch": 0.7049527890302723, + "grad_norm": 0.2884954512119293, + "learning_rate": 3.996786586805976e-06, + "loss": 0.365, + "step": 37972 + }, + { + "epoch": 0.7049899191676908, + "grad_norm": 0.3994104862213135, + "learning_rate": 3.9958537163694685e-06, + "loss": 0.0851, + "step": 37974 + }, + { + "epoch": 0.7050270493051095, + "grad_norm": 0.32072240114212036, + "learning_rate": 3.994920927631408e-06, + "loss": 0.3425, + "step": 37976 + }, + { + "epoch": 0.7050641794425281, + "grad_norm": 0.39811787009239197, + "learning_rate": 3.993988220604485e-06, + "loss": 0.2506, + "step": 37978 + }, + { + "epoch": 0.7051013095799468, + "grad_norm": 0.35012760758399963, + "learning_rate": 3.993055595301394e-06, + "loss": 0.179, + "step": 37980 + }, + { + "epoch": 0.7051384397173653, + "grad_norm": 0.48031920194625854, + "learning_rate": 3.992123051734823e-06, + "loss": 0.2654, + "step": 37982 + }, + { + "epoch": 0.705175569854784, + "grad_norm": 0.45286983251571655, + "learning_rate": 3.991190589917465e-06, + "loss": 0.2786, + "step": 37984 + }, + { + "epoch": 0.7052126999922027, + "grad_norm": 0.4098431169986725, + "learning_rate": 3.990258209862001e-06, + "loss": 0.1861, + "step": 37986 + }, + { + "epoch": 0.7052498301296213, + "grad_norm": 0.4155251085758209, + "learning_rate": 3.989325911581121e-06, + "loss": 0.3062, + "step": 37988 + }, + { + "epoch": 0.70528696026704, + "grad_norm": 0.3577605187892914, + "learning_rate": 3.988393695087512e-06, + "loss": 0.1595, + "step": 37990 + }, + { + "epoch": 0.7053240904044585, + "grad_norm": 0.6608291864395142, + "learning_rate": 3.987461560393862e-06, + "loss": 0.3441, + "step": 37992 + }, + { + "epoch": 0.7053612205418772, + "grad_norm": 0.39614182710647583, + "learning_rate": 3.986529507512845e-06, + "loss": 0.4644, + "step": 37994 + }, + { + "epoch": 0.7053983506792959, + "grad_norm": 0.66713947057724, + "learning_rate": 3.985597536457151e-06, + "loss": 0.2748, + "step": 37996 + }, + { + "epoch": 0.7054354808167145, + "grad_norm": 0.37530624866485596, + "learning_rate": 3.984665647239462e-06, + "loss": 0.1798, + "step": 37998 + }, + { + "epoch": 0.7054726109541332, + "grad_norm": 0.5856049656867981, + "learning_rate": 3.9837338398724525e-06, + "loss": 0.1973, + "step": 38000 + }, + { + "epoch": 0.7055097410915517, + "grad_norm": 0.3465645909309387, + "learning_rate": 3.982802114368803e-06, + "loss": 0.1632, + "step": 38002 + }, + { + "epoch": 0.7055468712289704, + "grad_norm": 0.5121181607246399, + "learning_rate": 3.981870470741195e-06, + "loss": 0.2642, + "step": 38004 + }, + { + "epoch": 0.7055840013663891, + "grad_norm": 0.3532576858997345, + "learning_rate": 3.980938909002303e-06, + "loss": 0.4296, + "step": 38006 + }, + { + "epoch": 0.7056211315038077, + "grad_norm": 0.5497809052467346, + "learning_rate": 3.980007429164803e-06, + "loss": 0.247, + "step": 38008 + }, + { + "epoch": 0.7056582616412264, + "grad_norm": 0.3505820631980896, + "learning_rate": 3.979076031241374e-06, + "loss": 0.1985, + "step": 38010 + }, + { + "epoch": 0.7056953917786449, + "grad_norm": 0.3434463441371918, + "learning_rate": 3.9781447152446805e-06, + "loss": 0.3259, + "step": 38012 + }, + { + "epoch": 0.7057325219160636, + "grad_norm": 0.39860329031944275, + "learning_rate": 3.977213481187404e-06, + "loss": 0.3177, + "step": 38014 + }, + { + "epoch": 0.7057696520534823, + "grad_norm": 0.46372976899147034, + "learning_rate": 3.97628232908221e-06, + "loss": 0.2025, + "step": 38016 + }, + { + "epoch": 0.7058067821909009, + "grad_norm": 0.2584788203239441, + "learning_rate": 3.975351258941769e-06, + "loss": 0.1173, + "step": 38018 + }, + { + "epoch": 0.7058439123283196, + "grad_norm": 0.3340776264667511, + "learning_rate": 3.974420270778751e-06, + "loss": 0.2663, + "step": 38020 + }, + { + "epoch": 0.7058810424657381, + "grad_norm": 0.41627636551856995, + "learning_rate": 3.973489364605828e-06, + "loss": 0.2294, + "step": 38022 + }, + { + "epoch": 0.7059181726031568, + "grad_norm": 0.4376927316188812, + "learning_rate": 3.97255854043566e-06, + "loss": 0.2727, + "step": 38024 + }, + { + "epoch": 0.7059553027405755, + "grad_norm": 0.5108228325843811, + "learning_rate": 3.9716277982809156e-06, + "loss": 0.3173, + "step": 38026 + }, + { + "epoch": 0.7059924328779941, + "grad_norm": 0.3428100347518921, + "learning_rate": 3.97069713815426e-06, + "loss": 0.371, + "step": 38028 + }, + { + "epoch": 0.7060295630154128, + "grad_norm": 0.6225501894950867, + "learning_rate": 3.969766560068358e-06, + "loss": 0.228, + "step": 38030 + }, + { + "epoch": 0.7060666931528313, + "grad_norm": 0.41492998600006104, + "learning_rate": 3.968836064035872e-06, + "loss": 0.2139, + "step": 38032 + }, + { + "epoch": 0.70610382329025, + "grad_norm": 0.4596366882324219, + "learning_rate": 3.967905650069459e-06, + "loss": 0.3766, + "step": 38034 + }, + { + "epoch": 0.7061409534276686, + "grad_norm": 0.5141488909721375, + "learning_rate": 3.966975318181785e-06, + "loss": 0.2643, + "step": 38036 + }, + { + "epoch": 0.7061780835650873, + "grad_norm": 0.3649926781654358, + "learning_rate": 3.9660450683855036e-06, + "loss": 0.1303, + "step": 38038 + }, + { + "epoch": 0.706215213702506, + "grad_norm": 0.35929074883461, + "learning_rate": 3.965114900693273e-06, + "loss": 0.2422, + "step": 38040 + }, + { + "epoch": 0.7062523438399245, + "grad_norm": 0.6279309988021851, + "learning_rate": 3.964184815117754e-06, + "loss": 0.3345, + "step": 38042 + }, + { + "epoch": 0.7062894739773432, + "grad_norm": 0.32389530539512634, + "learning_rate": 3.9632548116716e-06, + "loss": 0.2764, + "step": 38044 + }, + { + "epoch": 0.7063266041147618, + "grad_norm": 0.5120017528533936, + "learning_rate": 3.9623248903674645e-06, + "loss": 0.3296, + "step": 38046 + }, + { + "epoch": 0.7063637342521805, + "grad_norm": 0.33234813809394836, + "learning_rate": 3.961395051218007e-06, + "loss": 0.3002, + "step": 38048 + }, + { + "epoch": 0.7064008643895991, + "grad_norm": 0.36734670400619507, + "learning_rate": 3.960465294235871e-06, + "loss": 0.1954, + "step": 38050 + }, + { + "epoch": 0.7064379945270177, + "grad_norm": 0.38332879543304443, + "learning_rate": 3.959535619433713e-06, + "loss": 0.1737, + "step": 38052 + }, + { + "epoch": 0.7064751246644364, + "grad_norm": 0.29633790254592896, + "learning_rate": 3.958606026824181e-06, + "loss": 0.2484, + "step": 38054 + }, + { + "epoch": 0.706512254801855, + "grad_norm": 0.4117186367511749, + "learning_rate": 3.957676516419929e-06, + "loss": 0.3316, + "step": 38056 + }, + { + "epoch": 0.7065493849392737, + "grad_norm": 0.5229418873786926, + "learning_rate": 3.956747088233596e-06, + "loss": 0.1482, + "step": 38058 + }, + { + "epoch": 0.7065865150766923, + "grad_norm": 0.5126410126686096, + "learning_rate": 3.9558177422778375e-06, + "loss": 0.21, + "step": 38060 + }, + { + "epoch": 0.7066236452141109, + "grad_norm": 0.20744261145591736, + "learning_rate": 3.954888478565292e-06, + "loss": 0.2527, + "step": 38062 + }, + { + "epoch": 0.7066607753515296, + "grad_norm": 0.47574397921562195, + "learning_rate": 3.953959297108607e-06, + "loss": 0.2835, + "step": 38064 + }, + { + "epoch": 0.7066979054889482, + "grad_norm": 0.44473251700401306, + "learning_rate": 3.953030197920427e-06, + "loss": 0.2474, + "step": 38066 + }, + { + "epoch": 0.7067350356263669, + "grad_norm": 0.3982432782649994, + "learning_rate": 3.952101181013391e-06, + "loss": 0.2211, + "step": 38068 + }, + { + "epoch": 0.7067721657637855, + "grad_norm": 0.49544739723205566, + "learning_rate": 3.951172246400143e-06, + "loss": 0.2934, + "step": 38070 + }, + { + "epoch": 0.7068092959012041, + "grad_norm": 0.47970259189605713, + "learning_rate": 3.950243394093327e-06, + "loss": 0.2967, + "step": 38072 + }, + { + "epoch": 0.7068464260386228, + "grad_norm": 0.2899574339389801, + "learning_rate": 3.9493146241055735e-06, + "loss": 0.27, + "step": 38074 + }, + { + "epoch": 0.7068835561760414, + "grad_norm": 0.3603079319000244, + "learning_rate": 3.948385936449524e-06, + "loss": 0.3339, + "step": 38076 + }, + { + "epoch": 0.7069206863134601, + "grad_norm": 0.4132198095321655, + "learning_rate": 3.947457331137818e-06, + "loss": 0.1378, + "step": 38078 + }, + { + "epoch": 0.7069578164508787, + "grad_norm": 0.3556966483592987, + "learning_rate": 3.946528808183086e-06, + "loss": 0.1308, + "step": 38080 + }, + { + "epoch": 0.7069949465882973, + "grad_norm": 0.2878161370754242, + "learning_rate": 3.945600367597964e-06, + "loss": 0.202, + "step": 38082 + }, + { + "epoch": 0.707032076725716, + "grad_norm": 0.28508999943733215, + "learning_rate": 3.944672009395085e-06, + "loss": 0.3071, + "step": 38084 + }, + { + "epoch": 0.7070692068631346, + "grad_norm": 0.39727121591567993, + "learning_rate": 3.943743733587088e-06, + "loss": 0.2027, + "step": 38086 + }, + { + "epoch": 0.7071063370005533, + "grad_norm": 0.5152698755264282, + "learning_rate": 3.942815540186593e-06, + "loss": 0.2075, + "step": 38088 + }, + { + "epoch": 0.7071434671379718, + "grad_norm": 0.5086638331413269, + "learning_rate": 3.941887429206235e-06, + "loss": 0.4166, + "step": 38090 + }, + { + "epoch": 0.7071805972753905, + "grad_norm": 0.39790186285972595, + "learning_rate": 3.940959400658644e-06, + "loss": 0.2139, + "step": 38092 + }, + { + "epoch": 0.7072177274128092, + "grad_norm": 0.7603560090065002, + "learning_rate": 3.940031454556446e-06, + "loss": 0.4891, + "step": 38094 + }, + { + "epoch": 0.7072548575502278, + "grad_norm": 0.28156888484954834, + "learning_rate": 3.939103590912271e-06, + "loss": 0.1504, + "step": 38096 + }, + { + "epoch": 0.7072919876876465, + "grad_norm": 0.5872491002082825, + "learning_rate": 3.9381758097387394e-06, + "loss": 0.2438, + "step": 38098 + }, + { + "epoch": 0.707329117825065, + "grad_norm": 0.44886723160743713, + "learning_rate": 3.937248111048481e-06, + "loss": 0.2038, + "step": 38100 + }, + { + "epoch": 0.7073662479624837, + "grad_norm": 0.45089030265808105, + "learning_rate": 3.9363204948541115e-06, + "loss": 0.2464, + "step": 38102 + }, + { + "epoch": 0.7074033780999024, + "grad_norm": 0.35610711574554443, + "learning_rate": 3.935392961168257e-06, + "loss": 0.2474, + "step": 38104 + }, + { + "epoch": 0.707440508237321, + "grad_norm": 0.5256989598274231, + "learning_rate": 3.93446551000354e-06, + "loss": 0.1086, + "step": 38106 + }, + { + "epoch": 0.7074776383747396, + "grad_norm": 0.2214541733264923, + "learning_rate": 3.933538141372578e-06, + "loss": 0.2109, + "step": 38108 + }, + { + "epoch": 0.7075147685121582, + "grad_norm": 0.2744998037815094, + "learning_rate": 3.932610855287995e-06, + "loss": 0.3723, + "step": 38110 + }, + { + "epoch": 0.7075518986495769, + "grad_norm": 0.47936129570007324, + "learning_rate": 3.9316836517624e-06, + "loss": 0.2884, + "step": 38112 + }, + { + "epoch": 0.7075890287869956, + "grad_norm": 0.24909447133541107, + "learning_rate": 3.9307565308084135e-06, + "loss": 0.2459, + "step": 38114 + }, + { + "epoch": 0.7076261589244142, + "grad_norm": 0.44297605752944946, + "learning_rate": 3.929829492438653e-06, + "loss": 0.3863, + "step": 38116 + }, + { + "epoch": 0.7076632890618328, + "grad_norm": 0.45579683780670166, + "learning_rate": 3.928902536665733e-06, + "loss": 0.2943, + "step": 38118 + }, + { + "epoch": 0.7077004191992514, + "grad_norm": 0.3342800736427307, + "learning_rate": 3.9279756635022616e-06, + "loss": 0.3287, + "step": 38120 + }, + { + "epoch": 0.7077375493366701, + "grad_norm": 0.30340835452079773, + "learning_rate": 3.927048872960852e-06, + "loss": 0.4079, + "step": 38122 + }, + { + "epoch": 0.7077746794740888, + "grad_norm": 0.9781947731971741, + "learning_rate": 3.926122165054122e-06, + "loss": 0.3654, + "step": 38124 + }, + { + "epoch": 0.7078118096115074, + "grad_norm": 0.5402278900146484, + "learning_rate": 3.925195539794672e-06, + "loss": 0.2974, + "step": 38126 + }, + { + "epoch": 0.707848939748926, + "grad_norm": 0.507876455783844, + "learning_rate": 3.924268997195114e-06, + "loss": 0.4756, + "step": 38128 + }, + { + "epoch": 0.7078860698863446, + "grad_norm": 0.6688075065612793, + "learning_rate": 3.923342537268057e-06, + "loss": 0.394, + "step": 38130 + }, + { + "epoch": 0.7079232000237633, + "grad_norm": 0.352760910987854, + "learning_rate": 3.922416160026106e-06, + "loss": 0.2092, + "step": 38132 + }, + { + "epoch": 0.7079603301611819, + "grad_norm": 0.5699179172515869, + "learning_rate": 3.921489865481866e-06, + "loss": 0.3282, + "step": 38134 + }, + { + "epoch": 0.7079974602986006, + "grad_norm": 0.5377879738807678, + "learning_rate": 3.920563653647945e-06, + "loss": 0.2879, + "step": 38136 + }, + { + "epoch": 0.7080345904360192, + "grad_norm": 0.29251334071159363, + "learning_rate": 3.91963752453694e-06, + "loss": 0.1489, + "step": 38138 + }, + { + "epoch": 0.7080717205734378, + "grad_norm": 0.5590978264808655, + "learning_rate": 3.918711478161459e-06, + "loss": 0.2684, + "step": 38140 + }, + { + "epoch": 0.7081088507108565, + "grad_norm": 0.26250067353248596, + "learning_rate": 3.917785514534095e-06, + "loss": 0.2106, + "step": 38142 + }, + { + "epoch": 0.7081459808482751, + "grad_norm": 0.4042779803276062, + "learning_rate": 3.916859633667453e-06, + "loss": 0.5555, + "step": 38144 + }, + { + "epoch": 0.7081831109856938, + "grad_norm": 0.29905450344085693, + "learning_rate": 3.915933835574128e-06, + "loss": 0.1896, + "step": 38146 + }, + { + "epoch": 0.7082202411231124, + "grad_norm": 0.33537599444389343, + "learning_rate": 3.915008120266724e-06, + "loss": 0.2511, + "step": 38148 + }, + { + "epoch": 0.708257371260531, + "grad_norm": 0.3405703008174896, + "learning_rate": 3.914082487757831e-06, + "loss": 0.5252, + "step": 38150 + }, + { + "epoch": 0.7082945013979497, + "grad_norm": 0.462399959564209, + "learning_rate": 3.913156938060044e-06, + "loss": 0.4158, + "step": 38152 + }, + { + "epoch": 0.7083316315353683, + "grad_norm": 0.7287722229957581, + "learning_rate": 3.91223147118596e-06, + "loss": 0.3191, + "step": 38154 + }, + { + "epoch": 0.708368761672787, + "grad_norm": 0.3973499536514282, + "learning_rate": 3.911306087148171e-06, + "loss": 0.2025, + "step": 38156 + }, + { + "epoch": 0.7084058918102056, + "grad_norm": 0.4547470211982727, + "learning_rate": 3.910380785959268e-06, + "loss": 0.2317, + "step": 38158 + }, + { + "epoch": 0.7084430219476242, + "grad_norm": 0.2922212481498718, + "learning_rate": 3.909455567631845e-06, + "loss": 0.1776, + "step": 38160 + }, + { + "epoch": 0.7084801520850429, + "grad_norm": 0.48210448026657104, + "learning_rate": 3.9085304321784845e-06, + "loss": 0.2041, + "step": 38162 + }, + { + "epoch": 0.7085172822224615, + "grad_norm": 0.3467679023742676, + "learning_rate": 3.907605379611783e-06, + "loss": 0.2111, + "step": 38164 + }, + { + "epoch": 0.7085544123598801, + "grad_norm": 0.4955320358276367, + "learning_rate": 3.90668040994432e-06, + "loss": 0.4252, + "step": 38166 + }, + { + "epoch": 0.7085915424972988, + "grad_norm": 0.3571617007255554, + "learning_rate": 3.9057555231886856e-06, + "loss": 0.1792, + "step": 38168 + }, + { + "epoch": 0.7086286726347174, + "grad_norm": 0.32618558406829834, + "learning_rate": 3.904830719357463e-06, + "loss": 0.2523, + "step": 38170 + }, + { + "epoch": 0.7086658027721361, + "grad_norm": 0.3874558210372925, + "learning_rate": 3.903905998463238e-06, + "loss": 0.5445, + "step": 38172 + }, + { + "epoch": 0.7087029329095547, + "grad_norm": 0.33972954750061035, + "learning_rate": 3.902981360518595e-06, + "loss": 0.3982, + "step": 38174 + }, + { + "epoch": 0.7087400630469733, + "grad_norm": 0.35426416993141174, + "learning_rate": 3.902056805536109e-06, + "loss": 0.1883, + "step": 38176 + }, + { + "epoch": 0.708777193184392, + "grad_norm": 0.40433260798454285, + "learning_rate": 3.9011323335283665e-06, + "loss": 0.2872, + "step": 38178 + }, + { + "epoch": 0.7088143233218106, + "grad_norm": 0.36834022402763367, + "learning_rate": 3.900207944507944e-06, + "loss": 0.2822, + "step": 38180 + }, + { + "epoch": 0.7088514534592293, + "grad_norm": 0.5159149169921875, + "learning_rate": 3.899283638487424e-06, + "loss": 0.226, + "step": 38182 + }, + { + "epoch": 0.7088885835966479, + "grad_norm": 0.3613691031932831, + "learning_rate": 3.898359415479375e-06, + "loss": 0.2794, + "step": 38184 + }, + { + "epoch": 0.7089257137340665, + "grad_norm": 0.3555360734462738, + "learning_rate": 3.897435275496383e-06, + "loss": 0.4158, + "step": 38186 + }, + { + "epoch": 0.7089628438714851, + "grad_norm": 0.44267693161964417, + "learning_rate": 3.896511218551013e-06, + "loss": 0.2187, + "step": 38188 + }, + { + "epoch": 0.7089999740089038, + "grad_norm": 0.46430134773254395, + "learning_rate": 3.895587244655844e-06, + "loss": 0.3222, + "step": 38190 + }, + { + "epoch": 0.7090371041463225, + "grad_norm": 0.19684863090515137, + "learning_rate": 3.894663353823447e-06, + "loss": 0.2767, + "step": 38192 + }, + { + "epoch": 0.7090742342837411, + "grad_norm": 0.3146318793296814, + "learning_rate": 3.893739546066395e-06, + "loss": 0.2753, + "step": 38194 + }, + { + "epoch": 0.7091113644211597, + "grad_norm": 0.4783351421356201, + "learning_rate": 3.892815821397257e-06, + "loss": 0.4096, + "step": 38196 + }, + { + "epoch": 0.7091484945585783, + "grad_norm": 0.4032927453517914, + "learning_rate": 3.891892179828605e-06, + "loss": 0.1762, + "step": 38198 + }, + { + "epoch": 0.709185624695997, + "grad_norm": 0.3325378894805908, + "learning_rate": 3.890968621373004e-06, + "loss": 0.3008, + "step": 38200 + }, + { + "epoch": 0.7092227548334157, + "grad_norm": 0.21624745428562164, + "learning_rate": 3.890045146043019e-06, + "loss": 0.1306, + "step": 38202 + }, + { + "epoch": 0.7092598849708343, + "grad_norm": 0.3842718005180359, + "learning_rate": 3.889121753851222e-06, + "loss": 0.1393, + "step": 38204 + }, + { + "epoch": 0.7092970151082529, + "grad_norm": 0.4516873359680176, + "learning_rate": 3.888198444810169e-06, + "loss": 0.2511, + "step": 38206 + }, + { + "epoch": 0.7093341452456715, + "grad_norm": 0.6297973394393921, + "learning_rate": 3.8872752189324305e-06, + "loss": 0.3865, + "step": 38208 + }, + { + "epoch": 0.7093712753830902, + "grad_norm": 0.618631899356842, + "learning_rate": 3.886352076230565e-06, + "loss": 0.3341, + "step": 38210 + }, + { + "epoch": 0.7094084055205089, + "grad_norm": 0.38146597146987915, + "learning_rate": 3.885429016717139e-06, + "loss": 0.5065, + "step": 38212 + }, + { + "epoch": 0.7094455356579275, + "grad_norm": 0.42282363772392273, + "learning_rate": 3.884506040404706e-06, + "loss": 0.2547, + "step": 38214 + }, + { + "epoch": 0.7094826657953461, + "grad_norm": 0.37700262665748596, + "learning_rate": 3.883583147305828e-06, + "loss": 0.2777, + "step": 38216 + }, + { + "epoch": 0.7095197959327647, + "grad_norm": 0.4291996657848358, + "learning_rate": 3.882660337433061e-06, + "loss": 0.2212, + "step": 38218 + }, + { + "epoch": 0.7095569260701834, + "grad_norm": 0.23527291417121887, + "learning_rate": 3.881737610798965e-06, + "loss": 0.2815, + "step": 38220 + }, + { + "epoch": 0.7095940562076021, + "grad_norm": 0.28370603919029236, + "learning_rate": 3.880814967416093e-06, + "loss": 0.1791, + "step": 38222 + }, + { + "epoch": 0.7096311863450206, + "grad_norm": 0.4876558482646942, + "learning_rate": 3.879892407297004e-06, + "loss": 0.2803, + "step": 38224 + }, + { + "epoch": 0.7096683164824393, + "grad_norm": 0.2863287329673767, + "learning_rate": 3.878969930454247e-06, + "loss": 0.1425, + "step": 38226 + }, + { + "epoch": 0.7097054466198579, + "grad_norm": 0.3879002332687378, + "learning_rate": 3.878047536900371e-06, + "loss": 0.3444, + "step": 38228 + }, + { + "epoch": 0.7097425767572766, + "grad_norm": 0.2571834921836853, + "learning_rate": 3.877125226647932e-06, + "loss": 0.3097, + "step": 38230 + }, + { + "epoch": 0.7097797068946953, + "grad_norm": 0.45022931694984436, + "learning_rate": 3.876202999709477e-06, + "loss": 0.4099, + "step": 38232 + }, + { + "epoch": 0.7098168370321138, + "grad_norm": 0.384562224149704, + "learning_rate": 3.875280856097558e-06, + "loss": 0.3022, + "step": 38234 + }, + { + "epoch": 0.7098539671695325, + "grad_norm": 0.8028783202171326, + "learning_rate": 3.874358795824722e-06, + "loss": 0.258, + "step": 38236 + }, + { + "epoch": 0.7098910973069511, + "grad_norm": 0.667856752872467, + "learning_rate": 3.873436818903512e-06, + "loss": 0.3203, + "step": 38238 + }, + { + "epoch": 0.7099282274443698, + "grad_norm": 0.6080194115638733, + "learning_rate": 3.872514925346476e-06, + "loss": 0.4514, + "step": 38240 + }, + { + "epoch": 0.7099653575817884, + "grad_norm": 0.44267651438713074, + "learning_rate": 3.871593115166158e-06, + "loss": 0.3062, + "step": 38242 + }, + { + "epoch": 0.710002487719207, + "grad_norm": 0.39824554324150085, + "learning_rate": 3.8706713883751e-06, + "loss": 0.3794, + "step": 38244 + }, + { + "epoch": 0.7100396178566257, + "grad_norm": 0.4751010239124298, + "learning_rate": 3.86974974498585e-06, + "loss": 0.3806, + "step": 38246 + }, + { + "epoch": 0.7100767479940443, + "grad_norm": 0.24562574923038483, + "learning_rate": 3.868828185010939e-06, + "loss": 0.2565, + "step": 38248 + }, + { + "epoch": 0.710113878131463, + "grad_norm": 0.42882511019706726, + "learning_rate": 3.867906708462915e-06, + "loss": 0.4097, + "step": 38250 + }, + { + "epoch": 0.7101510082688816, + "grad_norm": 0.2850572466850281, + "learning_rate": 3.86698531535431e-06, + "loss": 0.1546, + "step": 38252 + }, + { + "epoch": 0.7101881384063002, + "grad_norm": 0.6860714554786682, + "learning_rate": 3.866064005697664e-06, + "loss": 0.2506, + "step": 38254 + }, + { + "epoch": 0.7102252685437189, + "grad_norm": 0.2567194402217865, + "learning_rate": 3.865142779505514e-06, + "loss": 0.1642, + "step": 38256 + }, + { + "epoch": 0.7102623986811375, + "grad_norm": 0.42734935879707336, + "learning_rate": 3.864221636790395e-06, + "loss": 0.2507, + "step": 38258 + }, + { + "epoch": 0.7102995288185562, + "grad_norm": 0.35731270909309387, + "learning_rate": 3.86330057756484e-06, + "loss": 0.3818, + "step": 38260 + }, + { + "epoch": 0.7103366589559748, + "grad_norm": 0.3298490643501282, + "learning_rate": 3.862379601841386e-06, + "loss": 0.391, + "step": 38262 + }, + { + "epoch": 0.7103737890933934, + "grad_norm": 0.32279857993125916, + "learning_rate": 3.8614587096325585e-06, + "loss": 0.412, + "step": 38264 + }, + { + "epoch": 0.7104109192308121, + "grad_norm": 0.3170374035835266, + "learning_rate": 3.860537900950891e-06, + "loss": 0.2558, + "step": 38266 + }, + { + "epoch": 0.7104480493682307, + "grad_norm": 0.4230540692806244, + "learning_rate": 3.859617175808915e-06, + "loss": 0.253, + "step": 38268 + }, + { + "epoch": 0.7104851795056494, + "grad_norm": 0.4637199938297272, + "learning_rate": 3.858696534219154e-06, + "loss": 0.1647, + "step": 38270 + }, + { + "epoch": 0.710522309643068, + "grad_norm": 0.6628230810165405, + "learning_rate": 3.857775976194138e-06, + "loss": 0.23, + "step": 38272 + }, + { + "epoch": 0.7105594397804866, + "grad_norm": 0.35321223735809326, + "learning_rate": 3.8568555017463975e-06, + "loss": 0.1419, + "step": 38274 + }, + { + "epoch": 0.7105965699179053, + "grad_norm": 0.39736875891685486, + "learning_rate": 3.855935110888447e-06, + "loss": 0.3967, + "step": 38276 + }, + { + "epoch": 0.7106337000553239, + "grad_norm": 0.4303283989429474, + "learning_rate": 3.855014803632817e-06, + "loss": 0.3366, + "step": 38278 + }, + { + "epoch": 0.7106708301927426, + "grad_norm": 0.2547784447669983, + "learning_rate": 3.854094579992029e-06, + "loss": 0.3631, + "step": 38280 + }, + { + "epoch": 0.7107079603301611, + "grad_norm": 0.39638444781303406, + "learning_rate": 3.853174439978605e-06, + "loss": 0.1956, + "step": 38282 + }, + { + "epoch": 0.7107450904675798, + "grad_norm": 0.4775794446468353, + "learning_rate": 3.852254383605065e-06, + "loss": 0.3613, + "step": 38284 + }, + { + "epoch": 0.7107822206049984, + "grad_norm": 0.48790913820266724, + "learning_rate": 3.851334410883932e-06, + "loss": 0.3249, + "step": 38286 + }, + { + "epoch": 0.7108193507424171, + "grad_norm": 0.5363473892211914, + "learning_rate": 3.8504145218277164e-06, + "loss": 0.4129, + "step": 38288 + }, + { + "epoch": 0.7108564808798358, + "grad_norm": 0.3864003121852875, + "learning_rate": 3.849494716448943e-06, + "loss": 0.2546, + "step": 38290 + }, + { + "epoch": 0.7108936110172543, + "grad_norm": 0.35749727487564087, + "learning_rate": 3.848574994760119e-06, + "loss": 0.3398, + "step": 38292 + }, + { + "epoch": 0.710930741154673, + "grad_norm": 0.35058918595314026, + "learning_rate": 3.847655356773764e-06, + "loss": 0.3352, + "step": 38294 + }, + { + "epoch": 0.7109678712920916, + "grad_norm": 0.5434895753860474, + "learning_rate": 3.846735802502391e-06, + "loss": 0.2436, + "step": 38296 + }, + { + "epoch": 0.7110050014295103, + "grad_norm": 0.44131389260292053, + "learning_rate": 3.845816331958514e-06, + "loss": 0.1371, + "step": 38298 + }, + { + "epoch": 0.711042131566929, + "grad_norm": 0.36640387773513794, + "learning_rate": 3.844896945154645e-06, + "loss": 0.2946, + "step": 38300 + }, + { + "epoch": 0.7110792617043475, + "grad_norm": 0.35546785593032837, + "learning_rate": 3.84397764210329e-06, + "loss": 0.2135, + "step": 38302 + }, + { + "epoch": 0.7111163918417662, + "grad_norm": 0.3662308156490326, + "learning_rate": 3.843058422816958e-06, + "loss": 0.4272, + "step": 38304 + }, + { + "epoch": 0.7111535219791848, + "grad_norm": 0.3492467999458313, + "learning_rate": 3.842139287308159e-06, + "loss": 0.2522, + "step": 38306 + }, + { + "epoch": 0.7111906521166035, + "grad_norm": 0.5802592039108276, + "learning_rate": 3.841220235589399e-06, + "loss": 0.2764, + "step": 38308 + }, + { + "epoch": 0.7112277822540222, + "grad_norm": 0.31080788373947144, + "learning_rate": 3.840301267673189e-06, + "loss": 0.1266, + "step": 38310 + }, + { + "epoch": 0.7112649123914407, + "grad_norm": 0.38897669315338135, + "learning_rate": 3.839382383572027e-06, + "loss": 0.3534, + "step": 38312 + }, + { + "epoch": 0.7113020425288594, + "grad_norm": 0.5285455584526062, + "learning_rate": 3.838463583298414e-06, + "loss": 0.2464, + "step": 38314 + }, + { + "epoch": 0.711339172666278, + "grad_norm": 0.2628207802772522, + "learning_rate": 3.837544866864856e-06, + "loss": 0.2145, + "step": 38316 + }, + { + "epoch": 0.7113763028036967, + "grad_norm": 0.5825287103652954, + "learning_rate": 3.836626234283854e-06, + "loss": 0.3187, + "step": 38318 + }, + { + "epoch": 0.7114134329411154, + "grad_norm": 0.28564298152923584, + "learning_rate": 3.835707685567908e-06, + "loss": 0.0994, + "step": 38320 + }, + { + "epoch": 0.7114505630785339, + "grad_norm": 0.3276146650314331, + "learning_rate": 3.834789220729516e-06, + "loss": 0.3169, + "step": 38322 + }, + { + "epoch": 0.7114876932159526, + "grad_norm": 0.4152318239212036, + "learning_rate": 3.833870839781178e-06, + "loss": 0.2812, + "step": 38324 + }, + { + "epoch": 0.7115248233533712, + "grad_norm": 0.34961146116256714, + "learning_rate": 3.832952542735387e-06, + "loss": 0.3839, + "step": 38326 + }, + { + "epoch": 0.7115619534907899, + "grad_norm": 0.3483884036540985, + "learning_rate": 3.832034329604637e-06, + "loss": 0.2881, + "step": 38328 + }, + { + "epoch": 0.7115990836282086, + "grad_norm": 0.2815057337284088, + "learning_rate": 3.831116200401426e-06, + "loss": 0.2821, + "step": 38330 + }, + { + "epoch": 0.7116362137656271, + "grad_norm": 0.5533283948898315, + "learning_rate": 3.830198155138248e-06, + "loss": 0.3768, + "step": 38332 + }, + { + "epoch": 0.7116733439030458, + "grad_norm": 0.7363576889038086, + "learning_rate": 3.829280193827589e-06, + "loss": 0.2616, + "step": 38334 + }, + { + "epoch": 0.7117104740404644, + "grad_norm": 0.4258559048175812, + "learning_rate": 3.828362316481944e-06, + "loss": 0.2619, + "step": 38336 + }, + { + "epoch": 0.7117476041778831, + "grad_norm": 0.4296591281890869, + "learning_rate": 3.827444523113805e-06, + "loss": 0.2965, + "step": 38338 + }, + { + "epoch": 0.7117847343153016, + "grad_norm": 0.40978506207466125, + "learning_rate": 3.8265268137356535e-06, + "loss": 0.2455, + "step": 38340 + }, + { + "epoch": 0.7118218644527203, + "grad_norm": 0.5486048460006714, + "learning_rate": 3.82560918835998e-06, + "loss": 0.7542, + "step": 38342 + }, + { + "epoch": 0.711858994590139, + "grad_norm": 0.3994015157222748, + "learning_rate": 3.824691646999271e-06, + "loss": 0.1585, + "step": 38344 + }, + { + "epoch": 0.7118961247275576, + "grad_norm": 0.3969513475894928, + "learning_rate": 3.823774189666012e-06, + "loss": 0.2761, + "step": 38346 + }, + { + "epoch": 0.7119332548649763, + "grad_norm": 0.38873445987701416, + "learning_rate": 3.822856816372686e-06, + "loss": 0.1581, + "step": 38348 + }, + { + "epoch": 0.7119703850023948, + "grad_norm": 0.31401100754737854, + "learning_rate": 3.821939527131779e-06, + "loss": 0.3822, + "step": 38350 + }, + { + "epoch": 0.7120075151398135, + "grad_norm": 0.3767162263393402, + "learning_rate": 3.821022321955768e-06, + "loss": 0.3399, + "step": 38352 + }, + { + "epoch": 0.7120446452772322, + "grad_norm": 0.37629154324531555, + "learning_rate": 3.8201052008571375e-06, + "loss": 0.2292, + "step": 38354 + }, + { + "epoch": 0.7120817754146508, + "grad_norm": 0.3102743923664093, + "learning_rate": 3.8191881638483596e-06, + "loss": 0.327, + "step": 38356 + }, + { + "epoch": 0.7121189055520695, + "grad_norm": 0.5763558149337769, + "learning_rate": 3.818271210941918e-06, + "loss": 0.2444, + "step": 38358 + }, + { + "epoch": 0.712156035689488, + "grad_norm": 0.5295024514198303, + "learning_rate": 3.817354342150289e-06, + "loss": 0.144, + "step": 38360 + }, + { + "epoch": 0.7121931658269067, + "grad_norm": 0.35246017575263977, + "learning_rate": 3.816437557485952e-06, + "loss": 0.2769, + "step": 38362 + }, + { + "epoch": 0.7122302959643254, + "grad_norm": 0.3564547598361969, + "learning_rate": 3.815520856961374e-06, + "loss": 0.2327, + "step": 38364 + }, + { + "epoch": 0.712267426101744, + "grad_norm": 0.24249692261219025, + "learning_rate": 3.8146042405890326e-06, + "loss": 0.4736, + "step": 38366 + }, + { + "epoch": 0.7123045562391627, + "grad_norm": 0.6170700788497925, + "learning_rate": 3.8136877083813993e-06, + "loss": 0.2762, + "step": 38368 + }, + { + "epoch": 0.7123416863765812, + "grad_norm": 0.3345470428466797, + "learning_rate": 3.812771260350947e-06, + "loss": 0.313, + "step": 38370 + }, + { + "epoch": 0.7123788165139999, + "grad_norm": 0.9421281218528748, + "learning_rate": 3.8118548965101486e-06, + "loss": 0.2248, + "step": 38372 + }, + { + "epoch": 0.7124159466514186, + "grad_norm": 0.285725474357605, + "learning_rate": 3.8109386168714657e-06, + "loss": 0.1987, + "step": 38374 + }, + { + "epoch": 0.7124530767888372, + "grad_norm": 0.4872460961341858, + "learning_rate": 3.8100224214473734e-06, + "loss": 0.255, + "step": 38376 + }, + { + "epoch": 0.7124902069262559, + "grad_norm": 0.4311636686325073, + "learning_rate": 3.809106310250331e-06, + "loss": 0.3171, + "step": 38378 + }, + { + "epoch": 0.7125273370636744, + "grad_norm": 0.5085113644599915, + "learning_rate": 3.8081902832928085e-06, + "loss": 0.2775, + "step": 38380 + }, + { + "epoch": 0.7125644672010931, + "grad_norm": 0.5581339597702026, + "learning_rate": 3.807274340587269e-06, + "loss": 0.285, + "step": 38382 + }, + { + "epoch": 0.7126015973385118, + "grad_norm": 0.3686096668243408, + "learning_rate": 3.8063584821461763e-06, + "loss": 0.4368, + "step": 38384 + }, + { + "epoch": 0.7126387274759304, + "grad_norm": 0.3417530655860901, + "learning_rate": 3.805442707981992e-06, + "loss": 0.4524, + "step": 38386 + }, + { + "epoch": 0.7126758576133491, + "grad_norm": 0.45178133249282837, + "learning_rate": 3.8045270181071824e-06, + "loss": 0.2018, + "step": 38388 + }, + { + "epoch": 0.7127129877507676, + "grad_norm": 0.43412157893180847, + "learning_rate": 3.8036114125341985e-06, + "loss": 0.3873, + "step": 38390 + }, + { + "epoch": 0.7127501178881863, + "grad_norm": 0.3364481031894684, + "learning_rate": 3.802695891275503e-06, + "loss": 0.6802, + "step": 38392 + }, + { + "epoch": 0.7127872480256049, + "grad_norm": 0.4735856056213379, + "learning_rate": 3.8017804543435568e-06, + "loss": 0.2713, + "step": 38394 + }, + { + "epoch": 0.7128243781630236, + "grad_norm": 0.901459813117981, + "learning_rate": 3.800865101750809e-06, + "loss": 0.2876, + "step": 38396 + }, + { + "epoch": 0.7128615083004423, + "grad_norm": 0.39798903465270996, + "learning_rate": 3.7999498335097187e-06, + "loss": 0.5497, + "step": 38398 + }, + { + "epoch": 0.7128986384378608, + "grad_norm": 0.4067172110080719, + "learning_rate": 3.7990346496327435e-06, + "loss": 0.3363, + "step": 38400 + }, + { + "epoch": 0.7129357685752795, + "grad_norm": 0.40109017491340637, + "learning_rate": 3.79811955013233e-06, + "loss": 0.2214, + "step": 38402 + }, + { + "epoch": 0.7129728987126981, + "grad_norm": 0.29809775948524475, + "learning_rate": 3.797204535020931e-06, + "loss": 0.2152, + "step": 38404 + }, + { + "epoch": 0.7130100288501168, + "grad_norm": 0.3489615023136139, + "learning_rate": 3.796289604311e-06, + "loss": 0.2355, + "step": 38406 + }, + { + "epoch": 0.7130471589875355, + "grad_norm": 0.8688030242919922, + "learning_rate": 3.7953747580149847e-06, + "loss": 0.2164, + "step": 38408 + }, + { + "epoch": 0.713084289124954, + "grad_norm": 0.63758784532547, + "learning_rate": 3.7944599961453343e-06, + "loss": 0.2976, + "step": 38410 + }, + { + "epoch": 0.7131214192623727, + "grad_norm": 0.5879611372947693, + "learning_rate": 3.7935453187144944e-06, + "loss": 0.346, + "step": 38412 + }, + { + "epoch": 0.7131585493997913, + "grad_norm": 0.3939766585826874, + "learning_rate": 3.792630725734917e-06, + "loss": 0.2136, + "step": 38414 + }, + { + "epoch": 0.71319567953721, + "grad_norm": 0.5130848288536072, + "learning_rate": 3.791716217219038e-06, + "loss": 0.466, + "step": 38416 + }, + { + "epoch": 0.7132328096746287, + "grad_norm": 0.5954288244247437, + "learning_rate": 3.7908017931793095e-06, + "loss": 0.2828, + "step": 38418 + }, + { + "epoch": 0.7132699398120472, + "grad_norm": 0.247306227684021, + "learning_rate": 3.789887453628166e-06, + "loss": 0.2534, + "step": 38420 + }, + { + "epoch": 0.7133070699494659, + "grad_norm": 0.3098422884941101, + "learning_rate": 3.788973198578053e-06, + "loss": 0.2136, + "step": 38422 + }, + { + "epoch": 0.7133442000868845, + "grad_norm": 0.3368147611618042, + "learning_rate": 3.788059028041411e-06, + "loss": 0.2806, + "step": 38424 + }, + { + "epoch": 0.7133813302243032, + "grad_norm": 0.3784393072128296, + "learning_rate": 3.7871449420306815e-06, + "loss": 0.0827, + "step": 38426 + }, + { + "epoch": 0.7134184603617219, + "grad_norm": 0.2543085217475891, + "learning_rate": 3.7862309405582966e-06, + "loss": 0.1402, + "step": 38428 + }, + { + "epoch": 0.7134555904991404, + "grad_norm": 0.6137259602546692, + "learning_rate": 3.785317023636695e-06, + "loss": 0.4223, + "step": 38430 + }, + { + "epoch": 0.7134927206365591, + "grad_norm": 0.4496777355670929, + "learning_rate": 3.784403191278315e-06, + "loss": 0.1814, + "step": 38432 + }, + { + "epoch": 0.7135298507739777, + "grad_norm": 0.32674068212509155, + "learning_rate": 3.7834894434955894e-06, + "loss": 0.3442, + "step": 38434 + }, + { + "epoch": 0.7135669809113964, + "grad_norm": 0.47954410314559937, + "learning_rate": 3.7825757803009557e-06, + "loss": 0.2269, + "step": 38436 + }, + { + "epoch": 0.7136041110488149, + "grad_norm": 0.44878795742988586, + "learning_rate": 3.781662201706838e-06, + "loss": 0.1351, + "step": 38438 + }, + { + "epoch": 0.7136412411862336, + "grad_norm": 0.9860539436340332, + "learning_rate": 3.7807487077256754e-06, + "loss": 0.4323, + "step": 38440 + }, + { + "epoch": 0.7136783713236523, + "grad_norm": 0.30242955684661865, + "learning_rate": 3.7798352983698905e-06, + "loss": 0.2371, + "step": 38442 + }, + { + "epoch": 0.7137155014610709, + "grad_norm": 0.39447104930877686, + "learning_rate": 3.7789219736519154e-06, + "loss": 0.5736, + "step": 38444 + }, + { + "epoch": 0.7137526315984896, + "grad_norm": 0.23475044965744019, + "learning_rate": 3.7780087335841786e-06, + "loss": 0.2296, + "step": 38446 + }, + { + "epoch": 0.7137897617359081, + "grad_norm": 0.5398349165916443, + "learning_rate": 3.7770955781791065e-06, + "loss": 0.4278, + "step": 38448 + }, + { + "epoch": 0.7138268918733268, + "grad_norm": 0.4245615005493164, + "learning_rate": 3.7761825074491254e-06, + "loss": 0.2991, + "step": 38450 + }, + { + "epoch": 0.7138640220107455, + "grad_norm": 0.20239369571208954, + "learning_rate": 3.775269521406656e-06, + "loss": 0.1779, + "step": 38452 + }, + { + "epoch": 0.7139011521481641, + "grad_norm": 0.4447181820869446, + "learning_rate": 3.7743566200641225e-06, + "loss": 0.2804, + "step": 38454 + }, + { + "epoch": 0.7139382822855828, + "grad_norm": 0.4130091965198517, + "learning_rate": 3.773443803433947e-06, + "loss": 0.3643, + "step": 38456 + }, + { + "epoch": 0.7139754124230013, + "grad_norm": 0.30447736382484436, + "learning_rate": 3.772531071528556e-06, + "loss": 0.3327, + "step": 38458 + }, + { + "epoch": 0.71401254256042, + "grad_norm": 0.45896947383880615, + "learning_rate": 3.7716184243603583e-06, + "loss": 0.2814, + "step": 38460 + }, + { + "epoch": 0.7140496726978387, + "grad_norm": 0.33555012941360474, + "learning_rate": 3.770705861941778e-06, + "loss": 0.0649, + "step": 38462 + }, + { + "epoch": 0.7140868028352573, + "grad_norm": 0.33059367537498474, + "learning_rate": 3.7697933842852363e-06, + "loss": 0.1829, + "step": 38464 + }, + { + "epoch": 0.714123932972676, + "grad_norm": 0.5058813691139221, + "learning_rate": 3.768880991403141e-06, + "loss": 0.3687, + "step": 38466 + }, + { + "epoch": 0.7141610631100945, + "grad_norm": 0.36249691247940063, + "learning_rate": 3.767968683307911e-06, + "loss": 0.3887, + "step": 38468 + }, + { + "epoch": 0.7141981932475132, + "grad_norm": 0.35768887400627136, + "learning_rate": 3.7670564600119596e-06, + "loss": 0.2034, + "step": 38470 + }, + { + "epoch": 0.7142353233849319, + "grad_norm": 0.36447030305862427, + "learning_rate": 3.7661443215277015e-06, + "loss": 0.2411, + "step": 38472 + }, + { + "epoch": 0.7142724535223505, + "grad_norm": 0.3206687569618225, + "learning_rate": 3.7652322678675458e-06, + "loss": 0.213, + "step": 38474 + }, + { + "epoch": 0.7143095836597692, + "grad_norm": 0.36723050475120544, + "learning_rate": 3.7643202990439065e-06, + "loss": 0.2718, + "step": 38476 + }, + { + "epoch": 0.7143467137971877, + "grad_norm": 0.20427601039409637, + "learning_rate": 3.7634084150691865e-06, + "loss": 0.1438, + "step": 38478 + }, + { + "epoch": 0.7143838439346064, + "grad_norm": 0.2274230718612671, + "learning_rate": 3.762496615955802e-06, + "loss": 0.1156, + "step": 38480 + }, + { + "epoch": 0.7144209740720251, + "grad_norm": 0.23206478357315063, + "learning_rate": 3.7615849017161497e-06, + "loss": 0.3244, + "step": 38482 + }, + { + "epoch": 0.7144581042094437, + "grad_norm": 0.256886750459671, + "learning_rate": 3.7606732723626426e-06, + "loss": 0.3071, + "step": 38484 + }, + { + "epoch": 0.7144952343468624, + "grad_norm": 0.31571251153945923, + "learning_rate": 3.7597617279076815e-06, + "loss": 0.3229, + "step": 38486 + }, + { + "epoch": 0.7145323644842809, + "grad_norm": 0.24469232559204102, + "learning_rate": 3.7588502683636763e-06, + "loss": 0.1685, + "step": 38488 + }, + { + "epoch": 0.7145694946216996, + "grad_norm": 0.5443921685218811, + "learning_rate": 3.7579388937430207e-06, + "loss": 0.4606, + "step": 38490 + }, + { + "epoch": 0.7146066247591182, + "grad_norm": 0.22091738879680634, + "learning_rate": 3.75702760405812e-06, + "loss": 0.1299, + "step": 38492 + }, + { + "epoch": 0.7146437548965369, + "grad_norm": 0.5289151668548584, + "learning_rate": 3.7561163993213734e-06, + "loss": 0.5152, + "step": 38494 + }, + { + "epoch": 0.7146808850339555, + "grad_norm": 0.3619935214519501, + "learning_rate": 3.75520527954518e-06, + "loss": 0.1097, + "step": 38496 + }, + { + "epoch": 0.7147180151713741, + "grad_norm": 0.6360693573951721, + "learning_rate": 3.7542942447419374e-06, + "loss": 0.2038, + "step": 38498 + }, + { + "epoch": 0.7147551453087928, + "grad_norm": 0.3501574397087097, + "learning_rate": 3.7533832949240457e-06, + "loss": 0.1341, + "step": 38500 + }, + { + "epoch": 0.7147922754462114, + "grad_norm": 0.5388885140419006, + "learning_rate": 3.752472430103896e-06, + "loss": 0.4631, + "step": 38502 + }, + { + "epoch": 0.7148294055836301, + "grad_norm": 0.17365466058254242, + "learning_rate": 3.75156165029388e-06, + "loss": 0.2216, + "step": 38504 + }, + { + "epoch": 0.7148665357210487, + "grad_norm": 0.35155782103538513, + "learning_rate": 3.750650955506394e-06, + "loss": 0.338, + "step": 38506 + }, + { + "epoch": 0.7149036658584673, + "grad_norm": 0.38686272501945496, + "learning_rate": 3.7497403457538297e-06, + "loss": 0.2281, + "step": 38508 + }, + { + "epoch": 0.714940795995886, + "grad_norm": 0.3953717350959778, + "learning_rate": 3.748829821048576e-06, + "loss": 0.1573, + "step": 38510 + }, + { + "epoch": 0.7149779261333046, + "grad_norm": 0.35503342747688293, + "learning_rate": 3.7479193814030255e-06, + "loss": 0.1951, + "step": 38512 + }, + { + "epoch": 0.7150150562707233, + "grad_norm": 0.4534502923488617, + "learning_rate": 3.747009026829569e-06, + "loss": 0.2853, + "step": 38514 + }, + { + "epoch": 0.7150521864081419, + "grad_norm": 0.4495185613632202, + "learning_rate": 3.746098757340585e-06, + "loss": 0.28, + "step": 38516 + }, + { + "epoch": 0.7150893165455605, + "grad_norm": 0.6893193125724792, + "learning_rate": 3.745188572948465e-06, + "loss": 0.2927, + "step": 38518 + }, + { + "epoch": 0.7151264466829792, + "grad_norm": 0.30690300464630127, + "learning_rate": 3.7442784736655926e-06, + "loss": 0.2728, + "step": 38520 + }, + { + "epoch": 0.7151635768203978, + "grad_norm": 0.5460488200187683, + "learning_rate": 3.743368459504356e-06, + "loss": 0.2771, + "step": 38522 + }, + { + "epoch": 0.7152007069578165, + "grad_norm": 0.5028060674667358, + "learning_rate": 3.742458530477131e-06, + "loss": 0.2461, + "step": 38524 + }, + { + "epoch": 0.7152378370952351, + "grad_norm": 0.37725409865379333, + "learning_rate": 3.7415486865963047e-06, + "loss": 0.2657, + "step": 38526 + }, + { + "epoch": 0.7152749672326537, + "grad_norm": 0.37274429202079773, + "learning_rate": 3.7406389278742503e-06, + "loss": 0.2229, + "step": 38528 + }, + { + "epoch": 0.7153120973700724, + "grad_norm": 0.5873746275901794, + "learning_rate": 3.7397292543233523e-06, + "loss": 0.1592, + "step": 38530 + }, + { + "epoch": 0.715349227507491, + "grad_norm": 0.3676605522632599, + "learning_rate": 3.7388196659559874e-06, + "loss": 0.2246, + "step": 38532 + }, + { + "epoch": 0.7153863576449097, + "grad_norm": 0.31116873025894165, + "learning_rate": 3.7379101627845317e-06, + "loss": 0.1475, + "step": 38534 + }, + { + "epoch": 0.7154234877823283, + "grad_norm": 0.4500538110733032, + "learning_rate": 3.737000744821362e-06, + "loss": 0.2156, + "step": 38536 + }, + { + "epoch": 0.7154606179197469, + "grad_norm": 0.42384371161460876, + "learning_rate": 3.7360914120788527e-06, + "loss": 0.2789, + "step": 38538 + }, + { + "epoch": 0.7154977480571656, + "grad_norm": 0.41394034028053284, + "learning_rate": 3.7351821645693797e-06, + "loss": 0.2751, + "step": 38540 + }, + { + "epoch": 0.7155348781945842, + "grad_norm": 0.41111302375793457, + "learning_rate": 3.7342730023053085e-06, + "loss": 0.4001, + "step": 38542 + }, + { + "epoch": 0.7155720083320029, + "grad_norm": 0.4480436146259308, + "learning_rate": 3.733363925299017e-06, + "loss": 0.7307, + "step": 38544 + }, + { + "epoch": 0.7156091384694214, + "grad_norm": 0.5152963399887085, + "learning_rate": 3.7324549335628692e-06, + "loss": 0.3211, + "step": 38546 + }, + { + "epoch": 0.7156462686068401, + "grad_norm": 0.35430845618247986, + "learning_rate": 3.7315460271092363e-06, + "loss": 0.3109, + "step": 38548 + }, + { + "epoch": 0.7156833987442588, + "grad_norm": 0.3785333037376404, + "learning_rate": 3.7306372059504846e-06, + "loss": 0.2386, + "step": 38550 + }, + { + "epoch": 0.7157205288816774, + "grad_norm": 0.3565354347229004, + "learning_rate": 3.7297284700989866e-06, + "loss": 0.1629, + "step": 38552 + }, + { + "epoch": 0.715757659019096, + "grad_norm": 0.33217304944992065, + "learning_rate": 3.7288198195670976e-06, + "loss": 0.2015, + "step": 38554 + }, + { + "epoch": 0.7157947891565146, + "grad_norm": 0.34040918946266174, + "learning_rate": 3.727911254367187e-06, + "loss": 0.5029, + "step": 38556 + }, + { + "epoch": 0.7158319192939333, + "grad_norm": 0.3734451234340668, + "learning_rate": 3.727002774511618e-06, + "loss": 0.3488, + "step": 38558 + }, + { + "epoch": 0.715869049431352, + "grad_norm": 0.35075655579566956, + "learning_rate": 3.72609438001275e-06, + "loss": 0.2575, + "step": 38560 + }, + { + "epoch": 0.7159061795687706, + "grad_norm": 0.37108591198921204, + "learning_rate": 3.725186070882949e-06, + "loss": 0.1609, + "step": 38562 + }, + { + "epoch": 0.7159433097061892, + "grad_norm": 0.28926882147789, + "learning_rate": 3.7242778471345676e-06, + "loss": 0.194, + "step": 38564 + }, + { + "epoch": 0.7159804398436078, + "grad_norm": 0.30193620920181274, + "learning_rate": 3.723369708779969e-06, + "loss": 0.1306, + "step": 38566 + }, + { + "epoch": 0.7160175699810265, + "grad_norm": 0.30801576375961304, + "learning_rate": 3.722461655831505e-06, + "loss": 0.2919, + "step": 38568 + }, + { + "epoch": 0.7160547001184452, + "grad_norm": 0.3678168058395386, + "learning_rate": 3.7215536883015346e-06, + "loss": 0.3321, + "step": 38570 + }, + { + "epoch": 0.7160918302558638, + "grad_norm": 0.33992069959640503, + "learning_rate": 3.720645806202412e-06, + "loss": 0.3422, + "step": 38572 + }, + { + "epoch": 0.7161289603932824, + "grad_norm": 0.37005648016929626, + "learning_rate": 3.719738009546492e-06, + "loss": 0.2142, + "step": 38574 + }, + { + "epoch": 0.716166090530701, + "grad_norm": 0.3993622362613678, + "learning_rate": 3.7188302983461255e-06, + "loss": 0.3502, + "step": 38576 + }, + { + "epoch": 0.7162032206681197, + "grad_norm": 0.3127598762512207, + "learning_rate": 3.7179226726136674e-06, + "loss": 0.2744, + "step": 38578 + }, + { + "epoch": 0.7162403508055384, + "grad_norm": 0.505318820476532, + "learning_rate": 3.7170151323614624e-06, + "loss": 0.133, + "step": 38580 + }, + { + "epoch": 0.716277480942957, + "grad_norm": 0.33114007115364075, + "learning_rate": 3.716107677601861e-06, + "loss": 0.2207, + "step": 38582 + }, + { + "epoch": 0.7163146110803756, + "grad_norm": 0.38612401485443115, + "learning_rate": 3.715200308347211e-06, + "loss": 0.1554, + "step": 38584 + }, + { + "epoch": 0.7163517412177942, + "grad_norm": 0.42608556151390076, + "learning_rate": 3.714293024609864e-06, + "loss": 0.1669, + "step": 38586 + }, + { + "epoch": 0.7163888713552129, + "grad_norm": 0.3281004726886749, + "learning_rate": 3.713385826402157e-06, + "loss": 0.1572, + "step": 38588 + }, + { + "epoch": 0.7164260014926315, + "grad_norm": 0.5750089883804321, + "learning_rate": 3.712478713736443e-06, + "loss": 0.2794, + "step": 38590 + }, + { + "epoch": 0.7164631316300502, + "grad_norm": 0.3530252277851105, + "learning_rate": 3.7115716866250563e-06, + "loss": 0.3004, + "step": 38592 + }, + { + "epoch": 0.7165002617674688, + "grad_norm": 0.37191370129585266, + "learning_rate": 3.710664745080342e-06, + "loss": 0.2944, + "step": 38594 + }, + { + "epoch": 0.7165373919048874, + "grad_norm": 0.33158376812934875, + "learning_rate": 3.7097578891146434e-06, + "loss": 0.5051, + "step": 38596 + }, + { + "epoch": 0.7165745220423061, + "grad_norm": 0.2931126654148102, + "learning_rate": 3.7088511187402976e-06, + "loss": 0.1531, + "step": 38598 + }, + { + "epoch": 0.7166116521797247, + "grad_norm": 0.45178812742233276, + "learning_rate": 3.7079444339696433e-06, + "loss": 0.3219, + "step": 38600 + }, + { + "epoch": 0.7166487823171434, + "grad_norm": 0.21858333051204681, + "learning_rate": 3.707037834815023e-06, + "loss": 0.25, + "step": 38602 + }, + { + "epoch": 0.716685912454562, + "grad_norm": 0.36935535073280334, + "learning_rate": 3.706131321288764e-06, + "loss": 0.3288, + "step": 38604 + }, + { + "epoch": 0.7167230425919806, + "grad_norm": 0.38025790452957153, + "learning_rate": 3.705224893403205e-06, + "loss": 0.1961, + "step": 38606 + }, + { + "epoch": 0.7167601727293993, + "grad_norm": 0.3116885721683502, + "learning_rate": 3.7043185511706847e-06, + "loss": 0.2673, + "step": 38608 + }, + { + "epoch": 0.7167973028668179, + "grad_norm": 0.3381451964378357, + "learning_rate": 3.703412294603528e-06, + "loss": 0.2672, + "step": 38610 + }, + { + "epoch": 0.7168344330042365, + "grad_norm": 0.561720073223114, + "learning_rate": 3.702506123714068e-06, + "loss": 0.2193, + "step": 38612 + }, + { + "epoch": 0.7168715631416552, + "grad_norm": 0.1104595810174942, + "learning_rate": 3.7016000385146413e-06, + "loss": 0.2583, + "step": 38614 + }, + { + "epoch": 0.7169086932790738, + "grad_norm": 0.2584749758243561, + "learning_rate": 3.700694039017567e-06, + "loss": 0.2453, + "step": 38616 + }, + { + "epoch": 0.7169458234164925, + "grad_norm": 0.2639651298522949, + "learning_rate": 3.6997881252351797e-06, + "loss": 0.2326, + "step": 38618 + }, + { + "epoch": 0.7169829535539111, + "grad_norm": 0.28743451833724976, + "learning_rate": 3.6988822971798042e-06, + "loss": 0.318, + "step": 38620 + }, + { + "epoch": 0.7170200836913297, + "grad_norm": 0.32242169976234436, + "learning_rate": 3.697976554863767e-06, + "loss": 0.3733, + "step": 38622 + }, + { + "epoch": 0.7170572138287484, + "grad_norm": 0.6726976037025452, + "learning_rate": 3.697070898299392e-06, + "loss": 0.2761, + "step": 38624 + }, + { + "epoch": 0.717094343966167, + "grad_norm": 0.3981928825378418, + "learning_rate": 3.696165327499006e-06, + "loss": 0.2128, + "step": 38626 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.49595940113067627, + "learning_rate": 3.695259842474924e-06, + "loss": 0.3546, + "step": 38628 + }, + { + "epoch": 0.7171686042410043, + "grad_norm": 0.5916197896003723, + "learning_rate": 3.694354443239474e-06, + "loss": 0.2263, + "step": 38630 + }, + { + "epoch": 0.7172057343784229, + "grad_norm": 0.5026167035102844, + "learning_rate": 3.693449129804969e-06, + "loss": 0.3319, + "step": 38632 + }, + { + "epoch": 0.7172428645158416, + "grad_norm": 0.37770113348960876, + "learning_rate": 3.6925439021837316e-06, + "loss": 0.1355, + "step": 38634 + }, + { + "epoch": 0.7172799946532602, + "grad_norm": 0.32266294956207275, + "learning_rate": 3.6916387603880787e-06, + "loss": 0.2021, + "step": 38636 + }, + { + "epoch": 0.7173171247906789, + "grad_norm": 0.3962160050868988, + "learning_rate": 3.690733704430326e-06, + "loss": 0.426, + "step": 38638 + }, + { + "epoch": 0.7173542549280975, + "grad_norm": 0.4419653117656708, + "learning_rate": 3.6898287343227925e-06, + "loss": 0.2823, + "step": 38640 + }, + { + "epoch": 0.7173913850655161, + "grad_norm": 0.31579461693763733, + "learning_rate": 3.688923850077786e-06, + "loss": 0.2263, + "step": 38642 + }, + { + "epoch": 0.7174285152029347, + "grad_norm": 0.25499096512794495, + "learning_rate": 3.688019051707622e-06, + "loss": 0.3703, + "step": 38644 + }, + { + "epoch": 0.7174656453403534, + "grad_norm": 0.43640610575675964, + "learning_rate": 3.687114339224611e-06, + "loss": 0.3093, + "step": 38646 + }, + { + "epoch": 0.7175027754777721, + "grad_norm": 0.41525140404701233, + "learning_rate": 3.686209712641069e-06, + "loss": 0.1812, + "step": 38648 + }, + { + "epoch": 0.7175399056151907, + "grad_norm": 0.2539508640766144, + "learning_rate": 3.685305171969298e-06, + "loss": 0.3593, + "step": 38650 + }, + { + "epoch": 0.7175770357526093, + "grad_norm": 0.4719572365283966, + "learning_rate": 3.684400717221611e-06, + "loss": 0.182, + "step": 38652 + }, + { + "epoch": 0.7176141658900279, + "grad_norm": 0.5121989250183105, + "learning_rate": 3.6834963484103102e-06, + "loss": 0.3614, + "step": 38654 + }, + { + "epoch": 0.7176512960274466, + "grad_norm": 0.2766980230808258, + "learning_rate": 3.6825920655477034e-06, + "loss": 0.1745, + "step": 38656 + }, + { + "epoch": 0.7176884261648653, + "grad_norm": 0.5101406574249268, + "learning_rate": 3.6816878686460967e-06, + "loss": 0.3317, + "step": 38658 + }, + { + "epoch": 0.7177255563022839, + "grad_norm": 0.4496828615665436, + "learning_rate": 3.6807837577177916e-06, + "loss": 0.1731, + "step": 38660 + }, + { + "epoch": 0.7177626864397025, + "grad_norm": 0.4221780300140381, + "learning_rate": 3.6798797327750913e-06, + "loss": 0.2247, + "step": 38662 + }, + { + "epoch": 0.7177998165771211, + "grad_norm": 0.36494335532188416, + "learning_rate": 3.6789757938302973e-06, + "loss": 0.2578, + "step": 38664 + }, + { + "epoch": 0.7178369467145398, + "grad_norm": 0.34253060817718506, + "learning_rate": 3.678071940895712e-06, + "loss": 0.2417, + "step": 38666 + }, + { + "epoch": 0.7178740768519585, + "grad_norm": 0.43805208802223206, + "learning_rate": 3.6771681739836286e-06, + "loss": 0.3266, + "step": 38668 + }, + { + "epoch": 0.717911206989377, + "grad_norm": 0.6333506107330322, + "learning_rate": 3.6762644931063506e-06, + "loss": 0.3294, + "step": 38670 + }, + { + "epoch": 0.7179483371267957, + "grad_norm": 0.3238885998725891, + "learning_rate": 3.675360898276168e-06, + "loss": 0.2315, + "step": 38672 + }, + { + "epoch": 0.7179854672642143, + "grad_norm": 0.3620810806751251, + "learning_rate": 3.674457389505378e-06, + "loss": 0.2176, + "step": 38674 + }, + { + "epoch": 0.718022597401633, + "grad_norm": 0.33447256684303284, + "learning_rate": 3.673553966806276e-06, + "loss": 0.2949, + "step": 38676 + }, + { + "epoch": 0.7180597275390517, + "grad_norm": 0.49827396869659424, + "learning_rate": 3.672650630191158e-06, + "loss": 0.3553, + "step": 38678 + }, + { + "epoch": 0.7180968576764702, + "grad_norm": 0.40996673703193665, + "learning_rate": 3.671747379672309e-06, + "loss": 0.3466, + "step": 38680 + }, + { + "epoch": 0.7181339878138889, + "grad_norm": 0.35140761733055115, + "learning_rate": 3.6708442152620238e-06, + "loss": 0.4009, + "step": 38682 + }, + { + "epoch": 0.7181711179513075, + "grad_norm": 0.47585082054138184, + "learning_rate": 3.6699411369725903e-06, + "loss": 0.2795, + "step": 38684 + }, + { + "epoch": 0.7182082480887262, + "grad_norm": 0.5417736172676086, + "learning_rate": 3.669038144816296e-06, + "loss": 0.2558, + "step": 38686 + }, + { + "epoch": 0.7182453782261448, + "grad_norm": 0.19093072414398193, + "learning_rate": 3.6681352388054303e-06, + "loss": 0.1152, + "step": 38688 + }, + { + "epoch": 0.7182825083635634, + "grad_norm": 0.6106931567192078, + "learning_rate": 3.6672324189522813e-06, + "loss": 0.2991, + "step": 38690 + }, + { + "epoch": 0.7183196385009821, + "grad_norm": 0.5000886917114258, + "learning_rate": 3.6663296852691265e-06, + "loss": 0.2618, + "step": 38692 + }, + { + "epoch": 0.7183567686384007, + "grad_norm": 0.5093188285827637, + "learning_rate": 3.6654270377682566e-06, + "loss": 0.2094, + "step": 38694 + }, + { + "epoch": 0.7183938987758194, + "grad_norm": 0.3340912163257599, + "learning_rate": 3.664524476461947e-06, + "loss": 0.1836, + "step": 38696 + }, + { + "epoch": 0.718431028913238, + "grad_norm": 0.528056263923645, + "learning_rate": 3.6636220013624823e-06, + "loss": 0.4291, + "step": 38698 + }, + { + "epoch": 0.7184681590506566, + "grad_norm": 0.3353152275085449, + "learning_rate": 3.662719612482143e-06, + "loss": 0.2204, + "step": 38700 + }, + { + "epoch": 0.7185052891880753, + "grad_norm": 0.3706236481666565, + "learning_rate": 3.6618173098332067e-06, + "loss": 0.2895, + "step": 38702 + }, + { + "epoch": 0.7185424193254939, + "grad_norm": 0.44620075821876526, + "learning_rate": 3.6609150934279557e-06, + "loss": 0.2825, + "step": 38704 + }, + { + "epoch": 0.7185795494629126, + "grad_norm": 0.4237919747829437, + "learning_rate": 3.6600129632786584e-06, + "loss": 0.3391, + "step": 38706 + }, + { + "epoch": 0.7186166796003312, + "grad_norm": 0.4652755558490753, + "learning_rate": 3.659110919397595e-06, + "loss": 0.1831, + "step": 38708 + }, + { + "epoch": 0.7186538097377498, + "grad_norm": 0.38460132479667664, + "learning_rate": 3.6582089617970383e-06, + "loss": 0.2693, + "step": 38710 + }, + { + "epoch": 0.7186909398751685, + "grad_norm": 0.30823004245758057, + "learning_rate": 3.657307090489266e-06, + "loss": 0.1929, + "step": 38712 + }, + { + "epoch": 0.7187280700125871, + "grad_norm": 0.32021015882492065, + "learning_rate": 3.656405305486541e-06, + "loss": 0.4334, + "step": 38714 + }, + { + "epoch": 0.7187652001500058, + "grad_norm": 0.4665972888469696, + "learning_rate": 3.655503606801143e-06, + "loss": 0.443, + "step": 38716 + }, + { + "epoch": 0.7188023302874244, + "grad_norm": 0.44418954849243164, + "learning_rate": 3.654601994445334e-06, + "loss": 0.3701, + "step": 38718 + }, + { + "epoch": 0.718839460424843, + "grad_norm": 0.39013493061065674, + "learning_rate": 3.6537004684313836e-06, + "loss": 0.457, + "step": 38720 + }, + { + "epoch": 0.7188765905622617, + "grad_norm": 0.5318259596824646, + "learning_rate": 3.652799028771561e-06, + "loss": 0.2509, + "step": 38722 + }, + { + "epoch": 0.7189137206996803, + "grad_norm": 0.5585427284240723, + "learning_rate": 3.651897675478132e-06, + "loss": 0.2396, + "step": 38724 + }, + { + "epoch": 0.718950850837099, + "grad_norm": 0.3859190046787262, + "learning_rate": 3.6509964085633607e-06, + "loss": 0.4403, + "step": 38726 + }, + { + "epoch": 0.7189879809745175, + "grad_norm": 0.6153118014335632, + "learning_rate": 3.650095228039514e-06, + "loss": 0.304, + "step": 38728 + }, + { + "epoch": 0.7190251111119362, + "grad_norm": 0.4078846573829651, + "learning_rate": 3.6491941339188473e-06, + "loss": 0.1878, + "step": 38730 + }, + { + "epoch": 0.7190622412493549, + "grad_norm": 0.47715505957603455, + "learning_rate": 3.648293126213627e-06, + "loss": 0.2386, + "step": 38732 + }, + { + "epoch": 0.7190993713867735, + "grad_norm": 0.24284495413303375, + "learning_rate": 3.6473922049361134e-06, + "loss": 0.2406, + "step": 38734 + }, + { + "epoch": 0.7191365015241922, + "grad_norm": 0.4004441797733307, + "learning_rate": 3.6464913700985616e-06, + "loss": 0.4131, + "step": 38736 + }, + { + "epoch": 0.7191736316616107, + "grad_norm": 0.35591989755630493, + "learning_rate": 3.6455906217132297e-06, + "loss": 0.4608, + "step": 38738 + }, + { + "epoch": 0.7192107617990294, + "grad_norm": 0.47812169790267944, + "learning_rate": 3.64468995979238e-06, + "loss": 0.1441, + "step": 38740 + }, + { + "epoch": 0.719247891936448, + "grad_norm": 0.3468095064163208, + "learning_rate": 3.6437893843482608e-06, + "loss": 0.2238, + "step": 38742 + }, + { + "epoch": 0.7192850220738667, + "grad_norm": 0.4522748291492462, + "learning_rate": 3.6428888953931287e-06, + "loss": 0.2369, + "step": 38744 + }, + { + "epoch": 0.7193221522112854, + "grad_norm": 0.5012431740760803, + "learning_rate": 3.641988492939237e-06, + "loss": 0.2444, + "step": 38746 + }, + { + "epoch": 0.7193592823487039, + "grad_norm": 0.39005109667778015, + "learning_rate": 3.641088176998837e-06, + "loss": 0.1278, + "step": 38748 + }, + { + "epoch": 0.7193964124861226, + "grad_norm": 0.4419976472854614, + "learning_rate": 3.6401879475841807e-06, + "loss": 0.2312, + "step": 38750 + }, + { + "epoch": 0.7194335426235412, + "grad_norm": 0.7120946645736694, + "learning_rate": 3.6392878047075155e-06, + "loss": 0.2971, + "step": 38752 + }, + { + "epoch": 0.7194706727609599, + "grad_norm": 0.48163482546806335, + "learning_rate": 3.6383877483810947e-06, + "loss": 0.1335, + "step": 38754 + }, + { + "epoch": 0.7195078028983786, + "grad_norm": 0.3545670211315155, + "learning_rate": 3.6374877786171616e-06, + "loss": 0.2138, + "step": 38756 + }, + { + "epoch": 0.7195449330357971, + "grad_norm": 0.4362489879131317, + "learning_rate": 3.6365878954279578e-06, + "loss": 0.299, + "step": 38758 + }, + { + "epoch": 0.7195820631732158, + "grad_norm": 0.5791985392570496, + "learning_rate": 3.6356880988257326e-06, + "loss": 0.3673, + "step": 38760 + }, + { + "epoch": 0.7196191933106344, + "grad_norm": 0.39850103855133057, + "learning_rate": 3.6347883888227287e-06, + "loss": 0.2784, + "step": 38762 + }, + { + "epoch": 0.7196563234480531, + "grad_norm": 0.5402176380157471, + "learning_rate": 3.6338887654311896e-06, + "loss": 0.1832, + "step": 38764 + }, + { + "epoch": 0.7196934535854718, + "grad_norm": 0.4282661974430084, + "learning_rate": 3.632989228663358e-06, + "loss": 0.3637, + "step": 38766 + }, + { + "epoch": 0.7197305837228903, + "grad_norm": 0.42857134342193604, + "learning_rate": 3.632089778531469e-06, + "loss": 0.3296, + "step": 38768 + }, + { + "epoch": 0.719767713860309, + "grad_norm": 0.29255399107933044, + "learning_rate": 3.631190415047764e-06, + "loss": 0.2624, + "step": 38770 + }, + { + "epoch": 0.7198048439977276, + "grad_norm": 0.43775710463523865, + "learning_rate": 3.63029113822448e-06, + "loss": 0.2476, + "step": 38772 + }, + { + "epoch": 0.7198419741351463, + "grad_norm": 0.33160141110420227, + "learning_rate": 3.629391948073855e-06, + "loss": 0.2211, + "step": 38774 + }, + { + "epoch": 0.719879104272565, + "grad_norm": 0.28628262877464294, + "learning_rate": 3.628492844608126e-06, + "loss": 0.2424, + "step": 38776 + }, + { + "epoch": 0.7199162344099835, + "grad_norm": 0.34296634793281555, + "learning_rate": 3.6275938278395252e-06, + "loss": 0.2618, + "step": 38778 + }, + { + "epoch": 0.7199533645474022, + "grad_norm": 0.4045264720916748, + "learning_rate": 3.6266948977802808e-06, + "loss": 0.2621, + "step": 38780 + }, + { + "epoch": 0.7199904946848208, + "grad_norm": 0.3822517991065979, + "learning_rate": 3.6257960544426297e-06, + "loss": 0.2516, + "step": 38782 + }, + { + "epoch": 0.7200276248222395, + "grad_norm": 0.3708129823207855, + "learning_rate": 3.6248972978388e-06, + "loss": 0.3812, + "step": 38784 + }, + { + "epoch": 0.7200647549596582, + "grad_norm": 0.3437800109386444, + "learning_rate": 3.6239986279810223e-06, + "loss": 0.3967, + "step": 38786 + }, + { + "epoch": 0.7201018850970767, + "grad_norm": 0.47998106479644775, + "learning_rate": 3.6231000448815256e-06, + "loss": 0.0318, + "step": 38788 + }, + { + "epoch": 0.7201390152344954, + "grad_norm": 0.3666190803050995, + "learning_rate": 3.6222015485525365e-06, + "loss": 0.3328, + "step": 38790 + }, + { + "epoch": 0.720176145371914, + "grad_norm": 0.43083828687667847, + "learning_rate": 3.6213031390062846e-06, + "loss": 0.204, + "step": 38792 + }, + { + "epoch": 0.7202132755093327, + "grad_norm": 0.29224878549575806, + "learning_rate": 3.6204048162549855e-06, + "loss": 0.2584, + "step": 38794 + }, + { + "epoch": 0.7202504056467512, + "grad_norm": 0.6617841124534607, + "learning_rate": 3.6195065803108687e-06, + "loss": 0.3285, + "step": 38796 + }, + { + "epoch": 0.7202875357841699, + "grad_norm": 0.32580098509788513, + "learning_rate": 3.618608431186159e-06, + "loss": 0.1982, + "step": 38798 + }, + { + "epoch": 0.7203246659215886, + "grad_norm": 0.4350251853466034, + "learning_rate": 3.6177103688930705e-06, + "loss": 0.2385, + "step": 38800 + }, + { + "epoch": 0.7203617960590072, + "grad_norm": 0.34577810764312744, + "learning_rate": 3.6168123934438273e-06, + "loss": 0.2533, + "step": 38802 + }, + { + "epoch": 0.7203989261964259, + "grad_norm": 0.4980188012123108, + "learning_rate": 3.6159145048506507e-06, + "loss": 0.2436, + "step": 38804 + }, + { + "epoch": 0.7204360563338444, + "grad_norm": 0.6040366888046265, + "learning_rate": 3.615016703125751e-06, + "loss": 0.2507, + "step": 38806 + }, + { + "epoch": 0.7204731864712631, + "grad_norm": 0.40945670008659363, + "learning_rate": 3.614118988281349e-06, + "loss": 0.2976, + "step": 38808 + }, + { + "epoch": 0.7205103166086818, + "grad_norm": 0.23386062681674957, + "learning_rate": 3.6132213603296597e-06, + "loss": 0.1554, + "step": 38810 + }, + { + "epoch": 0.7205474467461004, + "grad_norm": 0.357394814491272, + "learning_rate": 3.612323819282897e-06, + "loss": 0.2703, + "step": 38812 + }, + { + "epoch": 0.7205845768835191, + "grad_norm": 0.34070026874542236, + "learning_rate": 3.6114263651532745e-06, + "loss": 0.219, + "step": 38814 + }, + { + "epoch": 0.7206217070209376, + "grad_norm": 0.900668203830719, + "learning_rate": 3.6105289979530057e-06, + "loss": 0.4366, + "step": 38816 + }, + { + "epoch": 0.7206588371583563, + "grad_norm": 0.490684449672699, + "learning_rate": 3.609631717694295e-06, + "loss": 0.185, + "step": 38818 + }, + { + "epoch": 0.720695967295775, + "grad_norm": 0.47929099202156067, + "learning_rate": 3.60873452438936e-06, + "loss": 0.3218, + "step": 38820 + }, + { + "epoch": 0.7207330974331936, + "grad_norm": 0.3474893569946289, + "learning_rate": 3.6078374180503993e-06, + "loss": 0.1781, + "step": 38822 + }, + { + "epoch": 0.7207702275706123, + "grad_norm": 0.4055374264717102, + "learning_rate": 3.606940398689626e-06, + "loss": 0.2203, + "step": 38824 + }, + { + "epoch": 0.7208073577080308, + "grad_norm": 0.08936754614114761, + "learning_rate": 3.6060434663192435e-06, + "loss": 0.2689, + "step": 38826 + }, + { + "epoch": 0.7208444878454495, + "grad_norm": 0.3887677490711212, + "learning_rate": 3.6051466209514574e-06, + "loss": 0.3051, + "step": 38828 + }, + { + "epoch": 0.7208816179828682, + "grad_norm": 0.36987999081611633, + "learning_rate": 3.6042498625984745e-06, + "loss": 0.2451, + "step": 38830 + }, + { + "epoch": 0.7209187481202868, + "grad_norm": 0.3292480707168579, + "learning_rate": 3.6033531912724895e-06, + "loss": 0.1286, + "step": 38832 + }, + { + "epoch": 0.7209558782577055, + "grad_norm": 0.31218263506889343, + "learning_rate": 3.602456606985708e-06, + "loss": 0.1441, + "step": 38834 + }, + { + "epoch": 0.720993008395124, + "grad_norm": 0.35919642448425293, + "learning_rate": 3.6015601097503293e-06, + "loss": 0.3411, + "step": 38836 + }, + { + "epoch": 0.7210301385325427, + "grad_norm": 0.35082876682281494, + "learning_rate": 3.6006636995785517e-06, + "loss": 0.1913, + "step": 38838 + }, + { + "epoch": 0.7210672686699613, + "grad_norm": 0.26235806941986084, + "learning_rate": 3.599767376482576e-06, + "loss": 0.2155, + "step": 38840 + }, + { + "epoch": 0.72110439880738, + "grad_norm": 0.322581022977829, + "learning_rate": 3.598871140474596e-06, + "loss": 0.2072, + "step": 38842 + }, + { + "epoch": 0.7211415289447987, + "grad_norm": 0.45166969299316406, + "learning_rate": 3.597974991566803e-06, + "loss": 0.6084, + "step": 38844 + }, + { + "epoch": 0.7211786590822172, + "grad_norm": 0.3500579297542572, + "learning_rate": 3.5970789297713924e-06, + "loss": 0.2474, + "step": 38846 + }, + { + "epoch": 0.7212157892196359, + "grad_norm": 0.5835371017456055, + "learning_rate": 3.59618295510056e-06, + "loss": 0.2226, + "step": 38848 + }, + { + "epoch": 0.7212529193570545, + "grad_norm": 0.5012018084526062, + "learning_rate": 3.5952870675664964e-06, + "loss": 0.3825, + "step": 38850 + }, + { + "epoch": 0.7212900494944732, + "grad_norm": 0.403816819190979, + "learning_rate": 3.594391267181391e-06, + "loss": 0.3176, + "step": 38852 + }, + { + "epoch": 0.7213271796318919, + "grad_norm": 0.2957979142665863, + "learning_rate": 3.5934955539574355e-06, + "loss": 0.1617, + "step": 38854 + }, + { + "epoch": 0.7213643097693104, + "grad_norm": 0.1890823394060135, + "learning_rate": 3.592599927906815e-06, + "loss": 0.195, + "step": 38856 + }, + { + "epoch": 0.7214014399067291, + "grad_norm": 0.6210729479789734, + "learning_rate": 3.5917043890417147e-06, + "loss": 0.2169, + "step": 38858 + }, + { + "epoch": 0.7214385700441477, + "grad_norm": 0.2653856873512268, + "learning_rate": 3.590808937374324e-06, + "loss": 0.0939, + "step": 38860 + }, + { + "epoch": 0.7214757001815664, + "grad_norm": 0.4769550561904907, + "learning_rate": 3.5899135729168287e-06, + "loss": 0.3384, + "step": 38862 + }, + { + "epoch": 0.721512830318985, + "grad_norm": 0.4075268805027008, + "learning_rate": 3.5890182956814057e-06, + "loss": 0.2335, + "step": 38864 + }, + { + "epoch": 0.7215499604564036, + "grad_norm": 0.4530476927757263, + "learning_rate": 3.5881231056802414e-06, + "loss": 0.2859, + "step": 38866 + }, + { + "epoch": 0.7215870905938223, + "grad_norm": 0.4054543077945709, + "learning_rate": 3.587228002925518e-06, + "loss": 0.2894, + "step": 38868 + }, + { + "epoch": 0.7216242207312409, + "grad_norm": 0.4946069121360779, + "learning_rate": 3.5863329874294108e-06, + "loss": 0.3096, + "step": 38870 + }, + { + "epoch": 0.7216613508686596, + "grad_norm": 0.5204428434371948, + "learning_rate": 3.5854380592040995e-06, + "loss": 0.1586, + "step": 38872 + }, + { + "epoch": 0.7216984810060783, + "grad_norm": 0.6086781024932861, + "learning_rate": 3.5845432182617633e-06, + "loss": 0.3147, + "step": 38874 + }, + { + "epoch": 0.7217356111434968, + "grad_norm": 0.20208783447742462, + "learning_rate": 3.5836484646145765e-06, + "loss": 0.2429, + "step": 38876 + }, + { + "epoch": 0.7217727412809155, + "grad_norm": 0.37577173113822937, + "learning_rate": 3.582753798274715e-06, + "loss": 0.2683, + "step": 38878 + }, + { + "epoch": 0.7218098714183341, + "grad_norm": 0.25602689385414124, + "learning_rate": 3.5818592192543567e-06, + "loss": 0.1358, + "step": 38880 + }, + { + "epoch": 0.7218470015557528, + "grad_norm": 0.36536502838134766, + "learning_rate": 3.5809647275656657e-06, + "loss": 0.4456, + "step": 38882 + }, + { + "epoch": 0.7218841316931714, + "grad_norm": 0.584618091583252, + "learning_rate": 3.5800703232208213e-06, + "loss": 0.274, + "step": 38884 + }, + { + "epoch": 0.72192126183059, + "grad_norm": 0.5205422043800354, + "learning_rate": 3.579176006231987e-06, + "loss": 0.3038, + "step": 38886 + }, + { + "epoch": 0.7219583919680087, + "grad_norm": 0.3048643469810486, + "learning_rate": 3.578281776611333e-06, + "loss": 0.1461, + "step": 38888 + }, + { + "epoch": 0.7219955221054273, + "grad_norm": 0.32386571168899536, + "learning_rate": 3.57738763437103e-06, + "loss": 0.3194, + "step": 38890 + }, + { + "epoch": 0.722032652242846, + "grad_norm": 0.2678619623184204, + "learning_rate": 3.576493579523247e-06, + "loss": 0.4651, + "step": 38892 + }, + { + "epoch": 0.7220697823802645, + "grad_norm": 0.49344614148139954, + "learning_rate": 3.5755996120801416e-06, + "loss": 0.3791, + "step": 38894 + }, + { + "epoch": 0.7221069125176832, + "grad_norm": 0.3099924921989441, + "learning_rate": 3.5747057320538826e-06, + "loss": 0.342, + "step": 38896 + }, + { + "epoch": 0.7221440426551019, + "grad_norm": 0.474805623292923, + "learning_rate": 3.5738119394566327e-06, + "loss": 0.2895, + "step": 38898 + }, + { + "epoch": 0.7221811727925205, + "grad_norm": 0.3559885621070862, + "learning_rate": 3.572918234300553e-06, + "loss": 0.4125, + "step": 38900 + }, + { + "epoch": 0.7222183029299392, + "grad_norm": 0.29377293586730957, + "learning_rate": 3.572024616597809e-06, + "loss": 0.2539, + "step": 38902 + }, + { + "epoch": 0.7222554330673577, + "grad_norm": 0.5472588539123535, + "learning_rate": 3.5711310863605517e-06, + "loss": 0.203, + "step": 38904 + }, + { + "epoch": 0.7222925632047764, + "grad_norm": 0.40619930624961853, + "learning_rate": 3.570237643600948e-06, + "loss": 0.347, + "step": 38906 + }, + { + "epoch": 0.7223296933421951, + "grad_norm": 0.22255311906337738, + "learning_rate": 3.569344288331146e-06, + "loss": 0.204, + "step": 38908 + }, + { + "epoch": 0.7223668234796137, + "grad_norm": 0.47249698638916016, + "learning_rate": 3.568451020563307e-06, + "loss": 0.1732, + "step": 38910 + }, + { + "epoch": 0.7224039536170324, + "grad_norm": 0.26445773243904114, + "learning_rate": 3.5675578403095845e-06, + "loss": 0.2383, + "step": 38912 + }, + { + "epoch": 0.7224410837544509, + "grad_norm": 0.4643542468547821, + "learning_rate": 3.5666647475821325e-06, + "loss": 0.2817, + "step": 38914 + }, + { + "epoch": 0.7224782138918696, + "grad_norm": 0.23916217684745789, + "learning_rate": 3.5657717423931038e-06, + "loss": 0.3744, + "step": 38916 + }, + { + "epoch": 0.7225153440292883, + "grad_norm": 0.3727911710739136, + "learning_rate": 3.564878824754652e-06, + "loss": 0.5394, + "step": 38918 + }, + { + "epoch": 0.7225524741667069, + "grad_norm": 0.2643372714519501, + "learning_rate": 3.56398599467892e-06, + "loss": 0.2251, + "step": 38920 + }, + { + "epoch": 0.7225896043041256, + "grad_norm": 0.3309539258480072, + "learning_rate": 3.5630932521780615e-06, + "loss": 0.2046, + "step": 38922 + }, + { + "epoch": 0.7226267344415441, + "grad_norm": 0.2792668342590332, + "learning_rate": 3.562200597264226e-06, + "loss": 0.2128, + "step": 38924 + }, + { + "epoch": 0.7226638645789628, + "grad_norm": 0.29750505089759827, + "learning_rate": 3.5613080299495526e-06, + "loss": 0.1791, + "step": 38926 + }, + { + "epoch": 0.7227009947163815, + "grad_norm": 0.2564142346382141, + "learning_rate": 3.5604155502461924e-06, + "loss": 0.0639, + "step": 38928 + }, + { + "epoch": 0.7227381248538001, + "grad_norm": 0.3811742663383484, + "learning_rate": 3.5595231581662903e-06, + "loss": 0.2042, + "step": 38930 + }, + { + "epoch": 0.7227752549912188, + "grad_norm": 0.2708301842212677, + "learning_rate": 3.558630853721984e-06, + "loss": 0.2386, + "step": 38932 + }, + { + "epoch": 0.7228123851286373, + "grad_norm": 0.5155462026596069, + "learning_rate": 3.5577386369254163e-06, + "loss": 0.2811, + "step": 38934 + }, + { + "epoch": 0.722849515266056, + "grad_norm": 0.5553277730941772, + "learning_rate": 3.5568465077887294e-06, + "loss": 0.2966, + "step": 38936 + }, + { + "epoch": 0.7228866454034747, + "grad_norm": 0.3409940004348755, + "learning_rate": 3.555954466324062e-06, + "loss": 0.25, + "step": 38938 + }, + { + "epoch": 0.7229237755408933, + "grad_norm": 0.4441196322441101, + "learning_rate": 3.5550625125435533e-06, + "loss": 0.3048, + "step": 38940 + }, + { + "epoch": 0.722960905678312, + "grad_norm": 0.29282763600349426, + "learning_rate": 3.5541706464593407e-06, + "loss": 0.2121, + "step": 38942 + }, + { + "epoch": 0.7229980358157305, + "grad_norm": 0.649804413318634, + "learning_rate": 3.5532788680835563e-06, + "loss": 0.2436, + "step": 38944 + }, + { + "epoch": 0.7230351659531492, + "grad_norm": 0.2992003262042999, + "learning_rate": 3.552387177428335e-06, + "loss": 0.2873, + "step": 38946 + }, + { + "epoch": 0.7230722960905678, + "grad_norm": 0.4115583598613739, + "learning_rate": 3.5514955745058155e-06, + "loss": 0.2796, + "step": 38948 + }, + { + "epoch": 0.7231094262279865, + "grad_norm": 0.4025840163230896, + "learning_rate": 3.5506040593281223e-06, + "loss": 0.2979, + "step": 38950 + }, + { + "epoch": 0.7231465563654051, + "grad_norm": 0.5188094973564148, + "learning_rate": 3.5497126319073894e-06, + "loss": 0.3105, + "step": 38952 + }, + { + "epoch": 0.7231836865028237, + "grad_norm": 0.4124195873737335, + "learning_rate": 3.5488212922557463e-06, + "loss": 0.3394, + "step": 38954 + }, + { + "epoch": 0.7232208166402424, + "grad_norm": 0.4021106958389282, + "learning_rate": 3.547930040385327e-06, + "loss": 0.3372, + "step": 38956 + }, + { + "epoch": 0.723257946777661, + "grad_norm": 0.4518987834453583, + "learning_rate": 3.5470388763082485e-06, + "loss": 0.2095, + "step": 38958 + }, + { + "epoch": 0.7232950769150797, + "grad_norm": 0.30070164799690247, + "learning_rate": 3.546147800036642e-06, + "loss": 0.2437, + "step": 38960 + }, + { + "epoch": 0.7233322070524983, + "grad_norm": 0.43122047185897827, + "learning_rate": 3.5452568115826336e-06, + "loss": 0.3089, + "step": 38962 + }, + { + "epoch": 0.7233693371899169, + "grad_norm": 1.0666844844818115, + "learning_rate": 3.544365910958345e-06, + "loss": 0.3335, + "step": 38964 + }, + { + "epoch": 0.7234064673273356, + "grad_norm": 0.4523346722126007, + "learning_rate": 3.5434750981759035e-06, + "loss": 0.2773, + "step": 38966 + }, + { + "epoch": 0.7234435974647542, + "grad_norm": 0.4578306972980499, + "learning_rate": 3.542584373247423e-06, + "loss": 0.3336, + "step": 38968 + }, + { + "epoch": 0.7234807276021729, + "grad_norm": 0.37370413541793823, + "learning_rate": 3.54169373618503e-06, + "loss": 0.2858, + "step": 38970 + }, + { + "epoch": 0.7235178577395915, + "grad_norm": 0.35634171962738037, + "learning_rate": 3.5408031870008363e-06, + "loss": 0.3599, + "step": 38972 + }, + { + "epoch": 0.7235549878770101, + "grad_norm": 0.19497917592525482, + "learning_rate": 3.5399127257069643e-06, + "loss": 0.4497, + "step": 38974 + }, + { + "epoch": 0.7235921180144288, + "grad_norm": 0.5412720441818237, + "learning_rate": 3.5390223523155296e-06, + "loss": 0.1097, + "step": 38976 + }, + { + "epoch": 0.7236292481518474, + "grad_norm": 0.3420836329460144, + "learning_rate": 3.5381320668386487e-06, + "loss": 0.3406, + "step": 38978 + }, + { + "epoch": 0.723666378289266, + "grad_norm": 0.4148283898830414, + "learning_rate": 3.537241869288438e-06, + "loss": 0.2928, + "step": 38980 + }, + { + "epoch": 0.7237035084266847, + "grad_norm": 0.3723365366458893, + "learning_rate": 3.5363517596770035e-06, + "loss": 0.1594, + "step": 38982 + }, + { + "epoch": 0.7237406385641033, + "grad_norm": 0.3955720365047455, + "learning_rate": 3.5354617380164614e-06, + "loss": 0.1664, + "step": 38984 + }, + { + "epoch": 0.723777768701522, + "grad_norm": 0.37481918931007385, + "learning_rate": 3.5345718043189204e-06, + "loss": 0.3109, + "step": 38986 + }, + { + "epoch": 0.7238148988389406, + "grad_norm": 0.32017987966537476, + "learning_rate": 3.5336819585964943e-06, + "loss": 0.1629, + "step": 38988 + }, + { + "epoch": 0.7238520289763593, + "grad_norm": 0.1633184552192688, + "learning_rate": 3.532792200861286e-06, + "loss": 0.2089, + "step": 38990 + }, + { + "epoch": 0.7238891591137778, + "grad_norm": 0.346179336309433, + "learning_rate": 3.531902531125403e-06, + "loss": 0.4263, + "step": 38992 + }, + { + "epoch": 0.7239262892511965, + "grad_norm": 0.42145711183547974, + "learning_rate": 3.5310129494009558e-06, + "loss": 0.224, + "step": 38994 + }, + { + "epoch": 0.7239634193886152, + "grad_norm": 0.3653205633163452, + "learning_rate": 3.530123455700043e-06, + "loss": 0.4381, + "step": 38996 + }, + { + "epoch": 0.7240005495260338, + "grad_norm": 0.42879360914230347, + "learning_rate": 3.5292340500347687e-06, + "loss": 0.4225, + "step": 38998 + }, + { + "epoch": 0.7240376796634524, + "grad_norm": 0.472304105758667, + "learning_rate": 3.528344732417238e-06, + "loss": 0.3585, + "step": 39000 + }, + { + "epoch": 0.724074809800871, + "grad_norm": 0.2679572105407715, + "learning_rate": 3.5274555028595515e-06, + "loss": 0.2104, + "step": 39002 + }, + { + "epoch": 0.7241119399382897, + "grad_norm": 0.5813097357749939, + "learning_rate": 3.5265663613738067e-06, + "loss": 0.1899, + "step": 39004 + }, + { + "epoch": 0.7241490700757084, + "grad_norm": 0.3221306800842285, + "learning_rate": 3.5256773079721074e-06, + "loss": 0.3697, + "step": 39006 + }, + { + "epoch": 0.724186200213127, + "grad_norm": 0.6098265051841736, + "learning_rate": 3.524788342666544e-06, + "loss": 0.236, + "step": 39008 + }, + { + "epoch": 0.7242233303505456, + "grad_norm": 0.3353172838687897, + "learning_rate": 3.523899465469218e-06, + "loss": 0.2855, + "step": 39010 + }, + { + "epoch": 0.7242604604879642, + "grad_norm": 0.3790881037712097, + "learning_rate": 3.52301067639222e-06, + "loss": 0.2982, + "step": 39012 + }, + { + "epoch": 0.7242975906253829, + "grad_norm": 0.3624052405357361, + "learning_rate": 3.5221219754476454e-06, + "loss": 0.3355, + "step": 39014 + }, + { + "epoch": 0.7243347207628016, + "grad_norm": 0.319830983877182, + "learning_rate": 3.521233362647587e-06, + "loss": 0.2454, + "step": 39016 + }, + { + "epoch": 0.7243718509002202, + "grad_norm": 0.3455754220485687, + "learning_rate": 3.5203448380041405e-06, + "loss": 0.3669, + "step": 39018 + }, + { + "epoch": 0.7244089810376388, + "grad_norm": 0.2667527198791504, + "learning_rate": 3.5194564015293876e-06, + "loss": 0.1728, + "step": 39020 + }, + { + "epoch": 0.7244461111750574, + "grad_norm": 0.3749663531780243, + "learning_rate": 3.5185680532354228e-06, + "loss": 0.1594, + "step": 39022 + }, + { + "epoch": 0.7244832413124761, + "grad_norm": 0.3437434732913971, + "learning_rate": 3.517679793134332e-06, + "loss": 0.125, + "step": 39024 + }, + { + "epoch": 0.7245203714498948, + "grad_norm": 0.3875744342803955, + "learning_rate": 3.5167916212382013e-06, + "loss": 0.3497, + "step": 39026 + }, + { + "epoch": 0.7245575015873134, + "grad_norm": 0.2606813311576843, + "learning_rate": 3.515903537559119e-06, + "loss": 0.2169, + "step": 39028 + }, + { + "epoch": 0.724594631724732, + "grad_norm": 0.33072134852409363, + "learning_rate": 3.5150155421091703e-06, + "loss": 0.3459, + "step": 39030 + }, + { + "epoch": 0.7246317618621506, + "grad_norm": 0.3636607229709625, + "learning_rate": 3.514127634900435e-06, + "loss": 0.2582, + "step": 39032 + }, + { + "epoch": 0.7246688919995693, + "grad_norm": 0.37538769841194153, + "learning_rate": 3.513239815944991e-06, + "loss": 0.3284, + "step": 39034 + }, + { + "epoch": 0.724706022136988, + "grad_norm": 0.3180486857891083, + "learning_rate": 3.512352085254924e-06, + "loss": 0.4365, + "step": 39036 + }, + { + "epoch": 0.7247431522744066, + "grad_norm": 0.4815988540649414, + "learning_rate": 3.511464442842313e-06, + "loss": 0.3549, + "step": 39038 + }, + { + "epoch": 0.7247802824118252, + "grad_norm": 0.6721015572547913, + "learning_rate": 3.510576888719234e-06, + "loss": 0.2752, + "step": 39040 + }, + { + "epoch": 0.7248174125492438, + "grad_norm": 0.3308669626712799, + "learning_rate": 3.5096894228977664e-06, + "loss": 0.1925, + "step": 39042 + }, + { + "epoch": 0.7248545426866625, + "grad_norm": 0.5335550308227539, + "learning_rate": 3.5088020453899887e-06, + "loss": 0.3357, + "step": 39044 + }, + { + "epoch": 0.7248916728240811, + "grad_norm": 0.39075931906700134, + "learning_rate": 3.5079147562079674e-06, + "loss": 0.2614, + "step": 39046 + }, + { + "epoch": 0.7249288029614998, + "grad_norm": 0.4553295373916626, + "learning_rate": 3.50702755536378e-06, + "loss": 0.1827, + "step": 39048 + }, + { + "epoch": 0.7249659330989184, + "grad_norm": 0.29857999086380005, + "learning_rate": 3.5061404428695002e-06, + "loss": 0.1295, + "step": 39050 + }, + { + "epoch": 0.725003063236337, + "grad_norm": 0.49474528431892395, + "learning_rate": 3.5052534187372002e-06, + "loss": 0.2598, + "step": 39052 + }, + { + "epoch": 0.7250401933737557, + "grad_norm": 0.3226276636123657, + "learning_rate": 3.5043664829789435e-06, + "loss": 0.2712, + "step": 39054 + }, + { + "epoch": 0.7250773235111743, + "grad_norm": 0.4511318802833557, + "learning_rate": 3.5034796356068067e-06, + "loss": 0.308, + "step": 39056 + }, + { + "epoch": 0.725114453648593, + "grad_norm": 0.3627657890319824, + "learning_rate": 3.502592876632849e-06, + "loss": 0.3243, + "step": 39058 + }, + { + "epoch": 0.7251515837860116, + "grad_norm": 0.5099171996116638, + "learning_rate": 3.5017062060691396e-06, + "loss": 0.1566, + "step": 39060 + }, + { + "epoch": 0.7251887139234302, + "grad_norm": 0.37432652711868286, + "learning_rate": 3.5008196239277447e-06, + "loss": 0.1857, + "step": 39062 + }, + { + "epoch": 0.7252258440608489, + "grad_norm": 0.4011692404747009, + "learning_rate": 3.4999331302207273e-06, + "loss": 0.3195, + "step": 39064 + }, + { + "epoch": 0.7252629741982675, + "grad_norm": 0.6910839080810547, + "learning_rate": 3.49904672496015e-06, + "loss": 0.2733, + "step": 39066 + }, + { + "epoch": 0.7253001043356861, + "grad_norm": 0.3382725417613983, + "learning_rate": 3.498160408158077e-06, + "loss": 0.2549, + "step": 39068 + }, + { + "epoch": 0.7253372344731048, + "grad_norm": 0.3551293611526489, + "learning_rate": 3.497274179826563e-06, + "loss": 0.2115, + "step": 39070 + }, + { + "epoch": 0.7253743646105234, + "grad_norm": 0.3523249626159668, + "learning_rate": 3.4963880399776696e-06, + "loss": 0.2902, + "step": 39072 + }, + { + "epoch": 0.7254114947479421, + "grad_norm": 0.4653478264808655, + "learning_rate": 3.4955019886234563e-06, + "loss": 0.1894, + "step": 39074 + }, + { + "epoch": 0.7254486248853607, + "grad_norm": 0.38542091846466064, + "learning_rate": 3.4946160257759753e-06, + "loss": 0.2841, + "step": 39076 + }, + { + "epoch": 0.7254857550227793, + "grad_norm": 0.49824249744415283, + "learning_rate": 3.4937301514472854e-06, + "loss": 0.5006, + "step": 39078 + }, + { + "epoch": 0.725522885160198, + "grad_norm": 0.2981157898902893, + "learning_rate": 3.492844365649438e-06, + "loss": 0.0844, + "step": 39080 + }, + { + "epoch": 0.7255600152976166, + "grad_norm": 0.3175773620605469, + "learning_rate": 3.491958668394492e-06, + "loss": 0.1994, + "step": 39082 + }, + { + "epoch": 0.7255971454350353, + "grad_norm": 0.3956991136074066, + "learning_rate": 3.491073059694491e-06, + "loss": 0.3547, + "step": 39084 + }, + { + "epoch": 0.7256342755724539, + "grad_norm": 0.3419788181781769, + "learning_rate": 3.490187539561489e-06, + "loss": 0.2259, + "step": 39086 + }, + { + "epoch": 0.7256714057098725, + "grad_norm": 0.4093378186225891, + "learning_rate": 3.489302108007535e-06, + "loss": 0.233, + "step": 39088 + }, + { + "epoch": 0.7257085358472912, + "grad_norm": 0.29068395495414734, + "learning_rate": 3.488416765044679e-06, + "loss": 0.4468, + "step": 39090 + }, + { + "epoch": 0.7257456659847098, + "grad_norm": 0.6384782195091248, + "learning_rate": 3.4875315106849682e-06, + "loss": 0.2849, + "step": 39092 + }, + { + "epoch": 0.7257827961221285, + "grad_norm": 0.618625283241272, + "learning_rate": 3.486646344940443e-06, + "loss": 0.3089, + "step": 39094 + }, + { + "epoch": 0.725819926259547, + "grad_norm": 0.3980785310268402, + "learning_rate": 3.4857612678231554e-06, + "loss": 0.2213, + "step": 39096 + }, + { + "epoch": 0.7258570563969657, + "grad_norm": 0.4390384256839752, + "learning_rate": 3.4848762793451417e-06, + "loss": 0.3302, + "step": 39098 + }, + { + "epoch": 0.7258941865343843, + "grad_norm": 0.3453618288040161, + "learning_rate": 3.4839913795184453e-06, + "loss": 0.2619, + "step": 39100 + }, + { + "epoch": 0.725931316671803, + "grad_norm": 0.6470021605491638, + "learning_rate": 3.4831065683551104e-06, + "loss": 0.363, + "step": 39102 + }, + { + "epoch": 0.7259684468092217, + "grad_norm": 0.332330584526062, + "learning_rate": 3.482221845867173e-06, + "loss": 0.4367, + "step": 39104 + }, + { + "epoch": 0.7260055769466403, + "grad_norm": 0.3575783371925354, + "learning_rate": 3.481337212066678e-06, + "loss": 0.393, + "step": 39106 + }, + { + "epoch": 0.7260427070840589, + "grad_norm": 0.5372342467308044, + "learning_rate": 3.4804526669656536e-06, + "loss": 0.2858, + "step": 39108 + }, + { + "epoch": 0.7260798372214775, + "grad_norm": 0.3620225191116333, + "learning_rate": 3.479568210576141e-06, + "loss": 0.2535, + "step": 39110 + }, + { + "epoch": 0.7261169673588962, + "grad_norm": 0.3068416714668274, + "learning_rate": 3.478683842910173e-06, + "loss": 0.2907, + "step": 39112 + }, + { + "epoch": 0.7261540974963149, + "grad_norm": 0.5327022671699524, + "learning_rate": 3.4777995639797844e-06, + "loss": 0.3093, + "step": 39114 + }, + { + "epoch": 0.7261912276337334, + "grad_norm": 0.34905552864074707, + "learning_rate": 3.4769153737970117e-06, + "loss": 0.3011, + "step": 39116 + }, + { + "epoch": 0.7262283577711521, + "grad_norm": 0.5817269682884216, + "learning_rate": 3.4760312723738776e-06, + "loss": 0.1598, + "step": 39118 + }, + { + "epoch": 0.7262654879085707, + "grad_norm": 0.4042501449584961, + "learning_rate": 3.47514725972242e-06, + "loss": 0.2427, + "step": 39120 + }, + { + "epoch": 0.7263026180459894, + "grad_norm": 0.4381115138530731, + "learning_rate": 3.4742633358546605e-06, + "loss": 0.2477, + "step": 39122 + }, + { + "epoch": 0.7263397481834081, + "grad_norm": 0.5624004602432251, + "learning_rate": 3.473379500782631e-06, + "loss": 0.2253, + "step": 39124 + }, + { + "epoch": 0.7263768783208266, + "grad_norm": 0.38847243785858154, + "learning_rate": 3.4724957545183556e-06, + "loss": 0.3138, + "step": 39126 + }, + { + "epoch": 0.7264140084582453, + "grad_norm": 0.3769180178642273, + "learning_rate": 3.471612097073862e-06, + "loss": 0.1553, + "step": 39128 + }, + { + "epoch": 0.7264511385956639, + "grad_norm": 0.3771804869174957, + "learning_rate": 3.4707285284611724e-06, + "loss": 0.0705, + "step": 39130 + }, + { + "epoch": 0.7264882687330826, + "grad_norm": 0.2771605849266052, + "learning_rate": 3.469845048692314e-06, + "loss": 0.2467, + "step": 39132 + }, + { + "epoch": 0.7265253988705013, + "grad_norm": 0.4463084936141968, + "learning_rate": 3.4689616577793007e-06, + "loss": 0.192, + "step": 39134 + }, + { + "epoch": 0.7265625290079198, + "grad_norm": 0.36550235748291016, + "learning_rate": 3.4680783557341567e-06, + "loss": 0.2851, + "step": 39136 + }, + { + "epoch": 0.7265996591453385, + "grad_norm": 0.7167015075683594, + "learning_rate": 3.467195142568904e-06, + "loss": 0.2407, + "step": 39138 + }, + { + "epoch": 0.7266367892827571, + "grad_norm": 0.5674812197685242, + "learning_rate": 3.4663120182955547e-06, + "loss": 0.3297, + "step": 39140 + }, + { + "epoch": 0.7266739194201758, + "grad_norm": 0.2951454222202301, + "learning_rate": 3.4654289829261265e-06, + "loss": 0.2042, + "step": 39142 + }, + { + "epoch": 0.7267110495575944, + "grad_norm": 0.29790204763412476, + "learning_rate": 3.464546036472641e-06, + "loss": 0.1843, + "step": 39144 + }, + { + "epoch": 0.726748179695013, + "grad_norm": 0.38922035694122314, + "learning_rate": 3.4636631789471053e-06, + "loss": 0.3293, + "step": 39146 + }, + { + "epoch": 0.7267853098324317, + "grad_norm": 0.5129856467247009, + "learning_rate": 3.462780410361535e-06, + "loss": 0.166, + "step": 39148 + }, + { + "epoch": 0.7268224399698503, + "grad_norm": 0.36723607778549194, + "learning_rate": 3.4618977307279412e-06, + "loss": 0.149, + "step": 39150 + }, + { + "epoch": 0.726859570107269, + "grad_norm": 0.30715444684028625, + "learning_rate": 3.461015140058336e-06, + "loss": 0.3101, + "step": 39152 + }, + { + "epoch": 0.7268967002446876, + "grad_norm": 0.3355923891067505, + "learning_rate": 3.460132638364727e-06, + "loss": 0.3299, + "step": 39154 + }, + { + "epoch": 0.7269338303821062, + "grad_norm": 0.5309499502182007, + "learning_rate": 3.4592502256591287e-06, + "loss": 0.1902, + "step": 39156 + }, + { + "epoch": 0.7269709605195249, + "grad_norm": 0.5021377205848694, + "learning_rate": 3.4583679019535386e-06, + "loss": 0.3357, + "step": 39158 + }, + { + "epoch": 0.7270080906569435, + "grad_norm": 0.3348003625869751, + "learning_rate": 3.457485667259971e-06, + "loss": 0.2555, + "step": 39160 + }, + { + "epoch": 0.7270452207943622, + "grad_norm": 0.5919007062911987, + "learning_rate": 3.4566035215904227e-06, + "loss": 0.2806, + "step": 39162 + }, + { + "epoch": 0.7270823509317808, + "grad_norm": 0.37456679344177246, + "learning_rate": 3.4557214649569005e-06, + "loss": 0.1653, + "step": 39164 + }, + { + "epoch": 0.7271194810691994, + "grad_norm": 0.30915290117263794, + "learning_rate": 3.454839497371406e-06, + "loss": 0.3886, + "step": 39166 + }, + { + "epoch": 0.7271566112066181, + "grad_norm": 0.43398627638816833, + "learning_rate": 3.4539576188459425e-06, + "loss": 0.5498, + "step": 39168 + }, + { + "epoch": 0.7271937413440367, + "grad_norm": 0.3363274931907654, + "learning_rate": 3.4530758293925103e-06, + "loss": 0.1289, + "step": 39170 + }, + { + "epoch": 0.7272308714814554, + "grad_norm": 0.40138885378837585, + "learning_rate": 3.452194129023103e-06, + "loss": 0.367, + "step": 39172 + }, + { + "epoch": 0.727268001618874, + "grad_norm": 0.2927631437778473, + "learning_rate": 3.451312517749721e-06, + "loss": 0.098, + "step": 39174 + }, + { + "epoch": 0.7273051317562926, + "grad_norm": 0.43219462037086487, + "learning_rate": 3.4504309955843595e-06, + "loss": 0.4389, + "step": 39176 + }, + { + "epoch": 0.7273422618937113, + "grad_norm": 0.4215664267539978, + "learning_rate": 3.4495495625390172e-06, + "loss": 0.3566, + "step": 39178 + }, + { + "epoch": 0.7273793920311299, + "grad_norm": 0.4232175350189209, + "learning_rate": 3.4486682186256815e-06, + "loss": 0.2824, + "step": 39180 + }, + { + "epoch": 0.7274165221685486, + "grad_norm": 0.3134605288505554, + "learning_rate": 3.4477869638563512e-06, + "loss": 0.2967, + "step": 39182 + }, + { + "epoch": 0.7274536523059671, + "grad_norm": 0.6582512259483337, + "learning_rate": 3.44690579824301e-06, + "loss": 0.1889, + "step": 39184 + }, + { + "epoch": 0.7274907824433858, + "grad_norm": 0.4590010344982147, + "learning_rate": 3.446024721797653e-06, + "loss": 0.2772, + "step": 39186 + }, + { + "epoch": 0.7275279125808045, + "grad_norm": 0.35719501972198486, + "learning_rate": 3.445143734532268e-06, + "loss": 0.2911, + "step": 39188 + }, + { + "epoch": 0.7275650427182231, + "grad_norm": 0.19828538596630096, + "learning_rate": 3.4442628364588425e-06, + "loss": 0.0846, + "step": 39190 + }, + { + "epoch": 0.7276021728556418, + "grad_norm": 0.4374946057796478, + "learning_rate": 3.4433820275893626e-06, + "loss": 0.294, + "step": 39192 + }, + { + "epoch": 0.7276393029930603, + "grad_norm": 0.505297064781189, + "learning_rate": 3.4425013079358184e-06, + "loss": 0.4135, + "step": 39194 + }, + { + "epoch": 0.727676433130479, + "grad_norm": 0.39555108547210693, + "learning_rate": 3.4416206775101846e-06, + "loss": 0.1913, + "step": 39196 + }, + { + "epoch": 0.7277135632678976, + "grad_norm": 0.3262774348258972, + "learning_rate": 3.44074013632445e-06, + "loss": 0.1972, + "step": 39198 + }, + { + "epoch": 0.7277506934053163, + "grad_norm": 0.31796813011169434, + "learning_rate": 3.4398596843905976e-06, + "loss": 0.3438, + "step": 39200 + }, + { + "epoch": 0.727787823542735, + "grad_norm": 0.37448248267173767, + "learning_rate": 3.4389793217206026e-06, + "loss": 0.4026, + "step": 39202 + }, + { + "epoch": 0.7278249536801535, + "grad_norm": 0.5521661639213562, + "learning_rate": 3.4380990483264453e-06, + "loss": 0.1558, + "step": 39204 + }, + { + "epoch": 0.7278620838175722, + "grad_norm": 0.5784611105918884, + "learning_rate": 3.4372188642201055e-06, + "loss": 0.1396, + "step": 39206 + }, + { + "epoch": 0.7278992139549908, + "grad_norm": 0.3011987507343292, + "learning_rate": 3.436338769413562e-06, + "loss": 0.1555, + "step": 39208 + }, + { + "epoch": 0.7279363440924095, + "grad_norm": 0.39167916774749756, + "learning_rate": 3.4354587639187843e-06, + "loss": 0.2627, + "step": 39210 + }, + { + "epoch": 0.7279734742298282, + "grad_norm": 0.3705577254295349, + "learning_rate": 3.4345788477477502e-06, + "loss": 0.3626, + "step": 39212 + }, + { + "epoch": 0.7280106043672467, + "grad_norm": 0.41052165627479553, + "learning_rate": 3.433699020912432e-06, + "loss": 0.1662, + "step": 39214 + }, + { + "epoch": 0.7280477345046654, + "grad_norm": 0.32566291093826294, + "learning_rate": 3.4328192834248007e-06, + "loss": 0.3315, + "step": 39216 + }, + { + "epoch": 0.728084864642084, + "grad_norm": 0.30807965993881226, + "learning_rate": 3.431939635296829e-06, + "loss": 0.2695, + "step": 39218 + }, + { + "epoch": 0.7281219947795027, + "grad_norm": 0.43532320857048035, + "learning_rate": 3.431060076540489e-06, + "loss": 0.2736, + "step": 39220 + }, + { + "epoch": 0.7281591249169214, + "grad_norm": 0.2833642363548279, + "learning_rate": 3.430180607167741e-06, + "loss": 0.0409, + "step": 39222 + }, + { + "epoch": 0.7281962550543399, + "grad_norm": 0.3956841826438904, + "learning_rate": 3.4293012271905602e-06, + "loss": 0.5156, + "step": 39224 + }, + { + "epoch": 0.7282333851917586, + "grad_norm": 0.3698021471500397, + "learning_rate": 3.4284219366209036e-06, + "loss": 0.4876, + "step": 39226 + }, + { + "epoch": 0.7282705153291772, + "grad_norm": 0.42781808972358704, + "learning_rate": 3.4275427354707414e-06, + "loss": 0.1689, + "step": 39228 + }, + { + "epoch": 0.7283076454665959, + "grad_norm": 0.373699426651001, + "learning_rate": 3.4266636237520357e-06, + "loss": 0.2146, + "step": 39230 + }, + { + "epoch": 0.7283447756040146, + "grad_norm": 0.38557422161102295, + "learning_rate": 3.4257846014767515e-06, + "loss": 0.2868, + "step": 39232 + }, + { + "epoch": 0.7283819057414331, + "grad_norm": 0.3292366564273834, + "learning_rate": 3.424905668656844e-06, + "loss": 0.372, + "step": 39234 + }, + { + "epoch": 0.7284190358788518, + "grad_norm": 0.3065301775932312, + "learning_rate": 3.4240268253042754e-06, + "loss": 0.2086, + "step": 39236 + }, + { + "epoch": 0.7284561660162704, + "grad_norm": 0.21838930249214172, + "learning_rate": 3.423148071431004e-06, + "loss": 0.2176, + "step": 39238 + }, + { + "epoch": 0.7284932961536891, + "grad_norm": 0.42174071073532104, + "learning_rate": 3.4222694070489872e-06, + "loss": 0.3312, + "step": 39240 + }, + { + "epoch": 0.7285304262911078, + "grad_norm": 0.5030283331871033, + "learning_rate": 3.4213908321701848e-06, + "loss": 0.1766, + "step": 39242 + }, + { + "epoch": 0.7285675564285263, + "grad_norm": 0.21340882778167725, + "learning_rate": 3.420512346806545e-06, + "loss": 0.2802, + "step": 39244 + }, + { + "epoch": 0.728604686565945, + "grad_norm": 0.5842553377151489, + "learning_rate": 3.4196339509700284e-06, + "loss": 0.3174, + "step": 39246 + }, + { + "epoch": 0.7286418167033636, + "grad_norm": 0.3922097980976105, + "learning_rate": 3.418755644672579e-06, + "loss": 0.1877, + "step": 39248 + }, + { + "epoch": 0.7286789468407823, + "grad_norm": 0.5512311458587646, + "learning_rate": 3.4178774279261516e-06, + "loss": 0.3478, + "step": 39250 + }, + { + "epoch": 0.7287160769782008, + "grad_norm": 0.4869624376296997, + "learning_rate": 3.416999300742697e-06, + "loss": 0.1459, + "step": 39252 + }, + { + "epoch": 0.7287532071156195, + "grad_norm": 0.42814716696739197, + "learning_rate": 3.4161212631341645e-06, + "loss": 0.3873, + "step": 39254 + }, + { + "epoch": 0.7287903372530382, + "grad_norm": 0.2851299047470093, + "learning_rate": 3.4152433151124996e-06, + "loss": 0.2699, + "step": 39256 + }, + { + "epoch": 0.7288274673904568, + "grad_norm": 0.5726222395896912, + "learning_rate": 3.4143654566896533e-06, + "loss": 0.4267, + "step": 39258 + }, + { + "epoch": 0.7288645975278755, + "grad_norm": 0.41617101430892944, + "learning_rate": 3.4134876878775637e-06, + "loss": 0.2671, + "step": 39260 + }, + { + "epoch": 0.728901727665294, + "grad_norm": 0.4523519277572632, + "learning_rate": 3.412610008688176e-06, + "loss": 0.2047, + "step": 39262 + }, + { + "epoch": 0.7289388578027127, + "grad_norm": 0.41227251291275024, + "learning_rate": 3.41173241913344e-06, + "loss": 0.2582, + "step": 39264 + }, + { + "epoch": 0.7289759879401314, + "grad_norm": 0.5662005543708801, + "learning_rate": 3.410854919225288e-06, + "loss": 0.3484, + "step": 39266 + }, + { + "epoch": 0.72901311807755, + "grad_norm": 0.3841313123703003, + "learning_rate": 3.409977508975664e-06, + "loss": 0.3902, + "step": 39268 + }, + { + "epoch": 0.7290502482149687, + "grad_norm": 0.3165387809276581, + "learning_rate": 3.409100188396509e-06, + "loss": 0.4446, + "step": 39270 + }, + { + "epoch": 0.7290873783523872, + "grad_norm": 0.428327739238739, + "learning_rate": 3.408222957499755e-06, + "loss": 0.306, + "step": 39272 + }, + { + "epoch": 0.7291245084898059, + "grad_norm": 0.4113166034221649, + "learning_rate": 3.4073458162973426e-06, + "loss": 0.197, + "step": 39274 + }, + { + "epoch": 0.7291616386272246, + "grad_norm": 0.4515472948551178, + "learning_rate": 3.4064687648012065e-06, + "loss": 0.0695, + "step": 39276 + }, + { + "epoch": 0.7291987687646432, + "grad_norm": 0.36745816469192505, + "learning_rate": 3.4055918030232803e-06, + "loss": 0.174, + "step": 39278 + }, + { + "epoch": 0.7292358989020619, + "grad_norm": 0.25630059838294983, + "learning_rate": 3.404714930975497e-06, + "loss": 0.1856, + "step": 39280 + }, + { + "epoch": 0.7292730290394804, + "grad_norm": 0.3291914761066437, + "learning_rate": 3.403838148669789e-06, + "loss": 0.2835, + "step": 39282 + }, + { + "epoch": 0.7293101591768991, + "grad_norm": 0.2569425106048584, + "learning_rate": 3.402961456118088e-06, + "loss": 0.117, + "step": 39284 + }, + { + "epoch": 0.7293472893143178, + "grad_norm": 0.352387934923172, + "learning_rate": 3.4020848533323224e-06, + "loss": 0.3116, + "step": 39286 + }, + { + "epoch": 0.7293844194517364, + "grad_norm": 0.32251402735710144, + "learning_rate": 3.4012083403244156e-06, + "loss": 0.2413, + "step": 39288 + }, + { + "epoch": 0.7294215495891551, + "grad_norm": 0.260231614112854, + "learning_rate": 3.4003319171062966e-06, + "loss": 0.1673, + "step": 39290 + }, + { + "epoch": 0.7294586797265736, + "grad_norm": 0.58722984790802, + "learning_rate": 3.3994555836898924e-06, + "loss": 0.2837, + "step": 39292 + }, + { + "epoch": 0.7294958098639923, + "grad_norm": 0.33602431416511536, + "learning_rate": 3.398579340087127e-06, + "loss": 0.1967, + "step": 39294 + }, + { + "epoch": 0.7295329400014109, + "grad_norm": 0.3525410592556, + "learning_rate": 3.3977031863099264e-06, + "loss": 0.2276, + "step": 39296 + }, + { + "epoch": 0.7295700701388296, + "grad_norm": 0.30403149127960205, + "learning_rate": 3.3968271223702052e-06, + "loss": 0.3183, + "step": 39298 + }, + { + "epoch": 0.7296072002762483, + "grad_norm": 0.7099027037620544, + "learning_rate": 3.395951148279889e-06, + "loss": 0.3452, + "step": 39300 + }, + { + "epoch": 0.7296443304136668, + "grad_norm": 0.5833398103713989, + "learning_rate": 3.3950752640508965e-06, + "loss": 0.1752, + "step": 39302 + }, + { + "epoch": 0.7296814605510855, + "grad_norm": 0.5422753095626831, + "learning_rate": 3.3941994696951452e-06, + "loss": 0.3048, + "step": 39304 + }, + { + "epoch": 0.7297185906885041, + "grad_norm": 0.34894996881484985, + "learning_rate": 3.3933237652245555e-06, + "loss": 0.1206, + "step": 39306 + }, + { + "epoch": 0.7297557208259228, + "grad_norm": 0.3858398199081421, + "learning_rate": 3.392448150651041e-06, + "loss": 0.1022, + "step": 39308 + }, + { + "epoch": 0.7297928509633415, + "grad_norm": 0.259263277053833, + "learning_rate": 3.3915726259865113e-06, + "loss": 0.1286, + "step": 39310 + }, + { + "epoch": 0.72982998110076, + "grad_norm": 0.2320471554994583, + "learning_rate": 3.3906971912428834e-06, + "loss": 0.3445, + "step": 39312 + }, + { + "epoch": 0.7298671112381787, + "grad_norm": 0.22795124351978302, + "learning_rate": 3.3898218464320698e-06, + "loss": 0.1224, + "step": 39314 + }, + { + "epoch": 0.7299042413755973, + "grad_norm": 0.3710617423057556, + "learning_rate": 3.3889465915659815e-06, + "loss": 0.2996, + "step": 39316 + }, + { + "epoch": 0.729941371513016, + "grad_norm": 0.42007747292518616, + "learning_rate": 3.388071426656526e-06, + "loss": 0.2574, + "step": 39318 + }, + { + "epoch": 0.7299785016504347, + "grad_norm": 0.28993046283721924, + "learning_rate": 3.387196351715619e-06, + "loss": 0.1761, + "step": 39320 + }, + { + "epoch": 0.7300156317878532, + "grad_norm": 0.28955498337745667, + "learning_rate": 3.3863213667551577e-06, + "loss": 0.231, + "step": 39322 + }, + { + "epoch": 0.7300527619252719, + "grad_norm": 0.5247309803962708, + "learning_rate": 3.3854464717870515e-06, + "loss": 0.3921, + "step": 39324 + }, + { + "epoch": 0.7300898920626905, + "grad_norm": 0.5087648630142212, + "learning_rate": 3.3845716668232077e-06, + "loss": 0.4016, + "step": 39326 + }, + { + "epoch": 0.7301270222001092, + "grad_norm": 0.30517423152923584, + "learning_rate": 3.3836969518755305e-06, + "loss": 0.2436, + "step": 39328 + }, + { + "epoch": 0.7301641523375278, + "grad_norm": 0.39813855290412903, + "learning_rate": 3.3828223269559156e-06, + "loss": 0.2611, + "step": 39330 + }, + { + "epoch": 0.7302012824749464, + "grad_norm": 0.4249803423881531, + "learning_rate": 3.381947792076269e-06, + "loss": 0.2707, + "step": 39332 + }, + { + "epoch": 0.7302384126123651, + "grad_norm": 0.36574143171310425, + "learning_rate": 3.3810733472484925e-06, + "loss": 0.1892, + "step": 39334 + }, + { + "epoch": 0.7302755427497837, + "grad_norm": 0.37473776936531067, + "learning_rate": 3.380198992484478e-06, + "loss": 0.243, + "step": 39336 + }, + { + "epoch": 0.7303126728872024, + "grad_norm": 0.6076406836509705, + "learning_rate": 3.379324727796127e-06, + "loss": 0.1917, + "step": 39338 + }, + { + "epoch": 0.730349803024621, + "grad_norm": 0.40308690071105957, + "learning_rate": 3.378450553195336e-06, + "loss": 0.3379, + "step": 39340 + }, + { + "epoch": 0.7303869331620396, + "grad_norm": 0.4628306031227112, + "learning_rate": 3.377576468693998e-06, + "loss": 0.3224, + "step": 39342 + }, + { + "epoch": 0.7304240632994583, + "grad_norm": 0.29092615842819214, + "learning_rate": 3.3767024743040087e-06, + "loss": 0.2771, + "step": 39344 + }, + { + "epoch": 0.7304611934368769, + "grad_norm": 0.48770588636398315, + "learning_rate": 3.3758285700372627e-06, + "loss": 0.4219, + "step": 39346 + }, + { + "epoch": 0.7304983235742956, + "grad_norm": 0.8077456951141357, + "learning_rate": 3.374954755905645e-06, + "loss": 0.3397, + "step": 39348 + }, + { + "epoch": 0.7305354537117141, + "grad_norm": 0.5066648125648499, + "learning_rate": 3.3740810319210517e-06, + "loss": 0.4439, + "step": 39350 + }, + { + "epoch": 0.7305725838491328, + "grad_norm": 0.29786914587020874, + "learning_rate": 3.373207398095365e-06, + "loss": 0.4096, + "step": 39352 + }, + { + "epoch": 0.7306097139865515, + "grad_norm": 0.37848058342933655, + "learning_rate": 3.372333854440477e-06, + "loss": 0.5382, + "step": 39354 + }, + { + "epoch": 0.7306468441239701, + "grad_norm": 0.4330310523509979, + "learning_rate": 3.3714604009682715e-06, + "loss": 0.3551, + "step": 39356 + }, + { + "epoch": 0.7306839742613888, + "grad_norm": 0.5164570808410645, + "learning_rate": 3.37058703769064e-06, + "loss": 0.1785, + "step": 39358 + }, + { + "epoch": 0.7307211043988073, + "grad_norm": 0.4931236505508423, + "learning_rate": 3.3697137646194577e-06, + "loss": 0.1962, + "step": 39360 + }, + { + "epoch": 0.730758234536226, + "grad_norm": 0.24815839529037476, + "learning_rate": 3.3688405817666114e-06, + "loss": 0.2879, + "step": 39362 + }, + { + "epoch": 0.7307953646736447, + "grad_norm": 0.4302636981010437, + "learning_rate": 3.367967489143982e-06, + "loss": 0.0748, + "step": 39364 + }, + { + "epoch": 0.7308324948110633, + "grad_norm": 0.2541126012802124, + "learning_rate": 3.367094486763449e-06, + "loss": 0.294, + "step": 39366 + }, + { + "epoch": 0.730869624948482, + "grad_norm": 0.42234769463539124, + "learning_rate": 3.366221574636893e-06, + "loss": 0.1281, + "step": 39368 + }, + { + "epoch": 0.7309067550859005, + "grad_norm": 0.39338475465774536, + "learning_rate": 3.365348752776195e-06, + "loss": 0.1647, + "step": 39370 + }, + { + "epoch": 0.7309438852233192, + "grad_norm": 0.4355759918689728, + "learning_rate": 3.364476021193226e-06, + "loss": 0.3694, + "step": 39372 + }, + { + "epoch": 0.7309810153607379, + "grad_norm": 0.32050520181655884, + "learning_rate": 3.3636033798998602e-06, + "loss": 0.304, + "step": 39374 + }, + { + "epoch": 0.7310181454981565, + "grad_norm": 0.37411239743232727, + "learning_rate": 3.362730828907974e-06, + "loss": 0.3821, + "step": 39376 + }, + { + "epoch": 0.7310552756355752, + "grad_norm": 0.6035988330841064, + "learning_rate": 3.36185836822944e-06, + "loss": 0.4661, + "step": 39378 + }, + { + "epoch": 0.7310924057729937, + "grad_norm": 0.36967048048973083, + "learning_rate": 3.3609859978761294e-06, + "loss": 0.2105, + "step": 39380 + }, + { + "epoch": 0.7311295359104124, + "grad_norm": 0.320677787065506, + "learning_rate": 3.360113717859914e-06, + "loss": 0.2503, + "step": 39382 + }, + { + "epoch": 0.7311666660478311, + "grad_norm": 0.2092316448688507, + "learning_rate": 3.359241528192666e-06, + "loss": 0.1744, + "step": 39384 + }, + { + "epoch": 0.7312037961852497, + "grad_norm": 0.34371185302734375, + "learning_rate": 3.3583694288862454e-06, + "loss": 0.2696, + "step": 39386 + }, + { + "epoch": 0.7312409263226683, + "grad_norm": 0.5447477102279663, + "learning_rate": 3.3574974199525234e-06, + "loss": 0.2308, + "step": 39388 + }, + { + "epoch": 0.7312780564600869, + "grad_norm": 0.41978776454925537, + "learning_rate": 3.3566255014033645e-06, + "loss": 0.1992, + "step": 39390 + }, + { + "epoch": 0.7313151865975056, + "grad_norm": 0.7561306953430176, + "learning_rate": 3.3557536732506367e-06, + "loss": 0.3107, + "step": 39392 + }, + { + "epoch": 0.7313523167349243, + "grad_norm": 0.2962585687637329, + "learning_rate": 3.354881935506196e-06, + "loss": 0.2137, + "step": 39394 + }, + { + "epoch": 0.7313894468723429, + "grad_norm": 0.4822298288345337, + "learning_rate": 3.3540102881819114e-06, + "loss": 0.1435, + "step": 39396 + }, + { + "epoch": 0.7314265770097615, + "grad_norm": 0.6164640188217163, + "learning_rate": 3.353138731289636e-06, + "loss": 0.2851, + "step": 39398 + }, + { + "epoch": 0.7314637071471801, + "grad_norm": 0.41828978061676025, + "learning_rate": 3.3522672648412336e-06, + "loss": 0.2906, + "step": 39400 + }, + { + "epoch": 0.7315008372845988, + "grad_norm": 0.29566556215286255, + "learning_rate": 3.351395888848561e-06, + "loss": 0.118, + "step": 39402 + }, + { + "epoch": 0.7315379674220174, + "grad_norm": 0.4122690260410309, + "learning_rate": 3.350524603323475e-06, + "loss": 0.2738, + "step": 39404 + }, + { + "epoch": 0.7315750975594361, + "grad_norm": 0.47900405526161194, + "learning_rate": 3.349653408277832e-06, + "loss": 0.2906, + "step": 39406 + }, + { + "epoch": 0.7316122276968547, + "grad_norm": 0.36774539947509766, + "learning_rate": 3.348782303723486e-06, + "loss": 0.1752, + "step": 39408 + }, + { + "epoch": 0.7316493578342733, + "grad_norm": 0.34023237228393555, + "learning_rate": 3.3479112896722934e-06, + "loss": 0.1673, + "step": 39410 + }, + { + "epoch": 0.731686487971692, + "grad_norm": 0.44448593258857727, + "learning_rate": 3.3470403661360996e-06, + "loss": 0.4472, + "step": 39412 + }, + { + "epoch": 0.7317236181091106, + "grad_norm": 0.31473252177238464, + "learning_rate": 3.346169533126762e-06, + "loss": 0.0896, + "step": 39414 + }, + { + "epoch": 0.7317607482465293, + "grad_norm": 0.3743666708469391, + "learning_rate": 3.3452987906561228e-06, + "loss": 0.4201, + "step": 39416 + }, + { + "epoch": 0.7317978783839479, + "grad_norm": 0.4425252377986908, + "learning_rate": 3.3444281387360343e-06, + "loss": 0.3131, + "step": 39418 + }, + { + "epoch": 0.7318350085213665, + "grad_norm": 0.5893719792366028, + "learning_rate": 3.3435575773783436e-06, + "loss": 0.2184, + "step": 39420 + }, + { + "epoch": 0.7318721386587852, + "grad_norm": 0.42541906237602234, + "learning_rate": 3.3426871065948986e-06, + "loss": 0.3098, + "step": 39422 + }, + { + "epoch": 0.7319092687962038, + "grad_norm": 0.5064725875854492, + "learning_rate": 3.341816726397538e-06, + "loss": 0.2096, + "step": 39424 + }, + { + "epoch": 0.7319463989336225, + "grad_norm": 0.4209304749965668, + "learning_rate": 3.3409464367981083e-06, + "loss": 0.1485, + "step": 39426 + }, + { + "epoch": 0.7319835290710411, + "grad_norm": 0.4730643033981323, + "learning_rate": 3.3400762378084507e-06, + "loss": 0.1575, + "step": 39428 + }, + { + "epoch": 0.7320206592084597, + "grad_norm": 0.27889904379844666, + "learning_rate": 3.339206129440408e-06, + "loss": 0.4311, + "step": 39430 + }, + { + "epoch": 0.7320577893458784, + "grad_norm": 0.4501406252384186, + "learning_rate": 3.33833611170582e-06, + "loss": 0.2664, + "step": 39432 + }, + { + "epoch": 0.732094919483297, + "grad_norm": 0.48181676864624023, + "learning_rate": 3.3374661846165214e-06, + "loss": 0.3467, + "step": 39434 + }, + { + "epoch": 0.7321320496207157, + "grad_norm": 0.38327500224113464, + "learning_rate": 3.3365963481843544e-06, + "loss": 0.4224, + "step": 39436 + }, + { + "epoch": 0.7321691797581343, + "grad_norm": 0.36210042238235474, + "learning_rate": 3.335726602421149e-06, + "loss": 0.1029, + "step": 39438 + }, + { + "epoch": 0.7322063098955529, + "grad_norm": 0.2959725260734558, + "learning_rate": 3.3348569473387427e-06, + "loss": 0.2102, + "step": 39440 + }, + { + "epoch": 0.7322434400329716, + "grad_norm": 0.34667396545410156, + "learning_rate": 3.3339873829489686e-06, + "loss": 0.3824, + "step": 39442 + }, + { + "epoch": 0.7322805701703902, + "grad_norm": 0.5159398317337036, + "learning_rate": 3.333117909263659e-06, + "loss": 0.1984, + "step": 39444 + }, + { + "epoch": 0.7323177003078088, + "grad_norm": 0.306907057762146, + "learning_rate": 3.3322485262946456e-06, + "loss": 0.2022, + "step": 39446 + }, + { + "epoch": 0.7323548304452274, + "grad_norm": 0.24252590537071228, + "learning_rate": 3.3313792340537597e-06, + "loss": 0.1741, + "step": 39448 + }, + { + "epoch": 0.7323919605826461, + "grad_norm": 0.4049168527126312, + "learning_rate": 3.3305100325528263e-06, + "loss": 0.5092, + "step": 39450 + }, + { + "epoch": 0.7324290907200648, + "grad_norm": 0.35631072521209717, + "learning_rate": 3.329640921803672e-06, + "loss": 0.4262, + "step": 39452 + }, + { + "epoch": 0.7324662208574834, + "grad_norm": 0.4704231917858124, + "learning_rate": 3.3287719018181286e-06, + "loss": 0.3182, + "step": 39454 + }, + { + "epoch": 0.732503350994902, + "grad_norm": 0.3431044816970825, + "learning_rate": 3.3279029726080148e-06, + "loss": 0.3056, + "step": 39456 + }, + { + "epoch": 0.7325404811323206, + "grad_norm": 0.5290939807891846, + "learning_rate": 3.3270341341851555e-06, + "loss": 0.4403, + "step": 39458 + }, + { + "epoch": 0.7325776112697393, + "grad_norm": 0.3120960295200348, + "learning_rate": 3.326165386561376e-06, + "loss": 0.2928, + "step": 39460 + }, + { + "epoch": 0.732614741407158, + "grad_norm": 0.3846617043018341, + "learning_rate": 3.325296729748493e-06, + "loss": 0.1502, + "step": 39462 + }, + { + "epoch": 0.7326518715445766, + "grad_norm": 0.5202431082725525, + "learning_rate": 3.3244281637583288e-06, + "loss": 0.2824, + "step": 39464 + }, + { + "epoch": 0.7326890016819952, + "grad_norm": 0.5688328146934509, + "learning_rate": 3.323559688602701e-06, + "loss": 0.12, + "step": 39466 + }, + { + "epoch": 0.7327261318194138, + "grad_norm": 0.3555378317832947, + "learning_rate": 3.3226913042934272e-06, + "loss": 0.2833, + "step": 39468 + }, + { + "epoch": 0.7327632619568325, + "grad_norm": 0.28759363293647766, + "learning_rate": 3.3218230108423242e-06, + "loss": 0.1606, + "step": 39470 + }, + { + "epoch": 0.7328003920942512, + "grad_norm": 0.3010222911834717, + "learning_rate": 3.3209548082612097e-06, + "loss": 0.2953, + "step": 39472 + }, + { + "epoch": 0.7328375222316698, + "grad_norm": 0.38120535016059875, + "learning_rate": 3.320086696561892e-06, + "loss": 0.2438, + "step": 39474 + }, + { + "epoch": 0.7328746523690884, + "grad_norm": 0.5771275758743286, + "learning_rate": 3.319218675756185e-06, + "loss": 0.1046, + "step": 39476 + }, + { + "epoch": 0.732911782506507, + "grad_norm": 0.3151102364063263, + "learning_rate": 3.3183507458559037e-06, + "loss": 0.3682, + "step": 39478 + }, + { + "epoch": 0.7329489126439257, + "grad_norm": 0.642266035079956, + "learning_rate": 3.317482906872851e-06, + "loss": 0.3063, + "step": 39480 + }, + { + "epoch": 0.7329860427813444, + "grad_norm": 0.6876683831214905, + "learning_rate": 3.316615158818841e-06, + "loss": 0.3108, + "step": 39482 + }, + { + "epoch": 0.733023172918763, + "grad_norm": 0.42287275195121765, + "learning_rate": 3.315747501705682e-06, + "loss": 0.2266, + "step": 39484 + }, + { + "epoch": 0.7330603030561816, + "grad_norm": 0.5441722869873047, + "learning_rate": 3.3148799355451745e-06, + "loss": 0.2867, + "step": 39486 + }, + { + "epoch": 0.7330974331936002, + "grad_norm": 0.33868300914764404, + "learning_rate": 3.3140124603491265e-06, + "loss": 0.2, + "step": 39488 + }, + { + "epoch": 0.7331345633310189, + "grad_norm": 0.43034493923187256, + "learning_rate": 3.3131450761293425e-06, + "loss": 0.3117, + "step": 39490 + }, + { + "epoch": 0.7331716934684376, + "grad_norm": 0.27191710472106934, + "learning_rate": 3.312277782897625e-06, + "loss": 0.2676, + "step": 39492 + }, + { + "epoch": 0.7332088236058562, + "grad_norm": 0.3336094319820404, + "learning_rate": 3.3114105806657747e-06, + "loss": 0.3404, + "step": 39494 + }, + { + "epoch": 0.7332459537432748, + "grad_norm": 0.2710396647453308, + "learning_rate": 3.310543469445594e-06, + "loss": 0.1291, + "step": 39496 + }, + { + "epoch": 0.7332830838806934, + "grad_norm": 0.4027945399284363, + "learning_rate": 3.3096764492488776e-06, + "loss": 0.6429, + "step": 39498 + }, + { + "epoch": 0.7333202140181121, + "grad_norm": 0.3284349739551544, + "learning_rate": 3.308809520087427e-06, + "loss": 0.3183, + "step": 39500 + }, + { + "epoch": 0.7333573441555307, + "grad_norm": 0.3933369815349579, + "learning_rate": 3.3079426819730344e-06, + "loss": 0.329, + "step": 39502 + }, + { + "epoch": 0.7333944742929493, + "grad_norm": 0.35107147693634033, + "learning_rate": 3.3070759349174953e-06, + "loss": 0.4135, + "step": 39504 + }, + { + "epoch": 0.733431604430368, + "grad_norm": 0.4709293246269226, + "learning_rate": 3.306209278932606e-06, + "loss": 0.3575, + "step": 39506 + }, + { + "epoch": 0.7334687345677866, + "grad_norm": 0.3905387818813324, + "learning_rate": 3.3053427140301587e-06, + "loss": 0.2984, + "step": 39508 + }, + { + "epoch": 0.7335058647052053, + "grad_norm": 0.3877747654914856, + "learning_rate": 3.3044762402219464e-06, + "loss": 0.1817, + "step": 39510 + }, + { + "epoch": 0.7335429948426239, + "grad_norm": 0.24385499954223633, + "learning_rate": 3.303609857519754e-06, + "loss": 0.363, + "step": 39512 + }, + { + "epoch": 0.7335801249800425, + "grad_norm": 0.3909846246242523, + "learning_rate": 3.3027435659353735e-06, + "loss": 0.3082, + "step": 39514 + }, + { + "epoch": 0.7336172551174612, + "grad_norm": 0.221132293343544, + "learning_rate": 3.3018773654805934e-06, + "loss": 0.2259, + "step": 39516 + }, + { + "epoch": 0.7336543852548798, + "grad_norm": 0.5370063781738281, + "learning_rate": 3.3010112561672013e-06, + "loss": 0.2294, + "step": 39518 + }, + { + "epoch": 0.7336915153922985, + "grad_norm": 0.39975836873054504, + "learning_rate": 3.300145238006978e-06, + "loss": 0.2423, + "step": 39520 + }, + { + "epoch": 0.7337286455297171, + "grad_norm": 0.3813387155532837, + "learning_rate": 3.2992793110117114e-06, + "loss": 0.4738, + "step": 39522 + }, + { + "epoch": 0.7337657756671357, + "grad_norm": 0.42539745569229126, + "learning_rate": 3.2984134751931797e-06, + "loss": 0.305, + "step": 39524 + }, + { + "epoch": 0.7338029058045544, + "grad_norm": 0.7070034146308899, + "learning_rate": 3.2975477305631675e-06, + "loss": 0.3733, + "step": 39526 + }, + { + "epoch": 0.733840035941973, + "grad_norm": 0.44668787717819214, + "learning_rate": 3.296682077133454e-06, + "loss": 0.1876, + "step": 39528 + }, + { + "epoch": 0.7338771660793917, + "grad_norm": 0.37090885639190674, + "learning_rate": 3.2958165149158195e-06, + "loss": 0.2827, + "step": 39530 + }, + { + "epoch": 0.7339142962168103, + "grad_norm": 0.23615257441997528, + "learning_rate": 3.29495104392204e-06, + "loss": 0.1217, + "step": 39532 + }, + { + "epoch": 0.7339514263542289, + "grad_norm": 0.3470562696456909, + "learning_rate": 3.294085664163893e-06, + "loss": 0.266, + "step": 39534 + }, + { + "epoch": 0.7339885564916476, + "grad_norm": 0.29207083582878113, + "learning_rate": 3.2932203756531577e-06, + "loss": 0.2794, + "step": 39536 + }, + { + "epoch": 0.7340256866290662, + "grad_norm": 0.3248654305934906, + "learning_rate": 3.2923551784015993e-06, + "loss": 0.2084, + "step": 39538 + }, + { + "epoch": 0.7340628167664849, + "grad_norm": 0.4295586347579956, + "learning_rate": 3.2914900724209997e-06, + "loss": 0.3463, + "step": 39540 + }, + { + "epoch": 0.7340999469039035, + "grad_norm": 0.38350972533226013, + "learning_rate": 3.2906250577231227e-06, + "loss": 0.4041, + "step": 39542 + }, + { + "epoch": 0.7341370770413221, + "grad_norm": 0.31241822242736816, + "learning_rate": 3.2897601343197404e-06, + "loss": 0.2645, + "step": 39544 + }, + { + "epoch": 0.7341742071787408, + "grad_norm": 0.42178383469581604, + "learning_rate": 3.2888953022226245e-06, + "loss": 0.1052, + "step": 39546 + }, + { + "epoch": 0.7342113373161594, + "grad_norm": 0.5436000227928162, + "learning_rate": 3.2880305614435436e-06, + "loss": 0.4197, + "step": 39548 + }, + { + "epoch": 0.7342484674535781, + "grad_norm": 0.2490389496088028, + "learning_rate": 3.287165911994259e-06, + "loss": 0.1374, + "step": 39550 + }, + { + "epoch": 0.7342855975909967, + "grad_norm": 0.4188910126686096, + "learning_rate": 3.2863013538865398e-06, + "loss": 0.23, + "step": 39552 + }, + { + "epoch": 0.7343227277284153, + "grad_norm": 0.396639347076416, + "learning_rate": 3.2854368871321486e-06, + "loss": 0.2075, + "step": 39554 + }, + { + "epoch": 0.7343598578658339, + "grad_norm": 0.39778420329093933, + "learning_rate": 3.2845725117428494e-06, + "loss": 0.1571, + "step": 39556 + }, + { + "epoch": 0.7343969880032526, + "grad_norm": 0.8328045606613159, + "learning_rate": 3.283708227730403e-06, + "loss": 0.2332, + "step": 39558 + }, + { + "epoch": 0.7344341181406713, + "grad_norm": 0.3782265782356262, + "learning_rate": 3.2828440351065736e-06, + "loss": 0.26, + "step": 39560 + }, + { + "epoch": 0.7344712482780898, + "grad_norm": 0.5119785666465759, + "learning_rate": 3.281979933883117e-06, + "loss": 0.2451, + "step": 39562 + }, + { + "epoch": 0.7345083784155085, + "grad_norm": 0.3259422481060028, + "learning_rate": 3.2811159240717873e-06, + "loss": 0.3005, + "step": 39564 + }, + { + "epoch": 0.7345455085529271, + "grad_norm": 0.48967939615249634, + "learning_rate": 3.2802520056843434e-06, + "loss": 0.2943, + "step": 39566 + }, + { + "epoch": 0.7345826386903458, + "grad_norm": 0.5661710500717163, + "learning_rate": 3.279388178732542e-06, + "loss": 0.2235, + "step": 39568 + }, + { + "epoch": 0.7346197688277645, + "grad_norm": 0.44337931275367737, + "learning_rate": 3.278524443228138e-06, + "loss": 0.3713, + "step": 39570 + }, + { + "epoch": 0.734656898965183, + "grad_norm": 0.5170077681541443, + "learning_rate": 3.2776607991828822e-06, + "loss": 0.2708, + "step": 39572 + }, + { + "epoch": 0.7346940291026017, + "grad_norm": 0.3662564754486084, + "learning_rate": 3.2767972466085306e-06, + "loss": 0.22, + "step": 39574 + }, + { + "epoch": 0.7347311592400203, + "grad_norm": 0.1713951677083969, + "learning_rate": 3.275933785516827e-06, + "loss": 0.1951, + "step": 39576 + }, + { + "epoch": 0.734768289377439, + "grad_norm": 0.22462594509124756, + "learning_rate": 3.2750704159195247e-06, + "loss": 0.3248, + "step": 39578 + }, + { + "epoch": 0.7348054195148577, + "grad_norm": 0.3305954933166504, + "learning_rate": 3.274207137828369e-06, + "loss": 0.2504, + "step": 39580 + }, + { + "epoch": 0.7348425496522762, + "grad_norm": 0.4176628291606903, + "learning_rate": 3.2733439512551123e-06, + "loss": 0.3172, + "step": 39582 + }, + { + "epoch": 0.7348796797896949, + "grad_norm": 0.3020023703575134, + "learning_rate": 3.272480856211492e-06, + "loss": 0.3791, + "step": 39584 + }, + { + "epoch": 0.7349168099271135, + "grad_norm": 0.5806213617324829, + "learning_rate": 3.271617852709259e-06, + "loss": 0.3438, + "step": 39586 + }, + { + "epoch": 0.7349539400645322, + "grad_norm": 0.3570534586906433, + "learning_rate": 3.2707549407601504e-06, + "loss": 0.1697, + "step": 39588 + }, + { + "epoch": 0.7349910702019509, + "grad_norm": 0.30950888991355896, + "learning_rate": 3.2698921203759103e-06, + "loss": 0.1169, + "step": 39590 + }, + { + "epoch": 0.7350282003393694, + "grad_norm": 0.5300682783126831, + "learning_rate": 3.2690293915682803e-06, + "loss": 0.4432, + "step": 39592 + }, + { + "epoch": 0.7350653304767881, + "grad_norm": 0.2543310523033142, + "learning_rate": 3.268166754348998e-06, + "loss": 0.1145, + "step": 39594 + }, + { + "epoch": 0.7351024606142067, + "grad_norm": 0.3860691785812378, + "learning_rate": 3.267304208729801e-06, + "loss": 0.3455, + "step": 39596 + }, + { + "epoch": 0.7351395907516254, + "grad_norm": 0.5638052821159363, + "learning_rate": 3.2664417547224325e-06, + "loss": 0.2403, + "step": 39598 + }, + { + "epoch": 0.735176720889044, + "grad_norm": 0.20706762373447418, + "learning_rate": 3.265579392338617e-06, + "loss": 0.2002, + "step": 39600 + }, + { + "epoch": 0.7352138510264626, + "grad_norm": 0.42782899737358093, + "learning_rate": 3.2647171215900953e-06, + "loss": 0.1812, + "step": 39602 + }, + { + "epoch": 0.7352509811638813, + "grad_norm": 0.2844115197658539, + "learning_rate": 3.2638549424886025e-06, + "loss": 0.3141, + "step": 39604 + }, + { + "epoch": 0.7352881113012999, + "grad_norm": 0.2873739004135132, + "learning_rate": 3.262992855045862e-06, + "loss": 0.296, + "step": 39606 + }, + { + "epoch": 0.7353252414387186, + "grad_norm": 0.3876924216747284, + "learning_rate": 3.262130859273611e-06, + "loss": 0.2826, + "step": 39608 + }, + { + "epoch": 0.7353623715761372, + "grad_norm": 1.2880855798721313, + "learning_rate": 3.2612689551835785e-06, + "loss": 0.428, + "step": 39610 + }, + { + "epoch": 0.7353995017135558, + "grad_norm": 0.3159998953342438, + "learning_rate": 3.2604071427874882e-06, + "loss": 0.2949, + "step": 39612 + }, + { + "epoch": 0.7354366318509745, + "grad_norm": 0.3501976728439331, + "learning_rate": 3.2595454220970692e-06, + "loss": 0.2649, + "step": 39614 + }, + { + "epoch": 0.7354737619883931, + "grad_norm": 3.603623867034912, + "learning_rate": 3.2586837931240465e-06, + "loss": 0.2248, + "step": 39616 + }, + { + "epoch": 0.7355108921258118, + "grad_norm": 0.4075184166431427, + "learning_rate": 3.2578222558801455e-06, + "loss": 0.3439, + "step": 39618 + }, + { + "epoch": 0.7355480222632303, + "grad_norm": 0.286770224571228, + "learning_rate": 3.2569608103770887e-06, + "loss": 0.1231, + "step": 39620 + }, + { + "epoch": 0.735585152400649, + "grad_norm": 0.2635183036327362, + "learning_rate": 3.2560994566266e-06, + "loss": 0.3503, + "step": 39622 + }, + { + "epoch": 0.7356222825380677, + "grad_norm": 0.32128143310546875, + "learning_rate": 3.2552381946403942e-06, + "loss": 0.4085, + "step": 39624 + }, + { + "epoch": 0.7356594126754863, + "grad_norm": 0.48798471689224243, + "learning_rate": 3.2543770244301977e-06, + "loss": 0.2308, + "step": 39626 + }, + { + "epoch": 0.735696542812905, + "grad_norm": 0.345411479473114, + "learning_rate": 3.25351594600772e-06, + "loss": 0.1833, + "step": 39628 + }, + { + "epoch": 0.7357336729503235, + "grad_norm": 0.22205598652362823, + "learning_rate": 3.2526549593846834e-06, + "loss": 0.1376, + "step": 39630 + }, + { + "epoch": 0.7357708030877422, + "grad_norm": 0.510295033454895, + "learning_rate": 3.251794064572801e-06, + "loss": 0.154, + "step": 39632 + }, + { + "epoch": 0.7358079332251609, + "grad_norm": 0.30556991696357727, + "learning_rate": 3.2509332615837886e-06, + "loss": 0.3628, + "step": 39634 + }, + { + "epoch": 0.7358450633625795, + "grad_norm": 0.38054174184799194, + "learning_rate": 3.2500725504293616e-06, + "loss": 0.154, + "step": 39636 + }, + { + "epoch": 0.7358821934999982, + "grad_norm": 0.6307283639907837, + "learning_rate": 3.249211931121227e-06, + "loss": 0.5357, + "step": 39638 + }, + { + "epoch": 0.7359193236374167, + "grad_norm": 0.3596384823322296, + "learning_rate": 3.2483514036710963e-06, + "loss": 0.1648, + "step": 39640 + }, + { + "epoch": 0.7359564537748354, + "grad_norm": 0.3554809093475342, + "learning_rate": 3.2474909680906784e-06, + "loss": 0.3361, + "step": 39642 + }, + { + "epoch": 0.7359935839122541, + "grad_norm": 0.29304617643356323, + "learning_rate": 3.2466306243916834e-06, + "loss": 0.2129, + "step": 39644 + }, + { + "epoch": 0.7360307140496727, + "grad_norm": 0.47748661041259766, + "learning_rate": 3.2457703725858203e-06, + "loss": 0.2595, + "step": 39646 + }, + { + "epoch": 0.7360678441870914, + "grad_norm": 0.4446309506893158, + "learning_rate": 3.244910212684791e-06, + "loss": 0.5336, + "step": 39648 + }, + { + "epoch": 0.7361049743245099, + "grad_norm": 0.49341025948524475, + "learning_rate": 3.244050144700296e-06, + "loss": 0.2737, + "step": 39650 + }, + { + "epoch": 0.7361421044619286, + "grad_norm": 0.1671096533536911, + "learning_rate": 3.2431901686440424e-06, + "loss": 0.1101, + "step": 39652 + }, + { + "epoch": 0.7361792345993472, + "grad_norm": 0.42571279406547546, + "learning_rate": 3.2423302845277316e-06, + "loss": 0.3083, + "step": 39654 + }, + { + "epoch": 0.7362163647367659, + "grad_norm": 0.4584483802318573, + "learning_rate": 3.2414704923630645e-06, + "loss": 0.1336, + "step": 39656 + }, + { + "epoch": 0.7362534948741846, + "grad_norm": 0.36921730637550354, + "learning_rate": 3.240610792161739e-06, + "loss": 0.3621, + "step": 39658 + }, + { + "epoch": 0.7362906250116031, + "grad_norm": 0.49731460213661194, + "learning_rate": 3.2397511839354555e-06, + "loss": 0.1976, + "step": 39660 + }, + { + "epoch": 0.7363277551490218, + "grad_norm": 0.3102233111858368, + "learning_rate": 3.2388916676959117e-06, + "loss": 0.2801, + "step": 39662 + }, + { + "epoch": 0.7363648852864404, + "grad_norm": 0.43967992067337036, + "learning_rate": 3.238032243454796e-06, + "loss": 0.1177, + "step": 39664 + }, + { + "epoch": 0.7364020154238591, + "grad_norm": 0.39288806915283203, + "learning_rate": 3.2371729112238083e-06, + "loss": 0.1723, + "step": 39666 + }, + { + "epoch": 0.7364391455612778, + "grad_norm": 0.3797621428966522, + "learning_rate": 3.2363136710146426e-06, + "loss": 0.2175, + "step": 39668 + }, + { + "epoch": 0.7364762756986963, + "grad_norm": 0.3723621666431427, + "learning_rate": 3.2354545228389855e-06, + "loss": 0.1907, + "step": 39670 + }, + { + "epoch": 0.736513405836115, + "grad_norm": 0.4345323145389557, + "learning_rate": 3.23459546670853e-06, + "loss": 0.325, + "step": 39672 + }, + { + "epoch": 0.7365505359735336, + "grad_norm": 0.4412917196750641, + "learning_rate": 3.2337365026349686e-06, + "loss": 0.4544, + "step": 39674 + }, + { + "epoch": 0.7365876661109523, + "grad_norm": 0.48725977540016174, + "learning_rate": 3.2328776306299837e-06, + "loss": 0.157, + "step": 39676 + }, + { + "epoch": 0.736624796248371, + "grad_norm": 0.4334671199321747, + "learning_rate": 3.2320188507052643e-06, + "loss": 0.2227, + "step": 39678 + }, + { + "epoch": 0.7366619263857895, + "grad_norm": 0.43429210782051086, + "learning_rate": 3.2311601628724954e-06, + "loss": 0.2566, + "step": 39680 + }, + { + "epoch": 0.7366990565232082, + "grad_norm": 0.2580873966217041, + "learning_rate": 3.2303015671433613e-06, + "loss": 0.1283, + "step": 39682 + }, + { + "epoch": 0.7367361866606268, + "grad_norm": 0.5126158595085144, + "learning_rate": 3.229443063529546e-06, + "loss": 0.356, + "step": 39684 + }, + { + "epoch": 0.7367733167980455, + "grad_norm": 0.3800863027572632, + "learning_rate": 3.2285846520427334e-06, + "loss": 0.2777, + "step": 39686 + }, + { + "epoch": 0.7368104469354642, + "grad_norm": 0.31357845664024353, + "learning_rate": 3.2277263326945973e-06, + "loss": 0.2567, + "step": 39688 + }, + { + "epoch": 0.7368475770728827, + "grad_norm": 0.3310134708881378, + "learning_rate": 3.2268681054968244e-06, + "loss": 0.0345, + "step": 39690 + }, + { + "epoch": 0.7368847072103014, + "grad_norm": 0.42560744285583496, + "learning_rate": 3.2260099704610846e-06, + "loss": 0.1788, + "step": 39692 + }, + { + "epoch": 0.73692183734772, + "grad_norm": 0.43040984869003296, + "learning_rate": 3.2251519275990597e-06, + "loss": 0.3009, + "step": 39694 + }, + { + "epoch": 0.7369589674851387, + "grad_norm": 0.5072306394577026, + "learning_rate": 3.2242939769224222e-06, + "loss": 0.3337, + "step": 39696 + }, + { + "epoch": 0.7369960976225574, + "grad_norm": 0.3687780797481537, + "learning_rate": 3.22343611844285e-06, + "loss": 0.6503, + "step": 39698 + }, + { + "epoch": 0.7370332277599759, + "grad_norm": 0.5077753067016602, + "learning_rate": 3.222578352172017e-06, + "loss": 0.4813, + "step": 39700 + }, + { + "epoch": 0.7370703578973946, + "grad_norm": 0.27504462003707886, + "learning_rate": 3.221720678121588e-06, + "loss": 0.1662, + "step": 39702 + }, + { + "epoch": 0.7371074880348132, + "grad_norm": 0.6009109020233154, + "learning_rate": 3.2208630963032373e-06, + "loss": 0.3595, + "step": 39704 + }, + { + "epoch": 0.7371446181722319, + "grad_norm": 0.43973997235298157, + "learning_rate": 3.2200056067286346e-06, + "loss": 0.2319, + "step": 39706 + }, + { + "epoch": 0.7371817483096504, + "grad_norm": 0.4314044713973999, + "learning_rate": 3.2191482094094507e-06, + "loss": 0.19, + "step": 39708 + }, + { + "epoch": 0.7372188784470691, + "grad_norm": 0.3657730221748352, + "learning_rate": 3.218290904357344e-06, + "loss": 0.1613, + "step": 39710 + }, + { + "epoch": 0.7372560085844878, + "grad_norm": 0.8209778070449829, + "learning_rate": 3.217433691583989e-06, + "loss": 0.4598, + "step": 39712 + }, + { + "epoch": 0.7372931387219064, + "grad_norm": 0.19347453117370605, + "learning_rate": 3.2165765711010422e-06, + "loss": 0.2209, + "step": 39714 + }, + { + "epoch": 0.7373302688593251, + "grad_norm": 0.5446634888648987, + "learning_rate": 3.2157195429201682e-06, + "loss": 0.4326, + "step": 39716 + }, + { + "epoch": 0.7373673989967436, + "grad_norm": 0.6134762763977051, + "learning_rate": 3.214862607053031e-06, + "loss": 0.392, + "step": 39718 + }, + { + "epoch": 0.7374045291341623, + "grad_norm": 0.5886852145195007, + "learning_rate": 3.2140057635112897e-06, + "loss": 0.3081, + "step": 39720 + }, + { + "epoch": 0.737441659271581, + "grad_norm": 0.2883561849594116, + "learning_rate": 3.213149012306603e-06, + "loss": 0.2047, + "step": 39722 + }, + { + "epoch": 0.7374787894089996, + "grad_norm": 0.4255123436450958, + "learning_rate": 3.2122923534506345e-06, + "loss": 0.1911, + "step": 39724 + }, + { + "epoch": 0.7375159195464183, + "grad_norm": 0.349149614572525, + "learning_rate": 3.2114357869550304e-06, + "loss": 0.2862, + "step": 39726 + }, + { + "epoch": 0.7375530496838368, + "grad_norm": 0.3913489282131195, + "learning_rate": 3.210579312831451e-06, + "loss": 0.3193, + "step": 39728 + }, + { + "epoch": 0.7375901798212555, + "grad_norm": 0.3857501745223999, + "learning_rate": 3.2097229310915543e-06, + "loss": 0.2556, + "step": 39730 + }, + { + "epoch": 0.7376273099586742, + "grad_norm": 0.4340142607688904, + "learning_rate": 3.208866641746986e-06, + "loss": 0.3038, + "step": 39732 + }, + { + "epoch": 0.7376644400960928, + "grad_norm": 0.4785262942314148, + "learning_rate": 3.2080104448094016e-06, + "loss": 0.3301, + "step": 39734 + }, + { + "epoch": 0.7377015702335115, + "grad_norm": 0.42924487590789795, + "learning_rate": 3.2071543402904505e-06, + "loss": 0.2738, + "step": 39736 + }, + { + "epoch": 0.73773870037093, + "grad_norm": 0.37452811002731323, + "learning_rate": 3.2062983282017844e-06, + "loss": 0.4482, + "step": 39738 + }, + { + "epoch": 0.7377758305083487, + "grad_norm": 0.7111393809318542, + "learning_rate": 3.205442408555045e-06, + "loss": 0.2099, + "step": 39740 + }, + { + "epoch": 0.7378129606457674, + "grad_norm": 0.4545170068740845, + "learning_rate": 3.2045865813618835e-06, + "loss": 0.3603, + "step": 39742 + }, + { + "epoch": 0.737850090783186, + "grad_norm": 0.4308466911315918, + "learning_rate": 3.2037308466339434e-06, + "loss": 0.5005, + "step": 39744 + }, + { + "epoch": 0.7378872209206047, + "grad_norm": 0.3586469292640686, + "learning_rate": 3.202875204382869e-06, + "loss": 0.1148, + "step": 39746 + }, + { + "epoch": 0.7379243510580232, + "grad_norm": 0.4995114505290985, + "learning_rate": 3.202019654620304e-06, + "loss": 0.1833, + "step": 39748 + }, + { + "epoch": 0.7379614811954419, + "grad_norm": 0.4337833821773529, + "learning_rate": 3.201164197357892e-06, + "loss": 0.4176, + "step": 39750 + }, + { + "epoch": 0.7379986113328605, + "grad_norm": 0.4128907024860382, + "learning_rate": 3.2003088326072675e-06, + "loss": 0.2835, + "step": 39752 + }, + { + "epoch": 0.7380357414702792, + "grad_norm": 0.4682595133781433, + "learning_rate": 3.199453560380075e-06, + "loss": 0.2604, + "step": 39754 + }, + { + "epoch": 0.7380728716076979, + "grad_norm": 0.3407163619995117, + "learning_rate": 3.1985983806879473e-06, + "loss": 0.3679, + "step": 39756 + }, + { + "epoch": 0.7381100017451164, + "grad_norm": 0.6640310883522034, + "learning_rate": 3.197743293542522e-06, + "loss": 0.3785, + "step": 39758 + }, + { + "epoch": 0.7381471318825351, + "grad_norm": 0.3402019143104553, + "learning_rate": 3.196888298955436e-06, + "loss": 0.1933, + "step": 39760 + }, + { + "epoch": 0.7381842620199537, + "grad_norm": 0.617081344127655, + "learning_rate": 3.196033396938325e-06, + "loss": 0.3261, + "step": 39762 + }, + { + "epoch": 0.7382213921573724, + "grad_norm": 0.44483864307403564, + "learning_rate": 3.1951785875028172e-06, + "loss": 0.2887, + "step": 39764 + }, + { + "epoch": 0.738258522294791, + "grad_norm": 0.3634507358074188, + "learning_rate": 3.194323870660545e-06, + "loss": 0.3555, + "step": 39766 + }, + { + "epoch": 0.7382956524322096, + "grad_norm": 0.39583802223205566, + "learning_rate": 3.19346924642314e-06, + "loss": 0.388, + "step": 39768 + }, + { + "epoch": 0.7383327825696283, + "grad_norm": 0.4209096431732178, + "learning_rate": 3.192614714802231e-06, + "loss": 0.191, + "step": 39770 + }, + { + "epoch": 0.7383699127070469, + "grad_norm": 0.5071036219596863, + "learning_rate": 3.191760275809448e-06, + "loss": 0.369, + "step": 39772 + }, + { + "epoch": 0.7384070428444656, + "grad_norm": 0.5053966045379639, + "learning_rate": 3.1909059294564115e-06, + "loss": 0.1357, + "step": 39774 + }, + { + "epoch": 0.7384441729818843, + "grad_norm": 0.34127503633499146, + "learning_rate": 3.190051675754753e-06, + "loss": 0.1335, + "step": 39776 + }, + { + "epoch": 0.7384813031193028, + "grad_norm": 0.3356647491455078, + "learning_rate": 3.1891975147160893e-06, + "loss": 0.3487, + "step": 39778 + }, + { + "epoch": 0.7385184332567215, + "grad_norm": 0.5177041888237, + "learning_rate": 3.188343446352048e-06, + "loss": 0.45, + "step": 39780 + }, + { + "epoch": 0.7385555633941401, + "grad_norm": 0.33546823263168335, + "learning_rate": 3.1874894706742476e-06, + "loss": 0.2361, + "step": 39782 + }, + { + "epoch": 0.7385926935315588, + "grad_norm": 0.24496221542358398, + "learning_rate": 3.18663558769431e-06, + "loss": 0.1783, + "step": 39784 + }, + { + "epoch": 0.7386298236689774, + "grad_norm": 0.19015000760555267, + "learning_rate": 3.185781797423855e-06, + "loss": 0.2165, + "step": 39786 + }, + { + "epoch": 0.738666953806396, + "grad_norm": 0.6021552681922913, + "learning_rate": 3.1849280998745003e-06, + "loss": 0.3359, + "step": 39788 + }, + { + "epoch": 0.7387040839438147, + "grad_norm": 0.3418736457824707, + "learning_rate": 3.1840744950578583e-06, + "loss": 0.2606, + "step": 39790 + }, + { + "epoch": 0.7387412140812333, + "grad_norm": 0.412597119808197, + "learning_rate": 3.1832209829855465e-06, + "loss": 0.2359, + "step": 39792 + }, + { + "epoch": 0.738778344218652, + "grad_norm": 0.6657752990722656, + "learning_rate": 3.182367563669181e-06, + "loss": 0.2669, + "step": 39794 + }, + { + "epoch": 0.7388154743560706, + "grad_norm": 0.6148823499679565, + "learning_rate": 3.1815142371203687e-06, + "loss": 0.2966, + "step": 39796 + }, + { + "epoch": 0.7388526044934892, + "grad_norm": 0.4595191776752472, + "learning_rate": 3.1806610033507246e-06, + "loss": 0.2413, + "step": 39798 + }, + { + "epoch": 0.7388897346309079, + "grad_norm": 0.23713254928588867, + "learning_rate": 3.179807862371861e-06, + "loss": 0.2934, + "step": 39800 + }, + { + "epoch": 0.7389268647683265, + "grad_norm": 0.4516656994819641, + "learning_rate": 3.1789548141953798e-06, + "loss": 0.0727, + "step": 39802 + }, + { + "epoch": 0.7389639949057452, + "grad_norm": 0.42876461148262024, + "learning_rate": 3.178101858832893e-06, + "loss": 0.2759, + "step": 39804 + }, + { + "epoch": 0.7390011250431637, + "grad_norm": 0.5186970829963684, + "learning_rate": 3.177248996296005e-06, + "loss": 0.0909, + "step": 39806 + }, + { + "epoch": 0.7390382551805824, + "grad_norm": 0.16999712586402893, + "learning_rate": 3.176396226596322e-06, + "loss": 0.2335, + "step": 39808 + }, + { + "epoch": 0.7390753853180011, + "grad_norm": 0.4646111726760864, + "learning_rate": 3.1755435497454477e-06, + "loss": 0.4137, + "step": 39810 + }, + { + "epoch": 0.7391125154554197, + "grad_norm": 0.3292817771434784, + "learning_rate": 3.174690965754987e-06, + "loss": 0.4099, + "step": 39812 + }, + { + "epoch": 0.7391496455928384, + "grad_norm": 0.5161586403846741, + "learning_rate": 3.1738384746365347e-06, + "loss": 0.4954, + "step": 39814 + }, + { + "epoch": 0.7391867757302569, + "grad_norm": 0.3667740225791931, + "learning_rate": 3.172986076401697e-06, + "loss": 0.174, + "step": 39816 + }, + { + "epoch": 0.7392239058676756, + "grad_norm": 0.4248465597629547, + "learning_rate": 3.172133771062067e-06, + "loss": 0.3805, + "step": 39818 + }, + { + "epoch": 0.7392610360050943, + "grad_norm": 0.3711097240447998, + "learning_rate": 3.1712815586292445e-06, + "loss": 0.2976, + "step": 39820 + }, + { + "epoch": 0.7392981661425129, + "grad_norm": 0.28385937213897705, + "learning_rate": 3.170429439114825e-06, + "loss": 0.1331, + "step": 39822 + }, + { + "epoch": 0.7393352962799316, + "grad_norm": 0.4015321731567383, + "learning_rate": 3.1695774125304047e-06, + "loss": 0.2598, + "step": 39824 + }, + { + "epoch": 0.7393724264173501, + "grad_norm": 0.36594337224960327, + "learning_rate": 3.168725478887579e-06, + "loss": 0.235, + "step": 39826 + }, + { + "epoch": 0.7394095565547688, + "grad_norm": 0.40637338161468506, + "learning_rate": 3.1678736381979347e-06, + "loss": 0.3429, + "step": 39828 + }, + { + "epoch": 0.7394466866921875, + "grad_norm": 0.369426429271698, + "learning_rate": 3.1670218904730653e-06, + "loss": 0.2319, + "step": 39830 + }, + { + "epoch": 0.7394838168296061, + "grad_norm": 0.35711997747421265, + "learning_rate": 3.1661702357245604e-06, + "loss": 0.2706, + "step": 39832 + }, + { + "epoch": 0.7395209469670248, + "grad_norm": 0.4420630931854248, + "learning_rate": 3.1653186739640098e-06, + "loss": 0.3401, + "step": 39834 + }, + { + "epoch": 0.7395580771044433, + "grad_norm": 0.387168824672699, + "learning_rate": 3.1644672052030023e-06, + "loss": 0.3225, + "step": 39836 + }, + { + "epoch": 0.739595207241862, + "grad_norm": 0.4297964572906494, + "learning_rate": 3.1636158294531218e-06, + "loss": 0.3028, + "step": 39838 + }, + { + "epoch": 0.7396323373792807, + "grad_norm": 0.33744072914123535, + "learning_rate": 3.1627645467259495e-06, + "loss": 0.2597, + "step": 39840 + }, + { + "epoch": 0.7396694675166993, + "grad_norm": 0.30913621187210083, + "learning_rate": 3.1619133570330705e-06, + "loss": 0.2664, + "step": 39842 + }, + { + "epoch": 0.739706597654118, + "grad_norm": 0.5663681626319885, + "learning_rate": 3.1610622603860696e-06, + "loss": 0.2754, + "step": 39844 + }, + { + "epoch": 0.7397437277915365, + "grad_norm": 0.4782780706882477, + "learning_rate": 3.160211256796526e-06, + "loss": 0.3141, + "step": 39846 + }, + { + "epoch": 0.7397808579289552, + "grad_norm": 0.3798877000808716, + "learning_rate": 3.1593603462760202e-06, + "loss": 0.3105, + "step": 39848 + }, + { + "epoch": 0.7398179880663739, + "grad_norm": 0.21026486158370972, + "learning_rate": 3.1585095288361322e-06, + "loss": 0.2449, + "step": 39850 + }, + { + "epoch": 0.7398551182037925, + "grad_norm": 0.7013682723045349, + "learning_rate": 3.157658804488434e-06, + "loss": 0.3149, + "step": 39852 + }, + { + "epoch": 0.7398922483412111, + "grad_norm": 0.3784891664981842, + "learning_rate": 3.1568081732445044e-06, + "loss": 0.1808, + "step": 39854 + }, + { + "epoch": 0.7399293784786297, + "grad_norm": 0.4772290885448456, + "learning_rate": 3.1559576351159183e-06, + "loss": 0.3087, + "step": 39856 + }, + { + "epoch": 0.7399665086160484, + "grad_norm": 0.2988712191581726, + "learning_rate": 3.1551071901142505e-06, + "loss": 0.1484, + "step": 39858 + }, + { + "epoch": 0.740003638753467, + "grad_norm": 0.5337780117988586, + "learning_rate": 3.1542568382510686e-06, + "loss": 0.4094, + "step": 39860 + }, + { + "epoch": 0.7400407688908857, + "grad_norm": 0.30269649624824524, + "learning_rate": 3.1534065795379464e-06, + "loss": 0.2871, + "step": 39862 + }, + { + "epoch": 0.7400778990283043, + "grad_norm": 0.4526297152042389, + "learning_rate": 3.1525564139864553e-06, + "loss": 0.2711, + "step": 39864 + }, + { + "epoch": 0.7401150291657229, + "grad_norm": 0.3100440502166748, + "learning_rate": 3.1517063416081572e-06, + "loss": 0.4672, + "step": 39866 + }, + { + "epoch": 0.7401521593031416, + "grad_norm": 0.37063223123550415, + "learning_rate": 3.1508563624146227e-06, + "loss": 0.2753, + "step": 39868 + }, + { + "epoch": 0.7401892894405602, + "grad_norm": 0.7800668478012085, + "learning_rate": 3.150006476417419e-06, + "loss": 0.1789, + "step": 39870 + }, + { + "epoch": 0.7402264195779789, + "grad_norm": 0.5451666116714478, + "learning_rate": 3.1491566836281075e-06, + "loss": 0.3022, + "step": 39872 + }, + { + "epoch": 0.7402635497153975, + "grad_norm": 0.3655959665775299, + "learning_rate": 3.1483069840582538e-06, + "loss": 0.3683, + "step": 39874 + }, + { + "epoch": 0.7403006798528161, + "grad_norm": 0.378343403339386, + "learning_rate": 3.1474573777194217e-06, + "loss": 0.1686, + "step": 39876 + }, + { + "epoch": 0.7403378099902348, + "grad_norm": 0.3279966115951538, + "learning_rate": 3.1466078646231658e-06, + "loss": 0.2033, + "step": 39878 + }, + { + "epoch": 0.7403749401276534, + "grad_norm": 0.3416300117969513, + "learning_rate": 3.1457584447810517e-06, + "loss": 0.327, + "step": 39880 + }, + { + "epoch": 0.740412070265072, + "grad_norm": 0.2918615937232971, + "learning_rate": 3.144909118204631e-06, + "loss": 0.2836, + "step": 39882 + }, + { + "epoch": 0.7404492004024907, + "grad_norm": 0.34912699460983276, + "learning_rate": 3.144059884905465e-06, + "loss": 0.4867, + "step": 39884 + }, + { + "epoch": 0.7404863305399093, + "grad_norm": 0.3318246901035309, + "learning_rate": 3.1432107448951066e-06, + "loss": 0.1362, + "step": 39886 + }, + { + "epoch": 0.740523460677328, + "grad_norm": 0.4059092700481415, + "learning_rate": 3.1423616981851148e-06, + "loss": 0.2818, + "step": 39888 + }, + { + "epoch": 0.7405605908147466, + "grad_norm": 0.43322592973709106, + "learning_rate": 3.1415127447870363e-06, + "loss": 0.2981, + "step": 39890 + }, + { + "epoch": 0.7405977209521653, + "grad_norm": 0.43988093733787537, + "learning_rate": 3.140663884712426e-06, + "loss": 0.1899, + "step": 39892 + }, + { + "epoch": 0.7406348510895839, + "grad_norm": 0.3788839876651764, + "learning_rate": 3.1398151179728344e-06, + "loss": 0.2817, + "step": 39894 + }, + { + "epoch": 0.7406719812270025, + "grad_norm": 0.4490831196308136, + "learning_rate": 3.1389664445798107e-06, + "loss": 0.2957, + "step": 39896 + }, + { + "epoch": 0.7407091113644212, + "grad_norm": 0.3374093472957611, + "learning_rate": 3.138117864544902e-06, + "loss": 0.3962, + "step": 39898 + }, + { + "epoch": 0.7407462415018398, + "grad_norm": 0.4968375265598297, + "learning_rate": 3.1372693778796583e-06, + "loss": 0.3103, + "step": 39900 + }, + { + "epoch": 0.7407833716392584, + "grad_norm": 0.4294300377368927, + "learning_rate": 3.136420984595623e-06, + "loss": 0.4285, + "step": 39902 + }, + { + "epoch": 0.740820501776677, + "grad_norm": 0.472463995218277, + "learning_rate": 3.1355726847043343e-06, + "loss": 0.1948, + "step": 39904 + }, + { + "epoch": 0.7408576319140957, + "grad_norm": 0.37947896122932434, + "learning_rate": 3.1347244782173413e-06, + "loss": 0.2748, + "step": 39906 + }, + { + "epoch": 0.7408947620515144, + "grad_norm": 0.3646430969238281, + "learning_rate": 3.1338763651461846e-06, + "loss": 0.3316, + "step": 39908 + }, + { + "epoch": 0.740931892188933, + "grad_norm": 0.35680311918258667, + "learning_rate": 3.1330283455024037e-06, + "loss": 0.2169, + "step": 39910 + }, + { + "epoch": 0.7409690223263516, + "grad_norm": 0.4057343602180481, + "learning_rate": 3.132180419297538e-06, + "loss": 0.5018, + "step": 39912 + }, + { + "epoch": 0.7410061524637702, + "grad_norm": 0.29532474279403687, + "learning_rate": 3.1313325865431288e-06, + "loss": 0.1944, + "step": 39914 + }, + { + "epoch": 0.7410432826011889, + "grad_norm": 0.3516652286052704, + "learning_rate": 3.1304848472507054e-06, + "loss": 0.3425, + "step": 39916 + }, + { + "epoch": 0.7410804127386076, + "grad_norm": 0.6633748412132263, + "learning_rate": 3.1296372014318066e-06, + "loss": 0.2972, + "step": 39918 + }, + { + "epoch": 0.7411175428760262, + "grad_norm": 0.47135451436042786, + "learning_rate": 3.1287896490979675e-06, + "loss": 0.2095, + "step": 39920 + }, + { + "epoch": 0.7411546730134448, + "grad_norm": 0.4451085925102234, + "learning_rate": 3.127942190260722e-06, + "loss": 0.2657, + "step": 39922 + }, + { + "epoch": 0.7411918031508634, + "grad_norm": 0.39433467388153076, + "learning_rate": 3.1270948249315956e-06, + "loss": 0.3109, + "step": 39924 + }, + { + "epoch": 0.7412289332882821, + "grad_norm": 0.30415859818458557, + "learning_rate": 3.1262475531221246e-06, + "loss": 0.2722, + "step": 39926 + }, + { + "epoch": 0.7412660634257008, + "grad_norm": 0.31332701444625854, + "learning_rate": 3.1254003748438333e-06, + "loss": 0.1085, + "step": 39928 + }, + { + "epoch": 0.7413031935631194, + "grad_norm": 0.285642147064209, + "learning_rate": 3.124553290108251e-06, + "loss": 0.4008, + "step": 39930 + }, + { + "epoch": 0.741340323700538, + "grad_norm": 0.41343799233436584, + "learning_rate": 3.123706298926903e-06, + "loss": 0.4399, + "step": 39932 + }, + { + "epoch": 0.7413774538379566, + "grad_norm": 0.3357764482498169, + "learning_rate": 3.1228594013113157e-06, + "loss": 0.3903, + "step": 39934 + }, + { + "epoch": 0.7414145839753753, + "grad_norm": 0.4706973731517792, + "learning_rate": 3.122012597273013e-06, + "loss": 0.2892, + "step": 39936 + }, + { + "epoch": 0.741451714112794, + "grad_norm": 0.23890571296215057, + "learning_rate": 3.1211658868235206e-06, + "loss": 0.1868, + "step": 39938 + }, + { + "epoch": 0.7414888442502126, + "grad_norm": 0.3745531439781189, + "learning_rate": 3.1203192699743512e-06, + "loss": 0.1773, + "step": 39940 + }, + { + "epoch": 0.7415259743876312, + "grad_norm": 0.36497291922569275, + "learning_rate": 3.1194727467370313e-06, + "loss": 0.1237, + "step": 39942 + }, + { + "epoch": 0.7415631045250498, + "grad_norm": 0.5514853596687317, + "learning_rate": 3.118626317123079e-06, + "loss": 0.2574, + "step": 39944 + }, + { + "epoch": 0.7416002346624685, + "grad_norm": 0.49003151059150696, + "learning_rate": 3.117779981144009e-06, + "loss": 0.2311, + "step": 39946 + }, + { + "epoch": 0.7416373647998872, + "grad_norm": 0.25451457500457764, + "learning_rate": 3.1169337388113372e-06, + "loss": 0.2101, + "step": 39948 + }, + { + "epoch": 0.7416744949373058, + "grad_norm": 0.5134959816932678, + "learning_rate": 3.1160875901365817e-06, + "loss": 0.3662, + "step": 39950 + }, + { + "epoch": 0.7417116250747244, + "grad_norm": 0.31824877858161926, + "learning_rate": 3.115241535131257e-06, + "loss": 0.3163, + "step": 39952 + }, + { + "epoch": 0.741748755212143, + "grad_norm": 0.4710465967655182, + "learning_rate": 3.114395573806869e-06, + "loss": 0.3642, + "step": 39954 + }, + { + "epoch": 0.7417858853495617, + "grad_norm": 0.45654985308647156, + "learning_rate": 3.113549706174933e-06, + "loss": 0.2995, + "step": 39956 + }, + { + "epoch": 0.7418230154869803, + "grad_norm": 0.5098035335540771, + "learning_rate": 3.1127039322469577e-06, + "loss": 0.1788, + "step": 39958 + }, + { + "epoch": 0.741860145624399, + "grad_norm": 0.3298649489879608, + "learning_rate": 3.111858252034452e-06, + "loss": 0.4088, + "step": 39960 + }, + { + "epoch": 0.7418972757618176, + "grad_norm": 0.48691198229789734, + "learning_rate": 3.111012665548927e-06, + "loss": 0.2603, + "step": 39962 + }, + { + "epoch": 0.7419344058992362, + "grad_norm": 0.22462467849254608, + "learning_rate": 3.1101671728018812e-06, + "loss": 0.209, + "step": 39964 + }, + { + "epoch": 0.7419715360366549, + "grad_norm": 0.38143405318260193, + "learning_rate": 3.1093217738048263e-06, + "loss": 0.2922, + "step": 39966 + }, + { + "epoch": 0.7420086661740735, + "grad_norm": 0.34232017397880554, + "learning_rate": 3.108476468569258e-06, + "loss": 0.3315, + "step": 39968 + }, + { + "epoch": 0.7420457963114921, + "grad_norm": 0.3852739632129669, + "learning_rate": 3.1076312571066837e-06, + "loss": 0.2004, + "step": 39970 + }, + { + "epoch": 0.7420829264489108, + "grad_norm": 0.489287406206131, + "learning_rate": 3.1067861394286015e-06, + "loss": 0.2068, + "step": 39972 + }, + { + "epoch": 0.7421200565863294, + "grad_norm": 0.43663865327835083, + "learning_rate": 3.1059411155465137e-06, + "loss": 0.235, + "step": 39974 + }, + { + "epoch": 0.7421571867237481, + "grad_norm": 0.3968579173088074, + "learning_rate": 3.10509618547192e-06, + "loss": 0.1015, + "step": 39976 + }, + { + "epoch": 0.7421943168611667, + "grad_norm": 0.40655627846717834, + "learning_rate": 3.1042513492163107e-06, + "loss": 0.2812, + "step": 39978 + }, + { + "epoch": 0.7422314469985853, + "grad_norm": 0.2259611189365387, + "learning_rate": 3.103406606791186e-06, + "loss": 0.3766, + "step": 39980 + }, + { + "epoch": 0.742268577136004, + "grad_norm": 0.4808831512928009, + "learning_rate": 3.10256195820804e-06, + "loss": 0.2848, + "step": 39982 + }, + { + "epoch": 0.7423057072734226, + "grad_norm": 0.3178793489933014, + "learning_rate": 3.1017174034783682e-06, + "loss": 0.1367, + "step": 39984 + }, + { + "epoch": 0.7423428374108413, + "grad_norm": 0.5424390435218811, + "learning_rate": 3.1008729426136564e-06, + "loss": 0.2466, + "step": 39986 + }, + { + "epoch": 0.7423799675482599, + "grad_norm": 0.4486656188964844, + "learning_rate": 3.100028575625399e-06, + "loss": 0.2471, + "step": 39988 + }, + { + "epoch": 0.7424170976856785, + "grad_norm": 0.46225687861442566, + "learning_rate": 3.099184302525088e-06, + "loss": 0.2324, + "step": 39990 + }, + { + "epoch": 0.7424542278230972, + "grad_norm": 0.3951317071914673, + "learning_rate": 3.098340123324205e-06, + "loss": 0.1971, + "step": 39992 + }, + { + "epoch": 0.7424913579605158, + "grad_norm": 0.4322395622730255, + "learning_rate": 3.09749603803424e-06, + "loss": 0.2646, + "step": 39994 + }, + { + "epoch": 0.7425284880979345, + "grad_norm": 0.30992117524147034, + "learning_rate": 3.0966520466666796e-06, + "loss": 0.2623, + "step": 39996 + }, + { + "epoch": 0.742565618235353, + "grad_norm": 0.4846774637699127, + "learning_rate": 3.095808149233005e-06, + "loss": 0.3134, + "step": 39998 + }, + { + "epoch": 0.7426027483727717, + "grad_norm": 0.30779051780700684, + "learning_rate": 3.0949643457447022e-06, + "loss": 0.2868, + "step": 40000 + }, + { + "epoch": 0.7426398785101904, + "grad_norm": 0.27895352244377136, + "learning_rate": 3.0941206362132546e-06, + "loss": 0.2276, + "step": 40002 + }, + { + "epoch": 0.742677008647609, + "grad_norm": 0.37713536620140076, + "learning_rate": 3.0932770206501373e-06, + "loss": 0.4282, + "step": 40004 + }, + { + "epoch": 0.7427141387850277, + "grad_norm": 0.5171748995780945, + "learning_rate": 3.0924334990668304e-06, + "loss": 0.2221, + "step": 40006 + }, + { + "epoch": 0.7427512689224463, + "grad_norm": 0.427176296710968, + "learning_rate": 3.091590071474817e-06, + "loss": 0.24, + "step": 40008 + }, + { + "epoch": 0.7427883990598649, + "grad_norm": 0.40278199315071106, + "learning_rate": 3.090746737885566e-06, + "loss": 0.2462, + "step": 40010 + }, + { + "epoch": 0.7428255291972835, + "grad_norm": 0.4706867039203644, + "learning_rate": 3.089903498310557e-06, + "loss": 0.1171, + "step": 40012 + }, + { + "epoch": 0.7428626593347022, + "grad_norm": 0.39051365852355957, + "learning_rate": 3.089060352761266e-06, + "loss": 0.1655, + "step": 40014 + }, + { + "epoch": 0.7428997894721209, + "grad_norm": 0.6257150769233704, + "learning_rate": 3.088217301249159e-06, + "loss": 0.3879, + "step": 40016 + }, + { + "epoch": 0.7429369196095394, + "grad_norm": 0.30941474437713623, + "learning_rate": 3.087374343785713e-06, + "loss": 0.3056, + "step": 40018 + }, + { + "epoch": 0.7429740497469581, + "grad_norm": 0.4920826554298401, + "learning_rate": 3.0865314803823954e-06, + "loss": 0.3418, + "step": 40020 + }, + { + "epoch": 0.7430111798843767, + "grad_norm": 0.47558194398880005, + "learning_rate": 3.085688711050676e-06, + "loss": 0.274, + "step": 40022 + }, + { + "epoch": 0.7430483100217954, + "grad_norm": 0.35618728399276733, + "learning_rate": 3.084846035802023e-06, + "loss": 0.2482, + "step": 40024 + }, + { + "epoch": 0.7430854401592141, + "grad_norm": 0.4760759770870209, + "learning_rate": 3.084003454647905e-06, + "loss": 0.3701, + "step": 40026 + }, + { + "epoch": 0.7431225702966326, + "grad_norm": 0.3827405869960785, + "learning_rate": 3.083160967599781e-06, + "loss": 0.273, + "step": 40028 + }, + { + "epoch": 0.7431597004340513, + "grad_norm": 0.3624255359172821, + "learning_rate": 3.082318574669121e-06, + "loss": 0.348, + "step": 40030 + }, + { + "epoch": 0.7431968305714699, + "grad_norm": 0.2995072305202484, + "learning_rate": 3.081476275867381e-06, + "loss": 0.2827, + "step": 40032 + }, + { + "epoch": 0.7432339607088886, + "grad_norm": 0.24099791049957275, + "learning_rate": 3.0806340712060258e-06, + "loss": 0.2786, + "step": 40034 + }, + { + "epoch": 0.7432710908463073, + "grad_norm": 0.3649945557117462, + "learning_rate": 3.0797919606965154e-06, + "loss": 0.4394, + "step": 40036 + }, + { + "epoch": 0.7433082209837258, + "grad_norm": 0.406194806098938, + "learning_rate": 3.078949944350308e-06, + "loss": 0.282, + "step": 40038 + }, + { + "epoch": 0.7433453511211445, + "grad_norm": 0.38495367765426636, + "learning_rate": 3.0781080221788628e-06, + "loss": 0.3534, + "step": 40040 + }, + { + "epoch": 0.7433824812585631, + "grad_norm": 0.580451250076294, + "learning_rate": 3.0772661941936323e-06, + "loss": 0.2458, + "step": 40042 + }, + { + "epoch": 0.7434196113959818, + "grad_norm": 0.5322532057762146, + "learning_rate": 3.0764244604060724e-06, + "loss": 0.1291, + "step": 40044 + }, + { + "epoch": 0.7434567415334005, + "grad_norm": 0.3556428849697113, + "learning_rate": 3.075582820827636e-06, + "loss": 0.1982, + "step": 40046 + }, + { + "epoch": 0.743493871670819, + "grad_norm": 0.7268369197845459, + "learning_rate": 3.0747412754697813e-06, + "loss": 0.3917, + "step": 40048 + }, + { + "epoch": 0.7435310018082377, + "grad_norm": 0.4990501403808594, + "learning_rate": 3.07389982434395e-06, + "loss": 0.3903, + "step": 40050 + }, + { + "epoch": 0.7435681319456563, + "grad_norm": 0.25578606128692627, + "learning_rate": 3.073058467461599e-06, + "loss": 0.3038, + "step": 40052 + }, + { + "epoch": 0.743605262083075, + "grad_norm": 0.2587500512599945, + "learning_rate": 3.0722172048341713e-06, + "loss": 0.3948, + "step": 40054 + }, + { + "epoch": 0.7436423922204936, + "grad_norm": 0.30377891659736633, + "learning_rate": 3.071376036473116e-06, + "loss": 0.2219, + "step": 40056 + }, + { + "epoch": 0.7436795223579122, + "grad_norm": 0.36774876713752747, + "learning_rate": 3.0705349623898796e-06, + "loss": 0.4256, + "step": 40058 + }, + { + "epoch": 0.7437166524953309, + "grad_norm": 0.6472439765930176, + "learning_rate": 3.069693982595906e-06, + "loss": 0.263, + "step": 40060 + }, + { + "epoch": 0.7437537826327495, + "grad_norm": 0.33867040276527405, + "learning_rate": 3.0688530971026388e-06, + "loss": 0.2411, + "step": 40062 + }, + { + "epoch": 0.7437909127701682, + "grad_norm": 0.36092501878738403, + "learning_rate": 3.068012305921523e-06, + "loss": 0.1856, + "step": 40064 + }, + { + "epoch": 0.7438280429075868, + "grad_norm": 0.3139655888080597, + "learning_rate": 3.0671716090639915e-06, + "loss": 0.2603, + "step": 40066 + }, + { + "epoch": 0.7438651730450054, + "grad_norm": 0.7444578409194946, + "learning_rate": 3.0663310065414897e-06, + "loss": 0.3964, + "step": 40068 + }, + { + "epoch": 0.7439023031824241, + "grad_norm": 0.419436514377594, + "learning_rate": 3.0654904983654564e-06, + "loss": 0.4007, + "step": 40070 + }, + { + "epoch": 0.7439394333198427, + "grad_norm": 0.6879094243049622, + "learning_rate": 3.0646500845473237e-06, + "loss": 0.1479, + "step": 40072 + }, + { + "epoch": 0.7439765634572614, + "grad_norm": 0.2978479266166687, + "learning_rate": 3.063809765098529e-06, + "loss": 0.3227, + "step": 40074 + }, + { + "epoch": 0.74401369359468, + "grad_norm": 0.3856264650821686, + "learning_rate": 3.062969540030507e-06, + "loss": 0.2612, + "step": 40076 + }, + { + "epoch": 0.7440508237320986, + "grad_norm": 0.5719103217124939, + "learning_rate": 3.0621294093546948e-06, + "loss": 0.3589, + "step": 40078 + }, + { + "epoch": 0.7440879538695173, + "grad_norm": 0.38829633593559265, + "learning_rate": 3.0612893730825155e-06, + "loss": 0.1944, + "step": 40080 + }, + { + "epoch": 0.7441250840069359, + "grad_norm": 0.4330374300479889, + "learning_rate": 3.060449431225405e-06, + "loss": 0.2702, + "step": 40082 + }, + { + "epoch": 0.7441622141443546, + "grad_norm": 0.44694167375564575, + "learning_rate": 3.0596095837947914e-06, + "loss": 0.1857, + "step": 40084 + }, + { + "epoch": 0.7441993442817731, + "grad_norm": 0.29867634177207947, + "learning_rate": 3.058769830802102e-06, + "loss": 0.2242, + "step": 40086 + }, + { + "epoch": 0.7442364744191918, + "grad_norm": 0.4112626612186432, + "learning_rate": 3.057930172258764e-06, + "loss": 0.3648, + "step": 40088 + }, + { + "epoch": 0.7442736045566105, + "grad_norm": 0.33459389209747314, + "learning_rate": 3.0570906081762063e-06, + "loss": 0.2828, + "step": 40090 + }, + { + "epoch": 0.7443107346940291, + "grad_norm": 0.21339619159698486, + "learning_rate": 3.056251138565848e-06, + "loss": 0.3155, + "step": 40092 + }, + { + "epoch": 0.7443478648314478, + "grad_norm": 0.5885301828384399, + "learning_rate": 3.05541176343911e-06, + "loss": 0.2771, + "step": 40094 + }, + { + "epoch": 0.7443849949688663, + "grad_norm": 0.3786029517650604, + "learning_rate": 3.0545724828074162e-06, + "loss": 0.2223, + "step": 40096 + }, + { + "epoch": 0.744422125106285, + "grad_norm": 0.7119247317314148, + "learning_rate": 3.0537332966821874e-06, + "loss": 0.247, + "step": 40098 + }, + { + "epoch": 0.7444592552437037, + "grad_norm": 0.277366042137146, + "learning_rate": 3.0528942050748422e-06, + "loss": 0.2029, + "step": 40100 + }, + { + "epoch": 0.7444963853811223, + "grad_norm": 0.6237053275108337, + "learning_rate": 3.0520552079968e-06, + "loss": 0.2902, + "step": 40102 + }, + { + "epoch": 0.744533515518541, + "grad_norm": 0.6523361206054688, + "learning_rate": 3.0512163054594725e-06, + "loss": 0.2365, + "step": 40104 + }, + { + "epoch": 0.7445706456559595, + "grad_norm": 0.33660900592803955, + "learning_rate": 3.050377497474277e-06, + "loss": 0.252, + "step": 40106 + }, + { + "epoch": 0.7446077757933782, + "grad_norm": 0.38165482878685, + "learning_rate": 3.049538784052627e-06, + "loss": 0.2441, + "step": 40108 + }, + { + "epoch": 0.7446449059307968, + "grad_norm": 0.369779497385025, + "learning_rate": 3.0487001652059355e-06, + "loss": 0.2445, + "step": 40110 + }, + { + "epoch": 0.7446820360682155, + "grad_norm": 0.4809916317462921, + "learning_rate": 3.047861640945615e-06, + "loss": 0.2055, + "step": 40112 + }, + { + "epoch": 0.7447191662056342, + "grad_norm": 0.4940428137779236, + "learning_rate": 3.0470232112830723e-06, + "loss": 0.301, + "step": 40114 + }, + { + "epoch": 0.7447562963430527, + "grad_norm": 0.32643792033195496, + "learning_rate": 3.0461848762297195e-06, + "loss": 0.3649, + "step": 40116 + }, + { + "epoch": 0.7447934264804714, + "grad_norm": 0.5354380011558533, + "learning_rate": 3.0453466357969574e-06, + "loss": 0.2329, + "step": 40118 + }, + { + "epoch": 0.74483055661789, + "grad_norm": 0.4701504707336426, + "learning_rate": 3.0445084899961976e-06, + "loss": 0.157, + "step": 40120 + }, + { + "epoch": 0.7448676867553087, + "grad_norm": 0.4035387933254242, + "learning_rate": 3.0436704388388416e-06, + "loss": 0.3417, + "step": 40122 + }, + { + "epoch": 0.7449048168927274, + "grad_norm": 0.22432273626327515, + "learning_rate": 3.042832482336295e-06, + "loss": 0.2711, + "step": 40124 + }, + { + "epoch": 0.7449419470301459, + "grad_norm": 0.4862229824066162, + "learning_rate": 3.0419946204999597e-06, + "loss": 0.2719, + "step": 40126 + }, + { + "epoch": 0.7449790771675646, + "grad_norm": 0.2762148082256317, + "learning_rate": 3.0411568533412385e-06, + "loss": 0.1014, + "step": 40128 + }, + { + "epoch": 0.7450162073049832, + "grad_norm": 0.5028297305107117, + "learning_rate": 3.040319180871526e-06, + "loss": 0.3867, + "step": 40130 + }, + { + "epoch": 0.7450533374424019, + "grad_norm": 0.4299204349517822, + "learning_rate": 3.0394816031022214e-06, + "loss": 0.2066, + "step": 40132 + }, + { + "epoch": 0.7450904675798206, + "grad_norm": 0.4392630159854889, + "learning_rate": 3.038644120044727e-06, + "loss": 0.1771, + "step": 40134 + }, + { + "epoch": 0.7451275977172391, + "grad_norm": 0.24648761749267578, + "learning_rate": 3.037806731710431e-06, + "loss": 0.242, + "step": 40136 + }, + { + "epoch": 0.7451647278546578, + "grad_norm": 0.19979175925254822, + "learning_rate": 3.0369694381107315e-06, + "loss": 0.2478, + "step": 40138 + }, + { + "epoch": 0.7452018579920764, + "grad_norm": 0.43782052397727966, + "learning_rate": 3.0361322392570247e-06, + "loss": 0.2497, + "step": 40140 + }, + { + "epoch": 0.7452389881294951, + "grad_norm": 0.30452510714530945, + "learning_rate": 3.035295135160695e-06, + "loss": 0.1498, + "step": 40142 + }, + { + "epoch": 0.7452761182669138, + "grad_norm": 0.3463420569896698, + "learning_rate": 3.0344581258331373e-06, + "loss": 0.2236, + "step": 40144 + }, + { + "epoch": 0.7453132484043323, + "grad_norm": 0.3357095420360565, + "learning_rate": 3.03362121128574e-06, + "loss": 0.3303, + "step": 40146 + }, + { + "epoch": 0.745350378541751, + "grad_norm": 0.40374359488487244, + "learning_rate": 3.0327843915298918e-06, + "loss": 0.2101, + "step": 40148 + }, + { + "epoch": 0.7453875086791696, + "grad_norm": 0.31130921840667725, + "learning_rate": 3.0319476665769786e-06, + "loss": 0.1229, + "step": 40150 + }, + { + "epoch": 0.7454246388165883, + "grad_norm": 0.5515148639678955, + "learning_rate": 3.031111036438389e-06, + "loss": 0.1517, + "step": 40152 + }, + { + "epoch": 0.745461768954007, + "grad_norm": 0.42077669501304626, + "learning_rate": 3.030274501125501e-06, + "loss": 0.2315, + "step": 40154 + }, + { + "epoch": 0.7454988990914255, + "grad_norm": 0.44162219762802124, + "learning_rate": 3.0294380606497032e-06, + "loss": 0.3968, + "step": 40156 + }, + { + "epoch": 0.7455360292288442, + "grad_norm": 0.40335991978645325, + "learning_rate": 3.028601715022371e-06, + "loss": 0.1982, + "step": 40158 + }, + { + "epoch": 0.7455731593662628, + "grad_norm": 0.42558398842811584, + "learning_rate": 3.027765464254887e-06, + "loss": 0.1857, + "step": 40160 + }, + { + "epoch": 0.7456102895036815, + "grad_norm": 0.4341014325618744, + "learning_rate": 3.0269293083586314e-06, + "loss": 0.1832, + "step": 40162 + }, + { + "epoch": 0.7456474196411, + "grad_norm": 0.4233351945877075, + "learning_rate": 3.0260932473449824e-06, + "loss": 0.3077, + "step": 40164 + }, + { + "epoch": 0.7456845497785187, + "grad_norm": 0.4022819697856903, + "learning_rate": 3.0252572812253167e-06, + "loss": 0.2358, + "step": 40166 + }, + { + "epoch": 0.7457216799159374, + "grad_norm": 0.4743821620941162, + "learning_rate": 3.024421410011005e-06, + "loss": 0.2105, + "step": 40168 + }, + { + "epoch": 0.745758810053356, + "grad_norm": 0.36108964681625366, + "learning_rate": 3.023585633713423e-06, + "loss": 0.2758, + "step": 40170 + }, + { + "epoch": 0.7457959401907747, + "grad_norm": 0.24381716549396515, + "learning_rate": 3.022749952343944e-06, + "loss": 0.1177, + "step": 40172 + }, + { + "epoch": 0.7458330703281932, + "grad_norm": 0.34277912974357605, + "learning_rate": 3.0219143659139394e-06, + "loss": 0.3313, + "step": 40174 + }, + { + "epoch": 0.7458702004656119, + "grad_norm": 0.2952805161476135, + "learning_rate": 3.0210788744347807e-06, + "loss": 0.3066, + "step": 40176 + }, + { + "epoch": 0.7459073306030306, + "grad_norm": 0.5980736017227173, + "learning_rate": 3.020243477917836e-06, + "loss": 0.2788, + "step": 40178 + }, + { + "epoch": 0.7459444607404492, + "grad_norm": 0.3407723903656006, + "learning_rate": 3.0194081763744655e-06, + "loss": 0.2993, + "step": 40180 + }, + { + "epoch": 0.7459815908778679, + "grad_norm": 0.2439137101173401, + "learning_rate": 3.0185729698160416e-06, + "loss": 0.2172, + "step": 40182 + }, + { + "epoch": 0.7460187210152864, + "grad_norm": 0.420911580324173, + "learning_rate": 3.0177378582539276e-06, + "loss": 0.2, + "step": 40184 + }, + { + "epoch": 0.7460558511527051, + "grad_norm": 0.35678088665008545, + "learning_rate": 3.0169028416994873e-06, + "loss": 0.159, + "step": 40186 + }, + { + "epoch": 0.7460929812901238, + "grad_norm": 0.4752412736415863, + "learning_rate": 3.016067920164082e-06, + "loss": 0.1152, + "step": 40188 + }, + { + "epoch": 0.7461301114275424, + "grad_norm": 0.24738076329231262, + "learning_rate": 3.015233093659077e-06, + "loss": 0.3083, + "step": 40190 + }, + { + "epoch": 0.7461672415649611, + "grad_norm": 0.47351518273353577, + "learning_rate": 3.014398362195824e-06, + "loss": 0.4381, + "step": 40192 + }, + { + "epoch": 0.7462043717023796, + "grad_norm": 0.41921135783195496, + "learning_rate": 3.0135637257856843e-06, + "loss": 0.3212, + "step": 40194 + }, + { + "epoch": 0.7462415018397983, + "grad_norm": 0.33750271797180176, + "learning_rate": 3.012729184440016e-06, + "loss": 0.2053, + "step": 40196 + }, + { + "epoch": 0.746278631977217, + "grad_norm": 0.6457516551017761, + "learning_rate": 3.011894738170178e-06, + "loss": 0.3314, + "step": 40198 + }, + { + "epoch": 0.7463157621146356, + "grad_norm": 0.4144114553928375, + "learning_rate": 3.0110603869875175e-06, + "loss": 0.2872, + "step": 40200 + }, + { + "epoch": 0.7463528922520543, + "grad_norm": 0.49976420402526855, + "learning_rate": 3.01022613090339e-06, + "loss": 0.4605, + "step": 40202 + }, + { + "epoch": 0.7463900223894728, + "grad_norm": 0.6803915500640869, + "learning_rate": 3.009391969929152e-06, + "loss": 0.2895, + "step": 40204 + }, + { + "epoch": 0.7464271525268915, + "grad_norm": 0.5855103135108948, + "learning_rate": 3.008557904076147e-06, + "loss": 0.3693, + "step": 40206 + }, + { + "epoch": 0.7464642826643101, + "grad_norm": 0.519738495349884, + "learning_rate": 3.0077239333557262e-06, + "loss": 0.3317, + "step": 40208 + }, + { + "epoch": 0.7465014128017288, + "grad_norm": 0.5718578696250916, + "learning_rate": 3.006890057779239e-06, + "loss": 0.3804, + "step": 40210 + }, + { + "epoch": 0.7465385429391475, + "grad_norm": 0.45269685983657837, + "learning_rate": 3.0060562773580315e-06, + "loss": 0.4294, + "step": 40212 + }, + { + "epoch": 0.746575673076566, + "grad_norm": 0.3525994122028351, + "learning_rate": 3.0052225921034484e-06, + "loss": 0.236, + "step": 40214 + }, + { + "epoch": 0.7466128032139847, + "grad_norm": 0.2537766098976135, + "learning_rate": 3.0043890020268375e-06, + "loss": 0.2448, + "step": 40216 + }, + { + "epoch": 0.7466499333514033, + "grad_norm": 0.3200022876262665, + "learning_rate": 3.003555507139535e-06, + "loss": 0.1323, + "step": 40218 + }, + { + "epoch": 0.746687063488822, + "grad_norm": 0.3174964487552643, + "learning_rate": 3.0027221074528877e-06, + "loss": 0.1313, + "step": 40220 + }, + { + "epoch": 0.7467241936262407, + "grad_norm": 0.1668201982975006, + "learning_rate": 3.00188880297823e-06, + "loss": 0.1752, + "step": 40222 + }, + { + "epoch": 0.7467613237636592, + "grad_norm": 0.5355526804924011, + "learning_rate": 3.0010555937269047e-06, + "loss": 0.3681, + "step": 40224 + }, + { + "epoch": 0.7467984539010779, + "grad_norm": 0.4486269950866699, + "learning_rate": 3.0002224797102486e-06, + "loss": 0.301, + "step": 40226 + }, + { + "epoch": 0.7468355840384965, + "grad_norm": 0.3842775821685791, + "learning_rate": 2.9993894609396e-06, + "loss": 0.2093, + "step": 40228 + }, + { + "epoch": 0.7468727141759152, + "grad_norm": 0.2321619838476181, + "learning_rate": 2.9985565374262894e-06, + "loss": 0.3219, + "step": 40230 + }, + { + "epoch": 0.7469098443133338, + "grad_norm": 0.4199943244457245, + "learning_rate": 2.9977237091816526e-06, + "loss": 0.1489, + "step": 40232 + }, + { + "epoch": 0.7469469744507524, + "grad_norm": 0.4171363413333893, + "learning_rate": 2.9968909762170208e-06, + "loss": 0.2019, + "step": 40234 + }, + { + "epoch": 0.7469841045881711, + "grad_norm": 0.5603946447372437, + "learning_rate": 2.996058338543727e-06, + "loss": 0.3675, + "step": 40236 + }, + { + "epoch": 0.7470212347255897, + "grad_norm": 0.36896660923957825, + "learning_rate": 2.9952257961731023e-06, + "loss": 0.1187, + "step": 40238 + }, + { + "epoch": 0.7470583648630084, + "grad_norm": 0.35125967860221863, + "learning_rate": 2.99439334911647e-06, + "loss": 0.5072, + "step": 40240 + }, + { + "epoch": 0.747095495000427, + "grad_norm": 0.5793779492378235, + "learning_rate": 2.9935609973851644e-06, + "loss": 0.3325, + "step": 40242 + }, + { + "epoch": 0.7471326251378456, + "grad_norm": 0.2723342180252075, + "learning_rate": 2.9927287409905028e-06, + "loss": 0.3587, + "step": 40244 + }, + { + "epoch": 0.7471697552752643, + "grad_norm": 0.2540375888347626, + "learning_rate": 2.9918965799438125e-06, + "loss": 0.2405, + "step": 40246 + }, + { + "epoch": 0.7472068854126829, + "grad_norm": 0.521479070186615, + "learning_rate": 2.99106451425642e-06, + "loss": 0.5053, + "step": 40248 + }, + { + "epoch": 0.7472440155501016, + "grad_norm": 0.48804280161857605, + "learning_rate": 2.990232543939644e-06, + "loss": 0.2348, + "step": 40250 + }, + { + "epoch": 0.7472811456875202, + "grad_norm": 0.5943536162376404, + "learning_rate": 2.9894006690048072e-06, + "loss": 0.3825, + "step": 40252 + }, + { + "epoch": 0.7473182758249388, + "grad_norm": 0.2933368384838104, + "learning_rate": 2.9885688894632315e-06, + "loss": 0.4072, + "step": 40254 + }, + { + "epoch": 0.7473554059623575, + "grad_norm": 0.46099716424942017, + "learning_rate": 2.9877372053262275e-06, + "loss": 0.2886, + "step": 40256 + }, + { + "epoch": 0.7473925360997761, + "grad_norm": 0.2820901870727539, + "learning_rate": 2.9869056166051167e-06, + "loss": 0.2344, + "step": 40258 + }, + { + "epoch": 0.7474296662371948, + "grad_norm": 0.2998506724834442, + "learning_rate": 2.9860741233112168e-06, + "loss": 0.3496, + "step": 40260 + }, + { + "epoch": 0.7474667963746133, + "grad_norm": 0.40661871433258057, + "learning_rate": 2.9852427254558347e-06, + "loss": 0.3477, + "step": 40262 + }, + { + "epoch": 0.747503926512032, + "grad_norm": 0.3183445930480957, + "learning_rate": 2.9844114230502886e-06, + "loss": 0.2939, + "step": 40264 + }, + { + "epoch": 0.7475410566494507, + "grad_norm": 0.3170273005962372, + "learning_rate": 2.9835802161058926e-06, + "loss": 0.4823, + "step": 40266 + }, + { + "epoch": 0.7475781867868693, + "grad_norm": 0.4835923910140991, + "learning_rate": 2.9827491046339483e-06, + "loss": 0.308, + "step": 40268 + }, + { + "epoch": 0.747615316924288, + "grad_norm": 0.4837155342102051, + "learning_rate": 2.981918088645771e-06, + "loss": 0.3922, + "step": 40270 + }, + { + "epoch": 0.7476524470617065, + "grad_norm": 0.28407055139541626, + "learning_rate": 2.9810871681526664e-06, + "loss": 0.1866, + "step": 40272 + }, + { + "epoch": 0.7476895771991252, + "grad_norm": 0.4125809669494629, + "learning_rate": 2.9802563431659416e-06, + "loss": 0.2422, + "step": 40274 + }, + { + "epoch": 0.7477267073365439, + "grad_norm": 0.30942633748054504, + "learning_rate": 2.9794256136969013e-06, + "loss": 0.235, + "step": 40276 + }, + { + "epoch": 0.7477638374739625, + "grad_norm": 0.5042203068733215, + "learning_rate": 2.978594979756849e-06, + "loss": 0.3663, + "step": 40278 + }, + { + "epoch": 0.7478009676113812, + "grad_norm": 0.620173454284668, + "learning_rate": 2.9777644413570915e-06, + "loss": 0.3015, + "step": 40280 + }, + { + "epoch": 0.7478380977487997, + "grad_norm": 0.3843126595020294, + "learning_rate": 2.9769339985089217e-06, + "loss": 0.1388, + "step": 40282 + }, + { + "epoch": 0.7478752278862184, + "grad_norm": 0.32178694009780884, + "learning_rate": 2.9761036512236484e-06, + "loss": 0.3168, + "step": 40284 + }, + { + "epoch": 0.7479123580236371, + "grad_norm": 0.8829051852226257, + "learning_rate": 2.9752733995125603e-06, + "loss": 0.3297, + "step": 40286 + }, + { + "epoch": 0.7479494881610557, + "grad_norm": 0.21603497862815857, + "learning_rate": 2.9744432433869608e-06, + "loss": 0.3962, + "step": 40288 + }, + { + "epoch": 0.7479866182984743, + "grad_norm": 0.5693198442459106, + "learning_rate": 2.973613182858145e-06, + "loss": 0.3574, + "step": 40290 + }, + { + "epoch": 0.7480237484358929, + "grad_norm": 0.37900087237358093, + "learning_rate": 2.9727832179374107e-06, + "loss": 0.3285, + "step": 40292 + }, + { + "epoch": 0.7480608785733116, + "grad_norm": 0.3850388824939728, + "learning_rate": 2.971953348636044e-06, + "loss": 0.32, + "step": 40294 + }, + { + "epoch": 0.7480980087107303, + "grad_norm": 0.44898733496665955, + "learning_rate": 2.971123574965341e-06, + "loss": 0.421, + "step": 40296 + }, + { + "epoch": 0.7481351388481489, + "grad_norm": 0.41429710388183594, + "learning_rate": 2.9702938969365915e-06, + "loss": 0.2416, + "step": 40298 + }, + { + "epoch": 0.7481722689855675, + "grad_norm": 0.5588423013687134, + "learning_rate": 2.969464314561087e-06, + "loss": 0.4317, + "step": 40300 + }, + { + "epoch": 0.7482093991229861, + "grad_norm": 0.2603084146976471, + "learning_rate": 2.9686348278501166e-06, + "loss": 0.3596, + "step": 40302 + }, + { + "epoch": 0.7482465292604048, + "grad_norm": 0.37287580966949463, + "learning_rate": 2.9678054368149613e-06, + "loss": 0.3465, + "step": 40304 + }, + { + "epoch": 0.7482836593978235, + "grad_norm": 0.4959515929222107, + "learning_rate": 2.966976141466914e-06, + "loss": 0.2328, + "step": 40306 + }, + { + "epoch": 0.7483207895352421, + "grad_norm": 0.34679073095321655, + "learning_rate": 2.966146941817252e-06, + "loss": 0.184, + "step": 40308 + }, + { + "epoch": 0.7483579196726607, + "grad_norm": 0.3961580991744995, + "learning_rate": 2.9653178378772606e-06, + "loss": 0.2349, + "step": 40310 + }, + { + "epoch": 0.7483950498100793, + "grad_norm": 0.3487274944782257, + "learning_rate": 2.964488829658223e-06, + "loss": 0.2646, + "step": 40312 + }, + { + "epoch": 0.748432179947498, + "grad_norm": 0.6037985682487488, + "learning_rate": 2.9636599171714177e-06, + "loss": 0.4347, + "step": 40314 + }, + { + "epoch": 0.7484693100849166, + "grad_norm": 0.4712100923061371, + "learning_rate": 2.962831100428125e-06, + "loss": 0.3173, + "step": 40316 + }, + { + "epoch": 0.7485064402223353, + "grad_norm": 0.3319389522075653, + "learning_rate": 2.962002379439626e-06, + "loss": 0.2744, + "step": 40318 + }, + { + "epoch": 0.7485435703597539, + "grad_norm": 0.5780928730964661, + "learning_rate": 2.9611737542171894e-06, + "loss": 0.2695, + "step": 40320 + }, + { + "epoch": 0.7485807004971725, + "grad_norm": 0.32323622703552246, + "learning_rate": 2.9603452247720942e-06, + "loss": 0.178, + "step": 40322 + }, + { + "epoch": 0.7486178306345912, + "grad_norm": 0.401479035615921, + "learning_rate": 2.9595167911156185e-06, + "loss": 0.2292, + "step": 40324 + }, + { + "epoch": 0.7486549607720098, + "grad_norm": 0.33542394638061523, + "learning_rate": 2.9586884532590266e-06, + "loss": 0.2103, + "step": 40326 + }, + { + "epoch": 0.7486920909094285, + "grad_norm": 0.5383530259132385, + "learning_rate": 2.9578602112135936e-06, + "loss": 0.1851, + "step": 40328 + }, + { + "epoch": 0.7487292210468471, + "grad_norm": 0.35175302624702454, + "learning_rate": 2.9570320649905924e-06, + "loss": 0.2687, + "step": 40330 + }, + { + "epoch": 0.7487663511842657, + "grad_norm": 0.3417210876941681, + "learning_rate": 2.956204014601286e-06, + "loss": 0.2776, + "step": 40332 + }, + { + "epoch": 0.7488034813216844, + "grad_norm": 0.7169796824455261, + "learning_rate": 2.955376060056945e-06, + "loss": 0.2694, + "step": 40334 + }, + { + "epoch": 0.748840611459103, + "grad_norm": 0.26623085141181946, + "learning_rate": 2.9545482013688342e-06, + "loss": 0.2998, + "step": 40336 + }, + { + "epoch": 0.7488777415965217, + "grad_norm": 0.3821668326854706, + "learning_rate": 2.953720438548219e-06, + "loss": 0.2424, + "step": 40338 + }, + { + "epoch": 0.7489148717339403, + "grad_norm": 0.5337623357772827, + "learning_rate": 2.9528927716063615e-06, + "loss": 0.2972, + "step": 40340 + }, + { + "epoch": 0.7489520018713589, + "grad_norm": 0.3525103032588959, + "learning_rate": 2.9520652005545292e-06, + "loss": 0.296, + "step": 40342 + }, + { + "epoch": 0.7489891320087776, + "grad_norm": 0.4213385581970215, + "learning_rate": 2.9512377254039747e-06, + "loss": 0.1518, + "step": 40344 + }, + { + "epoch": 0.7490262621461962, + "grad_norm": 0.4230386018753052, + "learning_rate": 2.950410346165965e-06, + "loss": 0.3798, + "step": 40346 + }, + { + "epoch": 0.7490633922836148, + "grad_norm": 0.28160858154296875, + "learning_rate": 2.9495830628517508e-06, + "loss": 0.2271, + "step": 40348 + }, + { + "epoch": 0.7491005224210335, + "grad_norm": 0.22811903059482574, + "learning_rate": 2.9487558754725933e-06, + "loss": 0.1881, + "step": 40350 + }, + { + "epoch": 0.7491376525584521, + "grad_norm": 0.5182791352272034, + "learning_rate": 2.947928784039746e-06, + "loss": 0.3255, + "step": 40352 + }, + { + "epoch": 0.7491747826958708, + "grad_norm": 0.40683820843696594, + "learning_rate": 2.9471017885644694e-06, + "loss": 0.1133, + "step": 40354 + }, + { + "epoch": 0.7492119128332894, + "grad_norm": 0.3923896253108978, + "learning_rate": 2.9462748890580073e-06, + "loss": 0.4264, + "step": 40356 + }, + { + "epoch": 0.749249042970708, + "grad_norm": 0.44196438789367676, + "learning_rate": 2.9454480855316157e-06, + "loss": 0.1462, + "step": 40358 + }, + { + "epoch": 0.7492861731081266, + "grad_norm": 0.40478309988975525, + "learning_rate": 2.944621377996546e-06, + "loss": 0.5034, + "step": 40360 + }, + { + "epoch": 0.7493233032455453, + "grad_norm": 0.5707475543022156, + "learning_rate": 2.9437947664640453e-06, + "loss": 0.3089, + "step": 40362 + }, + { + "epoch": 0.749360433382964, + "grad_norm": 0.3860756754875183, + "learning_rate": 2.9429682509453615e-06, + "loss": 0.165, + "step": 40364 + }, + { + "epoch": 0.7493975635203826, + "grad_norm": 0.34053704142570496, + "learning_rate": 2.9421418314517447e-06, + "loss": 0.2351, + "step": 40366 + }, + { + "epoch": 0.7494346936578012, + "grad_norm": 0.3145916163921356, + "learning_rate": 2.9413155079944376e-06, + "loss": 0.2494, + "step": 40368 + }, + { + "epoch": 0.7494718237952198, + "grad_norm": 0.44671714305877686, + "learning_rate": 2.9404892805846794e-06, + "loss": 0.2587, + "step": 40370 + }, + { + "epoch": 0.7495089539326385, + "grad_norm": 0.4623572528362274, + "learning_rate": 2.9396631492337168e-06, + "loss": 0.2719, + "step": 40372 + }, + { + "epoch": 0.7495460840700572, + "grad_norm": 0.62232905626297, + "learning_rate": 2.9388371139527918e-06, + "loss": 0.1431, + "step": 40374 + }, + { + "epoch": 0.7495832142074758, + "grad_norm": 0.35104212164878845, + "learning_rate": 2.9380111747531416e-06, + "loss": 0.4099, + "step": 40376 + }, + { + "epoch": 0.7496203443448944, + "grad_norm": 0.3375902473926544, + "learning_rate": 2.9371853316460065e-06, + "loss": 0.2952, + "step": 40378 + }, + { + "epoch": 0.749657474482313, + "grad_norm": 0.2781085968017578, + "learning_rate": 2.9363595846426264e-06, + "loss": 0.1928, + "step": 40380 + }, + { + "epoch": 0.7496946046197317, + "grad_norm": 0.25676101446151733, + "learning_rate": 2.935533933754231e-06, + "loss": 0.0724, + "step": 40382 + }, + { + "epoch": 0.7497317347571504, + "grad_norm": 0.4692309498786926, + "learning_rate": 2.9347083789920583e-06, + "loss": 0.1377, + "step": 40384 + }, + { + "epoch": 0.749768864894569, + "grad_norm": 0.470002681016922, + "learning_rate": 2.9338829203673415e-06, + "loss": 0.2447, + "step": 40386 + }, + { + "epoch": 0.7498059950319876, + "grad_norm": 0.5296831727027893, + "learning_rate": 2.9330575578913167e-06, + "loss": 0.2101, + "step": 40388 + }, + { + "epoch": 0.7498431251694062, + "grad_norm": 0.3243440091609955, + "learning_rate": 2.9322322915752065e-06, + "loss": 0.4029, + "step": 40390 + }, + { + "epoch": 0.7498802553068249, + "grad_norm": 0.46210619807243347, + "learning_rate": 2.9314071214302473e-06, + "loss": 0.2203, + "step": 40392 + }, + { + "epoch": 0.7499173854442436, + "grad_norm": 0.598175585269928, + "learning_rate": 2.9305820474676627e-06, + "loss": 0.3718, + "step": 40394 + }, + { + "epoch": 0.7499545155816622, + "grad_norm": 0.3679395318031311, + "learning_rate": 2.92975706969868e-06, + "loss": 0.2839, + "step": 40396 + }, + { + "epoch": 0.7499916457190808, + "grad_norm": 0.3775363862514496, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.3137, + "step": 40398 + }, + { + "epoch": 0.7500287758564994, + "grad_norm": 0.3015807569026947, + "learning_rate": 2.9281074027864243e-06, + "loss": 0.0868, + "step": 40400 + }, + { + "epoch": 0.7500659059939181, + "grad_norm": 0.2993025481700897, + "learning_rate": 2.927282713665598e-06, + "loss": 0.2463, + "step": 40402 + }, + { + "epoch": 0.7501030361313368, + "grad_norm": 0.29711946845054626, + "learning_rate": 2.9264581207832687e-06, + "loss": 0.264, + "step": 40404 + }, + { + "epoch": 0.7501401662687553, + "grad_norm": 0.5153632760047913, + "learning_rate": 2.92563362415066e-06, + "loss": 0.423, + "step": 40406 + }, + { + "epoch": 0.750177296406174, + "grad_norm": 0.30841514468193054, + "learning_rate": 2.924809223778985e-06, + "loss": 0.5534, + "step": 40408 + }, + { + "epoch": 0.7502144265435926, + "grad_norm": 0.4487874507904053, + "learning_rate": 2.923984919679467e-06, + "loss": 0.3449, + "step": 40410 + }, + { + "epoch": 0.7502515566810113, + "grad_norm": 0.3042234778404236, + "learning_rate": 2.9231607118633143e-06, + "loss": 0.2943, + "step": 40412 + }, + { + "epoch": 0.7502886868184299, + "grad_norm": 0.38850268721580505, + "learning_rate": 2.9223366003417487e-06, + "loss": 0.239, + "step": 40414 + }, + { + "epoch": 0.7503258169558485, + "grad_norm": 0.5220369100570679, + "learning_rate": 2.9215125851259806e-06, + "loss": 0.2571, + "step": 40416 + }, + { + "epoch": 0.7503629470932672, + "grad_norm": 0.34970369935035706, + "learning_rate": 2.920688666227227e-06, + "loss": 0.284, + "step": 40418 + }, + { + "epoch": 0.7504000772306858, + "grad_norm": 0.3791286051273346, + "learning_rate": 2.9198648436566935e-06, + "loss": 0.1438, + "step": 40420 + }, + { + "epoch": 0.7504372073681045, + "grad_norm": 0.45752692222595215, + "learning_rate": 2.9190411174255907e-06, + "loss": 0.1872, + "step": 40422 + }, + { + "epoch": 0.7504743375055231, + "grad_norm": 0.662827730178833, + "learning_rate": 2.9182174875451287e-06, + "loss": 0.3803, + "step": 40424 + }, + { + "epoch": 0.7505114676429417, + "grad_norm": 0.49187585711479187, + "learning_rate": 2.917393954026515e-06, + "loss": 0.2737, + "step": 40426 + }, + { + "epoch": 0.7505485977803604, + "grad_norm": 0.3631449341773987, + "learning_rate": 2.9165705168809544e-06, + "loss": 0.2014, + "step": 40428 + }, + { + "epoch": 0.750585727917779, + "grad_norm": 0.3902400732040405, + "learning_rate": 2.9157471761196543e-06, + "loss": 0.1933, + "step": 40430 + }, + { + "epoch": 0.7506228580551977, + "grad_norm": 0.340748131275177, + "learning_rate": 2.9149239317538156e-06, + "loss": 0.3324, + "step": 40432 + }, + { + "epoch": 0.7506599881926163, + "grad_norm": 0.4365726709365845, + "learning_rate": 2.9141007837946378e-06, + "loss": 0.3099, + "step": 40434 + }, + { + "epoch": 0.7506971183300349, + "grad_norm": 0.5434020161628723, + "learning_rate": 2.9132777322533224e-06, + "loss": 0.2396, + "step": 40436 + }, + { + "epoch": 0.7507342484674536, + "grad_norm": 0.5789579153060913, + "learning_rate": 2.9124547771410706e-06, + "loss": 0.3646, + "step": 40438 + }, + { + "epoch": 0.7507713786048722, + "grad_norm": 0.5691447854042053, + "learning_rate": 2.911631918469079e-06, + "loss": 0.1807, + "step": 40440 + }, + { + "epoch": 0.7508085087422909, + "grad_norm": 0.5294763445854187, + "learning_rate": 2.910809156248545e-06, + "loss": 0.2373, + "step": 40442 + }, + { + "epoch": 0.7508456388797095, + "grad_norm": 0.5125042200088501, + "learning_rate": 2.909986490490667e-06, + "loss": 0.186, + "step": 40444 + }, + { + "epoch": 0.7508827690171281, + "grad_norm": 0.5759803056716919, + "learning_rate": 2.9091639212066323e-06, + "loss": 0.3935, + "step": 40446 + }, + { + "epoch": 0.7509198991545468, + "grad_norm": 0.4840807616710663, + "learning_rate": 2.9083414484076377e-06, + "loss": 0.3367, + "step": 40448 + }, + { + "epoch": 0.7509570292919654, + "grad_norm": 0.46738529205322266, + "learning_rate": 2.907519072104874e-06, + "loss": 0.4648, + "step": 40450 + }, + { + "epoch": 0.7509941594293841, + "grad_norm": 0.6332831978797913, + "learning_rate": 2.9066967923095345e-06, + "loss": 0.1158, + "step": 40452 + }, + { + "epoch": 0.7510312895668027, + "grad_norm": 0.5030101537704468, + "learning_rate": 2.9058746090328015e-06, + "loss": 0.2799, + "step": 40454 + }, + { + "epoch": 0.7510684197042213, + "grad_norm": 0.43413105607032776, + "learning_rate": 2.9050525222858693e-06, + "loss": 0.2372, + "step": 40456 + }, + { + "epoch": 0.75110554984164, + "grad_norm": 0.3051797151565552, + "learning_rate": 2.904230532079917e-06, + "loss": 0.3472, + "step": 40458 + }, + { + "epoch": 0.7511426799790586, + "grad_norm": 0.24058611690998077, + "learning_rate": 2.903408638426132e-06, + "loss": 0.1473, + "step": 40460 + }, + { + "epoch": 0.7511798101164773, + "grad_norm": 0.5001422762870789, + "learning_rate": 2.9025868413356995e-06, + "loss": 0.2563, + "step": 40462 + }, + { + "epoch": 0.7512169402538958, + "grad_norm": 0.31162333488464355, + "learning_rate": 2.901765140819801e-06, + "loss": 0.3397, + "step": 40464 + }, + { + "epoch": 0.7512540703913145, + "grad_norm": 0.48022735118865967, + "learning_rate": 2.9009435368896167e-06, + "loss": 0.2954, + "step": 40466 + }, + { + "epoch": 0.7512912005287331, + "grad_norm": 0.35977819561958313, + "learning_rate": 2.9001220295563306e-06, + "loss": 0.3145, + "step": 40468 + }, + { + "epoch": 0.7513283306661518, + "grad_norm": 0.4534611701965332, + "learning_rate": 2.8993006188311136e-06, + "loss": 0.3072, + "step": 40470 + }, + { + "epoch": 0.7513654608035705, + "grad_norm": 0.27284952998161316, + "learning_rate": 2.898479304725146e-06, + "loss": 0.166, + "step": 40472 + }, + { + "epoch": 0.751402590940989, + "grad_norm": 0.39503586292266846, + "learning_rate": 2.897658087249606e-06, + "loss": 0.2237, + "step": 40474 + }, + { + "epoch": 0.7514397210784077, + "grad_norm": 0.3290507197380066, + "learning_rate": 2.8968369664156636e-06, + "loss": 0.4754, + "step": 40476 + }, + { + "epoch": 0.7514768512158263, + "grad_norm": 0.3885556757450104, + "learning_rate": 2.8960159422344925e-06, + "loss": 0.5884, + "step": 40478 + }, + { + "epoch": 0.751513981353245, + "grad_norm": 0.32867395877838135, + "learning_rate": 2.8951950147172694e-06, + "loss": 0.225, + "step": 40480 + }, + { + "epoch": 0.7515511114906637, + "grad_norm": 0.8079530000686646, + "learning_rate": 2.8943741838751572e-06, + "loss": 0.2395, + "step": 40482 + }, + { + "epoch": 0.7515882416280822, + "grad_norm": 0.3274528682231903, + "learning_rate": 2.8935534497193297e-06, + "loss": 0.1432, + "step": 40484 + }, + { + "epoch": 0.7516253717655009, + "grad_norm": 0.32301655411720276, + "learning_rate": 2.892732812260952e-06, + "loss": 0.2714, + "step": 40486 + }, + { + "epoch": 0.7516625019029195, + "grad_norm": 0.45325884222984314, + "learning_rate": 2.8919122715111924e-06, + "loss": 0.242, + "step": 40488 + }, + { + "epoch": 0.7516996320403382, + "grad_norm": 0.6501412987709045, + "learning_rate": 2.891091827481216e-06, + "loss": 0.5304, + "step": 40490 + }, + { + "epoch": 0.7517367621777569, + "grad_norm": 0.32770755887031555, + "learning_rate": 2.8902714801821896e-06, + "loss": 0.2237, + "step": 40492 + }, + { + "epoch": 0.7517738923151754, + "grad_norm": 0.4423248767852783, + "learning_rate": 2.8894512296252688e-06, + "loss": 0.3921, + "step": 40494 + }, + { + "epoch": 0.7518110224525941, + "grad_norm": 0.2605426013469696, + "learning_rate": 2.8886310758216206e-06, + "loss": 0.2229, + "step": 40496 + }, + { + "epoch": 0.7518481525900127, + "grad_norm": 0.5913214087486267, + "learning_rate": 2.8878110187823993e-06, + "loss": 0.348, + "step": 40498 + }, + { + "epoch": 0.7518852827274314, + "grad_norm": 0.21425849199295044, + "learning_rate": 2.886991058518768e-06, + "loss": 0.1859, + "step": 40500 + }, + { + "epoch": 0.7519224128648501, + "grad_norm": 0.5110229253768921, + "learning_rate": 2.8861711950418813e-06, + "loss": 0.3533, + "step": 40502 + }, + { + "epoch": 0.7519595430022686, + "grad_norm": 0.4079522490501404, + "learning_rate": 2.885351428362897e-06, + "loss": 0.176, + "step": 40504 + }, + { + "epoch": 0.7519966731396873, + "grad_norm": 0.36067524552345276, + "learning_rate": 2.884531758492971e-06, + "loss": 0.2977, + "step": 40506 + }, + { + "epoch": 0.7520338032771059, + "grad_norm": 0.3455793261528015, + "learning_rate": 2.8837121854432524e-06, + "loss": 0.3418, + "step": 40508 + }, + { + "epoch": 0.7520709334145246, + "grad_norm": 0.3733421266078949, + "learning_rate": 2.882892709224895e-06, + "loss": 0.2708, + "step": 40510 + }, + { + "epoch": 0.7521080635519432, + "grad_norm": 0.617104172706604, + "learning_rate": 2.882073329849049e-06, + "loss": 0.2565, + "step": 40512 + }, + { + "epoch": 0.7521451936893618, + "grad_norm": 0.33861207962036133, + "learning_rate": 2.881254047326868e-06, + "loss": 0.1494, + "step": 40514 + }, + { + "epoch": 0.7521823238267805, + "grad_norm": 0.37827083468437195, + "learning_rate": 2.8804348616694933e-06, + "loss": 0.4645, + "step": 40516 + }, + { + "epoch": 0.7522194539641991, + "grad_norm": 0.33447524905204773, + "learning_rate": 2.8796157728880767e-06, + "loss": 0.3223, + "step": 40518 + }, + { + "epoch": 0.7522565841016178, + "grad_norm": 0.2812304198741913, + "learning_rate": 2.878796780993759e-06, + "loss": 0.2313, + "step": 40520 + }, + { + "epoch": 0.7522937142390363, + "grad_norm": 0.39499467611312866, + "learning_rate": 2.877977885997687e-06, + "loss": 0.3434, + "step": 40522 + }, + { + "epoch": 0.752330844376455, + "grad_norm": 0.5273628234863281, + "learning_rate": 2.8771590879110022e-06, + "loss": 0.144, + "step": 40524 + }, + { + "epoch": 0.7523679745138737, + "grad_norm": 0.3484586477279663, + "learning_rate": 2.8763403867448482e-06, + "loss": 0.2065, + "step": 40526 + }, + { + "epoch": 0.7524051046512923, + "grad_norm": 0.31164637207984924, + "learning_rate": 2.875521782510362e-06, + "loss": 0.286, + "step": 40528 + }, + { + "epoch": 0.752442234788711, + "grad_norm": 0.2688003480434418, + "learning_rate": 2.8747032752186856e-06, + "loss": 0.3688, + "step": 40530 + }, + { + "epoch": 0.7524793649261295, + "grad_norm": 0.3683149218559265, + "learning_rate": 2.873884864880957e-06, + "loss": 0.3259, + "step": 40532 + }, + { + "epoch": 0.7525164950635482, + "grad_norm": 0.412943959236145, + "learning_rate": 2.8730665515083066e-06, + "loss": 0.3885, + "step": 40534 + }, + { + "epoch": 0.7525536252009669, + "grad_norm": 0.3783700466156006, + "learning_rate": 2.8722483351118735e-06, + "loss": 0.3355, + "step": 40536 + }, + { + "epoch": 0.7525907553383855, + "grad_norm": 0.40683338046073914, + "learning_rate": 2.871430215702794e-06, + "loss": 0.2741, + "step": 40538 + }, + { + "epoch": 0.7526278854758042, + "grad_norm": 0.198213130235672, + "learning_rate": 2.870612193292194e-06, + "loss": 0.0902, + "step": 40540 + }, + { + "epoch": 0.7526650156132227, + "grad_norm": 0.4948136806488037, + "learning_rate": 2.8697942678912062e-06, + "loss": 0.2335, + "step": 40542 + }, + { + "epoch": 0.7527021457506414, + "grad_norm": 0.4490640163421631, + "learning_rate": 2.8689764395109643e-06, + "loss": 0.1516, + "step": 40544 + }, + { + "epoch": 0.7527392758880601, + "grad_norm": 0.3021415174007416, + "learning_rate": 2.8681587081625893e-06, + "loss": 0.4037, + "step": 40546 + }, + { + "epoch": 0.7527764060254787, + "grad_norm": 0.3851489722728729, + "learning_rate": 2.867341073857213e-06, + "loss": 0.3788, + "step": 40548 + }, + { + "epoch": 0.7528135361628974, + "grad_norm": 0.2587454319000244, + "learning_rate": 2.8665235366059596e-06, + "loss": 0.2941, + "step": 40550 + }, + { + "epoch": 0.7528506663003159, + "grad_norm": 0.40624019503593445, + "learning_rate": 2.8657060964199535e-06, + "loss": 0.2295, + "step": 40552 + }, + { + "epoch": 0.7528877964377346, + "grad_norm": 0.3450055718421936, + "learning_rate": 2.8648887533103178e-06, + "loss": 0.3944, + "step": 40554 + }, + { + "epoch": 0.7529249265751533, + "grad_norm": 0.6019555926322937, + "learning_rate": 2.8640715072881766e-06, + "loss": 0.4686, + "step": 40556 + }, + { + "epoch": 0.7529620567125719, + "grad_norm": 0.3090994954109192, + "learning_rate": 2.8632543583646456e-06, + "loss": 0.2455, + "step": 40558 + }, + { + "epoch": 0.7529991868499906, + "grad_norm": 0.43548765778541565, + "learning_rate": 2.8624373065508483e-06, + "loss": 0.3514, + "step": 40560 + }, + { + "epoch": 0.7530363169874091, + "grad_norm": 0.2948155105113983, + "learning_rate": 2.8616203518578966e-06, + "loss": 0.1562, + "step": 40562 + }, + { + "epoch": 0.7530734471248278, + "grad_norm": 0.35642364621162415, + "learning_rate": 2.86080349429691e-06, + "loss": 0.3864, + "step": 40564 + }, + { + "epoch": 0.7531105772622464, + "grad_norm": 0.774732232093811, + "learning_rate": 2.859986733879003e-06, + "loss": 0.3173, + "step": 40566 + }, + { + "epoch": 0.7531477073996651, + "grad_norm": 0.3449834883213043, + "learning_rate": 2.8591700706152904e-06, + "loss": 0.218, + "step": 40568 + }, + { + "epoch": 0.7531848375370838, + "grad_norm": 0.4287796914577484, + "learning_rate": 2.8583535045168865e-06, + "loss": 0.2804, + "step": 40570 + }, + { + "epoch": 0.7532219676745023, + "grad_norm": 0.5407571196556091, + "learning_rate": 2.8575370355948974e-06, + "loss": 0.3317, + "step": 40572 + }, + { + "epoch": 0.753259097811921, + "grad_norm": 0.43815404176712036, + "learning_rate": 2.856720663860435e-06, + "loss": 0.5291, + "step": 40574 + }, + { + "epoch": 0.7532962279493396, + "grad_norm": 0.27817341685295105, + "learning_rate": 2.8559043893246074e-06, + "loss": 0.258, + "step": 40576 + }, + { + "epoch": 0.7533333580867583, + "grad_norm": 0.2578461766242981, + "learning_rate": 2.8550882119985245e-06, + "loss": 0.1228, + "step": 40578 + }, + { + "epoch": 0.753370488224177, + "grad_norm": 0.5653959512710571, + "learning_rate": 2.8542721318932875e-06, + "loss": 0.1571, + "step": 40580 + }, + { + "epoch": 0.7534076183615955, + "grad_norm": 0.605609655380249, + "learning_rate": 2.853456149020005e-06, + "loss": 0.1191, + "step": 40582 + }, + { + "epoch": 0.7534447484990142, + "grad_norm": 0.47384288907051086, + "learning_rate": 2.8526402633897754e-06, + "loss": 0.3095, + "step": 40584 + }, + { + "epoch": 0.7534818786364328, + "grad_norm": 0.4427448511123657, + "learning_rate": 2.8518244750137024e-06, + "loss": 0.3287, + "step": 40586 + }, + { + "epoch": 0.7535190087738515, + "grad_norm": 0.8036258220672607, + "learning_rate": 2.8510087839028877e-06, + "loss": 0.1581, + "step": 40588 + }, + { + "epoch": 0.7535561389112702, + "grad_norm": 0.30833467841148376, + "learning_rate": 2.85019319006843e-06, + "loss": 0.2485, + "step": 40590 + }, + { + "epoch": 0.7535932690486887, + "grad_norm": 0.5380029678344727, + "learning_rate": 2.8493776935214268e-06, + "loss": 0.1126, + "step": 40592 + }, + { + "epoch": 0.7536303991861074, + "grad_norm": 0.32691627740859985, + "learning_rate": 2.8485622942729775e-06, + "loss": 0.2155, + "step": 40594 + }, + { + "epoch": 0.753667529323526, + "grad_norm": 0.3697568476200104, + "learning_rate": 2.8477469923341707e-06, + "loss": 0.3845, + "step": 40596 + }, + { + "epoch": 0.7537046594609447, + "grad_norm": 0.4236752688884735, + "learning_rate": 2.846931787716104e-06, + "loss": 0.2602, + "step": 40598 + }, + { + "epoch": 0.7537417895983634, + "grad_norm": 0.48736318945884705, + "learning_rate": 2.8461166804298736e-06, + "loss": 0.3049, + "step": 40600 + }, + { + "epoch": 0.7537789197357819, + "grad_norm": 0.2925927937030792, + "learning_rate": 2.8453016704865633e-06, + "loss": 0.2303, + "step": 40602 + }, + { + "epoch": 0.7538160498732006, + "grad_norm": 0.2856064438819885, + "learning_rate": 2.8444867578972656e-06, + "loss": 0.3776, + "step": 40604 + }, + { + "epoch": 0.7538531800106192, + "grad_norm": 0.36873093247413635, + "learning_rate": 2.8436719426730706e-06, + "loss": 0.3348, + "step": 40606 + }, + { + "epoch": 0.7538903101480379, + "grad_norm": 0.2899578809738159, + "learning_rate": 2.8428572248250687e-06, + "loss": 0.1771, + "step": 40608 + }, + { + "epoch": 0.7539274402854566, + "grad_norm": 0.5378159880638123, + "learning_rate": 2.8420426043643377e-06, + "loss": 0.1927, + "step": 40610 + }, + { + "epoch": 0.7539645704228751, + "grad_norm": 0.26763346791267395, + "learning_rate": 2.8412280813019665e-06, + "loss": 0.2314, + "step": 40612 + }, + { + "epoch": 0.7540017005602938, + "grad_norm": 0.3388088345527649, + "learning_rate": 2.840413655649038e-06, + "loss": 0.2281, + "step": 40614 + }, + { + "epoch": 0.7540388306977124, + "grad_norm": 0.36535537242889404, + "learning_rate": 2.8395993274166344e-06, + "loss": 0.2545, + "step": 40616 + }, + { + "epoch": 0.7540759608351311, + "grad_norm": 0.35980138182640076, + "learning_rate": 2.8387850966158357e-06, + "loss": 0.3308, + "step": 40618 + }, + { + "epoch": 0.7541130909725496, + "grad_norm": 0.5129694938659668, + "learning_rate": 2.837970963257726e-06, + "loss": 0.1353, + "step": 40620 + }, + { + "epoch": 0.7541502211099683, + "grad_norm": 0.6546342968940735, + "learning_rate": 2.8371569273533773e-06, + "loss": 0.2279, + "step": 40622 + }, + { + "epoch": 0.754187351247387, + "grad_norm": 0.3177323341369629, + "learning_rate": 2.836342988913865e-06, + "loss": 0.4437, + "step": 40624 + }, + { + "epoch": 0.7542244813848056, + "grad_norm": 0.34303364157676697, + "learning_rate": 2.8355291479502665e-06, + "loss": 0.3014, + "step": 40626 + }, + { + "epoch": 0.7542616115222243, + "grad_norm": 0.4379226565361023, + "learning_rate": 2.8347154044736567e-06, + "loss": 0.4724, + "step": 40628 + }, + { + "epoch": 0.7542987416596428, + "grad_norm": 0.2685122787952423, + "learning_rate": 2.833901758495108e-06, + "loss": 0.2648, + "step": 40630 + }, + { + "epoch": 0.7543358717970615, + "grad_norm": 0.49419277906417847, + "learning_rate": 2.833088210025694e-06, + "loss": 0.288, + "step": 40632 + }, + { + "epoch": 0.7543730019344802, + "grad_norm": 0.41111984848976135, + "learning_rate": 2.8322747590764786e-06, + "loss": 0.2239, + "step": 40634 + }, + { + "epoch": 0.7544101320718988, + "grad_norm": 0.34498411417007446, + "learning_rate": 2.8314614056585342e-06, + "loss": 0.1597, + "step": 40636 + }, + { + "epoch": 0.7544472622093175, + "grad_norm": 0.13146619498729706, + "learning_rate": 2.8306481497829288e-06, + "loss": 0.0745, + "step": 40638 + }, + { + "epoch": 0.754484392346736, + "grad_norm": 0.41899600625038147, + "learning_rate": 2.8298349914607258e-06, + "loss": 0.1341, + "step": 40640 + }, + { + "epoch": 0.7545215224841547, + "grad_norm": 0.35171565413475037, + "learning_rate": 2.8290219307029963e-06, + "loss": 0.1642, + "step": 40642 + }, + { + "epoch": 0.7545586526215734, + "grad_norm": 0.2993064522743225, + "learning_rate": 2.8282089675207947e-06, + "loss": 0.216, + "step": 40644 + }, + { + "epoch": 0.754595782758992, + "grad_norm": 0.7365838885307312, + "learning_rate": 2.8273961019251895e-06, + "loss": 0.2661, + "step": 40646 + }, + { + "epoch": 0.7546329128964107, + "grad_norm": 0.3117922246456146, + "learning_rate": 2.8265833339272366e-06, + "loss": 0.3188, + "step": 40648 + }, + { + "epoch": 0.7546700430338292, + "grad_norm": 0.2788543701171875, + "learning_rate": 2.8257706635379977e-06, + "loss": 0.1794, + "step": 40650 + }, + { + "epoch": 0.7547071731712479, + "grad_norm": 0.5035040974617004, + "learning_rate": 2.8249580907685302e-06, + "loss": 0.2956, + "step": 40652 + }, + { + "epoch": 0.7547443033086666, + "grad_norm": 0.35695546865463257, + "learning_rate": 2.824145615629892e-06, + "loss": 0.2673, + "step": 40654 + }, + { + "epoch": 0.7547814334460852, + "grad_norm": 0.4621291756629944, + "learning_rate": 2.823333238133138e-06, + "loss": 0.1805, + "step": 40656 + }, + { + "epoch": 0.7548185635835039, + "grad_norm": 0.3534008264541626, + "learning_rate": 2.822520958289324e-06, + "loss": 0.1501, + "step": 40658 + }, + { + "epoch": 0.7548556937209224, + "grad_norm": 0.41241827607154846, + "learning_rate": 2.8217087761094986e-06, + "loss": 0.2812, + "step": 40660 + }, + { + "epoch": 0.7548928238583411, + "grad_norm": 0.34639212489128113, + "learning_rate": 2.8208966916047154e-06, + "loss": 0.1477, + "step": 40662 + }, + { + "epoch": 0.7549299539957597, + "grad_norm": 0.35248103737831116, + "learning_rate": 2.820084704786027e-06, + "loss": 0.148, + "step": 40664 + }, + { + "epoch": 0.7549670841331784, + "grad_norm": 0.4236123263835907, + "learning_rate": 2.8192728156644766e-06, + "loss": 0.1804, + "step": 40666 + }, + { + "epoch": 0.755004214270597, + "grad_norm": 0.2215155065059662, + "learning_rate": 2.8184610242511134e-06, + "loss": 0.2202, + "step": 40668 + }, + { + "epoch": 0.7550413444080156, + "grad_norm": 0.2704628109931946, + "learning_rate": 2.8176493305569885e-06, + "loss": 0.3568, + "step": 40670 + }, + { + "epoch": 0.7550784745454343, + "grad_norm": 0.4117063283920288, + "learning_rate": 2.8168377345931396e-06, + "loss": 0.3519, + "step": 40672 + }, + { + "epoch": 0.7551156046828529, + "grad_norm": 0.3211822211742401, + "learning_rate": 2.8160262363706126e-06, + "loss": 0.2875, + "step": 40674 + }, + { + "epoch": 0.7551527348202716, + "grad_norm": 0.5700035691261292, + "learning_rate": 2.8152148359004505e-06, + "loss": 0.1892, + "step": 40676 + }, + { + "epoch": 0.7551898649576902, + "grad_norm": 0.16704770922660828, + "learning_rate": 2.8144035331936924e-06, + "loss": 0.3511, + "step": 40678 + }, + { + "epoch": 0.7552269950951088, + "grad_norm": 0.2647544741630554, + "learning_rate": 2.8135923282613797e-06, + "loss": 0.3958, + "step": 40680 + }, + { + "epoch": 0.7552641252325275, + "grad_norm": 0.37671977281570435, + "learning_rate": 2.812781221114553e-06, + "loss": 0.3345, + "step": 40682 + }, + { + "epoch": 0.7553012553699461, + "grad_norm": 0.5648159980773926, + "learning_rate": 2.8119702117642422e-06, + "loss": 0.2912, + "step": 40684 + }, + { + "epoch": 0.7553383855073648, + "grad_norm": 0.4012536406517029, + "learning_rate": 2.81115930022149e-06, + "loss": 0.1996, + "step": 40686 + }, + { + "epoch": 0.7553755156447834, + "grad_norm": 0.3259325325489044, + "learning_rate": 2.810348486497323e-06, + "loss": 0.2116, + "step": 40688 + }, + { + "epoch": 0.755412645782202, + "grad_norm": 0.3881808817386627, + "learning_rate": 2.809537770602777e-06, + "loss": 0.348, + "step": 40690 + }, + { + "epoch": 0.7554497759196207, + "grad_norm": 0.39706704020500183, + "learning_rate": 2.8087271525488847e-06, + "loss": 0.3307, + "step": 40692 + }, + { + "epoch": 0.7554869060570393, + "grad_norm": 0.24664393067359924, + "learning_rate": 2.8079166323466754e-06, + "loss": 0.2776, + "step": 40694 + }, + { + "epoch": 0.755524036194458, + "grad_norm": 0.33150362968444824, + "learning_rate": 2.807106210007181e-06, + "loss": 0.2319, + "step": 40696 + }, + { + "epoch": 0.7555611663318766, + "grad_norm": 0.638157069683075, + "learning_rate": 2.8062958855414225e-06, + "loss": 0.3357, + "step": 40698 + }, + { + "epoch": 0.7555982964692952, + "grad_norm": 0.4024103581905365, + "learning_rate": 2.8054856589604294e-06, + "loss": 0.2249, + "step": 40700 + }, + { + "epoch": 0.7556354266067139, + "grad_norm": 0.3136601150035858, + "learning_rate": 2.8046755302752272e-06, + "loss": 0.1606, + "step": 40702 + }, + { + "epoch": 0.7556725567441325, + "grad_norm": 0.3830137252807617, + "learning_rate": 2.8038654994968385e-06, + "loss": 0.2685, + "step": 40704 + }, + { + "epoch": 0.7557096868815512, + "grad_norm": 0.4332003593444824, + "learning_rate": 2.803055566636288e-06, + "loss": 0.2527, + "step": 40706 + }, + { + "epoch": 0.7557468170189698, + "grad_norm": 0.39073067903518677, + "learning_rate": 2.8022457317045938e-06, + "loss": 0.3277, + "step": 40708 + }, + { + "epoch": 0.7557839471563884, + "grad_norm": 0.3377404510974884, + "learning_rate": 2.801435994712772e-06, + "loss": 0.2248, + "step": 40710 + }, + { + "epoch": 0.7558210772938071, + "grad_norm": 0.4313433766365051, + "learning_rate": 2.800626355671845e-06, + "loss": 0.4138, + "step": 40712 + }, + { + "epoch": 0.7558582074312257, + "grad_norm": 0.3503766655921936, + "learning_rate": 2.7998168145928274e-06, + "loss": 0.1313, + "step": 40714 + }, + { + "epoch": 0.7558953375686444, + "grad_norm": 0.3087615370750427, + "learning_rate": 2.7990073714867373e-06, + "loss": 0.2765, + "step": 40716 + }, + { + "epoch": 0.7559324677060629, + "grad_norm": 0.36310046911239624, + "learning_rate": 2.798198026364587e-06, + "loss": 0.2314, + "step": 40718 + }, + { + "epoch": 0.7559695978434816, + "grad_norm": 0.7252974510192871, + "learning_rate": 2.797388779237392e-06, + "loss": 0.3686, + "step": 40720 + }, + { + "epoch": 0.7560067279809003, + "grad_norm": 0.25719764828681946, + "learning_rate": 2.7965796301161596e-06, + "loss": 0.198, + "step": 40722 + }, + { + "epoch": 0.7560438581183189, + "grad_norm": 0.45833274722099304, + "learning_rate": 2.7957705790119005e-06, + "loss": 0.287, + "step": 40724 + }, + { + "epoch": 0.7560809882557376, + "grad_norm": 0.38273656368255615, + "learning_rate": 2.794961625935625e-06, + "loss": 0.464, + "step": 40726 + }, + { + "epoch": 0.7561181183931561, + "grad_norm": 0.32022449374198914, + "learning_rate": 2.794152770898344e-06, + "loss": 0.1667, + "step": 40728 + }, + { + "epoch": 0.7561552485305748, + "grad_norm": 0.4735936224460602, + "learning_rate": 2.793344013911056e-06, + "loss": 0.3079, + "step": 40730 + }, + { + "epoch": 0.7561923786679935, + "grad_norm": 0.2802967429161072, + "learning_rate": 2.792535354984769e-06, + "loss": 0.1607, + "step": 40732 + }, + { + "epoch": 0.7562295088054121, + "grad_norm": 0.3773616552352905, + "learning_rate": 2.7917267941304917e-06, + "loss": 0.2924, + "step": 40734 + }, + { + "epoch": 0.7562666389428307, + "grad_norm": 0.3613657057285309, + "learning_rate": 2.790918331359217e-06, + "loss": 0.1795, + "step": 40736 + }, + { + "epoch": 0.7563037690802493, + "grad_norm": 0.6214427351951599, + "learning_rate": 2.7901099666819497e-06, + "loss": 0.2521, + "step": 40738 + }, + { + "epoch": 0.756340899217668, + "grad_norm": 0.277411550283432, + "learning_rate": 2.789301700109691e-06, + "loss": 0.1323, + "step": 40740 + }, + { + "epoch": 0.7563780293550867, + "grad_norm": 0.3306434452533722, + "learning_rate": 2.788493531653437e-06, + "loss": 0.2587, + "step": 40742 + }, + { + "epoch": 0.7564151594925053, + "grad_norm": 0.3856564462184906, + "learning_rate": 2.7876854613241854e-06, + "loss": 0.4018, + "step": 40744 + }, + { + "epoch": 0.756452289629924, + "grad_norm": 0.20774756371974945, + "learning_rate": 2.786877489132934e-06, + "loss": 0.3185, + "step": 40746 + }, + { + "epoch": 0.7564894197673425, + "grad_norm": 0.3551657199859619, + "learning_rate": 2.786069615090671e-06, + "loss": 0.2058, + "step": 40748 + }, + { + "epoch": 0.7565265499047612, + "grad_norm": 0.4539802074432373, + "learning_rate": 2.785261839208395e-06, + "loss": 0.3063, + "step": 40750 + }, + { + "epoch": 0.7565636800421799, + "grad_norm": 0.3690294921398163, + "learning_rate": 2.7844541614970934e-06, + "loss": 0.2518, + "step": 40752 + }, + { + "epoch": 0.7566008101795985, + "grad_norm": 0.3058300316333771, + "learning_rate": 2.783646581967756e-06, + "loss": 0.2164, + "step": 40754 + }, + { + "epoch": 0.7566379403170171, + "grad_norm": 0.33675098419189453, + "learning_rate": 2.782839100631374e-06, + "loss": 0.3299, + "step": 40756 + }, + { + "epoch": 0.7566750704544357, + "grad_norm": 0.3547455668449402, + "learning_rate": 2.7820317174989354e-06, + "loss": 0.2072, + "step": 40758 + }, + { + "epoch": 0.7567122005918544, + "grad_norm": 0.3308797776699066, + "learning_rate": 2.781224432581423e-06, + "loss": 0.0879, + "step": 40760 + }, + { + "epoch": 0.7567493307292731, + "grad_norm": 0.4448954463005066, + "learning_rate": 2.7804172458898236e-06, + "loss": 0.2873, + "step": 40762 + }, + { + "epoch": 0.7567864608666917, + "grad_norm": 0.2275913506746292, + "learning_rate": 2.77961015743512e-06, + "loss": 0.0451, + "step": 40764 + }, + { + "epoch": 0.7568235910041103, + "grad_norm": 0.46014997363090515, + "learning_rate": 2.7788031672282946e-06, + "loss": 0.2575, + "step": 40766 + }, + { + "epoch": 0.7568607211415289, + "grad_norm": 0.9196771383285522, + "learning_rate": 2.7779962752803315e-06, + "loss": 0.1706, + "step": 40768 + }, + { + "epoch": 0.7568978512789476, + "grad_norm": 0.3335045874118805, + "learning_rate": 2.777189481602203e-06, + "loss": 0.3175, + "step": 40770 + }, + { + "epoch": 0.7569349814163662, + "grad_norm": 0.4726858139038086, + "learning_rate": 2.776382786204894e-06, + "loss": 0.2372, + "step": 40772 + }, + { + "epoch": 0.7569721115537849, + "grad_norm": 0.35630112886428833, + "learning_rate": 2.775576189099375e-06, + "loss": 0.2678, + "step": 40774 + }, + { + "epoch": 0.7570092416912035, + "grad_norm": 0.49457061290740967, + "learning_rate": 2.7747696902966246e-06, + "loss": 0.2602, + "step": 40776 + }, + { + "epoch": 0.7570463718286221, + "grad_norm": 0.48716500401496887, + "learning_rate": 2.7739632898076164e-06, + "loss": 0.327, + "step": 40778 + }, + { + "epoch": 0.7570835019660408, + "grad_norm": 0.3364877998828888, + "learning_rate": 2.773156987643324e-06, + "loss": 0.1637, + "step": 40780 + }, + { + "epoch": 0.7571206321034594, + "grad_norm": 0.38432666659355164, + "learning_rate": 2.7723507838147167e-06, + "loss": 0.3638, + "step": 40782 + }, + { + "epoch": 0.757157762240878, + "grad_norm": 0.29207751154899597, + "learning_rate": 2.7715446783327706e-06, + "loss": 0.1658, + "step": 40784 + }, + { + "epoch": 0.7571948923782967, + "grad_norm": 0.47117847204208374, + "learning_rate": 2.770738671208446e-06, + "loss": 0.4867, + "step": 40786 + }, + { + "epoch": 0.7572320225157153, + "grad_norm": 0.24877022206783295, + "learning_rate": 2.769932762452714e-06, + "loss": 0.1501, + "step": 40788 + }, + { + "epoch": 0.757269152653134, + "grad_norm": 0.6729599833488464, + "learning_rate": 2.769126952076543e-06, + "loss": 0.3526, + "step": 40790 + }, + { + "epoch": 0.7573062827905526, + "grad_norm": 0.6289013028144836, + "learning_rate": 2.768321240090892e-06, + "loss": 0.3321, + "step": 40792 + }, + { + "epoch": 0.7573434129279712, + "grad_norm": 0.5748385190963745, + "learning_rate": 2.767515626506728e-06, + "loss": 0.3446, + "step": 40794 + }, + { + "epoch": 0.7573805430653899, + "grad_norm": 0.34937381744384766, + "learning_rate": 2.766710111335017e-06, + "loss": 0.1764, + "step": 40796 + }, + { + "epoch": 0.7574176732028085, + "grad_norm": 0.416733980178833, + "learning_rate": 2.765904694586711e-06, + "loss": 0.1628, + "step": 40798 + }, + { + "epoch": 0.7574548033402272, + "grad_norm": 0.2753572463989258, + "learning_rate": 2.765099376272773e-06, + "loss": 0.2028, + "step": 40800 + }, + { + "epoch": 0.7574919334776458, + "grad_norm": 0.27628135681152344, + "learning_rate": 2.7642941564041613e-06, + "loss": 0.3271, + "step": 40802 + }, + { + "epoch": 0.7575290636150644, + "grad_norm": 0.4308212995529175, + "learning_rate": 2.763489034991833e-06, + "loss": 0.2245, + "step": 40804 + }, + { + "epoch": 0.7575661937524831, + "grad_norm": 0.381976455450058, + "learning_rate": 2.7626840120467434e-06, + "loss": 0.1739, + "step": 40806 + }, + { + "epoch": 0.7576033238899017, + "grad_norm": 0.49038568139076233, + "learning_rate": 2.7618790875798498e-06, + "loss": 0.3273, + "step": 40808 + }, + { + "epoch": 0.7576404540273204, + "grad_norm": 0.2933395802974701, + "learning_rate": 2.761074261602097e-06, + "loss": 0.2748, + "step": 40810 + }, + { + "epoch": 0.757677584164739, + "grad_norm": 0.3352925777435303, + "learning_rate": 2.76026953412444e-06, + "loss": 0.156, + "step": 40812 + }, + { + "epoch": 0.7577147143021576, + "grad_norm": 0.5246044397354126, + "learning_rate": 2.7594649051578337e-06, + "loss": 0.3903, + "step": 40814 + }, + { + "epoch": 0.7577518444395762, + "grad_norm": 0.3429300785064697, + "learning_rate": 2.758660374713218e-06, + "loss": 0.3326, + "step": 40816 + }, + { + "epoch": 0.7577889745769949, + "grad_norm": 0.3924695551395416, + "learning_rate": 2.757855942801544e-06, + "loss": 0.2355, + "step": 40818 + }, + { + "epoch": 0.7578261047144136, + "grad_norm": 0.24280685186386108, + "learning_rate": 2.7570516094337583e-06, + "loss": 0.2993, + "step": 40820 + }, + { + "epoch": 0.7578632348518322, + "grad_norm": 0.3577711880207062, + "learning_rate": 2.7562473746208083e-06, + "loss": 0.3782, + "step": 40822 + }, + { + "epoch": 0.7579003649892508, + "grad_norm": 2.0443711280822754, + "learning_rate": 2.755443238373632e-06, + "loss": 0.4801, + "step": 40824 + }, + { + "epoch": 0.7579374951266694, + "grad_norm": 0.4994984567165375, + "learning_rate": 2.7546392007031718e-06, + "loss": 0.2256, + "step": 40826 + }, + { + "epoch": 0.7579746252640881, + "grad_norm": 0.38579806685447693, + "learning_rate": 2.75383526162037e-06, + "loss": 0.3012, + "step": 40828 + }, + { + "epoch": 0.7580117554015068, + "grad_norm": 0.29628825187683105, + "learning_rate": 2.7530314211361653e-06, + "loss": 0.1733, + "step": 40830 + }, + { + "epoch": 0.7580488855389254, + "grad_norm": 0.37760624289512634, + "learning_rate": 2.7522276792614988e-06, + "loss": 0.2721, + "step": 40832 + }, + { + "epoch": 0.758086015676344, + "grad_norm": 0.5267041325569153, + "learning_rate": 2.751424036007302e-06, + "loss": 0.1598, + "step": 40834 + }, + { + "epoch": 0.7581231458137626, + "grad_norm": 0.49770599603652954, + "learning_rate": 2.7506204913845134e-06, + "loss": 0.3423, + "step": 40836 + }, + { + "epoch": 0.7581602759511813, + "grad_norm": 0.3012617230415344, + "learning_rate": 2.749817045404064e-06, + "loss": 0.1593, + "step": 40838 + }, + { + "epoch": 0.7581974060886, + "grad_norm": 0.1653476506471634, + "learning_rate": 2.7490136980768867e-06, + "loss": 0.1664, + "step": 40840 + }, + { + "epoch": 0.7582345362260186, + "grad_norm": 0.26005804538726807, + "learning_rate": 2.748210449413915e-06, + "loss": 0.335, + "step": 40842 + }, + { + "epoch": 0.7582716663634372, + "grad_norm": 0.463040828704834, + "learning_rate": 2.747407299426076e-06, + "loss": 0.2482, + "step": 40844 + }, + { + "epoch": 0.7583087965008558, + "grad_norm": 0.8196528553962708, + "learning_rate": 2.7466042481243036e-06, + "loss": 0.1671, + "step": 40846 + }, + { + "epoch": 0.7583459266382745, + "grad_norm": 0.3480672538280487, + "learning_rate": 2.745801295519518e-06, + "loss": 0.2907, + "step": 40848 + }, + { + "epoch": 0.7583830567756932, + "grad_norm": 0.4572582542896271, + "learning_rate": 2.7449984416226473e-06, + "loss": 0.259, + "step": 40850 + }, + { + "epoch": 0.7584201869131117, + "grad_norm": 0.2905207872390747, + "learning_rate": 2.7441956864446175e-06, + "loss": 0.2163, + "step": 40852 + }, + { + "epoch": 0.7584573170505304, + "grad_norm": 0.2966296970844269, + "learning_rate": 2.743393029996353e-06, + "loss": 0.2132, + "step": 40854 + }, + { + "epoch": 0.758494447187949, + "grad_norm": 0.5910990238189697, + "learning_rate": 2.7425904722887696e-06, + "loss": 0.2204, + "step": 40856 + }, + { + "epoch": 0.7585315773253677, + "grad_norm": 0.3515777587890625, + "learning_rate": 2.7417880133327922e-06, + "loss": 0.3988, + "step": 40858 + }, + { + "epoch": 0.7585687074627864, + "grad_norm": 0.43112078309059143, + "learning_rate": 2.740985653139343e-06, + "loss": 0.4316, + "step": 40860 + }, + { + "epoch": 0.758605837600205, + "grad_norm": 0.4284381866455078, + "learning_rate": 2.740183391719332e-06, + "loss": 0.4098, + "step": 40862 + }, + { + "epoch": 0.7586429677376236, + "grad_norm": 0.4038177728652954, + "learning_rate": 2.7393812290836784e-06, + "loss": 0.3272, + "step": 40864 + }, + { + "epoch": 0.7586800978750422, + "grad_norm": 0.3926438093185425, + "learning_rate": 2.7385791652433e-06, + "loss": 0.3342, + "step": 40866 + }, + { + "epoch": 0.7587172280124609, + "grad_norm": 0.5182461738586426, + "learning_rate": 2.7377772002091076e-06, + "loss": 0.3247, + "step": 40868 + }, + { + "epoch": 0.7587543581498795, + "grad_norm": 0.6056269407272339, + "learning_rate": 2.7369753339920145e-06, + "loss": 0.2858, + "step": 40870 + }, + { + "epoch": 0.7587914882872981, + "grad_norm": 0.5373923778533936, + "learning_rate": 2.7361735666029356e-06, + "loss": 0.506, + "step": 40872 + }, + { + "epoch": 0.7588286184247168, + "grad_norm": 0.4367193281650543, + "learning_rate": 2.735371898052773e-06, + "loss": 0.262, + "step": 40874 + }, + { + "epoch": 0.7588657485621354, + "grad_norm": 0.3949589431285858, + "learning_rate": 2.7345703283524428e-06, + "loss": 0.145, + "step": 40876 + }, + { + "epoch": 0.7589028786995541, + "grad_norm": 0.3879443407058716, + "learning_rate": 2.733768857512844e-06, + "loss": 0.2715, + "step": 40878 + }, + { + "epoch": 0.7589400088369727, + "grad_norm": 0.457638680934906, + "learning_rate": 2.732967485544886e-06, + "loss": 0.4456, + "step": 40880 + }, + { + "epoch": 0.7589771389743913, + "grad_norm": 0.40680208802223206, + "learning_rate": 2.732166212459474e-06, + "loss": 0.139, + "step": 40882 + }, + { + "epoch": 0.75901426911181, + "grad_norm": 0.3470662534236908, + "learning_rate": 2.7313650382675127e-06, + "loss": 0.3023, + "step": 40884 + }, + { + "epoch": 0.7590513992492286, + "grad_norm": 0.34856969118118286, + "learning_rate": 2.7305639629798986e-06, + "loss": 0.1068, + "step": 40886 + }, + { + "epoch": 0.7590885293866473, + "grad_norm": 0.3986126780509949, + "learning_rate": 2.729762986607534e-06, + "loss": 0.3204, + "step": 40888 + }, + { + "epoch": 0.7591256595240659, + "grad_norm": 0.32615989446640015, + "learning_rate": 2.7289621091613184e-06, + "loss": 0.1, + "step": 40890 + }, + { + "epoch": 0.7591627896614845, + "grad_norm": 0.3104472756385803, + "learning_rate": 2.7281613306521494e-06, + "loss": 0.2878, + "step": 40892 + }, + { + "epoch": 0.7591999197989032, + "grad_norm": 0.4942153990268707, + "learning_rate": 2.7273606510909222e-06, + "loss": 0.183, + "step": 40894 + }, + { + "epoch": 0.7592370499363218, + "grad_norm": 0.37183234095573425, + "learning_rate": 2.7265600704885365e-06, + "loss": 0.3119, + "step": 40896 + }, + { + "epoch": 0.7592741800737405, + "grad_norm": 0.30014580488204956, + "learning_rate": 2.725759588855882e-06, + "loss": 0.3464, + "step": 40898 + }, + { + "epoch": 0.759311310211159, + "grad_norm": 0.33586806058883667, + "learning_rate": 2.7249592062038467e-06, + "loss": 0.1825, + "step": 40900 + }, + { + "epoch": 0.7593484403485777, + "grad_norm": 0.23866114020347595, + "learning_rate": 2.724158922543325e-06, + "loss": 0.1541, + "step": 40902 + }, + { + "epoch": 0.7593855704859964, + "grad_norm": 0.3009186089038849, + "learning_rate": 2.7233587378852076e-06, + "loss": 0.1852, + "step": 40904 + }, + { + "epoch": 0.759422700623415, + "grad_norm": 0.4748780131340027, + "learning_rate": 2.722558652240381e-06, + "loss": 0.1761, + "step": 40906 + }, + { + "epoch": 0.7594598307608337, + "grad_norm": 0.3828006982803345, + "learning_rate": 2.7217586656197336e-06, + "loss": 0.389, + "step": 40908 + }, + { + "epoch": 0.7594969608982522, + "grad_norm": 0.3322638273239136, + "learning_rate": 2.720958778034153e-06, + "loss": 0.159, + "step": 40910 + }, + { + "epoch": 0.7595340910356709, + "grad_norm": 0.5305048227310181, + "learning_rate": 2.720158989494517e-06, + "loss": 0.2746, + "step": 40912 + }, + { + "epoch": 0.7595712211730896, + "grad_norm": 0.3397318422794342, + "learning_rate": 2.71935930001171e-06, + "loss": 0.2252, + "step": 40914 + }, + { + "epoch": 0.7596083513105082, + "grad_norm": 0.5906352996826172, + "learning_rate": 2.7185597095966165e-06, + "loss": 0.2785, + "step": 40916 + }, + { + "epoch": 0.7596454814479269, + "grad_norm": 0.43257367610931396, + "learning_rate": 2.717760218260117e-06, + "loss": 0.5038, + "step": 40918 + }, + { + "epoch": 0.7596826115853454, + "grad_norm": 0.27028948068618774, + "learning_rate": 2.7169608260130862e-06, + "loss": 0.2928, + "step": 40920 + }, + { + "epoch": 0.7597197417227641, + "grad_norm": 0.44712451100349426, + "learning_rate": 2.716161532866406e-06, + "loss": 0.2535, + "step": 40922 + }, + { + "epoch": 0.7597568718601827, + "grad_norm": 0.3930681049823761, + "learning_rate": 2.715362338830946e-06, + "loss": 0.4347, + "step": 40924 + }, + { + "epoch": 0.7597940019976014, + "grad_norm": 0.3414361774921417, + "learning_rate": 2.7145632439175853e-06, + "loss": 0.2947, + "step": 40926 + }, + { + "epoch": 0.7598311321350201, + "grad_norm": 0.4201281666755676, + "learning_rate": 2.7137642481371953e-06, + "loss": 0.1969, + "step": 40928 + }, + { + "epoch": 0.7598682622724386, + "grad_norm": 0.4431511461734772, + "learning_rate": 2.7129653515006506e-06, + "loss": 0.1672, + "step": 40930 + }, + { + "epoch": 0.7599053924098573, + "grad_norm": 0.4840647280216217, + "learning_rate": 2.712166554018819e-06, + "loss": 0.2179, + "step": 40932 + }, + { + "epoch": 0.7599425225472759, + "grad_norm": 0.3678118586540222, + "learning_rate": 2.711367855702575e-06, + "loss": 0.2847, + "step": 40934 + }, + { + "epoch": 0.7599796526846946, + "grad_norm": 0.3222925364971161, + "learning_rate": 2.7105692565627782e-06, + "loss": 0.3016, + "step": 40936 + }, + { + "epoch": 0.7600167828221133, + "grad_norm": 0.28203877806663513, + "learning_rate": 2.7097707566103005e-06, + "loss": 0.2516, + "step": 40938 + }, + { + "epoch": 0.7600539129595318, + "grad_norm": 0.42676109075546265, + "learning_rate": 2.708972355856009e-06, + "loss": 0.3502, + "step": 40940 + }, + { + "epoch": 0.7600910430969505, + "grad_norm": 0.5906369686126709, + "learning_rate": 2.7081740543107614e-06, + "loss": 0.2216, + "step": 40942 + }, + { + "epoch": 0.7601281732343691, + "grad_norm": 0.2692893445491791, + "learning_rate": 2.707375851985423e-06, + "loss": 0.2279, + "step": 40944 + }, + { + "epoch": 0.7601653033717878, + "grad_norm": 0.5732405781745911, + "learning_rate": 2.706577748890856e-06, + "loss": 0.2457, + "step": 40946 + }, + { + "epoch": 0.7602024335092065, + "grad_norm": 0.25625014305114746, + "learning_rate": 2.7057797450379218e-06, + "loss": 0.3518, + "step": 40948 + }, + { + "epoch": 0.760239563646625, + "grad_norm": 0.48644208908081055, + "learning_rate": 2.704981840437474e-06, + "loss": 0.3543, + "step": 40950 + }, + { + "epoch": 0.7602766937840437, + "grad_norm": 0.57731693983078, + "learning_rate": 2.7041840351003722e-06, + "loss": 0.1213, + "step": 40952 + }, + { + "epoch": 0.7603138239214623, + "grad_norm": 0.8326742649078369, + "learning_rate": 2.703386329037473e-06, + "loss": 0.3231, + "step": 40954 + }, + { + "epoch": 0.760350954058881, + "grad_norm": 0.2908480167388916, + "learning_rate": 2.702588722259628e-06, + "loss": 0.3126, + "step": 40956 + }, + { + "epoch": 0.7603880841962997, + "grad_norm": 0.29514217376708984, + "learning_rate": 2.7017912147776938e-06, + "loss": 0.2398, + "step": 40958 + }, + { + "epoch": 0.7604252143337182, + "grad_norm": 0.4586414694786072, + "learning_rate": 2.700993806602523e-06, + "loss": 0.3116, + "step": 40960 + }, + { + "epoch": 0.7604623444711369, + "grad_norm": 0.31312039494514465, + "learning_rate": 2.7001964977449637e-06, + "loss": 0.3059, + "step": 40962 + }, + { + "epoch": 0.7604994746085555, + "grad_norm": 0.3502884805202484, + "learning_rate": 2.6993992882158616e-06, + "loss": 0.3452, + "step": 40964 + }, + { + "epoch": 0.7605366047459742, + "grad_norm": 0.469490647315979, + "learning_rate": 2.6986021780260674e-06, + "loss": 0.2631, + "step": 40966 + }, + { + "epoch": 0.7605737348833927, + "grad_norm": 0.3721438944339752, + "learning_rate": 2.697805167186427e-06, + "loss": 0.3057, + "step": 40968 + }, + { + "epoch": 0.7606108650208114, + "grad_norm": 0.20231510698795319, + "learning_rate": 2.697008255707786e-06, + "loss": 0.2964, + "step": 40970 + }, + { + "epoch": 0.7606479951582301, + "grad_norm": 0.38106149435043335, + "learning_rate": 2.69621144360099e-06, + "loss": 0.2287, + "step": 40972 + }, + { + "epoch": 0.7606851252956487, + "grad_norm": 0.36628153920173645, + "learning_rate": 2.6954147308768764e-06, + "loss": 0.3927, + "step": 40974 + }, + { + "epoch": 0.7607222554330674, + "grad_norm": 0.4119666814804077, + "learning_rate": 2.6946181175462884e-06, + "loss": 0.2832, + "step": 40976 + }, + { + "epoch": 0.760759385570486, + "grad_norm": 0.649245023727417, + "learning_rate": 2.693821603620066e-06, + "loss": 0.2484, + "step": 40978 + }, + { + "epoch": 0.7607965157079046, + "grad_norm": 0.3675916790962219, + "learning_rate": 2.6930251891090463e-06, + "loss": 0.279, + "step": 40980 + }, + { + "epoch": 0.7608336458453233, + "grad_norm": 0.35791996121406555, + "learning_rate": 2.692228874024071e-06, + "loss": 0.2104, + "step": 40982 + }, + { + "epoch": 0.7608707759827419, + "grad_norm": 0.26106682419776917, + "learning_rate": 2.6914326583759677e-06, + "loss": 0.3992, + "step": 40984 + }, + { + "epoch": 0.7609079061201606, + "grad_norm": 0.37780362367630005, + "learning_rate": 2.6906365421755763e-06, + "loss": 0.3532, + "step": 40986 + }, + { + "epoch": 0.7609450362575791, + "grad_norm": 0.35651278495788574, + "learning_rate": 2.6898405254337258e-06, + "loss": 0.2441, + "step": 40988 + }, + { + "epoch": 0.7609821663949978, + "grad_norm": 0.4947573244571686, + "learning_rate": 2.689044608161249e-06, + "loss": 0.3364, + "step": 40990 + }, + { + "epoch": 0.7610192965324165, + "grad_norm": 0.7682983875274658, + "learning_rate": 2.6882487903689757e-06, + "loss": 0.252, + "step": 40992 + }, + { + "epoch": 0.7610564266698351, + "grad_norm": 0.18238750100135803, + "learning_rate": 2.687453072067736e-06, + "loss": 0.288, + "step": 40994 + }, + { + "epoch": 0.7610935568072538, + "grad_norm": 0.3088197112083435, + "learning_rate": 2.6866574532683563e-06, + "loss": 0.283, + "step": 40996 + }, + { + "epoch": 0.7611306869446723, + "grad_norm": 0.915503203868866, + "learning_rate": 2.685861933981665e-06, + "loss": 0.2549, + "step": 40998 + }, + { + "epoch": 0.761167817082091, + "grad_norm": 0.5539266467094421, + "learning_rate": 2.6850665142184818e-06, + "loss": 0.2969, + "step": 41000 + }, + { + "epoch": 0.7612049472195097, + "grad_norm": 0.3839748799800873, + "learning_rate": 2.6842711939896325e-06, + "loss": 0.2877, + "step": 41002 + }, + { + "epoch": 0.7612420773569283, + "grad_norm": 0.2746099829673767, + "learning_rate": 2.683475973305942e-06, + "loss": 0.3158, + "step": 41004 + }, + { + "epoch": 0.761279207494347, + "grad_norm": 0.2909497320652008, + "learning_rate": 2.6826808521782256e-06, + "loss": 0.4406, + "step": 41006 + }, + { + "epoch": 0.7613163376317655, + "grad_norm": 0.3750675916671753, + "learning_rate": 2.6818858306173034e-06, + "loss": 0.265, + "step": 41008 + }, + { + "epoch": 0.7613534677691842, + "grad_norm": 0.3192642033100128, + "learning_rate": 2.6810909086339988e-06, + "loss": 0.1916, + "step": 41010 + }, + { + "epoch": 0.7613905979066029, + "grad_norm": 0.3078516721725464, + "learning_rate": 2.6802960862391216e-06, + "loss": 0.3007, + "step": 41012 + }, + { + "epoch": 0.7614277280440215, + "grad_norm": 0.39554136991500854, + "learning_rate": 2.67950136344349e-06, + "loss": 0.4257, + "step": 41014 + }, + { + "epoch": 0.7614648581814402, + "grad_norm": 0.34376537799835205, + "learning_rate": 2.6787067402579158e-06, + "loss": 0.3747, + "step": 41016 + }, + { + "epoch": 0.7615019883188587, + "grad_norm": 0.49329254031181335, + "learning_rate": 2.6779122166932135e-06, + "loss": 0.1303, + "step": 41018 + }, + { + "epoch": 0.7615391184562774, + "grad_norm": 0.46545106172561646, + "learning_rate": 2.677117792760194e-06, + "loss": 0.2855, + "step": 41020 + }, + { + "epoch": 0.761576248593696, + "grad_norm": 0.4448985457420349, + "learning_rate": 2.6763234684696706e-06, + "loss": 0.2375, + "step": 41022 + }, + { + "epoch": 0.7616133787311147, + "grad_norm": 0.752397358417511, + "learning_rate": 2.6755292438324444e-06, + "loss": 0.1571, + "step": 41024 + }, + { + "epoch": 0.7616505088685334, + "grad_norm": 0.22327204048633575, + "learning_rate": 2.6747351188593286e-06, + "loss": 0.0794, + "step": 41026 + }, + { + "epoch": 0.7616876390059519, + "grad_norm": 0.41629987955093384, + "learning_rate": 2.673941093561123e-06, + "loss": 0.2194, + "step": 41028 + }, + { + "epoch": 0.7617247691433706, + "grad_norm": 0.47478723526000977, + "learning_rate": 2.673147167948634e-06, + "loss": 0.4039, + "step": 41030 + }, + { + "epoch": 0.7617618992807892, + "grad_norm": 0.5064082741737366, + "learning_rate": 2.6723533420326675e-06, + "loss": 0.2813, + "step": 41032 + }, + { + "epoch": 0.7617990294182079, + "grad_norm": 0.48901456594467163, + "learning_rate": 2.671559615824022e-06, + "loss": 0.2156, + "step": 41034 + }, + { + "epoch": 0.7618361595556266, + "grad_norm": 0.5695748925209045, + "learning_rate": 2.6707659893335024e-06, + "loss": 0.2217, + "step": 41036 + }, + { + "epoch": 0.7618732896930451, + "grad_norm": 0.45571279525756836, + "learning_rate": 2.6699724625719015e-06, + "loss": 0.3061, + "step": 41038 + }, + { + "epoch": 0.7619104198304638, + "grad_norm": 0.3587796986103058, + "learning_rate": 2.6691790355500192e-06, + "loss": 0.3659, + "step": 41040 + }, + { + "epoch": 0.7619475499678824, + "grad_norm": 0.2791910767555237, + "learning_rate": 2.6683857082786514e-06, + "loss": 0.0489, + "step": 41042 + }, + { + "epoch": 0.7619846801053011, + "grad_norm": 0.4093324542045593, + "learning_rate": 2.667592480768596e-06, + "loss": 0.2645, + "step": 41044 + }, + { + "epoch": 0.7620218102427198, + "grad_norm": 0.4447821080684662, + "learning_rate": 2.666799353030641e-06, + "loss": 0.5226, + "step": 41046 + }, + { + "epoch": 0.7620589403801383, + "grad_norm": 0.4293154776096344, + "learning_rate": 2.6660063250755853e-06, + "loss": 0.1619, + "step": 41048 + }, + { + "epoch": 0.762096070517557, + "grad_norm": 0.2621178925037384, + "learning_rate": 2.6652133969142126e-06, + "loss": 0.2014, + "step": 41050 + }, + { + "epoch": 0.7621332006549756, + "grad_norm": 0.5110308527946472, + "learning_rate": 2.664420568557314e-06, + "loss": 0.3006, + "step": 41052 + }, + { + "epoch": 0.7621703307923943, + "grad_norm": 0.8813275694847107, + "learning_rate": 2.6636278400156803e-06, + "loss": 0.4204, + "step": 41054 + }, + { + "epoch": 0.762207460929813, + "grad_norm": 0.26043567061424255, + "learning_rate": 2.6628352113000954e-06, + "loss": 0.1976, + "step": 41056 + }, + { + "epoch": 0.7622445910672315, + "grad_norm": 0.4937804341316223, + "learning_rate": 2.6620426824213473e-06, + "loss": 0.2605, + "step": 41058 + }, + { + "epoch": 0.7622817212046502, + "grad_norm": 1.9092246294021606, + "learning_rate": 2.6612502533902206e-06, + "loss": 0.2245, + "step": 41060 + }, + { + "epoch": 0.7623188513420688, + "grad_norm": 0.40777167677879333, + "learning_rate": 2.660457924217493e-06, + "loss": 0.2333, + "step": 41062 + }, + { + "epoch": 0.7623559814794875, + "grad_norm": 0.47166988253593445, + "learning_rate": 2.659665694913949e-06, + "loss": 0.1921, + "step": 41064 + }, + { + "epoch": 0.7623931116169061, + "grad_norm": 0.41206973791122437, + "learning_rate": 2.6588735654903675e-06, + "loss": 0.3573, + "step": 41066 + }, + { + "epoch": 0.7624302417543247, + "grad_norm": 0.3124183118343353, + "learning_rate": 2.6580815359575306e-06, + "loss": 0.1857, + "step": 41068 + }, + { + "epoch": 0.7624673718917434, + "grad_norm": 0.45635151863098145, + "learning_rate": 2.65728960632621e-06, + "loss": 0.3106, + "step": 41070 + }, + { + "epoch": 0.762504502029162, + "grad_norm": 0.5393530130386353, + "learning_rate": 2.656497776607183e-06, + "loss": 0.2173, + "step": 41072 + }, + { + "epoch": 0.7625416321665807, + "grad_norm": 0.4308006465435028, + "learning_rate": 2.6557060468112284e-06, + "loss": 0.2219, + "step": 41074 + }, + { + "epoch": 0.7625787623039992, + "grad_norm": 0.14382848143577576, + "learning_rate": 2.6549144169491135e-06, + "loss": 0.3323, + "step": 41076 + }, + { + "epoch": 0.7626158924414179, + "grad_norm": 0.4609472155570984, + "learning_rate": 2.6541228870316115e-06, + "loss": 0.3864, + "step": 41078 + }, + { + "epoch": 0.7626530225788366, + "grad_norm": 0.45031967759132385, + "learning_rate": 2.6533314570694933e-06, + "loss": 0.4156, + "step": 41080 + }, + { + "epoch": 0.7626901527162552, + "grad_norm": 0.4361385107040405, + "learning_rate": 2.6525401270735284e-06, + "loss": 0.1051, + "step": 41082 + }, + { + "epoch": 0.7627272828536739, + "grad_norm": 0.4677934944629669, + "learning_rate": 2.6517488970544847e-06, + "loss": 0.2938, + "step": 41084 + }, + { + "epoch": 0.7627644129910924, + "grad_norm": 0.5199944376945496, + "learning_rate": 2.6509577670231303e-06, + "loss": 0.3326, + "step": 41086 + }, + { + "epoch": 0.7628015431285111, + "grad_norm": 0.2940211296081543, + "learning_rate": 2.6501667369902252e-06, + "loss": 0.3922, + "step": 41088 + }, + { + "epoch": 0.7628386732659298, + "grad_norm": 0.4243672788143158, + "learning_rate": 2.649375806966539e-06, + "loss": 0.3212, + "step": 41090 + }, + { + "epoch": 0.7628758034033484, + "grad_norm": 0.3078504502773285, + "learning_rate": 2.648584976962826e-06, + "loss": 0.1186, + "step": 41092 + }, + { + "epoch": 0.7629129335407671, + "grad_norm": 0.35507500171661377, + "learning_rate": 2.647794246989852e-06, + "loss": 0.2292, + "step": 41094 + }, + { + "epoch": 0.7629500636781856, + "grad_norm": 0.36059433221817017, + "learning_rate": 2.6470036170583768e-06, + "loss": 0.2498, + "step": 41096 + }, + { + "epoch": 0.7629871938156043, + "grad_norm": 0.48841819167137146, + "learning_rate": 2.6462130871791592e-06, + "loss": 0.3218, + "step": 41098 + }, + { + "epoch": 0.763024323953023, + "grad_norm": 0.3681288957595825, + "learning_rate": 2.645422657362953e-06, + "loss": 0.2215, + "step": 41100 + }, + { + "epoch": 0.7630614540904416, + "grad_norm": 0.20018750429153442, + "learning_rate": 2.644632327620513e-06, + "loss": 0.1489, + "step": 41102 + }, + { + "epoch": 0.7630985842278603, + "grad_norm": 0.2612859904766083, + "learning_rate": 2.6438420979625967e-06, + "loss": 0.1815, + "step": 41104 + }, + { + "epoch": 0.7631357143652788, + "grad_norm": 0.4553103446960449, + "learning_rate": 2.6430519683999545e-06, + "loss": 0.1839, + "step": 41106 + }, + { + "epoch": 0.7631728445026975, + "grad_norm": 0.46828901767730713, + "learning_rate": 2.6422619389433413e-06, + "loss": 0.3429, + "step": 41108 + }, + { + "epoch": 0.7632099746401162, + "grad_norm": 0.290111780166626, + "learning_rate": 2.641472009603501e-06, + "loss": 0.2655, + "step": 41110 + }, + { + "epoch": 0.7632471047775348, + "grad_norm": 0.3366932272911072, + "learning_rate": 2.6406821803911887e-06, + "loss": 0.2062, + "step": 41112 + }, + { + "epoch": 0.7632842349149535, + "grad_norm": 0.6364966034889221, + "learning_rate": 2.6398924513171454e-06, + "loss": 0.323, + "step": 41114 + }, + { + "epoch": 0.763321365052372, + "grad_norm": 0.5466961860656738, + "learning_rate": 2.639102822392119e-06, + "loss": 0.3669, + "step": 41116 + }, + { + "epoch": 0.7633584951897907, + "grad_norm": 0.32155272364616394, + "learning_rate": 2.6383132936268553e-06, + "loss": 0.1624, + "step": 41118 + }, + { + "epoch": 0.7633956253272093, + "grad_norm": 0.49891331791877747, + "learning_rate": 2.637523865032097e-06, + "loss": 0.3506, + "step": 41120 + }, + { + "epoch": 0.763432755464628, + "grad_norm": 0.4251953959465027, + "learning_rate": 2.636734536618586e-06, + "loss": 0.178, + "step": 41122 + }, + { + "epoch": 0.7634698856020466, + "grad_norm": 0.45255738496780396, + "learning_rate": 2.635945308397064e-06, + "loss": 0.3099, + "step": 41124 + }, + { + "epoch": 0.7635070157394652, + "grad_norm": 0.6898943185806274, + "learning_rate": 2.6351561803782668e-06, + "loss": 0.3141, + "step": 41126 + }, + { + "epoch": 0.7635441458768839, + "grad_norm": 0.3628913462162018, + "learning_rate": 2.634367152572933e-06, + "loss": 0.1773, + "step": 41128 + }, + { + "epoch": 0.7635812760143025, + "grad_norm": 0.38953152298927307, + "learning_rate": 2.633578224991803e-06, + "loss": 0.2982, + "step": 41130 + }, + { + "epoch": 0.7636184061517212, + "grad_norm": 10.194908142089844, + "learning_rate": 2.6327893976456055e-06, + "loss": 0.1907, + "step": 41132 + }, + { + "epoch": 0.7636555362891398, + "grad_norm": 0.33288565278053284, + "learning_rate": 2.632000670545077e-06, + "loss": 0.2028, + "step": 41134 + }, + { + "epoch": 0.7636926664265584, + "grad_norm": 0.45828086137771606, + "learning_rate": 2.6312120437009524e-06, + "loss": 0.1836, + "step": 41136 + }, + { + "epoch": 0.7637297965639771, + "grad_norm": 0.5351374745368958, + "learning_rate": 2.630423517123957e-06, + "loss": 0.3413, + "step": 41138 + }, + { + "epoch": 0.7637669267013957, + "grad_norm": 0.5626407265663147, + "learning_rate": 2.6296350908248227e-06, + "loss": 0.1455, + "step": 41140 + }, + { + "epoch": 0.7638040568388144, + "grad_norm": 0.30331602692604065, + "learning_rate": 2.628846764814278e-06, + "loss": 0.1756, + "step": 41142 + }, + { + "epoch": 0.763841186976233, + "grad_norm": 0.32940930128097534, + "learning_rate": 2.62805853910305e-06, + "loss": 0.1913, + "step": 41144 + }, + { + "epoch": 0.7638783171136516, + "grad_norm": 0.4012812674045563, + "learning_rate": 2.6272704137018646e-06, + "loss": 0.3143, + "step": 41146 + }, + { + "epoch": 0.7639154472510703, + "grad_norm": 0.4202346205711365, + "learning_rate": 2.626482388621443e-06, + "loss": 0.2092, + "step": 41148 + }, + { + "epoch": 0.7639525773884889, + "grad_norm": 0.5187693238258362, + "learning_rate": 2.6256944638725147e-06, + "loss": 0.096, + "step": 41150 + }, + { + "epoch": 0.7639897075259076, + "grad_norm": 0.7458369135856628, + "learning_rate": 2.624906639465795e-06, + "loss": 0.3796, + "step": 41152 + }, + { + "epoch": 0.7640268376633262, + "grad_norm": 0.33917543292045593, + "learning_rate": 2.624118915412003e-06, + "loss": 0.3203, + "step": 41154 + }, + { + "epoch": 0.7640639678007448, + "grad_norm": 0.34963271021842957, + "learning_rate": 2.623331291721858e-06, + "loss": 0.3116, + "step": 41156 + }, + { + "epoch": 0.7641010979381635, + "grad_norm": 0.33579564094543457, + "learning_rate": 2.6225437684060793e-06, + "loss": 0.3612, + "step": 41158 + }, + { + "epoch": 0.7641382280755821, + "grad_norm": 0.2659354507923126, + "learning_rate": 2.6217563454753815e-06, + "loss": 0.1381, + "step": 41160 + }, + { + "epoch": 0.7641753582130008, + "grad_norm": 0.7723900079727173, + "learning_rate": 2.6209690229404828e-06, + "loss": 0.3926, + "step": 41162 + }, + { + "epoch": 0.7642124883504194, + "grad_norm": 0.2998749017715454, + "learning_rate": 2.6201818008120894e-06, + "loss": 0.3617, + "step": 41164 + }, + { + "epoch": 0.764249618487838, + "grad_norm": 0.2181549072265625, + "learning_rate": 2.6193946791009173e-06, + "loss": 0.3556, + "step": 41166 + }, + { + "epoch": 0.7642867486252567, + "grad_norm": 0.2983764111995697, + "learning_rate": 2.6186076578176756e-06, + "loss": 0.253, + "step": 41168 + }, + { + "epoch": 0.7643238787626753, + "grad_norm": 0.38671931624412537, + "learning_rate": 2.6178207369730736e-06, + "loss": 0.2498, + "step": 41170 + }, + { + "epoch": 0.764361008900094, + "grad_norm": 0.29907190799713135, + "learning_rate": 2.6170339165778225e-06, + "loss": 0.2993, + "step": 41172 + }, + { + "epoch": 0.7643981390375125, + "grad_norm": 0.44874662160873413, + "learning_rate": 2.6162471966426216e-06, + "loss": 0.2429, + "step": 41174 + }, + { + "epoch": 0.7644352691749312, + "grad_norm": 0.3117087483406067, + "learning_rate": 2.6154605771781838e-06, + "loss": 0.2427, + "step": 41176 + }, + { + "epoch": 0.7644723993123499, + "grad_norm": 0.3766685426235199, + "learning_rate": 2.6146740581952045e-06, + "loss": 0.2231, + "step": 41178 + }, + { + "epoch": 0.7645095294497685, + "grad_norm": 0.44768568873405457, + "learning_rate": 2.61388763970439e-06, + "loss": 0.3214, + "step": 41180 + }, + { + "epoch": 0.7645466595871871, + "grad_norm": 0.3736570179462433, + "learning_rate": 2.6131013217164404e-06, + "loss": 0.2873, + "step": 41182 + }, + { + "epoch": 0.7645837897246057, + "grad_norm": 0.3789091408252716, + "learning_rate": 2.612315104242056e-06, + "loss": 0.16, + "step": 41184 + }, + { + "epoch": 0.7646209198620244, + "grad_norm": 0.3238079845905304, + "learning_rate": 2.6115289872919334e-06, + "loss": 0.3223, + "step": 41186 + }, + { + "epoch": 0.7646580499994431, + "grad_norm": 0.4763450026512146, + "learning_rate": 2.610742970876774e-06, + "loss": 0.1994, + "step": 41188 + }, + { + "epoch": 0.7646951801368617, + "grad_norm": 0.3456266522407532, + "learning_rate": 2.609957055007266e-06, + "loss": 0.1561, + "step": 41190 + }, + { + "epoch": 0.7647323102742803, + "grad_norm": 0.4724782407283783, + "learning_rate": 2.609171239694107e-06, + "loss": 0.1384, + "step": 41192 + }, + { + "epoch": 0.7647694404116989, + "grad_norm": 0.4341837465763092, + "learning_rate": 2.608385524947993e-06, + "loss": 0.382, + "step": 41194 + }, + { + "epoch": 0.7648065705491176, + "grad_norm": 0.3822661340236664, + "learning_rate": 2.6075999107796067e-06, + "loss": 0.2093, + "step": 41196 + }, + { + "epoch": 0.7648437006865363, + "grad_norm": 0.3665282130241394, + "learning_rate": 2.606814397199644e-06, + "loss": 0.2275, + "step": 41198 + }, + { + "epoch": 0.7648808308239549, + "grad_norm": 0.40559834241867065, + "learning_rate": 2.6060289842187956e-06, + "loss": 0.3496, + "step": 41200 + }, + { + "epoch": 0.7649179609613735, + "grad_norm": 0.40877243876457214, + "learning_rate": 2.6052436718477413e-06, + "loss": 0.2161, + "step": 41202 + }, + { + "epoch": 0.7649550910987921, + "grad_norm": 0.3546595871448517, + "learning_rate": 2.6044584600971723e-06, + "loss": 0.2223, + "step": 41204 + }, + { + "epoch": 0.7649922212362108, + "grad_norm": 0.4898965060710907, + "learning_rate": 2.60367334897777e-06, + "loss": 0.3304, + "step": 41206 + }, + { + "epoch": 0.7650293513736295, + "grad_norm": 0.6012561917304993, + "learning_rate": 2.602888338500219e-06, + "loss": 0.1831, + "step": 41208 + }, + { + "epoch": 0.7650664815110481, + "grad_norm": 0.3670376241207123, + "learning_rate": 2.602103428675201e-06, + "loss": 0.297, + "step": 41210 + }, + { + "epoch": 0.7651036116484667, + "grad_norm": 0.3435477912425995, + "learning_rate": 2.601318619513399e-06, + "loss": 0.1587, + "step": 41212 + }, + { + "epoch": 0.7651407417858853, + "grad_norm": 0.25566667318344116, + "learning_rate": 2.6005339110254866e-06, + "loss": 0.2079, + "step": 41214 + }, + { + "epoch": 0.765177871923304, + "grad_norm": 0.3421875238418579, + "learning_rate": 2.5997493032221455e-06, + "loss": 0.4131, + "step": 41216 + }, + { + "epoch": 0.7652150020607227, + "grad_norm": 0.393698126077652, + "learning_rate": 2.5989647961140485e-06, + "loss": 0.3347, + "step": 41218 + }, + { + "epoch": 0.7652521321981413, + "grad_norm": 0.43134650588035583, + "learning_rate": 2.5981803897118705e-06, + "loss": 0.1141, + "step": 41220 + }, + { + "epoch": 0.7652892623355599, + "grad_norm": 0.34233418107032776, + "learning_rate": 2.5973960840262878e-06, + "loss": 0.2819, + "step": 41222 + }, + { + "epoch": 0.7653263924729785, + "grad_norm": 0.41467759013175964, + "learning_rate": 2.596611879067973e-06, + "loss": 0.1764, + "step": 41224 + }, + { + "epoch": 0.7653635226103972, + "grad_norm": 0.6019743084907532, + "learning_rate": 2.5958277748475924e-06, + "loss": 0.2623, + "step": 41226 + }, + { + "epoch": 0.7654006527478158, + "grad_norm": 0.3277321457862854, + "learning_rate": 2.595043771375817e-06, + "loss": 0.2983, + "step": 41228 + }, + { + "epoch": 0.7654377828852345, + "grad_norm": 0.3492259979248047, + "learning_rate": 2.5942598686633157e-06, + "loss": 0.2445, + "step": 41230 + }, + { + "epoch": 0.7654749130226531, + "grad_norm": 0.4110633432865143, + "learning_rate": 2.5934760667207548e-06, + "loss": 0.228, + "step": 41232 + }, + { + "epoch": 0.7655120431600717, + "grad_norm": 0.5250723361968994, + "learning_rate": 2.5926923655588e-06, + "loss": 0.3368, + "step": 41234 + }, + { + "epoch": 0.7655491732974904, + "grad_norm": 0.7087105512619019, + "learning_rate": 2.591908765188118e-06, + "loss": 0.2308, + "step": 41236 + }, + { + "epoch": 0.765586303434909, + "grad_norm": 0.3392013609409332, + "learning_rate": 2.5911252656193676e-06, + "loss": 0.3082, + "step": 41238 + }, + { + "epoch": 0.7656234335723276, + "grad_norm": 0.371787428855896, + "learning_rate": 2.5903418668632062e-06, + "loss": 0.2243, + "step": 41240 + }, + { + "epoch": 0.7656605637097463, + "grad_norm": 0.47207289934158325, + "learning_rate": 2.589558568930298e-06, + "loss": 0.2583, + "step": 41242 + }, + { + "epoch": 0.7656976938471649, + "grad_norm": 0.3173682689666748, + "learning_rate": 2.5887753718313012e-06, + "loss": 0.227, + "step": 41244 + }, + { + "epoch": 0.7657348239845836, + "grad_norm": 0.6015033721923828, + "learning_rate": 2.5879922755768727e-06, + "loss": 0.259, + "step": 41246 + }, + { + "epoch": 0.7657719541220022, + "grad_norm": 0.45075878500938416, + "learning_rate": 2.587209280177667e-06, + "loss": 0.1965, + "step": 41248 + }, + { + "epoch": 0.7658090842594208, + "grad_norm": 0.448700875043869, + "learning_rate": 2.586426385644343e-06, + "loss": 0.3554, + "step": 41250 + }, + { + "epoch": 0.7658462143968395, + "grad_norm": 0.45653247833251953, + "learning_rate": 2.5856435919875457e-06, + "loss": 0.2149, + "step": 41252 + }, + { + "epoch": 0.7658833445342581, + "grad_norm": 0.384328156709671, + "learning_rate": 2.5848608992179303e-06, + "loss": 0.317, + "step": 41254 + }, + { + "epoch": 0.7659204746716768, + "grad_norm": 0.21103191375732422, + "learning_rate": 2.584078307346148e-06, + "loss": 0.108, + "step": 41256 + }, + { + "epoch": 0.7659576048090954, + "grad_norm": 0.32534170150756836, + "learning_rate": 2.5832958163828504e-06, + "loss": 0.3078, + "step": 41258 + }, + { + "epoch": 0.765994734946514, + "grad_norm": 0.33861976861953735, + "learning_rate": 2.5825134263386764e-06, + "loss": 0.3208, + "step": 41260 + }, + { + "epoch": 0.7660318650839327, + "grad_norm": 0.39179614186286926, + "learning_rate": 2.5817311372242803e-06, + "loss": 0.2554, + "step": 41262 + }, + { + "epoch": 0.7660689952213513, + "grad_norm": 0.30000942945480347, + "learning_rate": 2.5809489490503003e-06, + "loss": 0.2221, + "step": 41264 + }, + { + "epoch": 0.76610612535877, + "grad_norm": 0.38789182901382446, + "learning_rate": 2.5801668618273833e-06, + "loss": 0.2187, + "step": 41266 + }, + { + "epoch": 0.7661432554961886, + "grad_norm": 0.49861547350883484, + "learning_rate": 2.57938487556617e-06, + "loss": 0.2791, + "step": 41268 + }, + { + "epoch": 0.7661803856336072, + "grad_norm": 0.28582337498664856, + "learning_rate": 2.5786029902773e-06, + "loss": 0.1793, + "step": 41270 + }, + { + "epoch": 0.7662175157710258, + "grad_norm": 0.38925376534461975, + "learning_rate": 2.5778212059714145e-06, + "loss": 0.2437, + "step": 41272 + }, + { + "epoch": 0.7662546459084445, + "grad_norm": 0.4097864627838135, + "learning_rate": 2.5770395226591505e-06, + "loss": 0.4566, + "step": 41274 + }, + { + "epoch": 0.7662917760458632, + "grad_norm": 0.3490731120109558, + "learning_rate": 2.576257940351147e-06, + "loss": 0.3496, + "step": 41276 + }, + { + "epoch": 0.7663289061832818, + "grad_norm": 0.47219184041023254, + "learning_rate": 2.575476459058034e-06, + "loss": 0.3711, + "step": 41278 + }, + { + "epoch": 0.7663660363207004, + "grad_norm": 0.1974325031042099, + "learning_rate": 2.574695078790449e-06, + "loss": 0.2986, + "step": 41280 + }, + { + "epoch": 0.766403166458119, + "grad_norm": 0.33266741037368774, + "learning_rate": 2.5739137995590204e-06, + "loss": 0.2612, + "step": 41282 + }, + { + "epoch": 0.7664402965955377, + "grad_norm": 0.3887407183647156, + "learning_rate": 2.5731326213743814e-06, + "loss": 0.2571, + "step": 41284 + }, + { + "epoch": 0.7664774267329564, + "grad_norm": 0.3010798692703247, + "learning_rate": 2.5723515442471614e-06, + "loss": 0.3578, + "step": 41286 + }, + { + "epoch": 0.766514556870375, + "grad_norm": 0.37065666913986206, + "learning_rate": 2.571570568187991e-06, + "loss": 0.3085, + "step": 41288 + }, + { + "epoch": 0.7665516870077936, + "grad_norm": 0.9915363192558289, + "learning_rate": 2.5707896932074915e-06, + "loss": 0.3521, + "step": 41290 + }, + { + "epoch": 0.7665888171452122, + "grad_norm": 0.31265366077423096, + "learning_rate": 2.5700089193162912e-06, + "loss": 0.2885, + "step": 41292 + }, + { + "epoch": 0.7666259472826309, + "grad_norm": 0.48649343848228455, + "learning_rate": 2.5692282465250152e-06, + "loss": 0.3434, + "step": 41294 + }, + { + "epoch": 0.7666630774200496, + "grad_norm": 0.31086084246635437, + "learning_rate": 2.5684476748442845e-06, + "loss": 0.1331, + "step": 41296 + }, + { + "epoch": 0.7667002075574681, + "grad_norm": 0.40793880820274353, + "learning_rate": 2.567667204284723e-06, + "loss": 0.378, + "step": 41298 + }, + { + "epoch": 0.7667373376948868, + "grad_norm": 0.28296178579330444, + "learning_rate": 2.5668868348569464e-06, + "loss": 0.2055, + "step": 41300 + }, + { + "epoch": 0.7667744678323054, + "grad_norm": 0.3801786005496979, + "learning_rate": 2.566106566571578e-06, + "loss": 0.1626, + "step": 41302 + }, + { + "epoch": 0.7668115979697241, + "grad_norm": 0.591468870639801, + "learning_rate": 2.5653263994392285e-06, + "loss": 0.234, + "step": 41304 + }, + { + "epoch": 0.7668487281071428, + "grad_norm": 0.3889619708061218, + "learning_rate": 2.564546333470518e-06, + "loss": 0.2971, + "step": 41306 + }, + { + "epoch": 0.7668858582445613, + "grad_norm": 0.4041593372821808, + "learning_rate": 2.5637663686760592e-06, + "loss": 0.182, + "step": 41308 + }, + { + "epoch": 0.76692298838198, + "grad_norm": 0.4182489514350891, + "learning_rate": 2.562986505066467e-06, + "loss": 0.3509, + "step": 41310 + }, + { + "epoch": 0.7669601185193986, + "grad_norm": 0.4631083905696869, + "learning_rate": 2.562206742652352e-06, + "loss": 0.3249, + "step": 41312 + }, + { + "epoch": 0.7669972486568173, + "grad_norm": 0.5836560726165771, + "learning_rate": 2.5614270814443264e-06, + "loss": 0.2897, + "step": 41314 + }, + { + "epoch": 0.767034378794236, + "grad_norm": 0.3607669472694397, + "learning_rate": 2.5606475214529958e-06, + "loss": 0.2521, + "step": 41316 + }, + { + "epoch": 0.7670715089316545, + "grad_norm": 0.15277954936027527, + "learning_rate": 2.5598680626889684e-06, + "loss": 0.1379, + "step": 41318 + }, + { + "epoch": 0.7671086390690732, + "grad_norm": 0.5786731839179993, + "learning_rate": 2.5590887051628533e-06, + "loss": 0.3198, + "step": 41320 + }, + { + "epoch": 0.7671457692064918, + "grad_norm": 0.34919285774230957, + "learning_rate": 2.55830944888525e-06, + "loss": 0.3126, + "step": 41322 + }, + { + "epoch": 0.7671828993439105, + "grad_norm": 0.3165053725242615, + "learning_rate": 2.5575302938667647e-06, + "loss": 0.2398, + "step": 41324 + }, + { + "epoch": 0.7672200294813291, + "grad_norm": 0.49193501472473145, + "learning_rate": 2.5567512401180027e-06, + "loss": 0.378, + "step": 41326 + }, + { + "epoch": 0.7672571596187477, + "grad_norm": 0.26617106795310974, + "learning_rate": 2.5559722876495575e-06, + "loss": 0.1259, + "step": 41328 + }, + { + "epoch": 0.7672942897561664, + "grad_norm": 0.2973635792732239, + "learning_rate": 2.555193436472032e-06, + "loss": 0.2417, + "step": 41330 + }, + { + "epoch": 0.767331419893585, + "grad_norm": 0.363119900226593, + "learning_rate": 2.554414686596025e-06, + "loss": 0.2647, + "step": 41332 + }, + { + "epoch": 0.7673685500310037, + "grad_norm": 0.3887489140033722, + "learning_rate": 2.5536360380321314e-06, + "loss": 0.4158, + "step": 41334 + }, + { + "epoch": 0.7674056801684223, + "grad_norm": 0.5190216302871704, + "learning_rate": 2.5528574907909466e-06, + "loss": 0.2936, + "step": 41336 + }, + { + "epoch": 0.7674428103058409, + "grad_norm": 0.31827104091644287, + "learning_rate": 2.5520790448830677e-06, + "loss": 0.1956, + "step": 41338 + }, + { + "epoch": 0.7674799404432596, + "grad_norm": 0.5050176978111267, + "learning_rate": 2.5513007003190816e-06, + "loss": 0.437, + "step": 41340 + }, + { + "epoch": 0.7675170705806782, + "grad_norm": 0.3137609362602234, + "learning_rate": 2.550522457109581e-06, + "loss": 0.5051, + "step": 41342 + }, + { + "epoch": 0.7675542007180969, + "grad_norm": 0.48798704147338867, + "learning_rate": 2.5497443152651593e-06, + "loss": 0.3145, + "step": 41344 + }, + { + "epoch": 0.7675913308555155, + "grad_norm": 0.3828844130039215, + "learning_rate": 2.5489662747963985e-06, + "loss": 0.3825, + "step": 41346 + }, + { + "epoch": 0.7676284609929341, + "grad_norm": 0.4482521712779999, + "learning_rate": 2.548188335713887e-06, + "loss": 0.3455, + "step": 41348 + }, + { + "epoch": 0.7676655911303528, + "grad_norm": 0.46659302711486816, + "learning_rate": 2.5474104980282156e-06, + "loss": 0.2417, + "step": 41350 + }, + { + "epoch": 0.7677027212677714, + "grad_norm": 0.48956945538520813, + "learning_rate": 2.546632761749961e-06, + "loss": 0.3291, + "step": 41352 + }, + { + "epoch": 0.7677398514051901, + "grad_norm": 0.4789769649505615, + "learning_rate": 2.545855126889709e-06, + "loss": 0.2285, + "step": 41354 + }, + { + "epoch": 0.7677769815426086, + "grad_norm": 0.340114027261734, + "learning_rate": 2.545077593458042e-06, + "loss": 0.1609, + "step": 41356 + }, + { + "epoch": 0.7678141116800273, + "grad_norm": 0.3755229413509369, + "learning_rate": 2.5443001614655373e-06, + "loss": 0.5751, + "step": 41358 + }, + { + "epoch": 0.767851241817446, + "grad_norm": 0.51650071144104, + "learning_rate": 2.5435228309227754e-06, + "loss": 0.2585, + "step": 41360 + }, + { + "epoch": 0.7678883719548646, + "grad_norm": 0.4635002613067627, + "learning_rate": 2.5427456018403363e-06, + "loss": 0.1262, + "step": 41362 + }, + { + "epoch": 0.7679255020922833, + "grad_norm": 0.4288936257362366, + "learning_rate": 2.5419684742287897e-06, + "loss": 0.2473, + "step": 41364 + }, + { + "epoch": 0.7679626322297018, + "grad_norm": 0.27689090371131897, + "learning_rate": 2.5411914480987156e-06, + "loss": 0.1617, + "step": 41366 + }, + { + "epoch": 0.7679997623671205, + "grad_norm": 0.6193722486495972, + "learning_rate": 2.5404145234606814e-06, + "loss": 0.2986, + "step": 41368 + }, + { + "epoch": 0.7680368925045392, + "grad_norm": 0.23524649441242218, + "learning_rate": 2.5396377003252617e-06, + "loss": 0.3073, + "step": 41370 + }, + { + "epoch": 0.7680740226419578, + "grad_norm": 0.39432278275489807, + "learning_rate": 2.538860978703025e-06, + "loss": 0.3311, + "step": 41372 + }, + { + "epoch": 0.7681111527793765, + "grad_norm": 0.31275299191474915, + "learning_rate": 2.538084358604543e-06, + "loss": 0.331, + "step": 41374 + }, + { + "epoch": 0.768148282916795, + "grad_norm": 0.3133927881717682, + "learning_rate": 2.537307840040385e-06, + "loss": 0.1934, + "step": 41376 + }, + { + "epoch": 0.7681854130542137, + "grad_norm": 0.41305121779441833, + "learning_rate": 2.53653142302111e-06, + "loss": 0.1964, + "step": 41378 + }, + { + "epoch": 0.7682225431916323, + "grad_norm": 0.29816102981567383, + "learning_rate": 2.5357551075572874e-06, + "loss": 0.3428, + "step": 41380 + }, + { + "epoch": 0.768259673329051, + "grad_norm": 0.6308209300041199, + "learning_rate": 2.53497889365948e-06, + "loss": 0.1286, + "step": 41382 + }, + { + "epoch": 0.7682968034664697, + "grad_norm": 0.40792134404182434, + "learning_rate": 2.534202781338252e-06, + "loss": 0.2186, + "step": 41384 + }, + { + "epoch": 0.7683339336038882, + "grad_norm": 0.43371978402137756, + "learning_rate": 2.533426770604158e-06, + "loss": 0.4609, + "step": 41386 + }, + { + "epoch": 0.7683710637413069, + "grad_norm": 0.36207014322280884, + "learning_rate": 2.532650861467765e-06, + "loss": 0.1504, + "step": 41388 + }, + { + "epoch": 0.7684081938787255, + "grad_norm": 0.42898428440093994, + "learning_rate": 2.531875053939622e-06, + "loss": 0.3226, + "step": 41390 + }, + { + "epoch": 0.7684453240161442, + "grad_norm": 0.3628973960876465, + "learning_rate": 2.5310993480302916e-06, + "loss": 0.2205, + "step": 41392 + }, + { + "epoch": 0.7684824541535629, + "grad_norm": 0.5887414813041687, + "learning_rate": 2.530323743750326e-06, + "loss": 0.143, + "step": 41394 + }, + { + "epoch": 0.7685195842909814, + "grad_norm": 0.2971482574939728, + "learning_rate": 2.52954824111028e-06, + "loss": 0.2751, + "step": 41396 + }, + { + "epoch": 0.7685567144284001, + "grad_norm": 0.28399187326431274, + "learning_rate": 2.5287728401207056e-06, + "loss": 0.4495, + "step": 41398 + }, + { + "epoch": 0.7685938445658187, + "grad_norm": 0.459029883146286, + "learning_rate": 2.5279975407921543e-06, + "loss": 0.4372, + "step": 41400 + }, + { + "epoch": 0.7686309747032374, + "grad_norm": 0.5715258717536926, + "learning_rate": 2.5272223431351784e-06, + "loss": 0.3319, + "step": 41402 + }, + { + "epoch": 0.7686681048406561, + "grad_norm": 0.4502350389957428, + "learning_rate": 2.5264472471603197e-06, + "loss": 0.3261, + "step": 41404 + }, + { + "epoch": 0.7687052349780746, + "grad_norm": 0.5230157971382141, + "learning_rate": 2.525672252878131e-06, + "loss": 0.2604, + "step": 41406 + }, + { + "epoch": 0.7687423651154933, + "grad_norm": 0.23249544203281403, + "learning_rate": 2.524897360299152e-06, + "loss": 0.2151, + "step": 41408 + }, + { + "epoch": 0.7687794952529119, + "grad_norm": 0.3170226812362671, + "learning_rate": 2.5241225694339288e-06, + "loss": 0.3592, + "step": 41410 + }, + { + "epoch": 0.7688166253903306, + "grad_norm": 0.42040392756462097, + "learning_rate": 2.5233478802930057e-06, + "loss": 0.1604, + "step": 41412 + }, + { + "epoch": 0.7688537555277493, + "grad_norm": 0.25480395555496216, + "learning_rate": 2.522573292886925e-06, + "loss": 0.3222, + "step": 41414 + }, + { + "epoch": 0.7688908856651678, + "grad_norm": 0.36990073323249817, + "learning_rate": 2.521798807226221e-06, + "loss": 0.4038, + "step": 41416 + }, + { + "epoch": 0.7689280158025865, + "grad_norm": 0.35002514719963074, + "learning_rate": 2.5210244233214353e-06, + "loss": 0.2675, + "step": 41418 + }, + { + "epoch": 0.7689651459400051, + "grad_norm": 0.3868134915828705, + "learning_rate": 2.5202501411831058e-06, + "loss": 0.3042, + "step": 41420 + }, + { + "epoch": 0.7690022760774238, + "grad_norm": 0.6874318718910217, + "learning_rate": 2.519475960821768e-06, + "loss": 0.3285, + "step": 41422 + }, + { + "epoch": 0.7690394062148423, + "grad_norm": 0.3507044315338135, + "learning_rate": 2.518701882247955e-06, + "loss": 0.2658, + "step": 41424 + }, + { + "epoch": 0.769076536352261, + "grad_norm": 0.3999587595462799, + "learning_rate": 2.5179279054722026e-06, + "loss": 0.3024, + "step": 41426 + }, + { + "epoch": 0.7691136664896797, + "grad_norm": 0.42895013093948364, + "learning_rate": 2.51715403050504e-06, + "loss": 0.2357, + "step": 41428 + }, + { + "epoch": 0.7691507966270983, + "grad_norm": 0.3126152753829956, + "learning_rate": 2.516380257356995e-06, + "loss": 0.4376, + "step": 41430 + }, + { + "epoch": 0.769187926764517, + "grad_norm": 0.36698758602142334, + "learning_rate": 2.5156065860385994e-06, + "loss": 0.3081, + "step": 41432 + }, + { + "epoch": 0.7692250569019355, + "grad_norm": 0.6157290935516357, + "learning_rate": 2.5148330165603783e-06, + "loss": 0.3686, + "step": 41434 + }, + { + "epoch": 0.7692621870393542, + "grad_norm": 0.3823455274105072, + "learning_rate": 2.5140595489328603e-06, + "loss": 0.0638, + "step": 41436 + }, + { + "epoch": 0.7692993171767729, + "grad_norm": 0.44972336292266846, + "learning_rate": 2.513286183166568e-06, + "loss": 0.2873, + "step": 41438 + }, + { + "epoch": 0.7693364473141915, + "grad_norm": 0.3725402057170868, + "learning_rate": 2.512512919272029e-06, + "loss": 0.4105, + "step": 41440 + }, + { + "epoch": 0.7693735774516102, + "grad_norm": 0.388278067111969, + "learning_rate": 2.511739757259758e-06, + "loss": 0.1952, + "step": 41442 + }, + { + "epoch": 0.7694107075890287, + "grad_norm": 0.5143905282020569, + "learning_rate": 2.5109666971402792e-06, + "loss": 0.4773, + "step": 41444 + }, + { + "epoch": 0.7694478377264474, + "grad_norm": 0.4239763617515564, + "learning_rate": 2.5101937389241117e-06, + "loss": 0.1891, + "step": 41446 + }, + { + "epoch": 0.7694849678638661, + "grad_norm": 0.19605471193790436, + "learning_rate": 2.5094208826217758e-06, + "loss": 0.1952, + "step": 41448 + }, + { + "epoch": 0.7695220980012847, + "grad_norm": 0.49823063611984253, + "learning_rate": 2.5086481282437813e-06, + "loss": 0.2845, + "step": 41450 + }, + { + "epoch": 0.7695592281387034, + "grad_norm": 0.4230511784553528, + "learning_rate": 2.5078754758006505e-06, + "loss": 0.4119, + "step": 41452 + }, + { + "epoch": 0.7695963582761219, + "grad_norm": 0.34655478596687317, + "learning_rate": 2.507102925302889e-06, + "loss": 0.3519, + "step": 41454 + }, + { + "epoch": 0.7696334884135406, + "grad_norm": 0.3689468801021576, + "learning_rate": 2.506330476761013e-06, + "loss": 0.2239, + "step": 41456 + }, + { + "epoch": 0.7696706185509593, + "grad_norm": 0.23845991492271423, + "learning_rate": 2.505558130185534e-06, + "loss": 0.1857, + "step": 41458 + }, + { + "epoch": 0.7697077486883779, + "grad_norm": 0.26497703790664673, + "learning_rate": 2.50478588558696e-06, + "loss": 0.4189, + "step": 41460 + }, + { + "epoch": 0.7697448788257966, + "grad_norm": 0.22562922537326813, + "learning_rate": 2.5040137429757982e-06, + "loss": 0.1965, + "step": 41462 + }, + { + "epoch": 0.7697820089632151, + "grad_norm": 0.4532822072505951, + "learning_rate": 2.503241702362561e-06, + "loss": 0.3235, + "step": 41464 + }, + { + "epoch": 0.7698191391006338, + "grad_norm": 0.3611896336078644, + "learning_rate": 2.5024697637577445e-06, + "loss": 0.1812, + "step": 41466 + }, + { + "epoch": 0.7698562692380525, + "grad_norm": 0.41187095642089844, + "learning_rate": 2.501697927171858e-06, + "loss": 0.374, + "step": 41468 + }, + { + "epoch": 0.7698933993754711, + "grad_norm": 0.38752293586730957, + "learning_rate": 2.500926192615405e-06, + "loss": 0.2303, + "step": 41470 + }, + { + "epoch": 0.7699305295128898, + "grad_norm": 0.4563531279563904, + "learning_rate": 2.5001545600988806e-06, + "loss": 0.5203, + "step": 41472 + }, + { + "epoch": 0.7699676596503083, + "grad_norm": 0.30065885186195374, + "learning_rate": 2.499383029632788e-06, + "loss": 0.2839, + "step": 41474 + }, + { + "epoch": 0.770004789787727, + "grad_norm": 0.4574066698551178, + "learning_rate": 2.4986116012276263e-06, + "loss": 0.1869, + "step": 41476 + }, + { + "epoch": 0.7700419199251456, + "grad_norm": 0.5186640620231628, + "learning_rate": 2.4978402748938944e-06, + "loss": 0.464, + "step": 41478 + }, + { + "epoch": 0.7700790500625643, + "grad_norm": 0.42823919653892517, + "learning_rate": 2.4970690506420814e-06, + "loss": 0.2618, + "step": 41480 + }, + { + "epoch": 0.770116180199983, + "grad_norm": 0.8293660283088684, + "learning_rate": 2.496297928482685e-06, + "loss": 0.3068, + "step": 41482 + }, + { + "epoch": 0.7701533103374015, + "grad_norm": 0.5593637228012085, + "learning_rate": 2.495526908426198e-06, + "loss": 0.3727, + "step": 41484 + }, + { + "epoch": 0.7701904404748202, + "grad_norm": 0.6823593378067017, + "learning_rate": 2.494755990483111e-06, + "loss": 0.2315, + "step": 41486 + }, + { + "epoch": 0.7702275706122388, + "grad_norm": 0.33526623249053955, + "learning_rate": 2.4939851746639133e-06, + "loss": 0.2859, + "step": 41488 + }, + { + "epoch": 0.7702647007496575, + "grad_norm": 0.3988734781742096, + "learning_rate": 2.4932144609790977e-06, + "loss": 0.4348, + "step": 41490 + }, + { + "epoch": 0.7703018308870762, + "grad_norm": 0.4875473082065582, + "learning_rate": 2.492443849439148e-06, + "loss": 0.3288, + "step": 41492 + }, + { + "epoch": 0.7703389610244947, + "grad_norm": 0.33681178092956543, + "learning_rate": 2.491673340054547e-06, + "loss": 0.1195, + "step": 41494 + }, + { + "epoch": 0.7703760911619134, + "grad_norm": 0.5214869976043701, + "learning_rate": 2.4909029328357816e-06, + "loss": 0.2344, + "step": 41496 + }, + { + "epoch": 0.770413221299332, + "grad_norm": 0.4211408197879791, + "learning_rate": 2.490132627793335e-06, + "loss": 0.2452, + "step": 41498 + }, + { + "epoch": 0.7704503514367507, + "grad_norm": 0.4766230285167694, + "learning_rate": 2.489362424937689e-06, + "loss": 0.2905, + "step": 41500 + }, + { + "epoch": 0.7704874815741694, + "grad_norm": 0.37303000688552856, + "learning_rate": 2.488592324279325e-06, + "loss": 0.2822, + "step": 41502 + }, + { + "epoch": 0.7705246117115879, + "grad_norm": 0.2579355537891388, + "learning_rate": 2.4878223258287194e-06, + "loss": 0.1827, + "step": 41504 + }, + { + "epoch": 0.7705617418490066, + "grad_norm": 0.24069981276988983, + "learning_rate": 2.4870524295963485e-06, + "loss": 0.1159, + "step": 41506 + }, + { + "epoch": 0.7705988719864252, + "grad_norm": 0.3725051283836365, + "learning_rate": 2.486282635592692e-06, + "loss": 0.4259, + "step": 41508 + }, + { + "epoch": 0.7706360021238439, + "grad_norm": 0.447422593832016, + "learning_rate": 2.485512943828221e-06, + "loss": 0.1063, + "step": 41510 + }, + { + "epoch": 0.7706731322612626, + "grad_norm": 0.2678152322769165, + "learning_rate": 2.4847433543134137e-06, + "loss": 0.2049, + "step": 41512 + }, + { + "epoch": 0.7707102623986811, + "grad_norm": 0.29245525598526, + "learning_rate": 2.483973867058739e-06, + "loss": 0.2605, + "step": 41514 + }, + { + "epoch": 0.7707473925360998, + "grad_norm": 0.3939357399940491, + "learning_rate": 2.4832044820746627e-06, + "loss": 0.3611, + "step": 41516 + }, + { + "epoch": 0.7707845226735184, + "grad_norm": 0.23557208478450775, + "learning_rate": 2.482435199371659e-06, + "loss": 0.2193, + "step": 41518 + }, + { + "epoch": 0.7708216528109371, + "grad_norm": 0.44960853457450867, + "learning_rate": 2.481666018960195e-06, + "loss": 0.1783, + "step": 41520 + }, + { + "epoch": 0.7708587829483556, + "grad_norm": 0.4091038703918457, + "learning_rate": 2.4808969408507355e-06, + "loss": 0.3157, + "step": 41522 + }, + { + "epoch": 0.7708959130857743, + "grad_norm": 0.4602786600589752, + "learning_rate": 2.4801279650537467e-06, + "loss": 0.2898, + "step": 41524 + }, + { + "epoch": 0.770933043223193, + "grad_norm": 0.3340320885181427, + "learning_rate": 2.479359091579692e-06, + "loss": 0.3423, + "step": 41526 + }, + { + "epoch": 0.7709701733606116, + "grad_norm": 0.3317561745643616, + "learning_rate": 2.478590320439035e-06, + "loss": 0.2855, + "step": 41528 + }, + { + "epoch": 0.7710073034980303, + "grad_norm": 0.3557310998439789, + "learning_rate": 2.4778216516422328e-06, + "loss": 0.2481, + "step": 41530 + }, + { + "epoch": 0.7710444336354488, + "grad_norm": 0.2813953161239624, + "learning_rate": 2.4770530851997455e-06, + "loss": 0.1293, + "step": 41532 + }, + { + "epoch": 0.7710815637728675, + "grad_norm": 0.322879433631897, + "learning_rate": 2.4762846211220358e-06, + "loss": 0.2917, + "step": 41534 + }, + { + "epoch": 0.7711186939102862, + "grad_norm": 0.3595936894416809, + "learning_rate": 2.4755162594195524e-06, + "loss": 0.2568, + "step": 41536 + }, + { + "epoch": 0.7711558240477048, + "grad_norm": 0.6751272082328796, + "learning_rate": 2.4747480001027547e-06, + "loss": 0.1539, + "step": 41538 + }, + { + "epoch": 0.7711929541851235, + "grad_norm": 0.40497925877571106, + "learning_rate": 2.4739798431820983e-06, + "loss": 0.1931, + "step": 41540 + }, + { + "epoch": 0.771230084322542, + "grad_norm": 0.3457527160644531, + "learning_rate": 2.4732117886680306e-06, + "loss": 0.336, + "step": 41542 + }, + { + "epoch": 0.7712672144599607, + "grad_norm": 0.3313685357570648, + "learning_rate": 2.4724438365710057e-06, + "loss": 0.3642, + "step": 41544 + }, + { + "epoch": 0.7713043445973794, + "grad_norm": 0.290331095457077, + "learning_rate": 2.471675986901473e-06, + "loss": 0.049, + "step": 41546 + }, + { + "epoch": 0.771341474734798, + "grad_norm": 0.28030937910079956, + "learning_rate": 2.470908239669878e-06, + "loss": 0.3186, + "step": 41548 + }, + { + "epoch": 0.7713786048722167, + "grad_norm": 0.6392655968666077, + "learning_rate": 2.470140594886672e-06, + "loss": 0.2418, + "step": 41550 + }, + { + "epoch": 0.7714157350096352, + "grad_norm": 0.36246079206466675, + "learning_rate": 2.4693730525622995e-06, + "loss": 0.1193, + "step": 41552 + }, + { + "epoch": 0.7714528651470539, + "grad_norm": 0.2808990776538849, + "learning_rate": 2.4686056127072e-06, + "loss": 0.3331, + "step": 41554 + }, + { + "epoch": 0.7714899952844726, + "grad_norm": 0.3309437334537506, + "learning_rate": 2.467838275331822e-06, + "loss": 0.1739, + "step": 41556 + }, + { + "epoch": 0.7715271254218912, + "grad_norm": 0.6773702502250671, + "learning_rate": 2.4670710404466016e-06, + "loss": 0.1455, + "step": 41558 + }, + { + "epoch": 0.7715642555593099, + "grad_norm": 0.4083969295024872, + "learning_rate": 2.46630390806198e-06, + "loss": 0.189, + "step": 41560 + }, + { + "epoch": 0.7716013856967284, + "grad_norm": 0.4024791121482849, + "learning_rate": 2.465536878188396e-06, + "loss": 0.4499, + "step": 41562 + }, + { + "epoch": 0.7716385158341471, + "grad_norm": 0.5677170157432556, + "learning_rate": 2.4647699508362864e-06, + "loss": 0.1573, + "step": 41564 + }, + { + "epoch": 0.7716756459715658, + "grad_norm": 0.3135163486003876, + "learning_rate": 2.46400312601609e-06, + "loss": 0.0972, + "step": 41566 + }, + { + "epoch": 0.7717127761089844, + "grad_norm": 0.5223904848098755, + "learning_rate": 2.4632364037382362e-06, + "loss": 0.2923, + "step": 41568 + }, + { + "epoch": 0.771749906246403, + "grad_norm": 0.3882283568382263, + "learning_rate": 2.4624697840131593e-06, + "loss": 0.1685, + "step": 41570 + }, + { + "epoch": 0.7717870363838216, + "grad_norm": 0.7938657402992249, + "learning_rate": 2.461703266851291e-06, + "loss": 0.3399, + "step": 41572 + }, + { + "epoch": 0.7718241665212403, + "grad_norm": 0.5284688472747803, + "learning_rate": 2.460936852263065e-06, + "loss": 0.2061, + "step": 41574 + }, + { + "epoch": 0.7718612966586589, + "grad_norm": 0.4011911451816559, + "learning_rate": 2.460170540258903e-06, + "loss": 0.4157, + "step": 41576 + }, + { + "epoch": 0.7718984267960776, + "grad_norm": 0.5499333143234253, + "learning_rate": 2.4594043308492377e-06, + "loss": 0.432, + "step": 41578 + }, + { + "epoch": 0.7719355569334962, + "grad_norm": 0.3336786925792694, + "learning_rate": 2.4586382240444906e-06, + "loss": 0.1987, + "step": 41580 + }, + { + "epoch": 0.7719726870709148, + "grad_norm": 0.4181004464626312, + "learning_rate": 2.4578722198550873e-06, + "loss": 0.3984, + "step": 41582 + }, + { + "epoch": 0.7720098172083335, + "grad_norm": 0.6200852990150452, + "learning_rate": 2.4571063182914534e-06, + "loss": 0.322, + "step": 41584 + }, + { + "epoch": 0.7720469473457521, + "grad_norm": 0.4113323986530304, + "learning_rate": 2.4563405193640076e-06, + "loss": 0.3082, + "step": 41586 + }, + { + "epoch": 0.7720840774831708, + "grad_norm": 0.42549222707748413, + "learning_rate": 2.4555748230831724e-06, + "loss": 0.3536, + "step": 41588 + }, + { + "epoch": 0.7721212076205894, + "grad_norm": 0.30046191811561584, + "learning_rate": 2.4548092294593685e-06, + "loss": 0.4161, + "step": 41590 + }, + { + "epoch": 0.772158337758008, + "grad_norm": 0.30612480640411377, + "learning_rate": 2.454043738503007e-06, + "loss": 0.3458, + "step": 41592 + }, + { + "epoch": 0.7721954678954267, + "grad_norm": 0.4304139018058777, + "learning_rate": 2.4532783502245085e-06, + "loss": 0.2181, + "step": 41594 + }, + { + "epoch": 0.7722325980328453, + "grad_norm": 0.46697211265563965, + "learning_rate": 2.4525130646342856e-06, + "loss": 0.3102, + "step": 41596 + }, + { + "epoch": 0.772269728170264, + "grad_norm": 0.25491881370544434, + "learning_rate": 2.4517478817427553e-06, + "loss": 0.3937, + "step": 41598 + }, + { + "epoch": 0.7723068583076826, + "grad_norm": 0.42776280641555786, + "learning_rate": 2.4509828015603242e-06, + "loss": 0.2448, + "step": 41600 + }, + { + "epoch": 0.7723439884451012, + "grad_norm": 0.2742282450199127, + "learning_rate": 2.450217824097405e-06, + "loss": 0.4178, + "step": 41602 + }, + { + "epoch": 0.7723811185825199, + "grad_norm": 0.5798923969268799, + "learning_rate": 2.4494529493644104e-06, + "loss": 0.3012, + "step": 41604 + }, + { + "epoch": 0.7724182487199385, + "grad_norm": 0.6069666147232056, + "learning_rate": 2.4486881773717417e-06, + "loss": 0.4084, + "step": 41606 + }, + { + "epoch": 0.7724553788573572, + "grad_norm": 0.666145384311676, + "learning_rate": 2.4479235081298083e-06, + "loss": 0.1521, + "step": 41608 + }, + { + "epoch": 0.7724925089947758, + "grad_norm": 0.4640580415725708, + "learning_rate": 2.4471589416490137e-06, + "loss": 0.3958, + "step": 41610 + }, + { + "epoch": 0.7725296391321944, + "grad_norm": 0.3107159733772278, + "learning_rate": 2.446394477939764e-06, + "loss": 0.1295, + "step": 41612 + }, + { + "epoch": 0.7725667692696131, + "grad_norm": 0.3692168891429901, + "learning_rate": 2.4456301170124584e-06, + "loss": 0.2517, + "step": 41614 + }, + { + "epoch": 0.7726038994070317, + "grad_norm": 0.44470641016960144, + "learning_rate": 2.444865858877503e-06, + "loss": 0.3008, + "step": 41616 + }, + { + "epoch": 0.7726410295444504, + "grad_norm": 0.477758526802063, + "learning_rate": 2.4441017035452897e-06, + "loss": 0.3206, + "step": 41618 + }, + { + "epoch": 0.772678159681869, + "grad_norm": 0.6561027765274048, + "learning_rate": 2.443337651026223e-06, + "loss": 0.2473, + "step": 41620 + }, + { + "epoch": 0.7727152898192876, + "grad_norm": 0.31912335753440857, + "learning_rate": 2.4425737013306926e-06, + "loss": 0.1001, + "step": 41622 + }, + { + "epoch": 0.7727524199567063, + "grad_norm": 0.5909704566001892, + "learning_rate": 2.441809854469097e-06, + "loss": 0.217, + "step": 41624 + }, + { + "epoch": 0.7727895500941249, + "grad_norm": 0.3824181854724884, + "learning_rate": 2.44104611045183e-06, + "loss": 0.3191, + "step": 41626 + }, + { + "epoch": 0.7728266802315436, + "grad_norm": 0.3913993537425995, + "learning_rate": 2.4402824692892867e-06, + "loss": 0.3017, + "step": 41628 + }, + { + "epoch": 0.7728638103689621, + "grad_norm": 0.4373913109302521, + "learning_rate": 2.4395189309918522e-06, + "loss": 0.3209, + "step": 41630 + }, + { + "epoch": 0.7729009405063808, + "grad_norm": 0.3915177285671234, + "learning_rate": 2.438755495569918e-06, + "loss": 0.2948, + "step": 41632 + }, + { + "epoch": 0.7729380706437995, + "grad_norm": 0.4179781973361969, + "learning_rate": 2.4379921630338744e-06, + "loss": 0.3249, + "step": 41634 + }, + { + "epoch": 0.7729752007812181, + "grad_norm": 0.5380401015281677, + "learning_rate": 2.4372289333941058e-06, + "loss": 0.3862, + "step": 41636 + }, + { + "epoch": 0.7730123309186367, + "grad_norm": 0.44070830941200256, + "learning_rate": 2.436465806661e-06, + "loss": 0.5223, + "step": 41638 + }, + { + "epoch": 0.7730494610560553, + "grad_norm": 0.40604567527770996, + "learning_rate": 2.4357027828449386e-06, + "loss": 0.3453, + "step": 41640 + }, + { + "epoch": 0.773086591193474, + "grad_norm": 0.28185978531837463, + "learning_rate": 2.434939861956306e-06, + "loss": 0.2626, + "step": 41642 + }, + { + "epoch": 0.7731237213308927, + "grad_norm": 0.4027245342731476, + "learning_rate": 2.434177044005479e-06, + "loss": 0.3112, + "step": 41644 + }, + { + "epoch": 0.7731608514683113, + "grad_norm": 0.4761393368244171, + "learning_rate": 2.433414329002841e-06, + "loss": 0.4452, + "step": 41646 + }, + { + "epoch": 0.77319798160573, + "grad_norm": 0.4719437062740326, + "learning_rate": 2.4326517169587695e-06, + "loss": 0.1538, + "step": 41648 + }, + { + "epoch": 0.7732351117431485, + "grad_norm": 0.3605237603187561, + "learning_rate": 2.43188920788364e-06, + "loss": 0.4141, + "step": 41650 + }, + { + "epoch": 0.7732722418805672, + "grad_norm": 0.3867614269256592, + "learning_rate": 2.431126801787831e-06, + "loss": 0.1989, + "step": 41652 + }, + { + "epoch": 0.7733093720179859, + "grad_norm": 0.40725454688072205, + "learning_rate": 2.4303644986817165e-06, + "loss": 0.1945, + "step": 41654 + }, + { + "epoch": 0.7733465021554045, + "grad_norm": 0.6267882585525513, + "learning_rate": 2.4296022985756653e-06, + "loss": 0.3202, + "step": 41656 + }, + { + "epoch": 0.7733836322928231, + "grad_norm": 0.46100282669067383, + "learning_rate": 2.4288402014800495e-06, + "loss": 0.2566, + "step": 41658 + }, + { + "epoch": 0.7734207624302417, + "grad_norm": 0.45701903104782104, + "learning_rate": 2.4280782074052446e-06, + "loss": 0.2879, + "step": 41660 + }, + { + "epoch": 0.7734578925676604, + "grad_norm": 0.32045847177505493, + "learning_rate": 2.427316316361611e-06, + "loss": 0.212, + "step": 41662 + }, + { + "epoch": 0.7734950227050791, + "grad_norm": 0.21286286413669586, + "learning_rate": 2.4265545283595206e-06, + "loss": 0.1692, + "step": 41664 + }, + { + "epoch": 0.7735321528424977, + "grad_norm": 0.577670693397522, + "learning_rate": 2.4257928434093402e-06, + "loss": 0.2247, + "step": 41666 + }, + { + "epoch": 0.7735692829799163, + "grad_norm": 0.241823211312294, + "learning_rate": 2.4250312615214287e-06, + "loss": 0.2234, + "step": 41668 + }, + { + "epoch": 0.7736064131173349, + "grad_norm": 0.5653408169746399, + "learning_rate": 2.4242697827061524e-06, + "loss": 0.312, + "step": 41670 + }, + { + "epoch": 0.7736435432547536, + "grad_norm": 0.4778139293193817, + "learning_rate": 2.4235084069738723e-06, + "loss": 0.5307, + "step": 41672 + }, + { + "epoch": 0.7736806733921722, + "grad_norm": 0.26522332429885864, + "learning_rate": 2.4227471343349484e-06, + "loss": 0.1984, + "step": 41674 + }, + { + "epoch": 0.7737178035295909, + "grad_norm": 0.37601980566978455, + "learning_rate": 2.4219859647997403e-06, + "loss": 0.2903, + "step": 41676 + }, + { + "epoch": 0.7737549336670095, + "grad_norm": 0.29758980870246887, + "learning_rate": 2.4212248983786056e-06, + "loss": 0.1257, + "step": 41678 + }, + { + "epoch": 0.7737920638044281, + "grad_norm": 0.44700977206230164, + "learning_rate": 2.420463935081898e-06, + "loss": 0.1678, + "step": 41680 + }, + { + "epoch": 0.7738291939418468, + "grad_norm": 0.3596184253692627, + "learning_rate": 2.4197030749199746e-06, + "loss": 0.2693, + "step": 41682 + }, + { + "epoch": 0.7738663240792654, + "grad_norm": 0.3034147620201111, + "learning_rate": 2.4189423179031844e-06, + "loss": 0.2562, + "step": 41684 + }, + { + "epoch": 0.773903454216684, + "grad_norm": 0.3415099084377289, + "learning_rate": 2.418181664041881e-06, + "loss": 0.4591, + "step": 41686 + }, + { + "epoch": 0.7739405843541027, + "grad_norm": 0.4192591607570648, + "learning_rate": 2.4174211133464155e-06, + "loss": 0.2941, + "step": 41688 + }, + { + "epoch": 0.7739777144915213, + "grad_norm": 0.3644876778125763, + "learning_rate": 2.4166606658271353e-06, + "loss": 0.1712, + "step": 41690 + }, + { + "epoch": 0.77401484462894, + "grad_norm": 0.20664454996585846, + "learning_rate": 2.415900321494392e-06, + "loss": 0.1908, + "step": 41692 + }, + { + "epoch": 0.7740519747663586, + "grad_norm": 0.5559563040733337, + "learning_rate": 2.4151400803585267e-06, + "loss": 0.1693, + "step": 41694 + }, + { + "epoch": 0.7740891049037772, + "grad_norm": 0.28753381967544556, + "learning_rate": 2.414379942429884e-06, + "loss": 0.2624, + "step": 41696 + }, + { + "epoch": 0.7741262350411959, + "grad_norm": 0.5968526005744934, + "learning_rate": 2.4136199077188095e-06, + "loss": 0.1892, + "step": 41698 + }, + { + "epoch": 0.7741633651786145, + "grad_norm": 0.3468347191810608, + "learning_rate": 2.412859976235644e-06, + "loss": 0.2166, + "step": 41700 + }, + { + "epoch": 0.7742004953160332, + "grad_norm": 0.28130000829696655, + "learning_rate": 2.4121001479907303e-06, + "loss": 0.162, + "step": 41702 + }, + { + "epoch": 0.7742376254534518, + "grad_norm": 0.28966277837753296, + "learning_rate": 2.4113404229944035e-06, + "loss": 0.2251, + "step": 41704 + }, + { + "epoch": 0.7742747555908704, + "grad_norm": 0.2702259421348572, + "learning_rate": 2.4105808012570054e-06, + "loss": 0.2324, + "step": 41706 + }, + { + "epoch": 0.7743118857282891, + "grad_norm": 0.32897037267684937, + "learning_rate": 2.409821282788867e-06, + "loss": 0.2493, + "step": 41708 + }, + { + "epoch": 0.7743490158657077, + "grad_norm": 0.5918217301368713, + "learning_rate": 2.4090618676003242e-06, + "loss": 0.2549, + "step": 41710 + }, + { + "epoch": 0.7743861460031264, + "grad_norm": 0.36162179708480835, + "learning_rate": 2.408302555701714e-06, + "loss": 0.3247, + "step": 41712 + }, + { + "epoch": 0.774423276140545, + "grad_norm": 0.3251967430114746, + "learning_rate": 2.4075433471033647e-06, + "loss": 0.2881, + "step": 41714 + }, + { + "epoch": 0.7744604062779636, + "grad_norm": 0.24003437161445618, + "learning_rate": 2.4067842418156117e-06, + "loss": 0.2203, + "step": 41716 + }, + { + "epoch": 0.7744975364153823, + "grad_norm": 0.6811937689781189, + "learning_rate": 2.4060252398487784e-06, + "loss": 0.2679, + "step": 41718 + }, + { + "epoch": 0.7745346665528009, + "grad_norm": 0.41955360770225525, + "learning_rate": 2.405266341213195e-06, + "loss": 0.3401, + "step": 41720 + }, + { + "epoch": 0.7745717966902196, + "grad_norm": 0.3426463007926941, + "learning_rate": 2.4045075459191868e-06, + "loss": 0.3038, + "step": 41722 + }, + { + "epoch": 0.7746089268276382, + "grad_norm": 0.2100699245929718, + "learning_rate": 2.403748853977084e-06, + "loss": 0.1662, + "step": 41724 + }, + { + "epoch": 0.7746460569650568, + "grad_norm": 0.23500922322273254, + "learning_rate": 2.4029902653972024e-06, + "loss": 0.3091, + "step": 41726 + }, + { + "epoch": 0.7746831871024754, + "grad_norm": 0.3701079785823822, + "learning_rate": 2.4022317801898677e-06, + "loss": 0.3313, + "step": 41728 + }, + { + "epoch": 0.7747203172398941, + "grad_norm": 0.4769270420074463, + "learning_rate": 2.4014733983654025e-06, + "loss": 0.3257, + "step": 41730 + }, + { + "epoch": 0.7747574473773128, + "grad_norm": 0.5516478419303894, + "learning_rate": 2.400715119934123e-06, + "loss": 0.2694, + "step": 41732 + }, + { + "epoch": 0.7747945775147314, + "grad_norm": 0.49802184104919434, + "learning_rate": 2.3999569449063464e-06, + "loss": 0.3135, + "step": 41734 + }, + { + "epoch": 0.77483170765215, + "grad_norm": 0.42903468012809753, + "learning_rate": 2.3991988732923923e-06, + "loss": 0.2142, + "step": 41736 + }, + { + "epoch": 0.7748688377895686, + "grad_norm": 0.2886156737804413, + "learning_rate": 2.398440905102574e-06, + "loss": 0.2809, + "step": 41738 + }, + { + "epoch": 0.7749059679269873, + "grad_norm": 0.34705883264541626, + "learning_rate": 2.3976830403472062e-06, + "loss": 0.2142, + "step": 41740 + }, + { + "epoch": 0.774943098064406, + "grad_norm": 0.5105149745941162, + "learning_rate": 2.3969252790366026e-06, + "loss": 0.2641, + "step": 41742 + }, + { + "epoch": 0.7749802282018246, + "grad_norm": 0.37746235728263855, + "learning_rate": 2.3961676211810704e-06, + "loss": 0.3541, + "step": 41744 + }, + { + "epoch": 0.7750173583392432, + "grad_norm": 0.3768836557865143, + "learning_rate": 2.395410066790922e-06, + "loss": 0.1611, + "step": 41746 + }, + { + "epoch": 0.7750544884766618, + "grad_norm": 0.40296801924705505, + "learning_rate": 2.3946526158764626e-06, + "loss": 0.2451, + "step": 41748 + }, + { + "epoch": 0.7750916186140805, + "grad_norm": 0.33458423614501953, + "learning_rate": 2.393895268448e-06, + "loss": 0.2358, + "step": 41750 + }, + { + "epoch": 0.7751287487514992, + "grad_norm": 0.4118732511997223, + "learning_rate": 2.39313802451584e-06, + "loss": 0.2973, + "step": 41752 + }, + { + "epoch": 0.7751658788889177, + "grad_norm": 0.2701396942138672, + "learning_rate": 2.3923808840902898e-06, + "loss": 0.2615, + "step": 41754 + }, + { + "epoch": 0.7752030090263364, + "grad_norm": 0.40476492047309875, + "learning_rate": 2.3916238471816444e-06, + "loss": 0.2912, + "step": 41756 + }, + { + "epoch": 0.775240139163755, + "grad_norm": 0.406203955411911, + "learning_rate": 2.3908669138002094e-06, + "loss": 0.5011, + "step": 41758 + }, + { + "epoch": 0.7752772693011737, + "grad_norm": 0.4594648778438568, + "learning_rate": 2.390110083956283e-06, + "loss": 0.1457, + "step": 41760 + }, + { + "epoch": 0.7753143994385924, + "grad_norm": 0.1936478465795517, + "learning_rate": 2.389353357660165e-06, + "loss": 0.3277, + "step": 41762 + }, + { + "epoch": 0.775351529576011, + "grad_norm": 0.41431301832199097, + "learning_rate": 2.3885967349221505e-06, + "loss": 0.1855, + "step": 41764 + }, + { + "epoch": 0.7753886597134296, + "grad_norm": 0.48124954104423523, + "learning_rate": 2.3878402157525393e-06, + "loss": 0.2191, + "step": 41766 + }, + { + "epoch": 0.7754257898508482, + "grad_norm": 0.33568528294563293, + "learning_rate": 2.3870838001616216e-06, + "loss": 0.3367, + "step": 41768 + }, + { + "epoch": 0.7754629199882669, + "grad_norm": 0.2685660719871521, + "learning_rate": 2.3863274881596866e-06, + "loss": 0.2314, + "step": 41770 + }, + { + "epoch": 0.7755000501256856, + "grad_norm": 0.6494361162185669, + "learning_rate": 2.3855712797570287e-06, + "loss": 0.3372, + "step": 41772 + }, + { + "epoch": 0.7755371802631041, + "grad_norm": 0.36974701285362244, + "learning_rate": 2.384815174963938e-06, + "loss": 0.2773, + "step": 41774 + }, + { + "epoch": 0.7755743104005228, + "grad_norm": 0.3431699573993683, + "learning_rate": 2.3840591737907038e-06, + "loss": 0.2583, + "step": 41776 + }, + { + "epoch": 0.7756114405379414, + "grad_norm": 0.21646954119205475, + "learning_rate": 2.3833032762476103e-06, + "loss": 0.4935, + "step": 41778 + }, + { + "epoch": 0.7756485706753601, + "grad_norm": 0.467838853597641, + "learning_rate": 2.382547482344948e-06, + "loss": 0.1118, + "step": 41780 + }, + { + "epoch": 0.7756857008127787, + "grad_norm": 0.4340430796146393, + "learning_rate": 2.3817917920929946e-06, + "loss": 0.4535, + "step": 41782 + }, + { + "epoch": 0.7757228309501973, + "grad_norm": 0.4275455176830292, + "learning_rate": 2.3810362055020365e-06, + "loss": 0.3143, + "step": 41784 + }, + { + "epoch": 0.775759961087616, + "grad_norm": 0.43500059843063354, + "learning_rate": 2.3802807225823534e-06, + "loss": 0.2285, + "step": 41786 + }, + { + "epoch": 0.7757970912250346, + "grad_norm": 0.4928950369358063, + "learning_rate": 2.379525343344229e-06, + "loss": 0.1554, + "step": 41788 + }, + { + "epoch": 0.7758342213624533, + "grad_norm": 0.4028860330581665, + "learning_rate": 2.3787700677979363e-06, + "loss": 0.1325, + "step": 41790 + }, + { + "epoch": 0.7758713514998719, + "grad_norm": 0.3991203308105469, + "learning_rate": 2.3780148959537564e-06, + "loss": 0.2951, + "step": 41792 + }, + { + "epoch": 0.7759084816372905, + "grad_norm": 0.4491865336894989, + "learning_rate": 2.3772598278219618e-06, + "loss": 0.2211, + "step": 41794 + }, + { + "epoch": 0.7759456117747092, + "grad_norm": 0.361009806394577, + "learning_rate": 2.3765048634128273e-06, + "loss": 0.3317, + "step": 41796 + }, + { + "epoch": 0.7759827419121278, + "grad_norm": 0.41626113653182983, + "learning_rate": 2.3757500027366276e-06, + "loss": 0.1571, + "step": 41798 + }, + { + "epoch": 0.7760198720495465, + "grad_norm": 0.3314739465713501, + "learning_rate": 2.374995245803632e-06, + "loss": 0.379, + "step": 41800 + }, + { + "epoch": 0.776057002186965, + "grad_norm": 0.5541840195655823, + "learning_rate": 2.374240592624112e-06, + "loss": 0.2985, + "step": 41802 + }, + { + "epoch": 0.7760941323243837, + "grad_norm": 0.38556793332099915, + "learning_rate": 2.3734860432083384e-06, + "loss": 0.2697, + "step": 41804 + }, + { + "epoch": 0.7761312624618024, + "grad_norm": 0.5331282615661621, + "learning_rate": 2.372731597566573e-06, + "loss": 0.2788, + "step": 41806 + }, + { + "epoch": 0.776168392599221, + "grad_norm": 0.46881264448165894, + "learning_rate": 2.371977255709085e-06, + "loss": 0.2824, + "step": 41808 + }, + { + "epoch": 0.7762055227366397, + "grad_norm": 0.28259921073913574, + "learning_rate": 2.3712230176461394e-06, + "loss": 0.2136, + "step": 41810 + }, + { + "epoch": 0.7762426528740582, + "grad_norm": 0.4571775794029236, + "learning_rate": 2.370468883387995e-06, + "loss": 0.4443, + "step": 41812 + }, + { + "epoch": 0.7762797830114769, + "grad_norm": 0.4409199655056, + "learning_rate": 2.369714852944918e-06, + "loss": 0.3909, + "step": 41814 + }, + { + "epoch": 0.7763169131488956, + "grad_norm": 0.2831723988056183, + "learning_rate": 2.368960926327164e-06, + "loss": 0.2213, + "step": 41816 + }, + { + "epoch": 0.7763540432863142, + "grad_norm": 0.6265026330947876, + "learning_rate": 2.3682071035449984e-06, + "loss": 0.3633, + "step": 41818 + }, + { + "epoch": 0.7763911734237329, + "grad_norm": 0.2759802043437958, + "learning_rate": 2.367453384608671e-06, + "loss": 0.415, + "step": 41820 + }, + { + "epoch": 0.7764283035611514, + "grad_norm": 0.4581483006477356, + "learning_rate": 2.366699769528441e-06, + "loss": 0.2496, + "step": 41822 + }, + { + "epoch": 0.7764654336985701, + "grad_norm": 0.5159185528755188, + "learning_rate": 2.365946258314563e-06, + "loss": 0.4825, + "step": 41824 + }, + { + "epoch": 0.7765025638359887, + "grad_norm": 0.5229365229606628, + "learning_rate": 2.365192850977289e-06, + "loss": 0.2857, + "step": 41826 + }, + { + "epoch": 0.7765396939734074, + "grad_norm": 0.6025939583778381, + "learning_rate": 2.3644395475268754e-06, + "loss": 0.3303, + "step": 41828 + }, + { + "epoch": 0.7765768241108261, + "grad_norm": 0.31592264771461487, + "learning_rate": 2.363686347973565e-06, + "loss": 0.2831, + "step": 41830 + }, + { + "epoch": 0.7766139542482446, + "grad_norm": 0.2643633186817169, + "learning_rate": 2.3629332523276126e-06, + "loss": 0.3305, + "step": 41832 + }, + { + "epoch": 0.7766510843856633, + "grad_norm": 0.2707612216472626, + "learning_rate": 2.3621802605992605e-06, + "loss": 0.2775, + "step": 41834 + }, + { + "epoch": 0.7766882145230819, + "grad_norm": 0.4132586121559143, + "learning_rate": 2.361427372798757e-06, + "loss": 0.3679, + "step": 41836 + }, + { + "epoch": 0.7767253446605006, + "grad_norm": 0.4120941758155823, + "learning_rate": 2.3606745889363456e-06, + "loss": 0.2227, + "step": 41838 + }, + { + "epoch": 0.7767624747979193, + "grad_norm": 0.5422465801239014, + "learning_rate": 2.3599219090222725e-06, + "loss": 0.3482, + "step": 41840 + }, + { + "epoch": 0.7767996049353378, + "grad_norm": 0.27213039994239807, + "learning_rate": 2.35916933306678e-06, + "loss": 0.2711, + "step": 41842 + }, + { + "epoch": 0.7768367350727565, + "grad_norm": 0.4004543423652649, + "learning_rate": 2.358416861080103e-06, + "loss": 0.3512, + "step": 41844 + }, + { + "epoch": 0.7768738652101751, + "grad_norm": 0.5901560187339783, + "learning_rate": 2.3576644930724825e-06, + "loss": 0.3286, + "step": 41846 + }, + { + "epoch": 0.7769109953475938, + "grad_norm": 0.4954366087913513, + "learning_rate": 2.3569122290541568e-06, + "loss": 0.2201, + "step": 41848 + }, + { + "epoch": 0.7769481254850125, + "grad_norm": 0.4452698528766632, + "learning_rate": 2.356160069035366e-06, + "loss": 0.2376, + "step": 41850 + }, + { + "epoch": 0.776985255622431, + "grad_norm": 0.37313178181648254, + "learning_rate": 2.355408013026337e-06, + "loss": 0.2182, + "step": 41852 + }, + { + "epoch": 0.7770223857598497, + "grad_norm": 0.43775200843811035, + "learning_rate": 2.354656061037307e-06, + "loss": 0.2139, + "step": 41854 + }, + { + "epoch": 0.7770595158972683, + "grad_norm": 0.24441379308700562, + "learning_rate": 2.3539042130785106e-06, + "loss": 0.1814, + "step": 41856 + }, + { + "epoch": 0.777096646034687, + "grad_norm": 0.3462609052658081, + "learning_rate": 2.3531524691601714e-06, + "loss": 0.2181, + "step": 41858 + }, + { + "epoch": 0.7771337761721057, + "grad_norm": 0.31912118196487427, + "learning_rate": 2.3524008292925237e-06, + "loss": 0.2718, + "step": 41860 + }, + { + "epoch": 0.7771709063095242, + "grad_norm": 0.23799973726272583, + "learning_rate": 2.3516492934857925e-06, + "loss": 0.1257, + "step": 41862 + }, + { + "epoch": 0.7772080364469429, + "grad_norm": 0.445029079914093, + "learning_rate": 2.3508978617502044e-06, + "loss": 0.4255, + "step": 41864 + }, + { + "epoch": 0.7772451665843615, + "grad_norm": 0.40211769938468933, + "learning_rate": 2.350146534095986e-06, + "loss": 0.2717, + "step": 41866 + }, + { + "epoch": 0.7772822967217802, + "grad_norm": 0.38863760232925415, + "learning_rate": 2.349395310533361e-06, + "loss": 0.3589, + "step": 41868 + }, + { + "epoch": 0.7773194268591989, + "grad_norm": 0.4855385720729828, + "learning_rate": 2.3486441910725477e-06, + "loss": 0.2991, + "step": 41870 + }, + { + "epoch": 0.7773565569966174, + "grad_norm": 0.3380052149295807, + "learning_rate": 2.3478931757237675e-06, + "loss": 0.2445, + "step": 41872 + }, + { + "epoch": 0.7773936871340361, + "grad_norm": 0.3508371114730835, + "learning_rate": 2.3471422644972443e-06, + "loss": 0.2756, + "step": 41874 + }, + { + "epoch": 0.7774308172714547, + "grad_norm": 0.23072625696659088, + "learning_rate": 2.346391457403189e-06, + "loss": 0.1805, + "step": 41876 + }, + { + "epoch": 0.7774679474088734, + "grad_norm": 0.3609980046749115, + "learning_rate": 2.34564075445182e-06, + "loss": 0.2276, + "step": 41878 + }, + { + "epoch": 0.777505077546292, + "grad_norm": 0.30046242475509644, + "learning_rate": 2.3448901556533565e-06, + "loss": 0.1758, + "step": 41880 + }, + { + "epoch": 0.7775422076837106, + "grad_norm": 0.2343037724494934, + "learning_rate": 2.3441396610180044e-06, + "loss": 0.1896, + "step": 41882 + }, + { + "epoch": 0.7775793378211293, + "grad_norm": 0.48427316546440125, + "learning_rate": 2.3433892705559803e-06, + "loss": 0.2592, + "step": 41884 + }, + { + "epoch": 0.7776164679585479, + "grad_norm": 0.2749733626842499, + "learning_rate": 2.3426389842774945e-06, + "loss": 0.2929, + "step": 41886 + }, + { + "epoch": 0.7776535980959666, + "grad_norm": 0.3403457701206207, + "learning_rate": 2.3418888021927544e-06, + "loss": 0.1602, + "step": 41888 + }, + { + "epoch": 0.7776907282333851, + "grad_norm": 0.5123617649078369, + "learning_rate": 2.34113872431197e-06, + "loss": 0.265, + "step": 41890 + }, + { + "epoch": 0.7777278583708038, + "grad_norm": 0.4079894423484802, + "learning_rate": 2.3403887506453495e-06, + "loss": 0.161, + "step": 41892 + }, + { + "epoch": 0.7777649885082225, + "grad_norm": 0.39733049273490906, + "learning_rate": 2.3396388812030913e-06, + "loss": 0.2258, + "step": 41894 + }, + { + "epoch": 0.7778021186456411, + "grad_norm": 0.40713706612586975, + "learning_rate": 2.3388891159954053e-06, + "loss": 0.3025, + "step": 41896 + }, + { + "epoch": 0.7778392487830598, + "grad_norm": 0.4357052445411682, + "learning_rate": 2.3381394550324886e-06, + "loss": 0.2517, + "step": 41898 + }, + { + "epoch": 0.7778763789204783, + "grad_norm": 0.31053557991981506, + "learning_rate": 2.3373898983245435e-06, + "loss": 0.1198, + "step": 41900 + }, + { + "epoch": 0.777913509057897, + "grad_norm": 0.31501707434654236, + "learning_rate": 2.3366404458817705e-06, + "loss": 0.2008, + "step": 41902 + }, + { + "epoch": 0.7779506391953157, + "grad_norm": 0.26989200711250305, + "learning_rate": 2.3358910977143657e-06, + "loss": 0.484, + "step": 41904 + }, + { + "epoch": 0.7779877693327343, + "grad_norm": 0.3833233714103699, + "learning_rate": 2.33514185383253e-06, + "loss": 0.2718, + "step": 41906 + }, + { + "epoch": 0.778024899470153, + "grad_norm": 0.29126936197280884, + "learning_rate": 2.334392714246452e-06, + "loss": 0.1786, + "step": 41908 + }, + { + "epoch": 0.7780620296075715, + "grad_norm": 0.4694153368473053, + "learning_rate": 2.3336436789663276e-06, + "loss": 0.2016, + "step": 41910 + }, + { + "epoch": 0.7780991597449902, + "grad_norm": 0.4486958682537079, + "learning_rate": 2.332894748002349e-06, + "loss": 0.2268, + "step": 41912 + }, + { + "epoch": 0.7781362898824089, + "grad_norm": 0.4123126268386841, + "learning_rate": 2.33214592136471e-06, + "loss": 0.2729, + "step": 41914 + }, + { + "epoch": 0.7781734200198275, + "grad_norm": 0.27961716055870056, + "learning_rate": 2.3313971990635943e-06, + "loss": 0.184, + "step": 41916 + }, + { + "epoch": 0.7782105501572462, + "grad_norm": 0.5163271427154541, + "learning_rate": 2.3306485811091962e-06, + "loss": 0.2869, + "step": 41918 + }, + { + "epoch": 0.7782476802946647, + "grad_norm": 0.56910240650177, + "learning_rate": 2.3299000675116954e-06, + "loss": 0.2688, + "step": 41920 + }, + { + "epoch": 0.7782848104320834, + "grad_norm": 0.34338659048080444, + "learning_rate": 2.32915165828128e-06, + "loss": 0.4302, + "step": 41922 + }, + { + "epoch": 0.7783219405695021, + "grad_norm": 0.4305938184261322, + "learning_rate": 2.3284033534281334e-06, + "loss": 0.4123, + "step": 41924 + }, + { + "epoch": 0.7783590707069207, + "grad_norm": 0.8954707384109497, + "learning_rate": 2.327655152962438e-06, + "loss": 0.2256, + "step": 41926 + }, + { + "epoch": 0.7783962008443394, + "grad_norm": 0.6107361912727356, + "learning_rate": 2.326907056894375e-06, + "loss": 0.2014, + "step": 41928 + }, + { + "epoch": 0.7784333309817579, + "grad_norm": 0.38959434628486633, + "learning_rate": 2.3261590652341257e-06, + "loss": 0.3396, + "step": 41930 + }, + { + "epoch": 0.7784704611191766, + "grad_norm": 0.38291648030281067, + "learning_rate": 2.325411177991864e-06, + "loss": 0.2578, + "step": 41932 + }, + { + "epoch": 0.7785075912565952, + "grad_norm": 0.40688517689704895, + "learning_rate": 2.3246633951777675e-06, + "loss": 0.3241, + "step": 41934 + }, + { + "epoch": 0.7785447213940139, + "grad_norm": 0.9046632647514343, + "learning_rate": 2.323915716802014e-06, + "loss": 0.3008, + "step": 41936 + }, + { + "epoch": 0.7785818515314326, + "grad_norm": 0.5526497960090637, + "learning_rate": 2.323168142874773e-06, + "loss": 0.2242, + "step": 41938 + }, + { + "epoch": 0.7786189816688511, + "grad_norm": 0.43880075216293335, + "learning_rate": 2.3224206734062196e-06, + "loss": 0.2335, + "step": 41940 + }, + { + "epoch": 0.7786561118062698, + "grad_norm": 0.4893726408481598, + "learning_rate": 2.3216733084065223e-06, + "loss": 0.5103, + "step": 41942 + }, + { + "epoch": 0.7786932419436884, + "grad_norm": 0.3787483870983124, + "learning_rate": 2.3209260478858552e-06, + "loss": 0.3352, + "step": 41944 + }, + { + "epoch": 0.7787303720811071, + "grad_norm": 0.3361474275588989, + "learning_rate": 2.320178891854381e-06, + "loss": 0.1912, + "step": 41946 + }, + { + "epoch": 0.7787675022185258, + "grad_norm": 0.4269227087497711, + "learning_rate": 2.319431840322268e-06, + "loss": 0.3048, + "step": 41948 + }, + { + "epoch": 0.7788046323559443, + "grad_norm": 0.3966909945011139, + "learning_rate": 2.318684893299682e-06, + "loss": 0.2092, + "step": 41950 + }, + { + "epoch": 0.778841762493363, + "grad_norm": 0.42484644055366516, + "learning_rate": 2.3179380507967853e-06, + "loss": 0.1543, + "step": 41952 + }, + { + "epoch": 0.7788788926307816, + "grad_norm": 0.18529032170772552, + "learning_rate": 2.3171913128237424e-06, + "loss": 0.2175, + "step": 41954 + }, + { + "epoch": 0.7789160227682003, + "grad_norm": 0.303256094455719, + "learning_rate": 2.3164446793907146e-06, + "loss": 0.2586, + "step": 41956 + }, + { + "epoch": 0.778953152905619, + "grad_norm": 0.4764872193336487, + "learning_rate": 2.3156981505078614e-06, + "loss": 0.2801, + "step": 41958 + }, + { + "epoch": 0.7789902830430375, + "grad_norm": 0.2969540059566498, + "learning_rate": 2.314951726185336e-06, + "loss": 0.3265, + "step": 41960 + }, + { + "epoch": 0.7790274131804562, + "grad_norm": 0.5049210786819458, + "learning_rate": 2.3142054064332973e-06, + "loss": 0.2828, + "step": 41962 + }, + { + "epoch": 0.7790645433178748, + "grad_norm": 0.30695393681526184, + "learning_rate": 2.313459191261902e-06, + "loss": 0.1335, + "step": 41964 + }, + { + "epoch": 0.7791016734552935, + "grad_norm": 0.3008454144001007, + "learning_rate": 2.3127130806813037e-06, + "loss": 0.1898, + "step": 41966 + }, + { + "epoch": 0.7791388035927121, + "grad_norm": 0.5297009944915771, + "learning_rate": 2.3119670747016565e-06, + "loss": 0.3236, + "step": 41968 + }, + { + "epoch": 0.7791759337301307, + "grad_norm": 0.42279383540153503, + "learning_rate": 2.311221173333106e-06, + "loss": 0.2575, + "step": 41970 + }, + { + "epoch": 0.7792130638675494, + "grad_norm": 0.4686029851436615, + "learning_rate": 2.3104753765858056e-06, + "loss": 0.1267, + "step": 41972 + }, + { + "epoch": 0.779250194004968, + "grad_norm": 0.39017078280448914, + "learning_rate": 2.3097296844699015e-06, + "loss": 0.4226, + "step": 41974 + }, + { + "epoch": 0.7792873241423867, + "grad_norm": 0.35871565341949463, + "learning_rate": 2.3089840969955425e-06, + "loss": 0.3409, + "step": 41976 + }, + { + "epoch": 0.7793244542798052, + "grad_norm": 0.34710395336151123, + "learning_rate": 2.308238614172875e-06, + "loss": 0.3553, + "step": 41978 + }, + { + "epoch": 0.7793615844172239, + "grad_norm": 0.31683486700057983, + "learning_rate": 2.307493236012037e-06, + "loss": 0.211, + "step": 41980 + }, + { + "epoch": 0.7793987145546426, + "grad_norm": 0.3108522593975067, + "learning_rate": 2.306747962523178e-06, + "loss": 0.2902, + "step": 41982 + }, + { + "epoch": 0.7794358446920612, + "grad_norm": 0.472512811422348, + "learning_rate": 2.306002793716432e-06, + "loss": 0.2839, + "step": 41984 + }, + { + "epoch": 0.7794729748294799, + "grad_norm": 0.14931365847587585, + "learning_rate": 2.305257729601942e-06, + "loss": 0.0513, + "step": 41986 + }, + { + "epoch": 0.7795101049668984, + "grad_norm": 0.5165044069290161, + "learning_rate": 2.304512770189846e-06, + "loss": 0.2634, + "step": 41988 + }, + { + "epoch": 0.7795472351043171, + "grad_norm": 0.29606005549430847, + "learning_rate": 2.3037679154902805e-06, + "loss": 0.2472, + "step": 41990 + }, + { + "epoch": 0.7795843652417358, + "grad_norm": 0.2096748650074005, + "learning_rate": 2.3030231655133806e-06, + "loss": 0.3337, + "step": 41992 + }, + { + "epoch": 0.7796214953791544, + "grad_norm": 0.5780453085899353, + "learning_rate": 2.302278520269283e-06, + "loss": 0.1443, + "step": 41994 + }, + { + "epoch": 0.7796586255165731, + "grad_norm": 0.5483688712120056, + "learning_rate": 2.3015339797681158e-06, + "loss": 0.3706, + "step": 41996 + }, + { + "epoch": 0.7796957556539916, + "grad_norm": 0.6324063539505005, + "learning_rate": 2.30078954402001e-06, + "loss": 0.2967, + "step": 41998 + }, + { + "epoch": 0.7797328857914103, + "grad_norm": 0.29727110266685486, + "learning_rate": 2.3000452130351e-06, + "loss": 0.1619, + "step": 42000 + }, + { + "epoch": 0.779770015928829, + "grad_norm": 0.4517129957675934, + "learning_rate": 2.2993009868235084e-06, + "loss": 0.2119, + "step": 42002 + }, + { + "epoch": 0.7798071460662476, + "grad_norm": 0.37397605180740356, + "learning_rate": 2.2985568653953637e-06, + "loss": 0.3, + "step": 42004 + }, + { + "epoch": 0.7798442762036663, + "grad_norm": 0.5794094800949097, + "learning_rate": 2.2978128487607943e-06, + "loss": 0.15, + "step": 42006 + }, + { + "epoch": 0.7798814063410848, + "grad_norm": 0.5519453883171082, + "learning_rate": 2.297068936929918e-06, + "loss": 0.2503, + "step": 42008 + }, + { + "epoch": 0.7799185364785035, + "grad_norm": 0.28805795311927795, + "learning_rate": 2.296325129912861e-06, + "loss": 0.2611, + "step": 42010 + }, + { + "epoch": 0.7799556666159222, + "grad_norm": 0.368137925863266, + "learning_rate": 2.295581427719744e-06, + "loss": 0.2944, + "step": 42012 + }, + { + "epoch": 0.7799927967533408, + "grad_norm": 0.23401226103305817, + "learning_rate": 2.2948378303606855e-06, + "loss": 0.2212, + "step": 42014 + }, + { + "epoch": 0.7800299268907595, + "grad_norm": 0.4040607810020447, + "learning_rate": 2.294094337845806e-06, + "loss": 0.3175, + "step": 42016 + }, + { + "epoch": 0.780067057028178, + "grad_norm": 0.4615797698497772, + "learning_rate": 2.293350950185219e-06, + "loss": 0.3436, + "step": 42018 + }, + { + "epoch": 0.7801041871655967, + "grad_norm": 0.9331826567649841, + "learning_rate": 2.292607667389045e-06, + "loss": 0.3342, + "step": 42020 + }, + { + "epoch": 0.7801413173030154, + "grad_norm": 0.34921762347221375, + "learning_rate": 2.2918644894673948e-06, + "loss": 0.153, + "step": 42022 + }, + { + "epoch": 0.780178447440434, + "grad_norm": 0.2864750623703003, + "learning_rate": 2.2911214164303776e-06, + "loss": 0.2322, + "step": 42024 + }, + { + "epoch": 0.7802155775778526, + "grad_norm": 0.2754424512386322, + "learning_rate": 2.2903784482881063e-06, + "loss": 0.2532, + "step": 42026 + }, + { + "epoch": 0.7802527077152712, + "grad_norm": 0.3516775965690613, + "learning_rate": 2.2896355850506923e-06, + "loss": 0.2639, + "step": 42028 + }, + { + "epoch": 0.7802898378526899, + "grad_norm": 1.0208972692489624, + "learning_rate": 2.2888928267282428e-06, + "loss": 0.1987, + "step": 42030 + }, + { + "epoch": 0.7803269679901085, + "grad_norm": 0.47181200981140137, + "learning_rate": 2.2881501733308663e-06, + "loss": 0.2332, + "step": 42032 + }, + { + "epoch": 0.7803640981275272, + "grad_norm": 0.3775668740272522, + "learning_rate": 2.2874076248686637e-06, + "loss": 0.2129, + "step": 42034 + }, + { + "epoch": 0.7804012282649458, + "grad_norm": 0.4194645583629608, + "learning_rate": 2.2866651813517417e-06, + "loss": 0.315, + "step": 42036 + }, + { + "epoch": 0.7804383584023644, + "grad_norm": 0.47878801822662354, + "learning_rate": 2.2859228427902026e-06, + "loss": 0.2684, + "step": 42038 + }, + { + "epoch": 0.7804754885397831, + "grad_norm": 0.6330925822257996, + "learning_rate": 2.2851806091941476e-06, + "loss": 0.248, + "step": 42040 + }, + { + "epoch": 0.7805126186772017, + "grad_norm": 0.5842950344085693, + "learning_rate": 2.2844384805736776e-06, + "loss": 0.2271, + "step": 42042 + }, + { + "epoch": 0.7805497488146204, + "grad_norm": 0.49504387378692627, + "learning_rate": 2.2836964569388895e-06, + "loss": 0.3843, + "step": 42044 + }, + { + "epoch": 0.780586878952039, + "grad_norm": 0.4040229618549347, + "learning_rate": 2.2829545382998777e-06, + "loss": 0.2696, + "step": 42046 + }, + { + "epoch": 0.7806240090894576, + "grad_norm": 0.20205792784690857, + "learning_rate": 2.282212724666739e-06, + "loss": 0.2228, + "step": 42048 + }, + { + "epoch": 0.7806611392268763, + "grad_norm": 0.46668437123298645, + "learning_rate": 2.281471016049568e-06, + "loss": 0.3114, + "step": 42050 + }, + { + "epoch": 0.7806982693642949, + "grad_norm": 0.6821794509887695, + "learning_rate": 2.2807294124584557e-06, + "loss": 0.3434, + "step": 42052 + }, + { + "epoch": 0.7807353995017136, + "grad_norm": 0.4331595003604889, + "learning_rate": 2.2799879139034943e-06, + "loss": 0.4891, + "step": 42054 + }, + { + "epoch": 0.7807725296391322, + "grad_norm": 0.41589832305908203, + "learning_rate": 2.2792465203947744e-06, + "loss": 0.2665, + "step": 42056 + }, + { + "epoch": 0.7808096597765508, + "grad_norm": 0.5310564041137695, + "learning_rate": 2.278505231942385e-06, + "loss": 0.4039, + "step": 42058 + }, + { + "epoch": 0.7808467899139695, + "grad_norm": 0.38395994901657104, + "learning_rate": 2.277764048556408e-06, + "loss": 0.2773, + "step": 42060 + }, + { + "epoch": 0.7808839200513881, + "grad_norm": 0.5153673887252808, + "learning_rate": 2.2770229702469314e-06, + "loss": 0.1792, + "step": 42062 + }, + { + "epoch": 0.7809210501888068, + "grad_norm": 0.2802532911300659, + "learning_rate": 2.2762819970240425e-06, + "loss": 0.1351, + "step": 42064 + }, + { + "epoch": 0.7809581803262254, + "grad_norm": 0.30472829937934875, + "learning_rate": 2.275541128897818e-06, + "loss": 0.4752, + "step": 42066 + }, + { + "epoch": 0.780995310463644, + "grad_norm": 0.30499258637428284, + "learning_rate": 2.2748003658783403e-06, + "loss": 0.1809, + "step": 42068 + }, + { + "epoch": 0.7810324406010627, + "grad_norm": 0.39047542214393616, + "learning_rate": 2.274059707975693e-06, + "loss": 0.1683, + "step": 42070 + }, + { + "epoch": 0.7810695707384813, + "grad_norm": 0.5352392196655273, + "learning_rate": 2.273319155199949e-06, + "loss": 0.1672, + "step": 42072 + }, + { + "epoch": 0.7811067008759, + "grad_norm": 0.39798739552497864, + "learning_rate": 2.2725787075611873e-06, + "loss": 0.3741, + "step": 42074 + }, + { + "epoch": 0.7811438310133186, + "grad_norm": 0.2674466669559479, + "learning_rate": 2.271838365069482e-06, + "loss": 0.1709, + "step": 42076 + }, + { + "epoch": 0.7811809611507372, + "grad_norm": 0.7873106598854065, + "learning_rate": 2.2710981277349085e-06, + "loss": 0.2155, + "step": 42078 + }, + { + "epoch": 0.7812180912881559, + "grad_norm": 0.6064042448997498, + "learning_rate": 2.2703579955675393e-06, + "loss": 0.2535, + "step": 42080 + }, + { + "epoch": 0.7812552214255745, + "grad_norm": 0.4949207603931427, + "learning_rate": 2.2696179685774467e-06, + "loss": 0.4307, + "step": 42082 + }, + { + "epoch": 0.7812923515629931, + "grad_norm": 0.3666563928127289, + "learning_rate": 2.268878046774696e-06, + "loss": 0.198, + "step": 42084 + }, + { + "epoch": 0.7813294817004117, + "grad_norm": 0.3478747606277466, + "learning_rate": 2.268138230169361e-06, + "loss": 0.23, + "step": 42086 + }, + { + "epoch": 0.7813666118378304, + "grad_norm": 0.4900912046432495, + "learning_rate": 2.2673985187715018e-06, + "loss": 0.4835, + "step": 42088 + }, + { + "epoch": 0.7814037419752491, + "grad_norm": 0.25482162833213806, + "learning_rate": 2.2666589125911865e-06, + "loss": 0.2716, + "step": 42090 + }, + { + "epoch": 0.7814408721126677, + "grad_norm": 0.4350324869155884, + "learning_rate": 2.2659194116384807e-06, + "loss": 0.1359, + "step": 42092 + }, + { + "epoch": 0.7814780022500863, + "grad_norm": 0.6658808588981628, + "learning_rate": 2.265180015923447e-06, + "loss": 0.2899, + "step": 42094 + }, + { + "epoch": 0.7815151323875049, + "grad_norm": 0.46713539958000183, + "learning_rate": 2.264440725456143e-06, + "loss": 0.4535, + "step": 42096 + }, + { + "epoch": 0.7815522625249236, + "grad_norm": 0.5268535614013672, + "learning_rate": 2.263701540246629e-06, + "loss": 0.4016, + "step": 42098 + }, + { + "epoch": 0.7815893926623423, + "grad_norm": 0.27765798568725586, + "learning_rate": 2.262962460304965e-06, + "loss": 0.2465, + "step": 42100 + }, + { + "epoch": 0.7816265227997609, + "grad_norm": 0.24521902203559875, + "learning_rate": 2.262223485641206e-06, + "loss": 0.3014, + "step": 42102 + }, + { + "epoch": 0.7816636529371795, + "grad_norm": 0.23071905970573425, + "learning_rate": 2.2614846162654114e-06, + "loss": 0.2213, + "step": 42104 + }, + { + "epoch": 0.7817007830745981, + "grad_norm": 0.32160794734954834, + "learning_rate": 2.260745852187628e-06, + "loss": 0.2285, + "step": 42106 + }, + { + "epoch": 0.7817379132120168, + "grad_norm": 0.5220719575881958, + "learning_rate": 2.2600071934179156e-06, + "loss": 0.4042, + "step": 42108 + }, + { + "epoch": 0.7817750433494355, + "grad_norm": 0.27637115120887756, + "learning_rate": 2.2592686399663176e-06, + "loss": 0.4533, + "step": 42110 + }, + { + "epoch": 0.7818121734868541, + "grad_norm": 0.2628991901874542, + "learning_rate": 2.258530191842888e-06, + "loss": 0.3948, + "step": 42112 + }, + { + "epoch": 0.7818493036242727, + "grad_norm": 0.37218040227890015, + "learning_rate": 2.2577918490576747e-06, + "loss": 0.2119, + "step": 42114 + }, + { + "epoch": 0.7818864337616913, + "grad_norm": 0.4413298964500427, + "learning_rate": 2.257053611620723e-06, + "loss": 0.363, + "step": 42116 + }, + { + "epoch": 0.78192356389911, + "grad_norm": 0.5194867253303528, + "learning_rate": 2.2563154795420785e-06, + "loss": 0.2652, + "step": 42118 + }, + { + "epoch": 0.7819606940365287, + "grad_norm": 0.555852472782135, + "learning_rate": 2.2555774528317885e-06, + "loss": 0.1859, + "step": 42120 + }, + { + "epoch": 0.7819978241739473, + "grad_norm": 0.4232317805290222, + "learning_rate": 2.2548395314998895e-06, + "loss": 0.185, + "step": 42122 + }, + { + "epoch": 0.7820349543113659, + "grad_norm": 0.43475258350372314, + "learning_rate": 2.254101715556425e-06, + "loss": 0.3747, + "step": 42124 + }, + { + "epoch": 0.7820720844487845, + "grad_norm": 0.4701539874076843, + "learning_rate": 2.2533640050114338e-06, + "loss": 0.2549, + "step": 42126 + }, + { + "epoch": 0.7821092145862032, + "grad_norm": 0.45172321796417236, + "learning_rate": 2.2526263998749575e-06, + "loss": 0.4192, + "step": 42128 + }, + { + "epoch": 0.7821463447236218, + "grad_norm": 0.3604142367839813, + "learning_rate": 2.2518889001570276e-06, + "loss": 0.3117, + "step": 42130 + }, + { + "epoch": 0.7821834748610405, + "grad_norm": 0.5219382643699646, + "learning_rate": 2.2511515058676848e-06, + "loss": 0.4455, + "step": 42132 + }, + { + "epoch": 0.7822206049984591, + "grad_norm": 0.4895544648170471, + "learning_rate": 2.2504142170169554e-06, + "loss": 0.3093, + "step": 42134 + }, + { + "epoch": 0.7822577351358777, + "grad_norm": 0.25226712226867676, + "learning_rate": 2.2496770336148767e-06, + "loss": 0.3284, + "step": 42136 + }, + { + "epoch": 0.7822948652732964, + "grad_norm": 0.37229016423225403, + "learning_rate": 2.2489399556714786e-06, + "loss": 0.2861, + "step": 42138 + }, + { + "epoch": 0.782331995410715, + "grad_norm": 0.28121745586395264, + "learning_rate": 2.24820298319679e-06, + "loss": 0.2673, + "step": 42140 + }, + { + "epoch": 0.7823691255481336, + "grad_norm": 0.527967095375061, + "learning_rate": 2.2474661162008404e-06, + "loss": 0.2407, + "step": 42142 + }, + { + "epoch": 0.7824062556855523, + "grad_norm": 0.46580854058265686, + "learning_rate": 2.2467293546936543e-06, + "loss": 0.3723, + "step": 42144 + }, + { + "epoch": 0.7824433858229709, + "grad_norm": 0.2727673053741455, + "learning_rate": 2.245992698685262e-06, + "loss": 0.3469, + "step": 42146 + }, + { + "epoch": 0.7824805159603896, + "grad_norm": 0.7394891977310181, + "learning_rate": 2.2452561481856794e-06, + "loss": 0.3744, + "step": 42148 + }, + { + "epoch": 0.7825176460978082, + "grad_norm": 0.5637769103050232, + "learning_rate": 2.244519703204936e-06, + "loss": 0.1769, + "step": 42150 + }, + { + "epoch": 0.7825547762352268, + "grad_norm": 0.339983195066452, + "learning_rate": 2.243783363753045e-06, + "loss": 0.2278, + "step": 42152 + }, + { + "epoch": 0.7825919063726455, + "grad_norm": 0.4092691242694855, + "learning_rate": 2.2430471298400315e-06, + "loss": 0.2464, + "step": 42154 + }, + { + "epoch": 0.7826290365100641, + "grad_norm": 0.35632917284965515, + "learning_rate": 2.2423110014759106e-06, + "loss": 0.2528, + "step": 42156 + }, + { + "epoch": 0.7826661666474828, + "grad_norm": 0.36183103919029236, + "learning_rate": 2.241574978670703e-06, + "loss": 0.3273, + "step": 42158 + }, + { + "epoch": 0.7827032967849014, + "grad_norm": 0.35036203265190125, + "learning_rate": 2.240839061434419e-06, + "loss": 0.2789, + "step": 42160 + }, + { + "epoch": 0.78274042692232, + "grad_norm": 0.3006502389907837, + "learning_rate": 2.2401032497770725e-06, + "loss": 0.3745, + "step": 42162 + }, + { + "epoch": 0.7827775570597387, + "grad_norm": 0.4061864912509918, + "learning_rate": 2.239367543708678e-06, + "loss": 0.2231, + "step": 42164 + }, + { + "epoch": 0.7828146871971573, + "grad_norm": 0.39592668414115906, + "learning_rate": 2.2386319432392457e-06, + "loss": 0.2048, + "step": 42166 + }, + { + "epoch": 0.782851817334576, + "grad_norm": 0.2266625612974167, + "learning_rate": 2.237896448378787e-06, + "loss": 0.2765, + "step": 42168 + }, + { + "epoch": 0.7828889474719946, + "grad_norm": 0.363943487405777, + "learning_rate": 2.237161059137305e-06, + "loss": 0.1758, + "step": 42170 + }, + { + "epoch": 0.7829260776094132, + "grad_norm": 0.43294623494148254, + "learning_rate": 2.236425775524811e-06, + "loss": 0.358, + "step": 42172 + }, + { + "epoch": 0.7829632077468319, + "grad_norm": 0.42422956228256226, + "learning_rate": 2.2356905975513042e-06, + "loss": 0.1218, + "step": 42174 + }, + { + "epoch": 0.7830003378842505, + "grad_norm": 0.5290468335151672, + "learning_rate": 2.2349555252267928e-06, + "loss": 0.4254, + "step": 42176 + }, + { + "epoch": 0.7830374680216692, + "grad_norm": 0.2562585175037384, + "learning_rate": 2.2342205585612775e-06, + "loss": 0.2152, + "step": 42178 + }, + { + "epoch": 0.7830745981590878, + "grad_norm": 0.632354199886322, + "learning_rate": 2.233485697564758e-06, + "loss": 0.4759, + "step": 42180 + }, + { + "epoch": 0.7831117282965064, + "grad_norm": 0.588380753993988, + "learning_rate": 2.232750942247236e-06, + "loss": 0.31, + "step": 42182 + }, + { + "epoch": 0.783148858433925, + "grad_norm": 0.43167319893836975, + "learning_rate": 2.2320162926187108e-06, + "loss": 0.3145, + "step": 42184 + }, + { + "epoch": 0.7831859885713437, + "grad_norm": 0.4440044164657593, + "learning_rate": 2.2312817486891723e-06, + "loss": 0.2789, + "step": 42186 + }, + { + "epoch": 0.7832231187087624, + "grad_norm": 0.31821054220199585, + "learning_rate": 2.2305473104686205e-06, + "loss": 0.3079, + "step": 42188 + }, + { + "epoch": 0.783260248846181, + "grad_norm": 0.38078948855400085, + "learning_rate": 2.2298129779670497e-06, + "loss": 0.3356, + "step": 42190 + }, + { + "epoch": 0.7832973789835996, + "grad_norm": 0.32156088948249817, + "learning_rate": 2.2290787511944467e-06, + "loss": 0.2949, + "step": 42192 + }, + { + "epoch": 0.7833345091210182, + "grad_norm": 0.29858896136283875, + "learning_rate": 2.2283446301608056e-06, + "loss": 0.3889, + "step": 42194 + }, + { + "epoch": 0.7833716392584369, + "grad_norm": 0.32905834913253784, + "learning_rate": 2.227610614876119e-06, + "loss": 0.2349, + "step": 42196 + }, + { + "epoch": 0.7834087693958556, + "grad_norm": 0.3558305501937866, + "learning_rate": 2.226876705350367e-06, + "loss": 0.3838, + "step": 42198 + }, + { + "epoch": 0.7834458995332741, + "grad_norm": 0.5639286637306213, + "learning_rate": 2.22614290159354e-06, + "loss": 0.2484, + "step": 42200 + }, + { + "epoch": 0.7834830296706928, + "grad_norm": 0.4013896882534027, + "learning_rate": 2.2254092036156226e-06, + "loss": 0.2619, + "step": 42202 + }, + { + "epoch": 0.7835201598081114, + "grad_norm": 0.46153512597084045, + "learning_rate": 2.2246756114265986e-06, + "loss": 0.1666, + "step": 42204 + }, + { + "epoch": 0.7835572899455301, + "grad_norm": 0.49233052134513855, + "learning_rate": 2.22394212503645e-06, + "loss": 0.3297, + "step": 42206 + }, + { + "epoch": 0.7835944200829488, + "grad_norm": 0.4743593633174896, + "learning_rate": 2.2232087444551596e-06, + "loss": 0.1964, + "step": 42208 + }, + { + "epoch": 0.7836315502203673, + "grad_norm": 0.49502843618392944, + "learning_rate": 2.2224754696927007e-06, + "loss": 0.2462, + "step": 42210 + }, + { + "epoch": 0.783668680357786, + "grad_norm": 0.3522895276546478, + "learning_rate": 2.2217423007590575e-06, + "loss": 0.2152, + "step": 42212 + }, + { + "epoch": 0.7837058104952046, + "grad_norm": 0.43937209248542786, + "learning_rate": 2.2210092376642012e-06, + "loss": 0.2305, + "step": 42214 + }, + { + "epoch": 0.7837429406326233, + "grad_norm": 0.5793728828430176, + "learning_rate": 2.220276280418108e-06, + "loss": 0.4118, + "step": 42216 + }, + { + "epoch": 0.783780070770042, + "grad_norm": 0.34709978103637695, + "learning_rate": 2.2195434290307507e-06, + "loss": 0.3892, + "step": 42218 + }, + { + "epoch": 0.7838172009074605, + "grad_norm": 0.4877430498600006, + "learning_rate": 2.218810683512105e-06, + "loss": 0.1858, + "step": 42220 + }, + { + "epoch": 0.7838543310448792, + "grad_norm": 0.20193272829055786, + "learning_rate": 2.218078043872136e-06, + "loss": 0.1909, + "step": 42222 + }, + { + "epoch": 0.7838914611822978, + "grad_norm": 0.6479068994522095, + "learning_rate": 2.217345510120816e-06, + "loss": 0.1777, + "step": 42224 + }, + { + "epoch": 0.7839285913197165, + "grad_norm": 0.3782392740249634, + "learning_rate": 2.216613082268111e-06, + "loss": 0.3121, + "step": 42226 + }, + { + "epoch": 0.7839657214571352, + "grad_norm": 0.5034322142601013, + "learning_rate": 2.2158807603239883e-06, + "loss": 0.1657, + "step": 42228 + }, + { + "epoch": 0.7840028515945537, + "grad_norm": 0.6439116597175598, + "learning_rate": 2.2151485442984123e-06, + "loss": 0.2578, + "step": 42230 + }, + { + "epoch": 0.7840399817319724, + "grad_norm": 0.3102177679538727, + "learning_rate": 2.2144164342013495e-06, + "loss": 0.2814, + "step": 42232 + }, + { + "epoch": 0.784077111869391, + "grad_norm": 0.43839606642723083, + "learning_rate": 2.213684430042755e-06, + "loss": 0.4285, + "step": 42234 + }, + { + "epoch": 0.7841142420068097, + "grad_norm": 0.3004555404186249, + "learning_rate": 2.2129525318325962e-06, + "loss": 0.2383, + "step": 42236 + }, + { + "epoch": 0.7841513721442283, + "grad_norm": 0.4291225075721741, + "learning_rate": 2.212220739580825e-06, + "loss": 0.301, + "step": 42238 + }, + { + "epoch": 0.7841885022816469, + "grad_norm": 0.2695433497428894, + "learning_rate": 2.2114890532974033e-06, + "loss": 0.1461, + "step": 42240 + }, + { + "epoch": 0.7842256324190656, + "grad_norm": 0.7617182731628418, + "learning_rate": 2.2107574729922855e-06, + "loss": 0.2494, + "step": 42242 + }, + { + "epoch": 0.7842627625564842, + "grad_norm": 0.5847040414810181, + "learning_rate": 2.2100259986754267e-06, + "loss": 0.27, + "step": 42244 + }, + { + "epoch": 0.7842998926939029, + "grad_norm": 0.3790019750595093, + "learning_rate": 2.2092946303567842e-06, + "loss": 0.2949, + "step": 42246 + }, + { + "epoch": 0.7843370228313215, + "grad_norm": 0.38911423087120056, + "learning_rate": 2.2085633680463026e-06, + "loss": 0.2054, + "step": 42248 + }, + { + "epoch": 0.7843741529687401, + "grad_norm": 0.2824215292930603, + "learning_rate": 2.2078322117539363e-06, + "loss": 0.1621, + "step": 42250 + }, + { + "epoch": 0.7844112831061588, + "grad_norm": 0.37508857250213623, + "learning_rate": 2.207101161489632e-06, + "loss": 0.2754, + "step": 42252 + }, + { + "epoch": 0.7844484132435774, + "grad_norm": 0.45144888758659363, + "learning_rate": 2.2063702172633417e-06, + "loss": 0.1477, + "step": 42254 + }, + { + "epoch": 0.7844855433809961, + "grad_norm": 0.4616270661354065, + "learning_rate": 2.205639379085006e-06, + "loss": 0.3076, + "step": 42256 + }, + { + "epoch": 0.7845226735184146, + "grad_norm": 0.3871453106403351, + "learning_rate": 2.204908646964574e-06, + "loss": 0.1862, + "step": 42258 + }, + { + "epoch": 0.7845598036558333, + "grad_norm": 0.2531983256340027, + "learning_rate": 2.2041780209119833e-06, + "loss": 0.2268, + "step": 42260 + }, + { + "epoch": 0.784596933793252, + "grad_norm": 0.227292999625206, + "learning_rate": 2.203447500937178e-06, + "loss": 0.2692, + "step": 42262 + }, + { + "epoch": 0.7846340639306706, + "grad_norm": 0.6295682787895203, + "learning_rate": 2.2027170870501002e-06, + "loss": 0.1114, + "step": 42264 + }, + { + "epoch": 0.7846711940680893, + "grad_norm": 0.45311617851257324, + "learning_rate": 2.2019867792606865e-06, + "loss": 0.2884, + "step": 42266 + }, + { + "epoch": 0.7847083242055078, + "grad_norm": 0.45629945397377014, + "learning_rate": 2.2012565775788754e-06, + "loss": 0.2424, + "step": 42268 + }, + { + "epoch": 0.7847454543429265, + "grad_norm": 0.6066263318061829, + "learning_rate": 2.2005264820146012e-06, + "loss": 0.3604, + "step": 42270 + }, + { + "epoch": 0.7847825844803452, + "grad_norm": 0.3385358452796936, + "learning_rate": 2.1997964925778028e-06, + "loss": 0.2448, + "step": 42272 + }, + { + "epoch": 0.7848197146177638, + "grad_norm": 0.27375373244285583, + "learning_rate": 2.1990666092784076e-06, + "loss": 0.3813, + "step": 42274 + }, + { + "epoch": 0.7848568447551825, + "grad_norm": 0.20720945298671722, + "learning_rate": 2.198336832126352e-06, + "loss": 0.2247, + "step": 42276 + }, + { + "epoch": 0.784893974892601, + "grad_norm": 0.5400685667991638, + "learning_rate": 2.19760716113156e-06, + "loss": 0.2763, + "step": 42278 + }, + { + "epoch": 0.7849311050300197, + "grad_norm": 0.5421027541160583, + "learning_rate": 2.1968775963039647e-06, + "loss": 0.2893, + "step": 42280 + }, + { + "epoch": 0.7849682351674383, + "grad_norm": 0.3093223571777344, + "learning_rate": 2.196148137653491e-06, + "loss": 0.2853, + "step": 42282 + }, + { + "epoch": 0.785005365304857, + "grad_norm": 0.43486881256103516, + "learning_rate": 2.19541878519007e-06, + "loss": 0.2939, + "step": 42284 + }, + { + "epoch": 0.7850424954422757, + "grad_norm": 0.35611236095428467, + "learning_rate": 2.1946895389236177e-06, + "loss": 0.2443, + "step": 42286 + }, + { + "epoch": 0.7850796255796942, + "grad_norm": 0.3992459177970886, + "learning_rate": 2.1939603988640623e-06, + "loss": 0.1532, + "step": 42288 + }, + { + "epoch": 0.7851167557171129, + "grad_norm": 0.4848586916923523, + "learning_rate": 2.1932313650213234e-06, + "loss": 0.3774, + "step": 42290 + }, + { + "epoch": 0.7851538858545315, + "grad_norm": 0.7071346044540405, + "learning_rate": 2.192502437405322e-06, + "loss": 0.2309, + "step": 42292 + }, + { + "epoch": 0.7851910159919502, + "grad_norm": 0.39986705780029297, + "learning_rate": 2.1917736160259763e-06, + "loss": 0.1922, + "step": 42294 + }, + { + "epoch": 0.7852281461293689, + "grad_norm": 0.4086349606513977, + "learning_rate": 2.1910449008932057e-06, + "loss": 0.3061, + "step": 42296 + }, + { + "epoch": 0.7852652762667874, + "grad_norm": 0.3096737265586853, + "learning_rate": 2.190316292016924e-06, + "loss": 0.1053, + "step": 42298 + }, + { + "epoch": 0.7853024064042061, + "grad_norm": 0.2733769416809082, + "learning_rate": 2.1895877894070415e-06, + "loss": 0.3471, + "step": 42300 + }, + { + "epoch": 0.7853395365416247, + "grad_norm": 1.0041214227676392, + "learning_rate": 2.188859393073475e-06, + "loss": 0.0803, + "step": 42302 + }, + { + "epoch": 0.7853766666790434, + "grad_norm": 0.5119099617004395, + "learning_rate": 2.1881311030261345e-06, + "loss": 0.3452, + "step": 42304 + }, + { + "epoch": 0.7854137968164621, + "grad_norm": 0.3575119972229004, + "learning_rate": 2.18740291927493e-06, + "loss": 0.3916, + "step": 42306 + }, + { + "epoch": 0.7854509269538806, + "grad_norm": 0.368917852640152, + "learning_rate": 2.1866748418297724e-06, + "loss": 0.1356, + "step": 42308 + }, + { + "epoch": 0.7854880570912993, + "grad_norm": 0.35192573070526123, + "learning_rate": 2.185946870700567e-06, + "loss": 0.1853, + "step": 42310 + }, + { + "epoch": 0.7855251872287179, + "grad_norm": 0.468219131231308, + "learning_rate": 2.1852190058972176e-06, + "loss": 0.2264, + "step": 42312 + }, + { + "epoch": 0.7855623173661366, + "grad_norm": 0.27757829427719116, + "learning_rate": 2.1844912474296288e-06, + "loss": 0.1853, + "step": 42314 + }, + { + "epoch": 0.7855994475035553, + "grad_norm": 0.41886138916015625, + "learning_rate": 2.183763595307704e-06, + "loss": 0.536, + "step": 42316 + }, + { + "epoch": 0.7856365776409738, + "grad_norm": 0.35210514068603516, + "learning_rate": 2.183036049541347e-06, + "loss": 0.2265, + "step": 42318 + }, + { + "epoch": 0.7856737077783925, + "grad_norm": 0.3935351073741913, + "learning_rate": 2.1823086101404524e-06, + "loss": 0.2188, + "step": 42320 + }, + { + "epoch": 0.7857108379158111, + "grad_norm": 0.26120105385780334, + "learning_rate": 2.1815812771149235e-06, + "loss": 0.1932, + "step": 42322 + }, + { + "epoch": 0.7857479680532298, + "grad_norm": 0.3296315371990204, + "learning_rate": 2.1808540504746524e-06, + "loss": 0.1641, + "step": 42324 + }, + { + "epoch": 0.7857850981906485, + "grad_norm": 0.31115928292274475, + "learning_rate": 2.180126930229536e-06, + "loss": 0.3933, + "step": 42326 + }, + { + "epoch": 0.785822228328067, + "grad_norm": 0.3197717070579529, + "learning_rate": 2.1793999163894695e-06, + "loss": 0.249, + "step": 42328 + }, + { + "epoch": 0.7858593584654857, + "grad_norm": 0.4750738739967346, + "learning_rate": 2.178673008964344e-06, + "loss": 0.3761, + "step": 42330 + }, + { + "epoch": 0.7858964886029043, + "grad_norm": 0.232594296336174, + "learning_rate": 2.1779462079640513e-06, + "loss": 0.2651, + "step": 42332 + }, + { + "epoch": 0.785933618740323, + "grad_norm": 0.3119640648365021, + "learning_rate": 2.177219513398484e-06, + "loss": 0.1816, + "step": 42334 + }, + { + "epoch": 0.7859707488777415, + "grad_norm": 0.4839460849761963, + "learning_rate": 2.1764929252775247e-06, + "loss": 0.2784, + "step": 42336 + }, + { + "epoch": 0.7860078790151602, + "grad_norm": 0.46130073070526123, + "learning_rate": 2.1757664436110613e-06, + "loss": 0.3859, + "step": 42338 + }, + { + "epoch": 0.7860450091525789, + "grad_norm": 0.32810646295547485, + "learning_rate": 2.1750400684089844e-06, + "loss": 0.2414, + "step": 42340 + }, + { + "epoch": 0.7860821392899975, + "grad_norm": 0.5095705986022949, + "learning_rate": 2.17431379968117e-06, + "loss": 0.2154, + "step": 42342 + }, + { + "epoch": 0.7861192694274162, + "grad_norm": 0.5689878463745117, + "learning_rate": 2.1735876374375054e-06, + "loss": 0.0926, + "step": 42344 + }, + { + "epoch": 0.7861563995648347, + "grad_norm": 0.28199559450149536, + "learning_rate": 2.1728615816878694e-06, + "loss": 0.1367, + "step": 42346 + }, + { + "epoch": 0.7861935297022534, + "grad_norm": 0.4046151041984558, + "learning_rate": 2.172135632442145e-06, + "loss": 0.3003, + "step": 42348 + }, + { + "epoch": 0.7862306598396721, + "grad_norm": 0.2964744567871094, + "learning_rate": 2.171409789710205e-06, + "loss": 0.3055, + "step": 42350 + }, + { + "epoch": 0.7862677899770907, + "grad_norm": 0.33051028847694397, + "learning_rate": 2.170684053501928e-06, + "loss": 0.3447, + "step": 42352 + }, + { + "epoch": 0.7863049201145094, + "grad_norm": 0.45862480998039246, + "learning_rate": 2.16995842382719e-06, + "loss": 0.2065, + "step": 42354 + }, + { + "epoch": 0.7863420502519279, + "grad_norm": 0.2809135615825653, + "learning_rate": 2.1692329006958637e-06, + "loss": 0.1613, + "step": 42356 + }, + { + "epoch": 0.7863791803893466, + "grad_norm": 0.34259822964668274, + "learning_rate": 2.1685074841178245e-06, + "loss": 0.3361, + "step": 42358 + }, + { + "epoch": 0.7864163105267653, + "grad_norm": 0.32796937227249146, + "learning_rate": 2.167782174102938e-06, + "loss": 0.214, + "step": 42360 + }, + { + "epoch": 0.7864534406641839, + "grad_norm": 0.47342145442962646, + "learning_rate": 2.167056970661079e-06, + "loss": 0.2404, + "step": 42362 + }, + { + "epoch": 0.7864905708016026, + "grad_norm": 2.0475125312805176, + "learning_rate": 2.1663318738021087e-06, + "loss": 0.1652, + "step": 42364 + }, + { + "epoch": 0.7865277009390211, + "grad_norm": 0.4430631697177887, + "learning_rate": 2.1656068835358967e-06, + "loss": 0.29, + "step": 42366 + }, + { + "epoch": 0.7865648310764398, + "grad_norm": 0.2994740605354309, + "learning_rate": 2.164881999872309e-06, + "loss": 0.2988, + "step": 42368 + }, + { + "epoch": 0.7866019612138585, + "grad_norm": 0.41451776027679443, + "learning_rate": 2.1641572228212095e-06, + "loss": 0.3117, + "step": 42370 + }, + { + "epoch": 0.7866390913512771, + "grad_norm": 0.7492827773094177, + "learning_rate": 2.1634325523924603e-06, + "loss": 0.1539, + "step": 42372 + }, + { + "epoch": 0.7866762214886958, + "grad_norm": 0.40187567472457886, + "learning_rate": 2.162707988595918e-06, + "loss": 0.205, + "step": 42374 + }, + { + "epoch": 0.7867133516261143, + "grad_norm": 0.3473794162273407, + "learning_rate": 2.161983531441445e-06, + "loss": 0.1093, + "step": 42376 + }, + { + "epoch": 0.786750481763533, + "grad_norm": 0.39647749066352844, + "learning_rate": 2.1612591809388993e-06, + "loss": 0.2748, + "step": 42378 + }, + { + "epoch": 0.7867876119009517, + "grad_norm": 0.4211435616016388, + "learning_rate": 2.160534937098139e-06, + "loss": 0.2987, + "step": 42380 + }, + { + "epoch": 0.7868247420383703, + "grad_norm": 0.34683167934417725, + "learning_rate": 2.1598107999290117e-06, + "loss": 0.106, + "step": 42382 + }, + { + "epoch": 0.786861872175789, + "grad_norm": 0.2671588659286499, + "learning_rate": 2.15908676944138e-06, + "loss": 0.1848, + "step": 42384 + }, + { + "epoch": 0.7868990023132075, + "grad_norm": 0.4586368501186371, + "learning_rate": 2.1583628456450878e-06, + "loss": 0.2322, + "step": 42386 + }, + { + "epoch": 0.7869361324506262, + "grad_norm": 0.28732019662857056, + "learning_rate": 2.157639028549989e-06, + "loss": 0.2642, + "step": 42388 + }, + { + "epoch": 0.7869732625880448, + "grad_norm": 0.62124103307724, + "learning_rate": 2.156915318165932e-06, + "loss": 0.2349, + "step": 42390 + }, + { + "epoch": 0.7870103927254635, + "grad_norm": 0.6514405608177185, + "learning_rate": 2.1561917145027657e-06, + "loss": 0.3771, + "step": 42392 + }, + { + "epoch": 0.7870475228628822, + "grad_norm": 0.3576472997665405, + "learning_rate": 2.1554682175703333e-06, + "loss": 0.2889, + "step": 42394 + }, + { + "epoch": 0.7870846530003007, + "grad_norm": 0.2909958064556122, + "learning_rate": 2.154744827378483e-06, + "loss": 0.1988, + "step": 42396 + }, + { + "epoch": 0.7871217831377194, + "grad_norm": 0.3952322006225586, + "learning_rate": 2.1540215439370594e-06, + "loss": 0.2046, + "step": 42398 + }, + { + "epoch": 0.787158913275138, + "grad_norm": 0.33975329995155334, + "learning_rate": 2.153298367255897e-06, + "loss": 0.1792, + "step": 42400 + }, + { + "epoch": 0.7871960434125567, + "grad_norm": 0.3457679748535156, + "learning_rate": 2.152575297344841e-06, + "loss": 0.2563, + "step": 42402 + }, + { + "epoch": 0.7872331735499754, + "grad_norm": 0.3037508726119995, + "learning_rate": 2.151852334213732e-06, + "loss": 0.3027, + "step": 42404 + }, + { + "epoch": 0.7872703036873939, + "grad_norm": 0.3304939568042755, + "learning_rate": 2.151129477872402e-06, + "loss": 0.3583, + "step": 42406 + }, + { + "epoch": 0.7873074338248126, + "grad_norm": 0.27041134238243103, + "learning_rate": 2.1504067283306906e-06, + "loss": 0.1613, + "step": 42408 + }, + { + "epoch": 0.7873445639622312, + "grad_norm": 0.5043504238128662, + "learning_rate": 2.1496840855984336e-06, + "loss": 0.3279, + "step": 42410 + }, + { + "epoch": 0.7873816940996499, + "grad_norm": 0.39772164821624756, + "learning_rate": 2.1489615496854587e-06, + "loss": 0.4533, + "step": 42412 + }, + { + "epoch": 0.7874188242370685, + "grad_norm": 0.4006732106208801, + "learning_rate": 2.1482391206016006e-06, + "loss": 0.2669, + "step": 42414 + }, + { + "epoch": 0.7874559543744871, + "grad_norm": 0.3448793590068817, + "learning_rate": 2.1475167983566893e-06, + "loss": 0.3113, + "step": 42416 + }, + { + "epoch": 0.7874930845119058, + "grad_norm": 0.4120835065841675, + "learning_rate": 2.146794582960554e-06, + "loss": 0.3534, + "step": 42418 + }, + { + "epoch": 0.7875302146493244, + "grad_norm": 0.28927019238471985, + "learning_rate": 2.146072474423021e-06, + "loss": 0.2854, + "step": 42420 + }, + { + "epoch": 0.7875673447867431, + "grad_norm": 0.35343897342681885, + "learning_rate": 2.1453504727539188e-06, + "loss": 0.272, + "step": 42422 + }, + { + "epoch": 0.7876044749241617, + "grad_norm": 0.32779720425605774, + "learning_rate": 2.144628577963067e-06, + "loss": 0.1675, + "step": 42424 + }, + { + "epoch": 0.7876416050615803, + "grad_norm": 0.30982351303100586, + "learning_rate": 2.143906790060293e-06, + "loss": 0.1517, + "step": 42426 + }, + { + "epoch": 0.787678735198999, + "grad_norm": 0.19453322887420654, + "learning_rate": 2.1431851090554135e-06, + "loss": 0.1929, + "step": 42428 + }, + { + "epoch": 0.7877158653364176, + "grad_norm": 0.4988711476325989, + "learning_rate": 2.14246353495825e-06, + "loss": 0.1879, + "step": 42430 + }, + { + "epoch": 0.7877529954738363, + "grad_norm": 0.4961640536785126, + "learning_rate": 2.1417420677786227e-06, + "loss": 0.4679, + "step": 42432 + }, + { + "epoch": 0.7877901256112548, + "grad_norm": 0.4295714199542999, + "learning_rate": 2.141020707526347e-06, + "loss": 0.3894, + "step": 42434 + }, + { + "epoch": 0.7878272557486735, + "grad_norm": 0.4100876450538635, + "learning_rate": 2.140299454211242e-06, + "loss": 0.3361, + "step": 42436 + }, + { + "epoch": 0.7878643858860922, + "grad_norm": 0.40491577982902527, + "learning_rate": 2.1395783078431167e-06, + "loss": 0.3033, + "step": 42438 + }, + { + "epoch": 0.7879015160235108, + "grad_norm": 0.35602709650993347, + "learning_rate": 2.1388572684317856e-06, + "loss": 0.2957, + "step": 42440 + }, + { + "epoch": 0.7879386461609295, + "grad_norm": 0.4789207875728607, + "learning_rate": 2.13813633598706e-06, + "loss": 0.3623, + "step": 42442 + }, + { + "epoch": 0.787975776298348, + "grad_norm": 0.4444025158882141, + "learning_rate": 2.1374155105187533e-06, + "loss": 0.3309, + "step": 42444 + }, + { + "epoch": 0.7880129064357667, + "grad_norm": 0.6774869561195374, + "learning_rate": 2.1366947920366663e-06, + "loss": 0.3555, + "step": 42446 + }, + { + "epoch": 0.7880500365731854, + "grad_norm": 0.35451751947402954, + "learning_rate": 2.135974180550614e-06, + "loss": 0.3058, + "step": 42448 + }, + { + "epoch": 0.788087166710604, + "grad_norm": 0.43377426266670227, + "learning_rate": 2.1352536760703946e-06, + "loss": 0.2583, + "step": 42450 + }, + { + "epoch": 0.7881242968480227, + "grad_norm": 0.4566328525543213, + "learning_rate": 2.134533278605814e-06, + "loss": 0.28, + "step": 42452 + }, + { + "epoch": 0.7881614269854412, + "grad_norm": 0.38286250829696655, + "learning_rate": 2.133812988166677e-06, + "loss": 0.1169, + "step": 42454 + }, + { + "epoch": 0.7881985571228599, + "grad_norm": 0.4602077305316925, + "learning_rate": 2.133092804762782e-06, + "loss": 0.3982, + "step": 42456 + }, + { + "epoch": 0.7882356872602786, + "grad_norm": 0.341356098651886, + "learning_rate": 2.132372728403931e-06, + "loss": 0.2433, + "step": 42458 + }, + { + "epoch": 0.7882728173976972, + "grad_norm": 0.5872789621353149, + "learning_rate": 2.1316527590999225e-06, + "loss": 0.4408, + "step": 42460 + }, + { + "epoch": 0.7883099475351159, + "grad_norm": 0.42184120416641235, + "learning_rate": 2.130932896860549e-06, + "loss": 0.4544, + "step": 42462 + }, + { + "epoch": 0.7883470776725344, + "grad_norm": 0.2469044029712677, + "learning_rate": 2.1302131416956094e-06, + "loss": 0.0916, + "step": 42464 + }, + { + "epoch": 0.7883842078099531, + "grad_norm": 0.44726428389549255, + "learning_rate": 2.129493493614898e-06, + "loss": 0.1535, + "step": 42466 + }, + { + "epoch": 0.7884213379473718, + "grad_norm": 0.3943835198879242, + "learning_rate": 2.1287739526282026e-06, + "loss": 0.4625, + "step": 42468 + }, + { + "epoch": 0.7884584680847904, + "grad_norm": 0.30070164799690247, + "learning_rate": 2.128054518745316e-06, + "loss": 0.307, + "step": 42470 + }, + { + "epoch": 0.788495598222209, + "grad_norm": 0.29949524998664856, + "learning_rate": 2.1273351919760287e-06, + "loss": 0.2415, + "step": 42472 + }, + { + "epoch": 0.7885327283596276, + "grad_norm": 0.17144176363945007, + "learning_rate": 2.1266159723301315e-06, + "loss": 0.3365, + "step": 42474 + }, + { + "epoch": 0.7885698584970463, + "grad_norm": 0.4717441201210022, + "learning_rate": 2.1258968598174046e-06, + "loss": 0.5046, + "step": 42476 + }, + { + "epoch": 0.788606988634465, + "grad_norm": 0.32951632142066956, + "learning_rate": 2.1251778544476344e-06, + "loss": 0.3228, + "step": 42478 + }, + { + "epoch": 0.7886441187718836, + "grad_norm": 0.668451726436615, + "learning_rate": 2.124458956230607e-06, + "loss": 0.3139, + "step": 42480 + }, + { + "epoch": 0.7886812489093022, + "grad_norm": 0.41362839937210083, + "learning_rate": 2.1237401651761024e-06, + "loss": 0.3509, + "step": 42482 + }, + { + "epoch": 0.7887183790467208, + "grad_norm": 0.4281545877456665, + "learning_rate": 2.1230214812939032e-06, + "loss": 0.2162, + "step": 42484 + }, + { + "epoch": 0.7887555091841395, + "grad_norm": 0.2861446738243103, + "learning_rate": 2.122302904593789e-06, + "loss": 0.315, + "step": 42486 + }, + { + "epoch": 0.7887926393215581, + "grad_norm": 0.33064553141593933, + "learning_rate": 2.1215844350855365e-06, + "loss": 0.3075, + "step": 42488 + }, + { + "epoch": 0.7888297694589768, + "grad_norm": 0.3047144114971161, + "learning_rate": 2.120866072778919e-06, + "loss": 0.1781, + "step": 42490 + }, + { + "epoch": 0.7888668995963954, + "grad_norm": 0.36978021264076233, + "learning_rate": 2.120147817683712e-06, + "loss": 0.3506, + "step": 42492 + }, + { + "epoch": 0.788904029733814, + "grad_norm": 0.25501900911331177, + "learning_rate": 2.119429669809692e-06, + "loss": 0.2326, + "step": 42494 + }, + { + "epoch": 0.7889411598712327, + "grad_norm": 0.39574605226516724, + "learning_rate": 2.1187116291666278e-06, + "loss": 0.2789, + "step": 42496 + }, + { + "epoch": 0.7889782900086513, + "grad_norm": 0.500994861125946, + "learning_rate": 2.1179936957642943e-06, + "loss": 0.2235, + "step": 42498 + }, + { + "epoch": 0.78901542014607, + "grad_norm": 0.43373164534568787, + "learning_rate": 2.117275869612454e-06, + "loss": 0.3813, + "step": 42500 + }, + { + "epoch": 0.7890525502834886, + "grad_norm": 0.39462223649024963, + "learning_rate": 2.1165581507208786e-06, + "loss": 0.3382, + "step": 42502 + }, + { + "epoch": 0.7890896804209072, + "grad_norm": 0.5577334761619568, + "learning_rate": 2.115840539099332e-06, + "loss": 0.1985, + "step": 42504 + }, + { + "epoch": 0.7891268105583259, + "grad_norm": 0.4708207845687866, + "learning_rate": 2.1151230347575814e-06, + "loss": 0.0915, + "step": 42506 + }, + { + "epoch": 0.7891639406957445, + "grad_norm": 0.3929736912250519, + "learning_rate": 2.11440563770539e-06, + "loss": 0.1518, + "step": 42508 + }, + { + "epoch": 0.7892010708331632, + "grad_norm": 0.6720669269561768, + "learning_rate": 2.113688347952515e-06, + "loss": 0.2856, + "step": 42510 + }, + { + "epoch": 0.7892382009705818, + "grad_norm": 0.4696556329727173, + "learning_rate": 2.1129711655087227e-06, + "loss": 0.1705, + "step": 42512 + }, + { + "epoch": 0.7892753311080004, + "grad_norm": 0.4105079174041748, + "learning_rate": 2.112254090383765e-06, + "loss": 0.2812, + "step": 42514 + }, + { + "epoch": 0.7893124612454191, + "grad_norm": 0.4384016990661621, + "learning_rate": 2.1115371225874027e-06, + "loss": 0.4045, + "step": 42516 + }, + { + "epoch": 0.7893495913828377, + "grad_norm": 0.3876022398471832, + "learning_rate": 2.1108202621293917e-06, + "loss": 0.2695, + "step": 42518 + }, + { + "epoch": 0.7893867215202564, + "grad_norm": 0.4062402546405792, + "learning_rate": 2.1101035090194867e-06, + "loss": 0.1986, + "step": 42520 + }, + { + "epoch": 0.789423851657675, + "grad_norm": 0.33494803309440613, + "learning_rate": 2.1093868632674396e-06, + "loss": 0.3949, + "step": 42522 + }, + { + "epoch": 0.7894609817950936, + "grad_norm": 0.3725855052471161, + "learning_rate": 2.108670324883004e-06, + "loss": 0.2272, + "step": 42524 + }, + { + "epoch": 0.7894981119325123, + "grad_norm": 0.34512820839881897, + "learning_rate": 2.107953893875927e-06, + "loss": 0.2717, + "step": 42526 + }, + { + "epoch": 0.7895352420699309, + "grad_norm": 0.4230319559574127, + "learning_rate": 2.1072375702559565e-06, + "loss": 0.3753, + "step": 42528 + }, + { + "epoch": 0.7895723722073495, + "grad_norm": 0.3849067687988281, + "learning_rate": 2.1065213540328444e-06, + "loss": 0.1527, + "step": 42530 + }, + { + "epoch": 0.7896095023447682, + "grad_norm": 0.7458896040916443, + "learning_rate": 2.10580524521633e-06, + "loss": 0.2073, + "step": 42532 + }, + { + "epoch": 0.7896466324821868, + "grad_norm": 0.5071929693222046, + "learning_rate": 2.1050892438161607e-06, + "loss": 0.3121, + "step": 42534 + }, + { + "epoch": 0.7896837626196055, + "grad_norm": 0.38275641202926636, + "learning_rate": 2.1043733498420815e-06, + "loss": 0.1871, + "step": 42536 + }, + { + "epoch": 0.7897208927570241, + "grad_norm": 0.39187997579574585, + "learning_rate": 2.103657563303828e-06, + "loss": 0.3532, + "step": 42538 + }, + { + "epoch": 0.7897580228944427, + "grad_norm": 0.3579058051109314, + "learning_rate": 2.102941884211144e-06, + "loss": 0.3955, + "step": 42540 + }, + { + "epoch": 0.7897951530318613, + "grad_norm": 0.545877993106842, + "learning_rate": 2.102226312573765e-06, + "loss": 0.2927, + "step": 42542 + }, + { + "epoch": 0.78983228316928, + "grad_norm": 0.4625374674797058, + "learning_rate": 2.101510848401429e-06, + "loss": 0.3327, + "step": 42544 + }, + { + "epoch": 0.7898694133066987, + "grad_norm": 0.4088573753833771, + "learning_rate": 2.100795491703873e-06, + "loss": 0.2822, + "step": 42546 + }, + { + "epoch": 0.7899065434441173, + "grad_norm": 0.6364077925682068, + "learning_rate": 2.1000802424908316e-06, + "loss": 0.1923, + "step": 42548 + }, + { + "epoch": 0.7899436735815359, + "grad_norm": 0.8586711883544922, + "learning_rate": 2.0993651007720318e-06, + "loss": 0.3471, + "step": 42550 + }, + { + "epoch": 0.7899808037189545, + "grad_norm": 0.4499157667160034, + "learning_rate": 2.098650066557212e-06, + "loss": 0.1479, + "step": 42552 + }, + { + "epoch": 0.7900179338563732, + "grad_norm": 0.25522860884666443, + "learning_rate": 2.097935139856093e-06, + "loss": 0.3336, + "step": 42554 + }, + { + "epoch": 0.7900550639937919, + "grad_norm": 0.38141849637031555, + "learning_rate": 2.097220320678409e-06, + "loss": 0.198, + "step": 42556 + }, + { + "epoch": 0.7900921941312105, + "grad_norm": 0.2683364748954773, + "learning_rate": 2.0965056090338844e-06, + "loss": 0.124, + "step": 42558 + }, + { + "epoch": 0.7901293242686291, + "grad_norm": 0.3264162540435791, + "learning_rate": 2.095791004932244e-06, + "loss": 0.2497, + "step": 42560 + }, + { + "epoch": 0.7901664544060477, + "grad_norm": 0.23377372324466705, + "learning_rate": 2.0950765083832146e-06, + "loss": 0.3535, + "step": 42562 + }, + { + "epoch": 0.7902035845434664, + "grad_norm": 0.47008216381073, + "learning_rate": 2.0943621193965145e-06, + "loss": 0.3933, + "step": 42564 + }, + { + "epoch": 0.7902407146808851, + "grad_norm": 0.31055015325546265, + "learning_rate": 2.0936478379818657e-06, + "loss": 0.4835, + "step": 42566 + }, + { + "epoch": 0.7902778448183037, + "grad_norm": 0.38977161049842834, + "learning_rate": 2.092933664148987e-06, + "loss": 0.2732, + "step": 42568 + }, + { + "epoch": 0.7903149749557223, + "grad_norm": 0.342987060546875, + "learning_rate": 2.0922195979075965e-06, + "loss": 0.1817, + "step": 42570 + }, + { + "epoch": 0.7903521050931409, + "grad_norm": 0.36170870065689087, + "learning_rate": 2.091505639267414e-06, + "loss": 0.2003, + "step": 42572 + }, + { + "epoch": 0.7903892352305596, + "grad_norm": 0.33908453583717346, + "learning_rate": 2.090791788238151e-06, + "loss": 0.2686, + "step": 42574 + }, + { + "epoch": 0.7904263653679783, + "grad_norm": 0.23294876515865326, + "learning_rate": 2.0900780448295177e-06, + "loss": 0.2642, + "step": 42576 + }, + { + "epoch": 0.7904634955053969, + "grad_norm": 0.4259909391403198, + "learning_rate": 2.08936440905123e-06, + "loss": 0.3708, + "step": 42578 + }, + { + "epoch": 0.7905006256428155, + "grad_norm": 0.3567473292350769, + "learning_rate": 2.088650880912997e-06, + "loss": 0.3266, + "step": 42580 + }, + { + "epoch": 0.7905377557802341, + "grad_norm": 0.3649671971797943, + "learning_rate": 2.0879374604245286e-06, + "loss": 0.2853, + "step": 42582 + }, + { + "epoch": 0.7905748859176528, + "grad_norm": 0.38092663884162903, + "learning_rate": 2.0872241475955325e-06, + "loss": 0.3168, + "step": 42584 + }, + { + "epoch": 0.7906120160550714, + "grad_norm": 0.35730108618736267, + "learning_rate": 2.0865109424357154e-06, + "loss": 0.1507, + "step": 42586 + }, + { + "epoch": 0.79064914619249, + "grad_norm": 0.4844895899295807, + "learning_rate": 2.08579784495478e-06, + "loss": 0.1877, + "step": 42588 + }, + { + "epoch": 0.7906862763299087, + "grad_norm": 0.3098269999027252, + "learning_rate": 2.085084855162429e-06, + "loss": 0.3054, + "step": 42590 + }, + { + "epoch": 0.7907234064673273, + "grad_norm": 0.4970020353794098, + "learning_rate": 2.0843719730683655e-06, + "loss": 0.3478, + "step": 42592 + }, + { + "epoch": 0.790760536604746, + "grad_norm": 0.2863791286945343, + "learning_rate": 2.083659198682292e-06, + "loss": 0.2598, + "step": 42594 + }, + { + "epoch": 0.7907976667421646, + "grad_norm": 0.3540296256542206, + "learning_rate": 2.082946532013902e-06, + "loss": 0.3192, + "step": 42596 + }, + { + "epoch": 0.7908347968795832, + "grad_norm": 0.49094539880752563, + "learning_rate": 2.082233973072895e-06, + "loss": 0.3213, + "step": 42598 + }, + { + "epoch": 0.7908719270170019, + "grad_norm": 0.24183067679405212, + "learning_rate": 2.0815215218689698e-06, + "loss": 0.1513, + "step": 42600 + }, + { + "epoch": 0.7909090571544205, + "grad_norm": 0.4134245812892914, + "learning_rate": 2.0808091784118157e-06, + "loss": 0.2975, + "step": 42602 + }, + { + "epoch": 0.7909461872918392, + "grad_norm": 0.5017105340957642, + "learning_rate": 2.0800969427111285e-06, + "loss": 0.3647, + "step": 42604 + }, + { + "epoch": 0.7909833174292578, + "grad_norm": 0.4237438142299652, + "learning_rate": 2.079384814776598e-06, + "loss": 0.4172, + "step": 42606 + }, + { + "epoch": 0.7910204475666764, + "grad_norm": 0.5569114089012146, + "learning_rate": 2.0786727946179163e-06, + "loss": 0.3092, + "step": 42608 + }, + { + "epoch": 0.7910575777040951, + "grad_norm": 0.52723628282547, + "learning_rate": 2.0779608822447693e-06, + "loss": 0.1331, + "step": 42610 + }, + { + "epoch": 0.7910947078415137, + "grad_norm": 0.41421768069267273, + "learning_rate": 2.07724907766685e-06, + "loss": 0.2478, + "step": 42612 + }, + { + "epoch": 0.7911318379789324, + "grad_norm": 0.508815348148346, + "learning_rate": 2.076537380893835e-06, + "loss": 0.2951, + "step": 42614 + }, + { + "epoch": 0.791168968116351, + "grad_norm": 0.3274082839488983, + "learning_rate": 2.0758257919354163e-06, + "loss": 0.183, + "step": 42616 + }, + { + "epoch": 0.7912060982537696, + "grad_norm": 0.33555614948272705, + "learning_rate": 2.0751143108012693e-06, + "loss": 0.336, + "step": 42618 + }, + { + "epoch": 0.7912432283911883, + "grad_norm": 0.6409013271331787, + "learning_rate": 2.07440293750108e-06, + "loss": 0.1538, + "step": 42620 + }, + { + "epoch": 0.7912803585286069, + "grad_norm": 0.4628830850124359, + "learning_rate": 2.0736916720445264e-06, + "loss": 0.2658, + "step": 42622 + }, + { + "epoch": 0.7913174886660256, + "grad_norm": 0.3782122731208801, + "learning_rate": 2.0729805144412895e-06, + "loss": 0.2932, + "step": 42624 + }, + { + "epoch": 0.7913546188034442, + "grad_norm": 0.286088764667511, + "learning_rate": 2.072269464701041e-06, + "loss": 0.2568, + "step": 42626 + }, + { + "epoch": 0.7913917489408628, + "grad_norm": 0.5430276989936829, + "learning_rate": 2.071558522833459e-06, + "loss": 0.2003, + "step": 42628 + }, + { + "epoch": 0.7914288790782815, + "grad_norm": 0.501216471195221, + "learning_rate": 2.0708476888482174e-06, + "loss": 0.3363, + "step": 42630 + }, + { + "epoch": 0.7914660092157001, + "grad_norm": 0.38458243012428284, + "learning_rate": 2.070136962754987e-06, + "loss": 0.2582, + "step": 42632 + }, + { + "epoch": 0.7915031393531188, + "grad_norm": 0.3722245991230011, + "learning_rate": 2.0694263445634445e-06, + "loss": 0.1312, + "step": 42634 + }, + { + "epoch": 0.7915402694905374, + "grad_norm": 0.3490140438079834, + "learning_rate": 2.06871583428325e-06, + "loss": 0.3663, + "step": 42636 + }, + { + "epoch": 0.791577399627956, + "grad_norm": 0.49844953417778015, + "learning_rate": 2.0680054319240793e-06, + "loss": 0.2152, + "step": 42638 + }, + { + "epoch": 0.7916145297653746, + "grad_norm": 0.25851577520370483, + "learning_rate": 2.0672951374955928e-06, + "loss": 0.2768, + "step": 42640 + }, + { + "epoch": 0.7916516599027933, + "grad_norm": 0.481437087059021, + "learning_rate": 2.0665849510074585e-06, + "loss": 0.3078, + "step": 42642 + }, + { + "epoch": 0.791688790040212, + "grad_norm": 0.4198446571826935, + "learning_rate": 2.06587487246934e-06, + "loss": 0.2242, + "step": 42644 + }, + { + "epoch": 0.7917259201776305, + "grad_norm": 0.3985229432582855, + "learning_rate": 2.0651649018908993e-06, + "loss": 0.2596, + "step": 42646 + }, + { + "epoch": 0.7917630503150492, + "grad_norm": 0.2468804270029068, + "learning_rate": 2.0644550392817964e-06, + "loss": 0.417, + "step": 42648 + }, + { + "epoch": 0.7918001804524678, + "grad_norm": 0.28037264943122864, + "learning_rate": 2.0637452846516925e-06, + "loss": 0.2331, + "step": 42650 + }, + { + "epoch": 0.7918373105898865, + "grad_norm": 0.37517064809799194, + "learning_rate": 2.063035638010242e-06, + "loss": 0.173, + "step": 42652 + }, + { + "epoch": 0.7918744407273052, + "grad_norm": 0.32266315817832947, + "learning_rate": 2.062326099367101e-06, + "loss": 0.3654, + "step": 42654 + }, + { + "epoch": 0.7919115708647237, + "grad_norm": 0.20128904283046722, + "learning_rate": 2.0616166687319273e-06, + "loss": 0.4743, + "step": 42656 + }, + { + "epoch": 0.7919487010021424, + "grad_norm": 0.3021332025527954, + "learning_rate": 2.0609073461143747e-06, + "loss": 0.2246, + "step": 42658 + }, + { + "epoch": 0.791985831139561, + "grad_norm": 0.5699960589408875, + "learning_rate": 2.0601981315240892e-06, + "loss": 0.3673, + "step": 42660 + }, + { + "epoch": 0.7920229612769797, + "grad_norm": 0.5187786817550659, + "learning_rate": 2.059489024970729e-06, + "loss": 0.2218, + "step": 42662 + }, + { + "epoch": 0.7920600914143984, + "grad_norm": 0.34131884574890137, + "learning_rate": 2.0587800264639346e-06, + "loss": 0.3281, + "step": 42664 + }, + { + "epoch": 0.7920972215518169, + "grad_norm": 0.3134617507457733, + "learning_rate": 2.0580711360133574e-06, + "loss": 0.2675, + "step": 42666 + }, + { + "epoch": 0.7921343516892356, + "grad_norm": 0.27192190289497375, + "learning_rate": 2.0573623536286434e-06, + "loss": 0.3998, + "step": 42668 + }, + { + "epoch": 0.7921714818266542, + "grad_norm": 0.32284533977508545, + "learning_rate": 2.056653679319438e-06, + "loss": 0.3717, + "step": 42670 + }, + { + "epoch": 0.7922086119640729, + "grad_norm": 0.504384458065033, + "learning_rate": 2.055945113095381e-06, + "loss": 0.3891, + "step": 42672 + }, + { + "epoch": 0.7922457421014916, + "grad_norm": 0.41510099172592163, + "learning_rate": 2.0552366549661194e-06, + "loss": 0.173, + "step": 42674 + }, + { + "epoch": 0.7922828722389101, + "grad_norm": 0.343056321144104, + "learning_rate": 2.054528304941288e-06, + "loss": 0.1487, + "step": 42676 + }, + { + "epoch": 0.7923200023763288, + "grad_norm": 0.3308608829975128, + "learning_rate": 2.053820063030525e-06, + "loss": 0.2845, + "step": 42678 + }, + { + "epoch": 0.7923571325137474, + "grad_norm": 0.3284551203250885, + "learning_rate": 2.053111929243473e-06, + "loss": 0.2697, + "step": 42680 + }, + { + "epoch": 0.7923942626511661, + "grad_norm": 0.250306099653244, + "learning_rate": 2.0524039035897614e-06, + "loss": 0.1573, + "step": 42682 + }, + { + "epoch": 0.7924313927885848, + "grad_norm": 0.3870696723461151, + "learning_rate": 2.0516959860790263e-06, + "loss": 0.2162, + "step": 42684 + }, + { + "epoch": 0.7924685229260033, + "grad_norm": 0.4715813994407654, + "learning_rate": 2.0509881767209015e-06, + "loss": 0.1898, + "step": 42686 + }, + { + "epoch": 0.792505653063422, + "grad_norm": 0.4110667407512665, + "learning_rate": 2.0502804755250203e-06, + "loss": 0.2649, + "step": 42688 + }, + { + "epoch": 0.7925427832008406, + "grad_norm": 0.5015087127685547, + "learning_rate": 2.0495728825010064e-06, + "loss": 0.1971, + "step": 42690 + }, + { + "epoch": 0.7925799133382593, + "grad_norm": 0.6716548204421997, + "learning_rate": 2.0488653976584916e-06, + "loss": 0.4662, + "step": 42692 + }, + { + "epoch": 0.7926170434756779, + "grad_norm": 0.33569470047950745, + "learning_rate": 2.048158021007103e-06, + "loss": 0.3139, + "step": 42694 + }, + { + "epoch": 0.7926541736130965, + "grad_norm": 0.272247314453125, + "learning_rate": 2.0474507525564633e-06, + "loss": 0.2761, + "step": 42696 + }, + { + "epoch": 0.7926913037505152, + "grad_norm": 0.35322609543800354, + "learning_rate": 2.046743592316203e-06, + "loss": 0.2827, + "step": 42698 + }, + { + "epoch": 0.7927284338879338, + "grad_norm": 0.2537001073360443, + "learning_rate": 2.046036540295936e-06, + "loss": 0.4237, + "step": 42700 + }, + { + "epoch": 0.7927655640253525, + "grad_norm": 0.40118294954299927, + "learning_rate": 2.045329596505289e-06, + "loss": 0.2705, + "step": 42702 + }, + { + "epoch": 0.792802694162771, + "grad_norm": 0.41252779960632324, + "learning_rate": 2.044622760953877e-06, + "loss": 0.2519, + "step": 42704 + }, + { + "epoch": 0.7928398243001897, + "grad_norm": 0.26591622829437256, + "learning_rate": 2.043916033651321e-06, + "loss": 0.2575, + "step": 42706 + }, + { + "epoch": 0.7928769544376084, + "grad_norm": 0.38855138421058655, + "learning_rate": 2.0432094146072356e-06, + "loss": 0.4377, + "step": 42708 + }, + { + "epoch": 0.792914084575027, + "grad_norm": 0.43108057975769043, + "learning_rate": 2.042502903831236e-06, + "loss": 0.3193, + "step": 42710 + }, + { + "epoch": 0.7929512147124457, + "grad_norm": 0.6577187180519104, + "learning_rate": 2.041796501332941e-06, + "loss": 0.4136, + "step": 42712 + }, + { + "epoch": 0.7929883448498642, + "grad_norm": 0.481794148683548, + "learning_rate": 2.041090207121954e-06, + "loss": 0.3517, + "step": 42714 + }, + { + "epoch": 0.7930254749872829, + "grad_norm": 0.5122260451316833, + "learning_rate": 2.0403840212078898e-06, + "loss": 0.0995, + "step": 42716 + }, + { + "epoch": 0.7930626051247016, + "grad_norm": 0.4683792293071747, + "learning_rate": 2.0396779436003577e-06, + "loss": 0.3361, + "step": 42718 + }, + { + "epoch": 0.7930997352621202, + "grad_norm": 0.42059043049812317, + "learning_rate": 2.0389719743089663e-06, + "loss": 0.1698, + "step": 42720 + }, + { + "epoch": 0.7931368653995389, + "grad_norm": 0.33432382345199585, + "learning_rate": 2.0382661133433192e-06, + "loss": 0.326, + "step": 42722 + }, + { + "epoch": 0.7931739955369574, + "grad_norm": 0.39011383056640625, + "learning_rate": 2.0375603607130213e-06, + "loss": 0.2388, + "step": 42724 + }, + { + "epoch": 0.7932111256743761, + "grad_norm": 0.38684576749801636, + "learning_rate": 2.0368547164276796e-06, + "loss": 0.3854, + "step": 42726 + }, + { + "epoch": 0.7932482558117948, + "grad_norm": 0.35732534527778625, + "learning_rate": 2.0361491804968902e-06, + "loss": 0.0299, + "step": 42728 + }, + { + "epoch": 0.7932853859492134, + "grad_norm": 0.3931863605976105, + "learning_rate": 2.035443752930255e-06, + "loss": 0.2579, + "step": 42730 + }, + { + "epoch": 0.7933225160866321, + "grad_norm": 0.7063838243484497, + "learning_rate": 2.0347384337373745e-06, + "loss": 0.3594, + "step": 42732 + }, + { + "epoch": 0.7933596462240506, + "grad_norm": 0.3051188290119171, + "learning_rate": 2.0340332229278458e-06, + "loss": 0.2238, + "step": 42734 + }, + { + "epoch": 0.7933967763614693, + "grad_norm": 0.6285567879676819, + "learning_rate": 2.033328120511263e-06, + "loss": 0.2459, + "step": 42736 + }, + { + "epoch": 0.7934339064988879, + "grad_norm": 0.43248236179351807, + "learning_rate": 2.0326231264972253e-06, + "loss": 0.3369, + "step": 42738 + }, + { + "epoch": 0.7934710366363066, + "grad_norm": 0.6834849715232849, + "learning_rate": 2.0319182408953186e-06, + "loss": 0.4614, + "step": 42740 + }, + { + "epoch": 0.7935081667737253, + "grad_norm": 0.3905276358127594, + "learning_rate": 2.0312134637151393e-06, + "loss": 0.2969, + "step": 42742 + }, + { + "epoch": 0.7935452969111438, + "grad_norm": 0.2289569079875946, + "learning_rate": 2.030508794966274e-06, + "loss": 0.1603, + "step": 42744 + }, + { + "epoch": 0.7935824270485625, + "grad_norm": 0.31870412826538086, + "learning_rate": 2.0298042346583122e-06, + "loss": 0.3701, + "step": 42746 + }, + { + "epoch": 0.7936195571859811, + "grad_norm": 0.3010765314102173, + "learning_rate": 2.029099782800841e-06, + "loss": 0.2594, + "step": 42748 + }, + { + "epoch": 0.7936566873233998, + "grad_norm": 0.46039465069770813, + "learning_rate": 2.028395439403449e-06, + "loss": 0.3354, + "step": 42750 + }, + { + "epoch": 0.7936938174608185, + "grad_norm": 0.28095942735671997, + "learning_rate": 2.027691204475715e-06, + "loss": 0.2535, + "step": 42752 + }, + { + "epoch": 0.793730947598237, + "grad_norm": 0.3010506331920624, + "learning_rate": 2.026987078027224e-06, + "loss": 0.4395, + "step": 42754 + }, + { + "epoch": 0.7937680777356557, + "grad_norm": 0.4402254521846771, + "learning_rate": 2.0262830600675563e-06, + "loss": 0.2088, + "step": 42756 + }, + { + "epoch": 0.7938052078730743, + "grad_norm": 0.42874911427497864, + "learning_rate": 2.0255791506062915e-06, + "loss": 0.1753, + "step": 42758 + }, + { + "epoch": 0.793842338010493, + "grad_norm": 0.2947536110877991, + "learning_rate": 2.024875349653009e-06, + "loss": 0.1437, + "step": 42760 + }, + { + "epoch": 0.7938794681479117, + "grad_norm": 0.2735714316368103, + "learning_rate": 2.0241716572172885e-06, + "loss": 0.2229, + "step": 42762 + }, + { + "epoch": 0.7939165982853302, + "grad_norm": 0.3092462122440338, + "learning_rate": 2.0234680733086977e-06, + "loss": 0.2173, + "step": 42764 + }, + { + "epoch": 0.7939537284227489, + "grad_norm": 0.364353746175766, + "learning_rate": 2.022764597936816e-06, + "loss": 0.3058, + "step": 42766 + }, + { + "epoch": 0.7939908585601675, + "grad_norm": 0.4056829810142517, + "learning_rate": 2.022061231111212e-06, + "loss": 0.3435, + "step": 42768 + }, + { + "epoch": 0.7940279886975862, + "grad_norm": 0.6425531506538391, + "learning_rate": 2.021357972841458e-06, + "loss": 0.2642, + "step": 42770 + }, + { + "epoch": 0.7940651188350049, + "grad_norm": 0.2900737226009369, + "learning_rate": 2.0206548231371225e-06, + "loss": 0.2802, + "step": 42772 + }, + { + "epoch": 0.7941022489724234, + "grad_norm": 0.3155817687511444, + "learning_rate": 2.019951782007774e-06, + "loss": 0.1816, + "step": 42774 + }, + { + "epoch": 0.7941393791098421, + "grad_norm": 0.4117618203163147, + "learning_rate": 2.0192488494629816e-06, + "loss": 0.1825, + "step": 42776 + }, + { + "epoch": 0.7941765092472607, + "grad_norm": 0.6537511348724365, + "learning_rate": 2.018546025512305e-06, + "loss": 0.2367, + "step": 42778 + }, + { + "epoch": 0.7942136393846794, + "grad_norm": 0.24910946190357208, + "learning_rate": 2.0178433101653084e-06, + "loss": 0.1547, + "step": 42780 + }, + { + "epoch": 0.794250769522098, + "grad_norm": 0.48479151725769043, + "learning_rate": 2.017140703431556e-06, + "loss": 0.1673, + "step": 42782 + }, + { + "epoch": 0.7942878996595166, + "grad_norm": 0.40298548340797424, + "learning_rate": 2.01643820532061e-06, + "loss": 0.3878, + "step": 42784 + }, + { + "epoch": 0.7943250297969353, + "grad_norm": 0.26045864820480347, + "learning_rate": 2.0157358158420228e-06, + "loss": 0.1864, + "step": 42786 + }, + { + "epoch": 0.7943621599343539, + "grad_norm": 0.3012307286262512, + "learning_rate": 2.0150335350053596e-06, + "loss": 0.2213, + "step": 42788 + }, + { + "epoch": 0.7943992900717726, + "grad_norm": 0.43078145384788513, + "learning_rate": 2.014331362820169e-06, + "loss": 0.1012, + "step": 42790 + }, + { + "epoch": 0.7944364202091911, + "grad_norm": 0.3820379972457886, + "learning_rate": 2.013629299296008e-06, + "loss": 0.1971, + "step": 42792 + }, + { + "epoch": 0.7944735503466098, + "grad_norm": 0.32335078716278076, + "learning_rate": 2.0129273444424315e-06, + "loss": 0.3745, + "step": 42794 + }, + { + "epoch": 0.7945106804840285, + "grad_norm": 0.3137783110141754, + "learning_rate": 2.0122254982689892e-06, + "loss": 0.3607, + "step": 42796 + }, + { + "epoch": 0.7945478106214471, + "grad_norm": 0.3571280241012573, + "learning_rate": 2.0115237607852312e-06, + "loss": 0.3246, + "step": 42798 + }, + { + "epoch": 0.7945849407588658, + "grad_norm": 0.4563015103340149, + "learning_rate": 2.0108221320007104e-06, + "loss": 0.2029, + "step": 42800 + }, + { + "epoch": 0.7946220708962843, + "grad_norm": 0.5401472449302673, + "learning_rate": 2.0101206119249663e-06, + "loss": 0.2232, + "step": 42802 + }, + { + "epoch": 0.794659201033703, + "grad_norm": 0.45033398270606995, + "learning_rate": 2.0094192005675485e-06, + "loss": 0.3162, + "step": 42804 + }, + { + "epoch": 0.7946963311711217, + "grad_norm": 0.22998209297657013, + "learning_rate": 2.008717897938004e-06, + "loss": 0.2192, + "step": 42806 + }, + { + "epoch": 0.7947334613085403, + "grad_norm": 0.5538310408592224, + "learning_rate": 2.0080167040458686e-06, + "loss": 0.1284, + "step": 42808 + }, + { + "epoch": 0.794770591445959, + "grad_norm": 0.5740215182304382, + "learning_rate": 2.007315618900688e-06, + "loss": 0.2657, + "step": 42810 + }, + { + "epoch": 0.7948077215833775, + "grad_norm": 0.37102216482162476, + "learning_rate": 2.0066146425120004e-06, + "loss": 0.2641, + "step": 42812 + }, + { + "epoch": 0.7948448517207962, + "grad_norm": 0.4133845865726471, + "learning_rate": 2.0059137748893477e-06, + "loss": 0.37, + "step": 42814 + }, + { + "epoch": 0.7948819818582149, + "grad_norm": 0.7754574418067932, + "learning_rate": 2.0052130160422602e-06, + "loss": 0.3412, + "step": 42816 + }, + { + "epoch": 0.7949191119956335, + "grad_norm": 0.33565637469291687, + "learning_rate": 2.0045123659802766e-06, + "loss": 0.2758, + "step": 42818 + }, + { + "epoch": 0.7949562421330522, + "grad_norm": 0.27820920944213867, + "learning_rate": 2.0038118247129314e-06, + "loss": 0.3167, + "step": 42820 + }, + { + "epoch": 0.7949933722704707, + "grad_norm": 0.4165036082267761, + "learning_rate": 2.0031113922497546e-06, + "loss": 0.3382, + "step": 42822 + }, + { + "epoch": 0.7950305024078894, + "grad_norm": 0.4344783127307892, + "learning_rate": 2.0024110686002783e-06, + "loss": 0.2094, + "step": 42824 + }, + { + "epoch": 0.7950676325453081, + "grad_norm": 0.4044579863548279, + "learning_rate": 2.0017108537740358e-06, + "loss": 0.2182, + "step": 42826 + }, + { + "epoch": 0.7951047626827267, + "grad_norm": 0.4348785877227783, + "learning_rate": 2.0010107477805494e-06, + "loss": 0.2287, + "step": 42828 + }, + { + "epoch": 0.7951418928201454, + "grad_norm": 0.4260804355144501, + "learning_rate": 2.000310750629345e-06, + "loss": 0.3942, + "step": 42830 + }, + { + "epoch": 0.7951790229575639, + "grad_norm": 0.38059136271476746, + "learning_rate": 1.9996108623299504e-06, + "loss": 0.253, + "step": 42832 + }, + { + "epoch": 0.7952161530949826, + "grad_norm": 0.41512730717658997, + "learning_rate": 1.9989110828918865e-06, + "loss": 0.4418, + "step": 42834 + }, + { + "epoch": 0.7952532832324013, + "grad_norm": 0.6739332675933838, + "learning_rate": 1.9982114123246776e-06, + "loss": 0.3184, + "step": 42836 + }, + { + "epoch": 0.7952904133698199, + "grad_norm": 0.6416725516319275, + "learning_rate": 1.9975118506378454e-06, + "loss": 0.3195, + "step": 42838 + }, + { + "epoch": 0.7953275435072386, + "grad_norm": 0.37072065472602844, + "learning_rate": 1.996812397840904e-06, + "loss": 0.2285, + "step": 42840 + }, + { + "epoch": 0.7953646736446571, + "grad_norm": 0.3906102478504181, + "learning_rate": 1.9961130539433737e-06, + "loss": 0.315, + "step": 42842 + }, + { + "epoch": 0.7954018037820758, + "grad_norm": 0.6522887349128723, + "learning_rate": 1.9954138189547712e-06, + "loss": 0.4844, + "step": 42844 + }, + { + "epoch": 0.7954389339194944, + "grad_norm": 0.5065922141075134, + "learning_rate": 1.9947146928846083e-06, + "loss": 0.4491, + "step": 42846 + }, + { + "epoch": 0.7954760640569131, + "grad_norm": 0.6735919713973999, + "learning_rate": 1.9940156757424046e-06, + "loss": 0.1456, + "step": 42848 + }, + { + "epoch": 0.7955131941943318, + "grad_norm": 0.3414495587348938, + "learning_rate": 1.993316767537663e-06, + "loss": 0.478, + "step": 42850 + }, + { + "epoch": 0.7955503243317503, + "grad_norm": 0.4428774118423462, + "learning_rate": 1.9926179682798997e-06, + "loss": 0.3729, + "step": 42852 + }, + { + "epoch": 0.795587454469169, + "grad_norm": 0.5150340795516968, + "learning_rate": 1.991919277978619e-06, + "loss": 0.1835, + "step": 42854 + }, + { + "epoch": 0.7956245846065876, + "grad_norm": 0.5379697680473328, + "learning_rate": 1.991220696643329e-06, + "loss": 0.272, + "step": 42856 + }, + { + "epoch": 0.7956617147440063, + "grad_norm": 0.37882760167121887, + "learning_rate": 1.9905222242835375e-06, + "loss": 0.3643, + "step": 42858 + }, + { + "epoch": 0.795698844881425, + "grad_norm": 0.3648139536380768, + "learning_rate": 1.989823860908747e-06, + "loss": 0.3321, + "step": 42860 + }, + { + "epoch": 0.7957359750188435, + "grad_norm": 0.4506866931915283, + "learning_rate": 1.98912560652846e-06, + "loss": 0.3394, + "step": 42862 + }, + { + "epoch": 0.7957731051562622, + "grad_norm": 0.24526220560073853, + "learning_rate": 1.9884274611521816e-06, + "loss": 0.2202, + "step": 42864 + }, + { + "epoch": 0.7958102352936808, + "grad_norm": 0.3648502230644226, + "learning_rate": 1.9877294247894043e-06, + "loss": 0.4003, + "step": 42866 + }, + { + "epoch": 0.7958473654310995, + "grad_norm": 0.3403089642524719, + "learning_rate": 1.987031497449631e-06, + "loss": 0.2933, + "step": 42868 + }, + { + "epoch": 0.7958844955685181, + "grad_norm": 0.37401846051216125, + "learning_rate": 1.9863336791423594e-06, + "loss": 0.3157, + "step": 42870 + }, + { + "epoch": 0.7959216257059367, + "grad_norm": 0.43707484006881714, + "learning_rate": 1.9856359698770798e-06, + "loss": 0.203, + "step": 42872 + }, + { + "epoch": 0.7959587558433554, + "grad_norm": 0.767066478729248, + "learning_rate": 1.9849383696632895e-06, + "loss": 0.4028, + "step": 42874 + }, + { + "epoch": 0.795995885980774, + "grad_norm": 0.28582361340522766, + "learning_rate": 1.984240878510483e-06, + "loss": 0.2731, + "step": 42876 + }, + { + "epoch": 0.7960330161181927, + "grad_norm": 0.47415080666542053, + "learning_rate": 1.983543496428145e-06, + "loss": 0.393, + "step": 42878 + }, + { + "epoch": 0.7960701462556113, + "grad_norm": 0.3783133625984192, + "learning_rate": 1.9828462234257683e-06, + "loss": 0.3412, + "step": 42880 + }, + { + "epoch": 0.7961072763930299, + "grad_norm": 0.7819389700889587, + "learning_rate": 1.98214905951284e-06, + "loss": 0.1384, + "step": 42882 + }, + { + "epoch": 0.7961444065304486, + "grad_norm": 0.45372262597084045, + "learning_rate": 1.981452004698847e-06, + "loss": 0.1567, + "step": 42884 + }, + { + "epoch": 0.7961815366678672, + "grad_norm": 0.37073835730552673, + "learning_rate": 1.980755058993273e-06, + "loss": 0.2749, + "step": 42886 + }, + { + "epoch": 0.7962186668052859, + "grad_norm": 0.2847364842891693, + "learning_rate": 1.9800582224056066e-06, + "loss": 0.3282, + "step": 42888 + }, + { + "epoch": 0.7962557969427044, + "grad_norm": 0.3301716148853302, + "learning_rate": 1.979361494945321e-06, + "loss": 0.2652, + "step": 42890 + }, + { + "epoch": 0.7962929270801231, + "grad_norm": 0.33778923749923706, + "learning_rate": 1.978664876621904e-06, + "loss": 0.272, + "step": 42892 + }, + { + "epoch": 0.7963300572175418, + "grad_norm": 0.3764488399028778, + "learning_rate": 1.977968367444829e-06, + "loss": 0.3562, + "step": 42894 + }, + { + "epoch": 0.7963671873549604, + "grad_norm": 0.33144626021385193, + "learning_rate": 1.9772719674235764e-06, + "loss": 0.3188, + "step": 42896 + }, + { + "epoch": 0.796404317492379, + "grad_norm": 0.3059694766998291, + "learning_rate": 1.97657567656762e-06, + "loss": 0.3798, + "step": 42898 + }, + { + "epoch": 0.7964414476297976, + "grad_norm": 0.6471332311630249, + "learning_rate": 1.975879494886437e-06, + "loss": 0.2643, + "step": 42900 + }, + { + "epoch": 0.7964785777672163, + "grad_norm": 0.40864214301109314, + "learning_rate": 1.9751834223895004e-06, + "loss": 0.2001, + "step": 42902 + }, + { + "epoch": 0.796515707904635, + "grad_norm": 0.40074893832206726, + "learning_rate": 1.974487459086278e-06, + "loss": 0.4009, + "step": 42904 + }, + { + "epoch": 0.7965528380420536, + "grad_norm": 0.3613109886646271, + "learning_rate": 1.9737916049862425e-06, + "loss": 0.3632, + "step": 42906 + }, + { + "epoch": 0.7965899681794723, + "grad_norm": 0.4784510135650635, + "learning_rate": 1.9730958600988627e-06, + "loss": 0.2825, + "step": 42908 + }, + { + "epoch": 0.7966270983168908, + "grad_norm": 0.3904340863227844, + "learning_rate": 1.972400224433606e-06, + "loss": 0.2128, + "step": 42910 + }, + { + "epoch": 0.7966642284543095, + "grad_norm": 0.48943015933036804, + "learning_rate": 1.9717046979999356e-06, + "loss": 0.1876, + "step": 42912 + }, + { + "epoch": 0.7967013585917282, + "grad_norm": 0.36092355847358704, + "learning_rate": 1.971009280807319e-06, + "loss": 0.2211, + "step": 42914 + }, + { + "epoch": 0.7967384887291468, + "grad_norm": 0.30325502157211304, + "learning_rate": 1.970313972865213e-06, + "loss": 0.3446, + "step": 42916 + }, + { + "epoch": 0.7967756188665654, + "grad_norm": 0.4336217939853668, + "learning_rate": 1.969618774183083e-06, + "loss": 0.2004, + "step": 42918 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.35627490282058716, + "learning_rate": 1.9689236847703873e-06, + "loss": 0.2784, + "step": 42920 + }, + { + "epoch": 0.7968498791414027, + "grad_norm": 0.4171198904514313, + "learning_rate": 1.9682287046365846e-06, + "loss": 0.1664, + "step": 42922 + }, + { + "epoch": 0.7968870092788214, + "grad_norm": 0.39028748869895935, + "learning_rate": 1.967533833791131e-06, + "loss": 0.2367, + "step": 42924 + }, + { + "epoch": 0.79692413941624, + "grad_norm": 0.42063596844673157, + "learning_rate": 1.9668390722434816e-06, + "loss": 0.2655, + "step": 42926 + }, + { + "epoch": 0.7969612695536586, + "grad_norm": 0.4091411530971527, + "learning_rate": 1.9661444200030933e-06, + "loss": 0.4197, + "step": 42928 + }, + { + "epoch": 0.7969983996910772, + "grad_norm": 0.3154064416885376, + "learning_rate": 1.965449877079413e-06, + "loss": 0.2299, + "step": 42930 + }, + { + "epoch": 0.7970355298284959, + "grad_norm": 0.46301957964897156, + "learning_rate": 1.9647554434818928e-06, + "loss": 0.3565, + "step": 42932 + }, + { + "epoch": 0.7970726599659146, + "grad_norm": 0.36202600598335266, + "learning_rate": 1.9640611192199864e-06, + "loss": 0.0981, + "step": 42934 + }, + { + "epoch": 0.7971097901033332, + "grad_norm": 0.344178169965744, + "learning_rate": 1.9633669043031344e-06, + "loss": 0.2722, + "step": 42936 + }, + { + "epoch": 0.7971469202407518, + "grad_norm": 0.28540897369384766, + "learning_rate": 1.9626727987407867e-06, + "loss": 0.3595, + "step": 42938 + }, + { + "epoch": 0.7971840503781704, + "grad_norm": 0.3103768229484558, + "learning_rate": 1.9619788025423903e-06, + "loss": 0.3873, + "step": 42940 + }, + { + "epoch": 0.7972211805155891, + "grad_norm": 0.38639652729034424, + "learning_rate": 1.961284915717382e-06, + "loss": 0.2072, + "step": 42942 + }, + { + "epoch": 0.7972583106530077, + "grad_norm": 0.5752630829811096, + "learning_rate": 1.9605911382752084e-06, + "loss": 0.1325, + "step": 42944 + }, + { + "epoch": 0.7972954407904264, + "grad_norm": 0.4168678820133209, + "learning_rate": 1.9598974702253082e-06, + "loss": 0.3326, + "step": 42946 + }, + { + "epoch": 0.797332570927845, + "grad_norm": 0.33254486322402954, + "learning_rate": 1.959203911577121e-06, + "loss": 0.2906, + "step": 42948 + }, + { + "epoch": 0.7973697010652636, + "grad_norm": 0.3816289007663727, + "learning_rate": 1.958510462340083e-06, + "loss": 0.2565, + "step": 42950 + }, + { + "epoch": 0.7974068312026823, + "grad_norm": 0.5795354247093201, + "learning_rate": 1.957817122523633e-06, + "loss": 0.2944, + "step": 42952 + }, + { + "epoch": 0.7974439613401009, + "grad_norm": 0.5652730464935303, + "learning_rate": 1.9571238921372017e-06, + "loss": 0.3168, + "step": 42954 + }, + { + "epoch": 0.7974810914775196, + "grad_norm": 0.4556003212928772, + "learning_rate": 1.956430771190224e-06, + "loss": 0.3636, + "step": 42956 + }, + { + "epoch": 0.7975182216149382, + "grad_norm": 0.35166963934898376, + "learning_rate": 1.955737759692128e-06, + "loss": 0.2805, + "step": 42958 + }, + { + "epoch": 0.7975553517523568, + "grad_norm": 0.4868939220905304, + "learning_rate": 1.955044857652346e-06, + "loss": 0.2895, + "step": 42960 + }, + { + "epoch": 0.7975924818897755, + "grad_norm": 0.47664740681648254, + "learning_rate": 1.954352065080306e-06, + "loss": 0.2442, + "step": 42962 + }, + { + "epoch": 0.7976296120271941, + "grad_norm": 0.37350475788116455, + "learning_rate": 1.9536593819854377e-06, + "loss": 0.276, + "step": 42964 + }, + { + "epoch": 0.7976667421646128, + "grad_norm": 0.30765920877456665, + "learning_rate": 1.95296680837716e-06, + "loss": 0.3317, + "step": 42966 + }, + { + "epoch": 0.7977038723020314, + "grad_norm": 0.4172854423522949, + "learning_rate": 1.9522743442649015e-06, + "loss": 0.2897, + "step": 42968 + }, + { + "epoch": 0.79774100243945, + "grad_norm": 0.4467003345489502, + "learning_rate": 1.9515819896580835e-06, + "loss": 0.3109, + "step": 42970 + }, + { + "epoch": 0.7977781325768687, + "grad_norm": 0.26181867718696594, + "learning_rate": 1.950889744566126e-06, + "loss": 0.166, + "step": 42972 + }, + { + "epoch": 0.7978152627142873, + "grad_norm": 0.40757080912590027, + "learning_rate": 1.950197608998452e-06, + "loss": 0.2841, + "step": 42974 + }, + { + "epoch": 0.797852392851706, + "grad_norm": 0.24695859849452972, + "learning_rate": 1.9495055829644748e-06, + "loss": 0.2295, + "step": 42976 + }, + { + "epoch": 0.7978895229891246, + "grad_norm": 0.6922567486763, + "learning_rate": 1.9488136664736148e-06, + "loss": 0.4111, + "step": 42978 + }, + { + "epoch": 0.7979266531265432, + "grad_norm": 0.430816113948822, + "learning_rate": 1.9481218595352814e-06, + "loss": 0.4955, + "step": 42980 + }, + { + "epoch": 0.7979637832639619, + "grad_norm": 0.25292617082595825, + "learning_rate": 1.947430162158892e-06, + "loss": 0.1351, + "step": 42982 + }, + { + "epoch": 0.7980009134013805, + "grad_norm": 0.3127245604991913, + "learning_rate": 1.946738574353858e-06, + "loss": 0.3354, + "step": 42984 + }, + { + "epoch": 0.7980380435387991, + "grad_norm": 0.3873004615306854, + "learning_rate": 1.9460470961295887e-06, + "loss": 0.2199, + "step": 42986 + }, + { + "epoch": 0.7980751736762178, + "grad_norm": 0.6320559978485107, + "learning_rate": 1.945355727495495e-06, + "loss": 0.2128, + "step": 42988 + }, + { + "epoch": 0.7981123038136364, + "grad_norm": 0.5764524936676025, + "learning_rate": 1.9446644684609863e-06, + "loss": 0.1694, + "step": 42990 + }, + { + "epoch": 0.7981494339510551, + "grad_norm": 0.3776129186153412, + "learning_rate": 1.9439733190354617e-06, + "loss": 0.4406, + "step": 42992 + }, + { + "epoch": 0.7981865640884737, + "grad_norm": 0.3860102891921997, + "learning_rate": 1.9432822792283303e-06, + "loss": 0.2736, + "step": 42994 + }, + { + "epoch": 0.7982236942258923, + "grad_norm": 0.25898656249046326, + "learning_rate": 1.942591349048998e-06, + "loss": 0.3975, + "step": 42996 + }, + { + "epoch": 0.7982608243633109, + "grad_norm": 0.6694298982620239, + "learning_rate": 1.9419005285068584e-06, + "loss": 0.3899, + "step": 42998 + }, + { + "epoch": 0.7982979545007296, + "grad_norm": 0.37497520446777344, + "learning_rate": 1.941209817611317e-06, + "loss": 0.1908, + "step": 43000 + }, + { + "epoch": 0.7983350846381483, + "grad_norm": 0.5986818671226501, + "learning_rate": 1.9405192163717735e-06, + "loss": 0.3549, + "step": 43002 + }, + { + "epoch": 0.7983722147755669, + "grad_norm": 0.34583595395088196, + "learning_rate": 1.93982872479762e-06, + "loss": 0.065, + "step": 43004 + }, + { + "epoch": 0.7984093449129855, + "grad_norm": 0.2747690975666046, + "learning_rate": 1.9391383428982536e-06, + "loss": 0.3264, + "step": 43006 + }, + { + "epoch": 0.7984464750504041, + "grad_norm": 0.4840778112411499, + "learning_rate": 1.93844807068307e-06, + "loss": 0.2221, + "step": 43008 + }, + { + "epoch": 0.7984836051878228, + "grad_norm": 0.24710655212402344, + "learning_rate": 1.9377579081614615e-06, + "loss": 0.2501, + "step": 43010 + }, + { + "epoch": 0.7985207353252415, + "grad_norm": 0.3082912266254425, + "learning_rate": 1.937067855342818e-06, + "loss": 0.1341, + "step": 43012 + }, + { + "epoch": 0.79855786546266, + "grad_norm": 0.21395233273506165, + "learning_rate": 1.93637791223653e-06, + "loss": 0.1109, + "step": 43014 + }, + { + "epoch": 0.7985949956000787, + "grad_norm": 0.3201897442340851, + "learning_rate": 1.9356880788519874e-06, + "loss": 0.3512, + "step": 43016 + }, + { + "epoch": 0.7986321257374973, + "grad_norm": 0.7574721574783325, + "learning_rate": 1.934998355198575e-06, + "loss": 0.2542, + "step": 43018 + }, + { + "epoch": 0.798669255874916, + "grad_norm": 0.38197097182273865, + "learning_rate": 1.9343087412856755e-06, + "loss": 0.2005, + "step": 43020 + }, + { + "epoch": 0.7987063860123347, + "grad_norm": 0.519420325756073, + "learning_rate": 1.9336192371226735e-06, + "loss": 0.1919, + "step": 43022 + }, + { + "epoch": 0.7987435161497533, + "grad_norm": 0.22292166948318481, + "learning_rate": 1.9329298427189525e-06, + "loss": 0.3386, + "step": 43024 + }, + { + "epoch": 0.7987806462871719, + "grad_norm": 0.1667504757642746, + "learning_rate": 1.932240558083892e-06, + "loss": 0.3664, + "step": 43026 + }, + { + "epoch": 0.7988177764245905, + "grad_norm": 0.41540899872779846, + "learning_rate": 1.9315513832268763e-06, + "loss": 0.1933, + "step": 43028 + }, + { + "epoch": 0.7988549065620092, + "grad_norm": 0.7609919905662537, + "learning_rate": 1.930862318157274e-06, + "loss": 0.5274, + "step": 43030 + }, + { + "epoch": 0.7988920366994279, + "grad_norm": 0.40697553753852844, + "learning_rate": 1.9301733628844666e-06, + "loss": 0.3669, + "step": 43032 + }, + { + "epoch": 0.7989291668368464, + "grad_norm": 0.34802672266960144, + "learning_rate": 1.929484517417827e-06, + "loss": 0.2315, + "step": 43034 + }, + { + "epoch": 0.7989662969742651, + "grad_norm": 0.45419275760650635, + "learning_rate": 1.928795781766729e-06, + "loss": 0.107, + "step": 43036 + }, + { + "epoch": 0.7990034271116837, + "grad_norm": 0.2737857401371002, + "learning_rate": 1.9281071559405484e-06, + "loss": 0.254, + "step": 43038 + }, + { + "epoch": 0.7990405572491024, + "grad_norm": 0.3402908146381378, + "learning_rate": 1.927418639948647e-06, + "loss": 0.4026, + "step": 43040 + }, + { + "epoch": 0.799077687386521, + "grad_norm": 0.2896142303943634, + "learning_rate": 1.9267302338004012e-06, + "loss": 0.1903, + "step": 43042 + }, + { + "epoch": 0.7991148175239396, + "grad_norm": 0.4730847179889679, + "learning_rate": 1.9260419375051732e-06, + "loss": 0.4513, + "step": 43044 + }, + { + "epoch": 0.7991519476613583, + "grad_norm": 0.33516234159469604, + "learning_rate": 1.9253537510723284e-06, + "loss": 0.2122, + "step": 43046 + }, + { + "epoch": 0.7991890777987769, + "grad_norm": 0.45864537358283997, + "learning_rate": 1.9246656745112337e-06, + "loss": 0.2465, + "step": 43048 + }, + { + "epoch": 0.7992262079361956, + "grad_norm": 0.305649071931839, + "learning_rate": 1.923977707831249e-06, + "loss": 0.4121, + "step": 43050 + }, + { + "epoch": 0.7992633380736142, + "grad_norm": 0.29891708493232727, + "learning_rate": 1.9232898510417397e-06, + "loss": 0.1992, + "step": 43052 + }, + { + "epoch": 0.7993004682110328, + "grad_norm": 0.21371294558048248, + "learning_rate": 1.9226021041520636e-06, + "loss": 0.1172, + "step": 43054 + }, + { + "epoch": 0.7993375983484515, + "grad_norm": 0.38667210936546326, + "learning_rate": 1.9219144671715774e-06, + "loss": 0.5409, + "step": 43056 + }, + { + "epoch": 0.7993747284858701, + "grad_norm": 0.29345375299453735, + "learning_rate": 1.9212269401096372e-06, + "loss": 0.3697, + "step": 43058 + }, + { + "epoch": 0.7994118586232888, + "grad_norm": 0.5943610668182373, + "learning_rate": 1.920539522975603e-06, + "loss": 0.3014, + "step": 43060 + }, + { + "epoch": 0.7994489887607074, + "grad_norm": 0.37347593903541565, + "learning_rate": 1.9198522157788214e-06, + "loss": 0.2693, + "step": 43062 + }, + { + "epoch": 0.799486118898126, + "grad_norm": 0.15082262456417084, + "learning_rate": 1.919165018528649e-06, + "loss": 0.1237, + "step": 43064 + }, + { + "epoch": 0.7995232490355447, + "grad_norm": 0.1935180425643921, + "learning_rate": 1.9184779312344393e-06, + "loss": 0.2395, + "step": 43066 + }, + { + "epoch": 0.7995603791729633, + "grad_norm": 0.41766008734703064, + "learning_rate": 1.917790953905534e-06, + "loss": 0.4238, + "step": 43068 + }, + { + "epoch": 0.799597509310382, + "grad_norm": 0.45843106508255005, + "learning_rate": 1.917104086551286e-06, + "loss": 0.2885, + "step": 43070 + }, + { + "epoch": 0.7996346394478006, + "grad_norm": 0.5241666436195374, + "learning_rate": 1.9164173291810394e-06, + "loss": 0.3662, + "step": 43072 + }, + { + "epoch": 0.7996717695852192, + "grad_norm": 0.4685191214084625, + "learning_rate": 1.91573068180414e-06, + "loss": 0.2577, + "step": 43074 + }, + { + "epoch": 0.7997088997226379, + "grad_norm": 0.45422470569610596, + "learning_rate": 1.9150441444299305e-06, + "loss": 0.315, + "step": 43076 + }, + { + "epoch": 0.7997460298600565, + "grad_norm": 0.8422712683677673, + "learning_rate": 1.9143577170677564e-06, + "loss": 0.2394, + "step": 43078 + }, + { + "epoch": 0.7997831599974752, + "grad_norm": 0.5112595558166504, + "learning_rate": 1.9136713997269517e-06, + "loss": 0.1634, + "step": 43080 + }, + { + "epoch": 0.7998202901348938, + "grad_norm": 0.38265547156333923, + "learning_rate": 1.9129851924168594e-06, + "loss": 0.4626, + "step": 43082 + }, + { + "epoch": 0.7998574202723124, + "grad_norm": 0.5256198048591614, + "learning_rate": 1.9122990951468134e-06, + "loss": 0.4621, + "step": 43084 + }, + { + "epoch": 0.7998945504097311, + "grad_norm": 0.3918473720550537, + "learning_rate": 1.911613107926151e-06, + "loss": 0.3365, + "step": 43086 + }, + { + "epoch": 0.7999316805471497, + "grad_norm": 0.4054127335548401, + "learning_rate": 1.910927230764208e-06, + "loss": 0.2619, + "step": 43088 + }, + { + "epoch": 0.7999688106845684, + "grad_norm": 0.4376373887062073, + "learning_rate": 1.910241463670317e-06, + "loss": 0.3786, + "step": 43090 + }, + { + "epoch": 0.800005940821987, + "grad_norm": 0.5270215272903442, + "learning_rate": 1.9095558066538056e-06, + "loss": 0.2439, + "step": 43092 + }, + { + "epoch": 0.8000430709594056, + "grad_norm": 0.6403298377990723, + "learning_rate": 1.908870259724007e-06, + "loss": 0.2846, + "step": 43094 + }, + { + "epoch": 0.8000802010968242, + "grad_norm": 0.3347005844116211, + "learning_rate": 1.9081848228902477e-06, + "loss": 0.4026, + "step": 43096 + }, + { + "epoch": 0.8001173312342429, + "grad_norm": 0.44083598256111145, + "learning_rate": 1.9074994961618543e-06, + "loss": 0.2276, + "step": 43098 + }, + { + "epoch": 0.8001544613716616, + "grad_norm": 0.39511510729789734, + "learning_rate": 1.9068142795481537e-06, + "loss": 0.2694, + "step": 43100 + }, + { + "epoch": 0.8001915915090801, + "grad_norm": 0.33721503615379333, + "learning_rate": 1.9061291730584718e-06, + "loss": 0.261, + "step": 43102 + }, + { + "epoch": 0.8002287216464988, + "grad_norm": 0.39148256182670593, + "learning_rate": 1.9054441767021282e-06, + "loss": 0.2343, + "step": 43104 + }, + { + "epoch": 0.8002658517839174, + "grad_norm": 0.38831228017807007, + "learning_rate": 1.9047592904884393e-06, + "loss": 0.2623, + "step": 43106 + }, + { + "epoch": 0.8003029819213361, + "grad_norm": 0.2891169786453247, + "learning_rate": 1.9040745144267293e-06, + "loss": 0.4191, + "step": 43108 + }, + { + "epoch": 0.8003401120587548, + "grad_norm": 0.46889522671699524, + "learning_rate": 1.9033898485263159e-06, + "loss": 0.345, + "step": 43110 + }, + { + "epoch": 0.8003772421961733, + "grad_norm": 0.4684017598628998, + "learning_rate": 1.9027052927965129e-06, + "loss": 0.3747, + "step": 43112 + }, + { + "epoch": 0.800414372333592, + "grad_norm": 0.3638305068016052, + "learning_rate": 1.9020208472466372e-06, + "loss": 0.1795, + "step": 43114 + }, + { + "epoch": 0.8004515024710106, + "grad_norm": 0.4729718863964081, + "learning_rate": 1.901336511886005e-06, + "loss": 0.1291, + "step": 43116 + }, + { + "epoch": 0.8004886326084293, + "grad_norm": 0.3358307182788849, + "learning_rate": 1.9006522867239207e-06, + "loss": 0.1718, + "step": 43118 + }, + { + "epoch": 0.800525762745848, + "grad_norm": 0.5696702003479004, + "learning_rate": 1.8999681717696993e-06, + "loss": 0.2886, + "step": 43120 + }, + { + "epoch": 0.8005628928832665, + "grad_norm": 0.450535386800766, + "learning_rate": 1.8992841670326479e-06, + "loss": 0.2308, + "step": 43122 + }, + { + "epoch": 0.8006000230206852, + "grad_norm": 0.5439473986625671, + "learning_rate": 1.8986002725220775e-06, + "loss": 0.1825, + "step": 43124 + }, + { + "epoch": 0.8006371531581038, + "grad_norm": 0.30289989709854126, + "learning_rate": 1.897916488247289e-06, + "loss": 0.1576, + "step": 43126 + }, + { + "epoch": 0.8006742832955225, + "grad_norm": 0.367119163274765, + "learning_rate": 1.8972328142175899e-06, + "loss": 0.2993, + "step": 43128 + }, + { + "epoch": 0.8007114134329412, + "grad_norm": 0.4330754280090332, + "learning_rate": 1.89654925044228e-06, + "loss": 0.3078, + "step": 43130 + }, + { + "epoch": 0.8007485435703597, + "grad_norm": 0.5412703156471252, + "learning_rate": 1.8958657969306615e-06, + "loss": 0.2562, + "step": 43132 + }, + { + "epoch": 0.8007856737077784, + "grad_norm": 0.8298667073249817, + "learning_rate": 1.8951824536920349e-06, + "loss": 0.2649, + "step": 43134 + }, + { + "epoch": 0.800822803845197, + "grad_norm": 0.29756802320480347, + "learning_rate": 1.8944992207356993e-06, + "loss": 0.3362, + "step": 43136 + }, + { + "epoch": 0.8008599339826157, + "grad_norm": 0.5000346899032593, + "learning_rate": 1.8938160980709497e-06, + "loss": 0.3412, + "step": 43138 + }, + { + "epoch": 0.8008970641200344, + "grad_norm": 0.41299769282341003, + "learning_rate": 1.8931330857070817e-06, + "loss": 0.3147, + "step": 43140 + }, + { + "epoch": 0.8009341942574529, + "grad_norm": 0.1876785159111023, + "learning_rate": 1.8924501836533926e-06, + "loss": 0.1849, + "step": 43142 + }, + { + "epoch": 0.8009713243948716, + "grad_norm": 0.2150462567806244, + "learning_rate": 1.8917673919191693e-06, + "loss": 0.3636, + "step": 43144 + }, + { + "epoch": 0.8010084545322902, + "grad_norm": 0.29601457715034485, + "learning_rate": 1.8910847105137075e-06, + "loss": 0.4118, + "step": 43146 + }, + { + "epoch": 0.8010455846697089, + "grad_norm": 0.3665584623813629, + "learning_rate": 1.8904021394462902e-06, + "loss": 0.3455, + "step": 43148 + }, + { + "epoch": 0.8010827148071274, + "grad_norm": 0.3078038990497589, + "learning_rate": 1.8897196787262095e-06, + "loss": 0.1855, + "step": 43150 + }, + { + "epoch": 0.8011198449445461, + "grad_norm": 0.5149518847465515, + "learning_rate": 1.8890373283627505e-06, + "loss": 0.2345, + "step": 43152 + }, + { + "epoch": 0.8011569750819648, + "grad_norm": 0.5233734250068665, + "learning_rate": 1.8883550883652014e-06, + "loss": 0.3004, + "step": 43154 + }, + { + "epoch": 0.8011941052193834, + "grad_norm": 0.5146218538284302, + "learning_rate": 1.88767295874284e-06, + "loss": 0.37, + "step": 43156 + }, + { + "epoch": 0.8012312353568021, + "grad_norm": 0.40019264817237854, + "learning_rate": 1.8869909395049502e-06, + "loss": 0.1849, + "step": 43158 + }, + { + "epoch": 0.8012683654942206, + "grad_norm": 0.8153854608535767, + "learning_rate": 1.8863090306608113e-06, + "loss": 0.2683, + "step": 43160 + }, + { + "epoch": 0.8013054956316393, + "grad_norm": 0.4583037197589874, + "learning_rate": 1.8856272322197044e-06, + "loss": 0.215, + "step": 43162 + }, + { + "epoch": 0.801342625769058, + "grad_norm": 1.4215084314346313, + "learning_rate": 1.884945544190908e-06, + "loss": 0.0513, + "step": 43164 + }, + { + "epoch": 0.8013797559064766, + "grad_norm": 0.22587573528289795, + "learning_rate": 1.8842639665836926e-06, + "loss": 0.31, + "step": 43166 + }, + { + "epoch": 0.8014168860438953, + "grad_norm": 0.495988130569458, + "learning_rate": 1.883582499407337e-06, + "loss": 0.2364, + "step": 43168 + }, + { + "epoch": 0.8014540161813138, + "grad_norm": 0.49853113293647766, + "learning_rate": 1.8829011426711109e-06, + "loss": 0.1589, + "step": 43170 + }, + { + "epoch": 0.8014911463187325, + "grad_norm": 0.5496615767478943, + "learning_rate": 1.8822198963842852e-06, + "loss": 0.2869, + "step": 43172 + }, + { + "epoch": 0.8015282764561512, + "grad_norm": 0.3180420994758606, + "learning_rate": 1.8815387605561319e-06, + "loss": 0.3316, + "step": 43174 + }, + { + "epoch": 0.8015654065935698, + "grad_norm": 0.4430505931377411, + "learning_rate": 1.8808577351959189e-06, + "loss": 0.4357, + "step": 43176 + }, + { + "epoch": 0.8016025367309885, + "grad_norm": 0.2955609858036041, + "learning_rate": 1.8801768203129122e-06, + "loss": 0.2116, + "step": 43178 + }, + { + "epoch": 0.801639666868407, + "grad_norm": 0.46044185757637024, + "learning_rate": 1.879496015916379e-06, + "loss": 0.3636, + "step": 43180 + }, + { + "epoch": 0.8016767970058257, + "grad_norm": 0.5946810245513916, + "learning_rate": 1.8788153220155803e-06, + "loss": 0.2224, + "step": 43182 + }, + { + "epoch": 0.8017139271432444, + "grad_norm": 0.45550569891929626, + "learning_rate": 1.878134738619778e-06, + "loss": 0.2776, + "step": 43184 + }, + { + "epoch": 0.801751057280663, + "grad_norm": 0.34014013409614563, + "learning_rate": 1.8774542657382344e-06, + "loss": 0.2012, + "step": 43186 + }, + { + "epoch": 0.8017881874180817, + "grad_norm": 1.281672477722168, + "learning_rate": 1.8767739033802113e-06, + "loss": 0.1994, + "step": 43188 + }, + { + "epoch": 0.8018253175555002, + "grad_norm": 0.41418516635894775, + "learning_rate": 1.876093651554961e-06, + "loss": 0.2363, + "step": 43190 + }, + { + "epoch": 0.8018624476929189, + "grad_norm": 0.42823171615600586, + "learning_rate": 1.8754135102717442e-06, + "loss": 0.3085, + "step": 43192 + }, + { + "epoch": 0.8018995778303375, + "grad_norm": 0.4472355246543884, + "learning_rate": 1.874733479539811e-06, + "loss": 0.2975, + "step": 43194 + }, + { + "epoch": 0.8019367079677562, + "grad_norm": 0.36870241165161133, + "learning_rate": 1.8740535593684184e-06, + "loss": 0.2924, + "step": 43196 + }, + { + "epoch": 0.8019738381051749, + "grad_norm": 0.8169774413108826, + "learning_rate": 1.8733737497668158e-06, + "loss": 0.1596, + "step": 43198 + }, + { + "epoch": 0.8020109682425934, + "grad_norm": 0.3381081521511078, + "learning_rate": 1.8726940507442536e-06, + "loss": 0.1528, + "step": 43200 + }, + { + "epoch": 0.8020480983800121, + "grad_norm": 0.5653008222579956, + "learning_rate": 1.8720144623099823e-06, + "loss": 0.3512, + "step": 43202 + }, + { + "epoch": 0.8020852285174307, + "grad_norm": 0.19422206282615662, + "learning_rate": 1.871334984473251e-06, + "loss": 0.1944, + "step": 43204 + }, + { + "epoch": 0.8021223586548494, + "grad_norm": 0.24484436213970184, + "learning_rate": 1.8706556172432988e-06, + "loss": 0.2368, + "step": 43206 + }, + { + "epoch": 0.8021594887922681, + "grad_norm": 0.35575175285339355, + "learning_rate": 1.8699763606293742e-06, + "loss": 0.2693, + "step": 43208 + }, + { + "epoch": 0.8021966189296866, + "grad_norm": 0.5147548317909241, + "learning_rate": 1.8692972146407219e-06, + "loss": 0.2344, + "step": 43210 + }, + { + "epoch": 0.8022337490671053, + "grad_norm": 0.38157403469085693, + "learning_rate": 1.8686181792865764e-06, + "loss": 0.3073, + "step": 43212 + }, + { + "epoch": 0.8022708792045239, + "grad_norm": 0.6146352291107178, + "learning_rate": 1.8679392545761821e-06, + "loss": 0.2166, + "step": 43214 + }, + { + "epoch": 0.8023080093419426, + "grad_norm": 0.5257559418678284, + "learning_rate": 1.8672604405187755e-06, + "loss": 0.2163, + "step": 43216 + }, + { + "epoch": 0.8023451394793613, + "grad_norm": 0.4373333752155304, + "learning_rate": 1.8665817371235973e-06, + "loss": 0.2744, + "step": 43218 + }, + { + "epoch": 0.8023822696167798, + "grad_norm": 0.3160369396209717, + "learning_rate": 1.8659031443998766e-06, + "loss": 0.2421, + "step": 43220 + }, + { + "epoch": 0.8024193997541985, + "grad_norm": 0.41223639249801636, + "learning_rate": 1.8652246623568493e-06, + "loss": 0.3403, + "step": 43222 + }, + { + "epoch": 0.8024565298916171, + "grad_norm": 0.5229523777961731, + "learning_rate": 1.8645462910037481e-06, + "loss": 0.3453, + "step": 43224 + }, + { + "epoch": 0.8024936600290358, + "grad_norm": 0.4297162890434265, + "learning_rate": 1.8638680303498036e-06, + "loss": 0.2659, + "step": 43226 + }, + { + "epoch": 0.8025307901664545, + "grad_norm": 0.3684268295764923, + "learning_rate": 1.863189880404247e-06, + "loss": 0.3765, + "step": 43228 + }, + { + "epoch": 0.802567920303873, + "grad_norm": 0.24633879959583282, + "learning_rate": 1.862511841176301e-06, + "loss": 0.214, + "step": 43230 + }, + { + "epoch": 0.8026050504412917, + "grad_norm": 0.5089553594589233, + "learning_rate": 1.8618339126751961e-06, + "loss": 0.2495, + "step": 43232 + }, + { + "epoch": 0.8026421805787103, + "grad_norm": 0.5402042865753174, + "learning_rate": 1.8611560949101537e-06, + "loss": 0.388, + "step": 43234 + }, + { + "epoch": 0.802679310716129, + "grad_norm": 0.24427495896816254, + "learning_rate": 1.8604783878903975e-06, + "loss": 0.2162, + "step": 43236 + }, + { + "epoch": 0.8027164408535477, + "grad_norm": 0.6427628993988037, + "learning_rate": 1.8598007916251503e-06, + "loss": 0.1537, + "step": 43238 + }, + { + "epoch": 0.8027535709909662, + "grad_norm": 0.19967681169509888, + "learning_rate": 1.8591233061236301e-06, + "loss": 0.289, + "step": 43240 + }, + { + "epoch": 0.8027907011283849, + "grad_norm": 0.22936366498470306, + "learning_rate": 1.858445931395061e-06, + "loss": 0.122, + "step": 43242 + }, + { + "epoch": 0.8028278312658035, + "grad_norm": 0.4564876854419708, + "learning_rate": 1.8577686674486528e-06, + "loss": 0.3432, + "step": 43244 + }, + { + "epoch": 0.8028649614032222, + "grad_norm": 0.3696427047252655, + "learning_rate": 1.8570915142936251e-06, + "loss": 0.2891, + "step": 43246 + }, + { + "epoch": 0.8029020915406407, + "grad_norm": 0.3821374475955963, + "learning_rate": 1.8564144719391897e-06, + "loss": 0.2975, + "step": 43248 + }, + { + "epoch": 0.8029392216780594, + "grad_norm": 0.33499541878700256, + "learning_rate": 1.8557375403945643e-06, + "loss": 0.1411, + "step": 43250 + }, + { + "epoch": 0.8029763518154781, + "grad_norm": 0.5015783309936523, + "learning_rate": 1.855060719668953e-06, + "loss": 0.2137, + "step": 43252 + }, + { + "epoch": 0.8030134819528967, + "grad_norm": 0.34600141644477844, + "learning_rate": 1.8543840097715705e-06, + "loss": 0.2724, + "step": 43254 + }, + { + "epoch": 0.8030506120903154, + "grad_norm": 0.2666163742542267, + "learning_rate": 1.8537074107116205e-06, + "loss": 0.2577, + "step": 43256 + }, + { + "epoch": 0.8030877422277339, + "grad_norm": 0.655259907245636, + "learning_rate": 1.8530309224983123e-06, + "loss": 0.2425, + "step": 43258 + }, + { + "epoch": 0.8031248723651526, + "grad_norm": 0.40070047974586487, + "learning_rate": 1.8523545451408497e-06, + "loss": 0.3429, + "step": 43260 + }, + { + "epoch": 0.8031620025025713, + "grad_norm": 0.6754332184791565, + "learning_rate": 1.8516782786484367e-06, + "loss": 0.3918, + "step": 43262 + }, + { + "epoch": 0.8031991326399899, + "grad_norm": 0.2892024517059326, + "learning_rate": 1.8510021230302744e-06, + "loss": 0.2347, + "step": 43264 + }, + { + "epoch": 0.8032362627774086, + "grad_norm": 0.25573599338531494, + "learning_rate": 1.8503260782955656e-06, + "loss": 0.2774, + "step": 43266 + }, + { + "epoch": 0.8032733929148271, + "grad_norm": 0.295149028301239, + "learning_rate": 1.8496501444535097e-06, + "loss": 0.2412, + "step": 43268 + }, + { + "epoch": 0.8033105230522458, + "grad_norm": 0.6195911765098572, + "learning_rate": 1.8489743215132993e-06, + "loss": 0.4111, + "step": 43270 + }, + { + "epoch": 0.8033476531896645, + "grad_norm": 0.3579482138156891, + "learning_rate": 1.848298609484136e-06, + "loss": 0.3796, + "step": 43272 + }, + { + "epoch": 0.8033847833270831, + "grad_norm": 0.4425990581512451, + "learning_rate": 1.8476230083752088e-06, + "loss": 0.194, + "step": 43274 + }, + { + "epoch": 0.8034219134645018, + "grad_norm": 0.5649939775466919, + "learning_rate": 1.8469475181957142e-06, + "loss": 0.3571, + "step": 43276 + }, + { + "epoch": 0.8034590436019203, + "grad_norm": 0.5413402318954468, + "learning_rate": 1.8462721389548422e-06, + "loss": 0.0872, + "step": 43278 + }, + { + "epoch": 0.803496173739339, + "grad_norm": 0.3577542006969452, + "learning_rate": 1.8455968706617854e-06, + "loss": 0.203, + "step": 43280 + }, + { + "epoch": 0.8035333038767577, + "grad_norm": 0.4656463861465454, + "learning_rate": 1.844921713325727e-06, + "loss": 0.3137, + "step": 43282 + }, + { + "epoch": 0.8035704340141763, + "grad_norm": 0.3261988162994385, + "learning_rate": 1.8442466669558578e-06, + "loss": 0.2861, + "step": 43284 + }, + { + "epoch": 0.803607564151595, + "grad_norm": 0.30262404680252075, + "learning_rate": 1.843571731561361e-06, + "loss": 0.389, + "step": 43286 + }, + { + "epoch": 0.8036446942890135, + "grad_norm": 0.3228495717048645, + "learning_rate": 1.8428969071514225e-06, + "loss": 0.3675, + "step": 43288 + }, + { + "epoch": 0.8036818244264322, + "grad_norm": 0.3985765874385834, + "learning_rate": 1.8422221937352236e-06, + "loss": 0.2045, + "step": 43290 + }, + { + "epoch": 0.8037189545638509, + "grad_norm": 0.32033851742744446, + "learning_rate": 1.841547591321947e-06, + "loss": 0.2345, + "step": 43292 + }, + { + "epoch": 0.8037560847012695, + "grad_norm": 0.6886205077171326, + "learning_rate": 1.840873099920769e-06, + "loss": 0.1642, + "step": 43294 + }, + { + "epoch": 0.8037932148386882, + "grad_norm": 0.39457011222839355, + "learning_rate": 1.84019871954087e-06, + "loss": 0.1899, + "step": 43296 + }, + { + "epoch": 0.8038303449761067, + "grad_norm": 0.5090628862380981, + "learning_rate": 1.8395244501914234e-06, + "loss": 0.2008, + "step": 43298 + }, + { + "epoch": 0.8038674751135254, + "grad_norm": 0.4681437313556671, + "learning_rate": 1.8388502918816053e-06, + "loss": 0.3015, + "step": 43300 + }, + { + "epoch": 0.803904605250944, + "grad_norm": 0.38829556107521057, + "learning_rate": 1.8381762446205887e-06, + "loss": 0.2017, + "step": 43302 + }, + { + "epoch": 0.8039417353883627, + "grad_norm": 0.25878778100013733, + "learning_rate": 1.8375023084175469e-06, + "loss": 0.2045, + "step": 43304 + }, + { + "epoch": 0.8039788655257813, + "grad_norm": 0.3184488117694855, + "learning_rate": 1.8368284832816508e-06, + "loss": 0.2564, + "step": 43306 + }, + { + "epoch": 0.8040159956631999, + "grad_norm": 0.4944175183773041, + "learning_rate": 1.836154769222066e-06, + "loss": 0.2729, + "step": 43308 + }, + { + "epoch": 0.8040531258006186, + "grad_norm": 0.39799389243125916, + "learning_rate": 1.83548116624796e-06, + "loss": 0.2656, + "step": 43310 + }, + { + "epoch": 0.8040902559380372, + "grad_norm": 0.47483983635902405, + "learning_rate": 1.8348076743685005e-06, + "loss": 0.3832, + "step": 43312 + }, + { + "epoch": 0.8041273860754559, + "grad_norm": 0.4117342531681061, + "learning_rate": 1.8341342935928542e-06, + "loss": 0.4255, + "step": 43314 + }, + { + "epoch": 0.8041645162128745, + "grad_norm": 0.40704625844955444, + "learning_rate": 1.8334610239301765e-06, + "loss": 0.1167, + "step": 43316 + }, + { + "epoch": 0.8042016463502931, + "grad_norm": 0.38332149386405945, + "learning_rate": 1.8327878653896358e-06, + "loss": 0.2752, + "step": 43318 + }, + { + "epoch": 0.8042387764877118, + "grad_norm": 0.43960854411125183, + "learning_rate": 1.8321148179803871e-06, + "loss": 0.4366, + "step": 43320 + }, + { + "epoch": 0.8042759066251304, + "grad_norm": 0.5045108199119568, + "learning_rate": 1.8314418817115887e-06, + "loss": 0.0976, + "step": 43322 + }, + { + "epoch": 0.8043130367625491, + "grad_norm": 0.41069746017456055, + "learning_rate": 1.8307690565923986e-06, + "loss": 0.2607, + "step": 43324 + }, + { + "epoch": 0.8043501668999677, + "grad_norm": 0.518173336982727, + "learning_rate": 1.830096342631973e-06, + "loss": 0.3668, + "step": 43326 + }, + { + "epoch": 0.8043872970373863, + "grad_norm": 0.30913227796554565, + "learning_rate": 1.8294237398394643e-06, + "loss": 0.2582, + "step": 43328 + }, + { + "epoch": 0.804424427174805, + "grad_norm": 0.3206116855144501, + "learning_rate": 1.8287512482240266e-06, + "loss": 0.143, + "step": 43330 + }, + { + "epoch": 0.8044615573122236, + "grad_norm": 0.48731645941734314, + "learning_rate": 1.8280788677948068e-06, + "loss": 0.2488, + "step": 43332 + }, + { + "epoch": 0.8044986874496423, + "grad_norm": 0.3795047402381897, + "learning_rate": 1.8274065985609557e-06, + "loss": 0.346, + "step": 43334 + }, + { + "epoch": 0.8045358175870609, + "grad_norm": 0.3395436108112335, + "learning_rate": 1.8267344405316235e-06, + "loss": 0.2858, + "step": 43336 + }, + { + "epoch": 0.8045729477244795, + "grad_norm": 0.3880978524684906, + "learning_rate": 1.8260623937159506e-06, + "loss": 0.1366, + "step": 43338 + }, + { + "epoch": 0.8046100778618982, + "grad_norm": 0.5826380848884583, + "learning_rate": 1.8253904581230851e-06, + "loss": 0.2398, + "step": 43340 + }, + { + "epoch": 0.8046472079993168, + "grad_norm": 0.3195175528526306, + "learning_rate": 1.8247186337621702e-06, + "loss": 0.4254, + "step": 43342 + }, + { + "epoch": 0.8046843381367355, + "grad_norm": 0.4237681031227112, + "learning_rate": 1.8240469206423495e-06, + "loss": 0.2074, + "step": 43344 + }, + { + "epoch": 0.804721468274154, + "grad_norm": 0.47419479489326477, + "learning_rate": 1.8233753187727577e-06, + "loss": 0.3294, + "step": 43346 + }, + { + "epoch": 0.8047585984115727, + "grad_norm": 0.7384077310562134, + "learning_rate": 1.8227038281625353e-06, + "loss": 0.3279, + "step": 43348 + }, + { + "epoch": 0.8047957285489914, + "grad_norm": 0.7821800708770752, + "learning_rate": 1.8220324488208207e-06, + "loss": 0.2406, + "step": 43350 + }, + { + "epoch": 0.80483285868641, + "grad_norm": 0.27602943778038025, + "learning_rate": 1.8213611807567488e-06, + "loss": 0.1983, + "step": 43352 + }, + { + "epoch": 0.8048699888238287, + "grad_norm": 0.4104331433773041, + "learning_rate": 1.8206900239794533e-06, + "loss": 0.4232, + "step": 43354 + }, + { + "epoch": 0.8049071189612472, + "grad_norm": 0.4724610149860382, + "learning_rate": 1.8200189784980683e-06, + "loss": 0.1915, + "step": 43356 + }, + { + "epoch": 0.8049442490986659, + "grad_norm": 0.4686844050884247, + "learning_rate": 1.8193480443217238e-06, + "loss": 0.2924, + "step": 43358 + }, + { + "epoch": 0.8049813792360846, + "grad_norm": 0.3730349540710449, + "learning_rate": 1.818677221459546e-06, + "loss": 0.1122, + "step": 43360 + }, + { + "epoch": 0.8050185093735032, + "grad_norm": 0.3631017804145813, + "learning_rate": 1.8180065099206657e-06, + "loss": 0.406, + "step": 43362 + }, + { + "epoch": 0.8050556395109218, + "grad_norm": 0.41883212327957153, + "learning_rate": 1.8173359097142085e-06, + "loss": 0.1862, + "step": 43364 + }, + { + "epoch": 0.8050927696483404, + "grad_norm": 0.1606208086013794, + "learning_rate": 1.8166654208492994e-06, + "loss": 0.2055, + "step": 43366 + }, + { + "epoch": 0.8051298997857591, + "grad_norm": 0.3141595721244812, + "learning_rate": 1.8159950433350648e-06, + "loss": 0.1557, + "step": 43368 + }, + { + "epoch": 0.8051670299231778, + "grad_norm": 0.48711565136909485, + "learning_rate": 1.8153247771806215e-06, + "loss": 0.2681, + "step": 43370 + }, + { + "epoch": 0.8052041600605964, + "grad_norm": 0.2692526876926422, + "learning_rate": 1.8146546223950912e-06, + "loss": 0.45, + "step": 43372 + }, + { + "epoch": 0.805241290198015, + "grad_norm": 0.19378980994224548, + "learning_rate": 1.8139845789875943e-06, + "loss": 0.2439, + "step": 43374 + }, + { + "epoch": 0.8052784203354336, + "grad_norm": 0.4063666760921478, + "learning_rate": 1.8133146469672469e-06, + "loss": 0.3842, + "step": 43376 + }, + { + "epoch": 0.8053155504728523, + "grad_norm": 0.4238714873790741, + "learning_rate": 1.8126448263431673e-06, + "loss": 0.2104, + "step": 43378 + }, + { + "epoch": 0.805352680610271, + "grad_norm": 0.47500571608543396, + "learning_rate": 1.8119751171244681e-06, + "loss": 0.4069, + "step": 43380 + }, + { + "epoch": 0.8053898107476896, + "grad_norm": 0.34225329756736755, + "learning_rate": 1.81130551932026e-06, + "loss": 0.1589, + "step": 43382 + }, + { + "epoch": 0.8054269408851082, + "grad_norm": 0.47826430201530457, + "learning_rate": 1.8106360329396544e-06, + "loss": 0.2706, + "step": 43384 + }, + { + "epoch": 0.8054640710225268, + "grad_norm": 0.49866336584091187, + "learning_rate": 1.809966657991763e-06, + "loss": 0.2475, + "step": 43386 + }, + { + "epoch": 0.8055012011599455, + "grad_norm": 0.3814980089664459, + "learning_rate": 1.809297394485694e-06, + "loss": 0.1413, + "step": 43388 + }, + { + "epoch": 0.8055383312973642, + "grad_norm": 0.31049486994743347, + "learning_rate": 1.8086282424305525e-06, + "loss": 0.2359, + "step": 43390 + }, + { + "epoch": 0.8055754614347828, + "grad_norm": 0.5000649690628052, + "learning_rate": 1.8079592018354453e-06, + "loss": 0.22, + "step": 43392 + }, + { + "epoch": 0.8056125915722014, + "grad_norm": 0.6004167795181274, + "learning_rate": 1.8072902727094777e-06, + "loss": 0.1379, + "step": 43394 + }, + { + "epoch": 0.80564972170962, + "grad_norm": 0.335161030292511, + "learning_rate": 1.8066214550617466e-06, + "loss": 0.4015, + "step": 43396 + }, + { + "epoch": 0.8056868518470387, + "grad_norm": 0.37503138184547424, + "learning_rate": 1.8059527489013551e-06, + "loss": 0.2933, + "step": 43398 + }, + { + "epoch": 0.8057239819844573, + "grad_norm": 0.27074676752090454, + "learning_rate": 1.8052841542374056e-06, + "loss": 0.3332, + "step": 43400 + }, + { + "epoch": 0.805761112121876, + "grad_norm": 0.4484935998916626, + "learning_rate": 1.8046156710789907e-06, + "loss": 0.4415, + "step": 43402 + }, + { + "epoch": 0.8057982422592946, + "grad_norm": 0.3852640688419342, + "learning_rate": 1.803947299435208e-06, + "loss": 0.1681, + "step": 43404 + }, + { + "epoch": 0.8058353723967132, + "grad_norm": 0.3704209625720978, + "learning_rate": 1.8032790393151556e-06, + "loss": 0.0885, + "step": 43406 + }, + { + "epoch": 0.8058725025341319, + "grad_norm": 0.3663286864757538, + "learning_rate": 1.8026108907279204e-06, + "loss": 0.2001, + "step": 43408 + }, + { + "epoch": 0.8059096326715505, + "grad_norm": 0.7199390530586243, + "learning_rate": 1.8019428536825978e-06, + "loss": 0.4433, + "step": 43410 + }, + { + "epoch": 0.8059467628089692, + "grad_norm": 0.31049951910972595, + "learning_rate": 1.8012749281882759e-06, + "loss": 0.2676, + "step": 43412 + }, + { + "epoch": 0.8059838929463878, + "grad_norm": 0.403421014547348, + "learning_rate": 1.8006071142540448e-06, + "loss": 0.2044, + "step": 43414 + }, + { + "epoch": 0.8060210230838064, + "grad_norm": 0.3136270344257355, + "learning_rate": 1.7999394118889901e-06, + "loss": 0.1751, + "step": 43416 + }, + { + "epoch": 0.8060581532212251, + "grad_norm": 0.26560845971107483, + "learning_rate": 1.799271821102201e-06, + "loss": 0.1704, + "step": 43418 + }, + { + "epoch": 0.8060952833586437, + "grad_norm": 0.45790261030197144, + "learning_rate": 1.7986043419027565e-06, + "loss": 0.3555, + "step": 43420 + }, + { + "epoch": 0.8061324134960623, + "grad_norm": 0.327329158782959, + "learning_rate": 1.7979369742997421e-06, + "loss": 0.1462, + "step": 43422 + }, + { + "epoch": 0.806169543633481, + "grad_norm": 0.2910301983356476, + "learning_rate": 1.797269718302236e-06, + "loss": 0.3186, + "step": 43424 + }, + { + "epoch": 0.8062066737708996, + "grad_norm": 0.7805314064025879, + "learning_rate": 1.7966025739193194e-06, + "loss": 0.2915, + "step": 43426 + }, + { + "epoch": 0.8062438039083183, + "grad_norm": 0.3197399377822876, + "learning_rate": 1.7959355411600688e-06, + "loss": 0.3809, + "step": 43428 + }, + { + "epoch": 0.8062809340457369, + "grad_norm": 0.2715917229652405, + "learning_rate": 1.7952686200335612e-06, + "loss": 0.2173, + "step": 43430 + }, + { + "epoch": 0.8063180641831555, + "grad_norm": 0.28683269023895264, + "learning_rate": 1.794601810548875e-06, + "loss": 0.1731, + "step": 43432 + }, + { + "epoch": 0.8063551943205742, + "grad_norm": 0.31278592348098755, + "learning_rate": 1.7939351127150773e-06, + "loss": 0.1094, + "step": 43434 + }, + { + "epoch": 0.8063923244579928, + "grad_norm": 0.6151852011680603, + "learning_rate": 1.793268526541242e-06, + "loss": 0.4134, + "step": 43436 + }, + { + "epoch": 0.8064294545954115, + "grad_norm": 0.6536759734153748, + "learning_rate": 1.7926020520364407e-06, + "loss": 0.3735, + "step": 43438 + }, + { + "epoch": 0.8064665847328301, + "grad_norm": 0.5904718041419983, + "learning_rate": 1.791935689209745e-06, + "loss": 0.445, + "step": 43440 + }, + { + "epoch": 0.8065037148702487, + "grad_norm": 0.5025285482406616, + "learning_rate": 1.7912694380702144e-06, + "loss": 0.0996, + "step": 43442 + }, + { + "epoch": 0.8065408450076674, + "grad_norm": 0.5815796256065369, + "learning_rate": 1.7906032986269216e-06, + "loss": 0.2888, + "step": 43444 + }, + { + "epoch": 0.806577975145086, + "grad_norm": 0.27919381856918335, + "learning_rate": 1.7899372708889262e-06, + "loss": 0.1935, + "step": 43446 + }, + { + "epoch": 0.8066151052825047, + "grad_norm": 0.6288081407546997, + "learning_rate": 1.7892713548652918e-06, + "loss": 0.3838, + "step": 43448 + }, + { + "epoch": 0.8066522354199233, + "grad_norm": 0.37158867716789246, + "learning_rate": 1.7886055505650812e-06, + "loss": 0.2229, + "step": 43450 + }, + { + "epoch": 0.8066893655573419, + "grad_norm": 0.35470694303512573, + "learning_rate": 1.7879398579973518e-06, + "loss": 0.3228, + "step": 43452 + }, + { + "epoch": 0.8067264956947605, + "grad_norm": 0.34727033972740173, + "learning_rate": 1.7872742771711638e-06, + "loss": 0.2563, + "step": 43454 + }, + { + "epoch": 0.8067636258321792, + "grad_norm": 0.49045348167419434, + "learning_rate": 1.7866088080955758e-06, + "loss": 0.2129, + "step": 43456 + }, + { + "epoch": 0.8068007559695979, + "grad_norm": 0.3321934938430786, + "learning_rate": 1.7859434507796368e-06, + "loss": 0.2538, + "step": 43458 + }, + { + "epoch": 0.8068378861070165, + "grad_norm": 0.3904259502887726, + "learning_rate": 1.7852782052324035e-06, + "loss": 0.3278, + "step": 43460 + }, + { + "epoch": 0.8068750162444351, + "grad_norm": 0.49974340200424194, + "learning_rate": 1.7846130714629284e-06, + "loss": 0.2433, + "step": 43462 + }, + { + "epoch": 0.8069121463818537, + "grad_norm": 0.3082742989063263, + "learning_rate": 1.783948049480263e-06, + "loss": 0.2594, + "step": 43464 + }, + { + "epoch": 0.8069492765192724, + "grad_norm": 0.42806223034858704, + "learning_rate": 1.7832831392934536e-06, + "loss": 0.254, + "step": 43466 + }, + { + "epoch": 0.8069864066566911, + "grad_norm": 0.3587949275970459, + "learning_rate": 1.7826183409115471e-06, + "loss": 0.3253, + "step": 43468 + }, + { + "epoch": 0.8070235367941097, + "grad_norm": 0.4119024872779846, + "learning_rate": 1.7819536543435945e-06, + "loss": 0.4468, + "step": 43470 + }, + { + "epoch": 0.8070606669315283, + "grad_norm": 0.32273563742637634, + "learning_rate": 1.781289079598635e-06, + "loss": 0.2009, + "step": 43472 + }, + { + "epoch": 0.8070977970689469, + "grad_norm": 0.6856324076652527, + "learning_rate": 1.7806246166857122e-06, + "loss": 0.2399, + "step": 43474 + }, + { + "epoch": 0.8071349272063656, + "grad_norm": 0.43645039200782776, + "learning_rate": 1.7799602656138681e-06, + "loss": 0.2001, + "step": 43476 + }, + { + "epoch": 0.8071720573437843, + "grad_norm": 0.32688260078430176, + "learning_rate": 1.7792960263921433e-06, + "loss": 0.2047, + "step": 43478 + }, + { + "epoch": 0.8072091874812029, + "grad_norm": 0.3702280819416046, + "learning_rate": 1.7786318990295758e-06, + "loss": 0.2412, + "step": 43480 + }, + { + "epoch": 0.8072463176186215, + "grad_norm": 0.30157771706581116, + "learning_rate": 1.7779678835352054e-06, + "loss": 0.3085, + "step": 43482 + }, + { + "epoch": 0.8072834477560401, + "grad_norm": 0.4237672686576843, + "learning_rate": 1.7773039799180614e-06, + "loss": 0.5321, + "step": 43484 + }, + { + "epoch": 0.8073205778934588, + "grad_norm": 0.47945550084114075, + "learning_rate": 1.776640188187183e-06, + "loss": 0.3135, + "step": 43486 + }, + { + "epoch": 0.8073577080308775, + "grad_norm": 0.29122671484947205, + "learning_rate": 1.7759765083515967e-06, + "loss": 0.2092, + "step": 43488 + }, + { + "epoch": 0.807394838168296, + "grad_norm": 0.3656920790672302, + "learning_rate": 1.7753129404203372e-06, + "loss": 0.3622, + "step": 43490 + }, + { + "epoch": 0.8074319683057147, + "grad_norm": 0.41408294439315796, + "learning_rate": 1.774649484402432e-06, + "loss": 0.1361, + "step": 43492 + }, + { + "epoch": 0.8074690984431333, + "grad_norm": 0.25159546732902527, + "learning_rate": 1.7739861403069124e-06, + "loss": 0.2163, + "step": 43494 + }, + { + "epoch": 0.807506228580552, + "grad_norm": 0.2922087013721466, + "learning_rate": 1.7733229081427995e-06, + "loss": 0.2089, + "step": 43496 + }, + { + "epoch": 0.8075433587179706, + "grad_norm": 0.4318699836730957, + "learning_rate": 1.7726597879191198e-06, + "loss": 0.297, + "step": 43498 + }, + { + "epoch": 0.8075804888553892, + "grad_norm": 0.5053518414497375, + "learning_rate": 1.7719967796448968e-06, + "loss": 0.3329, + "step": 43500 + }, + { + "epoch": 0.8076176189928079, + "grad_norm": 0.5303958654403687, + "learning_rate": 1.7713338833291516e-06, + "loss": 0.281, + "step": 43502 + }, + { + "epoch": 0.8076547491302265, + "grad_norm": 0.3132563829421997, + "learning_rate": 1.7706710989809074e-06, + "loss": 0.1988, + "step": 43504 + }, + { + "epoch": 0.8076918792676452, + "grad_norm": 0.3681938052177429, + "learning_rate": 1.7700084266091766e-06, + "loss": 0.1013, + "step": 43506 + }, + { + "epoch": 0.8077290094050638, + "grad_norm": 0.379341721534729, + "learning_rate": 1.7693458662229824e-06, + "loss": 0.2417, + "step": 43508 + }, + { + "epoch": 0.8077661395424824, + "grad_norm": 0.2612236738204956, + "learning_rate": 1.768683417831335e-06, + "loss": 0.3124, + "step": 43510 + }, + { + "epoch": 0.8078032696799011, + "grad_norm": 0.3381321132183075, + "learning_rate": 1.7680210814432508e-06, + "loss": 0.2379, + "step": 43512 + }, + { + "epoch": 0.8078403998173197, + "grad_norm": 0.3066912889480591, + "learning_rate": 1.7673588570677414e-06, + "loss": 0.1938, + "step": 43514 + }, + { + "epoch": 0.8078775299547384, + "grad_norm": 0.4357501268386841, + "learning_rate": 1.7666967447138184e-06, + "loss": 0.3481, + "step": 43516 + }, + { + "epoch": 0.807914660092157, + "grad_norm": 0.25423452258110046, + "learning_rate": 1.7660347443904912e-06, + "loss": 0.4242, + "step": 43518 + }, + { + "epoch": 0.8079517902295756, + "grad_norm": 0.255190908908844, + "learning_rate": 1.7653728561067707e-06, + "loss": 0.1953, + "step": 43520 + }, + { + "epoch": 0.8079889203669943, + "grad_norm": 0.455807089805603, + "learning_rate": 1.7647110798716571e-06, + "loss": 0.104, + "step": 43522 + }, + { + "epoch": 0.8080260505044129, + "grad_norm": 0.3583429753780365, + "learning_rate": 1.7640494156941579e-06, + "loss": 0.1357, + "step": 43524 + }, + { + "epoch": 0.8080631806418316, + "grad_norm": 0.3683110475540161, + "learning_rate": 1.763387863583279e-06, + "loss": 0.1254, + "step": 43526 + }, + { + "epoch": 0.8081003107792502, + "grad_norm": 0.35647693276405334, + "learning_rate": 1.762726423548018e-06, + "loss": 0.4962, + "step": 43528 + }, + { + "epoch": 0.8081374409166688, + "grad_norm": 0.38255202770233154, + "learning_rate": 1.762065095597376e-06, + "loss": 0.1685, + "step": 43530 + }, + { + "epoch": 0.8081745710540875, + "grad_norm": 0.36853960156440735, + "learning_rate": 1.7614038797403555e-06, + "loss": 0.3672, + "step": 43532 + }, + { + "epoch": 0.8082117011915061, + "grad_norm": 0.518334150314331, + "learning_rate": 1.7607427759859487e-06, + "loss": 0.2918, + "step": 43534 + }, + { + "epoch": 0.8082488313289248, + "grad_norm": 0.23984551429748535, + "learning_rate": 1.7600817843431528e-06, + "loss": 0.1752, + "step": 43536 + }, + { + "epoch": 0.8082859614663434, + "grad_norm": 0.39685407280921936, + "learning_rate": 1.7594209048209632e-06, + "loss": 0.2374, + "step": 43538 + }, + { + "epoch": 0.808323091603762, + "grad_norm": 0.4364679455757141, + "learning_rate": 1.7587601374283703e-06, + "loss": 0.1603, + "step": 43540 + }, + { + "epoch": 0.8083602217411807, + "grad_norm": 0.41620877385139465, + "learning_rate": 1.7580994821743668e-06, + "loss": 0.2609, + "step": 43542 + }, + { + "epoch": 0.8083973518785993, + "grad_norm": 0.19475655257701874, + "learning_rate": 1.7574389390679458e-06, + "loss": 0.2037, + "step": 43544 + }, + { + "epoch": 0.808434482016018, + "grad_norm": 0.31679633259773254, + "learning_rate": 1.7567785081180876e-06, + "loss": 0.3769, + "step": 43546 + }, + { + "epoch": 0.8084716121534365, + "grad_norm": 0.4953288435935974, + "learning_rate": 1.7561181893337865e-06, + "loss": 0.2136, + "step": 43548 + }, + { + "epoch": 0.8085087422908552, + "grad_norm": 0.5830520391464233, + "learning_rate": 1.7554579827240192e-06, + "loss": 0.3411, + "step": 43550 + }, + { + "epoch": 0.8085458724282738, + "grad_norm": 0.3301014006137848, + "learning_rate": 1.7547978882977745e-06, + "loss": 0.2588, + "step": 43552 + }, + { + "epoch": 0.8085830025656925, + "grad_norm": 0.4469843804836273, + "learning_rate": 1.754137906064034e-06, + "loss": 0.2422, + "step": 43554 + }, + { + "epoch": 0.8086201327031112, + "grad_norm": 0.3511788845062256, + "learning_rate": 1.753478036031776e-06, + "loss": 0.2667, + "step": 43556 + }, + { + "epoch": 0.8086572628405297, + "grad_norm": 0.4620882272720337, + "learning_rate": 1.7528182782099845e-06, + "loss": 0.2264, + "step": 43558 + }, + { + "epoch": 0.8086943929779484, + "grad_norm": 0.3282831609249115, + "learning_rate": 1.75215863260763e-06, + "loss": 0.3191, + "step": 43560 + }, + { + "epoch": 0.808731523115367, + "grad_norm": 0.6383742094039917, + "learning_rate": 1.7514990992336912e-06, + "loss": 0.4763, + "step": 43562 + }, + { + "epoch": 0.8087686532527857, + "grad_norm": 0.39464277029037476, + "learning_rate": 1.7508396780971426e-06, + "loss": 0.2699, + "step": 43564 + }, + { + "epoch": 0.8088057833902044, + "grad_norm": 0.3656119704246521, + "learning_rate": 1.7501803692069575e-06, + "loss": 0.3724, + "step": 43566 + }, + { + "epoch": 0.8088429135276229, + "grad_norm": 0.3517090380191803, + "learning_rate": 1.7495211725721084e-06, + "loss": 0.2025, + "step": 43568 + }, + { + "epoch": 0.8088800436650416, + "grad_norm": 0.37723904848098755, + "learning_rate": 1.7488620882015606e-06, + "loss": 0.1969, + "step": 43570 + }, + { + "epoch": 0.8089171738024602, + "grad_norm": 0.5901374816894531, + "learning_rate": 1.7482031161042868e-06, + "loss": 0.2981, + "step": 43572 + }, + { + "epoch": 0.8089543039398789, + "grad_norm": 0.24585402011871338, + "learning_rate": 1.7475442562892497e-06, + "loss": 0.2039, + "step": 43574 + }, + { + "epoch": 0.8089914340772976, + "grad_norm": 0.31929776072502136, + "learning_rate": 1.7468855087654168e-06, + "loss": 0.3258, + "step": 43576 + }, + { + "epoch": 0.8090285642147161, + "grad_norm": 0.36173340678215027, + "learning_rate": 1.746226873541751e-06, + "loss": 0.3943, + "step": 43578 + }, + { + "epoch": 0.8090656943521348, + "grad_norm": 0.518196165561676, + "learning_rate": 1.7455683506272137e-06, + "loss": 0.3303, + "step": 43580 + }, + { + "epoch": 0.8091028244895534, + "grad_norm": 0.3349676728248596, + "learning_rate": 1.744909940030769e-06, + "loss": 0.3504, + "step": 43582 + }, + { + "epoch": 0.8091399546269721, + "grad_norm": 0.39362040162086487, + "learning_rate": 1.7442516417613708e-06, + "loss": 0.3603, + "step": 43584 + }, + { + "epoch": 0.8091770847643908, + "grad_norm": 0.34264281392097473, + "learning_rate": 1.7435934558279787e-06, + "loss": 0.3535, + "step": 43586 + }, + { + "epoch": 0.8092142149018093, + "grad_norm": 0.46950778365135193, + "learning_rate": 1.7429353822395501e-06, + "loss": 0.1485, + "step": 43588 + }, + { + "epoch": 0.809251345039228, + "grad_norm": 0.32493266463279724, + "learning_rate": 1.7422774210050398e-06, + "loss": 0.116, + "step": 43590 + }, + { + "epoch": 0.8092884751766466, + "grad_norm": 0.41430068016052246, + "learning_rate": 1.741619572133396e-06, + "loss": 0.2963, + "step": 43592 + }, + { + "epoch": 0.8093256053140653, + "grad_norm": 0.6539960503578186, + "learning_rate": 1.7409618356335733e-06, + "loss": 0.3436, + "step": 43594 + }, + { + "epoch": 0.809362735451484, + "grad_norm": 0.39869973063468933, + "learning_rate": 1.740304211514524e-06, + "loss": 0.3251, + "step": 43596 + }, + { + "epoch": 0.8093998655889025, + "grad_norm": 0.329857736825943, + "learning_rate": 1.7396466997851925e-06, + "loss": 0.3835, + "step": 43598 + }, + { + "epoch": 0.8094369957263212, + "grad_norm": 0.6175298094749451, + "learning_rate": 1.7389893004545254e-06, + "loss": 0.5915, + "step": 43600 + }, + { + "epoch": 0.8094741258637398, + "grad_norm": 0.7558514475822449, + "learning_rate": 1.7383320135314697e-06, + "loss": 0.3094, + "step": 43602 + }, + { + "epoch": 0.8095112560011585, + "grad_norm": 0.44288742542266846, + "learning_rate": 1.7376748390249686e-06, + "loss": 0.1627, + "step": 43604 + }, + { + "epoch": 0.809548386138577, + "grad_norm": 0.3741776943206787, + "learning_rate": 1.7370177769439644e-06, + "loss": 0.3526, + "step": 43606 + }, + { + "epoch": 0.8095855162759957, + "grad_norm": 0.6091138124465942, + "learning_rate": 1.7363608272973997e-06, + "loss": 0.2258, + "step": 43608 + }, + { + "epoch": 0.8096226464134144, + "grad_norm": 0.37969157099723816, + "learning_rate": 1.7357039900942108e-06, + "loss": 0.3495, + "step": 43610 + }, + { + "epoch": 0.809659776550833, + "grad_norm": 0.3240143954753876, + "learning_rate": 1.735047265343337e-06, + "loss": 0.1554, + "step": 43612 + }, + { + "epoch": 0.8096969066882517, + "grad_norm": 0.4269893169403076, + "learning_rate": 1.7343906530537114e-06, + "loss": 0.3377, + "step": 43614 + }, + { + "epoch": 0.8097340368256702, + "grad_norm": 0.3002816438674927, + "learning_rate": 1.733734153234271e-06, + "loss": 0.2701, + "step": 43616 + }, + { + "epoch": 0.8097711669630889, + "grad_norm": 0.26816660165786743, + "learning_rate": 1.7330777658939491e-06, + "loss": 0.3328, + "step": 43618 + }, + { + "epoch": 0.8098082971005076, + "grad_norm": 0.5631070733070374, + "learning_rate": 1.732421491041678e-06, + "loss": 0.3373, + "step": 43620 + }, + { + "epoch": 0.8098454272379262, + "grad_norm": 0.20925818383693695, + "learning_rate": 1.7317653286863833e-06, + "loss": 0.2821, + "step": 43622 + }, + { + "epoch": 0.8098825573753449, + "grad_norm": 0.43348854780197144, + "learning_rate": 1.7311092788369977e-06, + "loss": 0.5323, + "step": 43624 + }, + { + "epoch": 0.8099196875127634, + "grad_norm": 0.8420249819755554, + "learning_rate": 1.730453341502445e-06, + "loss": 0.1256, + "step": 43626 + }, + { + "epoch": 0.8099568176501821, + "grad_norm": 0.3945586085319519, + "learning_rate": 1.7297975166916537e-06, + "loss": 0.1872, + "step": 43628 + }, + { + "epoch": 0.8099939477876008, + "grad_norm": 0.47009968757629395, + "learning_rate": 1.7291418044135445e-06, + "loss": 0.2801, + "step": 43630 + }, + { + "epoch": 0.8100310779250194, + "grad_norm": 0.5788520574569702, + "learning_rate": 1.7284862046770444e-06, + "loss": 0.4035, + "step": 43632 + }, + { + "epoch": 0.8100682080624381, + "grad_norm": 0.45796093344688416, + "learning_rate": 1.7278307174910713e-06, + "loss": 0.1926, + "step": 43634 + }, + { + "epoch": 0.8101053381998566, + "grad_norm": 0.3990139961242676, + "learning_rate": 1.727175342864542e-06, + "loss": 0.2553, + "step": 43636 + }, + { + "epoch": 0.8101424683372753, + "grad_norm": 0.3560784161090851, + "learning_rate": 1.7265200808063753e-06, + "loss": 0.3187, + "step": 43638 + }, + { + "epoch": 0.810179598474694, + "grad_norm": 0.41287750005722046, + "learning_rate": 1.7258649313254894e-06, + "loss": 0.4058, + "step": 43640 + }, + { + "epoch": 0.8102167286121126, + "grad_norm": 0.37703830003738403, + "learning_rate": 1.725209894430797e-06, + "loss": 0.1721, + "step": 43642 + }, + { + "epoch": 0.8102538587495313, + "grad_norm": 0.45382794737815857, + "learning_rate": 1.7245549701312125e-06, + "loss": 0.2617, + "step": 43644 + }, + { + "epoch": 0.8102909888869498, + "grad_norm": 0.408495157957077, + "learning_rate": 1.7239001584356497e-06, + "loss": 0.2693, + "step": 43646 + }, + { + "epoch": 0.8103281190243685, + "grad_norm": 0.25961384177207947, + "learning_rate": 1.7232454593530134e-06, + "loss": 0.3486, + "step": 43648 + }, + { + "epoch": 0.8103652491617871, + "grad_norm": 0.25137338042259216, + "learning_rate": 1.7225908728922136e-06, + "loss": 0.2948, + "step": 43650 + }, + { + "epoch": 0.8104023792992058, + "grad_norm": 0.4384453296661377, + "learning_rate": 1.7219363990621595e-06, + "loss": 0.2747, + "step": 43652 + }, + { + "epoch": 0.8104395094366245, + "grad_norm": 0.46359142661094666, + "learning_rate": 1.7212820378717577e-06, + "loss": 0.272, + "step": 43654 + }, + { + "epoch": 0.810476639574043, + "grad_norm": 0.4201693534851074, + "learning_rate": 1.7206277893299073e-06, + "loss": 0.1944, + "step": 43656 + }, + { + "epoch": 0.8105137697114617, + "grad_norm": 0.4915314316749573, + "learning_rate": 1.7199736534455146e-06, + "loss": 0.3, + "step": 43658 + }, + { + "epoch": 0.8105508998488803, + "grad_norm": 0.42742177844047546, + "learning_rate": 1.7193196302274773e-06, + "loss": 0.2002, + "step": 43660 + }, + { + "epoch": 0.810588029986299, + "grad_norm": 0.29782113432884216, + "learning_rate": 1.7186657196846968e-06, + "loss": 0.3463, + "step": 43662 + }, + { + "epoch": 0.8106251601237177, + "grad_norm": 0.2809637188911438, + "learning_rate": 1.7180119218260694e-06, + "loss": 0.2557, + "step": 43664 + }, + { + "epoch": 0.8106622902611362, + "grad_norm": 0.45260393619537354, + "learning_rate": 1.7173582366604923e-06, + "loss": 0.2476, + "step": 43666 + }, + { + "epoch": 0.8106994203985549, + "grad_norm": 0.3119635283946991, + "learning_rate": 1.716704664196861e-06, + "loss": 0.4167, + "step": 43668 + }, + { + "epoch": 0.8107365505359735, + "grad_norm": 0.2795673906803131, + "learning_rate": 1.7160512044440704e-06, + "loss": 0.1192, + "step": 43670 + }, + { + "epoch": 0.8107736806733922, + "grad_norm": 0.37746718525886536, + "learning_rate": 1.7153978574110063e-06, + "loss": 0.206, + "step": 43672 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.28515276312828064, + "learning_rate": 1.7147446231065623e-06, + "loss": 0.2016, + "step": 43674 + }, + { + "epoch": 0.8108479409482294, + "grad_norm": 0.48526784777641296, + "learning_rate": 1.7140915015396299e-06, + "loss": 0.2982, + "step": 43676 + }, + { + "epoch": 0.8108850710856481, + "grad_norm": 0.24649102985858917, + "learning_rate": 1.7134384927190895e-06, + "loss": 0.2257, + "step": 43678 + }, + { + "epoch": 0.8109222012230667, + "grad_norm": 0.7558265924453735, + "learning_rate": 1.7127855966538309e-06, + "loss": 0.2141, + "step": 43680 + }, + { + "epoch": 0.8109593313604854, + "grad_norm": 0.29816606640815735, + "learning_rate": 1.7121328133527371e-06, + "loss": 0.1889, + "step": 43682 + }, + { + "epoch": 0.810996461497904, + "grad_norm": 0.5247555375099182, + "learning_rate": 1.711480142824693e-06, + "loss": 0.5191, + "step": 43684 + }, + { + "epoch": 0.8110335916353226, + "grad_norm": 0.3596543073654175, + "learning_rate": 1.7108275850785737e-06, + "loss": 0.1118, + "step": 43686 + }, + { + "epoch": 0.8110707217727413, + "grad_norm": 0.332597017288208, + "learning_rate": 1.7101751401232636e-06, + "loss": 0.2631, + "step": 43688 + }, + { + "epoch": 0.8111078519101599, + "grad_norm": 0.31524085998535156, + "learning_rate": 1.7095228079676385e-06, + "loss": 0.2453, + "step": 43690 + }, + { + "epoch": 0.8111449820475786, + "grad_norm": 0.5466545820236206, + "learning_rate": 1.7088705886205748e-06, + "loss": 0.2723, + "step": 43692 + }, + { + "epoch": 0.8111821121849973, + "grad_norm": 0.37573733925819397, + "learning_rate": 1.708218482090951e-06, + "loss": 0.2743, + "step": 43694 + }, + { + "epoch": 0.8112192423224158, + "grad_norm": 0.32732468843460083, + "learning_rate": 1.7075664883876331e-06, + "loss": 0.3459, + "step": 43696 + }, + { + "epoch": 0.8112563724598345, + "grad_norm": 0.4555458724498749, + "learning_rate": 1.7069146075194998e-06, + "loss": 0.1121, + "step": 43698 + }, + { + "epoch": 0.8112935025972531, + "grad_norm": 0.4011319875717163, + "learning_rate": 1.706262839495415e-06, + "loss": 0.1717, + "step": 43700 + }, + { + "epoch": 0.8113306327346718, + "grad_norm": 0.29051095247268677, + "learning_rate": 1.7056111843242507e-06, + "loss": 0.1859, + "step": 43702 + }, + { + "epoch": 0.8113677628720903, + "grad_norm": 1.0778127908706665, + "learning_rate": 1.704959642014874e-06, + "loss": 0.5908, + "step": 43704 + }, + { + "epoch": 0.811404893009509, + "grad_norm": 0.5177686214447021, + "learning_rate": 1.7043082125761502e-06, + "loss": 0.3489, + "step": 43706 + }, + { + "epoch": 0.8114420231469277, + "grad_norm": 0.3519238829612732, + "learning_rate": 1.7036568960169443e-06, + "loss": 0.3251, + "step": 43708 + }, + { + "epoch": 0.8114791532843463, + "grad_norm": 0.3471061885356903, + "learning_rate": 1.7030056923461158e-06, + "loss": 0.14, + "step": 43710 + }, + { + "epoch": 0.811516283421765, + "grad_norm": 0.4146586060523987, + "learning_rate": 1.702354601572528e-06, + "loss": 0.333, + "step": 43712 + }, + { + "epoch": 0.8115534135591835, + "grad_norm": 0.305102676153183, + "learning_rate": 1.7017036237050387e-06, + "loss": 0.2562, + "step": 43714 + }, + { + "epoch": 0.8115905436966022, + "grad_norm": 0.32578641176223755, + "learning_rate": 1.7010527587525084e-06, + "loss": 0.2171, + "step": 43716 + }, + { + "epoch": 0.8116276738340209, + "grad_norm": 0.45331937074661255, + "learning_rate": 1.7004020067237925e-06, + "loss": 0.2093, + "step": 43718 + }, + { + "epoch": 0.8116648039714395, + "grad_norm": 0.3839343786239624, + "learning_rate": 1.6997513676277445e-06, + "loss": 0.1565, + "step": 43720 + }, + { + "epoch": 0.8117019341088582, + "grad_norm": 0.44491147994995117, + "learning_rate": 1.699100841473219e-06, + "loss": 0.4958, + "step": 43722 + }, + { + "epoch": 0.8117390642462767, + "grad_norm": 0.2750934064388275, + "learning_rate": 1.698450428269065e-06, + "loss": 0.2854, + "step": 43724 + }, + { + "epoch": 0.8117761943836954, + "grad_norm": 0.508905827999115, + "learning_rate": 1.6978001280241353e-06, + "loss": 0.2595, + "step": 43726 + }, + { + "epoch": 0.8118133245211141, + "grad_norm": 0.42436084151268005, + "learning_rate": 1.697149940747278e-06, + "loss": 0.3276, + "step": 43728 + }, + { + "epoch": 0.8118504546585327, + "grad_norm": 0.25700637698173523, + "learning_rate": 1.6964998664473388e-06, + "loss": 0.2736, + "step": 43730 + }, + { + "epoch": 0.8118875847959514, + "grad_norm": 0.32283902168273926, + "learning_rate": 1.6958499051331657e-06, + "loss": 0.2861, + "step": 43732 + }, + { + "epoch": 0.8119247149333699, + "grad_norm": 0.5871114730834961, + "learning_rate": 1.695200056813604e-06, + "loss": 0.2956, + "step": 43734 + }, + { + "epoch": 0.8119618450707886, + "grad_norm": 0.37595927715301514, + "learning_rate": 1.6945503214974912e-06, + "loss": 0.1454, + "step": 43736 + }, + { + "epoch": 0.8119989752082073, + "grad_norm": 0.5304111838340759, + "learning_rate": 1.6939006991936702e-06, + "loss": 0.4389, + "step": 43738 + }, + { + "epoch": 0.8120361053456259, + "grad_norm": 0.6789892911911011, + "learning_rate": 1.6932511899109837e-06, + "loss": 0.2895, + "step": 43740 + }, + { + "epoch": 0.8120732354830446, + "grad_norm": 0.4144756495952606, + "learning_rate": 1.692601793658265e-06, + "loss": 0.355, + "step": 43742 + }, + { + "epoch": 0.8121103656204631, + "grad_norm": 0.3600856363773346, + "learning_rate": 1.6919525104443513e-06, + "loss": 0.1588, + "step": 43744 + }, + { + "epoch": 0.8121474957578818, + "grad_norm": 0.33181440830230713, + "learning_rate": 1.691303340278082e-06, + "loss": 0.3635, + "step": 43746 + }, + { + "epoch": 0.8121846258953005, + "grad_norm": 0.2566782236099243, + "learning_rate": 1.6906542831682837e-06, + "loss": 0.1676, + "step": 43748 + }, + { + "epoch": 0.8122217560327191, + "grad_norm": 0.4105609059333801, + "learning_rate": 1.6900053391237914e-06, + "loss": 0.2778, + "step": 43750 + }, + { + "epoch": 0.8122588861701378, + "grad_norm": 0.46978235244750977, + "learning_rate": 1.6893565081534359e-06, + "loss": 0.5005, + "step": 43752 + }, + { + "epoch": 0.8122960163075563, + "grad_norm": 0.2576170861721039, + "learning_rate": 1.6887077902660443e-06, + "loss": 0.3812, + "step": 43754 + }, + { + "epoch": 0.812333146444975, + "grad_norm": 0.4677191972732544, + "learning_rate": 1.6880591854704443e-06, + "loss": 0.0429, + "step": 43756 + }, + { + "epoch": 0.8123702765823936, + "grad_norm": 0.3043809235095978, + "learning_rate": 1.6874106937754642e-06, + "loss": 0.2045, + "step": 43758 + }, + { + "epoch": 0.8124074067198123, + "grad_norm": 0.44103580713272095, + "learning_rate": 1.6867623151899237e-06, + "loss": 0.3023, + "step": 43760 + }, + { + "epoch": 0.812444536857231, + "grad_norm": 0.49585819244384766, + "learning_rate": 1.6861140497226492e-06, + "loss": 0.1249, + "step": 43762 + }, + { + "epoch": 0.8124816669946495, + "grad_norm": 0.17080366611480713, + "learning_rate": 1.6854658973824566e-06, + "loss": 0.2339, + "step": 43764 + }, + { + "epoch": 0.8125187971320682, + "grad_norm": 0.3271997272968292, + "learning_rate": 1.6848178581781682e-06, + "loss": 0.2231, + "step": 43766 + }, + { + "epoch": 0.8125559272694868, + "grad_norm": 1.031008005142212, + "learning_rate": 1.6841699321186023e-06, + "loss": 0.1958, + "step": 43768 + }, + { + "epoch": 0.8125930574069055, + "grad_norm": 0.5688890814781189, + "learning_rate": 1.683522119212575e-06, + "loss": 0.2949, + "step": 43770 + }, + { + "epoch": 0.8126301875443241, + "grad_norm": 0.4250592291355133, + "learning_rate": 1.682874419468903e-06, + "loss": 0.1574, + "step": 43772 + }, + { + "epoch": 0.8126673176817427, + "grad_norm": 0.34404996037483215, + "learning_rate": 1.6822268328963954e-06, + "loss": 0.5192, + "step": 43774 + }, + { + "epoch": 0.8127044478191614, + "grad_norm": 0.5535452961921692, + "learning_rate": 1.6815793595038655e-06, + "loss": 0.3362, + "step": 43776 + }, + { + "epoch": 0.81274157795658, + "grad_norm": 0.41953033208847046, + "learning_rate": 1.6809319993001239e-06, + "loss": 0.3028, + "step": 43778 + }, + { + "epoch": 0.8127787080939987, + "grad_norm": 0.39952221512794495, + "learning_rate": 1.6802847522939814e-06, + "loss": 0.507, + "step": 43780 + }, + { + "epoch": 0.8128158382314173, + "grad_norm": 0.4492514729499817, + "learning_rate": 1.679637618494242e-06, + "loss": 0.1695, + "step": 43782 + }, + { + "epoch": 0.8128529683688359, + "grad_norm": 0.5864405035972595, + "learning_rate": 1.6789905979097132e-06, + "loss": 0.2931, + "step": 43784 + }, + { + "epoch": 0.8128900985062546, + "grad_norm": 0.374038964509964, + "learning_rate": 1.6783436905491969e-06, + "loss": 0.2267, + "step": 43786 + }, + { + "epoch": 0.8129272286436732, + "grad_norm": 0.5111145973205566, + "learning_rate": 1.6776968964214957e-06, + "loss": 0.3198, + "step": 43788 + }, + { + "epoch": 0.8129643587810919, + "grad_norm": 0.20995105803012848, + "learning_rate": 1.6770502155354118e-06, + "loss": 0.2311, + "step": 43790 + }, + { + "epoch": 0.8130014889185105, + "grad_norm": 0.23587463796138763, + "learning_rate": 1.6764036478997448e-06, + "loss": 0.139, + "step": 43792 + }, + { + "epoch": 0.8130386190559291, + "grad_norm": 0.4376339912414551, + "learning_rate": 1.6757571935232918e-06, + "loss": 0.383, + "step": 43794 + }, + { + "epoch": 0.8130757491933478, + "grad_norm": 0.4064905047416687, + "learning_rate": 1.6751108524148484e-06, + "loss": 0.3448, + "step": 43796 + }, + { + "epoch": 0.8131128793307664, + "grad_norm": 0.38001057505607605, + "learning_rate": 1.6744646245832142e-06, + "loss": 0.3068, + "step": 43798 + }, + { + "epoch": 0.813150009468185, + "grad_norm": 0.44383519887924194, + "learning_rate": 1.6738185100371763e-06, + "loss": 0.2787, + "step": 43800 + }, + { + "epoch": 0.8131871396056036, + "grad_norm": 0.36150261759757996, + "learning_rate": 1.67317250878553e-06, + "loss": 0.2444, + "step": 43802 + }, + { + "epoch": 0.8132242697430223, + "grad_norm": 0.5254998803138733, + "learning_rate": 1.6725266208370628e-06, + "loss": 0.1087, + "step": 43804 + }, + { + "epoch": 0.813261399880441, + "grad_norm": 0.4981311559677124, + "learning_rate": 1.6718808462005632e-06, + "loss": 0.1902, + "step": 43806 + }, + { + "epoch": 0.8132985300178596, + "grad_norm": 0.4114186763763428, + "learning_rate": 1.6712351848848207e-06, + "loss": 0.2026, + "step": 43808 + }, + { + "epoch": 0.8133356601552783, + "grad_norm": 0.45485007762908936, + "learning_rate": 1.670589636898622e-06, + "loss": 0.4116, + "step": 43810 + }, + { + "epoch": 0.8133727902926968, + "grad_norm": 0.2727290093898773, + "learning_rate": 1.6699442022507462e-06, + "loss": 0.2806, + "step": 43812 + }, + { + "epoch": 0.8134099204301155, + "grad_norm": 0.4047304093837738, + "learning_rate": 1.6692988809499788e-06, + "loss": 0.2575, + "step": 43814 + }, + { + "epoch": 0.8134470505675342, + "grad_norm": 0.36853736639022827, + "learning_rate": 1.6686536730051006e-06, + "loss": 0.2622, + "step": 43816 + }, + { + "epoch": 0.8134841807049528, + "grad_norm": 0.3714786469936371, + "learning_rate": 1.66800857842489e-06, + "loss": 0.2092, + "step": 43818 + }, + { + "epoch": 0.8135213108423714, + "grad_norm": 0.4923979341983795, + "learning_rate": 1.6673635972181257e-06, + "loss": 0.2292, + "step": 43820 + }, + { + "epoch": 0.81355844097979, + "grad_norm": 0.26455625891685486, + "learning_rate": 1.666718729393586e-06, + "loss": 0.1163, + "step": 43822 + }, + { + "epoch": 0.8135955711172087, + "grad_norm": 0.3776501417160034, + "learning_rate": 1.6660739749600408e-06, + "loss": 0.2926, + "step": 43824 + }, + { + "epoch": 0.8136327012546274, + "grad_norm": 0.4441525638103485, + "learning_rate": 1.6654293339262684e-06, + "loss": 0.1847, + "step": 43826 + }, + { + "epoch": 0.813669831392046, + "grad_norm": 0.2903951406478882, + "learning_rate": 1.6647848063010364e-06, + "loss": 0.1942, + "step": 43828 + }, + { + "epoch": 0.8137069615294646, + "grad_norm": 0.2905699610710144, + "learning_rate": 1.6641403920931142e-06, + "loss": 0.2435, + "step": 43830 + }, + { + "epoch": 0.8137440916668832, + "grad_norm": 0.29813718795776367, + "learning_rate": 1.6634960913112742e-06, + "loss": 0.3317, + "step": 43832 + }, + { + "epoch": 0.8137812218043019, + "grad_norm": 0.4835970401763916, + "learning_rate": 1.6628519039642832e-06, + "loss": 0.3703, + "step": 43834 + }, + { + "epoch": 0.8138183519417206, + "grad_norm": 0.3298129439353943, + "learning_rate": 1.6622078300609035e-06, + "loss": 0.2678, + "step": 43836 + }, + { + "epoch": 0.8138554820791392, + "grad_norm": 0.4878776967525482, + "learning_rate": 1.6615638696099002e-06, + "loss": 0.1213, + "step": 43838 + }, + { + "epoch": 0.8138926122165578, + "grad_norm": 0.33900871872901917, + "learning_rate": 1.6609200226200362e-06, + "loss": 0.2325, + "step": 43840 + }, + { + "epoch": 0.8139297423539764, + "grad_norm": 0.4741169214248657, + "learning_rate": 1.6602762891000724e-06, + "loss": 0.5836, + "step": 43842 + }, + { + "epoch": 0.8139668724913951, + "grad_norm": 0.6076623201370239, + "learning_rate": 1.6596326690587695e-06, + "loss": 0.3045, + "step": 43844 + }, + { + "epoch": 0.8140040026288138, + "grad_norm": 0.3318552076816559, + "learning_rate": 1.6589891625048816e-06, + "loss": 0.2573, + "step": 43846 + }, + { + "epoch": 0.8140411327662324, + "grad_norm": 0.2661123275756836, + "learning_rate": 1.6583457694471705e-06, + "loss": 0.1295, + "step": 43848 + }, + { + "epoch": 0.814078262903651, + "grad_norm": 0.27453526854515076, + "learning_rate": 1.6577024898943838e-06, + "loss": 0.3613, + "step": 43850 + }, + { + "epoch": 0.8141153930410696, + "grad_norm": 0.353855699300766, + "learning_rate": 1.6570593238552779e-06, + "loss": 0.3577, + "step": 43852 + }, + { + "epoch": 0.8141525231784883, + "grad_norm": 0.31913813948631287, + "learning_rate": 1.6564162713386056e-06, + "loss": 0.3685, + "step": 43854 + }, + { + "epoch": 0.8141896533159069, + "grad_norm": 0.3153173625469208, + "learning_rate": 1.6557733323531155e-06, + "loss": 0.2144, + "step": 43856 + }, + { + "epoch": 0.8142267834533256, + "grad_norm": 0.2876848578453064, + "learning_rate": 1.6551305069075564e-06, + "loss": 0.2075, + "step": 43858 + }, + { + "epoch": 0.8142639135907442, + "grad_norm": 0.4811458885669708, + "learning_rate": 1.6544877950106774e-06, + "loss": 0.2782, + "step": 43860 + }, + { + "epoch": 0.8143010437281628, + "grad_norm": 0.29733672738075256, + "learning_rate": 1.6538451966712198e-06, + "loss": 0.4645, + "step": 43862 + }, + { + "epoch": 0.8143381738655815, + "grad_norm": 0.3425471782684326, + "learning_rate": 1.65320271189793e-06, + "loss": 0.2849, + "step": 43864 + }, + { + "epoch": 0.8143753040030001, + "grad_norm": 0.38238421082496643, + "learning_rate": 1.6525603406995516e-06, + "loss": 0.2479, + "step": 43866 + }, + { + "epoch": 0.8144124341404188, + "grad_norm": 0.5536106824874878, + "learning_rate": 1.6519180830848213e-06, + "loss": 0.1826, + "step": 43868 + }, + { + "epoch": 0.8144495642778374, + "grad_norm": 0.350225031375885, + "learning_rate": 1.65127593906248e-06, + "loss": 0.2923, + "step": 43870 + }, + { + "epoch": 0.814486694415256, + "grad_norm": 0.2707739770412445, + "learning_rate": 1.6506339086412682e-06, + "loss": 0.3373, + "step": 43872 + }, + { + "epoch": 0.8145238245526747, + "grad_norm": 0.4076531231403351, + "learning_rate": 1.649991991829918e-06, + "loss": 0.2416, + "step": 43874 + }, + { + "epoch": 0.8145609546900933, + "grad_norm": 0.3382411003112793, + "learning_rate": 1.6493501886371655e-06, + "loss": 0.2422, + "step": 43876 + }, + { + "epoch": 0.814598084827512, + "grad_norm": 0.4349682927131653, + "learning_rate": 1.6487084990717428e-06, + "loss": 0.2905, + "step": 43878 + }, + { + "epoch": 0.8146352149649306, + "grad_norm": 0.3168734908103943, + "learning_rate": 1.648066923142383e-06, + "loss": 0.3292, + "step": 43880 + }, + { + "epoch": 0.8146723451023492, + "grad_norm": 0.46333861351013184, + "learning_rate": 1.6474254608578156e-06, + "loss": 0.2352, + "step": 43882 + }, + { + "epoch": 0.8147094752397679, + "grad_norm": 0.3276359736919403, + "learning_rate": 1.6467841122267692e-06, + "loss": 0.1302, + "step": 43884 + }, + { + "epoch": 0.8147466053771865, + "grad_norm": 0.3139045834541321, + "learning_rate": 1.6461428772579724e-06, + "loss": 0.129, + "step": 43886 + }, + { + "epoch": 0.8147837355146051, + "grad_norm": 0.4689866602420807, + "learning_rate": 1.6455017559601483e-06, + "loss": 0.3683, + "step": 43888 + }, + { + "epoch": 0.8148208656520238, + "grad_norm": 0.44363242387771606, + "learning_rate": 1.6448607483420175e-06, + "loss": 0.3406, + "step": 43890 + }, + { + "epoch": 0.8148579957894424, + "grad_norm": 0.41888853907585144, + "learning_rate": 1.6442198544123067e-06, + "loss": 0.432, + "step": 43892 + }, + { + "epoch": 0.8148951259268611, + "grad_norm": 0.5602725744247437, + "learning_rate": 1.6435790741797342e-06, + "loss": 0.1871, + "step": 43894 + }, + { + "epoch": 0.8149322560642797, + "grad_norm": 0.4456685185432434, + "learning_rate": 1.6429384076530208e-06, + "loss": 0.3115, + "step": 43896 + }, + { + "epoch": 0.8149693862016983, + "grad_norm": 0.35547104477882385, + "learning_rate": 1.6422978548408852e-06, + "loss": 0.1889, + "step": 43898 + }, + { + "epoch": 0.815006516339117, + "grad_norm": 0.539585530757904, + "learning_rate": 1.6416574157520394e-06, + "loss": 0.0922, + "step": 43900 + }, + { + "epoch": 0.8150436464765356, + "grad_norm": 0.3318989872932434, + "learning_rate": 1.6410170903952006e-06, + "loss": 0.1243, + "step": 43902 + }, + { + "epoch": 0.8150807766139543, + "grad_norm": 0.3800659477710724, + "learning_rate": 1.6403768787790797e-06, + "loss": 0.2037, + "step": 43904 + }, + { + "epoch": 0.8151179067513729, + "grad_norm": 0.5128672122955322, + "learning_rate": 1.639736780912391e-06, + "loss": 0.3348, + "step": 43906 + }, + { + "epoch": 0.8151550368887915, + "grad_norm": 0.4685610830783844, + "learning_rate": 1.639096796803844e-06, + "loss": 0.2097, + "step": 43908 + }, + { + "epoch": 0.8151921670262101, + "grad_norm": 0.23054294288158417, + "learning_rate": 1.6384569264621474e-06, + "loss": 0.2733, + "step": 43910 + }, + { + "epoch": 0.8152292971636288, + "grad_norm": 0.5500622391700745, + "learning_rate": 1.6378171698960022e-06, + "loss": 0.2587, + "step": 43912 + }, + { + "epoch": 0.8152664273010475, + "grad_norm": 0.326969712972641, + "learning_rate": 1.6371775271141187e-06, + "loss": 0.3395, + "step": 43914 + }, + { + "epoch": 0.815303557438466, + "grad_norm": 0.4631718695163727, + "learning_rate": 1.6365379981251994e-06, + "loss": 0.3009, + "step": 43916 + }, + { + "epoch": 0.8153406875758847, + "grad_norm": 0.35248807072639465, + "learning_rate": 1.6358985829379459e-06, + "loss": 0.4968, + "step": 43918 + }, + { + "epoch": 0.8153778177133033, + "grad_norm": 0.24632041156291962, + "learning_rate": 1.6352592815610603e-06, + "loss": 0.2135, + "step": 43920 + }, + { + "epoch": 0.815414947850722, + "grad_norm": 0.5282867550849915, + "learning_rate": 1.6346200940032397e-06, + "loss": 0.4033, + "step": 43922 + }, + { + "epoch": 0.8154520779881407, + "grad_norm": 0.5651643872261047, + "learning_rate": 1.6339810202731854e-06, + "loss": 0.3463, + "step": 43924 + }, + { + "epoch": 0.8154892081255593, + "grad_norm": 0.45410507917404175, + "learning_rate": 1.6333420603795868e-06, + "loss": 0.1565, + "step": 43926 + }, + { + "epoch": 0.8155263382629779, + "grad_norm": 0.3409724831581116, + "learning_rate": 1.6327032143311427e-06, + "loss": 0.2562, + "step": 43928 + }, + { + "epoch": 0.8155634684003965, + "grad_norm": 0.50171959400177, + "learning_rate": 1.6320644821365471e-06, + "loss": 0.1365, + "step": 43930 + }, + { + "epoch": 0.8156005985378152, + "grad_norm": 0.3927929103374481, + "learning_rate": 1.6314258638044866e-06, + "loss": 0.3394, + "step": 43932 + }, + { + "epoch": 0.8156377286752339, + "grad_norm": 0.4505486786365509, + "learning_rate": 1.630787359343654e-06, + "loss": 0.2718, + "step": 43934 + }, + { + "epoch": 0.8156748588126524, + "grad_norm": 0.6248704791069031, + "learning_rate": 1.6301489687627382e-06, + "loss": 0.2119, + "step": 43936 + }, + { + "epoch": 0.8157119889500711, + "grad_norm": 0.3558638095855713, + "learning_rate": 1.6295106920704228e-06, + "loss": 0.2582, + "step": 43938 + }, + { + "epoch": 0.8157491190874897, + "grad_norm": 0.37645500898361206, + "learning_rate": 1.6288725292753937e-06, + "loss": 0.4796, + "step": 43940 + }, + { + "epoch": 0.8157862492249084, + "grad_norm": 0.3724375367164612, + "learning_rate": 1.6282344803863348e-06, + "loss": 0.3206, + "step": 43942 + }, + { + "epoch": 0.8158233793623271, + "grad_norm": 0.4562768340110779, + "learning_rate": 1.6275965454119292e-06, + "loss": 0.3631, + "step": 43944 + }, + { + "epoch": 0.8158605094997456, + "grad_norm": 0.4188615679740906, + "learning_rate": 1.6269587243608553e-06, + "loss": 0.2426, + "step": 43946 + }, + { + "epoch": 0.8158976396371643, + "grad_norm": 0.5399248003959656, + "learning_rate": 1.6263210172417954e-06, + "loss": 0.2613, + "step": 43948 + }, + { + "epoch": 0.8159347697745829, + "grad_norm": 0.4648520350456238, + "learning_rate": 1.6256834240634222e-06, + "loss": 0.3088, + "step": 43950 + }, + { + "epoch": 0.8159718999120016, + "grad_norm": 0.4649866819381714, + "learning_rate": 1.625045944834417e-06, + "loss": 0.1763, + "step": 43952 + }, + { + "epoch": 0.8160090300494202, + "grad_norm": 0.5629376173019409, + "learning_rate": 1.624408579563448e-06, + "loss": 0.3001, + "step": 43954 + }, + { + "epoch": 0.8160461601868388, + "grad_norm": 0.5710993409156799, + "learning_rate": 1.6237713282591893e-06, + "loss": 0.4661, + "step": 43956 + }, + { + "epoch": 0.8160832903242575, + "grad_norm": 0.3183075189590454, + "learning_rate": 1.6231341909303133e-06, + "loss": 0.1294, + "step": 43958 + }, + { + "epoch": 0.8161204204616761, + "grad_norm": 0.5468972325325012, + "learning_rate": 1.6224971675854928e-06, + "loss": 0.3241, + "step": 43960 + }, + { + "epoch": 0.8161575505990948, + "grad_norm": 0.4538935422897339, + "learning_rate": 1.6218602582333886e-06, + "loss": 0.3479, + "step": 43962 + }, + { + "epoch": 0.8161946807365134, + "grad_norm": 0.3551821708679199, + "learning_rate": 1.6212234628826717e-06, + "loss": 0.1079, + "step": 43964 + }, + { + "epoch": 0.816231810873932, + "grad_norm": 0.3374553918838501, + "learning_rate": 1.6205867815420062e-06, + "loss": 0.3534, + "step": 43966 + }, + { + "epoch": 0.8162689410113507, + "grad_norm": 0.5981477499008179, + "learning_rate": 1.6199502142200551e-06, + "loss": 0.2684, + "step": 43968 + }, + { + "epoch": 0.8163060711487693, + "grad_norm": 0.42788711190223694, + "learning_rate": 1.6193137609254829e-06, + "loss": 0.1529, + "step": 43970 + }, + { + "epoch": 0.816343201286188, + "grad_norm": 0.3363659977912903, + "learning_rate": 1.6186774216669455e-06, + "loss": 0.2782, + "step": 43972 + }, + { + "epoch": 0.8163803314236066, + "grad_norm": 0.32389792799949646, + "learning_rate": 1.6180411964531052e-06, + "loss": 0.1284, + "step": 43974 + }, + { + "epoch": 0.8164174615610252, + "grad_norm": 0.3659617006778717, + "learning_rate": 1.6174050852926148e-06, + "loss": 0.1553, + "step": 43976 + }, + { + "epoch": 0.8164545916984439, + "grad_norm": 0.4228678345680237, + "learning_rate": 1.6167690881941323e-06, + "loss": 0.232, + "step": 43978 + }, + { + "epoch": 0.8164917218358625, + "grad_norm": 0.4784436523914337, + "learning_rate": 1.6161332051663125e-06, + "loss": 0.3913, + "step": 43980 + }, + { + "epoch": 0.8165288519732812, + "grad_norm": 0.9528792500495911, + "learning_rate": 1.6154974362178067e-06, + "loss": 0.4612, + "step": 43982 + }, + { + "epoch": 0.8165659821106998, + "grad_norm": 0.4677400588989258, + "learning_rate": 1.6148617813572664e-06, + "loss": 0.1463, + "step": 43984 + }, + { + "epoch": 0.8166031122481184, + "grad_norm": 0.5206021070480347, + "learning_rate": 1.6142262405933428e-06, + "loss": 0.3774, + "step": 43986 + }, + { + "epoch": 0.8166402423855371, + "grad_norm": 0.43053147196769714, + "learning_rate": 1.61359081393468e-06, + "loss": 0.1857, + "step": 43988 + }, + { + "epoch": 0.8166773725229557, + "grad_norm": 0.40627163648605347, + "learning_rate": 1.6129555013899256e-06, + "loss": 0.2517, + "step": 43990 + }, + { + "epoch": 0.8167145026603744, + "grad_norm": 0.4479183852672577, + "learning_rate": 1.6123203029677247e-06, + "loss": 0.3315, + "step": 43992 + }, + { + "epoch": 0.816751632797793, + "grad_norm": 0.4351753294467926, + "learning_rate": 1.611685218676724e-06, + "loss": 0.3958, + "step": 43994 + }, + { + "epoch": 0.8167887629352116, + "grad_norm": 0.36620035767555237, + "learning_rate": 1.6110502485255575e-06, + "loss": 0.236, + "step": 43996 + }, + { + "epoch": 0.8168258930726303, + "grad_norm": 0.24258331954479218, + "learning_rate": 1.6104153925228728e-06, + "loss": 0.2845, + "step": 43998 + }, + { + "epoch": 0.8168630232100489, + "grad_norm": 0.3536120355129242, + "learning_rate": 1.6097806506773027e-06, + "loss": 0.2841, + "step": 44000 + }, + { + "epoch": 0.8169001533474676, + "grad_norm": 0.31925177574157715, + "learning_rate": 1.6091460229974853e-06, + "loss": 0.3033, + "step": 44002 + }, + { + "epoch": 0.8169372834848861, + "grad_norm": 0.31828728318214417, + "learning_rate": 1.6085115094920578e-06, + "loss": 0.2449, + "step": 44004 + }, + { + "epoch": 0.8169744136223048, + "grad_norm": 0.32721397280693054, + "learning_rate": 1.6078771101696534e-06, + "loss": 0.1434, + "step": 44006 + }, + { + "epoch": 0.8170115437597234, + "grad_norm": 0.33132708072662354, + "learning_rate": 1.6072428250389028e-06, + "loss": 0.3186, + "step": 44008 + }, + { + "epoch": 0.8170486738971421, + "grad_norm": 0.5203912258148193, + "learning_rate": 1.6066086541084391e-06, + "loss": 0.208, + "step": 44010 + }, + { + "epoch": 0.8170858040345608, + "grad_norm": 0.34158751368522644, + "learning_rate": 1.6059745973868933e-06, + "loss": 0.2157, + "step": 44012 + }, + { + "epoch": 0.8171229341719793, + "grad_norm": 0.5236937403678894, + "learning_rate": 1.6053406548828876e-06, + "loss": 0.3309, + "step": 44014 + }, + { + "epoch": 0.817160064309398, + "grad_norm": 0.3399065434932709, + "learning_rate": 1.6047068266050524e-06, + "loss": 0.1099, + "step": 44016 + }, + { + "epoch": 0.8171971944468166, + "grad_norm": 0.19152748584747314, + "learning_rate": 1.604073112562009e-06, + "loss": 0.1964, + "step": 44018 + }, + { + "epoch": 0.8172343245842353, + "grad_norm": 0.43222638964653015, + "learning_rate": 1.6034395127623815e-06, + "loss": 0.3053, + "step": 44020 + }, + { + "epoch": 0.817271454721654, + "grad_norm": 0.4710613191127777, + "learning_rate": 1.6028060272147915e-06, + "loss": 0.172, + "step": 44022 + }, + { + "epoch": 0.8173085848590725, + "grad_norm": 0.23491613566875458, + "learning_rate": 1.6021726559278616e-06, + "loss": 0.1829, + "step": 44024 + }, + { + "epoch": 0.8173457149964912, + "grad_norm": 0.8284361362457275, + "learning_rate": 1.6015393989102047e-06, + "loss": 0.298, + "step": 44026 + }, + { + "epoch": 0.8173828451339098, + "grad_norm": 0.3189184069633484, + "learning_rate": 1.6009062561704402e-06, + "loss": 0.0867, + "step": 44028 + }, + { + "epoch": 0.8174199752713285, + "grad_norm": 0.47937431931495667, + "learning_rate": 1.600273227717183e-06, + "loss": 0.2103, + "step": 44030 + }, + { + "epoch": 0.8174571054087472, + "grad_norm": 0.5728077292442322, + "learning_rate": 1.5996403135590478e-06, + "loss": 0.0759, + "step": 44032 + }, + { + "epoch": 0.8174942355461657, + "grad_norm": 0.5197650194168091, + "learning_rate": 1.5990075137046479e-06, + "loss": 0.4132, + "step": 44034 + }, + { + "epoch": 0.8175313656835844, + "grad_norm": 0.37295806407928467, + "learning_rate": 1.5983748281625899e-06, + "loss": 0.3885, + "step": 44036 + }, + { + "epoch": 0.817568495821003, + "grad_norm": 0.3562738597393036, + "learning_rate": 1.5977422569414868e-06, + "loss": 0.184, + "step": 44038 + }, + { + "epoch": 0.8176056259584217, + "grad_norm": 0.3940262198448181, + "learning_rate": 1.5971098000499418e-06, + "loss": 0.1789, + "step": 44040 + }, + { + "epoch": 0.8176427560958404, + "grad_norm": 0.3284790813922882, + "learning_rate": 1.5964774574965624e-06, + "loss": 0.3115, + "step": 44042 + }, + { + "epoch": 0.8176798862332589, + "grad_norm": 0.4503403604030609, + "learning_rate": 1.595845229289954e-06, + "loss": 0.3403, + "step": 44044 + }, + { + "epoch": 0.8177170163706776, + "grad_norm": 0.4714115262031555, + "learning_rate": 1.5952131154387184e-06, + "loss": 0.1564, + "step": 44046 + }, + { + "epoch": 0.8177541465080962, + "grad_norm": 0.36949166655540466, + "learning_rate": 1.5945811159514568e-06, + "loss": 0.2124, + "step": 44048 + }, + { + "epoch": 0.8177912766455149, + "grad_norm": 0.516167402267456, + "learning_rate": 1.5939492308367722e-06, + "loss": 0.2565, + "step": 44050 + }, + { + "epoch": 0.8178284067829336, + "grad_norm": 0.46870315074920654, + "learning_rate": 1.5933174601032564e-06, + "loss": 0.5464, + "step": 44052 + }, + { + "epoch": 0.8178655369203521, + "grad_norm": 1.149971604347229, + "learning_rate": 1.5926858037595095e-06, + "loss": 0.3273, + "step": 44054 + }, + { + "epoch": 0.8179026670577708, + "grad_norm": 0.40364304184913635, + "learning_rate": 1.5920542618141278e-06, + "loss": 0.3236, + "step": 44056 + }, + { + "epoch": 0.8179397971951894, + "grad_norm": 0.41620972752571106, + "learning_rate": 1.591422834275701e-06, + "loss": 0.1287, + "step": 44058 + }, + { + "epoch": 0.8179769273326081, + "grad_norm": 0.2796846628189087, + "learning_rate": 1.5907915211528225e-06, + "loss": 0.3472, + "step": 44060 + }, + { + "epoch": 0.8180140574700266, + "grad_norm": 0.3278990089893341, + "learning_rate": 1.5901603224540851e-06, + "loss": 0.3872, + "step": 44062 + }, + { + "epoch": 0.8180511876074453, + "grad_norm": 0.2739548981189728, + "learning_rate": 1.5895292381880735e-06, + "loss": 0.2449, + "step": 44064 + }, + { + "epoch": 0.818088317744864, + "grad_norm": 0.360414057970047, + "learning_rate": 1.5888982683633759e-06, + "loss": 0.1413, + "step": 44066 + }, + { + "epoch": 0.8181254478822826, + "grad_norm": 0.40283066034317017, + "learning_rate": 1.588267412988579e-06, + "loss": 0.1498, + "step": 44068 + }, + { + "epoch": 0.8181625780197013, + "grad_norm": 0.316351979970932, + "learning_rate": 1.5876366720722668e-06, + "loss": 0.3619, + "step": 44070 + }, + { + "epoch": 0.8181997081571198, + "grad_norm": 0.42310062050819397, + "learning_rate": 1.5870060456230218e-06, + "loss": 0.1621, + "step": 44072 + }, + { + "epoch": 0.8182368382945385, + "grad_norm": 0.48297426104545593, + "learning_rate": 1.5863755336494268e-06, + "loss": 0.3274, + "step": 44074 + }, + { + "epoch": 0.8182739684319572, + "grad_norm": 0.908526599407196, + "learning_rate": 1.5857451361600561e-06, + "loss": 0.4363, + "step": 44076 + }, + { + "epoch": 0.8183110985693758, + "grad_norm": 0.2524868845939636, + "learning_rate": 1.5851148531634942e-06, + "loss": 0.1849, + "step": 44078 + }, + { + "epoch": 0.8183482287067945, + "grad_norm": 0.6079502105712891, + "learning_rate": 1.5844846846683104e-06, + "loss": 0.2352, + "step": 44080 + }, + { + "epoch": 0.818385358844213, + "grad_norm": 0.7709314823150635, + "learning_rate": 1.5838546306830827e-06, + "loss": 0.2474, + "step": 44082 + }, + { + "epoch": 0.8184224889816317, + "grad_norm": 0.35421517491340637, + "learning_rate": 1.583224691216384e-06, + "loss": 0.216, + "step": 44084 + }, + { + "epoch": 0.8184596191190504, + "grad_norm": 0.44044116139411926, + "learning_rate": 1.5825948662767866e-06, + "loss": 0.2545, + "step": 44086 + }, + { + "epoch": 0.818496749256469, + "grad_norm": 0.3081692159175873, + "learning_rate": 1.5819651558728621e-06, + "loss": 0.1488, + "step": 44088 + }, + { + "epoch": 0.8185338793938877, + "grad_norm": 0.33745357394218445, + "learning_rate": 1.5813355600131742e-06, + "loss": 0.2372, + "step": 44090 + }, + { + "epoch": 0.8185710095313062, + "grad_norm": 0.20291011035442352, + "learning_rate": 1.5807060787062933e-06, + "loss": 0.3586, + "step": 44092 + }, + { + "epoch": 0.8186081396687249, + "grad_norm": 0.5969129204750061, + "learning_rate": 1.580076711960783e-06, + "loss": 0.2437, + "step": 44094 + }, + { + "epoch": 0.8186452698061436, + "grad_norm": 0.49870145320892334, + "learning_rate": 1.5794474597852095e-06, + "loss": 0.2061, + "step": 44096 + }, + { + "epoch": 0.8186823999435622, + "grad_norm": 0.3558120131492615, + "learning_rate": 1.5788183221881347e-06, + "loss": 0.3857, + "step": 44098 + }, + { + "epoch": 0.8187195300809809, + "grad_norm": 0.5142087340354919, + "learning_rate": 1.5781892991781156e-06, + "loss": 0.4661, + "step": 44100 + }, + { + "epoch": 0.8187566602183994, + "grad_norm": 0.5399808287620544, + "learning_rate": 1.5775603907637171e-06, + "loss": 0.2665, + "step": 44102 + }, + { + "epoch": 0.8187937903558181, + "grad_norm": 0.2618241310119629, + "learning_rate": 1.5769315969534916e-06, + "loss": 0.1562, + "step": 44104 + }, + { + "epoch": 0.8188309204932367, + "grad_norm": 0.368053138256073, + "learning_rate": 1.5763029177559963e-06, + "loss": 0.3559, + "step": 44106 + }, + { + "epoch": 0.8188680506306554, + "grad_norm": 0.3717603087425232, + "learning_rate": 1.5756743531797857e-06, + "loss": 0.3437, + "step": 44108 + }, + { + "epoch": 0.8189051807680741, + "grad_norm": 0.21356190741062164, + "learning_rate": 1.5750459032334142e-06, + "loss": 0.2217, + "step": 44110 + }, + { + "epoch": 0.8189423109054926, + "grad_norm": 0.39795204997062683, + "learning_rate": 1.5744175679254347e-06, + "loss": 0.3126, + "step": 44112 + }, + { + "epoch": 0.8189794410429113, + "grad_norm": 0.37931033968925476, + "learning_rate": 1.5737893472643927e-06, + "loss": 0.3546, + "step": 44114 + }, + { + "epoch": 0.8190165711803299, + "grad_norm": 0.4684171676635742, + "learning_rate": 1.5731612412588371e-06, + "loss": 0.2365, + "step": 44116 + }, + { + "epoch": 0.8190537013177486, + "grad_norm": 0.2616354823112488, + "learning_rate": 1.5725332499173163e-06, + "loss": 0.2437, + "step": 44118 + }, + { + "epoch": 0.8190908314551673, + "grad_norm": 0.6100311279296875, + "learning_rate": 1.5719053732483768e-06, + "loss": 0.4585, + "step": 44120 + }, + { + "epoch": 0.8191279615925858, + "grad_norm": 0.36489251255989075, + "learning_rate": 1.5712776112605589e-06, + "loss": 0.256, + "step": 44122 + }, + { + "epoch": 0.8191650917300045, + "grad_norm": 0.32745206356048584, + "learning_rate": 1.5706499639624072e-06, + "loss": 0.0543, + "step": 44124 + }, + { + "epoch": 0.8192022218674231, + "grad_norm": 0.41591495275497437, + "learning_rate": 1.5700224313624578e-06, + "loss": 0.2679, + "step": 44126 + }, + { + "epoch": 0.8192393520048418, + "grad_norm": 0.6219328045845032, + "learning_rate": 1.5693950134692537e-06, + "loss": 0.4099, + "step": 44128 + }, + { + "epoch": 0.8192764821422605, + "grad_norm": 0.2517624795436859, + "learning_rate": 1.56876771029133e-06, + "loss": 0.2144, + "step": 44130 + }, + { + "epoch": 0.819313612279679, + "grad_norm": 0.5595732927322388, + "learning_rate": 1.5681405218372237e-06, + "loss": 0.3024, + "step": 44132 + }, + { + "epoch": 0.8193507424170977, + "grad_norm": 0.23955067992210388, + "learning_rate": 1.5675134481154674e-06, + "loss": 0.2433, + "step": 44134 + }, + { + "epoch": 0.8193878725545163, + "grad_norm": 1.2503514289855957, + "learning_rate": 1.5668864891345959e-06, + "loss": 0.1195, + "step": 44136 + }, + { + "epoch": 0.819425002691935, + "grad_norm": 0.2486780732870102, + "learning_rate": 1.566259644903142e-06, + "loss": 0.2884, + "step": 44138 + }, + { + "epoch": 0.8194621328293537, + "grad_norm": 0.5575205683708191, + "learning_rate": 1.565632915429629e-06, + "loss": 0.3196, + "step": 44140 + }, + { + "epoch": 0.8194992629667722, + "grad_norm": 0.4635719656944275, + "learning_rate": 1.5650063007225913e-06, + "loss": 0.5353, + "step": 44142 + }, + { + "epoch": 0.8195363931041909, + "grad_norm": 0.29539725184440613, + "learning_rate": 1.5643798007905486e-06, + "loss": 0.1751, + "step": 44144 + }, + { + "epoch": 0.8195735232416095, + "grad_norm": 0.3746224343776703, + "learning_rate": 1.563753415642031e-06, + "loss": 0.321, + "step": 44146 + }, + { + "epoch": 0.8196106533790282, + "grad_norm": 0.37644311785697937, + "learning_rate": 1.563127145285558e-06, + "loss": 0.2188, + "step": 44148 + }, + { + "epoch": 0.8196477835164468, + "grad_norm": 0.4069465398788452, + "learning_rate": 1.5625009897296572e-06, + "loss": 0.2552, + "step": 44150 + }, + { + "epoch": 0.8196849136538654, + "grad_norm": 0.321184903383255, + "learning_rate": 1.5618749489828411e-06, + "loss": 0.3302, + "step": 44152 + }, + { + "epoch": 0.8197220437912841, + "grad_norm": 0.4351695477962494, + "learning_rate": 1.5612490230536325e-06, + "loss": 0.2446, + "step": 44154 + }, + { + "epoch": 0.8197591739287027, + "grad_norm": 0.4680473208427429, + "learning_rate": 1.560623211950547e-06, + "loss": 0.3693, + "step": 44156 + }, + { + "epoch": 0.8197963040661214, + "grad_norm": 0.5691108703613281, + "learning_rate": 1.5599975156821024e-06, + "loss": 0.2885, + "step": 44158 + }, + { + "epoch": 0.8198334342035399, + "grad_norm": 0.38574764132499695, + "learning_rate": 1.5593719342568093e-06, + "loss": 0.2292, + "step": 44160 + }, + { + "epoch": 0.8198705643409586, + "grad_norm": 0.3775882422924042, + "learning_rate": 1.5587464676831843e-06, + "loss": 0.2681, + "step": 44162 + }, + { + "epoch": 0.8199076944783773, + "grad_norm": 0.2684120833873749, + "learning_rate": 1.5581211159697352e-06, + "loss": 0.2878, + "step": 44164 + }, + { + "epoch": 0.8199448246157959, + "grad_norm": 0.5364150404930115, + "learning_rate": 1.5574958791249694e-06, + "loss": 0.3013, + "step": 44166 + }, + { + "epoch": 0.8199819547532146, + "grad_norm": 0.4092472493648529, + "learning_rate": 1.556870757157396e-06, + "loss": 0.151, + "step": 44168 + }, + { + "epoch": 0.8200190848906331, + "grad_norm": 0.555978000164032, + "learning_rate": 1.5562457500755223e-06, + "loss": 0.3746, + "step": 44170 + }, + { + "epoch": 0.8200562150280518, + "grad_norm": 0.2833896279335022, + "learning_rate": 1.5556208578878518e-06, + "loss": 0.2734, + "step": 44172 + }, + { + "epoch": 0.8200933451654705, + "grad_norm": 0.4228624403476715, + "learning_rate": 1.5549960806028875e-06, + "loss": 0.1934, + "step": 44174 + }, + { + "epoch": 0.8201304753028891, + "grad_norm": 0.4768389165401459, + "learning_rate": 1.5543714182291325e-06, + "loss": 0.1256, + "step": 44176 + }, + { + "epoch": 0.8201676054403078, + "grad_norm": 0.35951560735702515, + "learning_rate": 1.5537468707750824e-06, + "loss": 0.1034, + "step": 44178 + }, + { + "epoch": 0.8202047355777263, + "grad_norm": 0.5261889696121216, + "learning_rate": 1.5531224382492393e-06, + "loss": 0.3207, + "step": 44180 + }, + { + "epoch": 0.820241865715145, + "grad_norm": 0.6791105270385742, + "learning_rate": 1.5524981206600976e-06, + "loss": 0.3543, + "step": 44182 + }, + { + "epoch": 0.8202789958525637, + "grad_norm": 0.4426318109035492, + "learning_rate": 1.5518739180161556e-06, + "loss": 0.1638, + "step": 44184 + }, + { + "epoch": 0.8203161259899823, + "grad_norm": 0.4529627561569214, + "learning_rate": 1.5512498303259027e-06, + "loss": 0.3714, + "step": 44186 + }, + { + "epoch": 0.820353256127401, + "grad_norm": 0.37510091066360474, + "learning_rate": 1.5506258575978339e-06, + "loss": 0.1934, + "step": 44188 + }, + { + "epoch": 0.8203903862648195, + "grad_norm": 0.4047066569328308, + "learning_rate": 1.550001999840437e-06, + "loss": 0.2071, + "step": 44190 + }, + { + "epoch": 0.8204275164022382, + "grad_norm": 0.5375328660011292, + "learning_rate": 1.5493782570622018e-06, + "loss": 0.4762, + "step": 44192 + }, + { + "epoch": 0.8204646465396569, + "grad_norm": 0.37993961572647095, + "learning_rate": 1.548754629271616e-06, + "loss": 0.3103, + "step": 44194 + }, + { + "epoch": 0.8205017766770755, + "grad_norm": 0.41086775064468384, + "learning_rate": 1.5481311164771661e-06, + "loss": 0.3731, + "step": 44196 + }, + { + "epoch": 0.8205389068144942, + "grad_norm": 0.40745508670806885, + "learning_rate": 1.5475077186873345e-06, + "loss": 0.1917, + "step": 44198 + }, + { + "epoch": 0.8205760369519127, + "grad_norm": 0.2797167897224426, + "learning_rate": 1.5468844359106072e-06, + "loss": 0.2564, + "step": 44200 + }, + { + "epoch": 0.8206131670893314, + "grad_norm": 0.46691906452178955, + "learning_rate": 1.5462612681554613e-06, + "loss": 0.3409, + "step": 44202 + }, + { + "epoch": 0.82065029722675, + "grad_norm": 0.33289816975593567, + "learning_rate": 1.5456382154303763e-06, + "loss": 0.188, + "step": 44204 + }, + { + "epoch": 0.8206874273641687, + "grad_norm": 0.39042872190475464, + "learning_rate": 1.5450152777438343e-06, + "loss": 0.2446, + "step": 44206 + }, + { + "epoch": 0.8207245575015873, + "grad_norm": 0.28449153900146484, + "learning_rate": 1.5443924551043066e-06, + "loss": 0.1707, + "step": 44208 + }, + { + "epoch": 0.8207616876390059, + "grad_norm": 0.43035706877708435, + "learning_rate": 1.5437697475202694e-06, + "loss": 0.2923, + "step": 44210 + }, + { + "epoch": 0.8207988177764246, + "grad_norm": 0.43349942564964294, + "learning_rate": 1.5431471550001975e-06, + "loss": 0.3461, + "step": 44212 + }, + { + "epoch": 0.8208359479138432, + "grad_norm": 0.3353295922279358, + "learning_rate": 1.5425246775525637e-06, + "loss": 0.308, + "step": 44214 + }, + { + "epoch": 0.8208730780512619, + "grad_norm": 0.3252819776535034, + "learning_rate": 1.5419023151858336e-06, + "loss": 0.1301, + "step": 44216 + }, + { + "epoch": 0.8209102081886805, + "grad_norm": 0.4683806300163269, + "learning_rate": 1.5412800679084784e-06, + "loss": 0.4131, + "step": 44218 + }, + { + "epoch": 0.8209473383260991, + "grad_norm": 0.37426263093948364, + "learning_rate": 1.540657935728964e-06, + "loss": 0.3867, + "step": 44220 + }, + { + "epoch": 0.8209844684635178, + "grad_norm": 0.4133332073688507, + "learning_rate": 1.5400359186557567e-06, + "loss": 0.351, + "step": 44222 + }, + { + "epoch": 0.8210215986009364, + "grad_norm": 0.5562565326690674, + "learning_rate": 1.5394140166973227e-06, + "loss": 0.2968, + "step": 44224 + }, + { + "epoch": 0.8210587287383551, + "grad_norm": 0.360899955034256, + "learning_rate": 1.5387922298621182e-06, + "loss": 0.21, + "step": 44226 + }, + { + "epoch": 0.8210958588757737, + "grad_norm": 0.4655608832836151, + "learning_rate": 1.5381705581586104e-06, + "loss": 0.149, + "step": 44228 + }, + { + "epoch": 0.8211329890131923, + "grad_norm": 0.3761020004749298, + "learning_rate": 1.537549001595252e-06, + "loss": 0.2578, + "step": 44230 + }, + { + "epoch": 0.821170119150611, + "grad_norm": 0.4797358214855194, + "learning_rate": 1.5369275601805045e-06, + "loss": 0.365, + "step": 44232 + }, + { + "epoch": 0.8212072492880296, + "grad_norm": 0.32516419887542725, + "learning_rate": 1.536306233922823e-06, + "loss": 0.1893, + "step": 44234 + }, + { + "epoch": 0.8212443794254483, + "grad_norm": 0.5489771962165833, + "learning_rate": 1.535685022830662e-06, + "loss": 0.2811, + "step": 44236 + }, + { + "epoch": 0.8212815095628669, + "grad_norm": 0.40650615096092224, + "learning_rate": 1.5350639269124768e-06, + "loss": 0.1781, + "step": 44238 + }, + { + "epoch": 0.8213186397002855, + "grad_norm": 0.34532561898231506, + "learning_rate": 1.534442946176713e-06, + "loss": 0.2754, + "step": 44240 + }, + { + "epoch": 0.8213557698377042, + "grad_norm": 0.141364187002182, + "learning_rate": 1.5338220806318238e-06, + "loss": 0.142, + "step": 44242 + }, + { + "epoch": 0.8213928999751228, + "grad_norm": 0.4559357464313507, + "learning_rate": 1.5332013302862581e-06, + "loss": 0.2467, + "step": 44244 + }, + { + "epoch": 0.8214300301125415, + "grad_norm": 0.5665714740753174, + "learning_rate": 1.53258069514846e-06, + "loss": 0.3813, + "step": 44246 + }, + { + "epoch": 0.8214671602499601, + "grad_norm": 0.30410560965538025, + "learning_rate": 1.5319601752268786e-06, + "loss": 0.3216, + "step": 44248 + }, + { + "epoch": 0.8215042903873787, + "grad_norm": 0.4002247452735901, + "learning_rate": 1.5313397705299537e-06, + "loss": 0.2743, + "step": 44250 + }, + { + "epoch": 0.8215414205247974, + "grad_norm": 0.33095186948776245, + "learning_rate": 1.5307194810661262e-06, + "loss": 0.2396, + "step": 44252 + }, + { + "epoch": 0.821578550662216, + "grad_norm": 0.18876159191131592, + "learning_rate": 1.5300993068438386e-06, + "loss": 0.0076, + "step": 44254 + }, + { + "epoch": 0.8216156807996347, + "grad_norm": 0.31562960147857666, + "learning_rate": 1.5294792478715282e-06, + "loss": 0.2895, + "step": 44256 + }, + { + "epoch": 0.8216528109370532, + "grad_norm": 0.38490352034568787, + "learning_rate": 1.5288593041576337e-06, + "loss": 0.3119, + "step": 44258 + }, + { + "epoch": 0.8216899410744719, + "grad_norm": 0.2892584502696991, + "learning_rate": 1.5282394757105901e-06, + "loss": 0.2146, + "step": 44260 + }, + { + "epoch": 0.8217270712118906, + "grad_norm": 0.4311923682689667, + "learning_rate": 1.5276197625388312e-06, + "loss": 0.2787, + "step": 44262 + }, + { + "epoch": 0.8217642013493092, + "grad_norm": 0.32901525497436523, + "learning_rate": 1.5270001646507914e-06, + "loss": 0.1997, + "step": 44264 + }, + { + "epoch": 0.8218013314867278, + "grad_norm": 0.5071597695350647, + "learning_rate": 1.5263806820548987e-06, + "loss": 0.2246, + "step": 44266 + }, + { + "epoch": 0.8218384616241464, + "grad_norm": 0.3114582300186157, + "learning_rate": 1.5257613147595829e-06, + "loss": 0.3343, + "step": 44268 + }, + { + "epoch": 0.8218755917615651, + "grad_norm": 0.33477526903152466, + "learning_rate": 1.5251420627732739e-06, + "loss": 0.2512, + "step": 44270 + }, + { + "epoch": 0.8219127218989838, + "grad_norm": 0.5042020082473755, + "learning_rate": 1.5245229261043948e-06, + "loss": 0.3139, + "step": 44272 + }, + { + "epoch": 0.8219498520364024, + "grad_norm": 0.2940605878829956, + "learning_rate": 1.5239039047613713e-06, + "loss": 0.2378, + "step": 44274 + }, + { + "epoch": 0.821986982173821, + "grad_norm": 0.3481087386608124, + "learning_rate": 1.52328499875263e-06, + "loss": 0.3632, + "step": 44276 + }, + { + "epoch": 0.8220241123112396, + "grad_norm": 0.2972831428050995, + "learning_rate": 1.5226662080865861e-06, + "loss": 0.3158, + "step": 44278 + }, + { + "epoch": 0.8220612424486583, + "grad_norm": 0.41324931383132935, + "learning_rate": 1.522047532771662e-06, + "loss": 0.1576, + "step": 44280 + }, + { + "epoch": 0.822098372586077, + "grad_norm": 0.3038996458053589, + "learning_rate": 1.5214289728162778e-06, + "loss": 0.1198, + "step": 44282 + }, + { + "epoch": 0.8221355027234956, + "grad_norm": 0.4997726380825043, + "learning_rate": 1.5208105282288477e-06, + "loss": 0.408, + "step": 44284 + }, + { + "epoch": 0.8221726328609142, + "grad_norm": 0.40526893734931946, + "learning_rate": 1.5201921990177892e-06, + "loss": 0.2071, + "step": 44286 + }, + { + "epoch": 0.8222097629983328, + "grad_norm": 0.3132908344268799, + "learning_rate": 1.519573985191517e-06, + "loss": 0.2834, + "step": 44288 + }, + { + "epoch": 0.8222468931357515, + "grad_norm": 0.6940421462059021, + "learning_rate": 1.5189558867584386e-06, + "loss": 0.1553, + "step": 44290 + }, + { + "epoch": 0.8222840232731702, + "grad_norm": 0.3042430579662323, + "learning_rate": 1.5183379037269697e-06, + "loss": 0.3869, + "step": 44292 + }, + { + "epoch": 0.8223211534105888, + "grad_norm": 0.33500972390174866, + "learning_rate": 1.5177200361055133e-06, + "loss": 0.13, + "step": 44294 + }, + { + "epoch": 0.8223582835480074, + "grad_norm": 0.4052986204624176, + "learning_rate": 1.5171022839024808e-06, + "loss": 0.3161, + "step": 44296 + }, + { + "epoch": 0.822395413685426, + "grad_norm": 0.7102774381637573, + "learning_rate": 1.5164846471262762e-06, + "loss": 0.2341, + "step": 44298 + }, + { + "epoch": 0.8224325438228447, + "grad_norm": 0.27940109372138977, + "learning_rate": 1.5158671257853042e-06, + "loss": 0.1825, + "step": 44300 + }, + { + "epoch": 0.8224696739602634, + "grad_norm": 1.5168792009353638, + "learning_rate": 1.5152497198879713e-06, + "loss": 0.2945, + "step": 44302 + }, + { + "epoch": 0.822506804097682, + "grad_norm": 0.27114924788475037, + "learning_rate": 1.5146324294426718e-06, + "loss": 0.437, + "step": 44304 + }, + { + "epoch": 0.8225439342351006, + "grad_norm": 0.5465795397758484, + "learning_rate": 1.5140152544578079e-06, + "loss": 0.5151, + "step": 44306 + }, + { + "epoch": 0.8225810643725192, + "grad_norm": 0.25701451301574707, + "learning_rate": 1.5133981949417787e-06, + "loss": 0.2856, + "step": 44308 + }, + { + "epoch": 0.8226181945099379, + "grad_norm": 0.5882032513618469, + "learning_rate": 1.5127812509029815e-06, + "loss": 0.2083, + "step": 44310 + }, + { + "epoch": 0.8226553246473565, + "grad_norm": 0.37077853083610535, + "learning_rate": 1.5121644223498066e-06, + "loss": 0.3286, + "step": 44312 + }, + { + "epoch": 0.8226924547847752, + "grad_norm": 0.4213380515575409, + "learning_rate": 1.5115477092906539e-06, + "loss": 0.2537, + "step": 44314 + }, + { + "epoch": 0.8227295849221938, + "grad_norm": 0.4263063073158264, + "learning_rate": 1.5109311117339076e-06, + "loss": 0.1955, + "step": 44316 + }, + { + "epoch": 0.8227667150596124, + "grad_norm": 0.4740867018699646, + "learning_rate": 1.5103146296879612e-06, + "loss": 0.2858, + "step": 44318 + }, + { + "epoch": 0.8228038451970311, + "grad_norm": 0.4690374732017517, + "learning_rate": 1.5096982631612023e-06, + "loss": 0.2006, + "step": 44320 + }, + { + "epoch": 0.8228409753344497, + "grad_norm": 0.6269438862800598, + "learning_rate": 1.5090820121620197e-06, + "loss": 0.3952, + "step": 44322 + }, + { + "epoch": 0.8228781054718683, + "grad_norm": 0.46135011315345764, + "learning_rate": 1.508465876698798e-06, + "loss": 0.3379, + "step": 44324 + }, + { + "epoch": 0.822915235609287, + "grad_norm": 0.42056548595428467, + "learning_rate": 1.5078498567799227e-06, + "loss": 0.273, + "step": 44326 + }, + { + "epoch": 0.8229523657467056, + "grad_norm": 0.5488064885139465, + "learning_rate": 1.5072339524137714e-06, + "loss": 0.1931, + "step": 44328 + }, + { + "epoch": 0.8229894958841243, + "grad_norm": 0.3139699101448059, + "learning_rate": 1.5066181636087262e-06, + "loss": 0.309, + "step": 44330 + }, + { + "epoch": 0.8230266260215429, + "grad_norm": 0.3122418522834778, + "learning_rate": 1.5060024903731707e-06, + "loss": 0.1209, + "step": 44332 + }, + { + "epoch": 0.8230637561589615, + "grad_norm": 0.3864465057849884, + "learning_rate": 1.505386932715477e-06, + "loss": 0.2298, + "step": 44334 + }, + { + "epoch": 0.8231008862963802, + "grad_norm": 0.3195379674434662, + "learning_rate": 1.5047714906440215e-06, + "loss": 0.1217, + "step": 44336 + }, + { + "epoch": 0.8231380164337988, + "grad_norm": 0.49088746309280396, + "learning_rate": 1.504156164167181e-06, + "loss": 0.3662, + "step": 44338 + }, + { + "epoch": 0.8231751465712175, + "grad_norm": 0.2560213506221771, + "learning_rate": 1.5035409532933287e-06, + "loss": 0.1689, + "step": 44340 + }, + { + "epoch": 0.8232122767086361, + "grad_norm": 0.3888169825077057, + "learning_rate": 1.5029258580308314e-06, + "loss": 0.2426, + "step": 44342 + }, + { + "epoch": 0.8232494068460547, + "grad_norm": 0.359810471534729, + "learning_rate": 1.5023108783880625e-06, + "loss": 0.5328, + "step": 44344 + }, + { + "epoch": 0.8232865369834734, + "grad_norm": 0.5266593098640442, + "learning_rate": 1.5016960143733883e-06, + "loss": 0.2677, + "step": 44346 + }, + { + "epoch": 0.823323667120892, + "grad_norm": 0.4259595274925232, + "learning_rate": 1.5010812659951767e-06, + "loss": 0.348, + "step": 44348 + }, + { + "epoch": 0.8233607972583107, + "grad_norm": 0.47508442401885986, + "learning_rate": 1.5004666332617913e-06, + "loss": 0.293, + "step": 44350 + }, + { + "epoch": 0.8233979273957293, + "grad_norm": 0.5308201313018799, + "learning_rate": 1.4998521161815981e-06, + "loss": 0.2716, + "step": 44352 + }, + { + "epoch": 0.8234350575331479, + "grad_norm": 0.4218639135360718, + "learning_rate": 1.4992377147629556e-06, + "loss": 0.3105, + "step": 44354 + }, + { + "epoch": 0.8234721876705665, + "grad_norm": 0.475699245929718, + "learning_rate": 1.4986234290142265e-06, + "loss": 0.2317, + "step": 44356 + }, + { + "epoch": 0.8235093178079852, + "grad_norm": 0.3580717444419861, + "learning_rate": 1.4980092589437656e-06, + "loss": 0.2966, + "step": 44358 + }, + { + "epoch": 0.8235464479454039, + "grad_norm": 0.33864977955818176, + "learning_rate": 1.4973952045599316e-06, + "loss": 0.231, + "step": 44360 + }, + { + "epoch": 0.8235835780828225, + "grad_norm": 0.3307175636291504, + "learning_rate": 1.4967812658710822e-06, + "loss": 0.2193, + "step": 44362 + }, + { + "epoch": 0.8236207082202411, + "grad_norm": 0.548649251461029, + "learning_rate": 1.496167442885571e-06, + "loss": 0.3306, + "step": 44364 + }, + { + "epoch": 0.8236578383576597, + "grad_norm": 0.3636893630027771, + "learning_rate": 1.4955537356117466e-06, + "loss": 0.2705, + "step": 44366 + }, + { + "epoch": 0.8236949684950784, + "grad_norm": 0.5229859948158264, + "learning_rate": 1.4949401440579625e-06, + "loss": 0.2167, + "step": 44368 + }, + { + "epoch": 0.8237320986324971, + "grad_norm": 0.2911817133426666, + "learning_rate": 1.4943266682325675e-06, + "loss": 0.3016, + "step": 44370 + }, + { + "epoch": 0.8237692287699157, + "grad_norm": 0.4500266909599304, + "learning_rate": 1.4937133081439081e-06, + "loss": 0.3195, + "step": 44372 + }, + { + "epoch": 0.8238063589073343, + "grad_norm": 0.304797887802124, + "learning_rate": 1.4931000638003346e-06, + "loss": 0.0676, + "step": 44374 + }, + { + "epoch": 0.8238434890447529, + "grad_norm": 0.4162597060203552, + "learning_rate": 1.4924869352101856e-06, + "loss": 0.3111, + "step": 44376 + }, + { + "epoch": 0.8238806191821716, + "grad_norm": 0.47652313113212585, + "learning_rate": 1.4918739223818091e-06, + "loss": 0.3251, + "step": 44378 + }, + { + "epoch": 0.8239177493195903, + "grad_norm": 0.20339912176132202, + "learning_rate": 1.4912610253235405e-06, + "loss": 0.2798, + "step": 44380 + }, + { + "epoch": 0.8239548794570088, + "grad_norm": 0.5866124033927917, + "learning_rate": 1.4906482440437241e-06, + "loss": 0.4829, + "step": 44382 + }, + { + "epoch": 0.8239920095944275, + "grad_norm": 0.37735360860824585, + "learning_rate": 1.490035578550696e-06, + "loss": 0.5747, + "step": 44384 + }, + { + "epoch": 0.8240291397318461, + "grad_norm": 0.4492835998535156, + "learning_rate": 1.4894230288527921e-06, + "loss": 0.1048, + "step": 44386 + }, + { + "epoch": 0.8240662698692648, + "grad_norm": 0.4577048718929291, + "learning_rate": 1.4888105949583508e-06, + "loss": 0.145, + "step": 44388 + }, + { + "epoch": 0.8241034000066835, + "grad_norm": 0.5359029769897461, + "learning_rate": 1.4881982768757043e-06, + "loss": 0.3492, + "step": 44390 + }, + { + "epoch": 0.824140530144102, + "grad_norm": 0.33775848150253296, + "learning_rate": 1.487586074613181e-06, + "loss": 0.3007, + "step": 44392 + }, + { + "epoch": 0.8241776602815207, + "grad_norm": 0.4457545578479767, + "learning_rate": 1.4869739881791146e-06, + "loss": 0.3328, + "step": 44394 + }, + { + "epoch": 0.8242147904189393, + "grad_norm": 0.3008882999420166, + "learning_rate": 1.4863620175818337e-06, + "loss": 0.4618, + "step": 44396 + }, + { + "epoch": 0.824251920556358, + "grad_norm": 0.45163577795028687, + "learning_rate": 1.4857501628296634e-06, + "loss": 0.3514, + "step": 44398 + }, + { + "epoch": 0.8242890506937767, + "grad_norm": 0.4003308415412903, + "learning_rate": 1.4851384239309296e-06, + "loss": 0.273, + "step": 44400 + }, + { + "epoch": 0.8243261808311952, + "grad_norm": 0.31293508410453796, + "learning_rate": 1.4845268008939596e-06, + "loss": 0.1541, + "step": 44402 + }, + { + "epoch": 0.8243633109686139, + "grad_norm": 0.2741767168045044, + "learning_rate": 1.4839152937270706e-06, + "loss": 0.1512, + "step": 44404 + }, + { + "epoch": 0.8244004411060325, + "grad_norm": 0.48680737614631653, + "learning_rate": 1.4833039024385854e-06, + "loss": 0.3504, + "step": 44406 + }, + { + "epoch": 0.8244375712434512, + "grad_norm": 0.2666643261909485, + "learning_rate": 1.4826926270368248e-06, + "loss": 0.3203, + "step": 44408 + }, + { + "epoch": 0.8244747013808698, + "grad_norm": 0.3038474917411804, + "learning_rate": 1.4820814675301043e-06, + "loss": 0.1406, + "step": 44410 + }, + { + "epoch": 0.8245118315182884, + "grad_norm": 0.40017247200012207, + "learning_rate": 1.4814704239267407e-06, + "loss": 0.3732, + "step": 44412 + }, + { + "epoch": 0.8245489616557071, + "grad_norm": 0.3195667266845703, + "learning_rate": 1.480859496235052e-06, + "loss": 0.217, + "step": 44414 + }, + { + "epoch": 0.8245860917931257, + "grad_norm": 0.34560245275497437, + "learning_rate": 1.4802486844633446e-06, + "loss": 0.2728, + "step": 44416 + }, + { + "epoch": 0.8246232219305444, + "grad_norm": 0.44810667634010315, + "learning_rate": 1.4796379886199353e-06, + "loss": 0.1509, + "step": 44418 + }, + { + "epoch": 0.824660352067963, + "grad_norm": 0.4631747007369995, + "learning_rate": 1.4790274087131296e-06, + "loss": 0.4565, + "step": 44420 + }, + { + "epoch": 0.8246974822053816, + "grad_norm": 0.3789494037628174, + "learning_rate": 1.4784169447512375e-06, + "loss": 0.1867, + "step": 44422 + }, + { + "epoch": 0.8247346123428003, + "grad_norm": 0.24619998037815094, + "learning_rate": 1.477806596742566e-06, + "loss": 0.1749, + "step": 44424 + }, + { + "epoch": 0.8247717424802189, + "grad_norm": 0.36924728751182556, + "learning_rate": 1.477196364695419e-06, + "loss": 0.361, + "step": 44426 + }, + { + "epoch": 0.8248088726176376, + "grad_norm": 0.468763142824173, + "learning_rate": 1.4765862486181037e-06, + "loss": 0.328, + "step": 44428 + }, + { + "epoch": 0.8248460027550562, + "grad_norm": 0.6404542326927185, + "learning_rate": 1.4759762485189154e-06, + "loss": 0.2467, + "step": 44430 + }, + { + "epoch": 0.8248831328924748, + "grad_norm": 0.33120644092559814, + "learning_rate": 1.47536636440616e-06, + "loss": 0.3932, + "step": 44432 + }, + { + "epoch": 0.8249202630298935, + "grad_norm": 0.2860252857208252, + "learning_rate": 1.4747565962881328e-06, + "loss": 0.3304, + "step": 44434 + }, + { + "epoch": 0.8249573931673121, + "grad_norm": 0.7179960608482361, + "learning_rate": 1.4741469441731327e-06, + "loss": 0.2848, + "step": 44436 + }, + { + "epoch": 0.8249945233047308, + "grad_norm": 0.18831995129585266, + "learning_rate": 1.4735374080694564e-06, + "loss": 0.1106, + "step": 44438 + }, + { + "epoch": 0.8250316534421493, + "grad_norm": 0.3486173450946808, + "learning_rate": 1.4729279879853976e-06, + "loss": 0.2194, + "step": 44440 + }, + { + "epoch": 0.825068783579568, + "grad_norm": 0.5629202723503113, + "learning_rate": 1.4723186839292436e-06, + "loss": 0.2457, + "step": 44442 + }, + { + "epoch": 0.8251059137169867, + "grad_norm": 0.29227855801582336, + "learning_rate": 1.4717094959092904e-06, + "loss": 0.2108, + "step": 44444 + }, + { + "epoch": 0.8251430438544053, + "grad_norm": 0.3910392224788666, + "learning_rate": 1.4711004239338245e-06, + "loss": 0.0904, + "step": 44446 + }, + { + "epoch": 0.825180173991824, + "grad_norm": 0.28663647174835205, + "learning_rate": 1.470491468011136e-06, + "loss": 0.3289, + "step": 44448 + }, + { + "epoch": 0.8252173041292425, + "grad_norm": 0.2817362844944, + "learning_rate": 1.4698826281495083e-06, + "loss": 0.2783, + "step": 44450 + }, + { + "epoch": 0.8252544342666612, + "grad_norm": 0.42513948678970337, + "learning_rate": 1.4692739043572313e-06, + "loss": 0.2589, + "step": 44452 + }, + { + "epoch": 0.8252915644040799, + "grad_norm": 0.2713676989078522, + "learning_rate": 1.4686652966425807e-06, + "loss": 0.3299, + "step": 44454 + }, + { + "epoch": 0.8253286945414985, + "grad_norm": 0.25707992911338806, + "learning_rate": 1.468056805013841e-06, + "loss": 0.1729, + "step": 44456 + }, + { + "epoch": 0.8253658246789172, + "grad_norm": 0.3693886399269104, + "learning_rate": 1.4674484294792923e-06, + "loss": 0.3036, + "step": 44458 + }, + { + "epoch": 0.8254029548163357, + "grad_norm": 0.47757822275161743, + "learning_rate": 1.4668401700472146e-06, + "loss": 0.3611, + "step": 44460 + }, + { + "epoch": 0.8254400849537544, + "grad_norm": 0.39343148469924927, + "learning_rate": 1.46623202672588e-06, + "loss": 0.1552, + "step": 44462 + }, + { + "epoch": 0.825477215091173, + "grad_norm": 0.305357426404953, + "learning_rate": 1.4656239995235666e-06, + "loss": 0.3068, + "step": 44464 + }, + { + "epoch": 0.8255143452285917, + "grad_norm": 0.6944701671600342, + "learning_rate": 1.4650160884485498e-06, + "loss": 0.3141, + "step": 44466 + }, + { + "epoch": 0.8255514753660104, + "grad_norm": 0.4313117563724518, + "learning_rate": 1.4644082935090952e-06, + "loss": 0.2253, + "step": 44468 + }, + { + "epoch": 0.8255886055034289, + "grad_norm": 0.5051404237747192, + "learning_rate": 1.4638006147134776e-06, + "loss": 0.4782, + "step": 44470 + }, + { + "epoch": 0.8256257356408476, + "grad_norm": 0.2958587408065796, + "learning_rate": 1.4631930520699645e-06, + "loss": 0.2865, + "step": 44472 + }, + { + "epoch": 0.8256628657782662, + "grad_norm": 0.48300692439079285, + "learning_rate": 1.4625856055868227e-06, + "loss": 0.3818, + "step": 44474 + }, + { + "epoch": 0.8256999959156849, + "grad_norm": 0.42771753668785095, + "learning_rate": 1.4619782752723188e-06, + "loss": 0.2083, + "step": 44476 + }, + { + "epoch": 0.8257371260531036, + "grad_norm": 0.4528143107891083, + "learning_rate": 1.4613710611347187e-06, + "loss": 0.3938, + "step": 44478 + }, + { + "epoch": 0.8257742561905221, + "grad_norm": 0.48317664861679077, + "learning_rate": 1.46076396318228e-06, + "loss": 0.3612, + "step": 44480 + }, + { + "epoch": 0.8258113863279408, + "grad_norm": 0.6486336588859558, + "learning_rate": 1.4601569814232686e-06, + "loss": 0.4703, + "step": 44482 + }, + { + "epoch": 0.8258485164653594, + "grad_norm": 0.25663724541664124, + "learning_rate": 1.4595501158659376e-06, + "loss": 0.3532, + "step": 44484 + }, + { + "epoch": 0.8258856466027781, + "grad_norm": 0.34411704540252686, + "learning_rate": 1.4589433665185482e-06, + "loss": 0.302, + "step": 44486 + }, + { + "epoch": 0.8259227767401968, + "grad_norm": 0.4348393976688385, + "learning_rate": 1.458336733389356e-06, + "loss": 0.2079, + "step": 44488 + }, + { + "epoch": 0.8259599068776153, + "grad_norm": 0.45254871249198914, + "learning_rate": 1.457730216486618e-06, + "loss": 0.3249, + "step": 44490 + }, + { + "epoch": 0.825997037015034, + "grad_norm": 0.32711702585220337, + "learning_rate": 1.4571238158185829e-06, + "loss": 0.416, + "step": 44492 + }, + { + "epoch": 0.8260341671524526, + "grad_norm": 0.40807783603668213, + "learning_rate": 1.4565175313935032e-06, + "loss": 0.2627, + "step": 44494 + }, + { + "epoch": 0.8260712972898713, + "grad_norm": 0.22464247047901154, + "learning_rate": 1.4559113632196299e-06, + "loss": 0.2115, + "step": 44496 + }, + { + "epoch": 0.82610842742729, + "grad_norm": 0.34408894181251526, + "learning_rate": 1.4553053113052096e-06, + "loss": 0.3454, + "step": 44498 + }, + { + "epoch": 0.8261455575647085, + "grad_norm": 0.6072781682014465, + "learning_rate": 1.4546993756584927e-06, + "loss": 0.1457, + "step": 44500 + }, + { + "epoch": 0.8261826877021272, + "grad_norm": 0.24276113510131836, + "learning_rate": 1.454093556287718e-06, + "loss": 0.2323, + "step": 44502 + }, + { + "epoch": 0.8262198178395458, + "grad_norm": 0.3045775592327118, + "learning_rate": 1.4534878532011354e-06, + "loss": 0.2686, + "step": 44504 + }, + { + "epoch": 0.8262569479769645, + "grad_norm": 0.38654378056526184, + "learning_rate": 1.4528822664069798e-06, + "loss": 0.246, + "step": 44506 + }, + { + "epoch": 0.826294078114383, + "grad_norm": 0.3688133955001831, + "learning_rate": 1.4522767959134965e-06, + "loss": 0.2253, + "step": 44508 + }, + { + "epoch": 0.8263312082518017, + "grad_norm": 0.39238420128822327, + "learning_rate": 1.4516714417289213e-06, + "loss": 0.3124, + "step": 44510 + }, + { + "epoch": 0.8263683383892204, + "grad_norm": 0.35250505805015564, + "learning_rate": 1.4510662038614931e-06, + "loss": 0.2764, + "step": 44512 + }, + { + "epoch": 0.826405468526639, + "grad_norm": 0.49556204676628113, + "learning_rate": 1.4504610823194464e-06, + "loss": 0.414, + "step": 44514 + }, + { + "epoch": 0.8264425986640577, + "grad_norm": 0.4538097381591797, + "learning_rate": 1.4498560771110182e-06, + "loss": 0.3608, + "step": 44516 + }, + { + "epoch": 0.8264797288014762, + "grad_norm": 0.3939908444881439, + "learning_rate": 1.449251188244436e-06, + "loss": 0.2234, + "step": 44518 + }, + { + "epoch": 0.8265168589388949, + "grad_norm": 0.5191604495048523, + "learning_rate": 1.4486464157279324e-06, + "loss": 0.2397, + "step": 44520 + }, + { + "epoch": 0.8265539890763136, + "grad_norm": 0.3154982030391693, + "learning_rate": 1.4480417595697371e-06, + "loss": 0.2441, + "step": 44522 + }, + { + "epoch": 0.8265911192137322, + "grad_norm": 0.5836181640625, + "learning_rate": 1.4474372197780795e-06, + "loss": 0.2214, + "step": 44524 + }, + { + "epoch": 0.8266282493511509, + "grad_norm": 0.36124810576438904, + "learning_rate": 1.4468327963611816e-06, + "loss": 0.2832, + "step": 44526 + }, + { + "epoch": 0.8266653794885694, + "grad_norm": 0.6468775272369385, + "learning_rate": 1.4462284893272716e-06, + "loss": 0.3449, + "step": 44528 + }, + { + "epoch": 0.8267025096259881, + "grad_norm": 0.42911240458488464, + "learning_rate": 1.445624298684568e-06, + "loss": 0.3639, + "step": 44530 + }, + { + "epoch": 0.8267396397634068, + "grad_norm": 0.4560735523700714, + "learning_rate": 1.4450202244412936e-06, + "loss": 0.1937, + "step": 44532 + }, + { + "epoch": 0.8267767699008254, + "grad_norm": 0.33229440450668335, + "learning_rate": 1.4444162666056705e-06, + "loss": 0.1003, + "step": 44534 + }, + { + "epoch": 0.8268139000382441, + "grad_norm": 0.4366031587123871, + "learning_rate": 1.443812425185913e-06, + "loss": 0.2276, + "step": 44536 + }, + { + "epoch": 0.8268510301756626, + "grad_norm": 0.36350634694099426, + "learning_rate": 1.4432087001902417e-06, + "loss": 0.3317, + "step": 44538 + }, + { + "epoch": 0.8268881603130813, + "grad_norm": 0.4376106262207031, + "learning_rate": 1.4426050916268708e-06, + "loss": 0.1018, + "step": 44540 + }, + { + "epoch": 0.8269252904505, + "grad_norm": 0.41464704275131226, + "learning_rate": 1.4420015995040093e-06, + "loss": 0.3012, + "step": 44542 + }, + { + "epoch": 0.8269624205879186, + "grad_norm": 0.3919060528278351, + "learning_rate": 1.441398223829873e-06, + "loss": 0.263, + "step": 44544 + }, + { + "epoch": 0.8269995507253373, + "grad_norm": 0.36826562881469727, + "learning_rate": 1.4407949646126729e-06, + "loss": 0.2151, + "step": 44546 + }, + { + "epoch": 0.8270366808627558, + "grad_norm": 0.4565935432910919, + "learning_rate": 1.4401918218606127e-06, + "loss": 0.3597, + "step": 44548 + }, + { + "epoch": 0.8270738110001745, + "grad_norm": 0.33963632583618164, + "learning_rate": 1.4395887955819022e-06, + "loss": 0.2335, + "step": 44550 + }, + { + "epoch": 0.8271109411375932, + "grad_norm": 0.3320991098880768, + "learning_rate": 1.4389858857847473e-06, + "loss": 0.1616, + "step": 44552 + }, + { + "epoch": 0.8271480712750118, + "grad_norm": 0.4420614242553711, + "learning_rate": 1.4383830924773534e-06, + "loss": 0.353, + "step": 44554 + }, + { + "epoch": 0.8271852014124305, + "grad_norm": 0.5286855101585388, + "learning_rate": 1.4377804156679187e-06, + "loss": 0.368, + "step": 44556 + }, + { + "epoch": 0.827222331549849, + "grad_norm": 0.3630439341068268, + "learning_rate": 1.4371778553646454e-06, + "loss": 0.2322, + "step": 44558 + }, + { + "epoch": 0.8272594616872677, + "grad_norm": 0.30532121658325195, + "learning_rate": 1.4365754115757325e-06, + "loss": 0.1734, + "step": 44560 + }, + { + "epoch": 0.8272965918246863, + "grad_norm": 0.27730900049209595, + "learning_rate": 1.4359730843093778e-06, + "loss": 0.2502, + "step": 44562 + }, + { + "epoch": 0.827333721962105, + "grad_norm": 0.5762947797775269, + "learning_rate": 1.4353708735737803e-06, + "loss": 0.3917, + "step": 44564 + }, + { + "epoch": 0.8273708520995237, + "grad_norm": 0.431096613407135, + "learning_rate": 1.434768779377128e-06, + "loss": 0.1643, + "step": 44566 + }, + { + "epoch": 0.8274079822369422, + "grad_norm": 0.6292319297790527, + "learning_rate": 1.43416680172762e-06, + "loss": 0.2437, + "step": 44568 + }, + { + "epoch": 0.8274451123743609, + "grad_norm": 0.3266674280166626, + "learning_rate": 1.4335649406334417e-06, + "loss": 0.3388, + "step": 44570 + }, + { + "epoch": 0.8274822425117795, + "grad_norm": 0.3215568959712982, + "learning_rate": 1.4329631961027845e-06, + "loss": 0.1551, + "step": 44572 + }, + { + "epoch": 0.8275193726491982, + "grad_norm": 0.4225177466869354, + "learning_rate": 1.4323615681438374e-06, + "loss": 0.1401, + "step": 44574 + }, + { + "epoch": 0.8275565027866169, + "grad_norm": 0.4300943911075592, + "learning_rate": 1.431760056764786e-06, + "loss": 0.227, + "step": 44576 + }, + { + "epoch": 0.8275936329240354, + "grad_norm": 0.2382565587759018, + "learning_rate": 1.4311586619738183e-06, + "loss": 0.3377, + "step": 44578 + }, + { + "epoch": 0.8276307630614541, + "grad_norm": 0.38147297501564026, + "learning_rate": 1.4305573837791109e-06, + "loss": 0.3395, + "step": 44580 + }, + { + "epoch": 0.8276678931988727, + "grad_norm": 0.1967245191335678, + "learning_rate": 1.4299562221888507e-06, + "loss": 0.2735, + "step": 44582 + }, + { + "epoch": 0.8277050233362914, + "grad_norm": 0.3886331021785736, + "learning_rate": 1.4293551772112146e-06, + "loss": 0.2476, + "step": 44584 + }, + { + "epoch": 0.82774215347371, + "grad_norm": 0.46296975016593933, + "learning_rate": 1.4287542488543859e-06, + "loss": 0.4051, + "step": 44586 + }, + { + "epoch": 0.8277792836111286, + "grad_norm": 0.5586023330688477, + "learning_rate": 1.4281534371265348e-06, + "loss": 0.2931, + "step": 44588 + }, + { + "epoch": 0.8278164137485473, + "grad_norm": 0.5555031895637512, + "learning_rate": 1.4275527420358403e-06, + "loss": 0.226, + "step": 44590 + }, + { + "epoch": 0.8278535438859659, + "grad_norm": 0.5167964100837708, + "learning_rate": 1.4269521635904782e-06, + "loss": 0.2346, + "step": 44592 + }, + { + "epoch": 0.8278906740233846, + "grad_norm": 0.641940712928772, + "learning_rate": 1.426351701798615e-06, + "loss": 0.4475, + "step": 44594 + }, + { + "epoch": 0.8279278041608032, + "grad_norm": 0.4867044985294342, + "learning_rate": 1.4257513566684245e-06, + "loss": 0.3513, + "step": 44596 + }, + { + "epoch": 0.8279649342982218, + "grad_norm": 0.5539929270744324, + "learning_rate": 1.4251511282080754e-06, + "loss": 0.3085, + "step": 44598 + }, + { + "epoch": 0.8280020644356405, + "grad_norm": 0.20153586566448212, + "learning_rate": 1.4245510164257336e-06, + "loss": 0.2367, + "step": 44600 + }, + { + "epoch": 0.8280391945730591, + "grad_norm": 0.3592088222503662, + "learning_rate": 1.4239510213295671e-06, + "loss": 0.3146, + "step": 44602 + }, + { + "epoch": 0.8280763247104778, + "grad_norm": 1.0701875686645508, + "learning_rate": 1.4233511429277414e-06, + "loss": 0.2778, + "step": 44604 + }, + { + "epoch": 0.8281134548478964, + "grad_norm": 0.4356667995452881, + "learning_rate": 1.4227513812284133e-06, + "loss": 0.3339, + "step": 44606 + }, + { + "epoch": 0.828150584985315, + "grad_norm": 0.16823738813400269, + "learning_rate": 1.4221517362397497e-06, + "loss": 0.1137, + "step": 44608 + }, + { + "epoch": 0.8281877151227337, + "grad_norm": 0.38515982031822205, + "learning_rate": 1.421552207969905e-06, + "loss": 0.1705, + "step": 44610 + }, + { + "epoch": 0.8282248452601523, + "grad_norm": 0.3402326703071594, + "learning_rate": 1.4209527964270398e-06, + "loss": 0.348, + "step": 44612 + }, + { + "epoch": 0.828261975397571, + "grad_norm": 0.38503968715667725, + "learning_rate": 1.4203535016193104e-06, + "loss": 0.2077, + "step": 44614 + }, + { + "epoch": 0.8282991055349895, + "grad_norm": 0.3827439248561859, + "learning_rate": 1.4197543235548715e-06, + "loss": 0.256, + "step": 44616 + }, + { + "epoch": 0.8283362356724082, + "grad_norm": 0.3246382772922516, + "learning_rate": 1.4191552622418747e-06, + "loss": 0.2755, + "step": 44618 + }, + { + "epoch": 0.8283733658098269, + "grad_norm": 0.3551013767719269, + "learning_rate": 1.4185563176884708e-06, + "loss": 0.31, + "step": 44620 + }, + { + "epoch": 0.8284104959472455, + "grad_norm": 0.46833574771881104, + "learning_rate": 1.4179574899028125e-06, + "loss": 0.3474, + "step": 44622 + }, + { + "epoch": 0.8284476260846642, + "grad_norm": 0.3563046455383301, + "learning_rate": 1.4173587788930454e-06, + "loss": 0.399, + "step": 44624 + }, + { + "epoch": 0.8284847562220827, + "grad_norm": 0.610089123249054, + "learning_rate": 1.4167601846673172e-06, + "loss": 0.1886, + "step": 44626 + }, + { + "epoch": 0.8285218863595014, + "grad_norm": 0.3741333782672882, + "learning_rate": 1.4161617072337764e-06, + "loss": 0.315, + "step": 44628 + }, + { + "epoch": 0.8285590164969201, + "grad_norm": 0.3993767201900482, + "learning_rate": 1.4155633466005602e-06, + "loss": 0.1996, + "step": 44630 + }, + { + "epoch": 0.8285961466343387, + "grad_norm": 0.3678515553474426, + "learning_rate": 1.414965102775816e-06, + "loss": 0.3375, + "step": 44632 + }, + { + "epoch": 0.8286332767717574, + "grad_norm": 0.27323460578918457, + "learning_rate": 1.4143669757676792e-06, + "loss": 0.1924, + "step": 44634 + }, + { + "epoch": 0.8286704069091759, + "grad_norm": 0.3332717716693878, + "learning_rate": 1.4137689655842913e-06, + "loss": 0.2877, + "step": 44636 + }, + { + "epoch": 0.8287075370465946, + "grad_norm": 0.5685442686080933, + "learning_rate": 1.4131710722337889e-06, + "loss": 0.3571, + "step": 44638 + }, + { + "epoch": 0.8287446671840133, + "grad_norm": 0.4489130675792694, + "learning_rate": 1.4125732957243077e-06, + "loss": 0.2052, + "step": 44640 + }, + { + "epoch": 0.8287817973214319, + "grad_norm": 0.3397492468357086, + "learning_rate": 1.4119756360639835e-06, + "loss": 0.4944, + "step": 44642 + }, + { + "epoch": 0.8288189274588506, + "grad_norm": 0.39863723516464233, + "learning_rate": 1.4113780932609444e-06, + "loss": 0.384, + "step": 44644 + }, + { + "epoch": 0.8288560575962691, + "grad_norm": 0.32967403531074524, + "learning_rate": 1.4107806673233237e-06, + "loss": 0.2859, + "step": 44646 + }, + { + "epoch": 0.8288931877336878, + "grad_norm": 0.42263832688331604, + "learning_rate": 1.4101833582592506e-06, + "loss": 0.2066, + "step": 44648 + }, + { + "epoch": 0.8289303178711065, + "grad_norm": 0.4866623878479004, + "learning_rate": 1.4095861660768551e-06, + "loss": 0.1208, + "step": 44650 + }, + { + "epoch": 0.8289674480085251, + "grad_norm": 0.42383044958114624, + "learning_rate": 1.4089890907842574e-06, + "loss": 0.449, + "step": 44652 + }, + { + "epoch": 0.8290045781459437, + "grad_norm": 0.31940940022468567, + "learning_rate": 1.4083921323895888e-06, + "loss": 0.323, + "step": 44654 + }, + { + "epoch": 0.8290417082833623, + "grad_norm": 0.4039912521839142, + "learning_rate": 1.4077952909009652e-06, + "loss": 0.2996, + "step": 44656 + }, + { + "epoch": 0.829078838420781, + "grad_norm": 0.3600819408893585, + "learning_rate": 1.4071985663265108e-06, + "loss": 0.2401, + "step": 44658 + }, + { + "epoch": 0.8291159685581996, + "grad_norm": 0.32907411456108093, + "learning_rate": 1.4066019586743461e-06, + "loss": 0.3753, + "step": 44660 + }, + { + "epoch": 0.8291530986956183, + "grad_norm": 0.44636255502700806, + "learning_rate": 1.406005467952588e-06, + "loss": 0.133, + "step": 44662 + }, + { + "epoch": 0.829190228833037, + "grad_norm": 0.5170493125915527, + "learning_rate": 1.4054090941693544e-06, + "loss": 0.4588, + "step": 44664 + }, + { + "epoch": 0.8292273589704555, + "grad_norm": 0.30385977029800415, + "learning_rate": 1.4048128373327585e-06, + "loss": 0.2286, + "step": 44666 + }, + { + "epoch": 0.8292644891078742, + "grad_norm": 0.3809641897678375, + "learning_rate": 1.4042166974509164e-06, + "loss": 0.1742, + "step": 44668 + }, + { + "epoch": 0.8293016192452928, + "grad_norm": 0.3599933981895447, + "learning_rate": 1.4036206745319359e-06, + "loss": 0.2773, + "step": 44670 + }, + { + "epoch": 0.8293387493827115, + "grad_norm": 0.34669196605682373, + "learning_rate": 1.4030247685839316e-06, + "loss": 0.1939, + "step": 44672 + }, + { + "epoch": 0.8293758795201301, + "grad_norm": 0.31091058254241943, + "learning_rate": 1.402428979615006e-06, + "loss": 0.2555, + "step": 44674 + }, + { + "epoch": 0.8294130096575487, + "grad_norm": 0.4375550150871277, + "learning_rate": 1.4018333076332703e-06, + "loss": 0.2974, + "step": 44676 + }, + { + "epoch": 0.8294501397949674, + "grad_norm": 0.3333878815174103, + "learning_rate": 1.401237752646828e-06, + "loss": 0.0489, + "step": 44678 + }, + { + "epoch": 0.829487269932386, + "grad_norm": 0.17322520911693573, + "learning_rate": 1.400642314663786e-06, + "loss": 0.2441, + "step": 44680 + }, + { + "epoch": 0.8295244000698047, + "grad_norm": 0.41641905903816223, + "learning_rate": 1.4000469936922424e-06, + "loss": 0.439, + "step": 44682 + }, + { + "epoch": 0.8295615302072233, + "grad_norm": 0.3629946708679199, + "learning_rate": 1.3994517897402981e-06, + "loss": 0.1693, + "step": 44684 + }, + { + "epoch": 0.8295986603446419, + "grad_norm": 0.3358135521411896, + "learning_rate": 1.398856702816055e-06, + "loss": 0.1988, + "step": 44686 + }, + { + "epoch": 0.8296357904820606, + "grad_norm": 0.49397900700569153, + "learning_rate": 1.398261732927607e-06, + "loss": 0.3042, + "step": 44688 + }, + { + "epoch": 0.8296729206194792, + "grad_norm": 0.22138436138629913, + "learning_rate": 1.3976668800830528e-06, + "loss": 0.2268, + "step": 44690 + }, + { + "epoch": 0.8297100507568979, + "grad_norm": 0.29525843262672424, + "learning_rate": 1.3970721442904877e-06, + "loss": 0.1323, + "step": 44692 + }, + { + "epoch": 0.8297471808943165, + "grad_norm": 0.5254817008972168, + "learning_rate": 1.396477525558001e-06, + "loss": 0.4727, + "step": 44694 + }, + { + "epoch": 0.8297843110317351, + "grad_norm": 1.223310112953186, + "learning_rate": 1.3958830238936826e-06, + "loss": 0.0997, + "step": 44696 + }, + { + "epoch": 0.8298214411691538, + "grad_norm": 0.25934192538261414, + "learning_rate": 1.3952886393056254e-06, + "loss": 0.2234, + "step": 44698 + }, + { + "epoch": 0.8298585713065724, + "grad_norm": 0.3460824489593506, + "learning_rate": 1.3946943718019134e-06, + "loss": 0.1043, + "step": 44700 + }, + { + "epoch": 0.829895701443991, + "grad_norm": 0.29451557993888855, + "learning_rate": 1.3941002213906374e-06, + "loss": 0.3829, + "step": 44702 + }, + { + "epoch": 0.8299328315814097, + "grad_norm": 0.4082031846046448, + "learning_rate": 1.3935061880798806e-06, + "loss": 0.3364, + "step": 44704 + }, + { + "epoch": 0.8299699617188283, + "grad_norm": 0.5123321413993835, + "learning_rate": 1.3929122718777233e-06, + "loss": 0.2749, + "step": 44706 + }, + { + "epoch": 0.830007091856247, + "grad_norm": 0.3799402117729187, + "learning_rate": 1.392318472792249e-06, + "loss": 0.2888, + "step": 44708 + }, + { + "epoch": 0.8300442219936656, + "grad_norm": 0.3698118329048157, + "learning_rate": 1.3917247908315368e-06, + "loss": 0.3204, + "step": 44710 + }, + { + "epoch": 0.8300813521310842, + "grad_norm": 0.7637192606925964, + "learning_rate": 1.3911312260036658e-06, + "loss": 0.4519, + "step": 44712 + }, + { + "epoch": 0.8301184822685028, + "grad_norm": 0.36693376302719116, + "learning_rate": 1.390537778316714e-06, + "loss": 0.401, + "step": 44714 + }, + { + "epoch": 0.8301556124059215, + "grad_norm": 0.24672552943229675, + "learning_rate": 1.3899444477787528e-06, + "loss": 0.4109, + "step": 44716 + }, + { + "epoch": 0.8301927425433402, + "grad_norm": 0.2971721291542053, + "learning_rate": 1.38935123439786e-06, + "loss": 0.2368, + "step": 44718 + }, + { + "epoch": 0.8302298726807588, + "grad_norm": 0.21379490196704865, + "learning_rate": 1.3887581381821025e-06, + "loss": 0.1113, + "step": 44720 + }, + { + "epoch": 0.8302670028181774, + "grad_norm": 0.3707433342933655, + "learning_rate": 1.388165159139553e-06, + "loss": 0.319, + "step": 44722 + }, + { + "epoch": 0.830304132955596, + "grad_norm": 0.353485643863678, + "learning_rate": 1.3875722972782802e-06, + "loss": 0.2927, + "step": 44724 + }, + { + "epoch": 0.8303412630930147, + "grad_norm": 0.4065439999103546, + "learning_rate": 1.3869795526063512e-06, + "loss": 0.2746, + "step": 44726 + }, + { + "epoch": 0.8303783932304334, + "grad_norm": 0.37699365615844727, + "learning_rate": 1.3863869251318319e-06, + "loss": 0.1144, + "step": 44728 + }, + { + "epoch": 0.830415523367852, + "grad_norm": 0.3291468918323517, + "learning_rate": 1.3857944148627878e-06, + "loss": 0.3053, + "step": 44730 + }, + { + "epoch": 0.8304526535052706, + "grad_norm": 0.3150242567062378, + "learning_rate": 1.385202021807277e-06, + "loss": 0.2804, + "step": 44732 + }, + { + "epoch": 0.8304897836426892, + "grad_norm": 0.5001360177993774, + "learning_rate": 1.3846097459733632e-06, + "loss": 0.2991, + "step": 44734 + }, + { + "epoch": 0.8305269137801079, + "grad_norm": 0.3275878429412842, + "learning_rate": 1.3840175873691054e-06, + "loss": 0.0844, + "step": 44736 + }, + { + "epoch": 0.8305640439175266, + "grad_norm": 0.5636890530586243, + "learning_rate": 1.3834255460025592e-06, + "loss": 0.1857, + "step": 44738 + }, + { + "epoch": 0.8306011740549452, + "grad_norm": 0.3712332546710968, + "learning_rate": 1.3828336218817816e-06, + "loss": 0.2335, + "step": 44740 + }, + { + "epoch": 0.8306383041923638, + "grad_norm": 0.2726050317287445, + "learning_rate": 1.3822418150148286e-06, + "loss": 0.2746, + "step": 44742 + }, + { + "epoch": 0.8306754343297824, + "grad_norm": 0.5958617925643921, + "learning_rate": 1.381650125409749e-06, + "loss": 0.3704, + "step": 44744 + }, + { + "epoch": 0.8307125644672011, + "grad_norm": 0.4265875220298767, + "learning_rate": 1.3810585530745968e-06, + "loss": 0.1793, + "step": 44746 + }, + { + "epoch": 0.8307496946046198, + "grad_norm": 0.3158298134803772, + "learning_rate": 1.3804670980174218e-06, + "loss": 0.2676, + "step": 44748 + }, + { + "epoch": 0.8307868247420384, + "grad_norm": 0.35127800703048706, + "learning_rate": 1.3798757602462698e-06, + "loss": 0.2542, + "step": 44750 + }, + { + "epoch": 0.830823954879457, + "grad_norm": 0.4771256148815155, + "learning_rate": 1.379284539769189e-06, + "loss": 0.4213, + "step": 44752 + }, + { + "epoch": 0.8308610850168756, + "grad_norm": 0.3518950343132019, + "learning_rate": 1.378693436594225e-06, + "loss": 0.1961, + "step": 44754 + }, + { + "epoch": 0.8308982151542943, + "grad_norm": 0.3614540696144104, + "learning_rate": 1.3781024507294184e-06, + "loss": 0.1611, + "step": 44756 + }, + { + "epoch": 0.830935345291713, + "grad_norm": 0.31773507595062256, + "learning_rate": 1.3775115821828132e-06, + "loss": 0.3878, + "step": 44758 + }, + { + "epoch": 0.8309724754291316, + "grad_norm": 0.25153690576553345, + "learning_rate": 1.3769208309624472e-06, + "loss": 0.4208, + "step": 44760 + }, + { + "epoch": 0.8310096055665502, + "grad_norm": 0.4551907479763031, + "learning_rate": 1.3763301970763577e-06, + "loss": 0.2535, + "step": 44762 + }, + { + "epoch": 0.8310467357039688, + "grad_norm": 0.4253501892089844, + "learning_rate": 1.375739680532585e-06, + "loss": 0.1408, + "step": 44764 + }, + { + "epoch": 0.8310838658413875, + "grad_norm": 0.34814268350601196, + "learning_rate": 1.375149281339162e-06, + "loss": 0.3096, + "step": 44766 + }, + { + "epoch": 0.8311209959788061, + "grad_norm": 0.3495209515094757, + "learning_rate": 1.3745589995041241e-06, + "loss": 0.1151, + "step": 44768 + }, + { + "epoch": 0.8311581261162247, + "grad_norm": 0.3733285665512085, + "learning_rate": 1.3739688350355007e-06, + "loss": 0.0974, + "step": 44770 + }, + { + "epoch": 0.8311952562536434, + "grad_norm": 0.33342117071151733, + "learning_rate": 1.373378787941324e-06, + "loss": 0.2385, + "step": 44772 + }, + { + "epoch": 0.831232386391062, + "grad_norm": 0.5446091294288635, + "learning_rate": 1.3727888582296211e-06, + "loss": 0.2575, + "step": 44774 + }, + { + "epoch": 0.8312695165284807, + "grad_norm": 0.27492138743400574, + "learning_rate": 1.372199045908421e-06, + "loss": 0.1555, + "step": 44776 + }, + { + "epoch": 0.8313066466658993, + "grad_norm": 0.3746403157711029, + "learning_rate": 1.3716093509857509e-06, + "loss": 0.5507, + "step": 44778 + }, + { + "epoch": 0.831343776803318, + "grad_norm": 0.41709232330322266, + "learning_rate": 1.3710197734696329e-06, + "loss": 0.2431, + "step": 44780 + }, + { + "epoch": 0.8313809069407366, + "grad_norm": 0.36673423647880554, + "learning_rate": 1.3704303133680862e-06, + "loss": 0.2332, + "step": 44782 + }, + { + "epoch": 0.8314180370781552, + "grad_norm": 0.46338629722595215, + "learning_rate": 1.3698409706891347e-06, + "loss": 0.3013, + "step": 44784 + }, + { + "epoch": 0.8314551672155739, + "grad_norm": 0.46083107590675354, + "learning_rate": 1.3692517454407971e-06, + "loss": 0.1703, + "step": 44786 + }, + { + "epoch": 0.8314922973529925, + "grad_norm": 0.4763292372226715, + "learning_rate": 1.3686626376310908e-06, + "loss": 0.2822, + "step": 44788 + }, + { + "epoch": 0.8315294274904111, + "grad_norm": 0.334837943315506, + "learning_rate": 1.3680736472680322e-06, + "loss": 0.1577, + "step": 44790 + }, + { + "epoch": 0.8315665576278298, + "grad_norm": 0.28181159496307373, + "learning_rate": 1.3674847743596365e-06, + "loss": 0.1855, + "step": 44792 + }, + { + "epoch": 0.8316036877652484, + "grad_norm": 0.2867758274078369, + "learning_rate": 1.366896018913917e-06, + "loss": 0.143, + "step": 44794 + }, + { + "epoch": 0.8316408179026671, + "grad_norm": 0.7849120497703552, + "learning_rate": 1.3663073809388816e-06, + "loss": 0.2942, + "step": 44796 + }, + { + "epoch": 0.8316779480400857, + "grad_norm": 0.38695085048675537, + "learning_rate": 1.365718860442542e-06, + "loss": 0.2564, + "step": 44798 + }, + { + "epoch": 0.8317150781775043, + "grad_norm": 0.2648868262767792, + "learning_rate": 1.3651304574329083e-06, + "loss": 0.2921, + "step": 44800 + }, + { + "epoch": 0.831752208314923, + "grad_norm": 0.5481866598129272, + "learning_rate": 1.364542171917982e-06, + "loss": 0.1832, + "step": 44802 + }, + { + "epoch": 0.8317893384523416, + "grad_norm": 0.12923914194107056, + "learning_rate": 1.3639540039057708e-06, + "loss": 0.1621, + "step": 44804 + }, + { + "epoch": 0.8318264685897603, + "grad_norm": 0.3016194999217987, + "learning_rate": 1.3633659534042797e-06, + "loss": 0.2537, + "step": 44806 + }, + { + "epoch": 0.8318635987271789, + "grad_norm": 0.4570295810699463, + "learning_rate": 1.3627780204215069e-06, + "loss": 0.2145, + "step": 44808 + }, + { + "epoch": 0.8319007288645975, + "grad_norm": 1.4479295015335083, + "learning_rate": 1.3621902049654523e-06, + "loss": 0.2671, + "step": 44810 + }, + { + "epoch": 0.8319378590020161, + "grad_norm": 0.46774137020111084, + "learning_rate": 1.3616025070441163e-06, + "loss": 0.2851, + "step": 44812 + }, + { + "epoch": 0.8319749891394348, + "grad_norm": 0.6291539669036865, + "learning_rate": 1.361014926665496e-06, + "loss": 0.2924, + "step": 44814 + }, + { + "epoch": 0.8320121192768535, + "grad_norm": 0.35860252380371094, + "learning_rate": 1.3604274638375846e-06, + "loss": 0.3007, + "step": 44816 + }, + { + "epoch": 0.832049249414272, + "grad_norm": 0.24057145416736603, + "learning_rate": 1.3598401185683806e-06, + "loss": 0.3104, + "step": 44818 + }, + { + "epoch": 0.8320863795516907, + "grad_norm": 0.297654926776886, + "learning_rate": 1.3592528908658687e-06, + "loss": 0.2417, + "step": 44820 + }, + { + "epoch": 0.8321235096891093, + "grad_norm": 0.40400195121765137, + "learning_rate": 1.3586657807380454e-06, + "loss": 0.2908, + "step": 44822 + }, + { + "epoch": 0.832160639826528, + "grad_norm": 0.3872624337673187, + "learning_rate": 1.3580787881928958e-06, + "loss": 0.346, + "step": 44824 + }, + { + "epoch": 0.8321977699639467, + "grad_norm": 0.5213940739631653, + "learning_rate": 1.357491913238408e-06, + "loss": 0.3703, + "step": 44826 + }, + { + "epoch": 0.8322349001013652, + "grad_norm": 0.6502837538719177, + "learning_rate": 1.3569051558825675e-06, + "loss": 0.2462, + "step": 44828 + }, + { + "epoch": 0.8322720302387839, + "grad_norm": 0.47773054242134094, + "learning_rate": 1.3563185161333603e-06, + "loss": 0.3946, + "step": 44830 + }, + { + "epoch": 0.8323091603762025, + "grad_norm": 0.436495840549469, + "learning_rate": 1.3557319939987657e-06, + "loss": 0.1567, + "step": 44832 + }, + { + "epoch": 0.8323462905136212, + "grad_norm": 0.2304411232471466, + "learning_rate": 1.355145589486767e-06, + "loss": 0.327, + "step": 44834 + }, + { + "epoch": 0.8323834206510399, + "grad_norm": 0.3646831810474396, + "learning_rate": 1.3545593026053417e-06, + "loss": 0.2137, + "step": 44836 + }, + { + "epoch": 0.8324205507884584, + "grad_norm": 0.2903871536254883, + "learning_rate": 1.3539731333624684e-06, + "loss": 0.2467, + "step": 44838 + }, + { + "epoch": 0.8324576809258771, + "grad_norm": 0.46289902925491333, + "learning_rate": 1.3533870817661242e-06, + "loss": 0.3503, + "step": 44840 + }, + { + "epoch": 0.8324948110632957, + "grad_norm": 0.24181193113327026, + "learning_rate": 1.3528011478242808e-06, + "loss": 0.1209, + "step": 44842 + }, + { + "epoch": 0.8325319412007144, + "grad_norm": 0.5549391508102417, + "learning_rate": 1.352215331544915e-06, + "loss": 0.2863, + "step": 44844 + }, + { + "epoch": 0.8325690713381331, + "grad_norm": 0.3590112030506134, + "learning_rate": 1.3516296329359924e-06, + "loss": 0.0841, + "step": 44846 + }, + { + "epoch": 0.8326062014755516, + "grad_norm": 0.2760082483291626, + "learning_rate": 1.3510440520054858e-06, + "loss": 0.3155, + "step": 44848 + }, + { + "epoch": 0.8326433316129703, + "grad_norm": 0.4257012903690338, + "learning_rate": 1.3504585887613631e-06, + "loss": 0.4746, + "step": 44850 + }, + { + "epoch": 0.8326804617503889, + "grad_norm": 0.43495070934295654, + "learning_rate": 1.3498732432115914e-06, + "loss": 0.167, + "step": 44852 + }, + { + "epoch": 0.8327175918878076, + "grad_norm": 0.4181768596172333, + "learning_rate": 1.3492880153641342e-06, + "loss": 0.3122, + "step": 44854 + }, + { + "epoch": 0.8327547220252263, + "grad_norm": 0.44027766585350037, + "learning_rate": 1.3487029052269563e-06, + "loss": 0.1784, + "step": 44856 + }, + { + "epoch": 0.8327918521626448, + "grad_norm": 0.3995511829853058, + "learning_rate": 1.348117912808018e-06, + "loss": 0.2844, + "step": 44858 + }, + { + "epoch": 0.8328289823000635, + "grad_norm": 0.4001339077949524, + "learning_rate": 1.3475330381152784e-06, + "loss": 0.2239, + "step": 44860 + }, + { + "epoch": 0.8328661124374821, + "grad_norm": 0.5202770829200745, + "learning_rate": 1.3469482811566993e-06, + "loss": 0.2926, + "step": 44862 + }, + { + "epoch": 0.8329032425749008, + "grad_norm": 0.5506640672683716, + "learning_rate": 1.3463636419402327e-06, + "loss": 0.3624, + "step": 44864 + }, + { + "epoch": 0.8329403727123194, + "grad_norm": 0.5263121128082275, + "learning_rate": 1.3457791204738368e-06, + "loss": 0.1907, + "step": 44866 + }, + { + "epoch": 0.832977502849738, + "grad_norm": 0.28308677673339844, + "learning_rate": 1.3451947167654666e-06, + "loss": 0.3358, + "step": 44868 + }, + { + "epoch": 0.8330146329871567, + "grad_norm": 0.32979995012283325, + "learning_rate": 1.3446104308230701e-06, + "loss": 0.4196, + "step": 44870 + }, + { + "epoch": 0.8330517631245753, + "grad_norm": 0.34685221314430237, + "learning_rate": 1.3440262626545997e-06, + "loss": 0.3335, + "step": 44872 + }, + { + "epoch": 0.833088893261994, + "grad_norm": 0.37179824709892273, + "learning_rate": 1.3434422122680046e-06, + "loss": 0.4498, + "step": 44874 + }, + { + "epoch": 0.8331260233994126, + "grad_norm": 0.23270311951637268, + "learning_rate": 1.3428582796712309e-06, + "loss": 0.3719, + "step": 44876 + }, + { + "epoch": 0.8331631535368312, + "grad_norm": 0.30043721199035645, + "learning_rate": 1.3422744648722251e-06, + "loss": 0.3748, + "step": 44878 + }, + { + "epoch": 0.8332002836742499, + "grad_norm": 0.5993932485580444, + "learning_rate": 1.3416907678789314e-06, + "loss": 0.4098, + "step": 44880 + }, + { + "epoch": 0.8332374138116685, + "grad_norm": 0.20825675129890442, + "learning_rate": 1.3411071886992932e-06, + "loss": 0.161, + "step": 44882 + }, + { + "epoch": 0.8332745439490872, + "grad_norm": 0.4286157488822937, + "learning_rate": 1.3405237273412485e-06, + "loss": 0.2314, + "step": 44884 + }, + { + "epoch": 0.8333116740865057, + "grad_norm": 0.3126898407936096, + "learning_rate": 1.3399403838127388e-06, + "loss": 0.192, + "step": 44886 + }, + { + "epoch": 0.8333488042239244, + "grad_norm": 0.28187185525894165, + "learning_rate": 1.339357158121699e-06, + "loss": 0.2808, + "step": 44888 + }, + { + "epoch": 0.8333859343613431, + "grad_norm": 0.3941890299320221, + "learning_rate": 1.3387740502760672e-06, + "loss": 0.2257, + "step": 44890 + }, + { + "epoch": 0.8334230644987617, + "grad_norm": 0.5147727131843567, + "learning_rate": 1.3381910602837767e-06, + "loss": 0.2975, + "step": 44892 + }, + { + "epoch": 0.8334601946361804, + "grad_norm": 0.37041109800338745, + "learning_rate": 1.3376081881527626e-06, + "loss": 0.2131, + "step": 44894 + }, + { + "epoch": 0.833497324773599, + "grad_norm": 0.2776052951812744, + "learning_rate": 1.3370254338909528e-06, + "loss": 0.2571, + "step": 44896 + }, + { + "epoch": 0.8335344549110176, + "grad_norm": 0.39083975553512573, + "learning_rate": 1.3364427975062777e-06, + "loss": 0.2832, + "step": 44898 + }, + { + "epoch": 0.8335715850484363, + "grad_norm": 0.5486248731613159, + "learning_rate": 1.3358602790066655e-06, + "loss": 0.3231, + "step": 44900 + }, + { + "epoch": 0.8336087151858549, + "grad_norm": 0.6890703439712524, + "learning_rate": 1.335277878400043e-06, + "loss": 0.3461, + "step": 44902 + }, + { + "epoch": 0.8336458453232736, + "grad_norm": 0.46512311697006226, + "learning_rate": 1.3346955956943363e-06, + "loss": 0.2562, + "step": 44904 + }, + { + "epoch": 0.8336829754606921, + "grad_norm": 0.6313436627388, + "learning_rate": 1.3341134308974658e-06, + "loss": 0.3606, + "step": 44906 + }, + { + "epoch": 0.8337201055981108, + "grad_norm": 0.5636327862739563, + "learning_rate": 1.3335313840173559e-06, + "loss": 0.2092, + "step": 44908 + }, + { + "epoch": 0.8337572357355295, + "grad_norm": 0.30125662684440613, + "learning_rate": 1.3329494550619227e-06, + "loss": 0.2961, + "step": 44910 + }, + { + "epoch": 0.8337943658729481, + "grad_norm": 0.5612263679504395, + "learning_rate": 1.3323676440390864e-06, + "loss": 0.3958, + "step": 44912 + }, + { + "epoch": 0.8338314960103668, + "grad_norm": 0.3315229117870331, + "learning_rate": 1.3317859509567643e-06, + "loss": 0.3512, + "step": 44914 + }, + { + "epoch": 0.8338686261477853, + "grad_norm": 0.4897848665714264, + "learning_rate": 1.331204375822871e-06, + "loss": 0.1967, + "step": 44916 + }, + { + "epoch": 0.833905756285204, + "grad_norm": 0.38958385586738586, + "learning_rate": 1.33062291864532e-06, + "loss": 0.1435, + "step": 44918 + }, + { + "epoch": 0.8339428864226226, + "grad_norm": 0.2965165674686432, + "learning_rate": 1.3300415794320255e-06, + "loss": 0.1852, + "step": 44920 + }, + { + "epoch": 0.8339800165600413, + "grad_norm": 0.7399211525917053, + "learning_rate": 1.3294603581908938e-06, + "loss": 0.2411, + "step": 44922 + }, + { + "epoch": 0.83401714669746, + "grad_norm": 0.2486562579870224, + "learning_rate": 1.3288792549298347e-06, + "loss": 0.1999, + "step": 44924 + }, + { + "epoch": 0.8340542768348785, + "grad_norm": 0.35691970586776733, + "learning_rate": 1.328298269656758e-06, + "loss": 0.3243, + "step": 44926 + }, + { + "epoch": 0.8340914069722972, + "grad_norm": 0.48722851276397705, + "learning_rate": 1.3277174023795659e-06, + "loss": 0.2818, + "step": 44928 + }, + { + "epoch": 0.8341285371097158, + "grad_norm": 0.46762779355049133, + "learning_rate": 1.3271366531061625e-06, + "loss": 0.2628, + "step": 44930 + }, + { + "epoch": 0.8341656672471345, + "grad_norm": 0.40678316354751587, + "learning_rate": 1.326556021844454e-06, + "loss": 0.3499, + "step": 44932 + }, + { + "epoch": 0.8342027973845532, + "grad_norm": 0.47252246737480164, + "learning_rate": 1.325975508602335e-06, + "loss": 0.4584, + "step": 44934 + }, + { + "epoch": 0.8342399275219717, + "grad_norm": 0.506563663482666, + "learning_rate": 1.3253951133877075e-06, + "loss": 0.256, + "step": 44936 + }, + { + "epoch": 0.8342770576593904, + "grad_norm": 0.3010956943035126, + "learning_rate": 1.32481483620847e-06, + "loss": 0.3433, + "step": 44938 + }, + { + "epoch": 0.834314187796809, + "grad_norm": 0.5088716745376587, + "learning_rate": 1.3242346770725167e-06, + "loss": 0.2732, + "step": 44940 + }, + { + "epoch": 0.8343513179342277, + "grad_norm": 0.34784966707229614, + "learning_rate": 1.3236546359877433e-06, + "loss": 0.1205, + "step": 44942 + }, + { + "epoch": 0.8343884480716464, + "grad_norm": 0.4532397985458374, + "learning_rate": 1.3230747129620425e-06, + "loss": 0.3476, + "step": 44944 + }, + { + "epoch": 0.8344255782090649, + "grad_norm": 0.28261974453926086, + "learning_rate": 1.3224949080033034e-06, + "loss": 0.2151, + "step": 44946 + }, + { + "epoch": 0.8344627083464836, + "grad_norm": 0.5063139200210571, + "learning_rate": 1.3219152211194186e-06, + "loss": 0.3357, + "step": 44948 + }, + { + "epoch": 0.8344998384839022, + "grad_norm": 0.5508877038955688, + "learning_rate": 1.3213356523182708e-06, + "loss": 0.2917, + "step": 44950 + }, + { + "epoch": 0.8345369686213209, + "grad_norm": 0.5083823800086975, + "learning_rate": 1.3207562016077501e-06, + "loss": 0.3175, + "step": 44952 + }, + { + "epoch": 0.8345740987587396, + "grad_norm": 0.7302245497703552, + "learning_rate": 1.3201768689957396e-06, + "loss": 0.191, + "step": 44954 + }, + { + "epoch": 0.8346112288961581, + "grad_norm": 0.35993441939353943, + "learning_rate": 1.3195976544901235e-06, + "loss": 0.2678, + "step": 44956 + }, + { + "epoch": 0.8346483590335768, + "grad_norm": 0.5507876873016357, + "learning_rate": 1.3190185580987835e-06, + "loss": 0.3134, + "step": 44958 + }, + { + "epoch": 0.8346854891709954, + "grad_norm": 0.328533411026001, + "learning_rate": 1.3184395798295968e-06, + "loss": 0.4651, + "step": 44960 + }, + { + "epoch": 0.8347226193084141, + "grad_norm": 0.39272600412368774, + "learning_rate": 1.3178607196904437e-06, + "loss": 0.2403, + "step": 44962 + }, + { + "epoch": 0.8347597494458326, + "grad_norm": 0.35391271114349365, + "learning_rate": 1.3172819776891988e-06, + "loss": 0.3794, + "step": 44964 + }, + { + "epoch": 0.8347968795832513, + "grad_norm": 0.5653648972511292, + "learning_rate": 1.3167033538337392e-06, + "loss": 0.1661, + "step": 44966 + }, + { + "epoch": 0.83483400972067, + "grad_norm": 0.3520868122577667, + "learning_rate": 1.31612484813194e-06, + "loss": 0.3469, + "step": 44968 + }, + { + "epoch": 0.8348711398580886, + "grad_norm": 0.4825683534145355, + "learning_rate": 1.3155464605916702e-06, + "loss": 0.2464, + "step": 44970 + }, + { + "epoch": 0.8349082699955073, + "grad_norm": 0.3775075376033783, + "learning_rate": 1.3149681912207978e-06, + "loss": 0.3343, + "step": 44972 + }, + { + "epoch": 0.8349454001329258, + "grad_norm": 0.5979307889938354, + "learning_rate": 1.3143900400271937e-06, + "loss": 0.1849, + "step": 44974 + }, + { + "epoch": 0.8349825302703445, + "grad_norm": 0.5886656045913696, + "learning_rate": 1.3138120070187254e-06, + "loss": 0.318, + "step": 44976 + }, + { + "epoch": 0.8350196604077632, + "grad_norm": 0.44074031710624695, + "learning_rate": 1.3132340922032561e-06, + "loss": 0.3487, + "step": 44978 + }, + { + "epoch": 0.8350567905451818, + "grad_norm": 0.4174761176109314, + "learning_rate": 1.3126562955886524e-06, + "loss": 0.1454, + "step": 44980 + }, + { + "epoch": 0.8350939206826005, + "grad_norm": 0.4605412781238556, + "learning_rate": 1.3120786171827759e-06, + "loss": 0.2476, + "step": 44982 + }, + { + "epoch": 0.835131050820019, + "grad_norm": 0.29947465658187866, + "learning_rate": 1.311501056993485e-06, + "loss": 0.2662, + "step": 44984 + }, + { + "epoch": 0.8351681809574377, + "grad_norm": 0.4422217607498169, + "learning_rate": 1.3109236150286386e-06, + "loss": 0.382, + "step": 44986 + }, + { + "epoch": 0.8352053110948564, + "grad_norm": 0.5025306940078735, + "learning_rate": 1.310346291296095e-06, + "loss": 0.1374, + "step": 44988 + }, + { + "epoch": 0.835242441232275, + "grad_norm": 0.43528711795806885, + "learning_rate": 1.3097690858037126e-06, + "loss": 0.1877, + "step": 44990 + }, + { + "epoch": 0.8352795713696937, + "grad_norm": 0.2730690836906433, + "learning_rate": 1.3091919985593404e-06, + "loss": 0.2233, + "step": 44992 + }, + { + "epoch": 0.8353167015071122, + "grad_norm": 0.34336599707603455, + "learning_rate": 1.3086150295708355e-06, + "loss": 0.2124, + "step": 44994 + }, + { + "epoch": 0.8353538316445309, + "grad_norm": 0.37034472823143005, + "learning_rate": 1.3080381788460438e-06, + "loss": 0.272, + "step": 44996 + }, + { + "epoch": 0.8353909617819496, + "grad_norm": 0.31635940074920654, + "learning_rate": 1.3074614463928171e-06, + "loss": 0.2578, + "step": 44998 + }, + { + "epoch": 0.8354280919193682, + "grad_norm": 0.2950815260410309, + "learning_rate": 1.306884832219003e-06, + "loss": 0.3134, + "step": 45000 + }, + { + "epoch": 0.8354652220567869, + "grad_norm": 0.4889046549797058, + "learning_rate": 1.306308336332448e-06, + "loss": 0.2546, + "step": 45002 + }, + { + "epoch": 0.8355023521942054, + "grad_norm": 0.43934768438339233, + "learning_rate": 1.3057319587409956e-06, + "loss": 0.122, + "step": 45004 + }, + { + "epoch": 0.8355394823316241, + "grad_norm": 0.43278878927230835, + "learning_rate": 1.3051556994524883e-06, + "loss": 0.3224, + "step": 45006 + }, + { + "epoch": 0.8355766124690428, + "grad_norm": 0.4403007924556732, + "learning_rate": 1.3045795584747712e-06, + "loss": 0.237, + "step": 45008 + }, + { + "epoch": 0.8356137426064614, + "grad_norm": 0.7316752076148987, + "learning_rate": 1.3040035358156766e-06, + "loss": 0.1876, + "step": 45010 + }, + { + "epoch": 0.8356508727438801, + "grad_norm": 0.5798961520195007, + "learning_rate": 1.3034276314830496e-06, + "loss": 0.1681, + "step": 45012 + }, + { + "epoch": 0.8356880028812986, + "grad_norm": 0.3139997124671936, + "learning_rate": 1.3028518454847216e-06, + "loss": 0.2349, + "step": 45014 + }, + { + "epoch": 0.8357251330187173, + "grad_norm": 0.34415721893310547, + "learning_rate": 1.3022761778285275e-06, + "loss": 0.1386, + "step": 45016 + }, + { + "epoch": 0.8357622631561359, + "grad_norm": 0.4023856520652771, + "learning_rate": 1.3017006285223033e-06, + "loss": 0.3346, + "step": 45018 + }, + { + "epoch": 0.8357993932935546, + "grad_norm": 0.5252102017402649, + "learning_rate": 1.3011251975738815e-06, + "loss": 0.4141, + "step": 45020 + }, + { + "epoch": 0.8358365234309733, + "grad_norm": 0.43758922815322876, + "learning_rate": 1.3005498849910857e-06, + "loss": 0.1681, + "step": 45022 + }, + { + "epoch": 0.8358736535683918, + "grad_norm": 0.3989613354206085, + "learning_rate": 1.2999746907817501e-06, + "loss": 0.3502, + "step": 45024 + }, + { + "epoch": 0.8359107837058105, + "grad_norm": 0.4268325865268707, + "learning_rate": 1.299399614953698e-06, + "loss": 0.245, + "step": 45026 + }, + { + "epoch": 0.8359479138432291, + "grad_norm": 0.25815847516059875, + "learning_rate": 1.2988246575147567e-06, + "loss": 0.1612, + "step": 45028 + }, + { + "epoch": 0.8359850439806478, + "grad_norm": 0.31244388222694397, + "learning_rate": 1.2982498184727498e-06, + "loss": 0.2607, + "step": 45030 + }, + { + "epoch": 0.8360221741180665, + "grad_norm": 0.4806983172893524, + "learning_rate": 1.2976750978354968e-06, + "loss": 0.2321, + "step": 45032 + }, + { + "epoch": 0.836059304255485, + "grad_norm": 0.41797515749931335, + "learning_rate": 1.2971004956108213e-06, + "loss": 0.3074, + "step": 45034 + }, + { + "epoch": 0.8360964343929037, + "grad_norm": 0.686809241771698, + "learning_rate": 1.2965260118065382e-06, + "loss": 0.3773, + "step": 45036 + }, + { + "epoch": 0.8361335645303223, + "grad_norm": 0.3613864481449127, + "learning_rate": 1.2959516464304656e-06, + "loss": 0.3173, + "step": 45038 + }, + { + "epoch": 0.836170694667741, + "grad_norm": 0.4357624351978302, + "learning_rate": 1.2953773994904185e-06, + "loss": 0.196, + "step": 45040 + }, + { + "epoch": 0.8362078248051596, + "grad_norm": 0.38213497400283813, + "learning_rate": 1.2948032709942126e-06, + "loss": 0.0931, + "step": 45042 + }, + { + "epoch": 0.8362449549425782, + "grad_norm": 0.29286354780197144, + "learning_rate": 1.2942292609496598e-06, + "loss": 0.2969, + "step": 45044 + }, + { + "epoch": 0.8362820850799969, + "grad_norm": 0.5897377133369446, + "learning_rate": 1.2936553693645714e-06, + "loss": 0.5881, + "step": 45046 + }, + { + "epoch": 0.8363192152174155, + "grad_norm": 0.36343255639076233, + "learning_rate": 1.2930815962467525e-06, + "loss": 0.4188, + "step": 45048 + }, + { + "epoch": 0.8363563453548342, + "grad_norm": 0.33690643310546875, + "learning_rate": 1.292507941604013e-06, + "loss": 0.4453, + "step": 45050 + }, + { + "epoch": 0.8363934754922528, + "grad_norm": 0.35522323846817017, + "learning_rate": 1.2919344054441585e-06, + "loss": 0.3767, + "step": 45052 + }, + { + "epoch": 0.8364306056296714, + "grad_norm": 0.4292003810405731, + "learning_rate": 1.2913609877749956e-06, + "loss": 0.5735, + "step": 45054 + }, + { + "epoch": 0.8364677357670901, + "grad_norm": 0.4304521381855011, + "learning_rate": 1.2907876886043214e-06, + "loss": 0.4891, + "step": 45056 + }, + { + "epoch": 0.8365048659045087, + "grad_norm": 0.3469237983226776, + "learning_rate": 1.290214507939942e-06, + "loss": 0.1458, + "step": 45058 + }, + { + "epoch": 0.8365419960419274, + "grad_norm": 0.6322197318077087, + "learning_rate": 1.2896414457896522e-06, + "loss": 0.3187, + "step": 45060 + }, + { + "epoch": 0.836579126179346, + "grad_norm": 0.7496894001960754, + "learning_rate": 1.2890685021612515e-06, + "loss": 0.2334, + "step": 45062 + }, + { + "epoch": 0.8366162563167646, + "grad_norm": 0.35302454233169556, + "learning_rate": 1.2884956770625368e-06, + "loss": 0.3174, + "step": 45064 + }, + { + "epoch": 0.8366533864541833, + "grad_norm": 0.25830864906311035, + "learning_rate": 1.287922970501302e-06, + "loss": 0.1221, + "step": 45066 + }, + { + "epoch": 0.8366905165916019, + "grad_norm": 0.1809079796075821, + "learning_rate": 1.2873503824853395e-06, + "loss": 0.3441, + "step": 45068 + }, + { + "epoch": 0.8367276467290206, + "grad_norm": 0.2895350456237793, + "learning_rate": 1.2867779130224433e-06, + "loss": 0.3351, + "step": 45070 + }, + { + "epoch": 0.8367647768664391, + "grad_norm": 0.30710190534591675, + "learning_rate": 1.2862055621203985e-06, + "loss": 0.1846, + "step": 45072 + }, + { + "epoch": 0.8368019070038578, + "grad_norm": 0.352020800113678, + "learning_rate": 1.285633329786995e-06, + "loss": 0.4566, + "step": 45074 + }, + { + "epoch": 0.8368390371412765, + "grad_norm": 0.6144376993179321, + "learning_rate": 1.2850612160300213e-06, + "loss": 0.302, + "step": 45076 + }, + { + "epoch": 0.8368761672786951, + "grad_norm": 1.1882773637771606, + "learning_rate": 1.284489220857258e-06, + "loss": 0.2792, + "step": 45078 + }, + { + "epoch": 0.8369132974161138, + "grad_norm": 0.37534239888191223, + "learning_rate": 1.2839173442764907e-06, + "loss": 0.2337, + "step": 45080 + }, + { + "epoch": 0.8369504275535323, + "grad_norm": 0.19285543262958527, + "learning_rate": 1.2833455862955013e-06, + "loss": 0.2437, + "step": 45082 + }, + { + "epoch": 0.836987557690951, + "grad_norm": 0.38733842968940735, + "learning_rate": 1.28277394692207e-06, + "loss": 0.4651, + "step": 45084 + }, + { + "epoch": 0.8370246878283697, + "grad_norm": 0.5257609486579895, + "learning_rate": 1.2822024261639721e-06, + "loss": 0.1413, + "step": 45086 + }, + { + "epoch": 0.8370618179657883, + "grad_norm": 0.39427798986434937, + "learning_rate": 1.2816310240289876e-06, + "loss": 0.3594, + "step": 45088 + }, + { + "epoch": 0.837098948103207, + "grad_norm": 0.5428263545036316, + "learning_rate": 1.2810597405248893e-06, + "loss": 0.2698, + "step": 45090 + }, + { + "epoch": 0.8371360782406255, + "grad_norm": 0.49413853883743286, + "learning_rate": 1.280488575659452e-06, + "loss": 0.1077, + "step": 45092 + }, + { + "epoch": 0.8371732083780442, + "grad_norm": 0.5706779360771179, + "learning_rate": 1.2799175294404486e-06, + "loss": 0.4263, + "step": 45094 + }, + { + "epoch": 0.8372103385154629, + "grad_norm": 0.38936561346054077, + "learning_rate": 1.2793466018756473e-06, + "loss": 0.1909, + "step": 45096 + }, + { + "epoch": 0.8372474686528815, + "grad_norm": 0.5588836669921875, + "learning_rate": 1.2787757929728184e-06, + "loss": 0.2535, + "step": 45098 + }, + { + "epoch": 0.8372845987903001, + "grad_norm": 0.303112268447876, + "learning_rate": 1.2782051027397257e-06, + "loss": 0.1207, + "step": 45100 + }, + { + "epoch": 0.8373217289277187, + "grad_norm": 0.366486519575119, + "learning_rate": 1.2776345311841376e-06, + "loss": 0.1838, + "step": 45102 + }, + { + "epoch": 0.8373588590651374, + "grad_norm": 0.3416554927825928, + "learning_rate": 1.2770640783138155e-06, + "loss": 0.3471, + "step": 45104 + }, + { + "epoch": 0.8373959892025561, + "grad_norm": 0.36398664116859436, + "learning_rate": 1.2764937441365243e-06, + "loss": 0.3675, + "step": 45106 + }, + { + "epoch": 0.8374331193399747, + "grad_norm": 0.28080499172210693, + "learning_rate": 1.2759235286600258e-06, + "loss": 0.2433, + "step": 45108 + }, + { + "epoch": 0.8374702494773933, + "grad_norm": 0.4126347303390503, + "learning_rate": 1.2753534318920736e-06, + "loss": 0.2695, + "step": 45110 + }, + { + "epoch": 0.8375073796148119, + "grad_norm": 0.29457613825798035, + "learning_rate": 1.274783453840428e-06, + "loss": 0.3346, + "step": 45112 + }, + { + "epoch": 0.8375445097522306, + "grad_norm": 0.33102530241012573, + "learning_rate": 1.2742135945128441e-06, + "loss": 0.2748, + "step": 45114 + }, + { + "epoch": 0.8375816398896492, + "grad_norm": 0.3512861430644989, + "learning_rate": 1.2736438539170793e-06, + "loss": 0.3683, + "step": 45116 + }, + { + "epoch": 0.8376187700270679, + "grad_norm": 0.6879599094390869, + "learning_rate": 1.2730742320608801e-06, + "loss": 0.3493, + "step": 45118 + }, + { + "epoch": 0.8376559001644865, + "grad_norm": 0.38871631026268005, + "learning_rate": 1.272504728952003e-06, + "loss": 0.3884, + "step": 45120 + }, + { + "epoch": 0.8376930303019051, + "grad_norm": 0.29564133286476135, + "learning_rate": 1.271935344598193e-06, + "loss": 0.1928, + "step": 45122 + }, + { + "epoch": 0.8377301604393238, + "grad_norm": 0.3025103807449341, + "learning_rate": 1.271366079007199e-06, + "loss": 0.2815, + "step": 45124 + }, + { + "epoch": 0.8377672905767424, + "grad_norm": 0.41453662514686584, + "learning_rate": 1.2707969321867674e-06, + "loss": 0.2279, + "step": 45126 + }, + { + "epoch": 0.8378044207141611, + "grad_norm": 0.38855987787246704, + "learning_rate": 1.270227904144643e-06, + "loss": 0.189, + "step": 45128 + }, + { + "epoch": 0.8378415508515797, + "grad_norm": 0.2858559191226959, + "learning_rate": 1.2696589948885674e-06, + "loss": 0.299, + "step": 45130 + }, + { + "epoch": 0.8378786809889983, + "grad_norm": 0.3040689527988434, + "learning_rate": 1.2690902044262832e-06, + "loss": 0.4417, + "step": 45132 + }, + { + "epoch": 0.837915811126417, + "grad_norm": 0.442325621843338, + "learning_rate": 1.268521532765531e-06, + "loss": 0.3526, + "step": 45134 + }, + { + "epoch": 0.8379529412638356, + "grad_norm": 0.25379738211631775, + "learning_rate": 1.2679529799140445e-06, + "loss": 0.2031, + "step": 45136 + }, + { + "epoch": 0.8379900714012543, + "grad_norm": 0.6169052720069885, + "learning_rate": 1.2673845458795652e-06, + "loss": 0.3989, + "step": 45138 + }, + { + "epoch": 0.8380272015386729, + "grad_norm": 0.395910382270813, + "learning_rate": 1.2668162306698227e-06, + "loss": 0.2172, + "step": 45140 + }, + { + "epoch": 0.8380643316760915, + "grad_norm": 0.6024186611175537, + "learning_rate": 1.266248034292552e-06, + "loss": 0.46, + "step": 45142 + }, + { + "epoch": 0.8381014618135102, + "grad_norm": 0.3437790870666504, + "learning_rate": 1.2656799567554845e-06, + "loss": 0.403, + "step": 45144 + }, + { + "epoch": 0.8381385919509288, + "grad_norm": 0.2517353594303131, + "learning_rate": 1.2651119980663539e-06, + "loss": 0.1684, + "step": 45146 + }, + { + "epoch": 0.8381757220883475, + "grad_norm": 0.5301719307899475, + "learning_rate": 1.2645441582328822e-06, + "loss": 0.2446, + "step": 45148 + }, + { + "epoch": 0.8382128522257661, + "grad_norm": 0.7153987884521484, + "learning_rate": 1.2639764372627982e-06, + "loss": 0.2055, + "step": 45150 + }, + { + "epoch": 0.8382499823631847, + "grad_norm": 0.35205817222595215, + "learning_rate": 1.2634088351638285e-06, + "loss": 0.2134, + "step": 45152 + }, + { + "epoch": 0.8382871125006034, + "grad_norm": 0.47933775186538696, + "learning_rate": 1.2628413519436955e-06, + "loss": 0.2499, + "step": 45154 + }, + { + "epoch": 0.838324242638022, + "grad_norm": 0.39602696895599365, + "learning_rate": 1.2622739876101197e-06, + "loss": 0.2024, + "step": 45156 + }, + { + "epoch": 0.8383613727754406, + "grad_norm": 0.3977866768836975, + "learning_rate": 1.2617067421708252e-06, + "loss": 0.3727, + "step": 45158 + }, + { + "epoch": 0.8383985029128593, + "grad_norm": 0.4344384968280792, + "learning_rate": 1.2611396156335253e-06, + "loss": 0.5792, + "step": 45160 + }, + { + "epoch": 0.8384356330502779, + "grad_norm": 0.6845353841781616, + "learning_rate": 1.2605726080059421e-06, + "loss": 0.3728, + "step": 45162 + }, + { + "epoch": 0.8384727631876966, + "grad_norm": 0.7672502398490906, + "learning_rate": 1.260005719295786e-06, + "loss": 0.3233, + "step": 45164 + }, + { + "epoch": 0.8385098933251152, + "grad_norm": 0.4859590232372284, + "learning_rate": 1.2594389495107718e-06, + "loss": 0.3, + "step": 45166 + }, + { + "epoch": 0.8385470234625338, + "grad_norm": 0.8549558520317078, + "learning_rate": 1.2588722986586132e-06, + "loss": 0.1337, + "step": 45168 + }, + { + "epoch": 0.8385841535999524, + "grad_norm": 0.5360096096992493, + "learning_rate": 1.25830576674702e-06, + "loss": 0.3736, + "step": 45170 + }, + { + "epoch": 0.8386212837373711, + "grad_norm": 0.3445279598236084, + "learning_rate": 1.2577393537837024e-06, + "loss": 0.3243, + "step": 45172 + }, + { + "epoch": 0.8386584138747898, + "grad_norm": 0.5684767961502075, + "learning_rate": 1.2571730597763644e-06, + "loss": 0.4087, + "step": 45174 + }, + { + "epoch": 0.8386955440122084, + "grad_norm": 0.47289934754371643, + "learning_rate": 1.256606884732714e-06, + "loss": 0.21, + "step": 45176 + }, + { + "epoch": 0.838732674149627, + "grad_norm": 0.3160935938358307, + "learning_rate": 1.256040828660453e-06, + "loss": 0.206, + "step": 45178 + }, + { + "epoch": 0.8387698042870456, + "grad_norm": 0.4236598610877991, + "learning_rate": 1.2554748915672876e-06, + "loss": 0.3754, + "step": 45180 + }, + { + "epoch": 0.8388069344244643, + "grad_norm": 0.28959739208221436, + "learning_rate": 1.254909073460915e-06, + "loss": 0.1838, + "step": 45182 + }, + { + "epoch": 0.838844064561883, + "grad_norm": 0.35430288314819336, + "learning_rate": 1.2543433743490362e-06, + "loss": 0.2966, + "step": 45184 + }, + { + "epoch": 0.8388811946993016, + "grad_norm": 0.3100769817829132, + "learning_rate": 1.2537777942393458e-06, + "loss": 0.3051, + "step": 45186 + }, + { + "epoch": 0.8389183248367202, + "grad_norm": 0.48462754487991333, + "learning_rate": 1.2532123331395428e-06, + "loss": 0.1254, + "step": 45188 + }, + { + "epoch": 0.8389554549741388, + "grad_norm": 0.4009462594985962, + "learning_rate": 1.2526469910573191e-06, + "loss": 0.2094, + "step": 45190 + }, + { + "epoch": 0.8389925851115575, + "grad_norm": 0.2622302770614624, + "learning_rate": 1.2520817680003682e-06, + "loss": 0.1846, + "step": 45192 + }, + { + "epoch": 0.8390297152489762, + "grad_norm": 0.5035792589187622, + "learning_rate": 1.2515166639763831e-06, + "loss": 0.3289, + "step": 45194 + }, + { + "epoch": 0.8390668453863948, + "grad_norm": 0.2800925672054291, + "learning_rate": 1.2509516789930521e-06, + "loss": 0.4425, + "step": 45196 + }, + { + "epoch": 0.8391039755238134, + "grad_norm": 0.360747754573822, + "learning_rate": 1.250386813058061e-06, + "loss": 0.3929, + "step": 45198 + }, + { + "epoch": 0.839141105661232, + "grad_norm": 0.2983044385910034, + "learning_rate": 1.249822066179096e-06, + "loss": 0.3141, + "step": 45200 + }, + { + "epoch": 0.8391782357986507, + "grad_norm": 0.3961217999458313, + "learning_rate": 1.2492574383638466e-06, + "loss": 0.2449, + "step": 45202 + }, + { + "epoch": 0.8392153659360694, + "grad_norm": 0.6232540011405945, + "learning_rate": 1.248692929619989e-06, + "loss": 0.4143, + "step": 45204 + }, + { + "epoch": 0.839252496073488, + "grad_norm": 0.3401852548122406, + "learning_rate": 1.248128539955208e-06, + "loss": 0.3741, + "step": 45206 + }, + { + "epoch": 0.8392896262109066, + "grad_norm": 0.53154057264328, + "learning_rate": 1.2475642693771817e-06, + "loss": 0.2759, + "step": 45208 + }, + { + "epoch": 0.8393267563483252, + "grad_norm": 0.24535687267780304, + "learning_rate": 1.247000117893592e-06, + "loss": 0.3862, + "step": 45210 + }, + { + "epoch": 0.8393638864857439, + "grad_norm": 0.39021027088165283, + "learning_rate": 1.2464360855121093e-06, + "loss": 0.442, + "step": 45212 + }, + { + "epoch": 0.8394010166231626, + "grad_norm": 0.16382379829883575, + "learning_rate": 1.2458721722404122e-06, + "loss": 0.2771, + "step": 45214 + }, + { + "epoch": 0.8394381467605811, + "grad_norm": 0.33260294795036316, + "learning_rate": 1.245308378086174e-06, + "loss": 0.2285, + "step": 45216 + }, + { + "epoch": 0.8394752768979998, + "grad_norm": 0.37016791105270386, + "learning_rate": 1.2447447030570648e-06, + "loss": 0.3945, + "step": 45218 + }, + { + "epoch": 0.8395124070354184, + "grad_norm": 0.5065252780914307, + "learning_rate": 1.2441811471607546e-06, + "loss": 0.2284, + "step": 45220 + }, + { + "epoch": 0.8395495371728371, + "grad_norm": 0.3292807936668396, + "learning_rate": 1.2436177104049151e-06, + "loss": 0.2048, + "step": 45222 + }, + { + "epoch": 0.8395866673102557, + "grad_norm": 0.48583173751831055, + "learning_rate": 1.2430543927972094e-06, + "loss": 0.0893, + "step": 45224 + }, + { + "epoch": 0.8396237974476743, + "grad_norm": 0.34939268231391907, + "learning_rate": 1.2424911943453023e-06, + "loss": 0.2496, + "step": 45226 + }, + { + "epoch": 0.839660927585093, + "grad_norm": 0.33939021825790405, + "learning_rate": 1.2419281150568575e-06, + "loss": 0.2534, + "step": 45228 + }, + { + "epoch": 0.8396980577225116, + "grad_norm": 0.2972109019756317, + "learning_rate": 1.2413651549395377e-06, + "loss": 0.1642, + "step": 45230 + }, + { + "epoch": 0.8397351878599303, + "grad_norm": 0.37945112586021423, + "learning_rate": 1.2408023140010029e-06, + "loss": 0.3842, + "step": 45232 + }, + { + "epoch": 0.8397723179973489, + "grad_norm": 0.4863608181476593, + "learning_rate": 1.240239592248914e-06, + "loss": 0.4406, + "step": 45234 + }, + { + "epoch": 0.8398094481347675, + "grad_norm": 0.3947312533855438, + "learning_rate": 1.2396769896909233e-06, + "loss": 0.2576, + "step": 45236 + }, + { + "epoch": 0.8398465782721862, + "grad_norm": 0.5981470346450806, + "learning_rate": 1.2391145063346888e-06, + "loss": 0.3424, + "step": 45238 + }, + { + "epoch": 0.8398837084096048, + "grad_norm": 0.3994324803352356, + "learning_rate": 1.2385521421878644e-06, + "loss": 0.3285, + "step": 45240 + }, + { + "epoch": 0.8399208385470235, + "grad_norm": 0.3641209602355957, + "learning_rate": 1.2379898972581017e-06, + "loss": 0.2056, + "step": 45242 + }, + { + "epoch": 0.8399579686844421, + "grad_norm": 0.5160177946090698, + "learning_rate": 1.2374277715530537e-06, + "loss": 0.333, + "step": 45244 + }, + { + "epoch": 0.8399950988218607, + "grad_norm": 0.6410272717475891, + "learning_rate": 1.236865765080364e-06, + "loss": 0.2987, + "step": 45246 + }, + { + "epoch": 0.8400322289592794, + "grad_norm": 0.21218463778495789, + "learning_rate": 1.2363038778476844e-06, + "loss": 0.2423, + "step": 45248 + }, + { + "epoch": 0.840069359096698, + "grad_norm": 0.3017255365848541, + "learning_rate": 1.2357421098626565e-06, + "loss": 0.2186, + "step": 45250 + }, + { + "epoch": 0.8401064892341167, + "grad_norm": 0.38248953223228455, + "learning_rate": 1.2351804611329276e-06, + "loss": 0.3396, + "step": 45252 + }, + { + "epoch": 0.8401436193715353, + "grad_norm": 0.3236926198005676, + "learning_rate": 1.2346189316661384e-06, + "loss": 0.2366, + "step": 45254 + }, + { + "epoch": 0.8401807495089539, + "grad_norm": 0.36892765760421753, + "learning_rate": 1.2340575214699302e-06, + "loss": 0.1886, + "step": 45256 + }, + { + "epoch": 0.8402178796463726, + "grad_norm": 0.3523741662502289, + "learning_rate": 1.2334962305519415e-06, + "loss": 0.2918, + "step": 45258 + }, + { + "epoch": 0.8402550097837912, + "grad_norm": 0.5126590728759766, + "learning_rate": 1.232935058919813e-06, + "loss": 0.3094, + "step": 45260 + }, + { + "epoch": 0.8402921399212099, + "grad_norm": 0.29195207357406616, + "learning_rate": 1.2323740065811762e-06, + "loss": 0.5404, + "step": 45262 + }, + { + "epoch": 0.8403292700586285, + "grad_norm": 0.35124409198760986, + "learning_rate": 1.2318130735436673e-06, + "loss": 0.2936, + "step": 45264 + }, + { + "epoch": 0.8403664001960471, + "grad_norm": 0.40031588077545166, + "learning_rate": 1.2312522598149201e-06, + "loss": 0.3257, + "step": 45266 + }, + { + "epoch": 0.8404035303334657, + "grad_norm": 0.36197513341903687, + "learning_rate": 1.2306915654025619e-06, + "loss": 0.2938, + "step": 45268 + }, + { + "epoch": 0.8404406604708844, + "grad_norm": 0.5545368194580078, + "learning_rate": 1.2301309903142245e-06, + "loss": 0.3459, + "step": 45270 + }, + { + "epoch": 0.8404777906083031, + "grad_norm": 0.34288302063941956, + "learning_rate": 1.2295705345575382e-06, + "loss": 0.2677, + "step": 45272 + }, + { + "epoch": 0.8405149207457216, + "grad_norm": 0.44922173023223877, + "learning_rate": 1.2290101981401238e-06, + "loss": 0.3151, + "step": 45274 + }, + { + "epoch": 0.8405520508831403, + "grad_norm": 0.43209108710289, + "learning_rate": 1.2284499810696093e-06, + "loss": 0.2092, + "step": 45276 + }, + { + "epoch": 0.8405891810205589, + "grad_norm": 0.3646174669265747, + "learning_rate": 1.2278898833536157e-06, + "loss": 0.2266, + "step": 45278 + }, + { + "epoch": 0.8406263111579776, + "grad_norm": 0.31939801573753357, + "learning_rate": 1.2273299049997656e-06, + "loss": 0.1111, + "step": 45280 + }, + { + "epoch": 0.8406634412953963, + "grad_norm": 0.3142664134502411, + "learning_rate": 1.2267700460156784e-06, + "loss": 0.2619, + "step": 45282 + }, + { + "epoch": 0.8407005714328148, + "grad_norm": 0.461139053106308, + "learning_rate": 1.2262103064089737e-06, + "loss": 0.1923, + "step": 45284 + }, + { + "epoch": 0.8407377015702335, + "grad_norm": 0.4799596071243286, + "learning_rate": 1.2256506861872652e-06, + "loss": 0.2422, + "step": 45286 + }, + { + "epoch": 0.8407748317076521, + "grad_norm": 0.30488982796669006, + "learning_rate": 1.2250911853581692e-06, + "loss": 0.2267, + "step": 45288 + }, + { + "epoch": 0.8408119618450708, + "grad_norm": 0.44499513506889343, + "learning_rate": 1.2245318039292976e-06, + "loss": 0.2358, + "step": 45290 + }, + { + "epoch": 0.8408490919824895, + "grad_norm": 0.6676802039146423, + "learning_rate": 1.2239725419082614e-06, + "loss": 0.2242, + "step": 45292 + }, + { + "epoch": 0.840886222119908, + "grad_norm": 0.27867016196250916, + "learning_rate": 1.223413399302673e-06, + "loss": 0.1809, + "step": 45294 + }, + { + "epoch": 0.8409233522573267, + "grad_norm": 0.3164690434932709, + "learning_rate": 1.2228543761201383e-06, + "loss": 0.3284, + "step": 45296 + }, + { + "epoch": 0.8409604823947453, + "grad_norm": 0.24337244033813477, + "learning_rate": 1.2222954723682667e-06, + "loss": 0.2519, + "step": 45298 + }, + { + "epoch": 0.840997612532164, + "grad_norm": 0.33757564425468445, + "learning_rate": 1.2217366880546599e-06, + "loss": 0.2743, + "step": 45300 + }, + { + "epoch": 0.8410347426695827, + "grad_norm": 0.21648545563220978, + "learning_rate": 1.221178023186923e-06, + "loss": 0.2054, + "step": 45302 + }, + { + "epoch": 0.8410718728070012, + "grad_norm": 0.3650580942630768, + "learning_rate": 1.2206194777726576e-06, + "loss": 0.1946, + "step": 45304 + }, + { + "epoch": 0.8411090029444199, + "grad_norm": 0.4334535002708435, + "learning_rate": 1.2200610518194644e-06, + "loss": 0.3808, + "step": 45306 + }, + { + "epoch": 0.8411461330818385, + "grad_norm": 0.27494171261787415, + "learning_rate": 1.219502745334943e-06, + "loss": 0.3471, + "step": 45308 + }, + { + "epoch": 0.8411832632192572, + "grad_norm": 0.5048553943634033, + "learning_rate": 1.2189445583266878e-06, + "loss": 0.1819, + "step": 45310 + }, + { + "epoch": 0.8412203933566759, + "grad_norm": 0.4492442011833191, + "learning_rate": 1.2183864908022935e-06, + "loss": 0.2393, + "step": 45312 + }, + { + "epoch": 0.8412575234940944, + "grad_norm": 0.3076966106891632, + "learning_rate": 1.2178285427693547e-06, + "loss": 0.4657, + "step": 45314 + }, + { + "epoch": 0.8412946536315131, + "grad_norm": 0.3456513583660126, + "learning_rate": 1.2172707142354633e-06, + "loss": 0.4455, + "step": 45316 + }, + { + "epoch": 0.8413317837689317, + "grad_norm": 0.38558804988861084, + "learning_rate": 1.216713005208211e-06, + "loss": 0.4214, + "step": 45318 + }, + { + "epoch": 0.8413689139063504, + "grad_norm": 0.2730877101421356, + "learning_rate": 1.216155415695186e-06, + "loss": 0.2416, + "step": 45320 + }, + { + "epoch": 0.841406044043769, + "grad_norm": 0.39779239892959595, + "learning_rate": 1.2155979457039768e-06, + "loss": 0.1751, + "step": 45322 + }, + { + "epoch": 0.8414431741811876, + "grad_norm": 0.4569144546985626, + "learning_rate": 1.215040595242165e-06, + "loss": 0.4193, + "step": 45324 + }, + { + "epoch": 0.8414803043186063, + "grad_norm": 0.4729786217212677, + "learning_rate": 1.214483364317336e-06, + "loss": 0.235, + "step": 45326 + }, + { + "epoch": 0.8415174344560249, + "grad_norm": 0.23023664951324463, + "learning_rate": 1.2139262529370743e-06, + "loss": 0.2322, + "step": 45328 + }, + { + "epoch": 0.8415545645934436, + "grad_norm": 0.449933797121048, + "learning_rate": 1.2133692611089599e-06, + "loss": 0.4932, + "step": 45330 + }, + { + "epoch": 0.8415916947308621, + "grad_norm": 0.4338637888431549, + "learning_rate": 1.2128123888405685e-06, + "loss": 0.2744, + "step": 45332 + }, + { + "epoch": 0.8416288248682808, + "grad_norm": 0.3076188862323761, + "learning_rate": 1.2122556361394812e-06, + "loss": 0.2273, + "step": 45334 + }, + { + "epoch": 0.8416659550056995, + "grad_norm": 0.47620299458503723, + "learning_rate": 1.2116990030132735e-06, + "loss": 0.1985, + "step": 45336 + }, + { + "epoch": 0.8417030851431181, + "grad_norm": 0.38767027854919434, + "learning_rate": 1.2111424894695167e-06, + "loss": 0.1874, + "step": 45338 + }, + { + "epoch": 0.8417402152805368, + "grad_norm": 0.6446080207824707, + "learning_rate": 1.2105860955157844e-06, + "loss": 0.283, + "step": 45340 + }, + { + "epoch": 0.8417773454179553, + "grad_norm": 0.41974738240242004, + "learning_rate": 1.2100298211596485e-06, + "loss": 0.2151, + "step": 45342 + }, + { + "epoch": 0.841814475555374, + "grad_norm": 0.4539782404899597, + "learning_rate": 1.209473666408677e-06, + "loss": 0.1631, + "step": 45344 + }, + { + "epoch": 0.8418516056927927, + "grad_norm": 0.42491239309310913, + "learning_rate": 1.2089176312704387e-06, + "loss": 0.3079, + "step": 45346 + }, + { + "epoch": 0.8418887358302113, + "grad_norm": 0.2321523129940033, + "learning_rate": 1.2083617157525018e-06, + "loss": 0.2417, + "step": 45348 + }, + { + "epoch": 0.84192586596763, + "grad_norm": 0.3383848965167999, + "learning_rate": 1.2078059198624249e-06, + "loss": 0.1232, + "step": 45350 + }, + { + "epoch": 0.8419629961050485, + "grad_norm": 0.3025696277618408, + "learning_rate": 1.2072502436077759e-06, + "loss": 0.2221, + "step": 45352 + }, + { + "epoch": 0.8420001262424672, + "grad_norm": 0.3008034825325012, + "learning_rate": 1.2066946869961127e-06, + "loss": 0.3954, + "step": 45354 + }, + { + "epoch": 0.8420372563798859, + "grad_norm": 0.7164477705955505, + "learning_rate": 1.2061392500349944e-06, + "loss": 0.2334, + "step": 45356 + }, + { + "epoch": 0.8420743865173045, + "grad_norm": 0.4537184536457062, + "learning_rate": 1.2055839327319808e-06, + "loss": 0.2691, + "step": 45358 + }, + { + "epoch": 0.8421115166547232, + "grad_norm": 0.3986304998397827, + "learning_rate": 1.20502873509463e-06, + "loss": 0.2501, + "step": 45360 + }, + { + "epoch": 0.8421486467921417, + "grad_norm": 0.4589011073112488, + "learning_rate": 1.2044736571304915e-06, + "loss": 0.2088, + "step": 45362 + }, + { + "epoch": 0.8421857769295604, + "grad_norm": 0.4428768455982208, + "learning_rate": 1.2039186988471218e-06, + "loss": 0.164, + "step": 45364 + }, + { + "epoch": 0.8422229070669791, + "grad_norm": 0.5436967015266418, + "learning_rate": 1.2033638602520702e-06, + "loss": 0.5053, + "step": 45366 + }, + { + "epoch": 0.8422600372043977, + "grad_norm": 0.4356236755847931, + "learning_rate": 1.2028091413528887e-06, + "loss": 0.5368, + "step": 45368 + }, + { + "epoch": 0.8422971673418164, + "grad_norm": 0.39481520652770996, + "learning_rate": 1.2022545421571252e-06, + "loss": 0.3726, + "step": 45370 + }, + { + "epoch": 0.8423342974792349, + "grad_norm": 0.2691798508167267, + "learning_rate": 1.201700062672323e-06, + "loss": 0.1208, + "step": 45372 + }, + { + "epoch": 0.8423714276166536, + "grad_norm": 0.40969720482826233, + "learning_rate": 1.2011457029060313e-06, + "loss": 0.3441, + "step": 45374 + }, + { + "epoch": 0.8424085577540722, + "grad_norm": 0.3877717852592468, + "learning_rate": 1.200591462865789e-06, + "loss": 0.3389, + "step": 45376 + }, + { + "epoch": 0.8424456878914909, + "grad_norm": 0.388508677482605, + "learning_rate": 1.2000373425591395e-06, + "loss": 0.3556, + "step": 45378 + }, + { + "epoch": 0.8424828180289096, + "grad_norm": 0.37963294982910156, + "learning_rate": 1.1994833419936225e-06, + "loss": 0.2176, + "step": 45380 + }, + { + "epoch": 0.8425199481663281, + "grad_norm": 0.5552801489830017, + "learning_rate": 1.1989294611767776e-06, + "loss": 0.3159, + "step": 45382 + }, + { + "epoch": 0.8425570783037468, + "grad_norm": 0.41281190514564514, + "learning_rate": 1.19837570011614e-06, + "loss": 0.1046, + "step": 45384 + }, + { + "epoch": 0.8425942084411654, + "grad_norm": 0.3915144205093384, + "learning_rate": 1.1978220588192468e-06, + "loss": 0.2836, + "step": 45386 + }, + { + "epoch": 0.8426313385785841, + "grad_norm": 0.532941460609436, + "learning_rate": 1.1972685372936276e-06, + "loss": 0.3094, + "step": 45388 + }, + { + "epoch": 0.8426684687160028, + "grad_norm": 0.4412866532802582, + "learning_rate": 1.1967151355468176e-06, + "loss": 0.3807, + "step": 45390 + }, + { + "epoch": 0.8427055988534213, + "grad_norm": 0.37104952335357666, + "learning_rate": 1.1961618535863472e-06, + "loss": 0.2102, + "step": 45392 + }, + { + "epoch": 0.84274272899084, + "grad_norm": 0.22386036813259125, + "learning_rate": 1.1956086914197407e-06, + "loss": 0.1633, + "step": 45394 + }, + { + "epoch": 0.8427798591282586, + "grad_norm": 0.3792182207107544, + "learning_rate": 1.1950556490545283e-06, + "loss": 0.296, + "step": 45396 + }, + { + "epoch": 0.8428169892656773, + "grad_norm": 0.25072771310806274, + "learning_rate": 1.1945027264982367e-06, + "loss": 0.318, + "step": 45398 + }, + { + "epoch": 0.842854119403096, + "grad_norm": 0.4799729883670807, + "learning_rate": 1.193949923758385e-06, + "loss": 0.3728, + "step": 45400 + }, + { + "epoch": 0.8428912495405145, + "grad_norm": 2.328273057937622, + "learning_rate": 1.1933972408424988e-06, + "loss": 0.2725, + "step": 45402 + }, + { + "epoch": 0.8429283796779332, + "grad_norm": 0.33754175901412964, + "learning_rate": 1.192844677758096e-06, + "loss": 0.3185, + "step": 45404 + }, + { + "epoch": 0.8429655098153518, + "grad_norm": 0.4751463532447815, + "learning_rate": 1.1922922345126974e-06, + "loss": 0.1721, + "step": 45406 + }, + { + "epoch": 0.8430026399527705, + "grad_norm": 0.3931720554828644, + "learning_rate": 1.1917399111138184e-06, + "loss": 0.2646, + "step": 45408 + }, + { + "epoch": 0.8430397700901892, + "grad_norm": 0.3468567728996277, + "learning_rate": 1.1911877075689792e-06, + "loss": 0.2787, + "step": 45410 + }, + { + "epoch": 0.8430769002276077, + "grad_norm": 0.5384902954101562, + "learning_rate": 1.1906356238856865e-06, + "loss": 0.244, + "step": 45412 + }, + { + "epoch": 0.8431140303650264, + "grad_norm": 0.3300603926181793, + "learning_rate": 1.190083660071456e-06, + "loss": 0.2345, + "step": 45414 + }, + { + "epoch": 0.843151160502445, + "grad_norm": 0.27469688653945923, + "learning_rate": 1.1895318161337999e-06, + "loss": 0.1956, + "step": 45416 + }, + { + "epoch": 0.8431882906398637, + "grad_norm": 0.4329138994216919, + "learning_rate": 1.1889800920802241e-06, + "loss": 0.2235, + "step": 45418 + }, + { + "epoch": 0.8432254207772822, + "grad_norm": 0.41585421562194824, + "learning_rate": 1.1884284879182373e-06, + "loss": 0.1964, + "step": 45420 + }, + { + "epoch": 0.8432625509147009, + "grad_norm": 0.4802803099155426, + "learning_rate": 1.1878770036553445e-06, + "loss": 0.2146, + "step": 45422 + }, + { + "epoch": 0.8432996810521196, + "grad_norm": 0.5831377506256104, + "learning_rate": 1.1873256392990517e-06, + "loss": 0.2558, + "step": 45424 + }, + { + "epoch": 0.8433368111895382, + "grad_norm": 0.48825928568840027, + "learning_rate": 1.1867743948568588e-06, + "loss": 0.2005, + "step": 45426 + }, + { + "epoch": 0.8433739413269569, + "grad_norm": 0.6146504878997803, + "learning_rate": 1.1862232703362675e-06, + "loss": 0.4772, + "step": 45428 + }, + { + "epoch": 0.8434110714643754, + "grad_norm": 0.2638562023639679, + "learning_rate": 1.1856722657447772e-06, + "loss": 0.373, + "step": 45430 + }, + { + "epoch": 0.8434482016017941, + "grad_norm": 0.7274913191795349, + "learning_rate": 1.1851213810898855e-06, + "loss": 0.2684, + "step": 45432 + }, + { + "epoch": 0.8434853317392128, + "grad_norm": 0.46377524733543396, + "learning_rate": 1.1845706163790904e-06, + "loss": 0.2082, + "step": 45434 + }, + { + "epoch": 0.8435224618766314, + "grad_norm": 0.3601382076740265, + "learning_rate": 1.184019971619882e-06, + "loss": 0.1814, + "step": 45436 + }, + { + "epoch": 0.8435595920140501, + "grad_norm": 0.3286453187465668, + "learning_rate": 1.183469446819756e-06, + "loss": 0.3388, + "step": 45438 + }, + { + "epoch": 0.8435967221514686, + "grad_norm": 0.261030375957489, + "learning_rate": 1.1829190419862014e-06, + "loss": 0.2928, + "step": 45440 + }, + { + "epoch": 0.8436338522888873, + "grad_norm": 0.3657304346561432, + "learning_rate": 1.1823687571267072e-06, + "loss": 0.2001, + "step": 45442 + }, + { + "epoch": 0.843670982426306, + "grad_norm": 0.32285332679748535, + "learning_rate": 1.1818185922487636e-06, + "loss": 0.1254, + "step": 45444 + }, + { + "epoch": 0.8437081125637246, + "grad_norm": 0.5104774236679077, + "learning_rate": 1.1812685473598551e-06, + "loss": 0.4094, + "step": 45446 + }, + { + "epoch": 0.8437452427011433, + "grad_norm": 0.451045960187912, + "learning_rate": 1.1807186224674684e-06, + "loss": 0.2133, + "step": 45448 + }, + { + "epoch": 0.8437823728385618, + "grad_norm": 0.4571956694126129, + "learning_rate": 1.1801688175790815e-06, + "loss": 0.3241, + "step": 45450 + }, + { + "epoch": 0.8438195029759805, + "grad_norm": 0.5230547785758972, + "learning_rate": 1.17961913270218e-06, + "loss": 0.3487, + "step": 45452 + }, + { + "epoch": 0.8438566331133992, + "grad_norm": 0.258680522441864, + "learning_rate": 1.179069567844241e-06, + "loss": 0.4542, + "step": 45454 + }, + { + "epoch": 0.8438937632508178, + "grad_norm": 0.41075125336647034, + "learning_rate": 1.1785201230127462e-06, + "loss": 0.2004, + "step": 45456 + }, + { + "epoch": 0.8439308933882365, + "grad_norm": 0.30592939257621765, + "learning_rate": 1.1779707982151666e-06, + "loss": 0.1041, + "step": 45458 + }, + { + "epoch": 0.843968023525655, + "grad_norm": 0.23186159133911133, + "learning_rate": 1.1774215934589794e-06, + "loss": 0.1223, + "step": 45460 + }, + { + "epoch": 0.8440051536630737, + "grad_norm": 0.41624850034713745, + "learning_rate": 1.1768725087516597e-06, + "loss": 0.303, + "step": 45462 + }, + { + "epoch": 0.8440422838004924, + "grad_norm": 0.39758405089378357, + "learning_rate": 1.1763235441006749e-06, + "loss": 0.2337, + "step": 45464 + }, + { + "epoch": 0.844079413937911, + "grad_norm": 0.328519344329834, + "learning_rate": 1.1757746995134966e-06, + "loss": 0.2406, + "step": 45466 + }, + { + "epoch": 0.8441165440753297, + "grad_norm": 0.24247892200946808, + "learning_rate": 1.1752259749975924e-06, + "loss": 0.1872, + "step": 45468 + }, + { + "epoch": 0.8441536742127482, + "grad_norm": 0.483316570520401, + "learning_rate": 1.174677370560431e-06, + "loss": 0.3486, + "step": 45470 + }, + { + "epoch": 0.8441908043501669, + "grad_norm": 0.42939358949661255, + "learning_rate": 1.1741288862094747e-06, + "loss": 0.3496, + "step": 45472 + }, + { + "epoch": 0.8442279344875855, + "grad_norm": 0.5680773854255676, + "learning_rate": 1.17358052195219e-06, + "loss": 0.4171, + "step": 45474 + }, + { + "epoch": 0.8442650646250042, + "grad_norm": 0.2503086030483246, + "learning_rate": 1.1730322777960334e-06, + "loss": 0.1558, + "step": 45476 + }, + { + "epoch": 0.8443021947624229, + "grad_norm": 0.42405831813812256, + "learning_rate": 1.172484153748471e-06, + "loss": 0.3673, + "step": 45478 + }, + { + "epoch": 0.8443393248998414, + "grad_norm": 0.29770106077194214, + "learning_rate": 1.1719361498169545e-06, + "loss": 0.3578, + "step": 45480 + }, + { + "epoch": 0.8443764550372601, + "grad_norm": 0.6463533043861389, + "learning_rate": 1.1713882660089448e-06, + "loss": 0.1636, + "step": 45482 + }, + { + "epoch": 0.8444135851746787, + "grad_norm": 0.4354647696018219, + "learning_rate": 1.170840502331896e-06, + "loss": 0.2478, + "step": 45484 + }, + { + "epoch": 0.8444507153120974, + "grad_norm": 0.4315684735774994, + "learning_rate": 1.1702928587932626e-06, + "loss": 0.3181, + "step": 45486 + }, + { + "epoch": 0.844487845449516, + "grad_norm": 0.46276187896728516, + "learning_rate": 1.169745335400494e-06, + "loss": 0.3846, + "step": 45488 + }, + { + "epoch": 0.8445249755869346, + "grad_norm": 0.4298904836177826, + "learning_rate": 1.1691979321610414e-06, + "loss": 0.2277, + "step": 45490 + }, + { + "epoch": 0.8445621057243533, + "grad_norm": 0.2679091989994049, + "learning_rate": 1.1686506490823546e-06, + "loss": 0.343, + "step": 45492 + }, + { + "epoch": 0.8445992358617719, + "grad_norm": 0.6608708500862122, + "learning_rate": 1.1681034861718787e-06, + "loss": 0.2582, + "step": 45494 + }, + { + "epoch": 0.8446363659991906, + "grad_norm": 0.3383752107620239, + "learning_rate": 1.16755644343706e-06, + "loss": 0.2619, + "step": 45496 + }, + { + "epoch": 0.8446734961366092, + "grad_norm": 0.16610772907733917, + "learning_rate": 1.1670095208853438e-06, + "loss": 0.1382, + "step": 45498 + }, + { + "epoch": 0.8447106262740278, + "grad_norm": 0.33105698227882385, + "learning_rate": 1.1664627185241705e-06, + "loss": 0.2885, + "step": 45500 + }, + { + "epoch": 0.8447477564114465, + "grad_norm": 0.4174129366874695, + "learning_rate": 1.1659160363609778e-06, + "loss": 0.1138, + "step": 45502 + }, + { + "epoch": 0.8447848865488651, + "grad_norm": 0.4087831974029541, + "learning_rate": 1.1653694744032062e-06, + "loss": 0.2137, + "step": 45504 + }, + { + "epoch": 0.8448220166862838, + "grad_norm": 0.35414040088653564, + "learning_rate": 1.1648230326582943e-06, + "loss": 0.3037, + "step": 45506 + }, + { + "epoch": 0.8448591468237024, + "grad_norm": 0.42537495493888855, + "learning_rate": 1.164276711133675e-06, + "loss": 0.1782, + "step": 45508 + }, + { + "epoch": 0.844896276961121, + "grad_norm": 0.22932176291942596, + "learning_rate": 1.1637305098367847e-06, + "loss": 0.243, + "step": 45510 + }, + { + "epoch": 0.8449334070985397, + "grad_norm": 0.2516622245311737, + "learning_rate": 1.1631844287750572e-06, + "loss": 0.2511, + "step": 45512 + }, + { + "epoch": 0.8449705372359583, + "grad_norm": 0.6327522397041321, + "learning_rate": 1.1626384679559167e-06, + "loss": 0.5104, + "step": 45514 + }, + { + "epoch": 0.845007667373377, + "grad_norm": 0.3257109224796295, + "learning_rate": 1.1620926273867973e-06, + "loss": 0.2925, + "step": 45516 + }, + { + "epoch": 0.8450447975107956, + "grad_norm": 0.35986414551734924, + "learning_rate": 1.161546907075123e-06, + "loss": 0.3441, + "step": 45518 + }, + { + "epoch": 0.8450819276482142, + "grad_norm": 0.4351577162742615, + "learning_rate": 1.1610013070283243e-06, + "loss": 0.3029, + "step": 45520 + }, + { + "epoch": 0.8451190577856329, + "grad_norm": 0.2859640419483185, + "learning_rate": 1.160455827253819e-06, + "loss": 0.1099, + "step": 45522 + }, + { + "epoch": 0.8451561879230515, + "grad_norm": 0.33043181896209717, + "learning_rate": 1.1599104677590356e-06, + "loss": 0.2182, + "step": 45524 + }, + { + "epoch": 0.8451933180604702, + "grad_norm": 0.6442746520042419, + "learning_rate": 1.1593652285513879e-06, + "loss": 0.3305, + "step": 45526 + }, + { + "epoch": 0.8452304481978887, + "grad_norm": 0.3886159062385559, + "learning_rate": 1.1588201096383001e-06, + "loss": 0.2982, + "step": 45528 + }, + { + "epoch": 0.8452675783353074, + "grad_norm": 0.38991186022758484, + "learning_rate": 1.1582751110271872e-06, + "loss": 0.2858, + "step": 45530 + }, + { + "epoch": 0.8453047084727261, + "grad_norm": 0.3442493975162506, + "learning_rate": 1.157730232725467e-06, + "loss": 0.2996, + "step": 45532 + }, + { + "epoch": 0.8453418386101447, + "grad_norm": 0.9192871451377869, + "learning_rate": 1.157185474740552e-06, + "loss": 0.3112, + "step": 45534 + }, + { + "epoch": 0.8453789687475634, + "grad_norm": 0.3962438106536865, + "learning_rate": 1.1566408370798554e-06, + "loss": 0.238, + "step": 45536 + }, + { + "epoch": 0.8454160988849819, + "grad_norm": 0.3688827157020569, + "learning_rate": 1.1560963197507902e-06, + "loss": 0.3575, + "step": 45538 + }, + { + "epoch": 0.8454532290224006, + "grad_norm": 0.23311764001846313, + "learning_rate": 1.1555519227607625e-06, + "loss": 0.1733, + "step": 45540 + }, + { + "epoch": 0.8454903591598193, + "grad_norm": 0.3592592179775238, + "learning_rate": 1.155007646117182e-06, + "loss": 0.3836, + "step": 45542 + }, + { + "epoch": 0.8455274892972379, + "grad_norm": 0.35091525316238403, + "learning_rate": 1.154463489827451e-06, + "loss": 0.2495, + "step": 45544 + }, + { + "epoch": 0.8455646194346566, + "grad_norm": 0.5616759657859802, + "learning_rate": 1.1539194538989773e-06, + "loss": 0.4544, + "step": 45546 + }, + { + "epoch": 0.8456017495720751, + "grad_norm": 0.34819141030311584, + "learning_rate": 1.1533755383391633e-06, + "loss": 0.2382, + "step": 45548 + }, + { + "epoch": 0.8456388797094938, + "grad_norm": 0.32599616050720215, + "learning_rate": 1.1528317431554103e-06, + "loss": 0.2164, + "step": 45550 + }, + { + "epoch": 0.8456760098469125, + "grad_norm": 0.41436079144477844, + "learning_rate": 1.1522880683551153e-06, + "loss": 0.1302, + "step": 45552 + }, + { + "epoch": 0.8457131399843311, + "grad_norm": 0.5267750024795532, + "learning_rate": 1.1517445139456785e-06, + "loss": 0.2319, + "step": 45554 + }, + { + "epoch": 0.8457502701217497, + "grad_norm": 0.2984572947025299, + "learning_rate": 1.151201079934494e-06, + "loss": 0.2153, + "step": 45556 + }, + { + "epoch": 0.8457874002591683, + "grad_norm": 0.31812795996665955, + "learning_rate": 1.1506577663289587e-06, + "loss": 0.3158, + "step": 45558 + }, + { + "epoch": 0.845824530396587, + "grad_norm": 0.5453408360481262, + "learning_rate": 1.1501145731364649e-06, + "loss": 0.4759, + "step": 45560 + }, + { + "epoch": 0.8458616605340057, + "grad_norm": 0.5357164144515991, + "learning_rate": 1.1495715003644026e-06, + "loss": 0.2113, + "step": 45562 + }, + { + "epoch": 0.8458987906714243, + "grad_norm": 0.3437976539134979, + "learning_rate": 1.1490285480201636e-06, + "loss": 0.3269, + "step": 45564 + }, + { + "epoch": 0.845935920808843, + "grad_norm": 0.3676532506942749, + "learning_rate": 1.148485716111133e-06, + "loss": 0.2621, + "step": 45566 + }, + { + "epoch": 0.8459730509462615, + "grad_norm": 0.29599782824516296, + "learning_rate": 1.1479430046446981e-06, + "loss": 0.1774, + "step": 45568 + }, + { + "epoch": 0.8460101810836802, + "grad_norm": 0.25979599356651306, + "learning_rate": 1.1474004136282434e-06, + "loss": 0.2226, + "step": 45570 + }, + { + "epoch": 0.8460473112210988, + "grad_norm": 0.4676661491394043, + "learning_rate": 1.1468579430691528e-06, + "loss": 0.352, + "step": 45572 + }, + { + "epoch": 0.8460844413585175, + "grad_norm": 0.4226343333721161, + "learning_rate": 1.1463155929748093e-06, + "loss": 0.3163, + "step": 45574 + }, + { + "epoch": 0.8461215714959361, + "grad_norm": 0.43027463555336, + "learning_rate": 1.145773363352589e-06, + "loss": 0.2832, + "step": 45576 + }, + { + "epoch": 0.8461587016333547, + "grad_norm": 0.3799506723880768, + "learning_rate": 1.145231254209871e-06, + "loss": 0.2021, + "step": 45578 + }, + { + "epoch": 0.8461958317707734, + "grad_norm": 0.42559731006622314, + "learning_rate": 1.1446892655540332e-06, + "loss": 0.3203, + "step": 45580 + }, + { + "epoch": 0.846232961908192, + "grad_norm": 0.2443203181028366, + "learning_rate": 1.14414739739245e-06, + "loss": 0.1859, + "step": 45582 + }, + { + "epoch": 0.8462700920456107, + "grad_norm": 0.2973026633262634, + "learning_rate": 1.1436056497324955e-06, + "loss": 0.4137, + "step": 45584 + }, + { + "epoch": 0.8463072221830293, + "grad_norm": 0.45685309171676636, + "learning_rate": 1.1430640225815392e-06, + "loss": 0.4294, + "step": 45586 + }, + { + "epoch": 0.8463443523204479, + "grad_norm": 0.5407895445823669, + "learning_rate": 1.1425225159469533e-06, + "loss": 0.3843, + "step": 45588 + }, + { + "epoch": 0.8463814824578666, + "grad_norm": 0.18578919768333435, + "learning_rate": 1.1419811298361027e-06, + "loss": 0.2257, + "step": 45590 + }, + { + "epoch": 0.8464186125952852, + "grad_norm": 0.3667568266391754, + "learning_rate": 1.1414398642563562e-06, + "loss": 0.411, + "step": 45592 + }, + { + "epoch": 0.8464557427327039, + "grad_norm": 0.3920191526412964, + "learning_rate": 1.1408987192150789e-06, + "loss": 0.1298, + "step": 45594 + }, + { + "epoch": 0.8464928728701225, + "grad_norm": 0.42858630418777466, + "learning_rate": 1.1403576947196337e-06, + "loss": 0.2344, + "step": 45596 + }, + { + "epoch": 0.8465300030075411, + "grad_norm": 0.3888542652130127, + "learning_rate": 1.1398167907773827e-06, + "loss": 0.3274, + "step": 45598 + }, + { + "epoch": 0.8465671331449598, + "grad_norm": 0.414314329624176, + "learning_rate": 1.1392760073956876e-06, + "loss": 0.2835, + "step": 45600 + }, + { + "epoch": 0.8466042632823784, + "grad_norm": 0.5580448508262634, + "learning_rate": 1.1387353445819026e-06, + "loss": 0.2202, + "step": 45602 + }, + { + "epoch": 0.846641393419797, + "grad_norm": 0.32915905117988586, + "learning_rate": 1.1381948023433886e-06, + "loss": 0.2831, + "step": 45604 + }, + { + "epoch": 0.8466785235572157, + "grad_norm": 0.40838298201560974, + "learning_rate": 1.1376543806874996e-06, + "loss": 0.28, + "step": 45606 + }, + { + "epoch": 0.8467156536946343, + "grad_norm": 0.3009864091873169, + "learning_rate": 1.1371140796215873e-06, + "loss": 0.2985, + "step": 45608 + }, + { + "epoch": 0.846752783832053, + "grad_norm": 0.4377017319202423, + "learning_rate": 1.136573899153005e-06, + "loss": 0.4453, + "step": 45610 + }, + { + "epoch": 0.8467899139694716, + "grad_norm": 0.17988137900829315, + "learning_rate": 1.1360338392891057e-06, + "loss": 0.1987, + "step": 45612 + }, + { + "epoch": 0.8468270441068902, + "grad_norm": 0.30020540952682495, + "learning_rate": 1.1354939000372323e-06, + "loss": 0.3311, + "step": 45614 + }, + { + "epoch": 0.8468641742443089, + "grad_norm": 0.24698926508426666, + "learning_rate": 1.1349540814047345e-06, + "loss": 0.1095, + "step": 45616 + }, + { + "epoch": 0.8469013043817275, + "grad_norm": 0.6415321230888367, + "learning_rate": 1.1344143833989575e-06, + "loss": 0.2231, + "step": 45618 + }, + { + "epoch": 0.8469384345191462, + "grad_norm": 0.31010857224464417, + "learning_rate": 1.1338748060272464e-06, + "loss": 0.2364, + "step": 45620 + }, + { + "epoch": 0.8469755646565648, + "grad_norm": 0.5542388558387756, + "learning_rate": 1.1333353492969412e-06, + "loss": 0.2095, + "step": 45622 + }, + { + "epoch": 0.8470126947939834, + "grad_norm": 0.44835609197616577, + "learning_rate": 1.1327960132153848e-06, + "loss": 0.183, + "step": 45624 + }, + { + "epoch": 0.847049824931402, + "grad_norm": 0.4196886420249939, + "learning_rate": 1.1322567977899135e-06, + "loss": 0.2147, + "step": 45626 + }, + { + "epoch": 0.8470869550688207, + "grad_norm": 0.3011663854122162, + "learning_rate": 1.131717703027866e-06, + "loss": 0.223, + "step": 45628 + }, + { + "epoch": 0.8471240852062394, + "grad_norm": 0.5968034267425537, + "learning_rate": 1.1311787289365762e-06, + "loss": 0.276, + "step": 45630 + }, + { + "epoch": 0.847161215343658, + "grad_norm": 0.48993223905563354, + "learning_rate": 1.1306398755233782e-06, + "loss": 0.3658, + "step": 45632 + }, + { + "epoch": 0.8471983454810766, + "grad_norm": 0.3149981200695038, + "learning_rate": 1.1301011427956044e-06, + "loss": 0.1806, + "step": 45634 + }, + { + "epoch": 0.8472354756184952, + "grad_norm": 0.47624871134757996, + "learning_rate": 1.1295625307605852e-06, + "loss": 0.1935, + "step": 45636 + }, + { + "epoch": 0.8472726057559139, + "grad_norm": 0.5210164189338684, + "learning_rate": 1.1290240394256524e-06, + "loss": 0.3052, + "step": 45638 + }, + { + "epoch": 0.8473097358933326, + "grad_norm": 0.5276150107383728, + "learning_rate": 1.1284856687981295e-06, + "loss": 0.3865, + "step": 45640 + }, + { + "epoch": 0.8473468660307512, + "grad_norm": 0.5263790488243103, + "learning_rate": 1.1279474188853422e-06, + "loss": 0.28, + "step": 45642 + }, + { + "epoch": 0.8473839961681698, + "grad_norm": 0.49327078461647034, + "learning_rate": 1.1274092896946165e-06, + "loss": 0.2888, + "step": 45644 + }, + { + "epoch": 0.8474211263055884, + "grad_norm": 0.40713727474212646, + "learning_rate": 1.1268712812332761e-06, + "loss": 0.3151, + "step": 45646 + }, + { + "epoch": 0.8474582564430071, + "grad_norm": 0.3393518030643463, + "learning_rate": 1.1263333935086363e-06, + "loss": 0.3122, + "step": 45648 + }, + { + "epoch": 0.8474953865804258, + "grad_norm": 0.42300945520401, + "learning_rate": 1.1257956265280222e-06, + "loss": 0.3341, + "step": 45650 + }, + { + "epoch": 0.8475325167178444, + "grad_norm": 0.5484139323234558, + "learning_rate": 1.1252579802987463e-06, + "loss": 0.2483, + "step": 45652 + }, + { + "epoch": 0.847569646855263, + "grad_norm": 0.5140788555145264, + "learning_rate": 1.1247204548281254e-06, + "loss": 0.3352, + "step": 45654 + }, + { + "epoch": 0.8476067769926816, + "grad_norm": 0.3372455835342407, + "learning_rate": 1.1241830501234763e-06, + "loss": 0.1911, + "step": 45656 + }, + { + "epoch": 0.8476439071301003, + "grad_norm": 0.43498682975769043, + "learning_rate": 1.1236457661921085e-06, + "loss": 0.2873, + "step": 45658 + }, + { + "epoch": 0.847681037267519, + "grad_norm": 0.37636634707450867, + "learning_rate": 1.123108603041334e-06, + "loss": 0.1302, + "step": 45660 + }, + { + "epoch": 0.8477181674049376, + "grad_norm": 0.617347002029419, + "learning_rate": 1.1225715606784626e-06, + "loss": 0.2554, + "step": 45662 + }, + { + "epoch": 0.8477552975423562, + "grad_norm": 0.2638891935348511, + "learning_rate": 1.1220346391108027e-06, + "loss": 0.2384, + "step": 45664 + }, + { + "epoch": 0.8477924276797748, + "grad_norm": 0.3865855932235718, + "learning_rate": 1.1214978383456576e-06, + "loss": 0.275, + "step": 45666 + }, + { + "epoch": 0.8478295578171935, + "grad_norm": 0.3010675609111786, + "learning_rate": 1.1209611583903334e-06, + "loss": 0.1636, + "step": 45668 + }, + { + "epoch": 0.8478666879546122, + "grad_norm": 0.45228472352027893, + "learning_rate": 1.1204245992521313e-06, + "loss": 0.2742, + "step": 45670 + }, + { + "epoch": 0.8479038180920307, + "grad_norm": 0.6429480910301208, + "learning_rate": 1.1198881609383527e-06, + "loss": 0.4056, + "step": 45672 + }, + { + "epoch": 0.8479409482294494, + "grad_norm": 0.4128681719303131, + "learning_rate": 1.1193518434562966e-06, + "loss": 0.2397, + "step": 45674 + }, + { + "epoch": 0.847978078366868, + "grad_norm": 0.4099940061569214, + "learning_rate": 1.1188156468132639e-06, + "loss": 0.2141, + "step": 45676 + }, + { + "epoch": 0.8480152085042867, + "grad_norm": 0.3877869248390198, + "learning_rate": 1.1182795710165462e-06, + "loss": 0.2067, + "step": 45678 + }, + { + "epoch": 0.8480523386417053, + "grad_norm": 0.2818913161754608, + "learning_rate": 1.117743616073439e-06, + "loss": 0.1738, + "step": 45680 + }, + { + "epoch": 0.848089468779124, + "grad_norm": 0.32833632826805115, + "learning_rate": 1.1172077819912363e-06, + "loss": 0.215, + "step": 45682 + }, + { + "epoch": 0.8481265989165426, + "grad_norm": 0.33793866634368896, + "learning_rate": 1.116672068777228e-06, + "loss": 0.1822, + "step": 45684 + }, + { + "epoch": 0.8481637290539612, + "grad_norm": 0.6777687668800354, + "learning_rate": 1.116136476438705e-06, + "loss": 0.259, + "step": 45686 + }, + { + "epoch": 0.8482008591913799, + "grad_norm": 0.46340078115463257, + "learning_rate": 1.1156010049829557e-06, + "loss": 0.2372, + "step": 45688 + }, + { + "epoch": 0.8482379893287985, + "grad_norm": 0.38235950469970703, + "learning_rate": 1.1150656544172633e-06, + "loss": 0.3629, + "step": 45690 + }, + { + "epoch": 0.8482751194662171, + "grad_norm": 0.33072811365127563, + "learning_rate": 1.1145304247489164e-06, + "loss": 0.2686, + "step": 45692 + }, + { + "epoch": 0.8483122496036358, + "grad_norm": 0.2648681104183197, + "learning_rate": 1.1139953159851924e-06, + "loss": 0.3272, + "step": 45694 + }, + { + "epoch": 0.8483493797410544, + "grad_norm": 0.36077794432640076, + "learning_rate": 1.1134603281333756e-06, + "loss": 0.1412, + "step": 45696 + }, + { + "epoch": 0.8483865098784731, + "grad_norm": 0.35475361347198486, + "learning_rate": 1.1129254612007456e-06, + "loss": 0.3815, + "step": 45698 + }, + { + "epoch": 0.8484236400158917, + "grad_norm": 0.3396094739437103, + "learning_rate": 1.1123907151945823e-06, + "loss": 0.2239, + "step": 45700 + }, + { + "epoch": 0.8484607701533103, + "grad_norm": 0.4191611111164093, + "learning_rate": 1.1118560901221575e-06, + "loss": 0.3854, + "step": 45702 + }, + { + "epoch": 0.848497900290729, + "grad_norm": 0.4581218659877777, + "learning_rate": 1.111321585990749e-06, + "loss": 0.2288, + "step": 45704 + }, + { + "epoch": 0.8485350304281476, + "grad_norm": 0.531073808670044, + "learning_rate": 1.1107872028076283e-06, + "loss": 0.1798, + "step": 45706 + }, + { + "epoch": 0.8485721605655663, + "grad_norm": 0.32432320713996887, + "learning_rate": 1.1102529405800678e-06, + "loss": 0.299, + "step": 45708 + }, + { + "epoch": 0.8486092907029849, + "grad_norm": 0.35589516162872314, + "learning_rate": 1.1097187993153392e-06, + "loss": 0.0717, + "step": 45710 + }, + { + "epoch": 0.8486464208404035, + "grad_norm": 0.4751163125038147, + "learning_rate": 1.1091847790207066e-06, + "loss": 0.3102, + "step": 45712 + }, + { + "epoch": 0.8486835509778222, + "grad_norm": 0.4791879653930664, + "learning_rate": 1.108650879703439e-06, + "loss": 0.3354, + "step": 45714 + }, + { + "epoch": 0.8487206811152408, + "grad_norm": 0.38955581188201904, + "learning_rate": 1.1081171013707992e-06, + "loss": 0.3327, + "step": 45716 + }, + { + "epoch": 0.8487578112526595, + "grad_norm": 0.6199437975883484, + "learning_rate": 1.1075834440300503e-06, + "loss": 0.3467, + "step": 45718 + }, + { + "epoch": 0.848794941390078, + "grad_norm": 0.4170013666152954, + "learning_rate": 1.1070499076884555e-06, + "loss": 0.3848, + "step": 45720 + }, + { + "epoch": 0.8488320715274967, + "grad_norm": 0.3895348012447357, + "learning_rate": 1.1065164923532746e-06, + "loss": 0.2023, + "step": 45722 + }, + { + "epoch": 0.8488692016649153, + "grad_norm": 0.3582184612751007, + "learning_rate": 1.1059831980317636e-06, + "loss": 0.1324, + "step": 45724 + }, + { + "epoch": 0.848906331802334, + "grad_norm": 0.340768426656723, + "learning_rate": 1.1054500247311839e-06, + "loss": 0.232, + "step": 45726 + }, + { + "epoch": 0.8489434619397527, + "grad_norm": 0.5099552273750305, + "learning_rate": 1.104916972458785e-06, + "loss": 0.1743, + "step": 45728 + }, + { + "epoch": 0.8489805920771712, + "grad_norm": 0.47041523456573486, + "learning_rate": 1.104384041221822e-06, + "loss": 0.3169, + "step": 45730 + }, + { + "epoch": 0.8490177222145899, + "grad_norm": 0.30012568831443787, + "learning_rate": 1.1038512310275484e-06, + "loss": 0.1755, + "step": 45732 + }, + { + "epoch": 0.8490548523520085, + "grad_norm": 0.2999100685119629, + "learning_rate": 1.1033185418832105e-06, + "loss": 0.2874, + "step": 45734 + }, + { + "epoch": 0.8490919824894272, + "grad_norm": 0.27719348669052124, + "learning_rate": 1.1027859737960588e-06, + "loss": 0.3284, + "step": 45736 + }, + { + "epoch": 0.8491291126268459, + "grad_norm": 0.3581984341144562, + "learning_rate": 1.1022535267733426e-06, + "loss": 0.3144, + "step": 45738 + }, + { + "epoch": 0.8491662427642644, + "grad_norm": 0.2818341553211212, + "learning_rate": 1.101721200822301e-06, + "loss": 0.2828, + "step": 45740 + }, + { + "epoch": 0.8492033729016831, + "grad_norm": 0.44229641556739807, + "learning_rate": 1.101188995950181e-06, + "loss": 0.302, + "step": 45742 + }, + { + "epoch": 0.8492405030391017, + "grad_norm": 0.35186806321144104, + "learning_rate": 1.100656912164223e-06, + "loss": 0.2456, + "step": 45744 + }, + { + "epoch": 0.8492776331765204, + "grad_norm": 0.3045912981033325, + "learning_rate": 1.1001249494716682e-06, + "loss": 0.1405, + "step": 45746 + }, + { + "epoch": 0.8493147633139391, + "grad_norm": 0.4344618320465088, + "learning_rate": 1.099593107879755e-06, + "loss": 0.1408, + "step": 45748 + }, + { + "epoch": 0.8493518934513576, + "grad_norm": 0.3132820427417755, + "learning_rate": 1.099061387395719e-06, + "loss": 0.387, + "step": 45750 + }, + { + "epoch": 0.8493890235887763, + "grad_norm": 0.4699723720550537, + "learning_rate": 1.0985297880267986e-06, + "loss": 0.435, + "step": 45752 + }, + { + "epoch": 0.8494261537261949, + "grad_norm": 0.4801592230796814, + "learning_rate": 1.097998309780225e-06, + "loss": 0.4126, + "step": 45754 + }, + { + "epoch": 0.8494632838636136, + "grad_norm": 0.22513142228126526, + "learning_rate": 1.0974669526632275e-06, + "loss": 0.3236, + "step": 45756 + }, + { + "epoch": 0.8495004140010323, + "grad_norm": 0.6874632239341736, + "learning_rate": 1.0969357166830397e-06, + "loss": 0.2838, + "step": 45758 + }, + { + "epoch": 0.8495375441384508, + "grad_norm": 0.48389434814453125, + "learning_rate": 1.0964046018468877e-06, + "loss": 0.1719, + "step": 45760 + }, + { + "epoch": 0.8495746742758695, + "grad_norm": 0.3801978528499603, + "learning_rate": 1.0958736081620003e-06, + "loss": 0.3376, + "step": 45762 + }, + { + "epoch": 0.8496118044132881, + "grad_norm": 0.315958708524704, + "learning_rate": 1.0953427356356038e-06, + "loss": 0.1782, + "step": 45764 + }, + { + "epoch": 0.8496489345507068, + "grad_norm": 0.43887069821357727, + "learning_rate": 1.0948119842749182e-06, + "loss": 0.1563, + "step": 45766 + }, + { + "epoch": 0.8496860646881255, + "grad_norm": 0.3497162461280823, + "learning_rate": 1.0942813540871677e-06, + "loss": 0.337, + "step": 45768 + }, + { + "epoch": 0.849723194825544, + "grad_norm": 0.4325413405895233, + "learning_rate": 1.0937508450795719e-06, + "loss": 0.2204, + "step": 45770 + }, + { + "epoch": 0.8497603249629627, + "grad_norm": 0.33843863010406494, + "learning_rate": 1.0932204572593496e-06, + "loss": 0.2056, + "step": 45772 + }, + { + "epoch": 0.8497974551003813, + "grad_norm": 0.3847010135650635, + "learning_rate": 1.0926901906337205e-06, + "loss": 0.2986, + "step": 45774 + }, + { + "epoch": 0.8498345852378, + "grad_norm": 0.2653925120830536, + "learning_rate": 1.0921600452098968e-06, + "loss": 0.3462, + "step": 45776 + }, + { + "epoch": 0.8498717153752186, + "grad_norm": 0.4412490427494049, + "learning_rate": 1.0916300209950915e-06, + "loss": 0.3688, + "step": 45778 + }, + { + "epoch": 0.8499088455126372, + "grad_norm": 0.6875383853912354, + "learning_rate": 1.0911001179965175e-06, + "loss": 0.3007, + "step": 45780 + }, + { + "epoch": 0.8499459756500559, + "grad_norm": 0.2566690146923065, + "learning_rate": 1.090570336221386e-06, + "loss": 0.2596, + "step": 45782 + }, + { + "epoch": 0.8499831057874745, + "grad_norm": 0.36915159225463867, + "learning_rate": 1.0900406756769055e-06, + "loss": 0.3109, + "step": 45784 + }, + { + "epoch": 0.8500202359248932, + "grad_norm": 0.2693183720111847, + "learning_rate": 1.0895111363702826e-06, + "loss": 0.1505, + "step": 45786 + }, + { + "epoch": 0.8500573660623117, + "grad_norm": 0.5959376692771912, + "learning_rate": 1.0889817183087236e-06, + "loss": 0.3289, + "step": 45788 + }, + { + "epoch": 0.8500944961997304, + "grad_norm": 0.2837160527706146, + "learning_rate": 1.088452421499433e-06, + "loss": 0.2686, + "step": 45790 + }, + { + "epoch": 0.8501316263371491, + "grad_norm": 0.3837205171585083, + "learning_rate": 1.0879232459496103e-06, + "loss": 0.0845, + "step": 45792 + }, + { + "epoch": 0.8501687564745677, + "grad_norm": 0.3714580833911896, + "learning_rate": 1.0873941916664578e-06, + "loss": 0.1331, + "step": 45794 + }, + { + "epoch": 0.8502058866119864, + "grad_norm": 0.41546598076820374, + "learning_rate": 1.0868652586571748e-06, + "loss": 0.2922, + "step": 45796 + }, + { + "epoch": 0.850243016749405, + "grad_norm": 0.374381959438324, + "learning_rate": 1.0863364469289573e-06, + "loss": 0.3212, + "step": 45798 + }, + { + "epoch": 0.8502801468868236, + "grad_norm": 0.3366502523422241, + "learning_rate": 1.0858077564890002e-06, + "loss": 0.3412, + "step": 45800 + }, + { + "epoch": 0.8503172770242423, + "grad_norm": 0.32865509390830994, + "learning_rate": 1.0852791873445002e-06, + "loss": 0.43, + "step": 45802 + }, + { + "epoch": 0.8503544071616609, + "grad_norm": 0.28559446334838867, + "learning_rate": 1.0847507395026468e-06, + "loss": 0.2814, + "step": 45804 + }, + { + "epoch": 0.8503915372990796, + "grad_norm": 0.43025457859039307, + "learning_rate": 1.0842224129706302e-06, + "loss": 0.3273, + "step": 45806 + }, + { + "epoch": 0.8504286674364981, + "grad_norm": 0.38377469778060913, + "learning_rate": 1.0836942077556423e-06, + "loss": 0.3647, + "step": 45808 + }, + { + "epoch": 0.8504657975739168, + "grad_norm": 0.22295059263706207, + "learning_rate": 1.0831661238648684e-06, + "loss": 0.3308, + "step": 45810 + }, + { + "epoch": 0.8505029277113355, + "grad_norm": 0.36052536964416504, + "learning_rate": 1.0826381613054937e-06, + "loss": 0.2514, + "step": 45812 + }, + { + "epoch": 0.8505400578487541, + "grad_norm": 0.2967050075531006, + "learning_rate": 1.082110320084705e-06, + "loss": 0.2162, + "step": 45814 + }, + { + "epoch": 0.8505771879861728, + "grad_norm": 0.5921688675880432, + "learning_rate": 1.0815826002096819e-06, + "loss": 0.2528, + "step": 45816 + }, + { + "epoch": 0.8506143181235913, + "grad_norm": 0.5063170194625854, + "learning_rate": 1.0810550016876066e-06, + "loss": 0.2938, + "step": 45818 + }, + { + "epoch": 0.85065144826101, + "grad_norm": 0.3069388270378113, + "learning_rate": 1.0805275245256563e-06, + "loss": 0.2259, + "step": 45820 + }, + { + "epoch": 0.8506885783984287, + "grad_norm": 0.5941773056983948, + "learning_rate": 1.0800001687310091e-06, + "loss": 0.2344, + "step": 45822 + }, + { + "epoch": 0.8507257085358473, + "grad_norm": 0.3848460614681244, + "learning_rate": 1.0794729343108413e-06, + "loss": 0.1967, + "step": 45824 + }, + { + "epoch": 0.850762838673266, + "grad_norm": 0.4966377019882202, + "learning_rate": 1.0789458212723269e-06, + "loss": 0.3422, + "step": 45826 + }, + { + "epoch": 0.8507999688106845, + "grad_norm": 0.4261513650417328, + "learning_rate": 1.0784188296226394e-06, + "loss": 0.2784, + "step": 45828 + }, + { + "epoch": 0.8508370989481032, + "grad_norm": 0.4619360864162445, + "learning_rate": 1.0778919593689475e-06, + "loss": 0.1112, + "step": 45830 + }, + { + "epoch": 0.8508742290855218, + "grad_norm": 0.16798186302185059, + "learning_rate": 1.0773652105184206e-06, + "loss": 0.2746, + "step": 45832 + }, + { + "epoch": 0.8509113592229405, + "grad_norm": 0.443312406539917, + "learning_rate": 1.0768385830782268e-06, + "loss": 0.3658, + "step": 45834 + }, + { + "epoch": 0.8509484893603592, + "grad_norm": 0.2957082986831665, + "learning_rate": 1.0763120770555313e-06, + "loss": 0.3061, + "step": 45836 + }, + { + "epoch": 0.8509856194977777, + "grad_norm": 0.4394667148590088, + "learning_rate": 1.0757856924575017e-06, + "loss": 0.3056, + "step": 45838 + }, + { + "epoch": 0.8510227496351964, + "grad_norm": 0.36978834867477417, + "learning_rate": 1.0752594292912976e-06, + "loss": 0.2392, + "step": 45840 + }, + { + "epoch": 0.851059879772615, + "grad_norm": 0.6332898139953613, + "learning_rate": 1.0747332875640782e-06, + "loss": 0.233, + "step": 45842 + }, + { + "epoch": 0.8510970099100337, + "grad_norm": 0.1708012968301773, + "learning_rate": 1.0742072672830039e-06, + "loss": 0.2769, + "step": 45844 + }, + { + "epoch": 0.8511341400474524, + "grad_norm": 0.49460455775260925, + "learning_rate": 1.0736813684552338e-06, + "loss": 0.1069, + "step": 45846 + }, + { + "epoch": 0.8511712701848709, + "grad_norm": 0.4883250296115875, + "learning_rate": 1.0731555910879221e-06, + "loss": 0.2566, + "step": 45848 + }, + { + "epoch": 0.8512084003222896, + "grad_norm": 0.3710325360298157, + "learning_rate": 1.072629935188224e-06, + "loss": 0.2929, + "step": 45850 + }, + { + "epoch": 0.8512455304597082, + "grad_norm": 0.4473518431186676, + "learning_rate": 1.0721044007632942e-06, + "loss": 0.2774, + "step": 45852 + }, + { + "epoch": 0.8512826605971269, + "grad_norm": 0.3142022490501404, + "learning_rate": 1.0715789878202787e-06, + "loss": 0.3252, + "step": 45854 + }, + { + "epoch": 0.8513197907345456, + "grad_norm": 0.5459117293357849, + "learning_rate": 1.0710536963663299e-06, + "loss": 0.3411, + "step": 45856 + }, + { + "epoch": 0.8513569208719641, + "grad_norm": 0.4470216631889343, + "learning_rate": 1.0705285264085952e-06, + "loss": 0.2737, + "step": 45858 + }, + { + "epoch": 0.8513940510093828, + "grad_norm": 0.3486539125442505, + "learning_rate": 1.0700034779542213e-06, + "loss": 0.2203, + "step": 45860 + }, + { + "epoch": 0.8514311811468014, + "grad_norm": 0.4523930251598358, + "learning_rate": 1.069478551010351e-06, + "loss": 0.1823, + "step": 45862 + }, + { + "epoch": 0.8514683112842201, + "grad_norm": 0.44851231575012207, + "learning_rate": 1.0689537455841293e-06, + "loss": 0.2858, + "step": 45864 + }, + { + "epoch": 0.8515054414216388, + "grad_norm": 0.3544789254665375, + "learning_rate": 1.0684290616826932e-06, + "loss": 0.2376, + "step": 45866 + }, + { + "epoch": 0.8515425715590573, + "grad_norm": 0.2903895974159241, + "learning_rate": 1.0679044993131838e-06, + "loss": 0.3488, + "step": 45868 + }, + { + "epoch": 0.851579701696476, + "grad_norm": 0.28002220392227173, + "learning_rate": 1.067380058482741e-06, + "loss": 0.1652, + "step": 45870 + }, + { + "epoch": 0.8516168318338946, + "grad_norm": 0.5367836952209473, + "learning_rate": 1.066855739198498e-06, + "loss": 0.3014, + "step": 45872 + }, + { + "epoch": 0.8516539619713133, + "grad_norm": 0.28074708580970764, + "learning_rate": 1.0663315414675912e-06, + "loss": 0.3915, + "step": 45874 + }, + { + "epoch": 0.8516910921087318, + "grad_norm": 0.35318630933761597, + "learning_rate": 1.0658074652971518e-06, + "loss": 0.1274, + "step": 45876 + }, + { + "epoch": 0.8517282222461505, + "grad_norm": 0.39961472153663635, + "learning_rate": 1.065283510694315e-06, + "loss": 0.4723, + "step": 45878 + }, + { + "epoch": 0.8517653523835692, + "grad_norm": 0.3566596806049347, + "learning_rate": 1.064759677666204e-06, + "loss": 0.1698, + "step": 45880 + }, + { + "epoch": 0.8518024825209878, + "grad_norm": 0.4198572039604187, + "learning_rate": 1.064235966219952e-06, + "loss": 0.2475, + "step": 45882 + }, + { + "epoch": 0.8518396126584065, + "grad_norm": 0.41657543182373047, + "learning_rate": 1.0637123763626812e-06, + "loss": 0.3093, + "step": 45884 + }, + { + "epoch": 0.851876742795825, + "grad_norm": 0.5737757086753845, + "learning_rate": 1.0631889081015178e-06, + "loss": 0.2677, + "step": 45886 + }, + { + "epoch": 0.8519138729332437, + "grad_norm": 0.5748029947280884, + "learning_rate": 1.0626655614435843e-06, + "loss": 0.153, + "step": 45888 + }, + { + "epoch": 0.8519510030706624, + "grad_norm": 0.4245150089263916, + "learning_rate": 1.0621423363960038e-06, + "loss": 0.2452, + "step": 45890 + }, + { + "epoch": 0.851988133208081, + "grad_norm": 0.2722765803337097, + "learning_rate": 1.0616192329658914e-06, + "loss": 0.2851, + "step": 45892 + }, + { + "epoch": 0.8520252633454997, + "grad_norm": 0.4790297746658325, + "learning_rate": 1.0610962511603695e-06, + "loss": 0.2379, + "step": 45894 + }, + { + "epoch": 0.8520623934829182, + "grad_norm": 0.180750772356987, + "learning_rate": 1.0605733909865512e-06, + "loss": 0.5136, + "step": 45896 + }, + { + "epoch": 0.8520995236203369, + "grad_norm": 0.6062609553337097, + "learning_rate": 1.060050652451553e-06, + "loss": 0.2474, + "step": 45898 + }, + { + "epoch": 0.8521366537577556, + "grad_norm": 0.3892255127429962, + "learning_rate": 1.0595280355624882e-06, + "loss": 0.2568, + "step": 45900 + }, + { + "epoch": 0.8521737838951742, + "grad_norm": 0.5422790050506592, + "learning_rate": 1.0590055403264655e-06, + "loss": 0.2393, + "step": 45902 + }, + { + "epoch": 0.8522109140325929, + "grad_norm": 0.5770406126976013, + "learning_rate": 1.0584831667505991e-06, + "loss": 0.4254, + "step": 45904 + }, + { + "epoch": 0.8522480441700114, + "grad_norm": 0.16075527667999268, + "learning_rate": 1.05796091484199e-06, + "loss": 0.1344, + "step": 45906 + }, + { + "epoch": 0.8522851743074301, + "grad_norm": 0.49856898188591003, + "learning_rate": 1.0574387846077494e-06, + "loss": 0.3245, + "step": 45908 + }, + { + "epoch": 0.8523223044448488, + "grad_norm": 0.33979272842407227, + "learning_rate": 1.0569167760549814e-06, + "loss": 0.3413, + "step": 45910 + }, + { + "epoch": 0.8523594345822674, + "grad_norm": 0.24217449128627777, + "learning_rate": 1.0563948891907871e-06, + "loss": 0.1777, + "step": 45912 + }, + { + "epoch": 0.8523965647196861, + "grad_norm": 0.47848424315452576, + "learning_rate": 1.0558731240222698e-06, + "loss": 0.2715, + "step": 45914 + }, + { + "epoch": 0.8524336948571046, + "grad_norm": 0.31679052114486694, + "learning_rate": 1.0553514805565312e-06, + "loss": 0.2867, + "step": 45916 + }, + { + "epoch": 0.8524708249945233, + "grad_norm": 0.5799715518951416, + "learning_rate": 1.054829958800664e-06, + "loss": 0.1701, + "step": 45918 + }, + { + "epoch": 0.852507955131942, + "grad_norm": 0.5492575764656067, + "learning_rate": 1.0543085587617674e-06, + "loss": 0.1786, + "step": 45920 + }, + { + "epoch": 0.8525450852693606, + "grad_norm": 0.2964135706424713, + "learning_rate": 1.0537872804469374e-06, + "loss": 0.1169, + "step": 45922 + }, + { + "epoch": 0.8525822154067793, + "grad_norm": 0.7114038467407227, + "learning_rate": 1.0532661238632636e-06, + "loss": 0.1051, + "step": 45924 + }, + { + "epoch": 0.8526193455441978, + "grad_norm": 0.3371790051460266, + "learning_rate": 1.0527450890178392e-06, + "loss": 0.2555, + "step": 45926 + }, + { + "epoch": 0.8526564756816165, + "grad_norm": 0.4589039087295532, + "learning_rate": 1.0522241759177566e-06, + "loss": 0.3586, + "step": 45928 + }, + { + "epoch": 0.8526936058190351, + "grad_norm": 0.3742820918560028, + "learning_rate": 1.0517033845700985e-06, + "loss": 0.2502, + "step": 45930 + }, + { + "epoch": 0.8527307359564538, + "grad_norm": 0.3792300820350647, + "learning_rate": 1.0511827149819553e-06, + "loss": 0.1868, + "step": 45932 + }, + { + "epoch": 0.8527678660938725, + "grad_norm": 0.24446897208690643, + "learning_rate": 1.050662167160409e-06, + "loss": 0.2214, + "step": 45934 + }, + { + "epoch": 0.852804996231291, + "grad_norm": 0.451593816280365, + "learning_rate": 1.050141741112546e-06, + "loss": 0.1768, + "step": 45936 + }, + { + "epoch": 0.8528421263687097, + "grad_norm": 0.2400733381509781, + "learning_rate": 1.0496214368454438e-06, + "loss": 0.1928, + "step": 45938 + }, + { + "epoch": 0.8528792565061283, + "grad_norm": 0.31485554575920105, + "learning_rate": 1.0491012543661882e-06, + "loss": 0.1486, + "step": 45940 + }, + { + "epoch": 0.852916386643547, + "grad_norm": 0.2525891661643982, + "learning_rate": 1.0485811936818502e-06, + "loss": 0.2328, + "step": 45942 + }, + { + "epoch": 0.8529535167809656, + "grad_norm": 0.42416176199913025, + "learning_rate": 1.0480612547995106e-06, + "loss": 0.3785, + "step": 45944 + }, + { + "epoch": 0.8529906469183842, + "grad_norm": 0.4395844638347626, + "learning_rate": 1.047541437726245e-06, + "loss": 0.2917, + "step": 45946 + }, + { + "epoch": 0.8530277770558029, + "grad_norm": 0.7794254422187805, + "learning_rate": 1.047021742469122e-06, + "loss": 0.1704, + "step": 45948 + }, + { + "epoch": 0.8530649071932215, + "grad_norm": 0.43471774458885193, + "learning_rate": 1.0465021690352162e-06, + "loss": 0.4344, + "step": 45950 + }, + { + "epoch": 0.8531020373306402, + "grad_norm": 0.5441377758979797, + "learning_rate": 1.0459827174315973e-06, + "loss": 0.3384, + "step": 45952 + }, + { + "epoch": 0.8531391674680588, + "grad_norm": 0.43783295154571533, + "learning_rate": 1.0454633876653353e-06, + "loss": 0.1135, + "step": 45954 + }, + { + "epoch": 0.8531762976054774, + "grad_norm": 0.48397764563560486, + "learning_rate": 1.0449441797434923e-06, + "loss": 0.1306, + "step": 45956 + }, + { + "epoch": 0.8532134277428961, + "grad_norm": 0.4890786111354828, + "learning_rate": 1.0444250936731347e-06, + "loss": 0.2852, + "step": 45958 + }, + { + "epoch": 0.8532505578803147, + "grad_norm": 0.4246615171432495, + "learning_rate": 1.0439061294613284e-06, + "loss": 0.2106, + "step": 45960 + }, + { + "epoch": 0.8532876880177334, + "grad_norm": 0.2709071934223175, + "learning_rate": 1.0433872871151319e-06, + "loss": 0.2762, + "step": 45962 + }, + { + "epoch": 0.853324818155152, + "grad_norm": 0.5390672087669373, + "learning_rate": 1.0428685666416083e-06, + "loss": 0.4722, + "step": 45964 + }, + { + "epoch": 0.8533619482925706, + "grad_norm": 0.23091760277748108, + "learning_rate": 1.0423499680478122e-06, + "loss": 0.0603, + "step": 45966 + }, + { + "epoch": 0.8533990784299893, + "grad_norm": 0.4271117150783539, + "learning_rate": 1.0418314913408034e-06, + "loss": 0.4149, + "step": 45968 + }, + { + "epoch": 0.8534362085674079, + "grad_norm": 0.4028742015361786, + "learning_rate": 1.041313136527633e-06, + "loss": 0.388, + "step": 45970 + }, + { + "epoch": 0.8534733387048266, + "grad_norm": 0.6057612895965576, + "learning_rate": 1.0407949036153564e-06, + "loss": 0.27, + "step": 45972 + }, + { + "epoch": 0.8535104688422452, + "grad_norm": 0.4208018481731415, + "learning_rate": 1.0402767926110258e-06, + "loss": 0.364, + "step": 45974 + }, + { + "epoch": 0.8535475989796638, + "grad_norm": 0.44113689661026, + "learning_rate": 1.0397588035216899e-06, + "loss": 0.2217, + "step": 45976 + }, + { + "epoch": 0.8535847291170825, + "grad_norm": 0.3366793394088745, + "learning_rate": 1.039240936354401e-06, + "loss": 0.3272, + "step": 45978 + }, + { + "epoch": 0.8536218592545011, + "grad_norm": 0.4800073802471161, + "learning_rate": 1.0387231911161988e-06, + "loss": 0.2765, + "step": 45980 + }, + { + "epoch": 0.8536589893919198, + "grad_norm": 0.16636383533477783, + "learning_rate": 1.0382055678141323e-06, + "loss": 0.2887, + "step": 45982 + }, + { + "epoch": 0.8536961195293383, + "grad_norm": 0.2764129042625427, + "learning_rate": 1.0376880664552446e-06, + "loss": 0.3735, + "step": 45984 + }, + { + "epoch": 0.853733249666757, + "grad_norm": 0.32426717877388, + "learning_rate": 1.0371706870465793e-06, + "loss": 0.1868, + "step": 45986 + }, + { + "epoch": 0.8537703798041757, + "grad_norm": 0.3913825452327728, + "learning_rate": 1.0366534295951714e-06, + "loss": 0.2667, + "step": 45988 + }, + { + "epoch": 0.8538075099415943, + "grad_norm": 0.4962126314640045, + "learning_rate": 1.0361362941080643e-06, + "loss": 0.3124, + "step": 45990 + }, + { + "epoch": 0.853844640079013, + "grad_norm": 0.34363263845443726, + "learning_rate": 1.0356192805922915e-06, + "loss": 0.5479, + "step": 45992 + }, + { + "epoch": 0.8538817702164315, + "grad_norm": 0.3517397940158844, + "learning_rate": 1.0351023890548872e-06, + "loss": 0.3074, + "step": 45994 + }, + { + "epoch": 0.8539189003538502, + "grad_norm": 0.447147935628891, + "learning_rate": 1.0345856195028869e-06, + "loss": 0.2679, + "step": 45996 + }, + { + "epoch": 0.8539560304912689, + "grad_norm": 0.4513747990131378, + "learning_rate": 1.0340689719433228e-06, + "loss": 0.2177, + "step": 45998 + }, + { + "epoch": 0.8539931606286875, + "grad_norm": 0.2791364789009094, + "learning_rate": 1.0335524463832225e-06, + "loss": 0.373, + "step": 46000 + }, + { + "epoch": 0.8540302907661061, + "grad_norm": 0.25084614753723145, + "learning_rate": 1.033036042829617e-06, + "loss": 0.1725, + "step": 46002 + }, + { + "epoch": 0.8540674209035247, + "grad_norm": 0.4851343035697937, + "learning_rate": 1.0325197612895333e-06, + "loss": 0.2809, + "step": 46004 + }, + { + "epoch": 0.8541045510409434, + "grad_norm": 0.27796807885169983, + "learning_rate": 1.032003601769993e-06, + "loss": 0.5019, + "step": 46006 + }, + { + "epoch": 0.8541416811783621, + "grad_norm": 0.44860410690307617, + "learning_rate": 1.031487564278023e-06, + "loss": 0.2012, + "step": 46008 + }, + { + "epoch": 0.8541788113157807, + "grad_norm": 0.36061036586761475, + "learning_rate": 1.0309716488206411e-06, + "loss": 0.2425, + "step": 46010 + }, + { + "epoch": 0.8542159414531993, + "grad_norm": 0.29998350143432617, + "learning_rate": 1.0304558554048705e-06, + "loss": 0.3274, + "step": 46012 + }, + { + "epoch": 0.8542530715906179, + "grad_norm": 0.36201295256614685, + "learning_rate": 1.0299401840377287e-06, + "loss": 0.2967, + "step": 46014 + }, + { + "epoch": 0.8542902017280366, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.0294246347262349e-06, + "loss": 0.5454, + "step": 46016 + }, + { + "epoch": 0.8543273318654553, + "grad_norm": 0.2742396593093872, + "learning_rate": 1.0289092074773988e-06, + "loss": 0.269, + "step": 46018 + }, + { + "epoch": 0.8543644620028739, + "grad_norm": 0.17151041328907013, + "learning_rate": 1.028393902298238e-06, + "loss": 0.1378, + "step": 46020 + }, + { + "epoch": 0.8544015921402925, + "grad_norm": 0.33328571915626526, + "learning_rate": 1.0278787191957617e-06, + "loss": 0.2217, + "step": 46022 + }, + { + "epoch": 0.8544387222777111, + "grad_norm": 0.38433122634887695, + "learning_rate": 1.027363658176983e-06, + "loss": 0.1776, + "step": 46024 + }, + { + "epoch": 0.8544758524151298, + "grad_norm": 0.289621502161026, + "learning_rate": 1.0268487192489075e-06, + "loss": 0.1082, + "step": 46026 + }, + { + "epoch": 0.8545129825525484, + "grad_norm": 0.4766714870929718, + "learning_rate": 1.0263339024185458e-06, + "loss": 0.3294, + "step": 46028 + }, + { + "epoch": 0.8545501126899671, + "grad_norm": 0.3471944034099579, + "learning_rate": 1.0258192076929008e-06, + "loss": 0.2152, + "step": 46030 + }, + { + "epoch": 0.8545872428273857, + "grad_norm": 0.320366770029068, + "learning_rate": 1.025304635078973e-06, + "loss": 0.2824, + "step": 46032 + }, + { + "epoch": 0.8546243729648043, + "grad_norm": 0.9767128825187683, + "learning_rate": 1.0247901845837672e-06, + "loss": 0.3382, + "step": 46034 + }, + { + "epoch": 0.854661503102223, + "grad_norm": 0.43145957589149475, + "learning_rate": 1.024275856214283e-06, + "loss": 0.3641, + "step": 46036 + }, + { + "epoch": 0.8546986332396416, + "grad_norm": 0.4482840597629547, + "learning_rate": 1.0237616499775194e-06, + "loss": 0.1859, + "step": 46038 + }, + { + "epoch": 0.8547357633770603, + "grad_norm": 0.6534146070480347, + "learning_rate": 1.023247565880472e-06, + "loss": 0.351, + "step": 46040 + }, + { + "epoch": 0.8547728935144789, + "grad_norm": 0.42724812030792236, + "learning_rate": 1.0227336039301394e-06, + "loss": 0.3739, + "step": 46042 + }, + { + "epoch": 0.8548100236518975, + "grad_norm": 0.3642466366291046, + "learning_rate": 1.0222197641335096e-06, + "loss": 0.3619, + "step": 46044 + }, + { + "epoch": 0.8548471537893162, + "grad_norm": 0.3381684720516205, + "learning_rate": 1.021706046497578e-06, + "loss": 0.2021, + "step": 46046 + }, + { + "epoch": 0.8548842839267348, + "grad_norm": 0.383078396320343, + "learning_rate": 1.0211924510293337e-06, + "loss": 0.2247, + "step": 46048 + }, + { + "epoch": 0.8549214140641535, + "grad_norm": 0.30971020460128784, + "learning_rate": 1.0206789777357672e-06, + "loss": 0.2024, + "step": 46050 + }, + { + "epoch": 0.8549585442015721, + "grad_norm": 0.20407156646251678, + "learning_rate": 1.0201656266238624e-06, + "loss": 0.3147, + "step": 46052 + }, + { + "epoch": 0.8549956743389907, + "grad_norm": 0.5261774659156799, + "learning_rate": 1.0196523977006067e-06, + "loss": 0.2627, + "step": 46054 + }, + { + "epoch": 0.8550328044764094, + "grad_norm": 0.28000393509864807, + "learning_rate": 1.0191392909729814e-06, + "loss": 0.1602, + "step": 46056 + }, + { + "epoch": 0.855069934613828, + "grad_norm": 0.28807374835014343, + "learning_rate": 1.0186263064479695e-06, + "loss": 0.3009, + "step": 46058 + }, + { + "epoch": 0.8551070647512466, + "grad_norm": 0.4003903567790985, + "learning_rate": 1.0181134441325512e-06, + "loss": 0.3935, + "step": 46060 + }, + { + "epoch": 0.8551441948886653, + "grad_norm": 0.44871625304222107, + "learning_rate": 1.0176007040337043e-06, + "loss": 0.2415, + "step": 46062 + }, + { + "epoch": 0.8551813250260839, + "grad_norm": 0.34174075722694397, + "learning_rate": 1.0170880861584076e-06, + "loss": 0.2746, + "step": 46064 + }, + { + "epoch": 0.8552184551635026, + "grad_norm": 0.4288659691810608, + "learning_rate": 1.0165755905136364e-06, + "loss": 0.3563, + "step": 46066 + }, + { + "epoch": 0.8552555853009212, + "grad_norm": 0.5128308534622192, + "learning_rate": 1.016063217106361e-06, + "loss": 0.0958, + "step": 46068 + }, + { + "epoch": 0.8552927154383398, + "grad_norm": 0.5027909874916077, + "learning_rate": 1.015550965943556e-06, + "loss": 0.2644, + "step": 46070 + }, + { + "epoch": 0.8553298455757585, + "grad_norm": 0.3130652904510498, + "learning_rate": 1.0150388370321917e-06, + "loss": 0.1813, + "step": 46072 + }, + { + "epoch": 0.8553669757131771, + "grad_norm": 0.3760024607181549, + "learning_rate": 1.0145268303792343e-06, + "loss": 0.4993, + "step": 46074 + }, + { + "epoch": 0.8554041058505958, + "grad_norm": 0.45926809310913086, + "learning_rate": 1.0140149459916516e-06, + "loss": 0.3788, + "step": 46076 + }, + { + "epoch": 0.8554412359880144, + "grad_norm": 0.695491373538971, + "learning_rate": 1.0135031838764099e-06, + "loss": 0.3275, + "step": 46078 + }, + { + "epoch": 0.855478366125433, + "grad_norm": 0.4763723611831665, + "learning_rate": 1.0129915440404747e-06, + "loss": 0.2214, + "step": 46080 + }, + { + "epoch": 0.8555154962628516, + "grad_norm": 0.387323260307312, + "learning_rate": 1.0124800264908018e-06, + "loss": 0.0521, + "step": 46082 + }, + { + "epoch": 0.8555526264002703, + "grad_norm": 0.2989139258861542, + "learning_rate": 1.011968631234357e-06, + "loss": 0.2115, + "step": 46084 + }, + { + "epoch": 0.855589756537689, + "grad_norm": 0.32972052693367004, + "learning_rate": 1.0114573582780952e-06, + "loss": 0.1938, + "step": 46086 + }, + { + "epoch": 0.8556268866751076, + "grad_norm": 0.31097927689552307, + "learning_rate": 1.0109462076289755e-06, + "loss": 0.2783, + "step": 46088 + }, + { + "epoch": 0.8556640168125262, + "grad_norm": 0.2864035964012146, + "learning_rate": 1.0104351792939549e-06, + "loss": 0.1666, + "step": 46090 + }, + { + "epoch": 0.8557011469499448, + "grad_norm": 0.46506014466285706, + "learning_rate": 1.0099242732799818e-06, + "loss": 0.2935, + "step": 46092 + }, + { + "epoch": 0.8557382770873635, + "grad_norm": 0.40353989601135254, + "learning_rate": 1.0094134895940144e-06, + "loss": 0.3407, + "step": 46094 + }, + { + "epoch": 0.8557754072247822, + "grad_norm": 0.3506017029285431, + "learning_rate": 1.0089028282429958e-06, + "loss": 0.2645, + "step": 46096 + }, + { + "epoch": 0.8558125373622008, + "grad_norm": 0.42331644892692566, + "learning_rate": 1.0083922892338794e-06, + "loss": 0.2844, + "step": 46098 + }, + { + "epoch": 0.8558496674996194, + "grad_norm": 0.19131030142307281, + "learning_rate": 1.0078818725736105e-06, + "loss": 0.1323, + "step": 46100 + }, + { + "epoch": 0.855886797637038, + "grad_norm": 0.31756022572517395, + "learning_rate": 1.007371578269135e-06, + "loss": 0.294, + "step": 46102 + }, + { + "epoch": 0.8559239277744567, + "grad_norm": 0.5922867655754089, + "learning_rate": 1.0068614063273986e-06, + "loss": 0.2922, + "step": 46104 + }, + { + "epoch": 0.8559610579118754, + "grad_norm": 0.5192360281944275, + "learning_rate": 1.0063513567553384e-06, + "loss": 0.336, + "step": 46106 + }, + { + "epoch": 0.855998188049294, + "grad_norm": 0.44299548864364624, + "learning_rate": 1.0058414295598973e-06, + "loss": 0.2214, + "step": 46108 + }, + { + "epoch": 0.8560353181867126, + "grad_norm": 0.4147316515445709, + "learning_rate": 1.0053316247480138e-06, + "loss": 0.1448, + "step": 46110 + }, + { + "epoch": 0.8560724483241312, + "grad_norm": 0.4246554374694824, + "learning_rate": 1.0048219423266236e-06, + "loss": 0.1484, + "step": 46112 + }, + { + "epoch": 0.8561095784615499, + "grad_norm": 0.5457956194877625, + "learning_rate": 1.0043123823026668e-06, + "loss": 0.2399, + "step": 46114 + }, + { + "epoch": 0.8561467085989686, + "grad_norm": 0.414995938539505, + "learning_rate": 1.00380294468307e-06, + "loss": 0.1696, + "step": 46116 + }, + { + "epoch": 0.8561838387363871, + "grad_norm": 0.43596112728118896, + "learning_rate": 1.003293629474771e-06, + "loss": 0.2017, + "step": 46118 + }, + { + "epoch": 0.8562209688738058, + "grad_norm": 0.559964120388031, + "learning_rate": 1.0027844366846962e-06, + "loss": 0.2874, + "step": 46120 + }, + { + "epoch": 0.8562580990112244, + "grad_norm": 0.4684061110019684, + "learning_rate": 1.0022753663197737e-06, + "loss": 0.1829, + "step": 46122 + }, + { + "epoch": 0.8562952291486431, + "grad_norm": 0.8103684782981873, + "learning_rate": 1.0017664183869336e-06, + "loss": 0.3036, + "step": 46124 + }, + { + "epoch": 0.8563323592860618, + "grad_norm": 0.5137556791305542, + "learning_rate": 1.001257592893099e-06, + "loss": 0.2096, + "step": 46126 + }, + { + "epoch": 0.8563694894234803, + "grad_norm": 0.5199487209320068, + "learning_rate": 1.0007488898451945e-06, + "loss": 0.1466, + "step": 46128 + }, + { + "epoch": 0.856406619560899, + "grad_norm": 0.795049250125885, + "learning_rate": 1.0002403092501434e-06, + "loss": 0.5016, + "step": 46130 + }, + { + "epoch": 0.8564437496983176, + "grad_norm": 0.37341001629829407, + "learning_rate": 9.997318511148623e-07, + "loss": 0.1678, + "step": 46132 + }, + { + "epoch": 0.8564808798357363, + "grad_norm": 0.5146586298942566, + "learning_rate": 9.992235154462726e-07, + "loss": 0.4902, + "step": 46134 + }, + { + "epoch": 0.8565180099731549, + "grad_norm": 0.49121350049972534, + "learning_rate": 9.98715302251292e-07, + "loss": 0.2585, + "step": 46136 + }, + { + "epoch": 0.8565551401105735, + "grad_norm": 0.4088270664215088, + "learning_rate": 9.982072115368313e-07, + "loss": 0.2735, + "step": 46138 + }, + { + "epoch": 0.8565922702479922, + "grad_norm": 0.35057932138442993, + "learning_rate": 9.976992433098077e-07, + "loss": 0.3379, + "step": 46140 + }, + { + "epoch": 0.8566294003854108, + "grad_norm": 0.25527843832969666, + "learning_rate": 9.971913975771341e-07, + "loss": 0.2667, + "step": 46142 + }, + { + "epoch": 0.8566665305228295, + "grad_norm": 0.4950246214866638, + "learning_rate": 9.966836743457175e-07, + "loss": 0.4714, + "step": 46144 + }, + { + "epoch": 0.8567036606602481, + "grad_norm": 0.43320509791374207, + "learning_rate": 9.96176073622468e-07, + "loss": 0.2991, + "step": 46146 + }, + { + "epoch": 0.8567407907976667, + "grad_norm": 1.155590295791626, + "learning_rate": 9.95668595414293e-07, + "loss": 0.152, + "step": 46148 + }, + { + "epoch": 0.8567779209350854, + "grad_norm": 0.2639387249946594, + "learning_rate": 9.951612397280963e-07, + "loss": 0.2366, + "step": 46150 + }, + { + "epoch": 0.856815051072504, + "grad_norm": 0.4478684067726135, + "learning_rate": 9.946540065707833e-07, + "loss": 0.3208, + "step": 46152 + }, + { + "epoch": 0.8568521812099227, + "grad_norm": 0.34140002727508545, + "learning_rate": 9.941468959492562e-07, + "loss": 0.2934, + "step": 46154 + }, + { + "epoch": 0.8568893113473413, + "grad_norm": 0.26276132464408875, + "learning_rate": 9.936399078704129e-07, + "loss": 0.2396, + "step": 46156 + }, + { + "epoch": 0.8569264414847599, + "grad_norm": 0.4313145875930786, + "learning_rate": 9.931330423411545e-07, + "loss": 0.2825, + "step": 46158 + }, + { + "epoch": 0.8569635716221786, + "grad_norm": 0.37394723296165466, + "learning_rate": 9.926262993683755e-07, + "loss": 0.2194, + "step": 46160 + }, + { + "epoch": 0.8570007017595972, + "grad_norm": 0.21449095010757446, + "learning_rate": 9.921196789589726e-07, + "loss": 0.2751, + "step": 46162 + }, + { + "epoch": 0.8570378318970159, + "grad_norm": 0.37025001645088196, + "learning_rate": 9.916131811198382e-07, + "loss": 0.1488, + "step": 46164 + }, + { + "epoch": 0.8570749620344345, + "grad_norm": 0.47388169169425964, + "learning_rate": 9.911068058578666e-07, + "loss": 0.2557, + "step": 46166 + }, + { + "epoch": 0.8571120921718531, + "grad_norm": 0.3557104468345642, + "learning_rate": 9.906005531799467e-07, + "loss": 0.0881, + "step": 46168 + }, + { + "epoch": 0.8571492223092718, + "grad_norm": 0.5810810327529907, + "learning_rate": 9.900944230929666e-07, + "loss": 0.2012, + "step": 46170 + }, + { + "epoch": 0.8571863524466904, + "grad_norm": 0.33764055371284485, + "learning_rate": 9.895884156038128e-07, + "loss": 0.2997, + "step": 46172 + }, + { + "epoch": 0.8572234825841091, + "grad_norm": 0.5022072792053223, + "learning_rate": 9.890825307193719e-07, + "loss": 0.3853, + "step": 46174 + }, + { + "epoch": 0.8572606127215276, + "grad_norm": 0.3528594672679901, + "learning_rate": 9.885767684465286e-07, + "loss": 0.2999, + "step": 46176 + }, + { + "epoch": 0.8572977428589463, + "grad_norm": 0.38557982444763184, + "learning_rate": 9.880711287921607e-07, + "loss": 0.3685, + "step": 46178 + }, + { + "epoch": 0.8573348729963649, + "grad_norm": 0.3270796537399292, + "learning_rate": 9.875656117631538e-07, + "loss": 0.3019, + "step": 46180 + }, + { + "epoch": 0.8573720031337836, + "grad_norm": 0.36792823672294617, + "learning_rate": 9.870602173663812e-07, + "loss": 0.177, + "step": 46182 + }, + { + "epoch": 0.8574091332712023, + "grad_norm": 0.30004703998565674, + "learning_rate": 9.86554945608722e-07, + "loss": 0.4977, + "step": 46184 + }, + { + "epoch": 0.8574462634086208, + "grad_norm": 0.3589276969432831, + "learning_rate": 9.860497964970516e-07, + "loss": 0.2549, + "step": 46186 + }, + { + "epoch": 0.8574833935460395, + "grad_norm": 0.6069404482841492, + "learning_rate": 9.855447700382437e-07, + "loss": 0.3117, + "step": 46188 + }, + { + "epoch": 0.8575205236834581, + "grad_norm": 0.30522945523262024, + "learning_rate": 9.850398662391692e-07, + "loss": 0.3231, + "step": 46190 + }, + { + "epoch": 0.8575576538208768, + "grad_norm": 0.6054765582084656, + "learning_rate": 9.845350851067026e-07, + "loss": 0.359, + "step": 46192 + }, + { + "epoch": 0.8575947839582955, + "grad_norm": 0.2566866874694824, + "learning_rate": 9.840304266477063e-07, + "loss": 0.2009, + "step": 46194 + }, + { + "epoch": 0.857631914095714, + "grad_norm": 0.42071259021759033, + "learning_rate": 9.835258908690504e-07, + "loss": 0.197, + "step": 46196 + }, + { + "epoch": 0.8576690442331327, + "grad_norm": 0.36751270294189453, + "learning_rate": 9.830214777776004e-07, + "loss": 0.1363, + "step": 46198 + }, + { + "epoch": 0.8577061743705513, + "grad_norm": 0.26197052001953125, + "learning_rate": 9.825171873802186e-07, + "loss": 0.1955, + "step": 46200 + }, + { + "epoch": 0.85774330450797, + "grad_norm": 0.45065221190452576, + "learning_rate": 9.820130196837663e-07, + "loss": 0.2661, + "step": 46202 + }, + { + "epoch": 0.8577804346453887, + "grad_norm": 0.49744081497192383, + "learning_rate": 9.815089746951057e-07, + "loss": 0.4067, + "step": 46204 + }, + { + "epoch": 0.8578175647828072, + "grad_norm": 0.37906700372695923, + "learning_rate": 9.810050524210956e-07, + "loss": 0.4966, + "step": 46206 + }, + { + "epoch": 0.8578546949202259, + "grad_norm": 0.4864760935306549, + "learning_rate": 9.805012528685908e-07, + "loss": 0.2898, + "step": 46208 + }, + { + "epoch": 0.8578918250576445, + "grad_norm": 0.3086865246295929, + "learning_rate": 9.799975760444468e-07, + "loss": 0.3064, + "step": 46210 + }, + { + "epoch": 0.8579289551950632, + "grad_norm": 1.6721597909927368, + "learning_rate": 9.79494021955517e-07, + "loss": 0.1497, + "step": 46212 + }, + { + "epoch": 0.8579660853324819, + "grad_norm": 0.3922683894634247, + "learning_rate": 9.78990590608655e-07, + "loss": 0.2725, + "step": 46214 + }, + { + "epoch": 0.8580032154699004, + "grad_norm": 0.3594084680080414, + "learning_rate": 9.784872820107106e-07, + "loss": 0.201, + "step": 46216 + }, + { + "epoch": 0.8580403456073191, + "grad_norm": 0.21988588571548462, + "learning_rate": 9.779840961685328e-07, + "loss": 0.0882, + "step": 46218 + }, + { + "epoch": 0.8580774757447377, + "grad_norm": 0.5878615379333496, + "learning_rate": 9.77481033088965e-07, + "loss": 0.0566, + "step": 46220 + }, + { + "epoch": 0.8581146058821564, + "grad_norm": 0.3512178957462311, + "learning_rate": 9.769780927788575e-07, + "loss": 0.1164, + "step": 46222 + }, + { + "epoch": 0.8581517360195751, + "grad_norm": 0.3427608907222748, + "learning_rate": 9.76475275245049e-07, + "loss": 0.1778, + "step": 46224 + }, + { + "epoch": 0.8581888661569936, + "grad_norm": 0.23185452818870544, + "learning_rate": 9.759725804943832e-07, + "loss": 0.3727, + "step": 46226 + }, + { + "epoch": 0.8582259962944123, + "grad_norm": 0.49998024106025696, + "learning_rate": 9.754700085336999e-07, + "loss": 0.5672, + "step": 46228 + }, + { + "epoch": 0.8582631264318309, + "grad_norm": 0.5075954794883728, + "learning_rate": 9.749675593698416e-07, + "loss": 0.2543, + "step": 46230 + }, + { + "epoch": 0.8583002565692496, + "grad_norm": 0.3351147770881653, + "learning_rate": 9.744652330096383e-07, + "loss": 0.1896, + "step": 46232 + }, + { + "epoch": 0.8583373867066681, + "grad_norm": 0.42122018337249756, + "learning_rate": 9.739630294599289e-07, + "loss": 0.2289, + "step": 46234 + }, + { + "epoch": 0.8583745168440868, + "grad_norm": 0.3258845806121826, + "learning_rate": 9.73460948727547e-07, + "loss": 0.2461, + "step": 46236 + }, + { + "epoch": 0.8584116469815055, + "grad_norm": 0.28528159856796265, + "learning_rate": 9.729589908193237e-07, + "loss": 0.1234, + "step": 46238 + }, + { + "epoch": 0.8584487771189241, + "grad_norm": 0.3912682831287384, + "learning_rate": 9.724571557420925e-07, + "loss": 0.3685, + "step": 46240 + }, + { + "epoch": 0.8584859072563428, + "grad_norm": 0.4478892683982849, + "learning_rate": 9.719554435026757e-07, + "loss": 0.2592, + "step": 46242 + }, + { + "epoch": 0.8585230373937613, + "grad_norm": 0.45767948031425476, + "learning_rate": 9.714538541079055e-07, + "loss": 0.2928, + "step": 46244 + }, + { + "epoch": 0.85856016753118, + "grad_norm": 0.36360448598861694, + "learning_rate": 9.709523875646022e-07, + "loss": 0.2248, + "step": 46246 + }, + { + "epoch": 0.8585972976685987, + "grad_norm": 0.2076425701379776, + "learning_rate": 9.704510438795923e-07, + "loss": 0.2488, + "step": 46248 + }, + { + "epoch": 0.8586344278060173, + "grad_norm": 0.39231494069099426, + "learning_rate": 9.69949823059697e-07, + "loss": 0.4904, + "step": 46250 + }, + { + "epoch": 0.858671557943436, + "grad_norm": 0.22408731281757355, + "learning_rate": 9.694487251117357e-07, + "loss": 0.39, + "step": 46252 + }, + { + "epoch": 0.8587086880808545, + "grad_norm": 0.3209182322025299, + "learning_rate": 9.689477500425293e-07, + "loss": 0.3166, + "step": 46254 + }, + { + "epoch": 0.8587458182182732, + "grad_norm": 0.3581145107746124, + "learning_rate": 9.684468978588934e-07, + "loss": 0.2715, + "step": 46256 + }, + { + "epoch": 0.8587829483556919, + "grad_norm": 0.5015003681182861, + "learning_rate": 9.679461685676407e-07, + "loss": 0.355, + "step": 46258 + }, + { + "epoch": 0.8588200784931105, + "grad_norm": 0.3776834309101105, + "learning_rate": 9.674455621755873e-07, + "loss": 0.2292, + "step": 46260 + }, + { + "epoch": 0.8588572086305292, + "grad_norm": 0.1708477884531021, + "learning_rate": 9.669450786895452e-07, + "loss": 0.3651, + "step": 46262 + }, + { + "epoch": 0.8588943387679477, + "grad_norm": 0.424582302570343, + "learning_rate": 9.664447181163228e-07, + "loss": 0.3992, + "step": 46264 + }, + { + "epoch": 0.8589314689053664, + "grad_norm": 0.452316552400589, + "learning_rate": 9.659444804627283e-07, + "loss": 0.1647, + "step": 46266 + }, + { + "epoch": 0.8589685990427851, + "grad_norm": 0.33175915479660034, + "learning_rate": 9.654443657355706e-07, + "loss": 0.3052, + "step": 46268 + }, + { + "epoch": 0.8590057291802037, + "grad_norm": 0.554253101348877, + "learning_rate": 9.64944373941652e-07, + "loss": 0.4591, + "step": 46270 + }, + { + "epoch": 0.8590428593176224, + "grad_norm": 0.39185282588005066, + "learning_rate": 9.644445050877782e-07, + "loss": 0.3826, + "step": 46272 + }, + { + "epoch": 0.8590799894550409, + "grad_norm": 0.2625126540660858, + "learning_rate": 9.639447591807495e-07, + "loss": 0.293, + "step": 46274 + }, + { + "epoch": 0.8591171195924596, + "grad_norm": 0.5193026065826416, + "learning_rate": 9.634451362273667e-07, + "loss": 0.3323, + "step": 46276 + }, + { + "epoch": 0.8591542497298783, + "grad_norm": 0.48075148463249207, + "learning_rate": 9.62945636234427e-07, + "loss": 0.3628, + "step": 46278 + }, + { + "epoch": 0.8591913798672969, + "grad_norm": 0.39393192529678345, + "learning_rate": 9.624462592087314e-07, + "loss": 0.3218, + "step": 46280 + }, + { + "epoch": 0.8592285100047156, + "grad_norm": 0.38466787338256836, + "learning_rate": 9.61947005157069e-07, + "loss": 0.2285, + "step": 46282 + }, + { + "epoch": 0.8592656401421341, + "grad_norm": 0.32856759428977966, + "learning_rate": 9.614478740862377e-07, + "loss": 0.2078, + "step": 46284 + }, + { + "epoch": 0.8593027702795528, + "grad_norm": 0.31078091263771057, + "learning_rate": 9.609488660030252e-07, + "loss": 0.2823, + "step": 46286 + }, + { + "epoch": 0.8593399004169714, + "grad_norm": 0.5123100280761719, + "learning_rate": 9.604499809142232e-07, + "loss": 0.3058, + "step": 46288 + }, + { + "epoch": 0.8593770305543901, + "grad_norm": 0.47025883197784424, + "learning_rate": 9.599512188266213e-07, + "loss": 0.19, + "step": 46290 + }, + { + "epoch": 0.8594141606918088, + "grad_norm": 0.22942614555358887, + "learning_rate": 9.594525797470044e-07, + "loss": 0.2825, + "step": 46292 + }, + { + "epoch": 0.8594512908292273, + "grad_norm": 0.24472802877426147, + "learning_rate": 9.589540636821603e-07, + "loss": 0.235, + "step": 46294 + }, + { + "epoch": 0.859488420966646, + "grad_norm": 0.3846464455127716, + "learning_rate": 9.584556706388692e-07, + "loss": 0.303, + "step": 46296 + }, + { + "epoch": 0.8595255511040646, + "grad_norm": 0.37022069096565247, + "learning_rate": 9.579574006239135e-07, + "loss": 0.518, + "step": 46298 + }, + { + "epoch": 0.8595626812414833, + "grad_norm": 0.3093615472316742, + "learning_rate": 9.57459253644073e-07, + "loss": 0.274, + "step": 46300 + }, + { + "epoch": 0.859599811378902, + "grad_norm": 0.45878714323043823, + "learning_rate": 9.56961229706127e-07, + "loss": 0.2058, + "step": 46302 + }, + { + "epoch": 0.8596369415163205, + "grad_norm": 0.39811742305755615, + "learning_rate": 9.564633288168534e-07, + "loss": 0.2323, + "step": 46304 + }, + { + "epoch": 0.8596740716537392, + "grad_norm": 0.44109609723091125, + "learning_rate": 9.559655509830256e-07, + "loss": 0.4374, + "step": 46306 + }, + { + "epoch": 0.8597112017911578, + "grad_norm": 0.30011868476867676, + "learning_rate": 9.55467896211414e-07, + "loss": 0.2943, + "step": 46308 + }, + { + "epoch": 0.8597483319285765, + "grad_norm": 0.7884966135025024, + "learning_rate": 9.549703645087937e-07, + "loss": 0.4794, + "step": 46310 + }, + { + "epoch": 0.8597854620659952, + "grad_norm": 0.578792929649353, + "learning_rate": 9.544729558819332e-07, + "loss": 0.2461, + "step": 46312 + }, + { + "epoch": 0.8598225922034137, + "grad_norm": 0.42589816451072693, + "learning_rate": 9.539756703376012e-07, + "loss": 0.2991, + "step": 46314 + }, + { + "epoch": 0.8598597223408324, + "grad_norm": 0.5310109257698059, + "learning_rate": 9.534785078825648e-07, + "loss": 0.3408, + "step": 46316 + }, + { + "epoch": 0.859896852478251, + "grad_norm": 0.376668781042099, + "learning_rate": 9.529814685235905e-07, + "loss": 0.3986, + "step": 46318 + }, + { + "epoch": 0.8599339826156697, + "grad_norm": 0.28847813606262207, + "learning_rate": 9.524845522674364e-07, + "loss": 0.2796, + "step": 46320 + }, + { + "epoch": 0.8599711127530884, + "grad_norm": 0.3884340524673462, + "learning_rate": 9.519877591208681e-07, + "loss": 0.3223, + "step": 46322 + }, + { + "epoch": 0.8600082428905069, + "grad_norm": 0.42567428946495056, + "learning_rate": 9.514910890906448e-07, + "loss": 0.2509, + "step": 46324 + }, + { + "epoch": 0.8600453730279256, + "grad_norm": 0.4189719259738922, + "learning_rate": 9.509945421835253e-07, + "loss": 0.214, + "step": 46326 + }, + { + "epoch": 0.8600825031653442, + "grad_norm": 0.36907124519348145, + "learning_rate": 9.504981184062634e-07, + "loss": 0.2991, + "step": 46328 + }, + { + "epoch": 0.8601196333027629, + "grad_norm": 0.3586653172969818, + "learning_rate": 9.500018177656167e-07, + "loss": 0.1652, + "step": 46330 + }, + { + "epoch": 0.8601567634401814, + "grad_norm": 0.45447200536727905, + "learning_rate": 9.495056402683378e-07, + "loss": 0.2426, + "step": 46332 + }, + { + "epoch": 0.8601938935776001, + "grad_norm": 0.3858547508716583, + "learning_rate": 9.490095859211768e-07, + "loss": 0.2238, + "step": 46334 + }, + { + "epoch": 0.8602310237150188, + "grad_norm": 0.45476511120796204, + "learning_rate": 9.485136547308848e-07, + "loss": 0.5031, + "step": 46336 + }, + { + "epoch": 0.8602681538524374, + "grad_norm": 0.48580729961395264, + "learning_rate": 9.480178467042089e-07, + "loss": 0.1914, + "step": 46338 + }, + { + "epoch": 0.8603052839898561, + "grad_norm": 0.305665522813797, + "learning_rate": 9.475221618478969e-07, + "loss": 0.3223, + "step": 46340 + }, + { + "epoch": 0.8603424141272746, + "grad_norm": 0.3031717538833618, + "learning_rate": 9.470266001686923e-07, + "loss": 0.2866, + "step": 46342 + }, + { + "epoch": 0.8603795442646933, + "grad_norm": 0.4734834134578705, + "learning_rate": 9.465311616733408e-07, + "loss": 0.2243, + "step": 46344 + }, + { + "epoch": 0.860416674402112, + "grad_norm": 0.4653489887714386, + "learning_rate": 9.460358463685804e-07, + "loss": 0.2989, + "step": 46346 + }, + { + "epoch": 0.8604538045395306, + "grad_norm": 0.41710546612739563, + "learning_rate": 9.455406542611545e-07, + "loss": 0.4282, + "step": 46348 + }, + { + "epoch": 0.8604909346769493, + "grad_norm": 0.4607117176055908, + "learning_rate": 9.450455853577967e-07, + "loss": 0.369, + "step": 46350 + }, + { + "epoch": 0.8605280648143678, + "grad_norm": 0.6424175500869751, + "learning_rate": 9.445506396652459e-07, + "loss": 0.2617, + "step": 46352 + }, + { + "epoch": 0.8605651949517865, + "grad_norm": 0.4699985086917877, + "learning_rate": 9.440558171902359e-07, + "loss": 0.2994, + "step": 46354 + }, + { + "epoch": 0.8606023250892052, + "grad_norm": 0.44466492533683777, + "learning_rate": 9.435611179395033e-07, + "loss": 0.2322, + "step": 46356 + }, + { + "epoch": 0.8606394552266238, + "grad_norm": 0.42388564348220825, + "learning_rate": 9.430665419197749e-07, + "loss": 0.175, + "step": 46358 + }, + { + "epoch": 0.8606765853640425, + "grad_norm": 0.40325218439102173, + "learning_rate": 9.42572089137782e-07, + "loss": 0.3197, + "step": 46360 + }, + { + "epoch": 0.860713715501461, + "grad_norm": 0.4469625949859619, + "learning_rate": 9.420777596002528e-07, + "loss": 0.5115, + "step": 46362 + }, + { + "epoch": 0.8607508456388797, + "grad_norm": 0.4007731080055237, + "learning_rate": 9.41583553313914e-07, + "loss": 0.2152, + "step": 46364 + }, + { + "epoch": 0.8607879757762984, + "grad_norm": 0.4601231515407562, + "learning_rate": 9.41089470285489e-07, + "loss": 0.3239, + "step": 46366 + }, + { + "epoch": 0.860825105913717, + "grad_norm": 0.37026742100715637, + "learning_rate": 9.405955105217046e-07, + "loss": 0.1786, + "step": 46368 + }, + { + "epoch": 0.8608622360511357, + "grad_norm": 0.6052196025848389, + "learning_rate": 9.40101674029279e-07, + "loss": 0.2635, + "step": 46370 + }, + { + "epoch": 0.8608993661885542, + "grad_norm": 0.5184234380722046, + "learning_rate": 9.3960796081493e-07, + "loss": 0.1757, + "step": 46372 + }, + { + "epoch": 0.8609364963259729, + "grad_norm": 0.36181288957595825, + "learning_rate": 9.391143708853778e-07, + "loss": 0.42, + "step": 46374 + }, + { + "epoch": 0.8609736264633916, + "grad_norm": 0.2517626881599426, + "learning_rate": 9.386209042473382e-07, + "loss": 0.3591, + "step": 46376 + }, + { + "epoch": 0.8610107566008102, + "grad_norm": 0.29608917236328125, + "learning_rate": 9.381275609075269e-07, + "loss": 0.2106, + "step": 46378 + }, + { + "epoch": 0.8610478867382289, + "grad_norm": 0.3649085462093353, + "learning_rate": 9.376343408726551e-07, + "loss": 0.4871, + "step": 46380 + }, + { + "epoch": 0.8610850168756474, + "grad_norm": 0.64967942237854, + "learning_rate": 9.371412441494377e-07, + "loss": 0.3645, + "step": 46382 + }, + { + "epoch": 0.8611221470130661, + "grad_norm": 0.4360603392124176, + "learning_rate": 9.36648270744579e-07, + "loss": 0.2414, + "step": 46384 + }, + { + "epoch": 0.8611592771504847, + "grad_norm": 0.4184345006942749, + "learning_rate": 9.361554206647894e-07, + "loss": 0.2459, + "step": 46386 + }, + { + "epoch": 0.8611964072879034, + "grad_norm": 0.44913792610168457, + "learning_rate": 9.356626939167757e-07, + "loss": 0.1666, + "step": 46388 + }, + { + "epoch": 0.861233537425322, + "grad_norm": 0.3589648902416229, + "learning_rate": 9.351700905072437e-07, + "loss": 0.2685, + "step": 46390 + }, + { + "epoch": 0.8612706675627406, + "grad_norm": 0.3720960319042206, + "learning_rate": 9.346776104428923e-07, + "loss": 0.246, + "step": 46392 + }, + { + "epoch": 0.8613077977001593, + "grad_norm": 0.6639584898948669, + "learning_rate": 9.341852537304263e-07, + "loss": 0.4005, + "step": 46394 + }, + { + "epoch": 0.8613449278375779, + "grad_norm": 0.39261895418167114, + "learning_rate": 9.336930203765426e-07, + "loss": 0.427, + "step": 46396 + }, + { + "epoch": 0.8613820579749966, + "grad_norm": 0.5158554315567017, + "learning_rate": 9.332009103879403e-07, + "loss": 0.3459, + "step": 46398 + }, + { + "epoch": 0.8614191881124152, + "grad_norm": 0.3893435597419739, + "learning_rate": 9.327089237713149e-07, + "loss": 0.3889, + "step": 46400 + }, + { + "epoch": 0.8614563182498338, + "grad_norm": 0.22373245656490326, + "learning_rate": 9.322170605333614e-07, + "loss": 0.2786, + "step": 46402 + }, + { + "epoch": 0.8614934483872525, + "grad_norm": 0.5546050667762756, + "learning_rate": 9.31725320680773e-07, + "loss": 0.2727, + "step": 46404 + }, + { + "epoch": 0.8615305785246711, + "grad_norm": 0.44842684268951416, + "learning_rate": 9.312337042202402e-07, + "loss": 0.2835, + "step": 46406 + }, + { + "epoch": 0.8615677086620898, + "grad_norm": 0.5530038475990295, + "learning_rate": 9.307422111584541e-07, + "loss": 0.1222, + "step": 46408 + }, + { + "epoch": 0.8616048387995084, + "grad_norm": 0.6688160300254822, + "learning_rate": 9.302508415020994e-07, + "loss": 0.3912, + "step": 46410 + }, + { + "epoch": 0.861641968936927, + "grad_norm": 0.71479731798172, + "learning_rate": 9.297595952578653e-07, + "loss": 0.2258, + "step": 46412 + }, + { + "epoch": 0.8616790990743457, + "grad_norm": 0.4237983226776123, + "learning_rate": 9.29268472432433e-07, + "loss": 0.1749, + "step": 46414 + }, + { + "epoch": 0.8617162292117643, + "grad_norm": 0.4547152519226074, + "learning_rate": 9.287774730324861e-07, + "loss": 0.2293, + "step": 46416 + }, + { + "epoch": 0.861753359349183, + "grad_norm": 0.37682297825813293, + "learning_rate": 9.28286597064707e-07, + "loss": 0.2141, + "step": 46418 + }, + { + "epoch": 0.8617904894866016, + "grad_norm": 0.519481360912323, + "learning_rate": 9.27795844535776e-07, + "loss": 0.3248, + "step": 46420 + }, + { + "epoch": 0.8618276196240202, + "grad_norm": 0.6105753779411316, + "learning_rate": 9.273052154523676e-07, + "loss": 0.4192, + "step": 46422 + }, + { + "epoch": 0.8618647497614389, + "grad_norm": 0.35898518562316895, + "learning_rate": 9.2681470982116e-07, + "loss": 0.3695, + "step": 46424 + }, + { + "epoch": 0.8619018798988575, + "grad_norm": 0.42922988533973694, + "learning_rate": 9.263243276488254e-07, + "loss": 0.4508, + "step": 46426 + }, + { + "epoch": 0.8619390100362762, + "grad_norm": 0.4655109941959381, + "learning_rate": 9.258340689420386e-07, + "loss": 0.3069, + "step": 46428 + }, + { + "epoch": 0.8619761401736948, + "grad_norm": 0.18952637910842896, + "learning_rate": 9.253439337074721e-07, + "loss": 0.1967, + "step": 46430 + }, + { + "epoch": 0.8620132703111134, + "grad_norm": 0.4921724796295166, + "learning_rate": 9.248539219517904e-07, + "loss": 0.1317, + "step": 46432 + }, + { + "epoch": 0.8620504004485321, + "grad_norm": 0.39536353945732117, + "learning_rate": 9.243640336816651e-07, + "loss": 0.2687, + "step": 46434 + }, + { + "epoch": 0.8620875305859507, + "grad_norm": 0.4436925947666168, + "learning_rate": 9.238742689037594e-07, + "loss": 0.2681, + "step": 46436 + }, + { + "epoch": 0.8621246607233694, + "grad_norm": 0.34235039353370667, + "learning_rate": 9.233846276247394e-07, + "loss": 0.2058, + "step": 46438 + }, + { + "epoch": 0.8621617908607879, + "grad_norm": 0.6510866284370422, + "learning_rate": 9.228951098512661e-07, + "loss": 0.2409, + "step": 46440 + }, + { + "epoch": 0.8621989209982066, + "grad_norm": 0.4142334461212158, + "learning_rate": 9.224057155900013e-07, + "loss": 0.2129, + "step": 46442 + }, + { + "epoch": 0.8622360511356253, + "grad_norm": 0.28202468156814575, + "learning_rate": 9.21916444847607e-07, + "loss": 0.0866, + "step": 46444 + }, + { + "epoch": 0.8622731812730439, + "grad_norm": 0.44989195466041565, + "learning_rate": 9.214272976307348e-07, + "loss": 0.3838, + "step": 46446 + }, + { + "epoch": 0.8623103114104625, + "grad_norm": 0.6221481561660767, + "learning_rate": 9.209382739460438e-07, + "loss": 0.3436, + "step": 46448 + }, + { + "epoch": 0.8623474415478811, + "grad_norm": 0.2253553867340088, + "learning_rate": 9.204493738001885e-07, + "loss": 0.2152, + "step": 46450 + }, + { + "epoch": 0.8623845716852998, + "grad_norm": 0.6241713762283325, + "learning_rate": 9.199605971998227e-07, + "loss": 0.3115, + "step": 46452 + }, + { + "epoch": 0.8624217018227185, + "grad_norm": 0.24273280799388885, + "learning_rate": 9.194719441515931e-07, + "loss": 0.139, + "step": 46454 + }, + { + "epoch": 0.8624588319601371, + "grad_norm": 0.5003330111503601, + "learning_rate": 9.189834146621501e-07, + "loss": 0.4003, + "step": 46456 + }, + { + "epoch": 0.8624959620975557, + "grad_norm": 0.5129695534706116, + "learning_rate": 9.18495008738145e-07, + "loss": 0.3602, + "step": 46458 + }, + { + "epoch": 0.8625330922349743, + "grad_norm": 0.3395385444164276, + "learning_rate": 9.180067263862192e-07, + "loss": 0.3788, + "step": 46460 + }, + { + "epoch": 0.862570222372393, + "grad_norm": 0.3349679112434387, + "learning_rate": 9.175185676130171e-07, + "loss": 0.2056, + "step": 46462 + }, + { + "epoch": 0.8626073525098117, + "grad_norm": 0.5682922601699829, + "learning_rate": 9.170305324251827e-07, + "loss": 0.2257, + "step": 46464 + }, + { + "epoch": 0.8626444826472303, + "grad_norm": 0.2856213450431824, + "learning_rate": 9.16542620829356e-07, + "loss": 0.1117, + "step": 46466 + }, + { + "epoch": 0.8626816127846489, + "grad_norm": 0.2910298705101013, + "learning_rate": 9.160548328321762e-07, + "loss": 0.3455, + "step": 46468 + }, + { + "epoch": 0.8627187429220675, + "grad_norm": 0.28439223766326904, + "learning_rate": 9.155671684402823e-07, + "loss": 0.1661, + "step": 46470 + }, + { + "epoch": 0.8627558730594862, + "grad_norm": 0.34871184825897217, + "learning_rate": 9.150796276603057e-07, + "loss": 0.3151, + "step": 46472 + }, + { + "epoch": 0.8627930031969049, + "grad_norm": 0.3978327214717865, + "learning_rate": 9.145922104988836e-07, + "loss": 0.3079, + "step": 46474 + }, + { + "epoch": 0.8628301333343235, + "grad_norm": 0.30677497386932373, + "learning_rate": 9.141049169626492e-07, + "loss": 0.325, + "step": 46476 + }, + { + "epoch": 0.8628672634717421, + "grad_norm": 0.34825631976127625, + "learning_rate": 9.136177470582297e-07, + "loss": 0.2891, + "step": 46478 + }, + { + "epoch": 0.8629043936091607, + "grad_norm": 0.5264063477516174, + "learning_rate": 9.131307007922562e-07, + "loss": 0.2168, + "step": 46480 + }, + { + "epoch": 0.8629415237465794, + "grad_norm": 0.5684126019477844, + "learning_rate": 9.12643778171357e-07, + "loss": 0.2151, + "step": 46482 + }, + { + "epoch": 0.862978653883998, + "grad_norm": 0.3642502725124359, + "learning_rate": 9.121569792021545e-07, + "loss": 0.3772, + "step": 46484 + }, + { + "epoch": 0.8630157840214167, + "grad_norm": 0.2748647928237915, + "learning_rate": 9.116703038912744e-07, + "loss": 0.2337, + "step": 46486 + }, + { + "epoch": 0.8630529141588353, + "grad_norm": 0.2251640260219574, + "learning_rate": 9.111837522453382e-07, + "loss": 0.3718, + "step": 46488 + }, + { + "epoch": 0.8630900442962539, + "grad_norm": 0.34252214431762695, + "learning_rate": 9.106973242709672e-07, + "loss": 0.2611, + "step": 46490 + }, + { + "epoch": 0.8631271744336726, + "grad_norm": 0.5877100825309753, + "learning_rate": 9.102110199747805e-07, + "loss": 0.2496, + "step": 46492 + }, + { + "epoch": 0.8631643045710912, + "grad_norm": 0.21235185861587524, + "learning_rate": 9.097248393633961e-07, + "loss": 0.1775, + "step": 46494 + }, + { + "epoch": 0.8632014347085099, + "grad_norm": 0.3401835858821869, + "learning_rate": 9.092387824434256e-07, + "loss": 0.3657, + "step": 46496 + }, + { + "epoch": 0.8632385648459285, + "grad_norm": 0.3647737503051758, + "learning_rate": 9.087528492214882e-07, + "loss": 0.335, + "step": 46498 + }, + { + "epoch": 0.8632756949833471, + "grad_norm": 0.4141778349876404, + "learning_rate": 9.082670397041915e-07, + "loss": 0.3524, + "step": 46500 + }, + { + "epoch": 0.8633128251207658, + "grad_norm": 0.26676759123802185, + "learning_rate": 9.077813538981461e-07, + "loss": 0.4065, + "step": 46502 + }, + { + "epoch": 0.8633499552581844, + "grad_norm": 0.34450563788414, + "learning_rate": 9.072957918099635e-07, + "loss": 0.1958, + "step": 46504 + }, + { + "epoch": 0.863387085395603, + "grad_norm": 0.48400068283081055, + "learning_rate": 9.06810353446248e-07, + "loss": 0.2814, + "step": 46506 + }, + { + "epoch": 0.8634242155330217, + "grad_norm": 0.22704671323299408, + "learning_rate": 9.063250388136091e-07, + "loss": 0.0744, + "step": 46508 + }, + { + "epoch": 0.8634613456704403, + "grad_norm": 0.36942172050476074, + "learning_rate": 9.058398479186459e-07, + "loss": 0.2967, + "step": 46510 + }, + { + "epoch": 0.863498475807859, + "grad_norm": 0.4508446753025055, + "learning_rate": 9.053547807679619e-07, + "loss": 0.3865, + "step": 46512 + }, + { + "epoch": 0.8635356059452776, + "grad_norm": 0.47393882274627686, + "learning_rate": 9.048698373681586e-07, + "loss": 0.3714, + "step": 46514 + }, + { + "epoch": 0.8635727360826962, + "grad_norm": 0.4719904959201813, + "learning_rate": 9.04385017725834e-07, + "loss": 0.3366, + "step": 46516 + }, + { + "epoch": 0.8636098662201149, + "grad_norm": 0.445715069770813, + "learning_rate": 9.039003218475839e-07, + "loss": 0.2088, + "step": 46518 + }, + { + "epoch": 0.8636469963575335, + "grad_norm": 0.48084166646003723, + "learning_rate": 9.034157497400065e-07, + "loss": 0.3843, + "step": 46520 + }, + { + "epoch": 0.8636841264949522, + "grad_norm": 0.3710857927799225, + "learning_rate": 9.029313014096908e-07, + "loss": 0.3525, + "step": 46522 + }, + { + "epoch": 0.8637212566323708, + "grad_norm": 0.5550365447998047, + "learning_rate": 9.024469768632316e-07, + "loss": 0.1933, + "step": 46524 + }, + { + "epoch": 0.8637583867697894, + "grad_norm": 0.4179303050041199, + "learning_rate": 9.019627761072192e-07, + "loss": 0.1408, + "step": 46526 + }, + { + "epoch": 0.8637955169072081, + "grad_norm": 0.4967989921569824, + "learning_rate": 9.014786991482416e-07, + "loss": 0.1584, + "step": 46528 + }, + { + "epoch": 0.8638326470446267, + "grad_norm": 0.340951144695282, + "learning_rate": 9.009947459928847e-07, + "loss": 0.1502, + "step": 46530 + }, + { + "epoch": 0.8638697771820454, + "grad_norm": 0.26855772733688354, + "learning_rate": 9.005109166477355e-07, + "loss": 0.2932, + "step": 46532 + }, + { + "epoch": 0.863906907319464, + "grad_norm": 0.32000988721847534, + "learning_rate": 9.000272111193775e-07, + "loss": 0.1468, + "step": 46534 + }, + { + "epoch": 0.8639440374568826, + "grad_norm": 0.601523756980896, + "learning_rate": 8.995436294143911e-07, + "loss": 0.3262, + "step": 46536 + }, + { + "epoch": 0.8639811675943012, + "grad_norm": 0.5269022583961487, + "learning_rate": 8.990601715393577e-07, + "loss": 0.2056, + "step": 46538 + }, + { + "epoch": 0.8640182977317199, + "grad_norm": 0.5854669809341431, + "learning_rate": 8.98576837500853e-07, + "loss": 0.2651, + "step": 46540 + }, + { + "epoch": 0.8640554278691386, + "grad_norm": 0.32356351613998413, + "learning_rate": 8.980936273054564e-07, + "loss": 0.1804, + "step": 46542 + }, + { + "epoch": 0.8640925580065572, + "grad_norm": 0.4773086905479431, + "learning_rate": 8.976105409597413e-07, + "loss": 0.1683, + "step": 46544 + }, + { + "epoch": 0.8641296881439758, + "grad_norm": 0.4852351248264313, + "learning_rate": 8.971275784702837e-07, + "loss": 0.4865, + "step": 46546 + }, + { + "epoch": 0.8641668182813944, + "grad_norm": 0.2547571659088135, + "learning_rate": 8.966447398436529e-07, + "loss": 0.3646, + "step": 46548 + }, + { + "epoch": 0.8642039484188131, + "grad_norm": 0.40590035915374756, + "learning_rate": 8.961620250864189e-07, + "loss": 0.3467, + "step": 46550 + }, + { + "epoch": 0.8642410785562318, + "grad_norm": 0.28504928946495056, + "learning_rate": 8.9567943420515e-07, + "loss": 0.4209, + "step": 46552 + }, + { + "epoch": 0.8642782086936504, + "grad_norm": 0.35393908619880676, + "learning_rate": 8.951969672064142e-07, + "loss": 0.4074, + "step": 46554 + }, + { + "epoch": 0.864315338831069, + "grad_norm": 0.3662143647670746, + "learning_rate": 8.947146240967752e-07, + "loss": 0.2115, + "step": 46556 + }, + { + "epoch": 0.8643524689684876, + "grad_norm": 0.6477339267730713, + "learning_rate": 8.942324048827977e-07, + "loss": 0.2138, + "step": 46558 + }, + { + "epoch": 0.8643895991059063, + "grad_norm": 0.36759456992149353, + "learning_rate": 8.937503095710431e-07, + "loss": 0.3306, + "step": 46560 + }, + { + "epoch": 0.864426729243325, + "grad_norm": 0.3682751953601837, + "learning_rate": 8.932683381680696e-07, + "loss": 0.1948, + "step": 46562 + }, + { + "epoch": 0.8644638593807435, + "grad_norm": 0.3496985137462616, + "learning_rate": 8.927864906804351e-07, + "loss": 0.306, + "step": 46564 + }, + { + "epoch": 0.8645009895181622, + "grad_norm": 0.47471165657043457, + "learning_rate": 8.923047671146967e-07, + "loss": 0.1044, + "step": 46566 + }, + { + "epoch": 0.8645381196555808, + "grad_norm": 0.5156015753746033, + "learning_rate": 8.918231674774103e-07, + "loss": 0.1427, + "step": 46568 + }, + { + "epoch": 0.8645752497929995, + "grad_norm": 0.4092327654361725, + "learning_rate": 8.913416917751305e-07, + "loss": 0.312, + "step": 46570 + }, + { + "epoch": 0.8646123799304182, + "grad_norm": 0.47803959250450134, + "learning_rate": 8.908603400144045e-07, + "loss": 0.3901, + "step": 46572 + }, + { + "epoch": 0.8646495100678367, + "grad_norm": 0.3265105187892914, + "learning_rate": 8.903791122017847e-07, + "loss": 0.2851, + "step": 46574 + }, + { + "epoch": 0.8646866402052554, + "grad_norm": 0.23752376437187195, + "learning_rate": 8.89898008343818e-07, + "loss": 0.216, + "step": 46576 + }, + { + "epoch": 0.864723770342674, + "grad_norm": 0.3154624402523041, + "learning_rate": 8.894170284470515e-07, + "loss": 0.1441, + "step": 46578 + }, + { + "epoch": 0.8647609004800927, + "grad_norm": 0.4522281587123871, + "learning_rate": 8.889361725180323e-07, + "loss": 0.4899, + "step": 46580 + }, + { + "epoch": 0.8647980306175114, + "grad_norm": 0.2667147219181061, + "learning_rate": 8.884554405632984e-07, + "loss": 0.0726, + "step": 46582 + }, + { + "epoch": 0.8648351607549299, + "grad_norm": 0.5061129331588745, + "learning_rate": 8.879748325893966e-07, + "loss": 0.1911, + "step": 46584 + }, + { + "epoch": 0.8648722908923486, + "grad_norm": 0.15854398906230927, + "learning_rate": 8.874943486028609e-07, + "loss": 0.2873, + "step": 46586 + }, + { + "epoch": 0.8649094210297672, + "grad_norm": 0.48331233859062195, + "learning_rate": 8.870139886102325e-07, + "loss": 0.3132, + "step": 46588 + }, + { + "epoch": 0.8649465511671859, + "grad_norm": 0.2915746569633484, + "learning_rate": 8.865337526180473e-07, + "loss": 0.2558, + "step": 46590 + }, + { + "epoch": 0.8649836813046045, + "grad_norm": 0.4209698438644409, + "learning_rate": 8.86053640632839e-07, + "loss": 0.1905, + "step": 46592 + }, + { + "epoch": 0.8650208114420231, + "grad_norm": 0.29790475964546204, + "learning_rate": 8.855736526611424e-07, + "loss": 0.0867, + "step": 46594 + }, + { + "epoch": 0.8650579415794418, + "grad_norm": 0.49681714177131653, + "learning_rate": 8.85093788709489e-07, + "loss": 0.3693, + "step": 46596 + }, + { + "epoch": 0.8650950717168604, + "grad_norm": 0.4203861951828003, + "learning_rate": 8.846140487844046e-07, + "loss": 0.462, + "step": 46598 + }, + { + "epoch": 0.8651322018542791, + "grad_norm": 0.3562068045139313, + "learning_rate": 8.841344328924185e-07, + "loss": 0.149, + "step": 46600 + }, + { + "epoch": 0.8651693319916977, + "grad_norm": 0.32756155729293823, + "learning_rate": 8.836549410400608e-07, + "loss": 0.326, + "step": 46602 + }, + { + "epoch": 0.8652064621291163, + "grad_norm": 0.4007483422756195, + "learning_rate": 8.831755732338498e-07, + "loss": 0.5249, + "step": 46604 + }, + { + "epoch": 0.865243592266535, + "grad_norm": 0.47130683064460754, + "learning_rate": 8.826963294803126e-07, + "loss": 0.3475, + "step": 46606 + }, + { + "epoch": 0.8652807224039536, + "grad_norm": 0.3040938973426819, + "learning_rate": 8.822172097859693e-07, + "loss": 0.4274, + "step": 46608 + }, + { + "epoch": 0.8653178525413723, + "grad_norm": 0.2896493375301361, + "learning_rate": 8.817382141573372e-07, + "loss": 0.1457, + "step": 46610 + }, + { + "epoch": 0.8653549826787909, + "grad_norm": 0.455607533454895, + "learning_rate": 8.812593426009364e-07, + "loss": 0.2119, + "step": 46612 + }, + { + "epoch": 0.8653921128162095, + "grad_norm": 0.38733503222465515, + "learning_rate": 8.807805951232817e-07, + "loss": 0.1935, + "step": 46614 + }, + { + "epoch": 0.8654292429536282, + "grad_norm": 0.27781516313552856, + "learning_rate": 8.80301971730888e-07, + "loss": 0.2212, + "step": 46616 + }, + { + "epoch": 0.8654663730910468, + "grad_norm": 0.5063551664352417, + "learning_rate": 8.798234724302679e-07, + "loss": 0.3119, + "step": 46618 + }, + { + "epoch": 0.8655035032284655, + "grad_norm": 0.48605701327323914, + "learning_rate": 8.793450972279338e-07, + "loss": 0.2476, + "step": 46620 + }, + { + "epoch": 0.865540633365884, + "grad_norm": 0.4494832754135132, + "learning_rate": 8.788668461303917e-07, + "loss": 0.2823, + "step": 46622 + }, + { + "epoch": 0.8655777635033027, + "grad_norm": 0.39770105481147766, + "learning_rate": 8.783887191441531e-07, + "loss": 0.4773, + "step": 46624 + }, + { + "epoch": 0.8656148936407214, + "grad_norm": 0.20229730010032654, + "learning_rate": 8.779107162757194e-07, + "loss": 0.2204, + "step": 46626 + }, + { + "epoch": 0.86565202377814, + "grad_norm": 0.27505743503570557, + "learning_rate": 8.774328375315966e-07, + "loss": 0.1241, + "step": 46628 + }, + { + "epoch": 0.8656891539155587, + "grad_norm": 0.21107247471809387, + "learning_rate": 8.769550829182883e-07, + "loss": 0.2119, + "step": 46630 + }, + { + "epoch": 0.8657262840529772, + "grad_norm": 0.612760066986084, + "learning_rate": 8.764774524422947e-07, + "loss": 0.3248, + "step": 46632 + }, + { + "epoch": 0.8657634141903959, + "grad_norm": 0.3635815382003784, + "learning_rate": 8.759999461101165e-07, + "loss": 0.3673, + "step": 46634 + }, + { + "epoch": 0.8658005443278145, + "grad_norm": 0.2670760154724121, + "learning_rate": 8.755225639282472e-07, + "loss": 0.1225, + "step": 46636 + }, + { + "epoch": 0.8658376744652332, + "grad_norm": 0.44253015518188477, + "learning_rate": 8.750453059031849e-07, + "loss": 0.3035, + "step": 46638 + }, + { + "epoch": 0.8658748046026519, + "grad_norm": 0.4554644227027893, + "learning_rate": 8.745681720414245e-07, + "loss": 0.2483, + "step": 46640 + }, + { + "epoch": 0.8659119347400704, + "grad_norm": 0.3817485570907593, + "learning_rate": 8.740911623494563e-07, + "loss": 0.1405, + "step": 46642 + }, + { + "epoch": 0.8659490648774891, + "grad_norm": 0.37608328461647034, + "learning_rate": 8.736142768337741e-07, + "loss": 0.2747, + "step": 46644 + }, + { + "epoch": 0.8659861950149077, + "grad_norm": 0.6206964254379272, + "learning_rate": 8.731375155008659e-07, + "loss": 0.3702, + "step": 46646 + }, + { + "epoch": 0.8660233251523264, + "grad_norm": 0.4811398684978485, + "learning_rate": 8.726608783572155e-07, + "loss": 0.5638, + "step": 46648 + }, + { + "epoch": 0.8660604552897451, + "grad_norm": 0.40012407302856445, + "learning_rate": 8.721843654093109e-07, + "loss": 0.2569, + "step": 46650 + }, + { + "epoch": 0.8660975854271636, + "grad_norm": 0.6387036442756653, + "learning_rate": 8.71707976663636e-07, + "loss": 0.5028, + "step": 46652 + }, + { + "epoch": 0.8661347155645823, + "grad_norm": 0.3891315758228302, + "learning_rate": 8.712317121266733e-07, + "loss": 0.1302, + "step": 46654 + }, + { + "epoch": 0.8661718457020009, + "grad_norm": 0.4630424976348877, + "learning_rate": 8.707555718049032e-07, + "loss": 0.2573, + "step": 46656 + }, + { + "epoch": 0.8662089758394196, + "grad_norm": 0.404838889837265, + "learning_rate": 8.702795557048049e-07, + "loss": 0.4076, + "step": 46658 + }, + { + "epoch": 0.8662461059768383, + "grad_norm": 0.4439959228038788, + "learning_rate": 8.698036638328567e-07, + "loss": 0.2787, + "step": 46660 + }, + { + "epoch": 0.8662832361142568, + "grad_norm": 0.646397590637207, + "learning_rate": 8.693278961955309e-07, + "loss": 0.2841, + "step": 46662 + }, + { + "epoch": 0.8663203662516755, + "grad_norm": 0.48292675614356995, + "learning_rate": 8.688522527993026e-07, + "loss": 0.3776, + "step": 46664 + }, + { + "epoch": 0.8663574963890941, + "grad_norm": 0.4071963131427765, + "learning_rate": 8.683767336506466e-07, + "loss": 0.4973, + "step": 46666 + }, + { + "epoch": 0.8663946265265128, + "grad_norm": 0.4584599435329437, + "learning_rate": 8.679013387560276e-07, + "loss": 0.5439, + "step": 46668 + }, + { + "epoch": 0.8664317566639315, + "grad_norm": 0.32005393505096436, + "learning_rate": 8.674260681219193e-07, + "loss": 0.1204, + "step": 46670 + }, + { + "epoch": 0.86646888680135, + "grad_norm": 0.44103845953941345, + "learning_rate": 8.669509217547877e-07, + "loss": 0.3124, + "step": 46672 + }, + { + "epoch": 0.8665060169387687, + "grad_norm": 0.24196313321590424, + "learning_rate": 8.664758996610956e-07, + "loss": 0.0286, + "step": 46674 + }, + { + "epoch": 0.8665431470761873, + "grad_norm": 0.41530174016952515, + "learning_rate": 8.660010018473086e-07, + "loss": 0.3668, + "step": 46676 + }, + { + "epoch": 0.866580277213606, + "grad_norm": 0.49675968289375305, + "learning_rate": 8.655262283198895e-07, + "loss": 0.1009, + "step": 46678 + }, + { + "epoch": 0.8666174073510247, + "grad_norm": 0.33067792654037476, + "learning_rate": 8.650515790852964e-07, + "loss": 0.2376, + "step": 46680 + }, + { + "epoch": 0.8666545374884432, + "grad_norm": 0.2632119655609131, + "learning_rate": 8.645770541499887e-07, + "loss": 0.1833, + "step": 46682 + }, + { + "epoch": 0.8666916676258619, + "grad_norm": 0.4997495412826538, + "learning_rate": 8.641026535204267e-07, + "loss": 0.2493, + "step": 46684 + }, + { + "epoch": 0.8667287977632805, + "grad_norm": 0.20244520902633667, + "learning_rate": 8.636283772030596e-07, + "loss": 0.303, + "step": 46686 + }, + { + "epoch": 0.8667659279006992, + "grad_norm": 0.21397551894187927, + "learning_rate": 8.631542252043457e-07, + "loss": 0.2644, + "step": 46688 + }, + { + "epoch": 0.8668030580381177, + "grad_norm": 0.5239880681037903, + "learning_rate": 8.62680197530732e-07, + "loss": 0.1335, + "step": 46690 + }, + { + "epoch": 0.8668401881755364, + "grad_norm": 0.3104003667831421, + "learning_rate": 8.622062941886732e-07, + "loss": 0.339, + "step": 46692 + }, + { + "epoch": 0.8668773183129551, + "grad_norm": 0.24175812304019928, + "learning_rate": 8.617325151846146e-07, + "loss": 0.1051, + "step": 46694 + }, + { + "epoch": 0.8669144484503737, + "grad_norm": 0.3070138990879059, + "learning_rate": 8.61258860525005e-07, + "loss": 0.3769, + "step": 46696 + }, + { + "epoch": 0.8669515785877924, + "grad_norm": 0.2731201648712158, + "learning_rate": 8.607853302162894e-07, + "loss": 0.0957, + "step": 46698 + }, + { + "epoch": 0.8669887087252109, + "grad_norm": 0.43242397904396057, + "learning_rate": 8.603119242649094e-07, + "loss": 0.3822, + "step": 46700 + }, + { + "epoch": 0.8670258388626296, + "grad_norm": 0.36537522077560425, + "learning_rate": 8.598386426773064e-07, + "loss": 0.341, + "step": 46702 + }, + { + "epoch": 0.8670629690000483, + "grad_norm": 0.33908945322036743, + "learning_rate": 8.593654854599231e-07, + "loss": 0.3787, + "step": 46704 + }, + { + "epoch": 0.8671000991374669, + "grad_norm": 0.5095226168632507, + "learning_rate": 8.588924526191966e-07, + "loss": 0.2708, + "step": 46706 + }, + { + "epoch": 0.8671372292748856, + "grad_norm": 0.3375324606895447, + "learning_rate": 8.584195441615628e-07, + "loss": 0.2631, + "step": 46708 + }, + { + "epoch": 0.8671743594123041, + "grad_norm": 0.30688804388046265, + "learning_rate": 8.579467600934577e-07, + "loss": 0.3349, + "step": 46710 + }, + { + "epoch": 0.8672114895497228, + "grad_norm": 0.7809726595878601, + "learning_rate": 8.574741004213117e-07, + "loss": 0.1772, + "step": 46712 + }, + { + "epoch": 0.8672486196871415, + "grad_norm": 0.2520394027233124, + "learning_rate": 8.570015651515584e-07, + "loss": 0.132, + "step": 46714 + }, + { + "epoch": 0.8672857498245601, + "grad_norm": 0.2935698330402374, + "learning_rate": 8.565291542906262e-07, + "loss": 0.3239, + "step": 46716 + }, + { + "epoch": 0.8673228799619788, + "grad_norm": 0.6291429996490479, + "learning_rate": 8.560568678449455e-07, + "loss": 0.208, + "step": 46718 + }, + { + "epoch": 0.8673600100993973, + "grad_norm": 0.2916485369205475, + "learning_rate": 8.555847058209421e-07, + "loss": 0.2314, + "step": 46720 + }, + { + "epoch": 0.867397140236816, + "grad_norm": 0.4152311384677887, + "learning_rate": 8.551126682250411e-07, + "loss": 0.2583, + "step": 46722 + }, + { + "epoch": 0.8674342703742347, + "grad_norm": 0.33467012643814087, + "learning_rate": 8.546407550636626e-07, + "loss": 0.3899, + "step": 46724 + }, + { + "epoch": 0.8674714005116533, + "grad_norm": 0.5513776540756226, + "learning_rate": 8.541689663432307e-07, + "loss": 0.3961, + "step": 46726 + }, + { + "epoch": 0.867508530649072, + "grad_norm": 0.6702874898910522, + "learning_rate": 8.536973020701644e-07, + "loss": 0.4056, + "step": 46728 + }, + { + "epoch": 0.8675456607864905, + "grad_norm": 0.4560350775718689, + "learning_rate": 8.532257622508811e-07, + "loss": 0.2708, + "step": 46730 + }, + { + "epoch": 0.8675827909239092, + "grad_norm": 0.6269465684890747, + "learning_rate": 8.527543468917976e-07, + "loss": 0.2176, + "step": 46732 + }, + { + "epoch": 0.8676199210613279, + "grad_norm": 0.2301567643880844, + "learning_rate": 8.52283055999329e-07, + "loss": 0.1776, + "step": 46734 + }, + { + "epoch": 0.8676570511987465, + "grad_norm": 0.4391860067844391, + "learning_rate": 8.518118895798866e-07, + "loss": 0.2161, + "step": 46736 + }, + { + "epoch": 0.8676941813361652, + "grad_norm": 0.2992490530014038, + "learning_rate": 8.513408476398821e-07, + "loss": 0.3297, + "step": 46738 + }, + { + "epoch": 0.8677313114735837, + "grad_norm": 0.35348713397979736, + "learning_rate": 8.508699301857248e-07, + "loss": 0.1737, + "step": 46740 + }, + { + "epoch": 0.8677684416110024, + "grad_norm": 0.42722514271736145, + "learning_rate": 8.503991372238241e-07, + "loss": 0.3177, + "step": 46742 + }, + { + "epoch": 0.867805571748421, + "grad_norm": 0.4299079477787018, + "learning_rate": 8.499284687605835e-07, + "loss": 0.4918, + "step": 46744 + }, + { + "epoch": 0.8678427018858397, + "grad_norm": 0.38608160614967346, + "learning_rate": 8.494579248024093e-07, + "loss": 0.1828, + "step": 46746 + }, + { + "epoch": 0.8678798320232584, + "grad_norm": 0.4946957230567932, + "learning_rate": 8.489875053557062e-07, + "loss": 0.2828, + "step": 46748 + }, + { + "epoch": 0.8679169621606769, + "grad_norm": 0.44987526535987854, + "learning_rate": 8.485172104268702e-07, + "loss": 0.3871, + "step": 46750 + }, + { + "epoch": 0.8679540922980956, + "grad_norm": 0.5741338729858398, + "learning_rate": 8.480470400223051e-07, + "loss": 0.2577, + "step": 46752 + }, + { + "epoch": 0.8679912224355142, + "grad_norm": 0.4036657512187958, + "learning_rate": 8.475769941484036e-07, + "loss": 0.2005, + "step": 46754 + }, + { + "epoch": 0.8680283525729329, + "grad_norm": 0.404279887676239, + "learning_rate": 8.47107072811566e-07, + "loss": 0.205, + "step": 46756 + }, + { + "epoch": 0.8680654827103516, + "grad_norm": 0.54813152551651, + "learning_rate": 8.466372760181851e-07, + "loss": 0.1342, + "step": 46758 + }, + { + "epoch": 0.8681026128477701, + "grad_norm": 0.29971837997436523, + "learning_rate": 8.461676037746547e-07, + "loss": 0.1997, + "step": 46760 + }, + { + "epoch": 0.8681397429851888, + "grad_norm": 0.4502348303794861, + "learning_rate": 8.456980560873618e-07, + "loss": 0.2192, + "step": 46762 + }, + { + "epoch": 0.8681768731226074, + "grad_norm": 0.3986245393753052, + "learning_rate": 8.452286329626991e-07, + "loss": 0.1689, + "step": 46764 + }, + { + "epoch": 0.8682140032600261, + "grad_norm": 0.33909183740615845, + "learning_rate": 8.447593344070526e-07, + "loss": 0.2688, + "step": 46766 + }, + { + "epoch": 0.8682511333974448, + "grad_norm": 0.48738452792167664, + "learning_rate": 8.442901604268084e-07, + "loss": 0.3905, + "step": 46768 + }, + { + "epoch": 0.8682882635348633, + "grad_norm": 0.36339446902275085, + "learning_rate": 8.438211110283523e-07, + "loss": 0.3349, + "step": 46770 + }, + { + "epoch": 0.868325393672282, + "grad_norm": 0.21917188167572021, + "learning_rate": 8.433521862180638e-07, + "loss": 0.3015, + "step": 46772 + }, + { + "epoch": 0.8683625238097006, + "grad_norm": 0.45472899079322815, + "learning_rate": 8.428833860023255e-07, + "loss": 0.2701, + "step": 46774 + }, + { + "epoch": 0.8683996539471193, + "grad_norm": 0.3960273861885071, + "learning_rate": 8.424147103875147e-07, + "loss": 0.4122, + "step": 46776 + }, + { + "epoch": 0.868436784084538, + "grad_norm": 0.3743881583213806, + "learning_rate": 8.419461593800082e-07, + "loss": 0.3918, + "step": 46778 + }, + { + "epoch": 0.8684739142219565, + "grad_norm": 0.30909842252731323, + "learning_rate": 8.414777329861845e-07, + "loss": 0.5219, + "step": 46780 + }, + { + "epoch": 0.8685110443593752, + "grad_norm": 0.40392014384269714, + "learning_rate": 8.410094312124151e-07, + "loss": 0.3892, + "step": 46782 + }, + { + "epoch": 0.8685481744967938, + "grad_norm": 0.3154885768890381, + "learning_rate": 8.405412540650737e-07, + "loss": 0.3524, + "step": 46784 + }, + { + "epoch": 0.8685853046342125, + "grad_norm": 0.8122543096542358, + "learning_rate": 8.400732015505309e-07, + "loss": 0.1452, + "step": 46786 + }, + { + "epoch": 0.868622434771631, + "grad_norm": 0.2996312975883484, + "learning_rate": 8.396052736751537e-07, + "loss": 0.3771, + "step": 46788 + }, + { + "epoch": 0.8686595649090497, + "grad_norm": 0.423929363489151, + "learning_rate": 8.391374704453093e-07, + "loss": 0.0882, + "step": 46790 + }, + { + "epoch": 0.8686966950464684, + "grad_norm": 0.3788263201713562, + "learning_rate": 8.386697918673659e-07, + "loss": 0.1789, + "step": 46792 + }, + { + "epoch": 0.868733825183887, + "grad_norm": 0.5026527047157288, + "learning_rate": 8.38202237947684e-07, + "loss": 0.2108, + "step": 46794 + }, + { + "epoch": 0.8687709553213057, + "grad_norm": 0.5362962484359741, + "learning_rate": 8.377348086926262e-07, + "loss": 0.2154, + "step": 46796 + }, + { + "epoch": 0.8688080854587242, + "grad_norm": 0.3448481559753418, + "learning_rate": 8.372675041085565e-07, + "loss": 0.2406, + "step": 46798 + }, + { + "epoch": 0.8688452155961429, + "grad_norm": 0.30452924966812134, + "learning_rate": 8.368003242018274e-07, + "loss": 0.1886, + "step": 46800 + }, + { + "epoch": 0.8688823457335616, + "grad_norm": 0.3973161280155182, + "learning_rate": 8.363332689787995e-07, + "loss": 0.2206, + "step": 46802 + }, + { + "epoch": 0.8689194758709802, + "grad_norm": 0.34383252263069153, + "learning_rate": 8.358663384458276e-07, + "loss": 0.1737, + "step": 46804 + }, + { + "epoch": 0.8689566060083989, + "grad_norm": 0.8220702409744263, + "learning_rate": 8.353995326092645e-07, + "loss": 0.4832, + "step": 46806 + }, + { + "epoch": 0.8689937361458174, + "grad_norm": 0.27525028586387634, + "learning_rate": 8.34932851475464e-07, + "loss": 0.2589, + "step": 46808 + }, + { + "epoch": 0.8690308662832361, + "grad_norm": 0.3732289671897888, + "learning_rate": 8.344662950507753e-07, + "loss": 0.3137, + "step": 46810 + }, + { + "epoch": 0.8690679964206548, + "grad_norm": 0.3179345428943634, + "learning_rate": 8.339998633415447e-07, + "loss": 0.3431, + "step": 46812 + }, + { + "epoch": 0.8691051265580734, + "grad_norm": 0.45786359906196594, + "learning_rate": 8.335335563541225e-07, + "loss": 0.257, + "step": 46814 + }, + { + "epoch": 0.869142256695492, + "grad_norm": 0.32367852330207825, + "learning_rate": 8.330673740948503e-07, + "loss": 0.3346, + "step": 46816 + }, + { + "epoch": 0.8691793868329106, + "grad_norm": 0.5150794982910156, + "learning_rate": 8.326013165700731e-07, + "loss": 0.4357, + "step": 46818 + }, + { + "epoch": 0.8692165169703293, + "grad_norm": 0.563178300857544, + "learning_rate": 8.321353837861323e-07, + "loss": 0.2732, + "step": 46820 + }, + { + "epoch": 0.869253647107748, + "grad_norm": 0.3412834107875824, + "learning_rate": 8.316695757493676e-07, + "loss": 0.2039, + "step": 46822 + }, + { + "epoch": 0.8692907772451666, + "grad_norm": 0.42963916063308716, + "learning_rate": 8.312038924661203e-07, + "loss": 0.1618, + "step": 46824 + }, + { + "epoch": 0.8693279073825853, + "grad_norm": 0.5310094952583313, + "learning_rate": 8.307383339427222e-07, + "loss": 0.3348, + "step": 46826 + }, + { + "epoch": 0.8693650375200038, + "grad_norm": 0.4207155108451843, + "learning_rate": 8.302729001855092e-07, + "loss": 0.1522, + "step": 46828 + }, + { + "epoch": 0.8694021676574225, + "grad_norm": 0.4837324023246765, + "learning_rate": 8.298075912008164e-07, + "loss": 0.2672, + "step": 46830 + }, + { + "epoch": 0.8694392977948412, + "grad_norm": 0.7493934035301208, + "learning_rate": 8.293424069949751e-07, + "loss": 0.2496, + "step": 46832 + }, + { + "epoch": 0.8694764279322598, + "grad_norm": 0.32899099588394165, + "learning_rate": 8.28877347574315e-07, + "loss": 0.2558, + "step": 46834 + }, + { + "epoch": 0.8695135580696784, + "grad_norm": 0.25818580389022827, + "learning_rate": 8.284124129451643e-07, + "loss": 0.366, + "step": 46836 + }, + { + "epoch": 0.869550688207097, + "grad_norm": 0.3294360339641571, + "learning_rate": 8.279476031138456e-07, + "loss": 0.1956, + "step": 46838 + }, + { + "epoch": 0.8695878183445157, + "grad_norm": 0.3252273499965668, + "learning_rate": 8.274829180866873e-07, + "loss": 0.307, + "step": 46840 + }, + { + "epoch": 0.8696249484819343, + "grad_norm": 0.22662734985351562, + "learning_rate": 8.27018357870012e-07, + "loss": 0.1231, + "step": 46842 + }, + { + "epoch": 0.869662078619353, + "grad_norm": 0.46891823410987854, + "learning_rate": 8.265539224701402e-07, + "loss": 0.1574, + "step": 46844 + }, + { + "epoch": 0.8696992087567716, + "grad_norm": 0.4036763608455658, + "learning_rate": 8.260896118933914e-07, + "loss": 0.279, + "step": 46846 + }, + { + "epoch": 0.8697363388941902, + "grad_norm": 0.3151242136955261, + "learning_rate": 8.256254261460861e-07, + "loss": 0.198, + "step": 46848 + }, + { + "epoch": 0.8697734690316089, + "grad_norm": 0.2610291540622711, + "learning_rate": 8.25161365234537e-07, + "loss": 0.1894, + "step": 46850 + }, + { + "epoch": 0.8698105991690275, + "grad_norm": 0.3865082859992981, + "learning_rate": 8.2469742916506e-07, + "loss": 0.2579, + "step": 46852 + }, + { + "epoch": 0.8698477293064462, + "grad_norm": 0.2691592276096344, + "learning_rate": 8.242336179439669e-07, + "loss": 0.2302, + "step": 46854 + }, + { + "epoch": 0.8698848594438648, + "grad_norm": 0.34262847900390625, + "learning_rate": 8.237699315775716e-07, + "loss": 0.428, + "step": 46856 + }, + { + "epoch": 0.8699219895812834, + "grad_norm": 0.4384131133556366, + "learning_rate": 8.2330637007218e-07, + "loss": 0.3155, + "step": 46858 + }, + { + "epoch": 0.8699591197187021, + "grad_norm": 0.41056200861930847, + "learning_rate": 8.228429334341037e-07, + "loss": 0.3629, + "step": 46860 + }, + { + "epoch": 0.8699962498561207, + "grad_norm": 0.5439568161964417, + "learning_rate": 8.223796216696433e-07, + "loss": 0.1458, + "step": 46862 + }, + { + "epoch": 0.8700333799935394, + "grad_norm": 0.5890020728111267, + "learning_rate": 8.219164347851071e-07, + "loss": 0.1753, + "step": 46864 + }, + { + "epoch": 0.870070510130958, + "grad_norm": 0.33165672421455383, + "learning_rate": 8.214533727867957e-07, + "loss": 0.1726, + "step": 46866 + }, + { + "epoch": 0.8701076402683766, + "grad_norm": 0.47224748134613037, + "learning_rate": 8.209904356810116e-07, + "loss": 0.4172, + "step": 46868 + }, + { + "epoch": 0.8701447704057953, + "grad_norm": 0.3229072093963623, + "learning_rate": 8.205276234740534e-07, + "loss": 0.1533, + "step": 46870 + }, + { + "epoch": 0.8701819005432139, + "grad_norm": 0.43850961327552795, + "learning_rate": 8.20064936172219e-07, + "loss": 0.2847, + "step": 46872 + }, + { + "epoch": 0.8702190306806326, + "grad_norm": 0.4134316146373749, + "learning_rate": 8.196023737818048e-07, + "loss": 0.3256, + "step": 46874 + }, + { + "epoch": 0.8702561608180512, + "grad_norm": 0.41142797470092773, + "learning_rate": 8.191399363091024e-07, + "loss": 0.5251, + "step": 46876 + }, + { + "epoch": 0.8702932909554698, + "grad_norm": 0.37773585319519043, + "learning_rate": 8.186776237604066e-07, + "loss": 0.4785, + "step": 46878 + }, + { + "epoch": 0.8703304210928885, + "grad_norm": 0.3806646764278412, + "learning_rate": 8.182154361420059e-07, + "loss": 0.199, + "step": 46880 + }, + { + "epoch": 0.8703675512303071, + "grad_norm": 0.40309634804725647, + "learning_rate": 8.177533734601905e-07, + "loss": 0.3107, + "step": 46882 + }, + { + "epoch": 0.8704046813677258, + "grad_norm": 0.4841700792312622, + "learning_rate": 8.172914357212481e-07, + "loss": 0.3472, + "step": 46884 + }, + { + "epoch": 0.8704418115051444, + "grad_norm": 0.5577847957611084, + "learning_rate": 8.168296229314653e-07, + "loss": 0.4672, + "step": 46886 + }, + { + "epoch": 0.870478941642563, + "grad_norm": 0.3581286370754242, + "learning_rate": 8.16367935097122e-07, + "loss": 0.3389, + "step": 46888 + }, + { + "epoch": 0.8705160717799817, + "grad_norm": 0.3973256051540375, + "learning_rate": 8.159063722245042e-07, + "loss": 0.2965, + "step": 46890 + }, + { + "epoch": 0.8705532019174003, + "grad_norm": 0.24436356127262115, + "learning_rate": 8.1544493431989e-07, + "loss": 0.2871, + "step": 46892 + }, + { + "epoch": 0.870590332054819, + "grad_norm": 0.4235929548740387, + "learning_rate": 8.149836213895601e-07, + "loss": 0.224, + "step": 46894 + }, + { + "epoch": 0.8706274621922375, + "grad_norm": 0.29665058851242065, + "learning_rate": 8.145224334397906e-07, + "loss": 0.6374, + "step": 46896 + }, + { + "epoch": 0.8706645923296562, + "grad_norm": 0.28437289595603943, + "learning_rate": 8.140613704768597e-07, + "loss": 0.1848, + "step": 46898 + }, + { + "epoch": 0.8707017224670749, + "grad_norm": 0.34889310598373413, + "learning_rate": 8.136004325070368e-07, + "loss": 0.2822, + "step": 46900 + }, + { + "epoch": 0.8707388526044935, + "grad_norm": 0.3243907690048218, + "learning_rate": 8.131396195365948e-07, + "loss": 0.3827, + "step": 46902 + }, + { + "epoch": 0.8707759827419121, + "grad_norm": 0.34206849336624146, + "learning_rate": 8.126789315718042e-07, + "loss": 0.3107, + "step": 46904 + }, + { + "epoch": 0.8708131128793307, + "grad_norm": 0.347871333360672, + "learning_rate": 8.122183686189345e-07, + "loss": 0.1952, + "step": 46906 + }, + { + "epoch": 0.8708502430167494, + "grad_norm": 0.4802897274494171, + "learning_rate": 8.117579306842527e-07, + "loss": 0.2035, + "step": 46908 + }, + { + "epoch": 0.8708873731541681, + "grad_norm": 0.4295545816421509, + "learning_rate": 8.112976177740228e-07, + "loss": 0.2716, + "step": 46910 + }, + { + "epoch": 0.8709245032915867, + "grad_norm": 0.33927562832832336, + "learning_rate": 8.10837429894511e-07, + "loss": 0.3697, + "step": 46912 + }, + { + "epoch": 0.8709616334290053, + "grad_norm": 0.4792118966579437, + "learning_rate": 8.103773670519755e-07, + "loss": 0.33, + "step": 46914 + }, + { + "epoch": 0.8709987635664239, + "grad_norm": 0.3199096918106079, + "learning_rate": 8.099174292526768e-07, + "loss": 0.1472, + "step": 46916 + }, + { + "epoch": 0.8710358937038426, + "grad_norm": 0.44600316882133484, + "learning_rate": 8.094576165028756e-07, + "loss": 0.3872, + "step": 46918 + }, + { + "epoch": 0.8710730238412613, + "grad_norm": 0.41273948550224304, + "learning_rate": 8.08997928808829e-07, + "loss": 0.2859, + "step": 46920 + }, + { + "epoch": 0.8711101539786799, + "grad_norm": 0.5141773223876953, + "learning_rate": 8.085383661767887e-07, + "loss": 0.1894, + "step": 46922 + }, + { + "epoch": 0.8711472841160985, + "grad_norm": 0.33083102107048035, + "learning_rate": 8.080789286130108e-07, + "loss": 0.4187, + "step": 46924 + }, + { + "epoch": 0.8711844142535171, + "grad_norm": 0.42429351806640625, + "learning_rate": 8.076196161237437e-07, + "loss": 0.1417, + "step": 46926 + }, + { + "epoch": 0.8712215443909358, + "grad_norm": 0.33410561084747314, + "learning_rate": 8.071604287152401e-07, + "loss": 0.3883, + "step": 46928 + }, + { + "epoch": 0.8712586745283545, + "grad_norm": 0.39677125215530396, + "learning_rate": 8.067013663937473e-07, + "loss": 0.4148, + "step": 46930 + }, + { + "epoch": 0.871295804665773, + "grad_norm": 0.4086270332336426, + "learning_rate": 8.062424291655114e-07, + "loss": 0.3543, + "step": 46932 + }, + { + "epoch": 0.8713329348031917, + "grad_norm": 0.3252888023853302, + "learning_rate": 8.057836170367772e-07, + "loss": 0.3542, + "step": 46934 + }, + { + "epoch": 0.8713700649406103, + "grad_norm": 0.14655110239982605, + "learning_rate": 8.0532493001379e-07, + "loss": 0.1796, + "step": 46936 + }, + { + "epoch": 0.871407195078029, + "grad_norm": 0.3896908164024353, + "learning_rate": 8.04866368102788e-07, + "loss": 0.22, + "step": 46938 + }, + { + "epoch": 0.8714443252154476, + "grad_norm": 0.4422926604747772, + "learning_rate": 8.044079313100117e-07, + "loss": 0.4266, + "step": 46940 + }, + { + "epoch": 0.8714814553528663, + "grad_norm": 0.28147879242897034, + "learning_rate": 8.039496196417018e-07, + "loss": 0.201, + "step": 46942 + }, + { + "epoch": 0.8715185854902849, + "grad_norm": 0.5568515062332153, + "learning_rate": 8.034914331040899e-07, + "loss": 0.3268, + "step": 46944 + }, + { + "epoch": 0.8715557156277035, + "grad_norm": 0.9151408672332764, + "learning_rate": 8.030333717034133e-07, + "loss": 0.2655, + "step": 46946 + }, + { + "epoch": 0.8715928457651222, + "grad_norm": 0.39348793029785156, + "learning_rate": 8.025754354459036e-07, + "loss": 0.2175, + "step": 46948 + }, + { + "epoch": 0.8716299759025408, + "grad_norm": 0.7325469255447388, + "learning_rate": 8.021176243377948e-07, + "loss": 0.3379, + "step": 46950 + }, + { + "epoch": 0.8716671060399594, + "grad_norm": 0.24807314574718475, + "learning_rate": 8.01659938385313e-07, + "loss": 0.1573, + "step": 46952 + }, + { + "epoch": 0.8717042361773781, + "grad_norm": 0.3901421129703522, + "learning_rate": 8.012023775946875e-07, + "loss": 0.4307, + "step": 46954 + }, + { + "epoch": 0.8717413663147967, + "grad_norm": 0.3493906855583191, + "learning_rate": 8.007449419721436e-07, + "loss": 0.3036, + "step": 46956 + }, + { + "epoch": 0.8717784964522154, + "grad_norm": 0.46491196751594543, + "learning_rate": 8.002876315239061e-07, + "loss": 0.187, + "step": 46958 + }, + { + "epoch": 0.871815626589634, + "grad_norm": 0.2769257128238678, + "learning_rate": 7.998304462561989e-07, + "loss": 0.2518, + "step": 46960 + }, + { + "epoch": 0.8718527567270526, + "grad_norm": 0.5014168620109558, + "learning_rate": 7.993733861752407e-07, + "loss": 0.2515, + "step": 46962 + }, + { + "epoch": 0.8718898868644713, + "grad_norm": 0.46553605794906616, + "learning_rate": 7.989164512872527e-07, + "loss": 0.217, + "step": 46964 + }, + { + "epoch": 0.8719270170018899, + "grad_norm": 0.30732080340385437, + "learning_rate": 7.984596415984491e-07, + "loss": 0.2431, + "step": 46966 + }, + { + "epoch": 0.8719641471393086, + "grad_norm": 0.43322813510894775, + "learning_rate": 7.980029571150494e-07, + "loss": 0.2653, + "step": 46968 + }, + { + "epoch": 0.8720012772767272, + "grad_norm": 0.9851709604263306, + "learning_rate": 7.975463978432652e-07, + "loss": 0.3684, + "step": 46970 + }, + { + "epoch": 0.8720384074141458, + "grad_norm": 0.41028451919555664, + "learning_rate": 7.970899637893104e-07, + "loss": 0.2304, + "step": 46972 + }, + { + "epoch": 0.8720755375515645, + "grad_norm": 0.40107613801956177, + "learning_rate": 7.966336549593967e-07, + "loss": 0.1848, + "step": 46974 + }, + { + "epoch": 0.8721126676889831, + "grad_norm": 0.21391628682613373, + "learning_rate": 7.961774713597304e-07, + "loss": 0.2186, + "step": 46976 + }, + { + "epoch": 0.8721497978264018, + "grad_norm": 0.3562232255935669, + "learning_rate": 7.957214129965207e-07, + "loss": 0.2498, + "step": 46978 + }, + { + "epoch": 0.8721869279638204, + "grad_norm": 0.3141767382621765, + "learning_rate": 7.952654798759718e-07, + "loss": 0.4085, + "step": 46980 + }, + { + "epoch": 0.872224058101239, + "grad_norm": 0.5665150284767151, + "learning_rate": 7.948096720042897e-07, + "loss": 0.3, + "step": 46982 + }, + { + "epoch": 0.8722611882386577, + "grad_norm": 0.4706290364265442, + "learning_rate": 7.94353989387674e-07, + "loss": 0.2393, + "step": 46984 + }, + { + "epoch": 0.8722983183760763, + "grad_norm": 0.42412692308425903, + "learning_rate": 7.938984320323262e-07, + "loss": 0.4413, + "step": 46986 + }, + { + "epoch": 0.872335448513495, + "grad_norm": 0.2788406312465668, + "learning_rate": 7.93442999944447e-07, + "loss": 0.2225, + "step": 46988 + }, + { + "epoch": 0.8723725786509136, + "grad_norm": 0.5291059613227844, + "learning_rate": 7.929876931302305e-07, + "loss": 0.1665, + "step": 46990 + }, + { + "epoch": 0.8724097087883322, + "grad_norm": 0.3849498927593231, + "learning_rate": 7.925325115958727e-07, + "loss": 0.1583, + "step": 46992 + }, + { + "epoch": 0.8724468389257508, + "grad_norm": 0.4301437437534332, + "learning_rate": 7.920774553475685e-07, + "loss": 0.3543, + "step": 46994 + }, + { + "epoch": 0.8724839690631695, + "grad_norm": 0.3828851580619812, + "learning_rate": 7.916225243915087e-07, + "loss": 0.1856, + "step": 46996 + }, + { + "epoch": 0.8725210992005882, + "grad_norm": 0.36980459094047546, + "learning_rate": 7.91167718733884e-07, + "loss": 0.2612, + "step": 46998 + }, + { + "epoch": 0.8725582293380068, + "grad_norm": 0.31478601694107056, + "learning_rate": 7.907130383808847e-07, + "loss": 0.2121, + "step": 47000 + }, + { + "epoch": 0.8725953594754254, + "grad_norm": 0.29716619849205017, + "learning_rate": 7.902584833386939e-07, + "loss": 0.3253, + "step": 47002 + }, + { + "epoch": 0.872632489612844, + "grad_norm": 0.3973672091960907, + "learning_rate": 7.898040536134999e-07, + "loss": 0.4157, + "step": 47004 + }, + { + "epoch": 0.8726696197502627, + "grad_norm": 0.3406217694282532, + "learning_rate": 7.893497492114854e-07, + "loss": 0.2602, + "step": 47006 + }, + { + "epoch": 0.8727067498876814, + "grad_norm": 0.40049558877944946, + "learning_rate": 7.888955701388312e-07, + "loss": 0.3306, + "step": 47008 + }, + { + "epoch": 0.8727438800251, + "grad_norm": 0.44139474630355835, + "learning_rate": 7.884415164017167e-07, + "loss": 0.1273, + "step": 47010 + }, + { + "epoch": 0.8727810101625186, + "grad_norm": 0.4285643696784973, + "learning_rate": 7.879875880063237e-07, + "loss": 0.2803, + "step": 47012 + }, + { + "epoch": 0.8728181402999372, + "grad_norm": 0.4406658411026001, + "learning_rate": 7.875337849588249e-07, + "loss": 0.1408, + "step": 47014 + }, + { + "epoch": 0.8728552704373559, + "grad_norm": 0.36838412284851074, + "learning_rate": 7.870801072653966e-07, + "loss": 0.145, + "step": 47016 + }, + { + "epoch": 0.8728924005747746, + "grad_norm": 0.8065474033355713, + "learning_rate": 7.866265549322127e-07, + "loss": 0.2099, + "step": 47018 + }, + { + "epoch": 0.8729295307121931, + "grad_norm": 0.3564400374889374, + "learning_rate": 7.861731279654428e-07, + "loss": 0.3134, + "step": 47020 + }, + { + "epoch": 0.8729666608496118, + "grad_norm": 0.30831846594810486, + "learning_rate": 7.857198263712595e-07, + "loss": 0.2754, + "step": 47022 + }, + { + "epoch": 0.8730037909870304, + "grad_norm": 0.4285091161727905, + "learning_rate": 7.852666501558304e-07, + "loss": 0.1815, + "step": 47024 + }, + { + "epoch": 0.8730409211244491, + "grad_norm": 0.294721782207489, + "learning_rate": 7.848135993253192e-07, + "loss": 0.1206, + "step": 47026 + }, + { + "epoch": 0.8730780512618678, + "grad_norm": 0.4550662636756897, + "learning_rate": 7.843606738858934e-07, + "loss": 0.3837, + "step": 47028 + }, + { + "epoch": 0.8731151813992863, + "grad_norm": 0.37050822377204895, + "learning_rate": 7.839078738437133e-07, + "loss": 0.1648, + "step": 47030 + }, + { + "epoch": 0.873152311536705, + "grad_norm": 0.45092859864234924, + "learning_rate": 7.834551992049422e-07, + "loss": 0.3774, + "step": 47032 + }, + { + "epoch": 0.8731894416741236, + "grad_norm": 0.30006077885627747, + "learning_rate": 7.830026499757393e-07, + "loss": 0.2258, + "step": 47034 + }, + { + "epoch": 0.8732265718115423, + "grad_norm": 0.5308423638343811, + "learning_rate": 7.825502261622608e-07, + "loss": 0.0827, + "step": 47036 + }, + { + "epoch": 0.8732637019489609, + "grad_norm": 0.5286074876785278, + "learning_rate": 7.820979277706675e-07, + "loss": 0.2868, + "step": 47038 + }, + { + "epoch": 0.8733008320863795, + "grad_norm": 0.177654430270195, + "learning_rate": 7.816457548071088e-07, + "loss": 0.177, + "step": 47040 + }, + { + "epoch": 0.8733379622237982, + "grad_norm": 0.38603055477142334, + "learning_rate": 7.811937072777387e-07, + "loss": 0.253, + "step": 47042 + }, + { + "epoch": 0.8733750923612168, + "grad_norm": 0.250284880399704, + "learning_rate": 7.807417851887078e-07, + "loss": 0.3812, + "step": 47044 + }, + { + "epoch": 0.8734122224986355, + "grad_norm": 0.16648025810718536, + "learning_rate": 7.80289988546169e-07, + "loss": 0.3247, + "step": 47046 + }, + { + "epoch": 0.873449352636054, + "grad_norm": 0.6877380013465881, + "learning_rate": 7.798383173562663e-07, + "loss": 0.2944, + "step": 47048 + }, + { + "epoch": 0.8734864827734727, + "grad_norm": 0.3495990037918091, + "learning_rate": 7.793867716251468e-07, + "loss": 0.1867, + "step": 47050 + }, + { + "epoch": 0.8735236129108914, + "grad_norm": 0.4121159315109253, + "learning_rate": 7.789353513589537e-07, + "loss": 0.4015, + "step": 47052 + }, + { + "epoch": 0.87356074304831, + "grad_norm": 0.31774282455444336, + "learning_rate": 7.784840565638296e-07, + "loss": 0.3181, + "step": 47054 + }, + { + "epoch": 0.8735978731857287, + "grad_norm": 0.5005460381507874, + "learning_rate": 7.780328872459164e-07, + "loss": 0.3186, + "step": 47056 + }, + { + "epoch": 0.8736350033231473, + "grad_norm": 0.4335612654685974, + "learning_rate": 7.775818434113514e-07, + "loss": 0.3512, + "step": 47058 + }, + { + "epoch": 0.8736721334605659, + "grad_norm": 0.5132916569709778, + "learning_rate": 7.77130925066274e-07, + "loss": 0.3421, + "step": 47060 + }, + { + "epoch": 0.8737092635979846, + "grad_norm": 0.5163152813911438, + "learning_rate": 7.766801322168216e-07, + "loss": 0.3677, + "step": 47062 + }, + { + "epoch": 0.8737463937354032, + "grad_norm": 0.37245506048202515, + "learning_rate": 7.762294648691226e-07, + "loss": 0.3194, + "step": 47064 + }, + { + "epoch": 0.8737835238728219, + "grad_norm": 0.34529978036880493, + "learning_rate": 7.757789230293122e-07, + "loss": 0.3476, + "step": 47066 + }, + { + "epoch": 0.8738206540102404, + "grad_norm": 0.4306011199951172, + "learning_rate": 7.753285067035232e-07, + "loss": 0.1598, + "step": 47068 + }, + { + "epoch": 0.8738577841476591, + "grad_norm": 0.5175257325172424, + "learning_rate": 7.748782158978807e-07, + "loss": 0.2032, + "step": 47070 + }, + { + "epoch": 0.8738949142850778, + "grad_norm": 0.35490211844444275, + "learning_rate": 7.744280506185131e-07, + "loss": 0.3039, + "step": 47072 + }, + { + "epoch": 0.8739320444224964, + "grad_norm": 0.25291216373443604, + "learning_rate": 7.739780108715455e-07, + "loss": 0.2381, + "step": 47074 + }, + { + "epoch": 0.8739691745599151, + "grad_norm": 0.3430781662464142, + "learning_rate": 7.735280966631032e-07, + "loss": 0.3225, + "step": 47076 + }, + { + "epoch": 0.8740063046973336, + "grad_norm": 0.23195859789848328, + "learning_rate": 7.730783079993065e-07, + "loss": 0.1756, + "step": 47078 + }, + { + "epoch": 0.8740434348347523, + "grad_norm": 0.5487247705459595, + "learning_rate": 7.726286448862752e-07, + "loss": 0.2531, + "step": 47080 + }, + { + "epoch": 0.874080564972171, + "grad_norm": 0.43528392910957336, + "learning_rate": 7.721791073301299e-07, + "loss": 0.3473, + "step": 47082 + }, + { + "epoch": 0.8741176951095896, + "grad_norm": 0.3767924904823303, + "learning_rate": 7.717296953369857e-07, + "loss": 0.1429, + "step": 47084 + }, + { + "epoch": 0.8741548252470083, + "grad_norm": 0.36049363017082214, + "learning_rate": 7.712804089129589e-07, + "loss": 0.2052, + "step": 47086 + }, + { + "epoch": 0.8741919553844268, + "grad_norm": 0.17538847029209137, + "learning_rate": 7.708312480641633e-07, + "loss": 0.156, + "step": 47088 + }, + { + "epoch": 0.8742290855218455, + "grad_norm": 0.4045586585998535, + "learning_rate": 7.703822127967108e-07, + "loss": 0.2576, + "step": 47090 + }, + { + "epoch": 0.8742662156592641, + "grad_norm": 0.5529013276100159, + "learning_rate": 7.699333031167089e-07, + "loss": 0.3724, + "step": 47092 + }, + { + "epoch": 0.8743033457966828, + "grad_norm": 0.5965579748153687, + "learning_rate": 7.694845190302669e-07, + "loss": 0.2175, + "step": 47094 + }, + { + "epoch": 0.8743404759341015, + "grad_norm": 0.683628261089325, + "learning_rate": 7.690358605434923e-07, + "loss": 0.1316, + "step": 47096 + }, + { + "epoch": 0.87437760607152, + "grad_norm": 0.3826424479484558, + "learning_rate": 7.685873276624888e-07, + "loss": 0.1161, + "step": 47098 + }, + { + "epoch": 0.8744147362089387, + "grad_norm": 0.586037278175354, + "learning_rate": 7.681389203933631e-07, + "loss": 0.1938, + "step": 47100 + }, + { + "epoch": 0.8744518663463573, + "grad_norm": 0.5722419023513794, + "learning_rate": 7.676906387422123e-07, + "loss": 0.3818, + "step": 47102 + }, + { + "epoch": 0.874488996483776, + "grad_norm": 0.2506225109100342, + "learning_rate": 7.672424827151381e-07, + "loss": 0.3142, + "step": 47104 + }, + { + "epoch": 0.8745261266211947, + "grad_norm": 0.3636063039302826, + "learning_rate": 7.66794452318238e-07, + "loss": 0.1996, + "step": 47106 + }, + { + "epoch": 0.8745632567586132, + "grad_norm": 0.3397851288318634, + "learning_rate": 7.663465475576093e-07, + "loss": 0.2193, + "step": 47108 + }, + { + "epoch": 0.8746003868960319, + "grad_norm": 0.3968251943588257, + "learning_rate": 7.658987684393471e-07, + "loss": 0.4408, + "step": 47110 + }, + { + "epoch": 0.8746375170334505, + "grad_norm": 0.4553351104259491, + "learning_rate": 7.65451114969542e-07, + "loss": 0.1583, + "step": 47112 + }, + { + "epoch": 0.8746746471708692, + "grad_norm": 0.3876280188560486, + "learning_rate": 7.650035871542883e-07, + "loss": 0.2165, + "step": 47114 + }, + { + "epoch": 0.8747117773082879, + "grad_norm": 0.587955892086029, + "learning_rate": 7.645561849996719e-07, + "loss": 0.1299, + "step": 47116 + }, + { + "epoch": 0.8747489074457064, + "grad_norm": 0.48590391874313354, + "learning_rate": 7.641089085117825e-07, + "loss": 0.1491, + "step": 47118 + }, + { + "epoch": 0.8747860375831251, + "grad_norm": 0.6574304103851318, + "learning_rate": 7.636617576967065e-07, + "loss": 0.1806, + "step": 47120 + }, + { + "epoch": 0.8748231677205437, + "grad_norm": 0.34984254837036133, + "learning_rate": 7.632147325605277e-07, + "loss": 0.3646, + "step": 47122 + }, + { + "epoch": 0.8748602978579624, + "grad_norm": 0.2127975970506668, + "learning_rate": 7.627678331093291e-07, + "loss": 0.1612, + "step": 47124 + }, + { + "epoch": 0.8748974279953811, + "grad_norm": 0.5778454542160034, + "learning_rate": 7.623210593491936e-07, + "loss": 0.3123, + "step": 47126 + }, + { + "epoch": 0.8749345581327996, + "grad_norm": 0.3468020260334015, + "learning_rate": 7.618744112861954e-07, + "loss": 0.4669, + "step": 47128 + }, + { + "epoch": 0.8749716882702183, + "grad_norm": 0.3819958567619324, + "learning_rate": 7.61427888926416e-07, + "loss": 0.2631, + "step": 47130 + }, + { + "epoch": 0.8750088184076369, + "grad_norm": 0.32784876227378845, + "learning_rate": 7.609814922759318e-07, + "loss": 0.2944, + "step": 47132 + }, + { + "epoch": 0.8750459485450556, + "grad_norm": 0.4688751995563507, + "learning_rate": 7.605352213408145e-07, + "loss": 0.2224, + "step": 47134 + }, + { + "epoch": 0.8750830786824743, + "grad_norm": 0.2833612859249115, + "learning_rate": 7.600890761271363e-07, + "loss": 0.2655, + "step": 47136 + }, + { + "epoch": 0.8751202088198928, + "grad_norm": 0.5316777229309082, + "learning_rate": 7.596430566409719e-07, + "loss": 0.1875, + "step": 47138 + }, + { + "epoch": 0.8751573389573115, + "grad_norm": 0.2816583216190338, + "learning_rate": 7.591971628883843e-07, + "loss": 0.1647, + "step": 47140 + }, + { + "epoch": 0.8751944690947301, + "grad_norm": 0.3718714416027069, + "learning_rate": 7.587513948754455e-07, + "loss": 0.3959, + "step": 47142 + }, + { + "epoch": 0.8752315992321488, + "grad_norm": 0.35715925693511963, + "learning_rate": 7.583057526082183e-07, + "loss": 0.1979, + "step": 47144 + }, + { + "epoch": 0.8752687293695673, + "grad_norm": 0.4522768259048462, + "learning_rate": 7.578602360927678e-07, + "loss": 0.4378, + "step": 47146 + }, + { + "epoch": 0.875305859506986, + "grad_norm": 0.19632217288017273, + "learning_rate": 7.574148453351571e-07, + "loss": 0.2197, + "step": 47148 + }, + { + "epoch": 0.8753429896444047, + "grad_norm": 0.8570135235786438, + "learning_rate": 7.569695803414456e-07, + "loss": 0.241, + "step": 47150 + }, + { + "epoch": 0.8753801197818233, + "grad_norm": 0.24787214398384094, + "learning_rate": 7.56524441117692e-07, + "loss": 0.1696, + "step": 47152 + }, + { + "epoch": 0.875417249919242, + "grad_norm": 0.5230529308319092, + "learning_rate": 7.560794276699545e-07, + "loss": 0.3421, + "step": 47154 + }, + { + "epoch": 0.8754543800566605, + "grad_norm": 0.38280364871025085, + "learning_rate": 7.556345400042853e-07, + "loss": 0.1687, + "step": 47156 + }, + { + "epoch": 0.8754915101940792, + "grad_norm": 0.38877183198928833, + "learning_rate": 7.551897781267392e-07, + "loss": 0.1554, + "step": 47158 + }, + { + "epoch": 0.8755286403314979, + "grad_norm": 0.34239476919174194, + "learning_rate": 7.547451420433704e-07, + "loss": 0.3574, + "step": 47160 + }, + { + "epoch": 0.8755657704689165, + "grad_norm": 0.3600442111492157, + "learning_rate": 7.543006317602263e-07, + "loss": 0.1388, + "step": 47162 + }, + { + "epoch": 0.8756029006063352, + "grad_norm": 0.4931805729866028, + "learning_rate": 7.538562472833588e-07, + "loss": 0.1724, + "step": 47164 + }, + { + "epoch": 0.8756400307437537, + "grad_norm": 0.712645947933197, + "learning_rate": 7.534119886188108e-07, + "loss": 0.1219, + "step": 47166 + }, + { + "epoch": 0.8756771608811724, + "grad_norm": 0.2916858196258545, + "learning_rate": 7.529678557726283e-07, + "loss": 0.2207, + "step": 47168 + }, + { + "epoch": 0.8757142910185911, + "grad_norm": 0.41966116428375244, + "learning_rate": 7.525238487508557e-07, + "loss": 0.3076, + "step": 47170 + }, + { + "epoch": 0.8757514211560097, + "grad_norm": 0.3707370162010193, + "learning_rate": 7.520799675595336e-07, + "loss": 0.3253, + "step": 47172 + }, + { + "epoch": 0.8757885512934284, + "grad_norm": 0.6048619747161865, + "learning_rate": 7.516362122047038e-07, + "loss": 0.2772, + "step": 47174 + }, + { + "epoch": 0.8758256814308469, + "grad_norm": 0.3265807330608368, + "learning_rate": 7.511925826924038e-07, + "loss": 0.1958, + "step": 47176 + }, + { + "epoch": 0.8758628115682656, + "grad_norm": 0.45436006784439087, + "learning_rate": 7.507490790286675e-07, + "loss": 0.2952, + "step": 47178 + }, + { + "epoch": 0.8758999417056843, + "grad_norm": 0.4619840085506439, + "learning_rate": 7.503057012195325e-07, + "loss": 0.3443, + "step": 47180 + }, + { + "epoch": 0.8759370718431029, + "grad_norm": 0.5944494605064392, + "learning_rate": 7.498624492710293e-07, + "loss": 0.1944, + "step": 47182 + }, + { + "epoch": 0.8759742019805216, + "grad_norm": 0.4579395651817322, + "learning_rate": 7.494193231891922e-07, + "loss": 0.1944, + "step": 47184 + }, + { + "epoch": 0.8760113321179401, + "grad_norm": 0.37032416462898254, + "learning_rate": 7.489763229800484e-07, + "loss": 0.3257, + "step": 47186 + }, + { + "epoch": 0.8760484622553588, + "grad_norm": 0.5908249616622925, + "learning_rate": 7.485334486496287e-07, + "loss": 0.3229, + "step": 47188 + }, + { + "epoch": 0.8760855923927774, + "grad_norm": 0.49008986353874207, + "learning_rate": 7.480907002039561e-07, + "loss": 0.2797, + "step": 47190 + }, + { + "epoch": 0.8761227225301961, + "grad_norm": 0.44043025374412537, + "learning_rate": 7.476480776490558e-07, + "loss": 0.4102, + "step": 47192 + }, + { + "epoch": 0.8761598526676148, + "grad_norm": 0.5974876284599304, + "learning_rate": 7.472055809909517e-07, + "loss": 0.3626, + "step": 47194 + }, + { + "epoch": 0.8761969828050333, + "grad_norm": 0.44135090708732605, + "learning_rate": 7.467632102356659e-07, + "loss": 0.3204, + "step": 47196 + }, + { + "epoch": 0.876234112942452, + "grad_norm": 0.5945812463760376, + "learning_rate": 7.463209653892134e-07, + "loss": 0.2041, + "step": 47198 + }, + { + "epoch": 0.8762712430798706, + "grad_norm": 0.38129401206970215, + "learning_rate": 7.458788464576161e-07, + "loss": 0.3784, + "step": 47200 + }, + { + "epoch": 0.8763083732172893, + "grad_norm": 0.230791836977005, + "learning_rate": 7.454368534468892e-07, + "loss": 0.106, + "step": 47202 + }, + { + "epoch": 0.876345503354708, + "grad_norm": 0.5226245522499084, + "learning_rate": 7.449949863630446e-07, + "loss": 0.1858, + "step": 47204 + }, + { + "epoch": 0.8763826334921265, + "grad_norm": 0.4472498297691345, + "learning_rate": 7.445532452120963e-07, + "loss": 0.2524, + "step": 47206 + }, + { + "epoch": 0.8764197636295452, + "grad_norm": 0.3954940140247345, + "learning_rate": 7.44111630000055e-07, + "loss": 0.2839, + "step": 47208 + }, + { + "epoch": 0.8764568937669638, + "grad_norm": 0.46032893657684326, + "learning_rate": 7.436701407329305e-07, + "loss": 0.207, + "step": 47210 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.36054563522338867, + "learning_rate": 7.43228777416729e-07, + "loss": 0.178, + "step": 47212 + }, + { + "epoch": 0.8765311540418012, + "grad_norm": 0.3760378658771515, + "learning_rate": 7.42787540057458e-07, + "loss": 0.3141, + "step": 47214 + }, + { + "epoch": 0.8765682841792197, + "grad_norm": 0.39963260293006897, + "learning_rate": 7.423464286611181e-07, + "loss": 0.1283, + "step": 47216 + }, + { + "epoch": 0.8766054143166384, + "grad_norm": 0.4356691241264343, + "learning_rate": 7.419054432337159e-07, + "loss": 0.2503, + "step": 47218 + }, + { + "epoch": 0.876642544454057, + "grad_norm": 0.2949202358722687, + "learning_rate": 7.414645837812473e-07, + "loss": 0.3541, + "step": 47220 + }, + { + "epoch": 0.8766796745914757, + "grad_norm": 0.4196102023124695, + "learning_rate": 7.410238503097134e-07, + "loss": 0.2764, + "step": 47222 + }, + { + "epoch": 0.8767168047288943, + "grad_norm": 0.28274640440940857, + "learning_rate": 7.405832428251115e-07, + "loss": 0.1035, + "step": 47224 + }, + { + "epoch": 0.8767539348663129, + "grad_norm": 0.3193435072898865, + "learning_rate": 7.401427613334377e-07, + "loss": 0.3406, + "step": 47226 + }, + { + "epoch": 0.8767910650037316, + "grad_norm": 0.3037477731704712, + "learning_rate": 7.397024058406821e-07, + "loss": 0.2486, + "step": 47228 + }, + { + "epoch": 0.8768281951411502, + "grad_norm": 0.20076969265937805, + "learning_rate": 7.392621763528396e-07, + "loss": 0.2279, + "step": 47230 + }, + { + "epoch": 0.8768653252785689, + "grad_norm": 0.3188711404800415, + "learning_rate": 7.388220728758999e-07, + "loss": 0.3365, + "step": 47232 + }, + { + "epoch": 0.8769024554159875, + "grad_norm": 0.42716357111930847, + "learning_rate": 7.383820954158516e-07, + "loss": 0.4188, + "step": 47234 + }, + { + "epoch": 0.8769395855534061, + "grad_norm": 0.836982011795044, + "learning_rate": 7.379422439786821e-07, + "loss": 0.2588, + "step": 47236 + }, + { + "epoch": 0.8769767156908248, + "grad_norm": 0.5173723101615906, + "learning_rate": 7.375025185703744e-07, + "loss": 0.2435, + "step": 47238 + }, + { + "epoch": 0.8770138458282434, + "grad_norm": 0.29863041639328003, + "learning_rate": 7.370629191969148e-07, + "loss": 0.4831, + "step": 47240 + }, + { + "epoch": 0.8770509759656621, + "grad_norm": 0.35990795493125916, + "learning_rate": 7.366234458642806e-07, + "loss": 0.3187, + "step": 47242 + }, + { + "epoch": 0.8770881061030806, + "grad_norm": 0.4894818961620331, + "learning_rate": 7.361840985784552e-07, + "loss": 0.3501, + "step": 47244 + }, + { + "epoch": 0.8771252362404993, + "grad_norm": 0.38101810216903687, + "learning_rate": 7.357448773454156e-07, + "loss": 0.1858, + "step": 47246 + }, + { + "epoch": 0.877162366377918, + "grad_norm": 0.3442534804344177, + "learning_rate": 7.353057821711384e-07, + "loss": 0.2225, + "step": 47248 + }, + { + "epoch": 0.8771994965153366, + "grad_norm": 0.3877589702606201, + "learning_rate": 7.348668130615988e-07, + "loss": 0.2142, + "step": 47250 + }, + { + "epoch": 0.8772366266527553, + "grad_norm": 0.3409903049468994, + "learning_rate": 7.344279700227708e-07, + "loss": 0.2784, + "step": 47252 + }, + { + "epoch": 0.8772737567901738, + "grad_norm": 0.41828882694244385, + "learning_rate": 7.33989253060623e-07, + "loss": 0.402, + "step": 47254 + }, + { + "epoch": 0.8773108869275925, + "grad_norm": 0.3038763105869293, + "learning_rate": 7.335506621811272e-07, + "loss": 0.3185, + "step": 47256 + }, + { + "epoch": 0.8773480170650112, + "grad_norm": 0.40076905488967896, + "learning_rate": 7.331121973902522e-07, + "loss": 0.3043, + "step": 47258 + }, + { + "epoch": 0.8773851472024298, + "grad_norm": 0.4108754098415375, + "learning_rate": 7.326738586939608e-07, + "loss": 0.1978, + "step": 47260 + }, + { + "epoch": 0.8774222773398485, + "grad_norm": 0.30718308687210083, + "learning_rate": 7.322356460982183e-07, + "loss": 0.3155, + "step": 47262 + }, + { + "epoch": 0.877459407477267, + "grad_norm": 0.5411142706871033, + "learning_rate": 7.31797559608991e-07, + "loss": 0.2352, + "step": 47264 + }, + { + "epoch": 0.8774965376146857, + "grad_norm": 0.43927431106567383, + "learning_rate": 7.313595992322364e-07, + "loss": 0.231, + "step": 47266 + }, + { + "epoch": 0.8775336677521044, + "grad_norm": 0.5780447125434875, + "learning_rate": 7.309217649739142e-07, + "loss": 0.21, + "step": 47268 + }, + { + "epoch": 0.877570797889523, + "grad_norm": 0.4709116220474243, + "learning_rate": 7.304840568399829e-07, + "loss": 0.1916, + "step": 47270 + }, + { + "epoch": 0.8776079280269417, + "grad_norm": 0.3406688868999481, + "learning_rate": 7.300464748363978e-07, + "loss": 0.314, + "step": 47272 + }, + { + "epoch": 0.8776450581643602, + "grad_norm": 0.5913991928100586, + "learning_rate": 7.296090189691141e-07, + "loss": 0.3777, + "step": 47274 + }, + { + "epoch": 0.8776821883017789, + "grad_norm": 0.519207775592804, + "learning_rate": 7.291716892440848e-07, + "loss": 0.3022, + "step": 47276 + }, + { + "epoch": 0.8777193184391976, + "grad_norm": 0.5524421334266663, + "learning_rate": 7.287344856672574e-07, + "loss": 0.3045, + "step": 47278 + }, + { + "epoch": 0.8777564485766162, + "grad_norm": 0.6641913652420044, + "learning_rate": 7.282974082445837e-07, + "loss": 0.4042, + "step": 47280 + }, + { + "epoch": 0.8777935787140349, + "grad_norm": 0.3197381794452667, + "learning_rate": 7.278604569820113e-07, + "loss": 0.398, + "step": 47282 + }, + { + "epoch": 0.8778307088514534, + "grad_norm": 0.5199897885322571, + "learning_rate": 7.274236318854844e-07, + "loss": 0.385, + "step": 47284 + }, + { + "epoch": 0.8778678389888721, + "grad_norm": 0.49087437987327576, + "learning_rate": 7.269869329609458e-07, + "loss": 0.2597, + "step": 47286 + }, + { + "epoch": 0.8779049691262908, + "grad_norm": 0.45158281922340393, + "learning_rate": 7.265503602143398e-07, + "loss": 0.32, + "step": 47288 + }, + { + "epoch": 0.8779420992637094, + "grad_norm": 0.36116576194763184, + "learning_rate": 7.261139136516082e-07, + "loss": 0.3255, + "step": 47290 + }, + { + "epoch": 0.877979229401128, + "grad_norm": 0.4417078197002411, + "learning_rate": 7.256775932786853e-07, + "loss": 0.2966, + "step": 47292 + }, + { + "epoch": 0.8780163595385466, + "grad_norm": 0.29719746112823486, + "learning_rate": 7.252413991015117e-07, + "loss": 0.2038, + "step": 47294 + }, + { + "epoch": 0.8780534896759653, + "grad_norm": 0.3423810303211212, + "learning_rate": 7.248053311260206e-07, + "loss": 0.1581, + "step": 47296 + }, + { + "epoch": 0.8780906198133839, + "grad_norm": 3.120774030685425, + "learning_rate": 7.243693893581461e-07, + "loss": 0.282, + "step": 47298 + }, + { + "epoch": 0.8781277499508026, + "grad_norm": 0.22917719185352325, + "learning_rate": 7.239335738038234e-07, + "loss": 0.3139, + "step": 47300 + }, + { + "epoch": 0.8781648800882212, + "grad_norm": 0.3209822177886963, + "learning_rate": 7.234978844689778e-07, + "loss": 0.29, + "step": 47302 + }, + { + "epoch": 0.8782020102256398, + "grad_norm": 0.26659253239631653, + "learning_rate": 7.2306232135954e-07, + "loss": 0.1453, + "step": 47304 + }, + { + "epoch": 0.8782391403630585, + "grad_norm": 0.5321511030197144, + "learning_rate": 7.226268844814366e-07, + "loss": 0.3141, + "step": 47306 + }, + { + "epoch": 0.8782762705004771, + "grad_norm": 0.324145644903183, + "learning_rate": 7.221915738405905e-07, + "loss": 0.231, + "step": 47308 + }, + { + "epoch": 0.8783134006378958, + "grad_norm": 0.24725677073001862, + "learning_rate": 7.21756389442928e-07, + "loss": 0.2123, + "step": 47310 + }, + { + "epoch": 0.8783505307753144, + "grad_norm": 0.21191512048244476, + "learning_rate": 7.21321331294369e-07, + "loss": 0.2239, + "step": 47312 + }, + { + "epoch": 0.878387660912733, + "grad_norm": 0.2750058174133301, + "learning_rate": 7.208863994008364e-07, + "loss": 0.217, + "step": 47314 + }, + { + "epoch": 0.8784247910501517, + "grad_norm": 0.2188194990158081, + "learning_rate": 7.204515937682433e-07, + "loss": 0.1569, + "step": 47316 + }, + { + "epoch": 0.8784619211875703, + "grad_norm": 0.4977736473083496, + "learning_rate": 7.200169144025082e-07, + "loss": 0.2384, + "step": 47318 + }, + { + "epoch": 0.878499051324989, + "grad_norm": 0.40972423553466797, + "learning_rate": 7.195823613095465e-07, + "loss": 0.2436, + "step": 47320 + }, + { + "epoch": 0.8785361814624076, + "grad_norm": 0.461217999458313, + "learning_rate": 7.191479344952723e-07, + "loss": 0.3921, + "step": 47322 + }, + { + "epoch": 0.8785733115998262, + "grad_norm": 0.6426409482955933, + "learning_rate": 7.187136339655943e-07, + "loss": 0.4639, + "step": 47324 + }, + { + "epoch": 0.8786104417372449, + "grad_norm": 0.3847665786743164, + "learning_rate": 7.182794597264231e-07, + "loss": 0.2273, + "step": 47326 + }, + { + "epoch": 0.8786475718746635, + "grad_norm": 0.5064146518707275, + "learning_rate": 7.178454117836675e-07, + "loss": 0.3605, + "step": 47328 + }, + { + "epoch": 0.8786847020120822, + "grad_norm": 0.39227500557899475, + "learning_rate": 7.174114901432305e-07, + "loss": 0.1797, + "step": 47330 + }, + { + "epoch": 0.8787218321495008, + "grad_norm": 0.36443546414375305, + "learning_rate": 7.169776948110196e-07, + "loss": 0.0445, + "step": 47332 + }, + { + "epoch": 0.8787589622869194, + "grad_norm": 0.4813399612903595, + "learning_rate": 7.165440257929357e-07, + "loss": 0.2521, + "step": 47334 + }, + { + "epoch": 0.8787960924243381, + "grad_norm": 0.22999557852745056, + "learning_rate": 7.161104830948806e-07, + "loss": 0.0922, + "step": 47336 + }, + { + "epoch": 0.8788332225617567, + "grad_norm": 0.4467817544937134, + "learning_rate": 7.156770667227542e-07, + "loss": 0.2904, + "step": 47338 + }, + { + "epoch": 0.8788703526991754, + "grad_norm": 0.3459838628768921, + "learning_rate": 7.152437766824538e-07, + "loss": 0.2703, + "step": 47340 + }, + { + "epoch": 0.8789074828365939, + "grad_norm": 0.35821595788002014, + "learning_rate": 7.148106129798727e-07, + "loss": 0.2091, + "step": 47342 + }, + { + "epoch": 0.8789446129740126, + "grad_norm": 0.2748994827270508, + "learning_rate": 7.143775756209093e-07, + "loss": 0.2197, + "step": 47344 + }, + { + "epoch": 0.8789817431114313, + "grad_norm": 0.42028987407684326, + "learning_rate": 7.139446646114511e-07, + "loss": 0.2187, + "step": 47346 + }, + { + "epoch": 0.8790188732488499, + "grad_norm": 0.28626084327697754, + "learning_rate": 7.135118799573914e-07, + "loss": 0.304, + "step": 47348 + }, + { + "epoch": 0.8790560033862685, + "grad_norm": 0.3073729872703552, + "learning_rate": 7.130792216646187e-07, + "loss": 0.1929, + "step": 47350 + }, + { + "epoch": 0.8790931335236871, + "grad_norm": 0.25476524233818054, + "learning_rate": 7.126466897390217e-07, + "loss": 0.3705, + "step": 47352 + }, + { + "epoch": 0.8791302636611058, + "grad_norm": 0.3203011453151703, + "learning_rate": 7.122142841864832e-07, + "loss": 0.3743, + "step": 47354 + }, + { + "epoch": 0.8791673937985245, + "grad_norm": 0.39025232195854187, + "learning_rate": 7.117820050128877e-07, + "loss": 0.2974, + "step": 47356 + }, + { + "epoch": 0.8792045239359431, + "grad_norm": 0.37883907556533813, + "learning_rate": 7.11349852224118e-07, + "loss": 0.3199, + "step": 47358 + }, + { + "epoch": 0.8792416540733617, + "grad_norm": 0.2734426259994507, + "learning_rate": 7.109178258260541e-07, + "loss": 0.2347, + "step": 47360 + }, + { + "epoch": 0.8792787842107803, + "grad_norm": 0.3571133017539978, + "learning_rate": 7.104859258245744e-07, + "loss": 0.1747, + "step": 47362 + }, + { + "epoch": 0.879315914348199, + "grad_norm": 1.1186449527740479, + "learning_rate": 7.100541522255577e-07, + "loss": 0.5827, + "step": 47364 + }, + { + "epoch": 0.8793530444856177, + "grad_norm": 0.4527154862880707, + "learning_rate": 7.096225050348771e-07, + "loss": 0.3646, + "step": 47366 + }, + { + "epoch": 0.8793901746230363, + "grad_norm": 0.877415120601654, + "learning_rate": 7.091909842584055e-07, + "loss": 0.3252, + "step": 47368 + }, + { + "epoch": 0.8794273047604549, + "grad_norm": 0.403403639793396, + "learning_rate": 7.087595899020139e-07, + "loss": 0.2817, + "step": 47370 + }, + { + "epoch": 0.8794644348978735, + "grad_norm": 0.6720659732818604, + "learning_rate": 7.083283219715753e-07, + "loss": 0.2892, + "step": 47372 + }, + { + "epoch": 0.8795015650352922, + "grad_norm": 0.40734872221946716, + "learning_rate": 7.078971804729562e-07, + "loss": 0.6437, + "step": 47374 + }, + { + "epoch": 0.8795386951727109, + "grad_norm": 0.287028431892395, + "learning_rate": 7.07466165412023e-07, + "loss": 0.2804, + "step": 47376 + }, + { + "epoch": 0.8795758253101295, + "grad_norm": 0.5888128876686096, + "learning_rate": 7.07035276794642e-07, + "loss": 0.2229, + "step": 47378 + }, + { + "epoch": 0.8796129554475481, + "grad_norm": 0.44357457756996155, + "learning_rate": 7.066045146266743e-07, + "loss": 0.2179, + "step": 47380 + }, + { + "epoch": 0.8796500855849667, + "grad_norm": 0.2920122742652893, + "learning_rate": 7.061738789139816e-07, + "loss": 0.1843, + "step": 47382 + }, + { + "epoch": 0.8796872157223854, + "grad_norm": 0.5150615572929382, + "learning_rate": 7.057433696624249e-07, + "loss": 0.3675, + "step": 47384 + }, + { + "epoch": 0.8797243458598041, + "grad_norm": 0.3968726098537445, + "learning_rate": 7.05312986877863e-07, + "loss": 0.3215, + "step": 47386 + }, + { + "epoch": 0.8797614759972227, + "grad_norm": 0.4688175618648529, + "learning_rate": 7.048827305661487e-07, + "loss": 0.2676, + "step": 47388 + }, + { + "epoch": 0.8797986061346413, + "grad_norm": 0.548578143119812, + "learning_rate": 7.044526007331398e-07, + "loss": 0.1865, + "step": 47390 + }, + { + "epoch": 0.8798357362720599, + "grad_norm": 0.14987076818943024, + "learning_rate": 7.040225973846871e-07, + "loss": 0.2546, + "step": 47392 + }, + { + "epoch": 0.8798728664094786, + "grad_norm": 0.254211962223053, + "learning_rate": 7.035927205266402e-07, + "loss": 0.24, + "step": 47394 + }, + { + "epoch": 0.8799099965468972, + "grad_norm": 0.46713411808013916, + "learning_rate": 7.031629701648523e-07, + "loss": 0.3307, + "step": 47396 + }, + { + "epoch": 0.8799471266843159, + "grad_norm": 0.3724933862686157, + "learning_rate": 7.027333463051678e-07, + "loss": 0.1134, + "step": 47398 + }, + { + "epoch": 0.8799842568217345, + "grad_norm": 0.43817073106765747, + "learning_rate": 7.02303848953435e-07, + "loss": 0.4231, + "step": 47400 + }, + { + "epoch": 0.8800213869591531, + "grad_norm": 0.3878483176231384, + "learning_rate": 7.018744781154963e-07, + "loss": 0.4896, + "step": 47402 + }, + { + "epoch": 0.8800585170965718, + "grad_norm": 0.34842830896377563, + "learning_rate": 7.014452337971966e-07, + "loss": 0.3072, + "step": 47404 + }, + { + "epoch": 0.8800956472339904, + "grad_norm": 0.4477299451828003, + "learning_rate": 7.010161160043727e-07, + "loss": 0.3656, + "step": 47406 + }, + { + "epoch": 0.880132777371409, + "grad_norm": 0.5465213656425476, + "learning_rate": 7.005871247428675e-07, + "loss": 0.1192, + "step": 47408 + }, + { + "epoch": 0.8801699075088277, + "grad_norm": 0.2922872006893158, + "learning_rate": 7.001582600185164e-07, + "loss": 0.2156, + "step": 47410 + }, + { + "epoch": 0.8802070376462463, + "grad_norm": 0.6058338284492493, + "learning_rate": 6.997295218371536e-07, + "loss": 0.261, + "step": 47412 + }, + { + "epoch": 0.880244167783665, + "grad_norm": 0.39384642243385315, + "learning_rate": 6.993009102046144e-07, + "loss": 0.2768, + "step": 47414 + }, + { + "epoch": 0.8802812979210836, + "grad_norm": 0.37298253178596497, + "learning_rate": 6.988724251267343e-07, + "loss": 0.3585, + "step": 47416 + }, + { + "epoch": 0.8803184280585022, + "grad_norm": 0.3825553059577942, + "learning_rate": 6.984440666093373e-07, + "loss": 0.1971, + "step": 47418 + }, + { + "epoch": 0.8803555581959209, + "grad_norm": 0.4948826730251312, + "learning_rate": 6.980158346582555e-07, + "loss": 0.2619, + "step": 47420 + }, + { + "epoch": 0.8803926883333395, + "grad_norm": 0.4946448802947998, + "learning_rate": 6.975877292793154e-07, + "loss": 0.4049, + "step": 47422 + }, + { + "epoch": 0.8804298184707582, + "grad_norm": 0.3572228252887726, + "learning_rate": 6.971597504783422e-07, + "loss": 0.1871, + "step": 47424 + }, + { + "epoch": 0.8804669486081768, + "grad_norm": 0.2821890413761139, + "learning_rate": 6.967318982611604e-07, + "loss": 0.27, + "step": 47426 + }, + { + "epoch": 0.8805040787455954, + "grad_norm": 0.4213487505912781, + "learning_rate": 6.963041726335918e-07, + "loss": 0.1807, + "step": 47428 + }, + { + "epoch": 0.8805412088830141, + "grad_norm": 0.18453316390514374, + "learning_rate": 6.958765736014561e-07, + "loss": 0.238, + "step": 47430 + }, + { + "epoch": 0.8805783390204327, + "grad_norm": 0.4465988278388977, + "learning_rate": 6.954491011705689e-07, + "loss": 0.4429, + "step": 47432 + }, + { + "epoch": 0.8806154691578514, + "grad_norm": 0.5186497569084167, + "learning_rate": 6.950217553467497e-07, + "loss": 0.2183, + "step": 47434 + }, + { + "epoch": 0.88065259929527, + "grad_norm": 0.32859376072883606, + "learning_rate": 6.94594536135812e-07, + "loss": 0.219, + "step": 47436 + }, + { + "epoch": 0.8806897294326886, + "grad_norm": 0.49729156494140625, + "learning_rate": 6.941674435435708e-07, + "loss": 0.337, + "step": 47438 + }, + { + "epoch": 0.8807268595701073, + "grad_norm": 0.4608840048313141, + "learning_rate": 6.937404775758371e-07, + "loss": 0.2518, + "step": 47440 + }, + { + "epoch": 0.8807639897075259, + "grad_norm": 0.23189140856266022, + "learning_rate": 6.933136382384198e-07, + "loss": 0.2876, + "step": 47442 + }, + { + "epoch": 0.8808011198449446, + "grad_norm": 0.5379676222801208, + "learning_rate": 6.928869255371262e-07, + "loss": 0.2543, + "step": 47444 + }, + { + "epoch": 0.8808382499823632, + "grad_norm": 0.39725255966186523, + "learning_rate": 6.92460339477763e-07, + "loss": 0.2817, + "step": 47446 + }, + { + "epoch": 0.8808753801197818, + "grad_norm": 0.6109122037887573, + "learning_rate": 6.920338800661364e-07, + "loss": 0.2363, + "step": 47448 + }, + { + "epoch": 0.8809125102572004, + "grad_norm": 0.4034080505371094, + "learning_rate": 6.916075473080486e-07, + "loss": 0.2007, + "step": 47450 + }, + { + "epoch": 0.8809496403946191, + "grad_norm": 0.27598267793655396, + "learning_rate": 6.911813412092993e-07, + "loss": 0.2483, + "step": 47452 + }, + { + "epoch": 0.8809867705320378, + "grad_norm": 0.3307301700115204, + "learning_rate": 6.907552617756896e-07, + "loss": 0.389, + "step": 47454 + }, + { + "epoch": 0.8810239006694564, + "grad_norm": 0.5051745176315308, + "learning_rate": 6.903293090130158e-07, + "loss": 0.2884, + "step": 47456 + }, + { + "epoch": 0.881061030806875, + "grad_norm": 0.6947413682937622, + "learning_rate": 6.899034829270746e-07, + "loss": 0.2488, + "step": 47458 + }, + { + "epoch": 0.8810981609442936, + "grad_norm": 0.32664889097213745, + "learning_rate": 6.894777835236588e-07, + "loss": 0.4003, + "step": 47460 + }, + { + "epoch": 0.8811352910817123, + "grad_norm": 0.43620002269744873, + "learning_rate": 6.890522108085629e-07, + "loss": 0.1883, + "step": 47462 + }, + { + "epoch": 0.881172421219131, + "grad_norm": 0.3556111454963684, + "learning_rate": 6.886267647875766e-07, + "loss": 0.3559, + "step": 47464 + }, + { + "epoch": 0.8812095513565495, + "grad_norm": 0.49104854464530945, + "learning_rate": 6.882014454664909e-07, + "loss": 0.2284, + "step": 47466 + }, + { + "epoch": 0.8812466814939682, + "grad_norm": 0.39098259806632996, + "learning_rate": 6.877762528510901e-07, + "loss": 0.1079, + "step": 47468 + }, + { + "epoch": 0.8812838116313868, + "grad_norm": 0.39154544472694397, + "learning_rate": 6.873511869471616e-07, + "loss": 0.2838, + "step": 47470 + }, + { + "epoch": 0.8813209417688055, + "grad_norm": 0.2689366042613983, + "learning_rate": 6.869262477604899e-07, + "loss": 0.2355, + "step": 47472 + }, + { + "epoch": 0.8813580719062242, + "grad_norm": 0.4578331410884857, + "learning_rate": 6.865014352968547e-07, + "loss": 0.2281, + "step": 47474 + }, + { + "epoch": 0.8813952020436427, + "grad_norm": 0.3212837278842926, + "learning_rate": 6.860767495620369e-07, + "loss": 0.2668, + "step": 47476 + }, + { + "epoch": 0.8814323321810614, + "grad_norm": 0.34555530548095703, + "learning_rate": 6.856521905618185e-07, + "loss": 0.5203, + "step": 47478 + }, + { + "epoch": 0.88146946231848, + "grad_norm": 0.4289441704750061, + "learning_rate": 6.852277583019729e-07, + "loss": 0.3192, + "step": 47480 + }, + { + "epoch": 0.8815065924558987, + "grad_norm": 0.4660801887512207, + "learning_rate": 6.848034527882752e-07, + "loss": 0.2153, + "step": 47482 + }, + { + "epoch": 0.8815437225933174, + "grad_norm": 0.2465222179889679, + "learning_rate": 6.84379274026501e-07, + "loss": 0.2545, + "step": 47484 + }, + { + "epoch": 0.8815808527307359, + "grad_norm": 0.3759123384952545, + "learning_rate": 6.839552220224222e-07, + "loss": 0.3635, + "step": 47486 + }, + { + "epoch": 0.8816179828681546, + "grad_norm": 0.5579144954681396, + "learning_rate": 6.835312967818065e-07, + "loss": 0.2752, + "step": 47488 + }, + { + "epoch": 0.8816551130055732, + "grad_norm": 0.42652541399002075, + "learning_rate": 6.831074983104268e-07, + "loss": 0.2592, + "step": 47490 + }, + { + "epoch": 0.8816922431429919, + "grad_norm": 0.4003450870513916, + "learning_rate": 6.826838266140445e-07, + "loss": 0.3484, + "step": 47492 + }, + { + "epoch": 0.8817293732804105, + "grad_norm": 0.41991809010505676, + "learning_rate": 6.82260281698428e-07, + "loss": 0.3751, + "step": 47494 + }, + { + "epoch": 0.8817665034178291, + "grad_norm": 0.6549077033996582, + "learning_rate": 6.818368635693385e-07, + "loss": 0.2237, + "step": 47496 + }, + { + "epoch": 0.8818036335552478, + "grad_norm": 0.3264525234699249, + "learning_rate": 6.814135722325365e-07, + "loss": 0.2033, + "step": 47498 + }, + { + "epoch": 0.8818407636926664, + "grad_norm": 0.4776899814605713, + "learning_rate": 6.809904076937845e-07, + "loss": 0.4267, + "step": 47500 + }, + { + "epoch": 0.8818778938300851, + "grad_norm": 0.3332924544811249, + "learning_rate": 6.8056736995884e-07, + "loss": 0.1506, + "step": 47502 + }, + { + "epoch": 0.8819150239675037, + "grad_norm": 0.561475396156311, + "learning_rate": 6.801444590334594e-07, + "loss": 0.2508, + "step": 47504 + }, + { + "epoch": 0.8819521541049223, + "grad_norm": 0.422680139541626, + "learning_rate": 6.797216749233948e-07, + "loss": 0.3893, + "step": 47506 + }, + { + "epoch": 0.881989284242341, + "grad_norm": 0.5122622847557068, + "learning_rate": 6.792990176344017e-07, + "loss": 0.4203, + "step": 47508 + }, + { + "epoch": 0.8820264143797596, + "grad_norm": 0.44281259179115295, + "learning_rate": 6.788764871722298e-07, + "loss": 0.2362, + "step": 47510 + }, + { + "epoch": 0.8820635445171783, + "grad_norm": 0.4901391267776489, + "learning_rate": 6.784540835426301e-07, + "loss": 0.0903, + "step": 47512 + }, + { + "epoch": 0.8821006746545969, + "grad_norm": 0.5330834984779358, + "learning_rate": 6.780318067513469e-07, + "loss": 0.4631, + "step": 47514 + }, + { + "epoch": 0.8821378047920155, + "grad_norm": 0.29579687118530273, + "learning_rate": 6.776096568041312e-07, + "loss": 0.4088, + "step": 47516 + }, + { + "epoch": 0.8821749349294342, + "grad_norm": 0.3763114809989929, + "learning_rate": 6.771876337067218e-07, + "loss": 0.2277, + "step": 47518 + }, + { + "epoch": 0.8822120650668528, + "grad_norm": 0.34977829456329346, + "learning_rate": 6.76765737464864e-07, + "loss": 0.1521, + "step": 47520 + }, + { + "epoch": 0.8822491952042715, + "grad_norm": 0.8410979509353638, + "learning_rate": 6.763439680842987e-07, + "loss": 0.2808, + "step": 47522 + }, + { + "epoch": 0.88228632534169, + "grad_norm": 0.37012603878974915, + "learning_rate": 6.759223255707637e-07, + "loss": 0.2859, + "step": 47524 + }, + { + "epoch": 0.8823234554791087, + "grad_norm": 0.33140870928764343, + "learning_rate": 6.755008099299976e-07, + "loss": 0.276, + "step": 47526 + }, + { + "epoch": 0.8823605856165274, + "grad_norm": 0.5419058203697205, + "learning_rate": 6.750794211677348e-07, + "loss": 0.293, + "step": 47528 + }, + { + "epoch": 0.882397715753946, + "grad_norm": 0.383899986743927, + "learning_rate": 6.746581592897106e-07, + "loss": 0.4261, + "step": 47530 + }, + { + "epoch": 0.8824348458913647, + "grad_norm": 0.41641825437545776, + "learning_rate": 6.74237024301656e-07, + "loss": 0.2288, + "step": 47532 + }, + { + "epoch": 0.8824719760287832, + "grad_norm": 0.41651269793510437, + "learning_rate": 6.738160162093011e-07, + "loss": 0.3479, + "step": 47534 + }, + { + "epoch": 0.8825091061662019, + "grad_norm": 0.36212146282196045, + "learning_rate": 6.733951350183765e-07, + "loss": 0.3046, + "step": 47536 + }, + { + "epoch": 0.8825462363036206, + "grad_norm": 0.37156036496162415, + "learning_rate": 6.729743807346068e-07, + "loss": 0.3054, + "step": 47538 + }, + { + "epoch": 0.8825833664410392, + "grad_norm": 0.5652598142623901, + "learning_rate": 6.725537533637172e-07, + "loss": 0.1989, + "step": 47540 + }, + { + "epoch": 0.8826204965784579, + "grad_norm": 0.5056232810020447, + "learning_rate": 6.721332529114344e-07, + "loss": 0.3204, + "step": 47542 + }, + { + "epoch": 0.8826576267158764, + "grad_norm": 0.311834454536438, + "learning_rate": 6.717128793834749e-07, + "loss": 0.2458, + "step": 47544 + }, + { + "epoch": 0.8826947568532951, + "grad_norm": 0.2799263894557953, + "learning_rate": 6.712926327855629e-07, + "loss": 0.4316, + "step": 47546 + }, + { + "epoch": 0.8827318869907137, + "grad_norm": 0.3284052908420563, + "learning_rate": 6.708725131234151e-07, + "loss": 0.2834, + "step": 47548 + }, + { + "epoch": 0.8827690171281324, + "grad_norm": 0.31405022740364075, + "learning_rate": 6.70452520402749e-07, + "loss": 0.2321, + "step": 47550 + }, + { + "epoch": 0.8828061472655511, + "grad_norm": 0.4365288019180298, + "learning_rate": 6.700326546292779e-07, + "loss": 0.276, + "step": 47552 + }, + { + "epoch": 0.8828432774029696, + "grad_norm": 0.20331554114818573, + "learning_rate": 6.696129158087183e-07, + "loss": 0.161, + "step": 47554 + }, + { + "epoch": 0.8828804075403883, + "grad_norm": 0.5059957504272461, + "learning_rate": 6.691933039467768e-07, + "loss": 0.1668, + "step": 47556 + }, + { + "epoch": 0.8829175376778069, + "grad_norm": 0.269768089056015, + "learning_rate": 6.687738190491677e-07, + "loss": 0.3041, + "step": 47558 + }, + { + "epoch": 0.8829546678152256, + "grad_norm": 0.502941906452179, + "learning_rate": 6.683544611215953e-07, + "loss": 0.2919, + "step": 47560 + }, + { + "epoch": 0.8829917979526443, + "grad_norm": 0.33053064346313477, + "learning_rate": 6.679352301697661e-07, + "loss": 0.3925, + "step": 47562 + }, + { + "epoch": 0.8830289280900628, + "grad_norm": 0.4819253981113434, + "learning_rate": 6.675161261993868e-07, + "loss": 0.2319, + "step": 47564 + }, + { + "epoch": 0.8830660582274815, + "grad_norm": 0.17919844388961792, + "learning_rate": 6.670971492161593e-07, + "loss": 0.1961, + "step": 47566 + }, + { + "epoch": 0.8831031883649001, + "grad_norm": 0.2559063136577606, + "learning_rate": 6.666782992257837e-07, + "loss": 0.2309, + "step": 47568 + }, + { + "epoch": 0.8831403185023188, + "grad_norm": 0.32767990231513977, + "learning_rate": 6.662595762339597e-07, + "loss": 0.3885, + "step": 47570 + }, + { + "epoch": 0.8831774486397375, + "grad_norm": 0.5749056339263916, + "learning_rate": 6.65840980246385e-07, + "loss": 0.1389, + "step": 47572 + }, + { + "epoch": 0.883214578777156, + "grad_norm": 0.4745482802391052, + "learning_rate": 6.654225112687551e-07, + "loss": 0.2405, + "step": 47574 + }, + { + "epoch": 0.8832517089145747, + "grad_norm": 0.2447897046804428, + "learning_rate": 6.650041693067666e-07, + "loss": 0.196, + "step": 47576 + }, + { + "epoch": 0.8832888390519933, + "grad_norm": 0.35503894090652466, + "learning_rate": 6.645859543661082e-07, + "loss": 0.253, + "step": 47578 + }, + { + "epoch": 0.883325969189412, + "grad_norm": 0.5454164743423462, + "learning_rate": 6.641678664524742e-07, + "loss": 0.1569, + "step": 47580 + }, + { + "epoch": 0.8833630993268307, + "grad_norm": 0.43491607904434204, + "learning_rate": 6.637499055715491e-07, + "loss": 0.413, + "step": 47582 + }, + { + "epoch": 0.8834002294642492, + "grad_norm": 0.24743662774562836, + "learning_rate": 6.633320717290237e-07, + "loss": 0.1405, + "step": 47584 + }, + { + "epoch": 0.8834373596016679, + "grad_norm": 0.593473494052887, + "learning_rate": 6.629143649305814e-07, + "loss": 0.2301, + "step": 47586 + }, + { + "epoch": 0.8834744897390865, + "grad_norm": 0.49163755774497986, + "learning_rate": 6.624967851819065e-07, + "loss": 0.233, + "step": 47588 + }, + { + "epoch": 0.8835116198765052, + "grad_norm": 0.3950682282447815, + "learning_rate": 6.620793324886809e-07, + "loss": 0.5542, + "step": 47590 + }, + { + "epoch": 0.8835487500139239, + "grad_norm": 0.2998001277446747, + "learning_rate": 6.616620068565882e-07, + "loss": 0.1195, + "step": 47592 + }, + { + "epoch": 0.8835858801513424, + "grad_norm": 0.4829131066799164, + "learning_rate": 6.612448082913014e-07, + "loss": 0.3492, + "step": 47594 + }, + { + "epoch": 0.8836230102887611, + "grad_norm": 0.40049058198928833, + "learning_rate": 6.608277367985005e-07, + "loss": 0.3441, + "step": 47596 + }, + { + "epoch": 0.8836601404261797, + "grad_norm": 0.33591926097869873, + "learning_rate": 6.60410792383861e-07, + "loss": 0.239, + "step": 47598 + }, + { + "epoch": 0.8836972705635984, + "grad_norm": 0.5166113376617432, + "learning_rate": 6.599939750530537e-07, + "loss": 0.2638, + "step": 47600 + }, + { + "epoch": 0.8837344007010169, + "grad_norm": 0.3719906210899353, + "learning_rate": 6.595772848117532e-07, + "loss": 0.3025, + "step": 47602 + }, + { + "epoch": 0.8837715308384356, + "grad_norm": 0.3433866798877716, + "learning_rate": 6.591607216656281e-07, + "loss": 0.2995, + "step": 47604 + }, + { + "epoch": 0.8838086609758543, + "grad_norm": 0.4769951403141022, + "learning_rate": 6.587442856203464e-07, + "loss": 0.3557, + "step": 47606 + }, + { + "epoch": 0.8838457911132729, + "grad_norm": 0.6962431073188782, + "learning_rate": 6.583279766815731e-07, + "loss": 0.2031, + "step": 47608 + }, + { + "epoch": 0.8838829212506916, + "grad_norm": 0.49196648597717285, + "learning_rate": 6.579117948549751e-07, + "loss": 0.2142, + "step": 47610 + }, + { + "epoch": 0.8839200513881101, + "grad_norm": 0.25094518065452576, + "learning_rate": 6.574957401462156e-07, + "loss": 0.2941, + "step": 47612 + }, + { + "epoch": 0.8839571815255288, + "grad_norm": 0.43323275446891785, + "learning_rate": 6.570798125609535e-07, + "loss": 0.5006, + "step": 47614 + }, + { + "epoch": 0.8839943116629475, + "grad_norm": 0.3959193527698517, + "learning_rate": 6.566640121048506e-07, + "loss": 0.2995, + "step": 47616 + }, + { + "epoch": 0.8840314418003661, + "grad_norm": 0.285462886095047, + "learning_rate": 6.562483387835661e-07, + "loss": 0.258, + "step": 47618 + }, + { + "epoch": 0.8840685719377848, + "grad_norm": 0.27391335368156433, + "learning_rate": 6.55832792602753e-07, + "loss": 0.3142, + "step": 47620 + }, + { + "epoch": 0.8841057020752033, + "grad_norm": 0.45922282338142395, + "learning_rate": 6.554173735680657e-07, + "loss": 0.2239, + "step": 47622 + }, + { + "epoch": 0.884142832212622, + "grad_norm": 0.34486123919487, + "learning_rate": 6.550020816851577e-07, + "loss": 0.1943, + "step": 47624 + }, + { + "epoch": 0.8841799623500407, + "grad_norm": 0.525230884552002, + "learning_rate": 6.545869169596797e-07, + "loss": 0.1174, + "step": 47626 + }, + { + "epoch": 0.8842170924874593, + "grad_norm": 0.5819212794303894, + "learning_rate": 6.541718793972807e-07, + "loss": 0.2081, + "step": 47628 + }, + { + "epoch": 0.884254222624878, + "grad_norm": 0.5742965340614319, + "learning_rate": 6.537569690036105e-07, + "loss": 0.2302, + "step": 47630 + }, + { + "epoch": 0.8842913527622965, + "grad_norm": 0.26287642121315, + "learning_rate": 6.533421857843104e-07, + "loss": 0.1799, + "step": 47632 + }, + { + "epoch": 0.8843284828997152, + "grad_norm": 0.47685742378234863, + "learning_rate": 6.529275297450277e-07, + "loss": 0.3099, + "step": 47634 + }, + { + "epoch": 0.8843656130371339, + "grad_norm": 0.39257556200027466, + "learning_rate": 6.525130008914027e-07, + "loss": 0.3593, + "step": 47636 + }, + { + "epoch": 0.8844027431745525, + "grad_norm": 0.45302775502204895, + "learning_rate": 6.520985992290774e-07, + "loss": 0.3287, + "step": 47638 + }, + { + "epoch": 0.8844398733119712, + "grad_norm": 0.2566416263580322, + "learning_rate": 6.516843247636906e-07, + "loss": 0.2613, + "step": 47640 + }, + { + "epoch": 0.8844770034493897, + "grad_norm": 0.3361918330192566, + "learning_rate": 6.512701775008778e-07, + "loss": 0.1852, + "step": 47642 + }, + { + "epoch": 0.8845141335868084, + "grad_norm": 0.32291272282600403, + "learning_rate": 6.508561574462769e-07, + "loss": 0.3664, + "step": 47644 + }, + { + "epoch": 0.884551263724227, + "grad_norm": 0.3564322590827942, + "learning_rate": 6.504422646055175e-07, + "loss": 0.1937, + "step": 47646 + }, + { + "epoch": 0.8845883938616457, + "grad_norm": 0.32213884592056274, + "learning_rate": 6.500284989842342e-07, + "loss": 0.3376, + "step": 47648 + }, + { + "epoch": 0.8846255239990644, + "grad_norm": 0.3198153078556061, + "learning_rate": 6.49614860588057e-07, + "loss": 0.1232, + "step": 47650 + }, + { + "epoch": 0.8846626541364829, + "grad_norm": 0.3309187889099121, + "learning_rate": 6.492013494226135e-07, + "loss": 0.2061, + "step": 47652 + }, + { + "epoch": 0.8846997842739016, + "grad_norm": 0.5339601635932922, + "learning_rate": 6.487879654935314e-07, + "loss": 0.2892, + "step": 47654 + }, + { + "epoch": 0.8847369144113202, + "grad_norm": 0.4980430603027344, + "learning_rate": 6.483747088064351e-07, + "loss": 0.2893, + "step": 47656 + }, + { + "epoch": 0.8847740445487389, + "grad_norm": 0.4119316637516022, + "learning_rate": 6.479615793669469e-07, + "loss": 0.2262, + "step": 47658 + }, + { + "epoch": 0.8848111746861576, + "grad_norm": 0.16685384511947632, + "learning_rate": 6.4754857718069e-07, + "loss": 0.1812, + "step": 47660 + }, + { + "epoch": 0.8848483048235761, + "grad_norm": 0.5277692079544067, + "learning_rate": 6.471357022532831e-07, + "loss": 0.4157, + "step": 47662 + }, + { + "epoch": 0.8848854349609948, + "grad_norm": 0.4435460567474365, + "learning_rate": 6.46722954590343e-07, + "loss": 0.1823, + "step": 47664 + }, + { + "epoch": 0.8849225650984134, + "grad_norm": 0.33021536469459534, + "learning_rate": 6.463103341974875e-07, + "loss": 0.2036, + "step": 47666 + }, + { + "epoch": 0.8849596952358321, + "grad_norm": 0.39361444115638733, + "learning_rate": 6.45897841080333e-07, + "loss": 0.3195, + "step": 47668 + }, + { + "epoch": 0.8849968253732508, + "grad_norm": 0.37897324562072754, + "learning_rate": 6.454854752444872e-07, + "loss": 0.3483, + "step": 47670 + }, + { + "epoch": 0.8850339555106693, + "grad_norm": 0.3185615837574005, + "learning_rate": 6.450732366955648e-07, + "loss": 0.3074, + "step": 47672 + }, + { + "epoch": 0.885071085648088, + "grad_norm": 0.5312720537185669, + "learning_rate": 6.446611254391744e-07, + "loss": 0.2821, + "step": 47674 + }, + { + "epoch": 0.8851082157855066, + "grad_norm": 0.3081205189228058, + "learning_rate": 6.442491414809238e-07, + "loss": 0.2136, + "step": 47676 + }, + { + "epoch": 0.8851453459229253, + "grad_norm": 0.37605416774749756, + "learning_rate": 6.438372848264185e-07, + "loss": 0.2773, + "step": 47678 + }, + { + "epoch": 0.885182476060344, + "grad_norm": 0.3170308768749237, + "learning_rate": 6.43425555481264e-07, + "loss": 0.2276, + "step": 47680 + }, + { + "epoch": 0.8852196061977625, + "grad_norm": 0.368425190448761, + "learning_rate": 6.430139534510604e-07, + "loss": 0.1798, + "step": 47682 + }, + { + "epoch": 0.8852567363351812, + "grad_norm": 0.28887829184532166, + "learning_rate": 6.426024787414109e-07, + "loss": 0.2493, + "step": 47684 + }, + { + "epoch": 0.8852938664725998, + "grad_norm": 0.37934792041778564, + "learning_rate": 6.42191131357911e-07, + "loss": 0.2498, + "step": 47686 + }, + { + "epoch": 0.8853309966100185, + "grad_norm": 0.4310918152332306, + "learning_rate": 6.417799113061607e-07, + "loss": 0.1756, + "step": 47688 + }, + { + "epoch": 0.8853681267474371, + "grad_norm": 0.4184538722038269, + "learning_rate": 6.413688185917543e-07, + "loss": 0.2912, + "step": 47690 + }, + { + "epoch": 0.8854052568848557, + "grad_norm": 0.5241885185241699, + "learning_rate": 6.409578532202865e-07, + "loss": 0.2817, + "step": 47692 + }, + { + "epoch": 0.8854423870222744, + "grad_norm": 0.4601022005081177, + "learning_rate": 6.405470151973492e-07, + "loss": 0.4263, + "step": 47694 + }, + { + "epoch": 0.885479517159693, + "grad_norm": 0.6604556441307068, + "learning_rate": 6.401363045285314e-07, + "loss": 0.4602, + "step": 47696 + }, + { + "epoch": 0.8855166472971117, + "grad_norm": 0.44825366139411926, + "learning_rate": 6.397257212194219e-07, + "loss": 0.1669, + "step": 47698 + }, + { + "epoch": 0.8855537774345302, + "grad_norm": 0.339515745639801, + "learning_rate": 6.393152652756085e-07, + "loss": 0.3069, + "step": 47700 + }, + { + "epoch": 0.8855909075719489, + "grad_norm": 0.5635107159614563, + "learning_rate": 6.389049367026756e-07, + "loss": 0.3177, + "step": 47702 + }, + { + "epoch": 0.8856280377093676, + "grad_norm": 0.2347307950258255, + "learning_rate": 6.384947355062088e-07, + "loss": 0.1506, + "step": 47704 + }, + { + "epoch": 0.8856651678467862, + "grad_norm": 0.382336288690567, + "learning_rate": 6.380846616917879e-07, + "loss": 0.4081, + "step": 47706 + }, + { + "epoch": 0.8857022979842049, + "grad_norm": 0.2988913059234619, + "learning_rate": 6.376747152649898e-07, + "loss": 0.2154, + "step": 47708 + }, + { + "epoch": 0.8857394281216234, + "grad_norm": 0.272087424993515, + "learning_rate": 6.372648962313965e-07, + "loss": 0.2419, + "step": 47710 + }, + { + "epoch": 0.8857765582590421, + "grad_norm": 0.292536199092865, + "learning_rate": 6.368552045965826e-07, + "loss": 0.3579, + "step": 47712 + }, + { + "epoch": 0.8858136883964608, + "grad_norm": 0.4091271460056305, + "learning_rate": 6.364456403661245e-07, + "loss": 0.4355, + "step": 47714 + }, + { + "epoch": 0.8858508185338794, + "grad_norm": 0.3770033121109009, + "learning_rate": 6.360362035455936e-07, + "loss": 0.1661, + "step": 47716 + }, + { + "epoch": 0.885887948671298, + "grad_norm": 0.4006262421607971, + "learning_rate": 6.35626894140563e-07, + "loss": 0.2791, + "step": 47718 + }, + { + "epoch": 0.8859250788087166, + "grad_norm": 0.36582180857658386, + "learning_rate": 6.352177121565994e-07, + "loss": 0.1738, + "step": 47720 + }, + { + "epoch": 0.8859622089461353, + "grad_norm": 0.2699072062969208, + "learning_rate": 6.348086575992718e-07, + "loss": 0.1488, + "step": 47722 + }, + { + "epoch": 0.885999339083554, + "grad_norm": 0.39797115325927734, + "learning_rate": 6.343997304741467e-07, + "loss": 0.245, + "step": 47724 + }, + { + "epoch": 0.8860364692209726, + "grad_norm": 0.3836439847946167, + "learning_rate": 6.339909307867886e-07, + "loss": 0.5565, + "step": 47726 + }, + { + "epoch": 0.8860735993583913, + "grad_norm": 0.3095533549785614, + "learning_rate": 6.335822585427587e-07, + "loss": 0.209, + "step": 47728 + }, + { + "epoch": 0.8861107294958098, + "grad_norm": 0.4126643240451813, + "learning_rate": 6.331737137476202e-07, + "loss": 0.2995, + "step": 47730 + }, + { + "epoch": 0.8861478596332285, + "grad_norm": 0.5190250277519226, + "learning_rate": 6.327652964069298e-07, + "loss": 0.1056, + "step": 47732 + }, + { + "epoch": 0.8861849897706472, + "grad_norm": 0.3109513521194458, + "learning_rate": 6.323570065262441e-07, + "loss": 0.2578, + "step": 47734 + }, + { + "epoch": 0.8862221199080658, + "grad_norm": 0.27223071455955505, + "learning_rate": 6.31948844111121e-07, + "loss": 0.1946, + "step": 47736 + }, + { + "epoch": 0.8862592500454844, + "grad_norm": 0.29826822876930237, + "learning_rate": 6.315408091671138e-07, + "loss": 0.197, + "step": 47738 + }, + { + "epoch": 0.886296380182903, + "grad_norm": 0.7054978013038635, + "learning_rate": 6.311329016997736e-07, + "loss": 0.2928, + "step": 47740 + }, + { + "epoch": 0.8863335103203217, + "grad_norm": 0.4770456850528717, + "learning_rate": 6.307251217146526e-07, + "loss": 0.2706, + "step": 47742 + }, + { + "epoch": 0.8863706404577404, + "grad_norm": 0.4543619453907013, + "learning_rate": 6.303174692172997e-07, + "loss": 0.3125, + "step": 47744 + }, + { + "epoch": 0.886407770595159, + "grad_norm": 0.2943274676799774, + "learning_rate": 6.299099442132584e-07, + "loss": 0.2069, + "step": 47746 + }, + { + "epoch": 0.8864449007325776, + "grad_norm": 0.30973318219184875, + "learning_rate": 6.295025467080785e-07, + "loss": 0.3188, + "step": 47748 + }, + { + "epoch": 0.8864820308699962, + "grad_norm": 0.6491543054580688, + "learning_rate": 6.290952767072988e-07, + "loss": 0.2978, + "step": 47750 + }, + { + "epoch": 0.8865191610074149, + "grad_norm": 0.3792312741279602, + "learning_rate": 6.286881342164641e-07, + "loss": 0.2469, + "step": 47752 + }, + { + "epoch": 0.8865562911448335, + "grad_norm": 0.24689604341983795, + "learning_rate": 6.282811192411131e-07, + "loss": 0.3013, + "step": 47754 + }, + { + "epoch": 0.8865934212822522, + "grad_norm": 0.47705626487731934, + "learning_rate": 6.278742317867859e-07, + "loss": 0.2608, + "step": 47756 + }, + { + "epoch": 0.8866305514196708, + "grad_norm": 0.30380791425704956, + "learning_rate": 6.274674718590157e-07, + "loss": 0.4468, + "step": 47758 + }, + { + "epoch": 0.8866676815570894, + "grad_norm": 0.3462724983692169, + "learning_rate": 6.270608394633393e-07, + "loss": 0.5226, + "step": 47760 + }, + { + "epoch": 0.8867048116945081, + "grad_norm": 0.505673348903656, + "learning_rate": 6.2665433460529e-07, + "loss": 0.3019, + "step": 47762 + }, + { + "epoch": 0.8867419418319267, + "grad_norm": 0.37957751750946045, + "learning_rate": 6.26247957290399e-07, + "loss": 0.3277, + "step": 47764 + }, + { + "epoch": 0.8867790719693454, + "grad_norm": 0.40392905473709106, + "learning_rate": 6.258417075241973e-07, + "loss": 0.3923, + "step": 47766 + }, + { + "epoch": 0.886816202106764, + "grad_norm": 0.39244788885116577, + "learning_rate": 6.254355853122096e-07, + "loss": 0.3766, + "step": 47768 + }, + { + "epoch": 0.8868533322441826, + "grad_norm": 0.2624119520187378, + "learning_rate": 6.250295906599646e-07, + "loss": 0.2528, + "step": 47770 + }, + { + "epoch": 0.8868904623816013, + "grad_norm": 0.37868067622184753, + "learning_rate": 6.246237235729846e-07, + "loss": 0.1362, + "step": 47772 + }, + { + "epoch": 0.8869275925190199, + "grad_norm": 0.36689993739128113, + "learning_rate": 6.24217984056793e-07, + "loss": 0.1454, + "step": 47774 + }, + { + "epoch": 0.8869647226564386, + "grad_norm": 0.5005046725273132, + "learning_rate": 6.238123721169109e-07, + "loss": 0.2849, + "step": 47776 + }, + { + "epoch": 0.8870018527938572, + "grad_norm": 0.3269892930984497, + "learning_rate": 6.234068877588573e-07, + "loss": 0.3642, + "step": 47778 + }, + { + "epoch": 0.8870389829312758, + "grad_norm": 0.46620404720306396, + "learning_rate": 6.230015309881499e-07, + "loss": 0.266, + "step": 47780 + }, + { + "epoch": 0.8870761130686945, + "grad_norm": 0.3269730508327484, + "learning_rate": 6.225963018103054e-07, + "loss": 0.1691, + "step": 47782 + }, + { + "epoch": 0.8871132432061131, + "grad_norm": 0.3888849914073944, + "learning_rate": 6.22191200230835e-07, + "loss": 0.2703, + "step": 47784 + }, + { + "epoch": 0.8871503733435318, + "grad_norm": 0.31519120931625366, + "learning_rate": 6.217862262552532e-07, + "loss": 0.2591, + "step": 47786 + }, + { + "epoch": 0.8871875034809504, + "grad_norm": 0.398238867521286, + "learning_rate": 6.213813798890711e-07, + "loss": 0.2628, + "step": 47788 + }, + { + "epoch": 0.887224633618369, + "grad_norm": 0.48543480038642883, + "learning_rate": 6.209766611377954e-07, + "loss": 0.2232, + "step": 47790 + }, + { + "epoch": 0.8872617637557877, + "grad_norm": 0.3339442312717438, + "learning_rate": 6.205720700069329e-07, + "loss": 0.1835, + "step": 47792 + }, + { + "epoch": 0.8872988938932063, + "grad_norm": 0.5636805295944214, + "learning_rate": 6.201676065019924e-07, + "loss": 0.3088, + "step": 47794 + }, + { + "epoch": 0.887336024030625, + "grad_norm": 0.5825998783111572, + "learning_rate": 6.197632706284729e-07, + "loss": 0.4153, + "step": 47796 + }, + { + "epoch": 0.8873731541680435, + "grad_norm": 0.3999953866004944, + "learning_rate": 6.193590623918777e-07, + "loss": 0.2238, + "step": 47798 + }, + { + "epoch": 0.8874102843054622, + "grad_norm": 0.37967583537101746, + "learning_rate": 6.189549817977081e-07, + "loss": 0.4268, + "step": 47800 + }, + { + "epoch": 0.8874474144428809, + "grad_norm": 0.12930822372436523, + "learning_rate": 6.185510288514618e-07, + "loss": 0.242, + "step": 47802 + }, + { + "epoch": 0.8874845445802995, + "grad_norm": 0.5515783429145813, + "learning_rate": 6.181472035586344e-07, + "loss": 0.3378, + "step": 47804 + }, + { + "epoch": 0.8875216747177181, + "grad_norm": 0.34625184535980225, + "learning_rate": 6.177435059247238e-07, + "loss": 0.22, + "step": 47806 + }, + { + "epoch": 0.8875588048551367, + "grad_norm": 0.3984210789203644, + "learning_rate": 6.173399359552201e-07, + "loss": 0.2351, + "step": 47808 + }, + { + "epoch": 0.8875959349925554, + "grad_norm": 0.42594677209854126, + "learning_rate": 6.169364936556144e-07, + "loss": 0.344, + "step": 47810 + }, + { + "epoch": 0.8876330651299741, + "grad_norm": 0.34776416420936584, + "learning_rate": 6.165331790314e-07, + "loss": 0.2106, + "step": 47812 + }, + { + "epoch": 0.8876701952673927, + "grad_norm": 0.8937525749206543, + "learning_rate": 6.161299920880604e-07, + "loss": 0.266, + "step": 47814 + }, + { + "epoch": 0.8877073254048113, + "grad_norm": 0.30978524684906006, + "learning_rate": 6.157269328310833e-07, + "loss": 0.2721, + "step": 47816 + }, + { + "epoch": 0.8877444555422299, + "grad_norm": 0.407779723405838, + "learning_rate": 6.153240012659534e-07, + "loss": 0.1965, + "step": 47818 + }, + { + "epoch": 0.8877815856796486, + "grad_norm": 0.47428229451179504, + "learning_rate": 6.149211973981562e-07, + "loss": 0.3347, + "step": 47820 + }, + { + "epoch": 0.8878187158170673, + "grad_norm": 0.4276329278945923, + "learning_rate": 6.145185212331673e-07, + "loss": 0.4252, + "step": 47822 + }, + { + "epoch": 0.8878558459544859, + "grad_norm": 0.5838897228240967, + "learning_rate": 6.14115972776469e-07, + "loss": 0.4623, + "step": 47824 + }, + { + "epoch": 0.8878929760919045, + "grad_norm": 0.5917683839797974, + "learning_rate": 6.13713552033538e-07, + "loss": 0.2913, + "step": 47826 + }, + { + "epoch": 0.8879301062293231, + "grad_norm": 0.6393962502479553, + "learning_rate": 6.1331125900985e-07, + "loss": 0.3586, + "step": 47828 + }, + { + "epoch": 0.8879672363667418, + "grad_norm": 0.4340791702270508, + "learning_rate": 6.129090937108817e-07, + "loss": 0.1538, + "step": 47830 + }, + { + "epoch": 0.8880043665041605, + "grad_norm": 0.3339197337627411, + "learning_rate": 6.125070561421009e-07, + "loss": 0.1525, + "step": 47832 + }, + { + "epoch": 0.888041496641579, + "grad_norm": 0.3617398142814636, + "learning_rate": 6.121051463089822e-07, + "loss": 0.2503, + "step": 47834 + }, + { + "epoch": 0.8880786267789977, + "grad_norm": 0.259944885969162, + "learning_rate": 6.117033642169911e-07, + "loss": 0.1742, + "step": 47836 + }, + { + "epoch": 0.8881157569164163, + "grad_norm": 0.435729056596756, + "learning_rate": 6.113017098715957e-07, + "loss": 0.356, + "step": 47838 + }, + { + "epoch": 0.888152887053835, + "grad_norm": 0.2981502413749695, + "learning_rate": 6.109001832782624e-07, + "loss": 0.1366, + "step": 47840 + }, + { + "epoch": 0.8881900171912537, + "grad_norm": 0.5559622049331665, + "learning_rate": 6.104987844424526e-07, + "loss": 0.2665, + "step": 47842 + }, + { + "epoch": 0.8882271473286723, + "grad_norm": 0.3082905113697052, + "learning_rate": 6.100975133696318e-07, + "loss": 0.1721, + "step": 47844 + }, + { + "epoch": 0.8882642774660909, + "grad_norm": 0.2573748826980591, + "learning_rate": 6.096963700652569e-07, + "loss": 0.3917, + "step": 47846 + }, + { + "epoch": 0.8883014076035095, + "grad_norm": 0.28783687949180603, + "learning_rate": 6.092953545347857e-07, + "loss": 0.2502, + "step": 47848 + }, + { + "epoch": 0.8883385377409282, + "grad_norm": 0.25555160641670227, + "learning_rate": 6.088944667836772e-07, + "loss": 0.2047, + "step": 47850 + }, + { + "epoch": 0.8883756678783468, + "grad_norm": 0.3898650109767914, + "learning_rate": 6.084937068173868e-07, + "loss": 0.1294, + "step": 47852 + }, + { + "epoch": 0.8884127980157654, + "grad_norm": 0.35594040155410767, + "learning_rate": 6.080930746413649e-07, + "loss": 0.2457, + "step": 47854 + }, + { + "epoch": 0.8884499281531841, + "grad_norm": 0.20641061663627625, + "learning_rate": 6.076925702610659e-07, + "loss": 0.13, + "step": 47856 + }, + { + "epoch": 0.8884870582906027, + "grad_norm": 0.3245171904563904, + "learning_rate": 6.072921936819365e-07, + "loss": 0.3288, + "step": 47858 + }, + { + "epoch": 0.8885241884280214, + "grad_norm": 0.4686327874660492, + "learning_rate": 6.068919449094246e-07, + "loss": 0.2744, + "step": 47860 + }, + { + "epoch": 0.88856131856544, + "grad_norm": 0.2799648940563202, + "learning_rate": 6.064918239489792e-07, + "loss": 0.3497, + "step": 47862 + }, + { + "epoch": 0.8885984487028586, + "grad_norm": 0.30769261717796326, + "learning_rate": 6.060918308060426e-07, + "loss": 0.1527, + "step": 47864 + }, + { + "epoch": 0.8886355788402773, + "grad_norm": 0.4349844753742218, + "learning_rate": 6.056919654860583e-07, + "loss": 0.4599, + "step": 47866 + }, + { + "epoch": 0.8886727089776959, + "grad_norm": 0.26441556215286255, + "learning_rate": 6.052922279944661e-07, + "loss": 0.123, + "step": 47868 + }, + { + "epoch": 0.8887098391151146, + "grad_norm": 0.4737924039363861, + "learning_rate": 6.048926183367098e-07, + "loss": 0.389, + "step": 47870 + }, + { + "epoch": 0.8887469692525332, + "grad_norm": 0.23293496668338776, + "learning_rate": 6.044931365182205e-07, + "loss": 0.2639, + "step": 47872 + }, + { + "epoch": 0.8887840993899518, + "grad_norm": 0.5110318064689636, + "learning_rate": 6.040937825444382e-07, + "loss": 0.2486, + "step": 47874 + }, + { + "epoch": 0.8888212295273705, + "grad_norm": 0.33499446511268616, + "learning_rate": 6.036945564207941e-07, + "loss": 0.2498, + "step": 47876 + }, + { + "epoch": 0.8888583596647891, + "grad_norm": 0.378844290971756, + "learning_rate": 6.032954581527228e-07, + "loss": 0.305, + "step": 47878 + }, + { + "epoch": 0.8888954898022078, + "grad_norm": 0.3518020808696747, + "learning_rate": 6.028964877456533e-07, + "loss": 0.1669, + "step": 47880 + }, + { + "epoch": 0.8889326199396264, + "grad_norm": 0.33926478028297424, + "learning_rate": 6.024976452050169e-07, + "loss": 0.3425, + "step": 47882 + }, + { + "epoch": 0.888969750077045, + "grad_norm": 0.42721250653266907, + "learning_rate": 6.020989305362368e-07, + "loss": 0.1519, + "step": 47884 + }, + { + "epoch": 0.8890068802144637, + "grad_norm": 0.5861724615097046, + "learning_rate": 6.017003437447399e-07, + "loss": 0.2668, + "step": 47886 + }, + { + "epoch": 0.8890440103518823, + "grad_norm": 0.27624666690826416, + "learning_rate": 6.013018848359508e-07, + "loss": 0.2564, + "step": 47888 + }, + { + "epoch": 0.889081140489301, + "grad_norm": 0.3486524820327759, + "learning_rate": 6.009035538152907e-07, + "loss": 0.4284, + "step": 47890 + }, + { + "epoch": 0.8891182706267196, + "grad_norm": 0.33791929483413696, + "learning_rate": 6.005053506881797e-07, + "loss": 0.272, + "step": 47892 + }, + { + "epoch": 0.8891554007641382, + "grad_norm": 0.3999594748020172, + "learning_rate": 6.001072754600367e-07, + "loss": 0.2798, + "step": 47894 + }, + { + "epoch": 0.8891925309015569, + "grad_norm": 0.44175106287002563, + "learning_rate": 5.997093281362787e-07, + "loss": 0.4866, + "step": 47896 + }, + { + "epoch": 0.8892296610389755, + "grad_norm": 0.4189186096191406, + "learning_rate": 5.993115087223178e-07, + "loss": 0.3025, + "step": 47898 + }, + { + "epoch": 0.8892667911763942, + "grad_norm": 0.40308451652526855, + "learning_rate": 5.989138172235687e-07, + "loss": 0.2799, + "step": 47900 + }, + { + "epoch": 0.8893039213138128, + "grad_norm": 0.2144613265991211, + "learning_rate": 5.985162536454436e-07, + "loss": 0.3259, + "step": 47902 + }, + { + "epoch": 0.8893410514512314, + "grad_norm": 0.26510271430015564, + "learning_rate": 5.981188179933517e-07, + "loss": 0.4199, + "step": 47904 + }, + { + "epoch": 0.88937818158865, + "grad_norm": 0.5337905883789062, + "learning_rate": 5.977215102727008e-07, + "loss": 0.227, + "step": 47906 + }, + { + "epoch": 0.8894153117260687, + "grad_norm": 0.2218475490808487, + "learning_rate": 5.973243304888976e-07, + "loss": 0.2282, + "step": 47908 + }, + { + "epoch": 0.8894524418634874, + "grad_norm": 1.2308385372161865, + "learning_rate": 5.969272786473457e-07, + "loss": 0.3518, + "step": 47910 + }, + { + "epoch": 0.889489572000906, + "grad_norm": 0.4328368902206421, + "learning_rate": 5.965303547534474e-07, + "loss": 0.3292, + "step": 47912 + }, + { + "epoch": 0.8895267021383246, + "grad_norm": 0.4345044791698456, + "learning_rate": 5.961335588126038e-07, + "loss": 0.1753, + "step": 47914 + }, + { + "epoch": 0.8895638322757432, + "grad_norm": 0.2850533723831177, + "learning_rate": 5.957368908302175e-07, + "loss": 0.2642, + "step": 47916 + }, + { + "epoch": 0.8896009624131619, + "grad_norm": 0.5604518055915833, + "learning_rate": 5.953403508116806e-07, + "loss": 0.2437, + "step": 47918 + }, + { + "epoch": 0.8896380925505806, + "grad_norm": 0.6172801852226257, + "learning_rate": 5.949439387623923e-07, + "loss": 0.5686, + "step": 47920 + }, + { + "epoch": 0.8896752226879991, + "grad_norm": 0.5851142406463623, + "learning_rate": 5.945476546877449e-07, + "loss": 0.3085, + "step": 47922 + }, + { + "epoch": 0.8897123528254178, + "grad_norm": 0.35563135147094727, + "learning_rate": 5.941514985931318e-07, + "loss": 0.2781, + "step": 47924 + }, + { + "epoch": 0.8897494829628364, + "grad_norm": 0.31452447175979614, + "learning_rate": 5.937554704839421e-07, + "loss": 0.2733, + "step": 47926 + }, + { + "epoch": 0.8897866131002551, + "grad_norm": 0.41809457540512085, + "learning_rate": 5.933595703655659e-07, + "loss": 0.2288, + "step": 47928 + }, + { + "epoch": 0.8898237432376738, + "grad_norm": 0.5139526724815369, + "learning_rate": 5.9296379824339e-07, + "loss": 0.1626, + "step": 47930 + }, + { + "epoch": 0.8898608733750923, + "grad_norm": 0.3840597867965698, + "learning_rate": 5.925681541228001e-07, + "loss": 0.3106, + "step": 47932 + }, + { + "epoch": 0.889898003512511, + "grad_norm": 0.5569860935211182, + "learning_rate": 5.921726380091785e-07, + "loss": 0.3707, + "step": 47934 + }, + { + "epoch": 0.8899351336499296, + "grad_norm": 0.4887702763080597, + "learning_rate": 5.917772499079078e-07, + "loss": 0.3392, + "step": 47936 + }, + { + "epoch": 0.8899722637873483, + "grad_norm": 0.31022652983665466, + "learning_rate": 5.913819898243678e-07, + "loss": 0.3608, + "step": 47938 + }, + { + "epoch": 0.890009393924767, + "grad_norm": 0.3625992238521576, + "learning_rate": 5.909868577639366e-07, + "loss": 0.546, + "step": 47940 + }, + { + "epoch": 0.8900465240621855, + "grad_norm": 0.7216010689735413, + "learning_rate": 5.90591853731991e-07, + "loss": 0.1109, + "step": 47942 + }, + { + "epoch": 0.8900836541996042, + "grad_norm": 0.4317566156387329, + "learning_rate": 5.901969777339056e-07, + "loss": 0.1985, + "step": 47944 + }, + { + "epoch": 0.8901207843370228, + "grad_norm": 0.3587350845336914, + "learning_rate": 5.898022297750549e-07, + "loss": 0.2373, + "step": 47946 + }, + { + "epoch": 0.8901579144744415, + "grad_norm": 0.2826683223247528, + "learning_rate": 5.894076098608081e-07, + "loss": 0.3457, + "step": 47948 + }, + { + "epoch": 0.89019504461186, + "grad_norm": 0.5661956071853638, + "learning_rate": 5.890131179965353e-07, + "loss": 0.2071, + "step": 47950 + }, + { + "epoch": 0.8902321747492787, + "grad_norm": 0.27288126945495605, + "learning_rate": 5.886187541876054e-07, + "loss": 0.0497, + "step": 47952 + }, + { + "epoch": 0.8902693048866974, + "grad_norm": 0.3725036680698395, + "learning_rate": 5.882245184393831e-07, + "loss": 0.3025, + "step": 47954 + }, + { + "epoch": 0.890306435024116, + "grad_norm": 0.531634509563446, + "learning_rate": 5.878304107572341e-07, + "loss": 0.2664, + "step": 47956 + }, + { + "epoch": 0.8903435651615347, + "grad_norm": 0.3659232556819916, + "learning_rate": 5.87436431146522e-07, + "loss": 0.1179, + "step": 47958 + }, + { + "epoch": 0.8903806952989533, + "grad_norm": 1.1382054090499878, + "learning_rate": 5.870425796126067e-07, + "loss": 0.221, + "step": 47960 + }, + { + "epoch": 0.8904178254363719, + "grad_norm": 0.4274129867553711, + "learning_rate": 5.866488561608442e-07, + "loss": 0.1932, + "step": 47962 + }, + { + "epoch": 0.8904549555737906, + "grad_norm": 0.36734524369239807, + "learning_rate": 5.862552607965954e-07, + "loss": 0.303, + "step": 47964 + }, + { + "epoch": 0.8904920857112092, + "grad_norm": 0.37382832169532776, + "learning_rate": 5.858617935252142e-07, + "loss": 0.3215, + "step": 47966 + }, + { + "epoch": 0.8905292158486279, + "grad_norm": 0.28260767459869385, + "learning_rate": 5.854684543520561e-07, + "loss": 0.4027, + "step": 47968 + }, + { + "epoch": 0.8905663459860464, + "grad_norm": 0.5251832008361816, + "learning_rate": 5.850752432824735e-07, + "loss": 0.2231, + "step": 47970 + }, + { + "epoch": 0.8906034761234651, + "grad_norm": 0.3341313898563385, + "learning_rate": 5.846821603218145e-07, + "loss": 0.2017, + "step": 47972 + }, + { + "epoch": 0.8906406062608838, + "grad_norm": 0.41732171177864075, + "learning_rate": 5.842892054754301e-07, + "loss": 0.2607, + "step": 47974 + }, + { + "epoch": 0.8906777363983024, + "grad_norm": 0.18817701935768127, + "learning_rate": 5.838963787486651e-07, + "loss": 0.2496, + "step": 47976 + }, + { + "epoch": 0.8907148665357211, + "grad_norm": 0.5483757853507996, + "learning_rate": 5.835036801468663e-07, + "loss": 0.4654, + "step": 47978 + }, + { + "epoch": 0.8907519966731396, + "grad_norm": 0.5468409061431885, + "learning_rate": 5.831111096753784e-07, + "loss": 0.4133, + "step": 47980 + }, + { + "epoch": 0.8907891268105583, + "grad_norm": 0.41872891783714294, + "learning_rate": 5.827186673395401e-07, + "loss": 0.1973, + "step": 47982 + }, + { + "epoch": 0.890826256947977, + "grad_norm": 0.3206910490989685, + "learning_rate": 5.823263531446932e-07, + "loss": 0.207, + "step": 47984 + }, + { + "epoch": 0.8908633870853956, + "grad_norm": 0.320258229970932, + "learning_rate": 5.819341670961753e-07, + "loss": 0.2711, + "step": 47986 + }, + { + "epoch": 0.8909005172228143, + "grad_norm": 0.43495386838912964, + "learning_rate": 5.815421091993223e-07, + "loss": 0.315, + "step": 47988 + }, + { + "epoch": 0.8909376473602328, + "grad_norm": 0.36082538962364197, + "learning_rate": 5.811501794594698e-07, + "loss": 0.417, + "step": 47990 + }, + { + "epoch": 0.8909747774976515, + "grad_norm": 0.3062189817428589, + "learning_rate": 5.807583778819503e-07, + "loss": 0.1601, + "step": 47992 + }, + { + "epoch": 0.8910119076350702, + "grad_norm": 0.2824788987636566, + "learning_rate": 5.803667044720951e-07, + "loss": 0.2261, + "step": 47994 + }, + { + "epoch": 0.8910490377724888, + "grad_norm": 0.30771294236183167, + "learning_rate": 5.799751592352366e-07, + "loss": 0.208, + "step": 47996 + }, + { + "epoch": 0.8910861679099075, + "grad_norm": 0.35971924662590027, + "learning_rate": 5.795837421766981e-07, + "loss": 0.1885, + "step": 47998 + }, + { + "epoch": 0.891123298047326, + "grad_norm": 0.22116224467754364, + "learning_rate": 5.791924533018068e-07, + "loss": 0.1325, + "step": 48000 + }, + { + "epoch": 0.8911604281847447, + "grad_norm": 0.4358120858669281, + "learning_rate": 5.788012926158892e-07, + "loss": 0.2435, + "step": 48002 + }, + { + "epoch": 0.8911975583221633, + "grad_norm": 0.3738580048084259, + "learning_rate": 5.784102601242659e-07, + "loss": 0.3359, + "step": 48004 + }, + { + "epoch": 0.891234688459582, + "grad_norm": 0.4875073730945587, + "learning_rate": 5.780193558322567e-07, + "loss": 0.1977, + "step": 48006 + }, + { + "epoch": 0.8912718185970007, + "grad_norm": 0.4058280289173126, + "learning_rate": 5.776285797451841e-07, + "loss": 0.3417, + "step": 48008 + }, + { + "epoch": 0.8913089487344192, + "grad_norm": 0.5195993185043335, + "learning_rate": 5.772379318683619e-07, + "loss": 0.315, + "step": 48010 + }, + { + "epoch": 0.8913460788718379, + "grad_norm": 0.42354103922843933, + "learning_rate": 5.768474122071066e-07, + "loss": 0.3266, + "step": 48012 + }, + { + "epoch": 0.8913832090092565, + "grad_norm": 0.7197782397270203, + "learning_rate": 5.76457020766733e-07, + "loss": 0.258, + "step": 48014 + }, + { + "epoch": 0.8914203391466752, + "grad_norm": 0.21858695149421692, + "learning_rate": 5.760667575525513e-07, + "loss": 0.1692, + "step": 48016 + }, + { + "epoch": 0.8914574692840939, + "grad_norm": 0.2296631634235382, + "learning_rate": 5.75676622569874e-07, + "loss": 0.0613, + "step": 48018 + }, + { + "epoch": 0.8914945994215124, + "grad_norm": 0.3595762848854065, + "learning_rate": 5.7528661582401e-07, + "loss": 0.1777, + "step": 48020 + }, + { + "epoch": 0.8915317295589311, + "grad_norm": 0.4881545901298523, + "learning_rate": 5.748967373202641e-07, + "loss": 0.3185, + "step": 48022 + }, + { + "epoch": 0.8915688596963497, + "grad_norm": 0.376855731010437, + "learning_rate": 5.74506987063943e-07, + "loss": 0.1649, + "step": 48024 + }, + { + "epoch": 0.8916059898337684, + "grad_norm": 0.4989224970340729, + "learning_rate": 5.741173650603482e-07, + "loss": 0.3102, + "step": 48026 + }, + { + "epoch": 0.8916431199711871, + "grad_norm": 0.5492688417434692, + "learning_rate": 5.737278713147809e-07, + "loss": 0.3578, + "step": 48028 + }, + { + "epoch": 0.8916802501086056, + "grad_norm": 0.24938735365867615, + "learning_rate": 5.733385058325436e-07, + "loss": 0.2354, + "step": 48030 + }, + { + "epoch": 0.8917173802460243, + "grad_norm": 0.4122154116630554, + "learning_rate": 5.729492686189331e-07, + "loss": 0.2711, + "step": 48032 + }, + { + "epoch": 0.8917545103834429, + "grad_norm": 0.20196297764778137, + "learning_rate": 5.725601596792474e-07, + "loss": 0.2054, + "step": 48034 + }, + { + "epoch": 0.8917916405208616, + "grad_norm": 0.3957515060901642, + "learning_rate": 5.721711790187768e-07, + "loss": 0.2445, + "step": 48036 + }, + { + "epoch": 0.8918287706582803, + "grad_norm": 0.5129810571670532, + "learning_rate": 5.71782326642818e-07, + "loss": 0.374, + "step": 48038 + }, + { + "epoch": 0.8918659007956988, + "grad_norm": 0.400850385427475, + "learning_rate": 5.713936025566602e-07, + "loss": 0.0983, + "step": 48040 + }, + { + "epoch": 0.8919030309331175, + "grad_norm": 0.7423719763755798, + "learning_rate": 5.710050067655959e-07, + "loss": 0.3056, + "step": 48042 + }, + { + "epoch": 0.8919401610705361, + "grad_norm": 0.2746523916721344, + "learning_rate": 5.706165392749075e-07, + "loss": 0.2498, + "step": 48044 + }, + { + "epoch": 0.8919772912079548, + "grad_norm": 0.3743668496608734, + "learning_rate": 5.702282000898863e-07, + "loss": 0.1756, + "step": 48046 + }, + { + "epoch": 0.8920144213453735, + "grad_norm": 0.3331935405731201, + "learning_rate": 5.698399892158124e-07, + "loss": 0.0956, + "step": 48048 + }, + { + "epoch": 0.892051551482792, + "grad_norm": 0.3176688551902771, + "learning_rate": 5.694519066579695e-07, + "loss": 0.215, + "step": 48050 + }, + { + "epoch": 0.8920886816202107, + "grad_norm": 0.4722760319709778, + "learning_rate": 5.690639524216379e-07, + "loss": 0.3981, + "step": 48052 + }, + { + "epoch": 0.8921258117576293, + "grad_norm": 0.5450557470321655, + "learning_rate": 5.686761265120977e-07, + "loss": 0.1765, + "step": 48054 + }, + { + "epoch": 0.892162941895048, + "grad_norm": 0.3873656094074249, + "learning_rate": 5.682884289346247e-07, + "loss": 0.1573, + "step": 48056 + }, + { + "epoch": 0.8922000720324665, + "grad_norm": 0.37894609570503235, + "learning_rate": 5.679008596944969e-07, + "loss": 0.3798, + "step": 48058 + }, + { + "epoch": 0.8922372021698852, + "grad_norm": 0.5657567977905273, + "learning_rate": 5.675134187969833e-07, + "loss": 0.4067, + "step": 48060 + }, + { + "epoch": 0.8922743323073039, + "grad_norm": 0.27479425072669983, + "learning_rate": 5.671261062473599e-07, + "loss": 0.1712, + "step": 48062 + }, + { + "epoch": 0.8923114624447225, + "grad_norm": 0.3373755216598511, + "learning_rate": 5.667389220508946e-07, + "loss": 0.281, + "step": 48064 + }, + { + "epoch": 0.8923485925821412, + "grad_norm": 0.23779384791851044, + "learning_rate": 5.663518662128586e-07, + "loss": 0.1429, + "step": 48066 + }, + { + "epoch": 0.8923857227195597, + "grad_norm": 0.6319172978401184, + "learning_rate": 5.659649387385158e-07, + "loss": 0.4282, + "step": 48068 + }, + { + "epoch": 0.8924228528569784, + "grad_norm": 0.4576117992401123, + "learning_rate": 5.655781396331306e-07, + "loss": 0.3398, + "step": 48070 + }, + { + "epoch": 0.8924599829943971, + "grad_norm": 0.36553531885147095, + "learning_rate": 5.6519146890197e-07, + "loss": 0.4287, + "step": 48072 + }, + { + "epoch": 0.8924971131318157, + "grad_norm": 0.4878500998020172, + "learning_rate": 5.64804926550292e-07, + "loss": 0.2953, + "step": 48074 + }, + { + "epoch": 0.8925342432692344, + "grad_norm": 0.4361942410469055, + "learning_rate": 5.644185125833557e-07, + "loss": 0.2574, + "step": 48076 + }, + { + "epoch": 0.8925713734066529, + "grad_norm": 0.16884243488311768, + "learning_rate": 5.640322270064214e-07, + "loss": 0.075, + "step": 48078 + }, + { + "epoch": 0.8926085035440716, + "grad_norm": 0.2505477964878082, + "learning_rate": 5.636460698247448e-07, + "loss": 0.3663, + "step": 48080 + }, + { + "epoch": 0.8926456336814903, + "grad_norm": 0.4852755069732666, + "learning_rate": 5.632600410435796e-07, + "loss": 0.379, + "step": 48082 + }, + { + "epoch": 0.8926827638189089, + "grad_norm": 0.24749785661697388, + "learning_rate": 5.628741406681815e-07, + "loss": 0.194, + "step": 48084 + }, + { + "epoch": 0.8927198939563276, + "grad_norm": 0.4386451542377472, + "learning_rate": 5.624883687037964e-07, + "loss": 0.3711, + "step": 48086 + }, + { + "epoch": 0.8927570240937461, + "grad_norm": 0.27535343170166016, + "learning_rate": 5.621027251556777e-07, + "loss": 0.0969, + "step": 48088 + }, + { + "epoch": 0.8927941542311648, + "grad_norm": 0.38260334730148315, + "learning_rate": 5.617172100290691e-07, + "loss": 0.2146, + "step": 48090 + }, + { + "epoch": 0.8928312843685835, + "grad_norm": 0.43588241934776306, + "learning_rate": 5.613318233292187e-07, + "loss": 0.4238, + "step": 48092 + }, + { + "epoch": 0.8928684145060021, + "grad_norm": 0.39827802777290344, + "learning_rate": 5.609465650613699e-07, + "loss": 0.2679, + "step": 48094 + }, + { + "epoch": 0.8929055446434208, + "grad_norm": 0.5190327167510986, + "learning_rate": 5.605614352307675e-07, + "loss": 0.3406, + "step": 48096 + }, + { + "epoch": 0.8929426747808393, + "grad_norm": 0.8550142049789429, + "learning_rate": 5.601764338426474e-07, + "loss": 0.3625, + "step": 48098 + }, + { + "epoch": 0.892979804918258, + "grad_norm": 0.5571878552436829, + "learning_rate": 5.597915609022498e-07, + "loss": 0.2583, + "step": 48100 + }, + { + "epoch": 0.8930169350556766, + "grad_norm": 0.4084494709968567, + "learning_rate": 5.594068164148137e-07, + "loss": 0.3143, + "step": 48102 + }, + { + "epoch": 0.8930540651930953, + "grad_norm": 0.35584506392478943, + "learning_rate": 5.590222003855716e-07, + "loss": 0.2047, + "step": 48104 + }, + { + "epoch": 0.893091195330514, + "grad_norm": 0.3852481544017792, + "learning_rate": 5.586377128197606e-07, + "loss": 0.3441, + "step": 48106 + }, + { + "epoch": 0.8931283254679325, + "grad_norm": 0.36523163318634033, + "learning_rate": 5.582533537226087e-07, + "loss": 0.2334, + "step": 48108 + }, + { + "epoch": 0.8931654556053512, + "grad_norm": 0.6220179200172424, + "learning_rate": 5.578691230993493e-07, + "loss": 0.188, + "step": 48110 + }, + { + "epoch": 0.8932025857427698, + "grad_norm": 0.3718593716621399, + "learning_rate": 5.574850209552063e-07, + "loss": 0.2723, + "step": 48112 + }, + { + "epoch": 0.8932397158801885, + "grad_norm": 0.2846348285675049, + "learning_rate": 5.571010472954097e-07, + "loss": 0.2902, + "step": 48114 + }, + { + "epoch": 0.8932768460176072, + "grad_norm": 0.49333465099334717, + "learning_rate": 5.567172021251821e-07, + "loss": 0.4848, + "step": 48116 + }, + { + "epoch": 0.8933139761550257, + "grad_norm": 0.5860490798950195, + "learning_rate": 5.563334854497482e-07, + "loss": 0.4137, + "step": 48118 + }, + { + "epoch": 0.8933511062924444, + "grad_norm": 0.3317994773387909, + "learning_rate": 5.559498972743272e-07, + "loss": 0.2115, + "step": 48120 + }, + { + "epoch": 0.893388236429863, + "grad_norm": 0.5351721048355103, + "learning_rate": 5.555664376041426e-07, + "loss": 0.2338, + "step": 48122 + }, + { + "epoch": 0.8934253665672817, + "grad_norm": 0.3390806019306183, + "learning_rate": 5.55183106444408e-07, + "loss": 0.2585, + "step": 48124 + }, + { + "epoch": 0.8934624967047003, + "grad_norm": 0.44500523805618286, + "learning_rate": 5.547999038003404e-07, + "loss": 0.2105, + "step": 48126 + }, + { + "epoch": 0.8934996268421189, + "grad_norm": 0.27266427874565125, + "learning_rate": 5.544168296771557e-07, + "loss": 0.1557, + "step": 48128 + }, + { + "epoch": 0.8935367569795376, + "grad_norm": 0.5000883936882019, + "learning_rate": 5.540338840800641e-07, + "loss": 0.255, + "step": 48130 + }, + { + "epoch": 0.8935738871169562, + "grad_norm": 0.4541560709476471, + "learning_rate": 5.53651067014277e-07, + "loss": 0.2206, + "step": 48132 + }, + { + "epoch": 0.8936110172543749, + "grad_norm": 0.6393914818763733, + "learning_rate": 5.532683784850057e-07, + "loss": 0.171, + "step": 48134 + }, + { + "epoch": 0.8936481473917935, + "grad_norm": 0.300985187292099, + "learning_rate": 5.528858184974539e-07, + "loss": 0.1264, + "step": 48136 + }, + { + "epoch": 0.8936852775292121, + "grad_norm": 0.5072539448738098, + "learning_rate": 5.525033870568286e-07, + "loss": 0.4321, + "step": 48138 + }, + { + "epoch": 0.8937224076666308, + "grad_norm": 0.38438692688941956, + "learning_rate": 5.521210841683333e-07, + "loss": 0.1433, + "step": 48140 + }, + { + "epoch": 0.8937595378040494, + "grad_norm": 0.29546865820884705, + "learning_rate": 5.517389098371706e-07, + "loss": 0.2669, + "step": 48142 + }, + { + "epoch": 0.8937966679414681, + "grad_norm": 0.26627209782600403, + "learning_rate": 5.513568640685407e-07, + "loss": 0.1287, + "step": 48144 + }, + { + "epoch": 0.8938337980788867, + "grad_norm": 0.3024660646915436, + "learning_rate": 5.50974946867644e-07, + "loss": 0.3433, + "step": 48146 + }, + { + "epoch": 0.8938709282163053, + "grad_norm": 0.2645702362060547, + "learning_rate": 5.505931582396729e-07, + "loss": 0.1188, + "step": 48148 + }, + { + "epoch": 0.893908058353724, + "grad_norm": 0.21912391483783722, + "learning_rate": 5.502114981898266e-07, + "loss": 0.2388, + "step": 48150 + }, + { + "epoch": 0.8939451884911426, + "grad_norm": 0.555739164352417, + "learning_rate": 5.498299667232942e-07, + "loss": 0.4138, + "step": 48152 + }, + { + "epoch": 0.8939823186285613, + "grad_norm": 0.34279948472976685, + "learning_rate": 5.494485638452696e-07, + "loss": 0.3392, + "step": 48154 + }, + { + "epoch": 0.8940194487659798, + "grad_norm": 0.2938547432422638, + "learning_rate": 5.490672895609428e-07, + "loss": 0.4222, + "step": 48156 + }, + { + "epoch": 0.8940565789033985, + "grad_norm": 0.36053842306137085, + "learning_rate": 5.486861438755009e-07, + "loss": 0.1888, + "step": 48158 + }, + { + "epoch": 0.8940937090408172, + "grad_norm": 0.39039263129234314, + "learning_rate": 5.483051267941331e-07, + "loss": 0.2318, + "step": 48160 + }, + { + "epoch": 0.8941308391782358, + "grad_norm": 0.4221782088279724, + "learning_rate": 5.479242383220196e-07, + "loss": 0.1981, + "step": 48162 + }, + { + "epoch": 0.8941679693156545, + "grad_norm": 0.4096442759037018, + "learning_rate": 5.475434784643441e-07, + "loss": 0.343, + "step": 48164 + }, + { + "epoch": 0.894205099453073, + "grad_norm": 0.3916064202785492, + "learning_rate": 5.471628472262891e-07, + "loss": 0.2725, + "step": 48166 + }, + { + "epoch": 0.8942422295904917, + "grad_norm": 0.38425859808921814, + "learning_rate": 5.467823446130327e-07, + "loss": 0.2844, + "step": 48168 + }, + { + "epoch": 0.8942793597279104, + "grad_norm": 0.535915195941925, + "learning_rate": 5.464019706297551e-07, + "loss": 0.1161, + "step": 48170 + }, + { + "epoch": 0.894316489865329, + "grad_norm": 0.4379023611545563, + "learning_rate": 5.46021725281628e-07, + "loss": 0.3075, + "step": 48172 + }, + { + "epoch": 0.8943536200027477, + "grad_norm": 0.4208337962627411, + "learning_rate": 5.456416085738292e-07, + "loss": 0.2464, + "step": 48174 + }, + { + "epoch": 0.8943907501401662, + "grad_norm": 0.6813507676124573, + "learning_rate": 5.452616205115269e-07, + "loss": 0.435, + "step": 48176 + }, + { + "epoch": 0.8944278802775849, + "grad_norm": 0.3636918067932129, + "learning_rate": 5.448817610998947e-07, + "loss": 0.17, + "step": 48178 + }, + { + "epoch": 0.8944650104150036, + "grad_norm": 0.3207550346851349, + "learning_rate": 5.445020303440995e-07, + "loss": 0.2887, + "step": 48180 + }, + { + "epoch": 0.8945021405524222, + "grad_norm": 0.23400770127773285, + "learning_rate": 5.441224282493096e-07, + "loss": 0.3307, + "step": 48182 + }, + { + "epoch": 0.8945392706898408, + "grad_norm": 0.463784396648407, + "learning_rate": 5.437429548206907e-07, + "loss": 0.2694, + "step": 48184 + }, + { + "epoch": 0.8945764008272594, + "grad_norm": 0.36724919080734253, + "learning_rate": 5.433636100634043e-07, + "loss": 0.1757, + "step": 48186 + }, + { + "epoch": 0.8946135309646781, + "grad_norm": 0.38956594467163086, + "learning_rate": 5.429843939826129e-07, + "loss": 0.2015, + "step": 48188 + }, + { + "epoch": 0.8946506611020968, + "grad_norm": 0.34098151326179504, + "learning_rate": 5.42605306583478e-07, + "loss": 0.2966, + "step": 48190 + }, + { + "epoch": 0.8946877912395154, + "grad_norm": 0.4431378245353699, + "learning_rate": 5.422263478711575e-07, + "loss": 0.2354, + "step": 48192 + }, + { + "epoch": 0.894724921376934, + "grad_norm": 0.3878399133682251, + "learning_rate": 5.418475178508054e-07, + "loss": 0.3826, + "step": 48194 + }, + { + "epoch": 0.8947620515143526, + "grad_norm": 0.21570952236652374, + "learning_rate": 5.414688165275784e-07, + "loss": 0.2003, + "step": 48196 + }, + { + "epoch": 0.8947991816517713, + "grad_norm": 0.13801129162311554, + "learning_rate": 5.410902439066302e-07, + "loss": 0.0929, + "step": 48198 + }, + { + "epoch": 0.89483631178919, + "grad_norm": 1.528064250946045, + "learning_rate": 5.4071179999311e-07, + "loss": 0.2319, + "step": 48200 + }, + { + "epoch": 0.8948734419266086, + "grad_norm": 0.2992658317089081, + "learning_rate": 5.403334847921681e-07, + "loss": 0.3238, + "step": 48202 + }, + { + "epoch": 0.8949105720640272, + "grad_norm": 0.4056158661842346, + "learning_rate": 5.399552983089529e-07, + "loss": 0.2226, + "step": 48204 + }, + { + "epoch": 0.8949477022014458, + "grad_norm": 0.6549992561340332, + "learning_rate": 5.395772405486099e-07, + "loss": 0.3028, + "step": 48206 + }, + { + "epoch": 0.8949848323388645, + "grad_norm": 0.4309667944908142, + "learning_rate": 5.39199311516283e-07, + "loss": 0.4625, + "step": 48208 + }, + { + "epoch": 0.8950219624762831, + "grad_norm": 0.38017329573631287, + "learning_rate": 5.388215112171169e-07, + "loss": 0.3197, + "step": 48210 + }, + { + "epoch": 0.8950590926137018, + "grad_norm": 0.365291029214859, + "learning_rate": 5.384438396562497e-07, + "loss": 0.1926, + "step": 48212 + }, + { + "epoch": 0.8950962227511204, + "grad_norm": 0.35767385363578796, + "learning_rate": 5.380662968388217e-07, + "loss": 0.1937, + "step": 48214 + }, + { + "epoch": 0.895133352888539, + "grad_norm": 0.3258022367954254, + "learning_rate": 5.376888827699689e-07, + "loss": 0.1238, + "step": 48216 + }, + { + "epoch": 0.8951704830259577, + "grad_norm": 0.38413605093955994, + "learning_rate": 5.373115974548271e-07, + "loss": 0.2322, + "step": 48218 + }, + { + "epoch": 0.8952076131633763, + "grad_norm": 0.2799179255962372, + "learning_rate": 5.3693444089853e-07, + "loss": 0.3519, + "step": 48220 + }, + { + "epoch": 0.895244743300795, + "grad_norm": 0.21654854714870453, + "learning_rate": 5.365574131062112e-07, + "loss": 0.3351, + "step": 48222 + }, + { + "epoch": 0.8952818734382136, + "grad_norm": 0.3522450625896454, + "learning_rate": 5.361805140829989e-07, + "loss": 0.3591, + "step": 48224 + }, + { + "epoch": 0.8953190035756322, + "grad_norm": 0.7935491800308228, + "learning_rate": 5.358037438340225e-07, + "loss": 0.1182, + "step": 48226 + }, + { + "epoch": 0.8953561337130509, + "grad_norm": 0.27044638991355896, + "learning_rate": 5.354271023644087e-07, + "loss": 0.1983, + "step": 48228 + }, + { + "epoch": 0.8953932638504695, + "grad_norm": 0.35649988055229187, + "learning_rate": 5.350505896792824e-07, + "loss": 0.0751, + "step": 48230 + }, + { + "epoch": 0.8954303939878882, + "grad_norm": 0.2961735725402832, + "learning_rate": 5.346742057837661e-07, + "loss": 0.3225, + "step": 48232 + }, + { + "epoch": 0.8954675241253068, + "grad_norm": 0.3365193009376526, + "learning_rate": 5.342979506829837e-07, + "loss": 0.1909, + "step": 48234 + }, + { + "epoch": 0.8955046542627254, + "grad_norm": 0.36907002329826355, + "learning_rate": 5.339218243820532e-07, + "loss": 0.3726, + "step": 48236 + }, + { + "epoch": 0.8955417844001441, + "grad_norm": 0.31047526001930237, + "learning_rate": 5.335458268860916e-07, + "loss": 0.2041, + "step": 48238 + }, + { + "epoch": 0.8955789145375627, + "grad_norm": 0.3974595069885254, + "learning_rate": 5.331699582002148e-07, + "loss": 0.1949, + "step": 48240 + }, + { + "epoch": 0.8956160446749813, + "grad_norm": 0.26994895935058594, + "learning_rate": 5.327942183295399e-07, + "loss": 0.256, + "step": 48242 + }, + { + "epoch": 0.8956531748124, + "grad_norm": 0.38351067900657654, + "learning_rate": 5.32418607279177e-07, + "loss": 0.4123, + "step": 48244 + }, + { + "epoch": 0.8956903049498186, + "grad_norm": 0.3229714632034302, + "learning_rate": 5.32043125054238e-07, + "loss": 0.2973, + "step": 48246 + }, + { + "epoch": 0.8957274350872373, + "grad_norm": 0.5426089763641357, + "learning_rate": 5.31667771659834e-07, + "loss": 0.4656, + "step": 48248 + }, + { + "epoch": 0.8957645652246559, + "grad_norm": 0.4776879847049713, + "learning_rate": 5.312925471010699e-07, + "loss": 0.2504, + "step": 48250 + }, + { + "epoch": 0.8958016953620745, + "grad_norm": 0.5026327967643738, + "learning_rate": 5.309174513830518e-07, + "loss": 0.1962, + "step": 48252 + }, + { + "epoch": 0.8958388254994931, + "grad_norm": 0.42700284719467163, + "learning_rate": 5.305424845108842e-07, + "loss": 0.3031, + "step": 48254 + }, + { + "epoch": 0.8958759556369118, + "grad_norm": 0.5479990839958191, + "learning_rate": 5.301676464896699e-07, + "loss": 0.1597, + "step": 48256 + }, + { + "epoch": 0.8959130857743305, + "grad_norm": 0.3768352270126343, + "learning_rate": 5.29792937324507e-07, + "loss": 0.2337, + "step": 48258 + }, + { + "epoch": 0.8959502159117491, + "grad_norm": 0.29962170124053955, + "learning_rate": 5.29418357020498e-07, + "loss": 0.3534, + "step": 48260 + }, + { + "epoch": 0.8959873460491677, + "grad_norm": 0.46734151244163513, + "learning_rate": 5.290439055827346e-07, + "loss": 0.18, + "step": 48262 + }, + { + "epoch": 0.8960244761865863, + "grad_norm": 0.35517027974128723, + "learning_rate": 5.286695830163158e-07, + "loss": 0.374, + "step": 48264 + }, + { + "epoch": 0.896061606324005, + "grad_norm": 0.328797310590744, + "learning_rate": 5.282953893263343e-07, + "loss": 0.2609, + "step": 48266 + }, + { + "epoch": 0.8960987364614237, + "grad_norm": 0.3383162021636963, + "learning_rate": 5.279213245178816e-07, + "loss": 0.2581, + "step": 48268 + }, + { + "epoch": 0.8961358665988423, + "grad_norm": 0.3055339753627777, + "learning_rate": 5.275473885960469e-07, + "loss": 0.2914, + "step": 48270 + }, + { + "epoch": 0.8961729967362609, + "grad_norm": 0.3556740880012512, + "learning_rate": 5.271735815659195e-07, + "loss": 0.3102, + "step": 48272 + }, + { + "epoch": 0.8962101268736795, + "grad_norm": 0.352333128452301, + "learning_rate": 5.267999034325855e-07, + "loss": 0.1898, + "step": 48274 + }, + { + "epoch": 0.8962472570110982, + "grad_norm": 0.28771793842315674, + "learning_rate": 5.264263542011294e-07, + "loss": 0.2877, + "step": 48276 + }, + { + "epoch": 0.8962843871485169, + "grad_norm": 0.26419955492019653, + "learning_rate": 5.260529338766341e-07, + "loss": 0.2841, + "step": 48278 + }, + { + "epoch": 0.8963215172859355, + "grad_norm": 0.456599622964859, + "learning_rate": 5.256796424641808e-07, + "loss": 0.4229, + "step": 48280 + }, + { + "epoch": 0.8963586474233541, + "grad_norm": 0.5157799124717712, + "learning_rate": 5.253064799688467e-07, + "loss": 0.3144, + "step": 48282 + }, + { + "epoch": 0.8963957775607727, + "grad_norm": 0.3839508295059204, + "learning_rate": 5.249334463957134e-07, + "loss": 0.2781, + "step": 48284 + }, + { + "epoch": 0.8964329076981914, + "grad_norm": 0.4695371389389038, + "learning_rate": 5.245605417498545e-07, + "loss": 0.4401, + "step": 48286 + }, + { + "epoch": 0.8964700378356101, + "grad_norm": 0.20175091922283173, + "learning_rate": 5.241877660363437e-07, + "loss": 0.1287, + "step": 48288 + }, + { + "epoch": 0.8965071679730287, + "grad_norm": 0.5565608739852905, + "learning_rate": 5.238151192602548e-07, + "loss": 0.1541, + "step": 48290 + }, + { + "epoch": 0.8965442981104473, + "grad_norm": 0.413633793592453, + "learning_rate": 5.234426014266569e-07, + "loss": 0.2915, + "step": 48292 + }, + { + "epoch": 0.8965814282478659, + "grad_norm": 0.39114534854888916, + "learning_rate": 5.230702125406195e-07, + "loss": 0.4223, + "step": 48294 + }, + { + "epoch": 0.8966185583852846, + "grad_norm": 0.3685234487056732, + "learning_rate": 5.226979526072107e-07, + "loss": 0.3139, + "step": 48296 + }, + { + "epoch": 0.8966556885227033, + "grad_norm": 0.41675615310668945, + "learning_rate": 5.223258216314941e-07, + "loss": 0.328, + "step": 48298 + }, + { + "epoch": 0.8966928186601218, + "grad_norm": 0.4430311620235443, + "learning_rate": 5.219538196185359e-07, + "loss": 0.3075, + "step": 48300 + }, + { + "epoch": 0.8967299487975405, + "grad_norm": 0.28352394700050354, + "learning_rate": 5.215819465733951e-07, + "loss": 0.2421, + "step": 48302 + }, + { + "epoch": 0.8967670789349591, + "grad_norm": 0.46194711327552795, + "learning_rate": 5.212102025011323e-07, + "loss": 0.1643, + "step": 48304 + }, + { + "epoch": 0.8968042090723778, + "grad_norm": 0.49363917112350464, + "learning_rate": 5.208385874068067e-07, + "loss": 0.4399, + "step": 48306 + }, + { + "epoch": 0.8968413392097964, + "grad_norm": 0.4763575792312622, + "learning_rate": 5.204671012954743e-07, + "loss": 0.1382, + "step": 48308 + }, + { + "epoch": 0.896878469347215, + "grad_norm": 0.6243524551391602, + "learning_rate": 5.20095744172191e-07, + "loss": 0.1494, + "step": 48310 + }, + { + "epoch": 0.8969155994846337, + "grad_norm": 0.27767983078956604, + "learning_rate": 5.197245160420084e-07, + "loss": 0.3101, + "step": 48312 + }, + { + "epoch": 0.8969527296220523, + "grad_norm": 0.2878401279449463, + "learning_rate": 5.193534169099778e-07, + "loss": 0.324, + "step": 48314 + }, + { + "epoch": 0.896989859759471, + "grad_norm": 0.3336949944496155, + "learning_rate": 5.1898244678115e-07, + "loss": 0.1184, + "step": 48316 + }, + { + "epoch": 0.8970269898968896, + "grad_norm": 0.35772940516471863, + "learning_rate": 5.186116056605728e-07, + "loss": 0.252, + "step": 48318 + }, + { + "epoch": 0.8970641200343082, + "grad_norm": 0.19549548625946045, + "learning_rate": 5.182408935532912e-07, + "loss": 0.1601, + "step": 48320 + }, + { + "epoch": 0.8971012501717269, + "grad_norm": 0.47092145681381226, + "learning_rate": 5.17870310464349e-07, + "loss": 0.1275, + "step": 48322 + }, + { + "epoch": 0.8971383803091455, + "grad_norm": 0.5483776926994324, + "learning_rate": 5.17499856398792e-07, + "loss": 0.4689, + "step": 48324 + }, + { + "epoch": 0.8971755104465642, + "grad_norm": 0.3513294458389282, + "learning_rate": 5.171295313616565e-07, + "loss": 0.3317, + "step": 48326 + }, + { + "epoch": 0.8972126405839828, + "grad_norm": 0.2194170206785202, + "learning_rate": 5.167593353579836e-07, + "loss": 0.2261, + "step": 48328 + }, + { + "epoch": 0.8972497707214014, + "grad_norm": 0.2987911105155945, + "learning_rate": 5.163892683928107e-07, + "loss": 0.2115, + "step": 48330 + }, + { + "epoch": 0.8972869008588201, + "grad_norm": 0.48497337102890015, + "learning_rate": 5.160193304711725e-07, + "loss": 0.4345, + "step": 48332 + }, + { + "epoch": 0.8973240309962387, + "grad_norm": 0.63909512758255, + "learning_rate": 5.156495215981039e-07, + "loss": 0.3077, + "step": 48334 + }, + { + "epoch": 0.8973611611336574, + "grad_norm": 0.7699863314628601, + "learning_rate": 5.152798417786386e-07, + "loss": 0.2063, + "step": 48336 + }, + { + "epoch": 0.897398291271076, + "grad_norm": 0.4420967698097229, + "learning_rate": 5.149102910178017e-07, + "loss": 0.1443, + "step": 48338 + }, + { + "epoch": 0.8974354214084946, + "grad_norm": 0.3161070644855499, + "learning_rate": 5.145408693206255e-07, + "loss": 0.3842, + "step": 48340 + }, + { + "epoch": 0.8974725515459133, + "grad_norm": 0.29047083854675293, + "learning_rate": 5.141715766921373e-07, + "loss": 0.313, + "step": 48342 + }, + { + "epoch": 0.8975096816833319, + "grad_norm": 0.7735493183135986, + "learning_rate": 5.138024131373598e-07, + "loss": 0.2356, + "step": 48344 + }, + { + "epoch": 0.8975468118207506, + "grad_norm": 0.22955092787742615, + "learning_rate": 5.134333786613155e-07, + "loss": 0.4078, + "step": 48346 + }, + { + "epoch": 0.8975839419581692, + "grad_norm": 0.3891378939151764, + "learning_rate": 5.130644732690293e-07, + "loss": 0.607, + "step": 48348 + }, + { + "epoch": 0.8976210720955878, + "grad_norm": 0.2374430000782013, + "learning_rate": 5.126956969655184e-07, + "loss": 0.2733, + "step": 48350 + }, + { + "epoch": 0.8976582022330065, + "grad_norm": 0.44031134247779846, + "learning_rate": 5.123270497558009e-07, + "loss": 0.3678, + "step": 48352 + }, + { + "epoch": 0.8976953323704251, + "grad_norm": 0.41577571630477905, + "learning_rate": 5.119585316448927e-07, + "loss": 0.2075, + "step": 48354 + }, + { + "epoch": 0.8977324625078438, + "grad_norm": 0.43238401412963867, + "learning_rate": 5.11590142637809e-07, + "loss": 0.2338, + "step": 48356 + }, + { + "epoch": 0.8977695926452623, + "grad_norm": 0.3450582027435303, + "learning_rate": 5.112218827395631e-07, + "loss": 0.1607, + "step": 48358 + }, + { + "epoch": 0.897806722782681, + "grad_norm": 0.4294826090335846, + "learning_rate": 5.108537519551659e-07, + "loss": 0.3226, + "step": 48360 + }, + { + "epoch": 0.8978438529200996, + "grad_norm": 0.46395614743232727, + "learning_rate": 5.104857502896254e-07, + "loss": 0.1944, + "step": 48362 + }, + { + "epoch": 0.8978809830575183, + "grad_norm": 0.5648952722549438, + "learning_rate": 5.101178777479498e-07, + "loss": 0.1904, + "step": 48364 + }, + { + "epoch": 0.897918113194937, + "grad_norm": 0.3792033791542053, + "learning_rate": 5.09750134335143e-07, + "loss": 0.1977, + "step": 48366 + }, + { + "epoch": 0.8979552433323555, + "grad_norm": 0.4301142990589142, + "learning_rate": 5.093825200562108e-07, + "loss": 0.4331, + "step": 48368 + }, + { + "epoch": 0.8979923734697742, + "grad_norm": 0.5916496515274048, + "learning_rate": 5.090150349161549e-07, + "loss": 0.3613, + "step": 48370 + }, + { + "epoch": 0.8980295036071928, + "grad_norm": 0.6146742701530457, + "learning_rate": 5.086476789199757e-07, + "loss": 0.3902, + "step": 48372 + }, + { + "epoch": 0.8980666337446115, + "grad_norm": 0.4244139492511749, + "learning_rate": 5.082804520726726e-07, + "loss": 0.243, + "step": 48374 + }, + { + "epoch": 0.8981037638820302, + "grad_norm": 0.5452306270599365, + "learning_rate": 5.079133543792403e-07, + "loss": 0.3112, + "step": 48376 + }, + { + "epoch": 0.8981408940194487, + "grad_norm": 0.5164527893066406, + "learning_rate": 5.075463858446749e-07, + "loss": 0.2087, + "step": 48378 + }, + { + "epoch": 0.8981780241568674, + "grad_norm": 0.36262694001197815, + "learning_rate": 5.071795464739704e-07, + "loss": 0.3659, + "step": 48380 + }, + { + "epoch": 0.898215154294286, + "grad_norm": 0.2508375942707062, + "learning_rate": 5.068128362721192e-07, + "loss": 0.1814, + "step": 48382 + }, + { + "epoch": 0.8982522844317047, + "grad_norm": 0.5330743193626404, + "learning_rate": 5.064462552441085e-07, + "loss": 0.2721, + "step": 48384 + }, + { + "epoch": 0.8982894145691234, + "grad_norm": 0.19209109246730804, + "learning_rate": 5.060798033949299e-07, + "loss": 0.2464, + "step": 48386 + }, + { + "epoch": 0.8983265447065419, + "grad_norm": 0.362298846244812, + "learning_rate": 5.057134807295649e-07, + "loss": 0.1363, + "step": 48388 + }, + { + "epoch": 0.8983636748439606, + "grad_norm": 0.4734501242637634, + "learning_rate": 5.053472872530018e-07, + "loss": 0.3012, + "step": 48390 + }, + { + "epoch": 0.8984008049813792, + "grad_norm": 0.35991984605789185, + "learning_rate": 5.049812229702222e-07, + "loss": 0.3406, + "step": 48392 + }, + { + "epoch": 0.8984379351187979, + "grad_norm": 0.45024046301841736, + "learning_rate": 5.046152878862065e-07, + "loss": 0.4601, + "step": 48394 + }, + { + "epoch": 0.8984750652562166, + "grad_norm": 0.33652982115745544, + "learning_rate": 5.042494820059352e-07, + "loss": 0.2312, + "step": 48396 + }, + { + "epoch": 0.8985121953936351, + "grad_norm": 0.30189865827560425, + "learning_rate": 5.038838053343864e-07, + "loss": 0.2037, + "step": 48398 + }, + { + "epoch": 0.8985493255310538, + "grad_norm": 0.2663382291793823, + "learning_rate": 5.035182578765352e-07, + "loss": 0.3458, + "step": 48400 + }, + { + "epoch": 0.8985864556684724, + "grad_norm": 0.47923770546913147, + "learning_rate": 5.03152839637353e-07, + "loss": 0.354, + "step": 48402 + }, + { + "epoch": 0.8986235858058911, + "grad_norm": 0.5102172493934631, + "learning_rate": 5.027875506218172e-07, + "loss": 0.2568, + "step": 48404 + }, + { + "epoch": 0.8986607159433097, + "grad_norm": 0.3228839337825775, + "learning_rate": 5.024223908348935e-07, + "loss": 0.1378, + "step": 48406 + }, + { + "epoch": 0.8986978460807283, + "grad_norm": 0.4514017403125763, + "learning_rate": 5.020573602815526e-07, + "loss": 0.2186, + "step": 48408 + }, + { + "epoch": 0.898734976218147, + "grad_norm": 0.39048248529434204, + "learning_rate": 5.016924589667615e-07, + "loss": 0.3, + "step": 48410 + }, + { + "epoch": 0.8987721063555656, + "grad_norm": 0.9240968823432922, + "learning_rate": 5.013276868954864e-07, + "loss": 0.2054, + "step": 48412 + }, + { + "epoch": 0.8988092364929843, + "grad_norm": 0.5309037566184998, + "learning_rate": 5.009630440726887e-07, + "loss": 0.2719, + "step": 48414 + }, + { + "epoch": 0.8988463666304028, + "grad_norm": 0.5092172026634216, + "learning_rate": 5.0059853050333e-07, + "loss": 0.2437, + "step": 48416 + }, + { + "epoch": 0.8988834967678215, + "grad_norm": 0.4179666340351105, + "learning_rate": 5.002341461923721e-07, + "loss": 0.2189, + "step": 48418 + }, + { + "epoch": 0.8989206269052402, + "grad_norm": 0.3597012162208557, + "learning_rate": 4.99869891144773e-07, + "loss": 0.1073, + "step": 48420 + }, + { + "epoch": 0.8989577570426588, + "grad_norm": 0.4236372113227844, + "learning_rate": 4.995057653654878e-07, + "loss": 0.2731, + "step": 48422 + }, + { + "epoch": 0.8989948871800775, + "grad_norm": 0.24848729372024536, + "learning_rate": 4.991417688594724e-07, + "loss": 0.5007, + "step": 48424 + }, + { + "epoch": 0.899032017317496, + "grad_norm": 0.4562910497188568, + "learning_rate": 4.987779016316807e-07, + "loss": 0.4097, + "step": 48426 + }, + { + "epoch": 0.8990691474549147, + "grad_norm": 0.6012105345726013, + "learning_rate": 4.984141636870599e-07, + "loss": 0.3095, + "step": 48428 + }, + { + "epoch": 0.8991062775923334, + "grad_norm": 0.6279494166374207, + "learning_rate": 4.980505550305626e-07, + "loss": 0.4106, + "step": 48430 + }, + { + "epoch": 0.899143407729752, + "grad_norm": 0.48101726174354553, + "learning_rate": 4.976870756671348e-07, + "loss": 0.1722, + "step": 48432 + }, + { + "epoch": 0.8991805378671707, + "grad_norm": 0.4528746008872986, + "learning_rate": 4.973237256017238e-07, + "loss": 0.3341, + "step": 48434 + }, + { + "epoch": 0.8992176680045892, + "grad_norm": 0.3787568211555481, + "learning_rate": 4.969605048392733e-07, + "loss": 0.4043, + "step": 48436 + }, + { + "epoch": 0.8992547981420079, + "grad_norm": 0.5023171305656433, + "learning_rate": 4.965974133847251e-07, + "loss": 0.2142, + "step": 48438 + }, + { + "epoch": 0.8992919282794266, + "grad_norm": 0.3304109275341034, + "learning_rate": 4.962344512430206e-07, + "loss": 0.4158, + "step": 48440 + }, + { + "epoch": 0.8993290584168452, + "grad_norm": 0.5913110971450806, + "learning_rate": 4.95871618419097e-07, + "loss": 0.18, + "step": 48442 + }, + { + "epoch": 0.8993661885542639, + "grad_norm": 0.22067943215370178, + "learning_rate": 4.955089149178927e-07, + "loss": 0.1343, + "step": 48444 + }, + { + "epoch": 0.8994033186916824, + "grad_norm": 0.5273178219795227, + "learning_rate": 4.951463407443446e-07, + "loss": 0.1051, + "step": 48446 + }, + { + "epoch": 0.8994404488291011, + "grad_norm": 0.4371103048324585, + "learning_rate": 4.947838959033835e-07, + "loss": 0.1502, + "step": 48448 + }, + { + "epoch": 0.8994775789665198, + "grad_norm": 0.46968066692352295, + "learning_rate": 4.94421580399943e-07, + "loss": 0.4376, + "step": 48450 + }, + { + "epoch": 0.8995147091039384, + "grad_norm": 0.33224064111709595, + "learning_rate": 4.940593942389505e-07, + "loss": 0.2132, + "step": 48452 + }, + { + "epoch": 0.8995518392413571, + "grad_norm": 0.47070813179016113, + "learning_rate": 4.936973374253362e-07, + "loss": 0.2735, + "step": 48454 + }, + { + "epoch": 0.8995889693787756, + "grad_norm": 0.4460408091545105, + "learning_rate": 4.933354099640275e-07, + "loss": 0.3298, + "step": 48456 + }, + { + "epoch": 0.8996260995161943, + "grad_norm": 0.5226757526397705, + "learning_rate": 4.929736118599471e-07, + "loss": 0.3876, + "step": 48458 + }, + { + "epoch": 0.8996632296536129, + "grad_norm": 0.49964573979377747, + "learning_rate": 4.926119431180188e-07, + "loss": 0.3311, + "step": 48460 + }, + { + "epoch": 0.8997003597910316, + "grad_norm": 0.4073947072029114, + "learning_rate": 4.922504037431664e-07, + "loss": 0.4149, + "step": 48462 + }, + { + "epoch": 0.8997374899284503, + "grad_norm": 0.2610103189945221, + "learning_rate": 4.918889937403049e-07, + "loss": 0.2189, + "step": 48464 + }, + { + "epoch": 0.8997746200658688, + "grad_norm": 0.58796226978302, + "learning_rate": 4.915277131143548e-07, + "loss": 0.2779, + "step": 48466 + }, + { + "epoch": 0.8998117502032875, + "grad_norm": 0.602207601070404, + "learning_rate": 4.911665618702321e-07, + "loss": 0.4202, + "step": 48468 + }, + { + "epoch": 0.8998488803407061, + "grad_norm": 0.32652461528778076, + "learning_rate": 4.908055400128486e-07, + "loss": 0.3683, + "step": 48470 + }, + { + "epoch": 0.8998860104781248, + "grad_norm": 0.22407186031341553, + "learning_rate": 4.904446475471192e-07, + "loss": 0.1038, + "step": 48472 + }, + { + "epoch": 0.8999231406155435, + "grad_norm": 0.39134252071380615, + "learning_rate": 4.900838844779543e-07, + "loss": 0.2572, + "step": 48474 + }, + { + "epoch": 0.899960270752962, + "grad_norm": 0.3454612195491791, + "learning_rate": 4.897232508102611e-07, + "loss": 0.3745, + "step": 48476 + }, + { + "epoch": 0.8999974008903807, + "grad_norm": 0.4180029630661011, + "learning_rate": 4.893627465489481e-07, + "loss": 0.3087, + "step": 48478 + }, + { + "epoch": 0.9000345310277993, + "grad_norm": 0.4802205264568329, + "learning_rate": 4.8900237169892e-07, + "loss": 0.2964, + "step": 48480 + }, + { + "epoch": 0.900071661165218, + "grad_norm": 0.37667667865753174, + "learning_rate": 4.88642126265082e-07, + "loss": 0.2227, + "step": 48482 + }, + { + "epoch": 0.9001087913026367, + "grad_norm": 0.517568051815033, + "learning_rate": 4.882820102523334e-07, + "loss": 0.2097, + "step": 48484 + }, + { + "epoch": 0.9001459214400552, + "grad_norm": 0.2966461777687073, + "learning_rate": 4.879220236655769e-07, + "loss": 0.2065, + "step": 48486 + }, + { + "epoch": 0.9001830515774739, + "grad_norm": 0.3757840394973755, + "learning_rate": 4.875621665097107e-07, + "loss": 0.2805, + "step": 48488 + }, + { + "epoch": 0.9002201817148925, + "grad_norm": 0.30103421211242676, + "learning_rate": 4.872024387896302e-07, + "loss": 0.1797, + "step": 48490 + }, + { + "epoch": 0.9002573118523112, + "grad_norm": 0.30888625979423523, + "learning_rate": 4.868428405102287e-07, + "loss": 0.3144, + "step": 48492 + }, + { + "epoch": 0.9002944419897299, + "grad_norm": 0.3544558584690094, + "learning_rate": 4.864833716764017e-07, + "loss": 0.1703, + "step": 48494 + }, + { + "epoch": 0.9003315721271484, + "grad_norm": 0.4850054085254669, + "learning_rate": 4.861240322930394e-07, + "loss": 0.3943, + "step": 48496 + }, + { + "epoch": 0.9003687022645671, + "grad_norm": 0.47597599029541016, + "learning_rate": 4.857648223650313e-07, + "loss": 0.3764, + "step": 48498 + }, + { + "epoch": 0.9004058324019857, + "grad_norm": 0.49046382308006287, + "learning_rate": 4.854057418972669e-07, + "loss": 0.0654, + "step": 48500 + }, + { + "epoch": 0.9004429625394044, + "grad_norm": 0.5843791365623474, + "learning_rate": 4.850467908946299e-07, + "loss": 0.4429, + "step": 48502 + }, + { + "epoch": 0.900480092676823, + "grad_norm": 0.7395893335342407, + "learning_rate": 4.846879693620054e-07, + "loss": 0.1962, + "step": 48504 + }, + { + "epoch": 0.9005172228142416, + "grad_norm": 0.48328065872192383, + "learning_rate": 4.843292773042762e-07, + "loss": 0.3219, + "step": 48506 + }, + { + "epoch": 0.9005543529516603, + "grad_norm": 0.4616237282752991, + "learning_rate": 4.839707147263228e-07, + "loss": 0.247, + "step": 48508 + }, + { + "epoch": 0.9005914830890789, + "grad_norm": 0.29741501808166504, + "learning_rate": 4.836122816330247e-07, + "loss": 0.2574, + "step": 48510 + }, + { + "epoch": 0.9006286132264976, + "grad_norm": 0.5822627544403076, + "learning_rate": 4.8325397802926e-07, + "loss": 0.3204, + "step": 48512 + }, + { + "epoch": 0.9006657433639161, + "grad_norm": 0.3079688549041748, + "learning_rate": 4.828958039199005e-07, + "loss": 0.1709, + "step": 48514 + }, + { + "epoch": 0.9007028735013348, + "grad_norm": 0.4466368556022644, + "learning_rate": 4.825377593098213e-07, + "loss": 0.3316, + "step": 48516 + }, + { + "epoch": 0.9007400036387535, + "grad_norm": 0.530005156993866, + "learning_rate": 4.82179844203896e-07, + "loss": 0.2375, + "step": 48518 + }, + { + "epoch": 0.9007771337761721, + "grad_norm": 0.7831388115882874, + "learning_rate": 4.818220586069944e-07, + "loss": 0.2368, + "step": 48520 + }, + { + "epoch": 0.9008142639135908, + "grad_norm": 0.4331243336200714, + "learning_rate": 4.814644025239834e-07, + "loss": 0.4713, + "step": 48522 + }, + { + "epoch": 0.9008513940510093, + "grad_norm": 0.39379361271858215, + "learning_rate": 4.811068759597304e-07, + "loss": 0.3175, + "step": 48524 + }, + { + "epoch": 0.900888524188428, + "grad_norm": 0.3967888355255127, + "learning_rate": 4.807494789191026e-07, + "loss": 0.1991, + "step": 48526 + }, + { + "epoch": 0.9009256543258467, + "grad_norm": 0.4109238386154175, + "learning_rate": 4.803922114069592e-07, + "loss": 0.302, + "step": 48528 + }, + { + "epoch": 0.9009627844632653, + "grad_norm": 0.4563678205013275, + "learning_rate": 4.800350734281633e-07, + "loss": 0.2902, + "step": 48530 + }, + { + "epoch": 0.900999914600684, + "grad_norm": 0.24118366837501526, + "learning_rate": 4.796780649875754e-07, + "loss": 0.3213, + "step": 48532 + }, + { + "epoch": 0.9010370447381025, + "grad_norm": 0.45349451899528503, + "learning_rate": 4.793211860900516e-07, + "loss": 0.2975, + "step": 48534 + }, + { + "epoch": 0.9010741748755212, + "grad_norm": 0.4888429641723633, + "learning_rate": 4.789644367404489e-07, + "loss": 0.2595, + "step": 48536 + }, + { + "epoch": 0.9011113050129399, + "grad_norm": 0.3797314167022705, + "learning_rate": 4.786078169436214e-07, + "loss": 0.257, + "step": 48538 + }, + { + "epoch": 0.9011484351503585, + "grad_norm": 0.38324347138404846, + "learning_rate": 4.78251326704422e-07, + "loss": 0.2267, + "step": 48540 + }, + { + "epoch": 0.9011855652877772, + "grad_norm": 0.3864246606826782, + "learning_rate": 4.778949660277e-07, + "loss": 0.2727, + "step": 48542 + }, + { + "epoch": 0.9012226954251957, + "grad_norm": 0.28465691208839417, + "learning_rate": 4.775387349183058e-07, + "loss": 0.4133, + "step": 48544 + }, + { + "epoch": 0.9012598255626144, + "grad_norm": 0.2729600965976715, + "learning_rate": 4.77182633381087e-07, + "loss": 0.1565, + "step": 48546 + }, + { + "epoch": 0.9012969557000331, + "grad_norm": 0.32435914874076843, + "learning_rate": 4.768266614208872e-07, + "loss": 0.2378, + "step": 48548 + }, + { + "epoch": 0.9013340858374517, + "grad_norm": 0.33168768882751465, + "learning_rate": 4.7647081904255375e-07, + "loss": 0.312, + "step": 48550 + }, + { + "epoch": 0.9013712159748704, + "grad_norm": 0.40228667855262756, + "learning_rate": 4.7611510625092397e-07, + "loss": 0.3253, + "step": 48552 + }, + { + "epoch": 0.9014083461122889, + "grad_norm": 0.38454997539520264, + "learning_rate": 4.757595230508427e-07, + "loss": 0.3319, + "step": 48554 + }, + { + "epoch": 0.9014454762497076, + "grad_norm": 0.5442188382148743, + "learning_rate": 4.75404069447144e-07, + "loss": 0.3861, + "step": 48556 + }, + { + "epoch": 0.9014826063871262, + "grad_norm": 0.34160980582237244, + "learning_rate": 4.7504874544466615e-07, + "loss": 0.2008, + "step": 48558 + }, + { + "epoch": 0.9015197365245449, + "grad_norm": 0.4575362205505371, + "learning_rate": 4.7469355104824423e-07, + "loss": 0.235, + "step": 48560 + }, + { + "epoch": 0.9015568666619636, + "grad_norm": 0.5059443712234497, + "learning_rate": 4.7433848626271204e-07, + "loss": 0.1898, + "step": 48562 + }, + { + "epoch": 0.9015939967993821, + "grad_norm": 0.5246545076370239, + "learning_rate": 4.739835510929014e-07, + "loss": 0.4661, + "step": 48564 + }, + { + "epoch": 0.9016311269368008, + "grad_norm": 0.573365330696106, + "learning_rate": 4.7362874554363944e-07, + "loss": 0.1923, + "step": 48566 + }, + { + "epoch": 0.9016682570742194, + "grad_norm": 0.5026063919067383, + "learning_rate": 4.7327406961975574e-07, + "loss": 0.3593, + "step": 48568 + }, + { + "epoch": 0.9017053872116381, + "grad_norm": 0.35271692276000977, + "learning_rate": 4.729195233260764e-07, + "loss": 0.1609, + "step": 48570 + }, + { + "epoch": 0.9017425173490567, + "grad_norm": 0.5646474361419678, + "learning_rate": 4.7256510666742637e-07, + "loss": 0.2552, + "step": 48572 + }, + { + "epoch": 0.9017796474864753, + "grad_norm": 0.5500047206878662, + "learning_rate": 4.7221081964862636e-07, + "loss": 0.3627, + "step": 48574 + }, + { + "epoch": 0.901816777623894, + "grad_norm": 0.3591936230659485, + "learning_rate": 4.71856662274498e-07, + "loss": 0.2086, + "step": 48576 + }, + { + "epoch": 0.9018539077613126, + "grad_norm": 0.5220582485198975, + "learning_rate": 4.715026345498608e-07, + "loss": 0.2577, + "step": 48578 + }, + { + "epoch": 0.9018910378987313, + "grad_norm": 0.33975934982299805, + "learning_rate": 4.7114873647953087e-07, + "loss": 0.1905, + "step": 48580 + }, + { + "epoch": 0.90192816803615, + "grad_norm": 0.32543617486953735, + "learning_rate": 4.7079496806832434e-07, + "loss": 0.3184, + "step": 48582 + }, + { + "epoch": 0.9019652981735685, + "grad_norm": 0.46310609579086304, + "learning_rate": 4.7044132932105526e-07, + "loss": 0.5333, + "step": 48584 + }, + { + "epoch": 0.9020024283109872, + "grad_norm": 0.48632553219795227, + "learning_rate": 4.700878202425352e-07, + "loss": 0.2166, + "step": 48586 + }, + { + "epoch": 0.9020395584484058, + "grad_norm": 0.42180293798446655, + "learning_rate": 4.697344408375759e-07, + "loss": 0.1619, + "step": 48588 + }, + { + "epoch": 0.9020766885858245, + "grad_norm": 0.22348080575466156, + "learning_rate": 4.6938119111098246e-07, + "loss": 0.2101, + "step": 48590 + }, + { + "epoch": 0.9021138187232431, + "grad_norm": 0.38572800159454346, + "learning_rate": 4.6902807106756433e-07, + "loss": 0.2411, + "step": 48592 + }, + { + "epoch": 0.9021509488606617, + "grad_norm": 0.7147664427757263, + "learning_rate": 4.686750807121254e-07, + "loss": 0.302, + "step": 48594 + }, + { + "epoch": 0.9021880789980804, + "grad_norm": 0.8897229433059692, + "learning_rate": 4.683222200494697e-07, + "loss": 0.3076, + "step": 48596 + }, + { + "epoch": 0.902225209135499, + "grad_norm": 0.5248354077339172, + "learning_rate": 4.6796948908439775e-07, + "loss": 0.422, + "step": 48598 + }, + { + "epoch": 0.9022623392729177, + "grad_norm": 0.32111644744873047, + "learning_rate": 4.676168878217102e-07, + "loss": 0.3104, + "step": 48600 + }, + { + "epoch": 0.9022994694103363, + "grad_norm": 0.40496209263801575, + "learning_rate": 4.672644162662021e-07, + "loss": 0.2711, + "step": 48602 + }, + { + "epoch": 0.9023365995477549, + "grad_norm": 0.3792456090450287, + "learning_rate": 4.669120744226718e-07, + "loss": 0.4368, + "step": 48604 + }, + { + "epoch": 0.9023737296851736, + "grad_norm": 0.5568548440933228, + "learning_rate": 4.6655986229591334e-07, + "loss": 0.4542, + "step": 48606 + }, + { + "epoch": 0.9024108598225922, + "grad_norm": 0.355587363243103, + "learning_rate": 4.662077798907194e-07, + "loss": 0.1886, + "step": 48608 + }, + { + "epoch": 0.9024479899600109, + "grad_norm": 0.41871869564056396, + "learning_rate": 4.658558272118796e-07, + "loss": 0.4242, + "step": 48610 + }, + { + "epoch": 0.9024851200974294, + "grad_norm": 0.3598412573337555, + "learning_rate": 4.655040042641845e-07, + "loss": 0.3388, + "step": 48612 + }, + { + "epoch": 0.9025222502348481, + "grad_norm": 0.33449074625968933, + "learning_rate": 4.651523110524225e-07, + "loss": 0.2815, + "step": 48614 + }, + { + "epoch": 0.9025593803722668, + "grad_norm": 0.4386392831802368, + "learning_rate": 4.648007475813765e-07, + "loss": 0.1842, + "step": 48616 + }, + { + "epoch": 0.9025965105096854, + "grad_norm": 0.41692081093788147, + "learning_rate": 4.6444931385583145e-07, + "loss": 0.3576, + "step": 48618 + }, + { + "epoch": 0.902633640647104, + "grad_norm": 0.2905862331390381, + "learning_rate": 4.640980098805681e-07, + "loss": 0.1182, + "step": 48620 + }, + { + "epoch": 0.9026707707845226, + "grad_norm": 0.4227570593357086, + "learning_rate": 4.637468356603669e-07, + "loss": 0.291, + "step": 48622 + }, + { + "epoch": 0.9027079009219413, + "grad_norm": 0.5767644047737122, + "learning_rate": 4.6339579120000757e-07, + "loss": 0.4244, + "step": 48624 + }, + { + "epoch": 0.90274503105936, + "grad_norm": 0.6080482602119446, + "learning_rate": 4.630448765042672e-07, + "loss": 0.3619, + "step": 48626 + }, + { + "epoch": 0.9027821611967786, + "grad_norm": 0.2802591919898987, + "learning_rate": 4.6269409157791765e-07, + "loss": 0.1157, + "step": 48628 + }, + { + "epoch": 0.9028192913341972, + "grad_norm": 0.37483635544776917, + "learning_rate": 4.623434364257351e-07, + "loss": 0.2988, + "step": 48630 + }, + { + "epoch": 0.9028564214716158, + "grad_norm": 0.2253986895084381, + "learning_rate": 4.61992911052489e-07, + "loss": 0.0688, + "step": 48632 + }, + { + "epoch": 0.9028935516090345, + "grad_norm": 0.49188441038131714, + "learning_rate": 4.6164251546295e-07, + "loss": 0.2907, + "step": 48634 + }, + { + "epoch": 0.9029306817464532, + "grad_norm": 0.6930758953094482, + "learning_rate": 4.6129224966188656e-07, + "loss": 0.4504, + "step": 48636 + }, + { + "epoch": 0.9029678118838718, + "grad_norm": 0.3726752698421478, + "learning_rate": 4.6094211365406263e-07, + "loss": 0.1653, + "step": 48638 + }, + { + "epoch": 0.9030049420212904, + "grad_norm": 0.46654459834098816, + "learning_rate": 4.605921074442454e-07, + "loss": 0.2352, + "step": 48640 + }, + { + "epoch": 0.903042072158709, + "grad_norm": 0.5596013069152832, + "learning_rate": 4.602422310371946e-07, + "loss": 0.252, + "step": 48642 + }, + { + "epoch": 0.9030792022961277, + "grad_norm": 0.47604313492774963, + "learning_rate": 4.5989248443767175e-07, + "loss": 0.2412, + "step": 48644 + }, + { + "epoch": 0.9031163324335464, + "grad_norm": 0.4394693970680237, + "learning_rate": 4.595428676504365e-07, + "loss": 0.3459, + "step": 48646 + }, + { + "epoch": 0.903153462570965, + "grad_norm": 0.23227445781230927, + "learning_rate": 4.5919338068024623e-07, + "loss": 0.0883, + "step": 48648 + }, + { + "epoch": 0.9031905927083836, + "grad_norm": 0.4927299916744232, + "learning_rate": 4.5884402353185477e-07, + "loss": 0.3359, + "step": 48650 + }, + { + "epoch": 0.9032277228458022, + "grad_norm": 0.29460251331329346, + "learning_rate": 4.5849479621001944e-07, + "loss": 0.1813, + "step": 48652 + }, + { + "epoch": 0.9032648529832209, + "grad_norm": 0.2480081170797348, + "learning_rate": 4.5814569871948876e-07, + "loss": 0.1543, + "step": 48654 + }, + { + "epoch": 0.9033019831206396, + "grad_norm": 0.38747185468673706, + "learning_rate": 4.577967310650133e-07, + "loss": 0.205, + "step": 48656 + }, + { + "epoch": 0.9033391132580582, + "grad_norm": 0.4756328761577606, + "learning_rate": 4.5744789325134373e-07, + "loss": 0.3595, + "step": 48658 + }, + { + "epoch": 0.9033762433954768, + "grad_norm": 0.48978331685066223, + "learning_rate": 4.5709918528322293e-07, + "loss": 0.2821, + "step": 48660 + }, + { + "epoch": 0.9034133735328954, + "grad_norm": 0.43526962399482727, + "learning_rate": 4.567506071653993e-07, + "loss": 0.3355, + "step": 48662 + }, + { + "epoch": 0.9034505036703141, + "grad_norm": 0.6052113175392151, + "learning_rate": 4.5640215890261464e-07, + "loss": 0.523, + "step": 48664 + }, + { + "epoch": 0.9034876338077327, + "grad_norm": 0.5025112628936768, + "learning_rate": 4.5605384049960954e-07, + "loss": 0.3155, + "step": 48666 + }, + { + "epoch": 0.9035247639451514, + "grad_norm": 0.5073603391647339, + "learning_rate": 4.557056519611247e-07, + "loss": 0.2931, + "step": 48668 + }, + { + "epoch": 0.90356189408257, + "grad_norm": 0.4151134192943573, + "learning_rate": 4.553575932918963e-07, + "loss": 0.2682, + "step": 48670 + }, + { + "epoch": 0.9035990242199886, + "grad_norm": 0.18232154846191406, + "learning_rate": 4.5500966449666285e-07, + "loss": 0.2648, + "step": 48672 + }, + { + "epoch": 0.9036361543574073, + "grad_norm": 0.47252264618873596, + "learning_rate": 4.54661865580156e-07, + "loss": 0.2682, + "step": 48674 + }, + { + "epoch": 0.9036732844948259, + "grad_norm": 0.33984145522117615, + "learning_rate": 4.5431419654711205e-07, + "loss": 0.2533, + "step": 48676 + }, + { + "epoch": 0.9037104146322446, + "grad_norm": 0.43220630288124084, + "learning_rate": 4.539666574022572e-07, + "loss": 0.2848, + "step": 48678 + }, + { + "epoch": 0.9037475447696632, + "grad_norm": 0.2816605865955353, + "learning_rate": 4.5361924815032435e-07, + "loss": 0.5339, + "step": 48680 + }, + { + "epoch": 0.9037846749070818, + "grad_norm": 0.4373616576194763, + "learning_rate": 4.532719687960363e-07, + "loss": 0.229, + "step": 48682 + }, + { + "epoch": 0.9038218050445005, + "grad_norm": 0.47878390550613403, + "learning_rate": 4.5292481934412267e-07, + "loss": 0.4327, + "step": 48684 + }, + { + "epoch": 0.9038589351819191, + "grad_norm": 0.4835664629936218, + "learning_rate": 4.525777997993042e-07, + "loss": 0.3592, + "step": 48686 + }, + { + "epoch": 0.9038960653193377, + "grad_norm": 0.35434237122535706, + "learning_rate": 4.5223091016630474e-07, + "loss": 0.2468, + "step": 48688 + }, + { + "epoch": 0.9039331954567564, + "grad_norm": 0.2538731098175049, + "learning_rate": 4.518841504498439e-07, + "loss": 0.1439, + "step": 48690 + }, + { + "epoch": 0.903970325594175, + "grad_norm": 0.40721482038497925, + "learning_rate": 4.515375206546402e-07, + "loss": 0.0177, + "step": 48692 + }, + { + "epoch": 0.9040074557315937, + "grad_norm": 0.24740934371948242, + "learning_rate": 4.5119102078540865e-07, + "loss": 0.3613, + "step": 48694 + }, + { + "epoch": 0.9040445858690123, + "grad_norm": 0.29808369278907776, + "learning_rate": 4.5084465084686556e-07, + "loss": 0.3033, + "step": 48696 + }, + { + "epoch": 0.904081716006431, + "grad_norm": 0.4125361144542694, + "learning_rate": 4.504984108437238e-07, + "loss": 0.2456, + "step": 48698 + }, + { + "epoch": 0.9041188461438496, + "grad_norm": 0.6495465040206909, + "learning_rate": 4.5015230078069626e-07, + "loss": 0.2548, + "step": 48700 + }, + { + "epoch": 0.9041559762812682, + "grad_norm": 0.4618299901485443, + "learning_rate": 4.498063206624892e-07, + "loss": 0.2063, + "step": 48702 + }, + { + "epoch": 0.9041931064186869, + "grad_norm": 0.7141796350479126, + "learning_rate": 4.4946047049381323e-07, + "loss": 0.3362, + "step": 48704 + }, + { + "epoch": 0.9042302365561055, + "grad_norm": 0.38592690229415894, + "learning_rate": 4.4911475027937137e-07, + "loss": 0.1687, + "step": 48706 + }, + { + "epoch": 0.9042673666935241, + "grad_norm": 0.5550637245178223, + "learning_rate": 4.4876916002387086e-07, + "loss": 0.2972, + "step": 48708 + }, + { + "epoch": 0.9043044968309427, + "grad_norm": 0.3366844356060028, + "learning_rate": 4.484236997320113e-07, + "loss": 0.1932, + "step": 48710 + }, + { + "epoch": 0.9043416269683614, + "grad_norm": 0.3520852029323578, + "learning_rate": 4.480783694084956e-07, + "loss": 0.2444, + "step": 48712 + }, + { + "epoch": 0.9043787571057801, + "grad_norm": 0.46371978521347046, + "learning_rate": 4.4773316905802335e-07, + "loss": 0.1259, + "step": 48714 + }, + { + "epoch": 0.9044158872431987, + "grad_norm": 0.440447598695755, + "learning_rate": 4.4738809868528964e-07, + "loss": 0.2993, + "step": 48716 + }, + { + "epoch": 0.9044530173806173, + "grad_norm": 0.34427210688591003, + "learning_rate": 4.470431582949897e-07, + "loss": 0.1995, + "step": 48718 + }, + { + "epoch": 0.9044901475180359, + "grad_norm": 0.4193849563598633, + "learning_rate": 4.466983478918174e-07, + "loss": 0.2159, + "step": 48720 + }, + { + "epoch": 0.9045272776554546, + "grad_norm": 0.5327879786491394, + "learning_rate": 4.4635366748046693e-07, + "loss": 0.1666, + "step": 48722 + }, + { + "epoch": 0.9045644077928733, + "grad_norm": 0.4924314022064209, + "learning_rate": 4.4600911706562555e-07, + "loss": 0.2975, + "step": 48724 + }, + { + "epoch": 0.9046015379302919, + "grad_norm": 0.35008883476257324, + "learning_rate": 4.45664696651984e-07, + "loss": 0.2516, + "step": 48726 + }, + { + "epoch": 0.9046386680677105, + "grad_norm": 0.644422173500061, + "learning_rate": 4.4532040624422624e-07, + "loss": 0.1305, + "step": 48728 + }, + { + "epoch": 0.9046757982051291, + "grad_norm": 0.3314388394355774, + "learning_rate": 4.449762458470375e-07, + "loss": 0.1492, + "step": 48730 + }, + { + "epoch": 0.9047129283425478, + "grad_norm": 0.339915007352829, + "learning_rate": 4.4463221546510173e-07, + "loss": 0.4234, + "step": 48732 + }, + { + "epoch": 0.9047500584799665, + "grad_norm": 0.5737096071243286, + "learning_rate": 4.442883151030997e-07, + "loss": 0.1858, + "step": 48734 + }, + { + "epoch": 0.904787188617385, + "grad_norm": 0.32395225763320923, + "learning_rate": 4.439445447657109e-07, + "loss": 0.2272, + "step": 48736 + }, + { + "epoch": 0.9048243187548037, + "grad_norm": 0.2530769407749176, + "learning_rate": 4.4360090445761397e-07, + "loss": 0.0963, + "step": 48738 + }, + { + "epoch": 0.9048614488922223, + "grad_norm": 0.5057935118675232, + "learning_rate": 4.43257394183485e-07, + "loss": 0.1918, + "step": 48740 + }, + { + "epoch": 0.904898579029641, + "grad_norm": 0.25706174969673157, + "learning_rate": 4.429140139479948e-07, + "loss": 0.1854, + "step": 48742 + }, + { + "epoch": 0.9049357091670597, + "grad_norm": 0.2767575681209564, + "learning_rate": 4.425707637558208e-07, + "loss": 0.1404, + "step": 48744 + }, + { + "epoch": 0.9049728393044782, + "grad_norm": 0.40217137336730957, + "learning_rate": 4.422276436116291e-07, + "loss": 0.3098, + "step": 48746 + }, + { + "epoch": 0.9050099694418969, + "grad_norm": 0.4474918842315674, + "learning_rate": 4.418846535200894e-07, + "loss": 0.411, + "step": 48748 + }, + { + "epoch": 0.9050470995793155, + "grad_norm": 0.45403483510017395, + "learning_rate": 4.4154179348587124e-07, + "loss": 0.4259, + "step": 48750 + }, + { + "epoch": 0.9050842297167342, + "grad_norm": 0.19300459325313568, + "learning_rate": 4.411990635136387e-07, + "loss": 0.2264, + "step": 48752 + }, + { + "epoch": 0.9051213598541529, + "grad_norm": 0.44730132818222046, + "learning_rate": 4.408564636080537e-07, + "loss": 0.4131, + "step": 48754 + }, + { + "epoch": 0.9051584899915714, + "grad_norm": 0.41435182094573975, + "learning_rate": 4.405139937737801e-07, + "loss": 0.206, + "step": 48756 + }, + { + "epoch": 0.9051956201289901, + "grad_norm": 0.3289092481136322, + "learning_rate": 4.401716540154766e-07, + "loss": 0.2669, + "step": 48758 + }, + { + "epoch": 0.9052327502664087, + "grad_norm": 0.46171995997428894, + "learning_rate": 4.398294443378015e-07, + "loss": 0.3089, + "step": 48760 + }, + { + "epoch": 0.9052698804038274, + "grad_norm": 0.31057822704315186, + "learning_rate": 4.394873647454112e-07, + "loss": 0.2744, + "step": 48762 + }, + { + "epoch": 0.905307010541246, + "grad_norm": 0.34856677055358887, + "learning_rate": 4.3914541524296304e-07, + "loss": 0.1834, + "step": 48764 + }, + { + "epoch": 0.9053441406786646, + "grad_norm": 0.5252789855003357, + "learning_rate": 4.388035958351078e-07, + "loss": 0.0549, + "step": 48766 + }, + { + "epoch": 0.9053812708160833, + "grad_norm": 0.3833712339401245, + "learning_rate": 4.38461906526495e-07, + "loss": 0.2972, + "step": 48768 + }, + { + "epoch": 0.9054184009535019, + "grad_norm": 0.2802814543247223, + "learning_rate": 4.381203473217754e-07, + "loss": 0.2576, + "step": 48770 + }, + { + "epoch": 0.9054555310909206, + "grad_norm": 0.3977065086364746, + "learning_rate": 4.3777891822559646e-07, + "loss": 0.3733, + "step": 48772 + }, + { + "epoch": 0.9054926612283392, + "grad_norm": 0.2784053385257721, + "learning_rate": 4.374376192426044e-07, + "loss": 0.2598, + "step": 48774 + }, + { + "epoch": 0.9055297913657578, + "grad_norm": 0.6640099883079529, + "learning_rate": 4.370964503774433e-07, + "loss": 0.2136, + "step": 48776 + }, + { + "epoch": 0.9055669215031765, + "grad_norm": 0.5974552035331726, + "learning_rate": 4.367554116347572e-07, + "loss": 0.1108, + "step": 48778 + }, + { + "epoch": 0.9056040516405951, + "grad_norm": 0.5266056656837463, + "learning_rate": 4.3641450301918244e-07, + "loss": 0.1761, + "step": 48780 + }, + { + "epoch": 0.9056411817780138, + "grad_norm": 0.39355018734931946, + "learning_rate": 4.360737245353608e-07, + "loss": 0.357, + "step": 48782 + }, + { + "epoch": 0.9056783119154324, + "grad_norm": 0.32268476486206055, + "learning_rate": 4.357330761879275e-07, + "loss": 0.3456, + "step": 48784 + }, + { + "epoch": 0.905715442052851, + "grad_norm": 0.5596379637718201, + "learning_rate": 4.35392557981521e-07, + "loss": 0.3907, + "step": 48786 + }, + { + "epoch": 0.9057525721902697, + "grad_norm": 0.41029390692710876, + "learning_rate": 4.3505216992077103e-07, + "loss": 0.1038, + "step": 48788 + }, + { + "epoch": 0.9057897023276883, + "grad_norm": 0.8101235628128052, + "learning_rate": 4.347119120103116e-07, + "loss": 0.3004, + "step": 48790 + }, + { + "epoch": 0.905826832465107, + "grad_norm": 0.3662549555301666, + "learning_rate": 4.343717842547701e-07, + "loss": 0.1158, + "step": 48792 + }, + { + "epoch": 0.9058639626025256, + "grad_norm": 0.23307795822620392, + "learning_rate": 4.3403178665877733e-07, + "loss": 0.2989, + "step": 48794 + }, + { + "epoch": 0.9059010927399442, + "grad_norm": 0.3276031017303467, + "learning_rate": 4.336919192269573e-07, + "loss": 0.3171, + "step": 48796 + }, + { + "epoch": 0.9059382228773629, + "grad_norm": 0.3484291732311249, + "learning_rate": 4.333521819639364e-07, + "loss": 0.2481, + "step": 48798 + }, + { + "epoch": 0.9059753530147815, + "grad_norm": 0.4199477434158325, + "learning_rate": 4.3301257487433743e-07, + "loss": 0.369, + "step": 48800 + }, + { + "epoch": 0.9060124831522002, + "grad_norm": 0.3740865886211395, + "learning_rate": 4.3267309796278136e-07, + "loss": 0.3463, + "step": 48802 + }, + { + "epoch": 0.9060496132896187, + "grad_norm": 0.5298373699188232, + "learning_rate": 4.3233375123388656e-07, + "loss": 0.2772, + "step": 48804 + }, + { + "epoch": 0.9060867434270374, + "grad_norm": 0.46103841066360474, + "learning_rate": 4.319945346922705e-07, + "loss": 0.336, + "step": 48806 + }, + { + "epoch": 0.9061238735644561, + "grad_norm": 0.40721645951271057, + "learning_rate": 4.316554483425506e-07, + "loss": 0.3884, + "step": 48808 + }, + { + "epoch": 0.9061610037018747, + "grad_norm": 0.35696664452552795, + "learning_rate": 4.313164921893376e-07, + "loss": 0.2778, + "step": 48810 + }, + { + "epoch": 0.9061981338392934, + "grad_norm": 0.4891475439071655, + "learning_rate": 4.309776662372467e-07, + "loss": 0.4953, + "step": 48812 + }, + { + "epoch": 0.906235263976712, + "grad_norm": 0.4484405815601349, + "learning_rate": 4.306389704908864e-07, + "loss": 0.2813, + "step": 48814 + }, + { + "epoch": 0.9062723941141306, + "grad_norm": 0.40783825516700745, + "learning_rate": 4.3030040495486757e-07, + "loss": 0.2898, + "step": 48816 + }, + { + "epoch": 0.9063095242515492, + "grad_norm": 0.3756624460220337, + "learning_rate": 4.299619696337942e-07, + "loss": 0.3026, + "step": 48818 + }, + { + "epoch": 0.9063466543889679, + "grad_norm": 0.2260306030511856, + "learning_rate": 4.2962366453227377e-07, + "loss": 0.3094, + "step": 48820 + }, + { + "epoch": 0.9063837845263866, + "grad_norm": 0.39833173155784607, + "learning_rate": 4.2928548965490814e-07, + "loss": 0.1883, + "step": 48822 + }, + { + "epoch": 0.9064209146638051, + "grad_norm": 0.23823966085910797, + "learning_rate": 4.2894744500629915e-07, + "loss": 0.0772, + "step": 48824 + }, + { + "epoch": 0.9064580448012238, + "grad_norm": 0.3064398467540741, + "learning_rate": 4.286095305910476e-07, + "loss": 0.3645, + "step": 48826 + }, + { + "epoch": 0.9064951749386424, + "grad_norm": 0.2669902443885803, + "learning_rate": 4.282717464137509e-07, + "loss": 0.3168, + "step": 48828 + }, + { + "epoch": 0.9065323050760611, + "grad_norm": 0.35009729862213135, + "learning_rate": 4.279340924790054e-07, + "loss": 0.2819, + "step": 48830 + }, + { + "epoch": 0.9065694352134798, + "grad_norm": 0.38395726680755615, + "learning_rate": 4.275965687914041e-07, + "loss": 0.31, + "step": 48832 + }, + { + "epoch": 0.9066065653508983, + "grad_norm": 0.2580207586288452, + "learning_rate": 4.272591753555411e-07, + "loss": 0.1479, + "step": 48834 + }, + { + "epoch": 0.906643695488317, + "grad_norm": 0.4046536087989807, + "learning_rate": 4.26921912176006e-07, + "loss": 0.3235, + "step": 48836 + }, + { + "epoch": 0.9066808256257356, + "grad_norm": 0.3600980341434479, + "learning_rate": 4.265847792573896e-07, + "loss": 0.414, + "step": 48838 + }, + { + "epoch": 0.9067179557631543, + "grad_norm": 0.3328019678592682, + "learning_rate": 4.2624777660428053e-07, + "loss": 0.2962, + "step": 48840 + }, + { + "epoch": 0.906755085900573, + "grad_norm": 0.3814980983734131, + "learning_rate": 4.259109042212606e-07, + "loss": 0.1527, + "step": 48842 + }, + { + "epoch": 0.9067922160379915, + "grad_norm": 0.3758595287799835, + "learning_rate": 4.2557416211291614e-07, + "loss": 0.3528, + "step": 48844 + }, + { + "epoch": 0.9068293461754102, + "grad_norm": 0.3968588709831238, + "learning_rate": 4.2523755028382795e-07, + "loss": 0.2607, + "step": 48846 + }, + { + "epoch": 0.9068664763128288, + "grad_norm": 0.5646262764930725, + "learning_rate": 4.2490106873857907e-07, + "loss": 0.1548, + "step": 48848 + }, + { + "epoch": 0.9069036064502475, + "grad_norm": 0.38165464997291565, + "learning_rate": 4.245647174817435e-07, + "loss": 0.4244, + "step": 48850 + }, + { + "epoch": 0.9069407365876662, + "grad_norm": 0.3180619776248932, + "learning_rate": 4.242284965179011e-07, + "loss": 0.3808, + "step": 48852 + }, + { + "epoch": 0.9069778667250847, + "grad_norm": 0.5579198598861694, + "learning_rate": 4.2389240585162694e-07, + "loss": 0.2316, + "step": 48854 + }, + { + "epoch": 0.9070149968625034, + "grad_norm": 0.3150941729545593, + "learning_rate": 4.2355644548749296e-07, + "loss": 0.2236, + "step": 48856 + }, + { + "epoch": 0.907052126999922, + "grad_norm": 0.5002660155296326, + "learning_rate": 4.2322061543007e-07, + "loss": 0.1426, + "step": 48858 + }, + { + "epoch": 0.9070892571373407, + "grad_norm": 0.5429056882858276, + "learning_rate": 4.2288491568392873e-07, + "loss": 0.4129, + "step": 48860 + }, + { + "epoch": 0.9071263872747592, + "grad_norm": 0.3593650758266449, + "learning_rate": 4.2254934625363787e-07, + "loss": 0.1739, + "step": 48862 + }, + { + "epoch": 0.9071635174121779, + "grad_norm": 0.48075568675994873, + "learning_rate": 4.222139071437625e-07, + "loss": 0.3493, + "step": 48864 + }, + { + "epoch": 0.9072006475495966, + "grad_norm": 0.5086880922317505, + "learning_rate": 4.21878598358868e-07, + "loss": 0.2805, + "step": 48866 + }, + { + "epoch": 0.9072377776870152, + "grad_norm": 0.26627203822135925, + "learning_rate": 4.21543419903514e-07, + "loss": 0.303, + "step": 48868 + }, + { + "epoch": 0.9072749078244339, + "grad_norm": 0.38790273666381836, + "learning_rate": 4.212083717822646e-07, + "loss": 0.3013, + "step": 48870 + }, + { + "epoch": 0.9073120379618524, + "grad_norm": 0.30810248851776123, + "learning_rate": 4.208734539996773e-07, + "loss": 0.1382, + "step": 48872 + }, + { + "epoch": 0.9073491680992711, + "grad_norm": 0.48359960317611694, + "learning_rate": 4.205386665603095e-07, + "loss": 0.3672, + "step": 48874 + }, + { + "epoch": 0.9073862982366898, + "grad_norm": 0.9051299691200256, + "learning_rate": 4.202040094687154e-07, + "loss": 0.3244, + "step": 48876 + }, + { + "epoch": 0.9074234283741084, + "grad_norm": 0.22864651679992676, + "learning_rate": 4.198694827294525e-07, + "loss": 0.3099, + "step": 48878 + }, + { + "epoch": 0.9074605585115271, + "grad_norm": 0.4373929500579834, + "learning_rate": 4.195350863470682e-07, + "loss": 0.2349, + "step": 48880 + }, + { + "epoch": 0.9074976886489456, + "grad_norm": 0.358294814825058, + "learning_rate": 4.192008203261144e-07, + "loss": 0.1118, + "step": 48882 + }, + { + "epoch": 0.9075348187863643, + "grad_norm": 0.22718486189842224, + "learning_rate": 4.1886668467113976e-07, + "loss": 0.1764, + "step": 48884 + }, + { + "epoch": 0.907571948923783, + "grad_norm": 0.4960615634918213, + "learning_rate": 4.185326793866906e-07, + "loss": 0.4519, + "step": 48886 + }, + { + "epoch": 0.9076090790612016, + "grad_norm": 0.46453729271888733, + "learning_rate": 4.181988044773122e-07, + "loss": 0.3567, + "step": 48888 + }, + { + "epoch": 0.9076462091986203, + "grad_norm": 0.4068054258823395, + "learning_rate": 4.1786505994754865e-07, + "loss": 0.2467, + "step": 48890 + }, + { + "epoch": 0.9076833393360388, + "grad_norm": 0.4161670506000519, + "learning_rate": 4.175314458019375e-07, + "loss": 0.166, + "step": 48892 + }, + { + "epoch": 0.9077204694734575, + "grad_norm": 0.6045944690704346, + "learning_rate": 4.1719796204502284e-07, + "loss": 0.2245, + "step": 48894 + }, + { + "epoch": 0.9077575996108762, + "grad_norm": 0.4963160753250122, + "learning_rate": 4.168646086813388e-07, + "loss": 0.4962, + "step": 48896 + }, + { + "epoch": 0.9077947297482948, + "grad_norm": 0.3763733506202698, + "learning_rate": 4.1653138571542296e-07, + "loss": 0.32, + "step": 48898 + }, + { + "epoch": 0.9078318598857135, + "grad_norm": 0.41575342416763306, + "learning_rate": 4.1619829315180825e-07, + "loss": 0.2322, + "step": 48900 + }, + { + "epoch": 0.907868990023132, + "grad_norm": 0.6435741782188416, + "learning_rate": 4.158653309950278e-07, + "loss": 0.2855, + "step": 48902 + }, + { + "epoch": 0.9079061201605507, + "grad_norm": 0.29290878772735596, + "learning_rate": 4.155324992496146e-07, + "loss": 0.1809, + "step": 48904 + }, + { + "epoch": 0.9079432502979694, + "grad_norm": 0.4652480185031891, + "learning_rate": 4.151997979200939e-07, + "loss": 0.2356, + "step": 48906 + }, + { + "epoch": 0.907980380435388, + "grad_norm": 0.27646857500076294, + "learning_rate": 4.148672270109932e-07, + "loss": 0.1333, + "step": 48908 + }, + { + "epoch": 0.9080175105728067, + "grad_norm": 0.5264910459518433, + "learning_rate": 4.1453478652683897e-07, + "loss": 0.366, + "step": 48910 + }, + { + "epoch": 0.9080546407102252, + "grad_norm": 0.33842936158180237, + "learning_rate": 4.142024764721564e-07, + "loss": 0.4331, + "step": 48912 + }, + { + "epoch": 0.9080917708476439, + "grad_norm": 0.30302634835243225, + "learning_rate": 4.138702968514641e-07, + "loss": 0.3809, + "step": 48914 + }, + { + "epoch": 0.9081289009850625, + "grad_norm": 0.2515854835510254, + "learning_rate": 4.1353824766928396e-07, + "loss": 0.2734, + "step": 48916 + }, + { + "epoch": 0.9081660311224812, + "grad_norm": 0.3847886919975281, + "learning_rate": 4.1320632893013244e-07, + "loss": 0.1305, + "step": 48918 + }, + { + "epoch": 0.9082031612598999, + "grad_norm": 0.3655715584754944, + "learning_rate": 4.128745406385271e-07, + "loss": 0.2396, + "step": 48920 + }, + { + "epoch": 0.9082402913973184, + "grad_norm": 0.41779062151908875, + "learning_rate": 4.1254288279898303e-07, + "loss": 0.2914, + "step": 48922 + }, + { + "epoch": 0.9082774215347371, + "grad_norm": 0.31139156222343445, + "learning_rate": 4.122113554160123e-07, + "loss": 0.224, + "step": 48924 + }, + { + "epoch": 0.9083145516721557, + "grad_norm": 0.5558975338935852, + "learning_rate": 4.118799584941258e-07, + "loss": 0.3412, + "step": 48926 + }, + { + "epoch": 0.9083516818095744, + "grad_norm": 0.25206711888313293, + "learning_rate": 4.1154869203783423e-07, + "loss": 0.198, + "step": 48928 + }, + { + "epoch": 0.9083888119469931, + "grad_norm": 0.5340746641159058, + "learning_rate": 4.1121755605164405e-07, + "loss": 0.2658, + "step": 48930 + }, + { + "epoch": 0.9084259420844116, + "grad_norm": 0.4344080686569214, + "learning_rate": 4.1088655054006056e-07, + "loss": 0.2056, + "step": 48932 + }, + { + "epoch": 0.9084630722218303, + "grad_norm": 0.2510961592197418, + "learning_rate": 4.1055567550759013e-07, + "loss": 0.3277, + "step": 48934 + }, + { + "epoch": 0.9085002023592489, + "grad_norm": 0.27498647570610046, + "learning_rate": 4.102249309587314e-07, + "loss": 0.1242, + "step": 48936 + }, + { + "epoch": 0.9085373324966676, + "grad_norm": 0.35060954093933105, + "learning_rate": 4.0989431689798633e-07, + "loss": 0.4163, + "step": 48938 + }, + { + "epoch": 0.9085744626340863, + "grad_norm": 0.7289052605628967, + "learning_rate": 4.095638333298546e-07, + "loss": 0.2473, + "step": 48940 + }, + { + "epoch": 0.9086115927715048, + "grad_norm": 0.33154094219207764, + "learning_rate": 4.0923348025883383e-07, + "loss": 0.4336, + "step": 48942 + }, + { + "epoch": 0.9086487229089235, + "grad_norm": 0.37724632024765015, + "learning_rate": 4.0890325768941586e-07, + "loss": 0.2498, + "step": 48944 + }, + { + "epoch": 0.9086858530463421, + "grad_norm": 0.3685709536075592, + "learning_rate": 4.0857316562609717e-07, + "loss": 0.3557, + "step": 48946 + }, + { + "epoch": 0.9087229831837608, + "grad_norm": 0.4333646297454834, + "learning_rate": 4.082432040733664e-07, + "loss": 0.3655, + "step": 48948 + }, + { + "epoch": 0.9087601133211795, + "grad_norm": 0.3445500135421753, + "learning_rate": 4.079133730357154e-07, + "loss": 0.2101, + "step": 48950 + }, + { + "epoch": 0.908797243458598, + "grad_norm": 0.6022473573684692, + "learning_rate": 4.07583672517633e-07, + "loss": 0.1489, + "step": 48952 + }, + { + "epoch": 0.9088343735960167, + "grad_norm": 0.47513312101364136, + "learning_rate": 4.072541025236043e-07, + "loss": 0.3182, + "step": 48954 + }, + { + "epoch": 0.9088715037334353, + "grad_norm": 0.36885449290275574, + "learning_rate": 4.0692466305811363e-07, + "loss": 0.2489, + "step": 48956 + }, + { + "epoch": 0.908908633870854, + "grad_norm": 0.4169812798500061, + "learning_rate": 4.0659535412564287e-07, + "loss": 0.4074, + "step": 48958 + }, + { + "epoch": 0.9089457640082726, + "grad_norm": 0.5923995971679688, + "learning_rate": 4.0626617573067295e-07, + "loss": 0.3072, + "step": 48960 + }, + { + "epoch": 0.9089828941456912, + "grad_norm": 0.4372858703136444, + "learning_rate": 4.059371278776847e-07, + "loss": 0.2164, + "step": 48962 + }, + { + "epoch": 0.9090200242831099, + "grad_norm": 0.5354858040809631, + "learning_rate": 4.0560821057115454e-07, + "loss": 0.2762, + "step": 48964 + }, + { + "epoch": 0.9090571544205285, + "grad_norm": 0.4759131968021393, + "learning_rate": 4.0527942381555886e-07, + "loss": 0.2979, + "step": 48966 + }, + { + "epoch": 0.9090942845579472, + "grad_norm": 0.2577284872531891, + "learning_rate": 4.0495076761536966e-07, + "loss": 0.1724, + "step": 48968 + }, + { + "epoch": 0.9091314146953657, + "grad_norm": 0.3899078369140625, + "learning_rate": 4.046222419750601e-07, + "loss": 0.384, + "step": 48970 + }, + { + "epoch": 0.9091685448327844, + "grad_norm": 0.467097669839859, + "learning_rate": 4.042938468991009e-07, + "loss": 0.247, + "step": 48972 + }, + { + "epoch": 0.9092056749702031, + "grad_norm": 0.39911025762557983, + "learning_rate": 4.0396558239195973e-07, + "loss": 0.2942, + "step": 48974 + }, + { + "epoch": 0.9092428051076217, + "grad_norm": 0.27958929538726807, + "learning_rate": 4.0363744845810405e-07, + "loss": 0.1008, + "step": 48976 + }, + { + "epoch": 0.9092799352450404, + "grad_norm": 0.18853436410427094, + "learning_rate": 4.03309445101997e-07, + "loss": 0.2228, + "step": 48978 + }, + { + "epoch": 0.9093170653824589, + "grad_norm": 0.5499475002288818, + "learning_rate": 4.02981572328105e-07, + "loss": 0.1579, + "step": 48980 + }, + { + "epoch": 0.9093541955198776, + "grad_norm": 0.44660812616348267, + "learning_rate": 4.0265383014088555e-07, + "loss": 0.3534, + "step": 48982 + }, + { + "epoch": 0.9093913256572963, + "grad_norm": 0.4025813639163971, + "learning_rate": 4.023262185448007e-07, + "loss": 0.1882, + "step": 48984 + }, + { + "epoch": 0.9094284557947149, + "grad_norm": 0.5073099732398987, + "learning_rate": 4.01998737544308e-07, + "loss": 0.33, + "step": 48986 + }, + { + "epoch": 0.9094655859321336, + "grad_norm": 0.37051287293434143, + "learning_rate": 4.016713871438616e-07, + "loss": 0.3965, + "step": 48988 + }, + { + "epoch": 0.9095027160695521, + "grad_norm": 0.6194114089012146, + "learning_rate": 4.0134416734791904e-07, + "loss": 0.2328, + "step": 48990 + }, + { + "epoch": 0.9095398462069708, + "grad_norm": 0.3913637101650238, + "learning_rate": 4.010170781609313e-07, + "loss": 0.2516, + "step": 48992 + }, + { + "epoch": 0.9095769763443895, + "grad_norm": 0.597536563873291, + "learning_rate": 4.006901195873469e-07, + "loss": 0.1999, + "step": 48994 + }, + { + "epoch": 0.9096141064818081, + "grad_norm": 0.31112778186798096, + "learning_rate": 4.00363291631618e-07, + "loss": 0.3492, + "step": 48996 + }, + { + "epoch": 0.9096512366192268, + "grad_norm": 0.3149658441543579, + "learning_rate": 4.0003659429819097e-07, + "loss": 0.1568, + "step": 48998 + }, + { + "epoch": 0.9096883667566453, + "grad_norm": 0.44385287165641785, + "learning_rate": 3.9971002759151e-07, + "loss": 0.2742, + "step": 49000 + }, + { + "epoch": 0.909725496894064, + "grad_norm": 0.4968036711215973, + "learning_rate": 3.993835915160194e-07, + "loss": 0.2495, + "step": 49002 + }, + { + "epoch": 0.9097626270314827, + "grad_norm": 0.31728169322013855, + "learning_rate": 3.990572860761621e-07, + "loss": 0.3141, + "step": 49004 + }, + { + "epoch": 0.9097997571689013, + "grad_norm": 0.42736145853996277, + "learning_rate": 3.9873111127637476e-07, + "loss": 0.1715, + "step": 49006 + }, + { + "epoch": 0.90983688730632, + "grad_norm": 0.4397246241569519, + "learning_rate": 3.9840506712109927e-07, + "loss": 0.2555, + "step": 49008 + }, + { + "epoch": 0.9098740174437385, + "grad_norm": 0.2928980886936188, + "learning_rate": 3.980791536147699e-07, + "loss": 0.3205, + "step": 49010 + }, + { + "epoch": 0.9099111475811572, + "grad_norm": 0.5345935225486755, + "learning_rate": 3.97753370761822e-07, + "loss": 0.1594, + "step": 49012 + }, + { + "epoch": 0.9099482777185758, + "grad_norm": 0.40242403745651245, + "learning_rate": 3.974277185666886e-07, + "loss": 0.2283, + "step": 49014 + }, + { + "epoch": 0.9099854078559945, + "grad_norm": 0.4264202415943146, + "learning_rate": 3.971021970338029e-07, + "loss": 0.3024, + "step": 49016 + }, + { + "epoch": 0.9100225379934131, + "grad_norm": 0.43068256974220276, + "learning_rate": 3.967768061675903e-07, + "loss": 0.2781, + "step": 49018 + }, + { + "epoch": 0.9100596681308317, + "grad_norm": 0.48553401231765747, + "learning_rate": 3.964515459724827e-07, + "loss": 0.3174, + "step": 49020 + }, + { + "epoch": 0.9100967982682504, + "grad_norm": 0.47729888558387756, + "learning_rate": 3.9612641645290106e-07, + "loss": 0.3663, + "step": 49022 + }, + { + "epoch": 0.910133928405669, + "grad_norm": 0.8070148229598999, + "learning_rate": 3.9580141761327297e-07, + "loss": 0.1778, + "step": 49024 + }, + { + "epoch": 0.9101710585430877, + "grad_norm": 0.5114554762840271, + "learning_rate": 3.9547654945801927e-07, + "loss": 0.2263, + "step": 49026 + }, + { + "epoch": 0.9102081886805063, + "grad_norm": 0.3578938841819763, + "learning_rate": 3.9515181199156095e-07, + "loss": 0.344, + "step": 49028 + }, + { + "epoch": 0.9102453188179249, + "grad_norm": 0.7296833395957947, + "learning_rate": 3.9482720521831773e-07, + "loss": 0.1101, + "step": 49030 + }, + { + "epoch": 0.9102824489553436, + "grad_norm": 0.46377238631248474, + "learning_rate": 3.945027291427039e-07, + "loss": 0.3456, + "step": 49032 + }, + { + "epoch": 0.9103195790927622, + "grad_norm": 0.3783925175666809, + "learning_rate": 3.941783837691371e-07, + "loss": 0.3248, + "step": 49034 + }, + { + "epoch": 0.9103567092301809, + "grad_norm": 0.41273605823516846, + "learning_rate": 3.938541691020281e-07, + "loss": 0.2744, + "step": 49036 + }, + { + "epoch": 0.9103938393675995, + "grad_norm": 0.5084975957870483, + "learning_rate": 3.9353008514579125e-07, + "loss": 0.203, + "step": 49038 + }, + { + "epoch": 0.9104309695050181, + "grad_norm": 0.38751763105392456, + "learning_rate": 3.932061319048364e-07, + "loss": 0.3638, + "step": 49040 + }, + { + "epoch": 0.9104680996424368, + "grad_norm": 0.28096672892570496, + "learning_rate": 3.9288230938357095e-07, + "loss": 0.3452, + "step": 49042 + }, + { + "epoch": 0.9105052297798554, + "grad_norm": 0.3201262056827545, + "learning_rate": 3.9255861758639824e-07, + "loss": 0.1162, + "step": 49044 + }, + { + "epoch": 0.9105423599172741, + "grad_norm": 0.4111940264701843, + "learning_rate": 3.9223505651772574e-07, + "loss": 0.1254, + "step": 49046 + }, + { + "epoch": 0.9105794900546927, + "grad_norm": 0.5038614273071289, + "learning_rate": 3.9191162618195553e-07, + "loss": 0.2394, + "step": 49048 + }, + { + "epoch": 0.9106166201921113, + "grad_norm": 0.38402634859085083, + "learning_rate": 3.9158832658348854e-07, + "loss": 0.2193, + "step": 49050 + }, + { + "epoch": 0.91065375032953, + "grad_norm": 0.48965758085250854, + "learning_rate": 3.9126515772672345e-07, + "loss": 0.2713, + "step": 49052 + }, + { + "epoch": 0.9106908804669486, + "grad_norm": 0.24301788210868835, + "learning_rate": 3.909421196160601e-07, + "loss": 0.2083, + "step": 49054 + }, + { + "epoch": 0.9107280106043673, + "grad_norm": 0.30776247382164, + "learning_rate": 3.906192122558905e-07, + "loss": 0.2991, + "step": 49056 + }, + { + "epoch": 0.9107651407417859, + "grad_norm": 0.6241234540939331, + "learning_rate": 3.9029643565061006e-07, + "loss": 0.1556, + "step": 49058 + }, + { + "epoch": 0.9108022708792045, + "grad_norm": 0.39345285296440125, + "learning_rate": 3.8997378980460964e-07, + "loss": 0.4543, + "step": 49060 + }, + { + "epoch": 0.9108394010166232, + "grad_norm": 0.7426944375038147, + "learning_rate": 3.896512747222836e-07, + "loss": 0.2158, + "step": 49062 + }, + { + "epoch": 0.9108765311540418, + "grad_norm": 0.36365142464637756, + "learning_rate": 3.8932889040801503e-07, + "loss": 0.3242, + "step": 49064 + }, + { + "epoch": 0.9109136612914605, + "grad_norm": 0.25166434049606323, + "learning_rate": 3.8900663686619266e-07, + "loss": 0.2134, + "step": 49066 + }, + { + "epoch": 0.910950791428879, + "grad_norm": 0.41440215706825256, + "learning_rate": 3.886845141012041e-07, + "loss": 0.2634, + "step": 49068 + }, + { + "epoch": 0.9109879215662977, + "grad_norm": 0.4218740463256836, + "learning_rate": 3.883625221174281e-07, + "loss": 0.3879, + "step": 49070 + }, + { + "epoch": 0.9110250517037164, + "grad_norm": 0.23077178001403809, + "learning_rate": 3.8804066091924773e-07, + "loss": 0.2697, + "step": 49072 + }, + { + "epoch": 0.911062181841135, + "grad_norm": 0.3843381106853485, + "learning_rate": 3.8771893051104406e-07, + "loss": 0.2875, + "step": 49074 + }, + { + "epoch": 0.9110993119785536, + "grad_norm": 0.9045064449310303, + "learning_rate": 3.873973308971923e-07, + "loss": 0.0794, + "step": 49076 + }, + { + "epoch": 0.9111364421159722, + "grad_norm": 0.2245081067085266, + "learning_rate": 3.8707586208207026e-07, + "loss": 0.2742, + "step": 49078 + }, + { + "epoch": 0.9111735722533909, + "grad_norm": 0.2783846855163574, + "learning_rate": 3.867545240700532e-07, + "loss": 0.2055, + "step": 49080 + }, + { + "epoch": 0.9112107023908096, + "grad_norm": 0.3626197278499603, + "learning_rate": 3.8643331686550987e-07, + "loss": 0.2602, + "step": 49082 + }, + { + "epoch": 0.9112478325282282, + "grad_norm": 0.5653846859931946, + "learning_rate": 3.861122404728157e-07, + "loss": 0.0797, + "step": 49084 + }, + { + "epoch": 0.9112849626656468, + "grad_norm": 0.17433226108551025, + "learning_rate": 3.857912948963349e-07, + "loss": 0.2793, + "step": 49086 + }, + { + "epoch": 0.9113220928030654, + "grad_norm": 0.4426973760128021, + "learning_rate": 3.8547048014043744e-07, + "loss": 0.4072, + "step": 49088 + }, + { + "epoch": 0.9113592229404841, + "grad_norm": 0.43427780270576477, + "learning_rate": 3.851497962094863e-07, + "loss": 0.266, + "step": 49090 + }, + { + "epoch": 0.9113963530779028, + "grad_norm": 0.6094973683357239, + "learning_rate": 3.848292431078493e-07, + "loss": 0.5219, + "step": 49092 + }, + { + "epoch": 0.9114334832153214, + "grad_norm": 0.4207667112350464, + "learning_rate": 3.8450882083988394e-07, + "loss": 0.2247, + "step": 49094 + }, + { + "epoch": 0.91147061335274, + "grad_norm": 0.48359277844429016, + "learning_rate": 3.841885294099523e-07, + "loss": 0.1877, + "step": 49096 + }, + { + "epoch": 0.9115077434901586, + "grad_norm": 0.3277426064014435, + "learning_rate": 3.83868368822411e-07, + "loss": 0.2218, + "step": 49098 + }, + { + "epoch": 0.9115448736275773, + "grad_norm": 0.33029836416244507, + "learning_rate": 3.835483390816186e-07, + "loss": 0.1454, + "step": 49100 + }, + { + "epoch": 0.911582003764996, + "grad_norm": 0.6335600018501282, + "learning_rate": 3.832284401919295e-07, + "loss": 0.299, + "step": 49102 + }, + { + "epoch": 0.9116191339024146, + "grad_norm": 0.3945949971675873, + "learning_rate": 3.829086721576947e-07, + "loss": 0.194, + "step": 49104 + }, + { + "epoch": 0.9116562640398332, + "grad_norm": 0.41849344968795776, + "learning_rate": 3.8258903498326727e-07, + "loss": 0.4278, + "step": 49106 + }, + { + "epoch": 0.9116933941772518, + "grad_norm": 0.5891265273094177, + "learning_rate": 3.822695286729938e-07, + "loss": 0.2895, + "step": 49108 + }, + { + "epoch": 0.9117305243146705, + "grad_norm": 0.2867845892906189, + "learning_rate": 3.819501532312242e-07, + "loss": 0.2817, + "step": 49110 + }, + { + "epoch": 0.9117676544520892, + "grad_norm": 0.35573574900627136, + "learning_rate": 3.8163090866230377e-07, + "loss": 0.3439, + "step": 49112 + }, + { + "epoch": 0.9118047845895078, + "grad_norm": 0.3361247777938843, + "learning_rate": 3.813117949705769e-07, + "loss": 0.2586, + "step": 49114 + }, + { + "epoch": 0.9118419147269264, + "grad_norm": 0.5398594737052917, + "learning_rate": 3.8099281216038453e-07, + "loss": 0.3119, + "step": 49116 + }, + { + "epoch": 0.911879044864345, + "grad_norm": 0.4364480972290039, + "learning_rate": 3.806739602360687e-07, + "loss": 0.3951, + "step": 49118 + }, + { + "epoch": 0.9119161750017637, + "grad_norm": 0.22037175297737122, + "learning_rate": 3.8035523920196605e-07, + "loss": 0.3499, + "step": 49120 + }, + { + "epoch": 0.9119533051391823, + "grad_norm": 0.3939928114414215, + "learning_rate": 3.8003664906241413e-07, + "loss": 0.1694, + "step": 49122 + }, + { + "epoch": 0.911990435276601, + "grad_norm": 0.39297863841056824, + "learning_rate": 3.7971818982174834e-07, + "loss": 0.2824, + "step": 49124 + }, + { + "epoch": 0.9120275654140196, + "grad_norm": 0.36904844641685486, + "learning_rate": 3.793998614843042e-07, + "loss": 0.3212, + "step": 49126 + }, + { + "epoch": 0.9120646955514382, + "grad_norm": 0.4340082108974457, + "learning_rate": 3.790816640544082e-07, + "loss": 0.3197, + "step": 49128 + }, + { + "epoch": 0.9121018256888569, + "grad_norm": 0.3527190089225769, + "learning_rate": 3.787635975363946e-07, + "loss": 0.0881, + "step": 49130 + }, + { + "epoch": 0.9121389558262755, + "grad_norm": 0.2179195135831833, + "learning_rate": 3.784456619345889e-07, + "loss": 0.2133, + "step": 49132 + }, + { + "epoch": 0.9121760859636941, + "grad_norm": 0.5752211213111877, + "learning_rate": 3.7812785725331756e-07, + "loss": 0.3775, + "step": 49134 + }, + { + "epoch": 0.9122132161011128, + "grad_norm": 0.3137204647064209, + "learning_rate": 3.77810183496905e-07, + "loss": 0.3147, + "step": 49136 + }, + { + "epoch": 0.9122503462385314, + "grad_norm": 0.3498871922492981, + "learning_rate": 3.7749264066967327e-07, + "loss": 0.2392, + "step": 49138 + }, + { + "epoch": 0.9122874763759501, + "grad_norm": 0.4859921634197235, + "learning_rate": 3.771752287759445e-07, + "loss": 0.1795, + "step": 49140 + }, + { + "epoch": 0.9123246065133687, + "grad_norm": 0.515813410282135, + "learning_rate": 3.768579478200374e-07, + "loss": 0.4419, + "step": 49142 + }, + { + "epoch": 0.9123617366507873, + "grad_norm": 0.3591817617416382, + "learning_rate": 3.7654079780627075e-07, + "loss": 0.2977, + "step": 49144 + }, + { + "epoch": 0.912398866788206, + "grad_norm": 0.4341873228549957, + "learning_rate": 3.7622377873895664e-07, + "loss": 0.2359, + "step": 49146 + }, + { + "epoch": 0.9124359969256246, + "grad_norm": 0.5378904342651367, + "learning_rate": 3.759068906224106e-07, + "loss": 0.087, + "step": 49148 + }, + { + "epoch": 0.9124731270630433, + "grad_norm": 0.5372897386550903, + "learning_rate": 3.7559013346094463e-07, + "loss": 0.2695, + "step": 49150 + }, + { + "epoch": 0.9125102572004619, + "grad_norm": 0.3655979633331299, + "learning_rate": 3.752735072588676e-07, + "loss": 0.2166, + "step": 49152 + }, + { + "epoch": 0.9125473873378805, + "grad_norm": 0.45120567083358765, + "learning_rate": 3.7495701202048817e-07, + "loss": 0.1716, + "step": 49154 + }, + { + "epoch": 0.9125845174752992, + "grad_norm": 0.8137903809547424, + "learning_rate": 3.7464064775011633e-07, + "loss": 0.3569, + "step": 49156 + }, + { + "epoch": 0.9126216476127178, + "grad_norm": 0.3004733920097351, + "learning_rate": 3.743244144520519e-07, + "loss": 0.1815, + "step": 49158 + }, + { + "epoch": 0.9126587777501365, + "grad_norm": 0.3396545648574829, + "learning_rate": 3.740083121305993e-07, + "loss": 0.2561, + "step": 49160 + }, + { + "epoch": 0.9126959078875551, + "grad_norm": 0.400591641664505, + "learning_rate": 3.736923407900606e-07, + "loss": 0.3533, + "step": 49162 + }, + { + "epoch": 0.9127330380249737, + "grad_norm": 0.5410148501396179, + "learning_rate": 3.733765004347356e-07, + "loss": 0.1558, + "step": 49164 + }, + { + "epoch": 0.9127701681623923, + "grad_norm": 0.49031898379325867, + "learning_rate": 3.730607910689221e-07, + "loss": 0.3002, + "step": 49166 + }, + { + "epoch": 0.912807298299811, + "grad_norm": 0.4291669428348541, + "learning_rate": 3.7274521269691446e-07, + "loss": 0.279, + "step": 49168 + }, + { + "epoch": 0.9128444284372297, + "grad_norm": 0.3539947271347046, + "learning_rate": 3.7242976532300913e-07, + "loss": 0.22, + "step": 49170 + }, + { + "epoch": 0.9128815585746483, + "grad_norm": 0.3904728889465332, + "learning_rate": 3.7211444895149493e-07, + "loss": 0.4332, + "step": 49172 + }, + { + "epoch": 0.9129186887120669, + "grad_norm": 0.36876776814460754, + "learning_rate": 3.717992635866641e-07, + "loss": 0.3549, + "step": 49174 + }, + { + "epoch": 0.9129558188494855, + "grad_norm": 0.38211798667907715, + "learning_rate": 3.7148420923280527e-07, + "loss": 0.3592, + "step": 49176 + }, + { + "epoch": 0.9129929489869042, + "grad_norm": 0.31899258494377136, + "learning_rate": 3.711692858942062e-07, + "loss": 0.3396, + "step": 49178 + }, + { + "epoch": 0.9130300791243229, + "grad_norm": 0.48154133558273315, + "learning_rate": 3.7085449357515237e-07, + "loss": 0.2658, + "step": 49180 + }, + { + "epoch": 0.9130672092617415, + "grad_norm": 0.4364120662212372, + "learning_rate": 3.7053983227992476e-07, + "loss": 0.249, + "step": 49182 + }, + { + "epoch": 0.9131043393991601, + "grad_norm": 0.342742383480072, + "learning_rate": 3.702253020128066e-07, + "loss": 0.1758, + "step": 49184 + }, + { + "epoch": 0.9131414695365787, + "grad_norm": 0.4467649757862091, + "learning_rate": 3.6991090277807785e-07, + "loss": 0.3164, + "step": 49186 + }, + { + "epoch": 0.9131785996739974, + "grad_norm": 0.6182628273963928, + "learning_rate": 3.695966345800173e-07, + "loss": 0.1756, + "step": 49188 + }, + { + "epoch": 0.9132157298114161, + "grad_norm": 0.42345836758613586, + "learning_rate": 3.6928249742289924e-07, + "loss": 0.4751, + "step": 49190 + }, + { + "epoch": 0.9132528599488346, + "grad_norm": 0.3199913203716278, + "learning_rate": 3.689684913109992e-07, + "loss": 0.1678, + "step": 49192 + }, + { + "epoch": 0.9132899900862533, + "grad_norm": 0.5057575106620789, + "learning_rate": 3.6865461624859043e-07, + "loss": 0.2928, + "step": 49194 + }, + { + "epoch": 0.9133271202236719, + "grad_norm": 0.27833977341651917, + "learning_rate": 3.6834087223994174e-07, + "loss": 0.2423, + "step": 49196 + }, + { + "epoch": 0.9133642503610906, + "grad_norm": 0.4165906608104706, + "learning_rate": 3.6802725928932417e-07, + "loss": 0.2376, + "step": 49198 + }, + { + "epoch": 0.9134013804985093, + "grad_norm": 0.36996570229530334, + "learning_rate": 3.6771377740100424e-07, + "loss": 0.3135, + "step": 49200 + }, + { + "epoch": 0.9134385106359278, + "grad_norm": 0.3783271908760071, + "learning_rate": 3.674004265792475e-07, + "loss": 0.3374, + "step": 49202 + }, + { + "epoch": 0.9134756407733465, + "grad_norm": 0.49599137902259827, + "learning_rate": 3.6708720682831825e-07, + "loss": 0.407, + "step": 49204 + }, + { + "epoch": 0.9135127709107651, + "grad_norm": 0.6829783320426941, + "learning_rate": 3.6677411815247975e-07, + "loss": 0.5872, + "step": 49206 + }, + { + "epoch": 0.9135499010481838, + "grad_norm": 0.5518897175788879, + "learning_rate": 3.664611605559887e-07, + "loss": 0.1766, + "step": 49208 + }, + { + "epoch": 0.9135870311856025, + "grad_norm": 0.2539489269256592, + "learning_rate": 3.661483340431071e-07, + "loss": 0.2981, + "step": 49210 + }, + { + "epoch": 0.913624161323021, + "grad_norm": 0.26490354537963867, + "learning_rate": 3.658356386180895e-07, + "loss": 0.1054, + "step": 49212 + }, + { + "epoch": 0.9136612914604397, + "grad_norm": 0.3780193328857422, + "learning_rate": 3.6552307428519005e-07, + "loss": 0.1017, + "step": 49214 + }, + { + "epoch": 0.9136984215978583, + "grad_norm": 0.43187615275382996, + "learning_rate": 3.6521064104866333e-07, + "loss": 0.1969, + "step": 49216 + }, + { + "epoch": 0.913735551735277, + "grad_norm": 0.35013946890830994, + "learning_rate": 3.648983389127614e-07, + "loss": 0.1331, + "step": 49218 + }, + { + "epoch": 0.9137726818726956, + "grad_norm": 0.2578405737876892, + "learning_rate": 3.6458616788173195e-07, + "loss": 0.1971, + "step": 49220 + }, + { + "epoch": 0.9138098120101142, + "grad_norm": 0.5299789309501648, + "learning_rate": 3.6427412795982275e-07, + "loss": 0.2827, + "step": 49222 + }, + { + "epoch": 0.9138469421475329, + "grad_norm": 0.3868010342121124, + "learning_rate": 3.639622191512804e-07, + "loss": 0.396, + "step": 49224 + }, + { + "epoch": 0.9138840722849515, + "grad_norm": 0.22832293808460236, + "learning_rate": 3.6365044146034924e-07, + "loss": 0.3655, + "step": 49226 + }, + { + "epoch": 0.9139212024223702, + "grad_norm": 0.36799561977386475, + "learning_rate": 3.6333879489127034e-07, + "loss": 0.2704, + "step": 49228 + }, + { + "epoch": 0.9139583325597888, + "grad_norm": 0.36178672313690186, + "learning_rate": 3.6302727944828696e-07, + "loss": 0.2905, + "step": 49230 + }, + { + "epoch": 0.9139954626972074, + "grad_norm": 0.7449304461479187, + "learning_rate": 3.6271589513563575e-07, + "loss": 0.29, + "step": 49232 + }, + { + "epoch": 0.9140325928346261, + "grad_norm": 0.27457907795906067, + "learning_rate": 3.624046419575555e-07, + "loss": 0.2618, + "step": 49234 + }, + { + "epoch": 0.9140697229720447, + "grad_norm": 0.4801813066005707, + "learning_rate": 3.6209351991827847e-07, + "loss": 0.2464, + "step": 49236 + }, + { + "epoch": 0.9141068531094634, + "grad_norm": 0.39341965317726135, + "learning_rate": 3.617825290220389e-07, + "loss": 0.0933, + "step": 49238 + }, + { + "epoch": 0.914143983246882, + "grad_norm": 0.5951845645904541, + "learning_rate": 3.6147166927307023e-07, + "loss": 0.3771, + "step": 49240 + }, + { + "epoch": 0.9141811133843006, + "grad_norm": 0.2941252291202545, + "learning_rate": 3.6116094067560116e-07, + "loss": 0.339, + "step": 49242 + }, + { + "epoch": 0.9142182435217193, + "grad_norm": 0.505480170249939, + "learning_rate": 3.608503432338617e-07, + "loss": 0.2142, + "step": 49244 + }, + { + "epoch": 0.9142553736591379, + "grad_norm": 0.37368538975715637, + "learning_rate": 3.6053987695207514e-07, + "loss": 0.2307, + "step": 49246 + }, + { + "epoch": 0.9142925037965566, + "grad_norm": 0.5377260446548462, + "learning_rate": 3.60229541834467e-07, + "loss": 0.2101, + "step": 49248 + }, + { + "epoch": 0.9143296339339751, + "grad_norm": 0.22763806581497192, + "learning_rate": 3.599193378852606e-07, + "loss": 0.1591, + "step": 49250 + }, + { + "epoch": 0.9143667640713938, + "grad_norm": 0.42656072974205017, + "learning_rate": 3.59609265108678e-07, + "loss": 0.3388, + "step": 49252 + }, + { + "epoch": 0.9144038942088125, + "grad_norm": 1.0524674654006958, + "learning_rate": 3.592993235089348e-07, + "loss": 0.3559, + "step": 49254 + }, + { + "epoch": 0.9144410243462311, + "grad_norm": 0.3886829614639282, + "learning_rate": 3.589895130902532e-07, + "loss": 0.4413, + "step": 49256 + }, + { + "epoch": 0.9144781544836498, + "grad_norm": 0.45653024315834045, + "learning_rate": 3.5867983385684425e-07, + "loss": 0.2516, + "step": 49258 + }, + { + "epoch": 0.9145152846210683, + "grad_norm": 0.3318355977535248, + "learning_rate": 3.5837028581292343e-07, + "loss": 0.1334, + "step": 49260 + }, + { + "epoch": 0.914552414758487, + "grad_norm": 0.9061023592948914, + "learning_rate": 3.5806086896270296e-07, + "loss": 0.3228, + "step": 49262 + }, + { + "epoch": 0.9145895448959057, + "grad_norm": 0.3987681269645691, + "learning_rate": 3.5775158331039396e-07, + "loss": 0.4032, + "step": 49264 + }, + { + "epoch": 0.9146266750333243, + "grad_norm": 0.3664870858192444, + "learning_rate": 3.5744242886020296e-07, + "loss": 0.1708, + "step": 49266 + }, + { + "epoch": 0.914663805170743, + "grad_norm": 0.35846784710884094, + "learning_rate": 3.571334056163378e-07, + "loss": 0.2191, + "step": 49268 + }, + { + "epoch": 0.9147009353081615, + "grad_norm": 0.3139619529247284, + "learning_rate": 3.5682451358300505e-07, + "loss": 0.2191, + "step": 49270 + }, + { + "epoch": 0.9147380654455802, + "grad_norm": 0.46333786845207214, + "learning_rate": 3.5651575276440474e-07, + "loss": 0.2307, + "step": 49272 + }, + { + "epoch": 0.9147751955829988, + "grad_norm": 0.6021817326545715, + "learning_rate": 3.5620712316474016e-07, + "loss": 0.2243, + "step": 49274 + }, + { + "epoch": 0.9148123257204175, + "grad_norm": 0.3311045467853546, + "learning_rate": 3.5589862478820904e-07, + "loss": 0.2095, + "step": 49276 + }, + { + "epoch": 0.9148494558578362, + "grad_norm": 0.29638537764549255, + "learning_rate": 3.5559025763901135e-07, + "loss": 0.2053, + "step": 49278 + }, + { + "epoch": 0.9148865859952547, + "grad_norm": 0.28549426794052124, + "learning_rate": 3.552820217213404e-07, + "loss": 0.1115, + "step": 49280 + }, + { + "epoch": 0.9149237161326734, + "grad_norm": 0.4278050661087036, + "learning_rate": 3.54973917039394e-07, + "loss": 0.1262, + "step": 49282 + }, + { + "epoch": 0.914960846270092, + "grad_norm": 0.47603076696395874, + "learning_rate": 3.5466594359736097e-07, + "loss": 0.2412, + "step": 49284 + }, + { + "epoch": 0.9149979764075107, + "grad_norm": 0.3592650294303894, + "learning_rate": 3.5435810139943236e-07, + "loss": 0.1617, + "step": 49286 + }, + { + "epoch": 0.9150351065449294, + "grad_norm": 0.37813347578048706, + "learning_rate": 3.540503904497994e-07, + "loss": 0.2273, + "step": 49288 + }, + { + "epoch": 0.9150722366823479, + "grad_norm": 0.40391597151756287, + "learning_rate": 3.537428107526464e-07, + "loss": 0.2648, + "step": 49290 + }, + { + "epoch": 0.9151093668197666, + "grad_norm": 0.46821025013923645, + "learning_rate": 3.5343536231216116e-07, + "loss": 0.2661, + "step": 49292 + }, + { + "epoch": 0.9151464969571852, + "grad_norm": 0.3504076600074768, + "learning_rate": 3.5312804513252586e-07, + "loss": 0.1743, + "step": 49294 + }, + { + "epoch": 0.9151836270946039, + "grad_norm": 0.45604148507118225, + "learning_rate": 3.5282085921792273e-07, + "loss": 0.2164, + "step": 49296 + }, + { + "epoch": 0.9152207572320226, + "grad_norm": 0.41255414485931396, + "learning_rate": 3.525138045725296e-07, + "loss": 0.4561, + "step": 49298 + }, + { + "epoch": 0.9152578873694411, + "grad_norm": 0.35116198658943176, + "learning_rate": 3.522068812005264e-07, + "loss": 0.2102, + "step": 49300 + }, + { + "epoch": 0.9152950175068598, + "grad_norm": 0.630753755569458, + "learning_rate": 3.5190008910608863e-07, + "loss": 0.2681, + "step": 49302 + }, + { + "epoch": 0.9153321476442784, + "grad_norm": 0.5424085855484009, + "learning_rate": 3.515934282933919e-07, + "loss": 0.4646, + "step": 49304 + }, + { + "epoch": 0.9153692777816971, + "grad_norm": 0.18050970137119293, + "learning_rate": 3.512868987666096e-07, + "loss": 0.1358, + "step": 49306 + }, + { + "epoch": 0.9154064079191158, + "grad_norm": 0.3194495737552643, + "learning_rate": 3.509805005299094e-07, + "loss": 0.2835, + "step": 49308 + }, + { + "epoch": 0.9154435380565343, + "grad_norm": 0.43440771102905273, + "learning_rate": 3.5067423358746246e-07, + "loss": 0.4712, + "step": 49310 + }, + { + "epoch": 0.915480668193953, + "grad_norm": 0.2993130683898926, + "learning_rate": 3.5036809794343653e-07, + "loss": 0.2495, + "step": 49312 + }, + { + "epoch": 0.9155177983313716, + "grad_norm": 0.25007620453834534, + "learning_rate": 3.500620936019961e-07, + "loss": 0.2258, + "step": 49314 + }, + { + "epoch": 0.9155549284687903, + "grad_norm": 0.22476312518119812, + "learning_rate": 3.4975622056730776e-07, + "loss": 0.2785, + "step": 49316 + }, + { + "epoch": 0.9155920586062088, + "grad_norm": 0.2292502373456955, + "learning_rate": 3.4945047884352934e-07, + "loss": 0.348, + "step": 49318 + }, + { + "epoch": 0.9156291887436275, + "grad_norm": 0.26134833693504333, + "learning_rate": 3.491448684348253e-07, + "loss": 0.226, + "step": 49320 + }, + { + "epoch": 0.9156663188810462, + "grad_norm": 0.40726497769355774, + "learning_rate": 3.488393893453501e-07, + "loss": 0.2901, + "step": 49322 + }, + { + "epoch": 0.9157034490184648, + "grad_norm": 0.42395174503326416, + "learning_rate": 3.4853404157926265e-07, + "loss": 0.2595, + "step": 49324 + }, + { + "epoch": 0.9157405791558835, + "grad_norm": 0.22526511549949646, + "learning_rate": 3.4822882514071733e-07, + "loss": 0.2162, + "step": 49326 + }, + { + "epoch": 0.915777709293302, + "grad_norm": 0.3842701017856598, + "learning_rate": 3.479237400338664e-07, + "loss": 0.3288, + "step": 49328 + }, + { + "epoch": 0.9158148394307207, + "grad_norm": 0.5759210586547852, + "learning_rate": 3.4761878626286326e-07, + "loss": 0.2733, + "step": 49330 + }, + { + "epoch": 0.9158519695681394, + "grad_norm": 0.3414568603038788, + "learning_rate": 3.473139638318568e-07, + "loss": 0.0647, + "step": 49332 + }, + { + "epoch": 0.915889099705558, + "grad_norm": 0.4282447397708893, + "learning_rate": 3.4700927274499363e-07, + "loss": 0.182, + "step": 49334 + }, + { + "epoch": 0.9159262298429767, + "grad_norm": 0.6249219179153442, + "learning_rate": 3.467047130064194e-07, + "loss": 0.2051, + "step": 49336 + }, + { + "epoch": 0.9159633599803952, + "grad_norm": 0.23117023706436157, + "learning_rate": 3.4640028462028075e-07, + "loss": 0.2968, + "step": 49338 + }, + { + "epoch": 0.9160004901178139, + "grad_norm": 0.6320258378982544, + "learning_rate": 3.460959875907166e-07, + "loss": 0.2675, + "step": 49340 + }, + { + "epoch": 0.9160376202552326, + "grad_norm": 0.29232069849967957, + "learning_rate": 3.457918219218692e-07, + "loss": 0.4168, + "step": 49342 + }, + { + "epoch": 0.9160747503926512, + "grad_norm": 0.3106766939163208, + "learning_rate": 3.454877876178797e-07, + "loss": 0.3143, + "step": 49344 + }, + { + "epoch": 0.9161118805300699, + "grad_norm": 0.1453436315059662, + "learning_rate": 3.451838846828803e-07, + "loss": 0.2135, + "step": 49346 + }, + { + "epoch": 0.9161490106674884, + "grad_norm": 0.4814920723438263, + "learning_rate": 3.4488011312101e-07, + "loss": 0.4099, + "step": 49348 + }, + { + "epoch": 0.9161861408049071, + "grad_norm": 1.1180670261383057, + "learning_rate": 3.4457647293639984e-07, + "loss": 0.3466, + "step": 49350 + }, + { + "epoch": 0.9162232709423258, + "grad_norm": 0.36522749066352844, + "learning_rate": 3.4427296413318323e-07, + "loss": 0.248, + "step": 49352 + }, + { + "epoch": 0.9162604010797444, + "grad_norm": 0.48361122608184814, + "learning_rate": 3.4396958671548906e-07, + "loss": 0.1167, + "step": 49354 + }, + { + "epoch": 0.9162975312171631, + "grad_norm": 0.2928771376609802, + "learning_rate": 3.436663406874463e-07, + "loss": 0.1727, + "step": 49356 + }, + { + "epoch": 0.9163346613545816, + "grad_norm": 0.42670348286628723, + "learning_rate": 3.433632260531805e-07, + "loss": 0.1911, + "step": 49358 + }, + { + "epoch": 0.9163717914920003, + "grad_norm": 0.3926699757575989, + "learning_rate": 3.4306024281681614e-07, + "loss": 0.316, + "step": 49360 + }, + { + "epoch": 0.916408921629419, + "grad_norm": 0.49050793051719666, + "learning_rate": 3.427573909824755e-07, + "loss": 0.1766, + "step": 49362 + }, + { + "epoch": 0.9164460517668376, + "grad_norm": 0.3577752709388733, + "learning_rate": 3.4245467055428084e-07, + "loss": 0.3496, + "step": 49364 + }, + { + "epoch": 0.9164831819042563, + "grad_norm": 0.43557503819465637, + "learning_rate": 3.4215208153634884e-07, + "loss": 0.2187, + "step": 49366 + }, + { + "epoch": 0.9165203120416748, + "grad_norm": 0.28730323910713196, + "learning_rate": 3.4184962393279955e-07, + "loss": 0.3753, + "step": 49368 + }, + { + "epoch": 0.9165574421790935, + "grad_norm": 0.30963724851608276, + "learning_rate": 3.415472977477474e-07, + "loss": 0.2793, + "step": 49370 + }, + { + "epoch": 0.9165945723165121, + "grad_norm": 0.4687226712703705, + "learning_rate": 3.4124510298530587e-07, + "loss": 0.2461, + "step": 49372 + }, + { + "epoch": 0.9166317024539308, + "grad_norm": 0.7027088403701782, + "learning_rate": 3.409430396495872e-07, + "loss": 0.1566, + "step": 49374 + }, + { + "epoch": 0.9166688325913495, + "grad_norm": 0.3863414525985718, + "learning_rate": 3.4064110774470137e-07, + "loss": 0.2191, + "step": 49376 + }, + { + "epoch": 0.916705962728768, + "grad_norm": 0.47203171253204346, + "learning_rate": 3.403393072747574e-07, + "loss": 0.1495, + "step": 49378 + }, + { + "epoch": 0.9167430928661867, + "grad_norm": 0.745822012424469, + "learning_rate": 3.400376382438608e-07, + "loss": 0.3816, + "step": 49380 + }, + { + "epoch": 0.9167802230036053, + "grad_norm": 0.3202285170555115, + "learning_rate": 3.397361006561184e-07, + "loss": 0.3679, + "step": 49382 + }, + { + "epoch": 0.916817353141024, + "grad_norm": 0.37226977944374084, + "learning_rate": 3.3943469451563125e-07, + "loss": 0.0318, + "step": 49384 + }, + { + "epoch": 0.9168544832784427, + "grad_norm": 0.36163854598999023, + "learning_rate": 3.391334198265006e-07, + "loss": 0.3257, + "step": 49386 + }, + { + "epoch": 0.9168916134158612, + "grad_norm": 0.4392906427383423, + "learning_rate": 3.3883227659282536e-07, + "loss": 0.2706, + "step": 49388 + }, + { + "epoch": 0.9169287435532799, + "grad_norm": 0.6020591259002686, + "learning_rate": 3.3853126481870555e-07, + "loss": 0.2558, + "step": 49390 + }, + { + "epoch": 0.9169658736906985, + "grad_norm": 0.4009425640106201, + "learning_rate": 3.3823038450823575e-07, + "loss": 0.4351, + "step": 49392 + }, + { + "epoch": 0.9170030038281172, + "grad_norm": 0.6857998967170715, + "learning_rate": 3.379296356655093e-07, + "loss": 0.16, + "step": 49394 + }, + { + "epoch": 0.9170401339655359, + "grad_norm": 0.49467557668685913, + "learning_rate": 3.376290182946207e-07, + "loss": 0.3361, + "step": 49396 + }, + { + "epoch": 0.9170772641029544, + "grad_norm": 0.48406773805618286, + "learning_rate": 3.373285323996578e-07, + "loss": 0.2949, + "step": 49398 + }, + { + "epoch": 0.9171143942403731, + "grad_norm": 0.39995500445365906, + "learning_rate": 3.3702817798471175e-07, + "loss": 0.2345, + "step": 49400 + }, + { + "epoch": 0.9171515243777917, + "grad_norm": 0.34932398796081543, + "learning_rate": 3.3672795505386824e-07, + "loss": 0.1812, + "step": 49402 + }, + { + "epoch": 0.9171886545152104, + "grad_norm": 0.422699511051178, + "learning_rate": 3.364278636112106e-07, + "loss": 0.133, + "step": 49404 + }, + { + "epoch": 0.917225784652629, + "grad_norm": 0.24442869424819946, + "learning_rate": 3.361279036608256e-07, + "loss": 0.4778, + "step": 49406 + }, + { + "epoch": 0.9172629147900476, + "grad_norm": 0.37181442975997925, + "learning_rate": 3.3582807520679326e-07, + "loss": 0.0535, + "step": 49408 + }, + { + "epoch": 0.9173000449274663, + "grad_norm": 0.4095766246318817, + "learning_rate": 3.3552837825319263e-07, + "loss": 0.7202, + "step": 49410 + }, + { + "epoch": 0.9173371750648849, + "grad_norm": 0.4311332106590271, + "learning_rate": 3.3522881280410145e-07, + "loss": 0.3445, + "step": 49412 + }, + { + "epoch": 0.9173743052023036, + "grad_norm": 0.43774715065956116, + "learning_rate": 3.3492937886359765e-07, + "loss": 0.3055, + "step": 49414 + }, + { + "epoch": 0.9174114353397222, + "grad_norm": 0.4392765164375305, + "learning_rate": 3.346300764357546e-07, + "loss": 0.2215, + "step": 49416 + }, + { + "epoch": 0.9174485654771408, + "grad_norm": 0.39213061332702637, + "learning_rate": 3.3433090552464464e-07, + "loss": 0.3119, + "step": 49418 + }, + { + "epoch": 0.9174856956145595, + "grad_norm": 0.3477953374385834, + "learning_rate": 3.3403186613434115e-07, + "loss": 0.3942, + "step": 49420 + }, + { + "epoch": 0.9175228257519781, + "grad_norm": 0.3387806713581085, + "learning_rate": 3.337329582689086e-07, + "loss": 0.4013, + "step": 49422 + }, + { + "epoch": 0.9175599558893968, + "grad_norm": 0.34520232677459717, + "learning_rate": 3.334341819324194e-07, + "loss": 0.3143, + "step": 49424 + }, + { + "epoch": 0.9175970860268153, + "grad_norm": 0.34256818890571594, + "learning_rate": 3.3313553712893354e-07, + "loss": 0.3453, + "step": 49426 + }, + { + "epoch": 0.917634216164234, + "grad_norm": 0.3753448724746704, + "learning_rate": 3.3283702386251783e-07, + "loss": 0.3723, + "step": 49428 + }, + { + "epoch": 0.9176713463016527, + "grad_norm": 0.6176601648330688, + "learning_rate": 3.325386421372345e-07, + "loss": 0.1915, + "step": 49430 + }, + { + "epoch": 0.9177084764390713, + "grad_norm": 0.3762880563735962, + "learning_rate": 3.3224039195714263e-07, + "loss": 0.1986, + "step": 49432 + }, + { + "epoch": 0.91774560657649, + "grad_norm": 0.35661986470222473, + "learning_rate": 3.3194227332630116e-07, + "loss": 0.2276, + "step": 49434 + }, + { + "epoch": 0.9177827367139085, + "grad_norm": 0.4317947328090668, + "learning_rate": 3.3164428624876564e-07, + "loss": 0.4489, + "step": 49436 + }, + { + "epoch": 0.9178198668513272, + "grad_norm": 0.7085149884223938, + "learning_rate": 3.313464307285907e-07, + "loss": 0.3009, + "step": 49438 + }, + { + "epoch": 0.9178569969887459, + "grad_norm": 0.462202787399292, + "learning_rate": 3.3104870676983093e-07, + "loss": 0.1943, + "step": 49440 + }, + { + "epoch": 0.9178941271261645, + "grad_norm": 0.4702221155166626, + "learning_rate": 3.307511143765363e-07, + "loss": 0.36, + "step": 49442 + }, + { + "epoch": 0.9179312572635832, + "grad_norm": 0.4293614327907562, + "learning_rate": 3.304536535527558e-07, + "loss": 0.282, + "step": 49444 + }, + { + "epoch": 0.9179683874010017, + "grad_norm": 0.34036514163017273, + "learning_rate": 3.301563243025385e-07, + "loss": 0.2922, + "step": 49446 + }, + { + "epoch": 0.9180055175384204, + "grad_norm": 0.3849351704120636, + "learning_rate": 3.298591266299278e-07, + "loss": 0.4261, + "step": 49448 + }, + { + "epoch": 0.9180426476758391, + "grad_norm": 0.47949573397636414, + "learning_rate": 3.2956206053896934e-07, + "loss": 0.3775, + "step": 49450 + }, + { + "epoch": 0.9180797778132577, + "grad_norm": 0.36911675333976746, + "learning_rate": 3.292651260337043e-07, + "loss": 0.191, + "step": 49452 + }, + { + "epoch": 0.9181169079506764, + "grad_norm": 0.32444387674331665, + "learning_rate": 3.289683231181739e-07, + "loss": 0.1477, + "step": 49454 + }, + { + "epoch": 0.9181540380880949, + "grad_norm": 0.58058100938797, + "learning_rate": 3.286716517964172e-07, + "loss": 0.3939, + "step": 49456 + }, + { + "epoch": 0.9181911682255136, + "grad_norm": 0.6614341735839844, + "learning_rate": 3.2837511207247096e-07, + "loss": 0.5406, + "step": 49458 + }, + { + "epoch": 0.9182282983629323, + "grad_norm": 0.344318151473999, + "learning_rate": 3.2807870395036855e-07, + "loss": 0.1165, + "step": 49460 + }, + { + "epoch": 0.9182654285003509, + "grad_norm": 0.46496352553367615, + "learning_rate": 3.2778242743414346e-07, + "loss": 0.2003, + "step": 49462 + }, + { + "epoch": 0.9183025586377696, + "grad_norm": 0.2755275070667267, + "learning_rate": 3.274862825278302e-07, + "loss": 0.3382, + "step": 49464 + }, + { + "epoch": 0.9183396887751881, + "grad_norm": 0.7228017449378967, + "learning_rate": 3.271902692354545e-07, + "loss": 0.0869, + "step": 49466 + }, + { + "epoch": 0.9183768189126068, + "grad_norm": 0.3955008089542389, + "learning_rate": 3.268943875610453e-07, + "loss": 0.2828, + "step": 49468 + }, + { + "epoch": 0.9184139490500254, + "grad_norm": 0.4280640780925751, + "learning_rate": 3.2659863750863053e-07, + "loss": 0.1533, + "step": 49470 + }, + { + "epoch": 0.9184510791874441, + "grad_norm": 0.3458564877510071, + "learning_rate": 3.263030190822325e-07, + "loss": 0.2175, + "step": 49472 + }, + { + "epoch": 0.9184882093248627, + "grad_norm": 0.2705119550228119, + "learning_rate": 3.2600753228587355e-07, + "loss": 0.251, + "step": 49474 + }, + { + "epoch": 0.9185253394622813, + "grad_norm": 0.31712132692337036, + "learning_rate": 3.2571217712357605e-07, + "loss": 0.3142, + "step": 49476 + }, + { + "epoch": 0.9185624695997, + "grad_norm": 0.19012252986431122, + "learning_rate": 3.254169535993579e-07, + "loss": 0.2775, + "step": 49478 + }, + { + "epoch": 0.9185995997371186, + "grad_norm": 0.3018691837787628, + "learning_rate": 3.251218617172358e-07, + "loss": 0.1122, + "step": 49480 + }, + { + "epoch": 0.9186367298745373, + "grad_norm": 0.4619494080543518, + "learning_rate": 3.2482690148122553e-07, + "loss": 0.3475, + "step": 49482 + }, + { + "epoch": 0.918673860011956, + "grad_norm": 0.25093284249305725, + "learning_rate": 3.245320728953416e-07, + "loss": 0.2918, + "step": 49484 + }, + { + "epoch": 0.9187109901493745, + "grad_norm": 0.38401415944099426, + "learning_rate": 3.2423737596359527e-07, + "loss": 0.2384, + "step": 49486 + }, + { + "epoch": 0.9187481202867932, + "grad_norm": 0.6986663937568665, + "learning_rate": 3.2394281068999447e-07, + "loss": 0.5075, + "step": 49488 + }, + { + "epoch": 0.9187852504242118, + "grad_norm": 0.7526283264160156, + "learning_rate": 3.2364837707854923e-07, + "loss": 0.3013, + "step": 49490 + }, + { + "epoch": 0.9188223805616305, + "grad_norm": 0.38385629653930664, + "learning_rate": 3.2335407513326534e-07, + "loss": 0.1869, + "step": 49492 + }, + { + "epoch": 0.9188595106990491, + "grad_norm": 0.2951247990131378, + "learning_rate": 3.230599048581473e-07, + "loss": 0.1548, + "step": 49494 + }, + { + "epoch": 0.9188966408364677, + "grad_norm": 0.45683056116104126, + "learning_rate": 3.2276586625719976e-07, + "loss": 0.3015, + "step": 49496 + }, + { + "epoch": 0.9189337709738864, + "grad_norm": 0.31408703327178955, + "learning_rate": 3.2247195933442053e-07, + "loss": 0.2771, + "step": 49498 + }, + { + "epoch": 0.918970901111305, + "grad_norm": 0.3921660780906677, + "learning_rate": 3.22178184093811e-07, + "loss": 0.4114, + "step": 49500 + }, + { + "epoch": 0.9190080312487237, + "grad_norm": 0.2388375848531723, + "learning_rate": 3.218845405393667e-07, + "loss": 0.268, + "step": 49502 + }, + { + "epoch": 0.9190451613861423, + "grad_norm": 0.5018376708030701, + "learning_rate": 3.215910286750856e-07, + "loss": 0.2191, + "step": 49504 + }, + { + "epoch": 0.9190822915235609, + "grad_norm": 0.401623398065567, + "learning_rate": 3.212976485049613e-07, + "loss": 0.1783, + "step": 49506 + }, + { + "epoch": 0.9191194216609796, + "grad_norm": 0.3418211042881012, + "learning_rate": 3.2100440003298373e-07, + "loss": 0.1708, + "step": 49508 + }, + { + "epoch": 0.9191565517983982, + "grad_norm": 0.4160067141056061, + "learning_rate": 3.2071128326314536e-07, + "loss": 0.2558, + "step": 49510 + }, + { + "epoch": 0.9191936819358169, + "grad_norm": 0.627045214176178, + "learning_rate": 3.2041829819943303e-07, + "loss": 0.2494, + "step": 49512 + }, + { + "epoch": 0.9192308120732355, + "grad_norm": 0.3711293935775757, + "learning_rate": 3.2012544484583353e-07, + "loss": 0.1625, + "step": 49514 + }, + { + "epoch": 0.9192679422106541, + "grad_norm": 0.2925248146057129, + "learning_rate": 3.198327232063325e-07, + "loss": 0.1464, + "step": 49516 + }, + { + "epoch": 0.9193050723480728, + "grad_norm": 0.3501226603984833, + "learning_rate": 3.195401332849124e-07, + "loss": 0.293, + "step": 49518 + }, + { + "epoch": 0.9193422024854914, + "grad_norm": 0.4803522229194641, + "learning_rate": 3.1924767508555444e-07, + "loss": 0.4118, + "step": 49520 + }, + { + "epoch": 0.91937933262291, + "grad_norm": 0.2744458317756653, + "learning_rate": 3.1895534861223987e-07, + "loss": 0.2021, + "step": 49522 + }, + { + "epoch": 0.9194164627603286, + "grad_norm": 0.32981640100479126, + "learning_rate": 3.1866315386894333e-07, + "loss": 0.2452, + "step": 49524 + }, + { + "epoch": 0.9194535928977473, + "grad_norm": 0.3410360515117645, + "learning_rate": 3.1837109085964267e-07, + "loss": 0.324, + "step": 49526 + }, + { + "epoch": 0.919490723035166, + "grad_norm": 0.325719952583313, + "learning_rate": 3.180791595883137e-07, + "loss": 0.1961, + "step": 49528 + }, + { + "epoch": 0.9195278531725846, + "grad_norm": 0.5396624207496643, + "learning_rate": 3.177873600589243e-07, + "loss": 0.34, + "step": 49530 + }, + { + "epoch": 0.9195649833100032, + "grad_norm": 0.41553807258605957, + "learning_rate": 3.17495692275448e-07, + "loss": 0.2987, + "step": 49532 + }, + { + "epoch": 0.9196021134474218, + "grad_norm": 0.3280593454837799, + "learning_rate": 3.1720415624185374e-07, + "loss": 0.3333, + "step": 49534 + }, + { + "epoch": 0.9196392435848405, + "grad_norm": 0.20500224828720093, + "learning_rate": 3.1691275196210626e-07, + "loss": 0.1586, + "step": 49536 + }, + { + "epoch": 0.9196763737222592, + "grad_norm": 0.34626907110214233, + "learning_rate": 3.166214794401712e-07, + "loss": 0.2307, + "step": 49538 + }, + { + "epoch": 0.9197135038596778, + "grad_norm": 0.4983569383621216, + "learning_rate": 3.163303386800143e-07, + "loss": 0.3378, + "step": 49540 + }, + { + "epoch": 0.9197506339970964, + "grad_norm": 0.29788559675216675, + "learning_rate": 3.1603932968559457e-07, + "loss": 0.1536, + "step": 49542 + }, + { + "epoch": 0.919787764134515, + "grad_norm": 0.4228670299053192, + "learning_rate": 3.157484524608734e-07, + "loss": 0.2612, + "step": 49544 + }, + { + "epoch": 0.9198248942719337, + "grad_norm": 0.26527491211891174, + "learning_rate": 3.154577070098086e-07, + "loss": 0.2616, + "step": 49546 + }, + { + "epoch": 0.9198620244093524, + "grad_norm": 0.410087913274765, + "learning_rate": 3.1516709333635485e-07, + "loss": 0.2019, + "step": 49548 + }, + { + "epoch": 0.919899154546771, + "grad_norm": 0.23622356355190277, + "learning_rate": 3.14876611444469e-07, + "loss": 0.2428, + "step": 49550 + }, + { + "epoch": 0.9199362846841896, + "grad_norm": 0.42468416690826416, + "learning_rate": 3.145862613381012e-07, + "loss": 0.1768, + "step": 49552 + }, + { + "epoch": 0.9199734148216082, + "grad_norm": 0.39423689246177673, + "learning_rate": 3.1429604302120276e-07, + "loss": 0.2702, + "step": 49554 + }, + { + "epoch": 0.9200105449590269, + "grad_norm": 0.4662618637084961, + "learning_rate": 3.140059564977227e-07, + "loss": 0.0762, + "step": 49556 + }, + { + "epoch": 0.9200476750964456, + "grad_norm": 0.3346233367919922, + "learning_rate": 3.137160017716101e-07, + "loss": 0.6277, + "step": 49558 + }, + { + "epoch": 0.9200848052338642, + "grad_norm": 0.32841598987579346, + "learning_rate": 3.1342617884680846e-07, + "loss": 0.0838, + "step": 49560 + }, + { + "epoch": 0.9201219353712828, + "grad_norm": 0.35583242774009705, + "learning_rate": 3.1313648772726135e-07, + "loss": 0.3734, + "step": 49562 + }, + { + "epoch": 0.9201590655087014, + "grad_norm": 0.3716599941253662, + "learning_rate": 3.128469284169111e-07, + "loss": 0.4392, + "step": 49564 + }, + { + "epoch": 0.9201961956461201, + "grad_norm": 0.5347161293029785, + "learning_rate": 3.125575009196979e-07, + "loss": 0.2932, + "step": 49566 + }, + { + "epoch": 0.9202333257835388, + "grad_norm": 0.4140762984752655, + "learning_rate": 3.122682052395609e-07, + "loss": 0.2566, + "step": 49568 + }, + { + "epoch": 0.9202704559209574, + "grad_norm": 0.8641968369483948, + "learning_rate": 3.1197904138043466e-07, + "loss": 0.252, + "step": 49570 + }, + { + "epoch": 0.920307586058376, + "grad_norm": 0.46658557653427124, + "learning_rate": 3.1169000934625604e-07, + "loss": 0.2141, + "step": 49572 + }, + { + "epoch": 0.9203447161957946, + "grad_norm": 0.5028789043426514, + "learning_rate": 3.114011091409552e-07, + "loss": 0.21, + "step": 49574 + }, + { + "epoch": 0.9203818463332133, + "grad_norm": 0.5960252285003662, + "learning_rate": 3.1111234076846466e-07, + "loss": 0.3549, + "step": 49576 + }, + { + "epoch": 0.9204189764706319, + "grad_norm": 0.41230836510658264, + "learning_rate": 3.1082370423271337e-07, + "loss": 0.3736, + "step": 49578 + }, + { + "epoch": 0.9204561066080506, + "grad_norm": 0.4283335506916046, + "learning_rate": 3.1053519953762825e-07, + "loss": 0.2163, + "step": 49580 + }, + { + "epoch": 0.9204932367454692, + "grad_norm": 0.40746942162513733, + "learning_rate": 3.1024682668713615e-07, + "loss": 0.3066, + "step": 49582 + }, + { + "epoch": 0.9205303668828878, + "grad_norm": 0.6251316666603088, + "learning_rate": 3.099585856851628e-07, + "loss": 0.2551, + "step": 49584 + }, + { + "epoch": 0.9205674970203065, + "grad_norm": 0.4499858021736145, + "learning_rate": 3.0967047653562623e-07, + "loss": 0.1648, + "step": 49586 + }, + { + "epoch": 0.9206046271577251, + "grad_norm": 0.4417949616909027, + "learning_rate": 3.0938249924244766e-07, + "loss": 0.3586, + "step": 49588 + }, + { + "epoch": 0.9206417572951437, + "grad_norm": 0.43329888582229614, + "learning_rate": 3.090946538095474e-07, + "loss": 0.1925, + "step": 49590 + }, + { + "epoch": 0.9206788874325624, + "grad_norm": 0.43943285942077637, + "learning_rate": 3.088069402408422e-07, + "loss": 0.2741, + "step": 49592 + }, + { + "epoch": 0.920716017569981, + "grad_norm": 0.47027555108070374, + "learning_rate": 3.085193585402457e-07, + "loss": 0.2995, + "step": 49594 + }, + { + "epoch": 0.9207531477073997, + "grad_norm": 0.5543423891067505, + "learning_rate": 3.082319087116714e-07, + "loss": 0.48, + "step": 49596 + }, + { + "epoch": 0.9207902778448183, + "grad_norm": 0.3214503824710846, + "learning_rate": 3.079445907590306e-07, + "loss": 0.2166, + "step": 49598 + }, + { + "epoch": 0.920827407982237, + "grad_norm": 0.5501466989517212, + "learning_rate": 3.076574046862324e-07, + "loss": 0.1658, + "step": 49600 + }, + { + "epoch": 0.9208645381196556, + "grad_norm": 0.352893203496933, + "learning_rate": 3.0737035049718476e-07, + "loss": 0.5363, + "step": 49602 + }, + { + "epoch": 0.9209016682570742, + "grad_norm": 0.5082792639732361, + "learning_rate": 3.0708342819579353e-07, + "loss": 0.4783, + "step": 49604 + }, + { + "epoch": 0.9209387983944929, + "grad_norm": 0.24410173296928406, + "learning_rate": 3.0679663778596325e-07, + "loss": 0.3532, + "step": 49606 + }, + { + "epoch": 0.9209759285319115, + "grad_norm": 0.4608539044857025, + "learning_rate": 3.0650997927159644e-07, + "loss": 0.2335, + "step": 49608 + }, + { + "epoch": 0.9210130586693301, + "grad_norm": 0.5262891054153442, + "learning_rate": 3.0622345265659546e-07, + "loss": 0.3605, + "step": 49610 + }, + { + "epoch": 0.9210501888067488, + "grad_norm": 0.44862616062164307, + "learning_rate": 3.0593705794485505e-07, + "loss": 0.3602, + "step": 49612 + }, + { + "epoch": 0.9210873189441674, + "grad_norm": 0.17595121264457703, + "learning_rate": 3.056507951402754e-07, + "loss": 0.2601, + "step": 49614 + }, + { + "epoch": 0.9211244490815861, + "grad_norm": 0.5183852910995483, + "learning_rate": 3.053646642467489e-07, + "loss": 0.4258, + "step": 49616 + }, + { + "epoch": 0.9211615792190047, + "grad_norm": 0.21415559947490692, + "learning_rate": 3.0507866526817143e-07, + "loss": 0.1418, + "step": 49618 + }, + { + "epoch": 0.9211987093564233, + "grad_norm": 0.45647209882736206, + "learning_rate": 3.047927982084331e-07, + "loss": 0.2802, + "step": 49620 + }, + { + "epoch": 0.9212358394938419, + "grad_norm": 0.445491224527359, + "learning_rate": 3.045070630714253e-07, + "loss": 0.4242, + "step": 49622 + }, + { + "epoch": 0.9212729696312606, + "grad_norm": 0.2769622206687927, + "learning_rate": 3.042214598610349e-07, + "loss": 0.1816, + "step": 49624 + }, + { + "epoch": 0.9213100997686793, + "grad_norm": 0.5608027577400208, + "learning_rate": 3.039359885811477e-07, + "loss": 0.1933, + "step": 49626 + }, + { + "epoch": 0.9213472299060979, + "grad_norm": 0.2970820963382721, + "learning_rate": 3.0365064923564835e-07, + "loss": 0.259, + "step": 49628 + }, + { + "epoch": 0.9213843600435165, + "grad_norm": 0.24148432910442352, + "learning_rate": 3.033654418284193e-07, + "loss": 0.3813, + "step": 49630 + }, + { + "epoch": 0.9214214901809351, + "grad_norm": 0.3515041172504425, + "learning_rate": 3.0308036636334303e-07, + "loss": 0.1049, + "step": 49632 + }, + { + "epoch": 0.9214586203183538, + "grad_norm": 0.2566092610359192, + "learning_rate": 3.027954228442975e-07, + "loss": 0.3213, + "step": 49634 + }, + { + "epoch": 0.9214957504557725, + "grad_norm": 0.4311188757419586, + "learning_rate": 3.025106112751597e-07, + "loss": 0.2825, + "step": 49636 + }, + { + "epoch": 0.921532880593191, + "grad_norm": 0.4219937026500702, + "learning_rate": 3.022259316598042e-07, + "loss": 0.2366, + "step": 49638 + }, + { + "epoch": 0.9215700107306097, + "grad_norm": 0.31171107292175293, + "learning_rate": 3.0194138400210573e-07, + "loss": 0.1902, + "step": 49640 + }, + { + "epoch": 0.9216071408680283, + "grad_norm": 0.4017420709133148, + "learning_rate": 3.016569683059356e-07, + "loss": 0.3359, + "step": 49642 + }, + { + "epoch": 0.921644271005447, + "grad_norm": 0.5019659399986267, + "learning_rate": 3.0137268457516523e-07, + "loss": 0.4369, + "step": 49644 + }, + { + "epoch": 0.9216814011428657, + "grad_norm": 0.6282856464385986, + "learning_rate": 3.0108853281366035e-07, + "loss": 0.3621, + "step": 49646 + }, + { + "epoch": 0.9217185312802842, + "grad_norm": 0.46868380904197693, + "learning_rate": 3.0080451302529125e-07, + "loss": 0.2782, + "step": 49648 + }, + { + "epoch": 0.9217556614177029, + "grad_norm": 0.28179624676704407, + "learning_rate": 3.005206252139192e-07, + "loss": 0.248, + "step": 49650 + }, + { + "epoch": 0.9217927915551215, + "grad_norm": 0.2756509780883789, + "learning_rate": 3.002368693834079e-07, + "loss": 0.2894, + "step": 49652 + }, + { + "epoch": 0.9218299216925402, + "grad_norm": 0.4053601622581482, + "learning_rate": 2.9995324553761753e-07, + "loss": 0.3389, + "step": 49654 + }, + { + "epoch": 0.9218670518299589, + "grad_norm": 0.43468454480171204, + "learning_rate": 2.996697536804105e-07, + "loss": 0.3651, + "step": 49656 + }, + { + "epoch": 0.9219041819673774, + "grad_norm": 0.26193007826805115, + "learning_rate": 2.993863938156416e-07, + "loss": 0.4597, + "step": 49658 + }, + { + "epoch": 0.9219413121047961, + "grad_norm": 0.3843808174133301, + "learning_rate": 2.991031659471677e-07, + "loss": 0.2793, + "step": 49660 + }, + { + "epoch": 0.9219784422422147, + "grad_norm": 0.4789116680622101, + "learning_rate": 2.9882007007884126e-07, + "loss": 0.2201, + "step": 49662 + }, + { + "epoch": 0.9220155723796334, + "grad_norm": 0.490196168422699, + "learning_rate": 2.985371062145159e-07, + "loss": 0.1971, + "step": 49664 + }, + { + "epoch": 0.9220527025170521, + "grad_norm": 0.24228043854236603, + "learning_rate": 2.982542743580408e-07, + "loss": 0.1156, + "step": 49666 + }, + { + "epoch": 0.9220898326544706, + "grad_norm": 0.5311570167541504, + "learning_rate": 2.9797157451326497e-07, + "loss": 0.3389, + "step": 49668 + }, + { + "epoch": 0.9221269627918893, + "grad_norm": 0.45341333746910095, + "learning_rate": 2.9768900668403433e-07, + "loss": 0.1584, + "step": 49670 + }, + { + "epoch": 0.9221640929293079, + "grad_norm": 0.34992337226867676, + "learning_rate": 2.9740657087419577e-07, + "loss": 0.31, + "step": 49672 + }, + { + "epoch": 0.9222012230667266, + "grad_norm": 0.3912426233291626, + "learning_rate": 2.971242670875907e-07, + "loss": 0.2488, + "step": 49674 + }, + { + "epoch": 0.9222383532041452, + "grad_norm": 0.31488868594169617, + "learning_rate": 2.968420953280604e-07, + "loss": 0.2811, + "step": 49676 + }, + { + "epoch": 0.9222754833415638, + "grad_norm": 0.5603411197662354, + "learning_rate": 2.965600555994463e-07, + "loss": 0.3825, + "step": 49678 + }, + { + "epoch": 0.9223126134789825, + "grad_norm": 0.3272095024585724, + "learning_rate": 2.962781479055843e-07, + "loss": 0.2843, + "step": 49680 + }, + { + "epoch": 0.9223497436164011, + "grad_norm": 0.3244319260120392, + "learning_rate": 2.9599637225030895e-07, + "loss": 0.1981, + "step": 49682 + }, + { + "epoch": 0.9223868737538198, + "grad_norm": 0.4519628882408142, + "learning_rate": 2.957147286374573e-07, + "loss": 0.4298, + "step": 49684 + }, + { + "epoch": 0.9224240038912384, + "grad_norm": 0.5805622935295105, + "learning_rate": 2.954332170708618e-07, + "loss": 0.2623, + "step": 49686 + }, + { + "epoch": 0.922461134028657, + "grad_norm": 0.3464388847351074, + "learning_rate": 2.951518375543494e-07, + "loss": 0.1971, + "step": 49688 + }, + { + "epoch": 0.9224982641660757, + "grad_norm": 0.5249294638633728, + "learning_rate": 2.948705900917526e-07, + "loss": 0.2506, + "step": 49690 + }, + { + "epoch": 0.9225353943034943, + "grad_norm": 0.39281004667282104, + "learning_rate": 2.94589474686896e-07, + "loss": 0.3173, + "step": 49692 + }, + { + "epoch": 0.922572524440913, + "grad_norm": 0.49507346749305725, + "learning_rate": 2.943084913436056e-07, + "loss": 0.2218, + "step": 49694 + }, + { + "epoch": 0.9226096545783316, + "grad_norm": 0.43025633692741394, + "learning_rate": 2.9402764006570605e-07, + "loss": 0.1729, + "step": 49696 + }, + { + "epoch": 0.9226467847157502, + "grad_norm": 0.34313881397247314, + "learning_rate": 2.937469208570154e-07, + "loss": 0.3711, + "step": 49698 + }, + { + "epoch": 0.9226839148531689, + "grad_norm": 0.44230666756629944, + "learning_rate": 2.9346633372135833e-07, + "loss": 0.3091, + "step": 49700 + }, + { + "epoch": 0.9227210449905875, + "grad_norm": 0.34169286489486694, + "learning_rate": 2.9318587866254745e-07, + "loss": 0.3142, + "step": 49702 + }, + { + "epoch": 0.9227581751280062, + "grad_norm": 0.639430820941925, + "learning_rate": 2.9290555568440296e-07, + "loss": 0.2785, + "step": 49704 + }, + { + "epoch": 0.9227953052654247, + "grad_norm": 0.3331637382507324, + "learning_rate": 2.9262536479073624e-07, + "loss": 0.1922, + "step": 49706 + }, + { + "epoch": 0.9228324354028434, + "grad_norm": 0.4130639433860779, + "learning_rate": 2.92345305985362e-07, + "loss": 0.1663, + "step": 49708 + }, + { + "epoch": 0.9228695655402621, + "grad_norm": 0.2497435212135315, + "learning_rate": 2.9206537927209177e-07, + "loss": 0.2688, + "step": 49710 + }, + { + "epoch": 0.9229066956776807, + "grad_norm": 0.44751647114753723, + "learning_rate": 2.917855846547313e-07, + "loss": 0.1734, + "step": 49712 + }, + { + "epoch": 0.9229438258150994, + "grad_norm": 0.24870429933071136, + "learning_rate": 2.915059221370908e-07, + "loss": 0.2867, + "step": 49714 + }, + { + "epoch": 0.922980955952518, + "grad_norm": 0.38127273321151733, + "learning_rate": 2.9122639172297293e-07, + "loss": 0.3449, + "step": 49716 + }, + { + "epoch": 0.9230180860899366, + "grad_norm": 0.33446675539016724, + "learning_rate": 2.909469934161846e-07, + "loss": 0.2816, + "step": 49718 + }, + { + "epoch": 0.9230552162273553, + "grad_norm": 0.4531771242618561, + "learning_rate": 2.9066772722052493e-07, + "loss": 0.2424, + "step": 49720 + }, + { + "epoch": 0.9230923463647739, + "grad_norm": 0.36100202798843384, + "learning_rate": 2.903885931397943e-07, + "loss": 0.189, + "step": 49722 + }, + { + "epoch": 0.9231294765021926, + "grad_norm": 0.4066493511199951, + "learning_rate": 2.901095911777918e-07, + "loss": 0.2697, + "step": 49724 + }, + { + "epoch": 0.9231666066396111, + "grad_norm": 0.34657007455825806, + "learning_rate": 2.8983072133831223e-07, + "loss": 0.1586, + "step": 49726 + }, + { + "epoch": 0.9232037367770298, + "grad_norm": 0.4429459571838379, + "learning_rate": 2.895519836251526e-07, + "loss": 0.2624, + "step": 49728 + }, + { + "epoch": 0.9232408669144484, + "grad_norm": 0.8523456454277039, + "learning_rate": 2.892733780421031e-07, + "loss": 0.4041, + "step": 49730 + }, + { + "epoch": 0.9232779970518671, + "grad_norm": 0.3585398495197296, + "learning_rate": 2.889949045929563e-07, + "loss": 0.4686, + "step": 49732 + }, + { + "epoch": 0.9233151271892858, + "grad_norm": 0.4353378117084503, + "learning_rate": 2.887165632815003e-07, + "loss": 0.2502, + "step": 49734 + }, + { + "epoch": 0.9233522573267043, + "grad_norm": 0.4477533996105194, + "learning_rate": 2.884383541115254e-07, + "loss": 0.244, + "step": 49736 + }, + { + "epoch": 0.923389387464123, + "grad_norm": 0.36425039172172546, + "learning_rate": 2.8816027708681305e-07, + "loss": 0.1456, + "step": 49738 + }, + { + "epoch": 0.9234265176015416, + "grad_norm": 0.517238974571228, + "learning_rate": 2.8788233221115123e-07, + "loss": 0.3529, + "step": 49740 + }, + { + "epoch": 0.9234636477389603, + "grad_norm": 0.49075785279273987, + "learning_rate": 2.87604519488317e-07, + "loss": 0.1766, + "step": 49742 + }, + { + "epoch": 0.923500777876379, + "grad_norm": 0.4419189691543579, + "learning_rate": 2.8732683892209514e-07, + "loss": 0.4288, + "step": 49744 + }, + { + "epoch": 0.9235379080137975, + "grad_norm": 0.3829626739025116, + "learning_rate": 2.8704929051626027e-07, + "loss": 0.4717, + "step": 49746 + }, + { + "epoch": 0.9235750381512162, + "grad_norm": 0.523746132850647, + "learning_rate": 2.867718742745929e-07, + "loss": 0.2716, + "step": 49748 + }, + { + "epoch": 0.9236121682886348, + "grad_norm": 0.3787669539451599, + "learning_rate": 2.8649459020086533e-07, + "loss": 0.2094, + "step": 49750 + }, + { + "epoch": 0.9236492984260535, + "grad_norm": 0.5381085872650146, + "learning_rate": 2.8621743829885027e-07, + "loss": 0.273, + "step": 49752 + }, + { + "epoch": 0.9236864285634722, + "grad_norm": 0.2545231580734253, + "learning_rate": 2.8594041857232027e-07, + "loss": 0.1999, + "step": 49754 + }, + { + "epoch": 0.9237235587008907, + "grad_norm": 0.8038944602012634, + "learning_rate": 2.856635310250433e-07, + "loss": 0.2483, + "step": 49756 + }, + { + "epoch": 0.9237606888383094, + "grad_norm": 0.5130699276924133, + "learning_rate": 2.8538677566078866e-07, + "loss": 0.2003, + "step": 49758 + }, + { + "epoch": 0.923797818975728, + "grad_norm": 0.37654849886894226, + "learning_rate": 2.851101524833222e-07, + "loss": 0.2984, + "step": 49760 + }, + { + "epoch": 0.9238349491131467, + "grad_norm": 0.22724920511245728, + "learning_rate": 2.8483366149640645e-07, + "loss": 0.239, + "step": 49762 + }, + { + "epoch": 0.9238720792505654, + "grad_norm": 0.33984535932540894, + "learning_rate": 2.845573027038051e-07, + "loss": 0.115, + "step": 49764 + }, + { + "epoch": 0.9239092093879839, + "grad_norm": 0.25486499071121216, + "learning_rate": 2.8428107610927624e-07, + "loss": 0.1202, + "step": 49766 + }, + { + "epoch": 0.9239463395254026, + "grad_norm": 0.22435320913791656, + "learning_rate": 2.840049817165802e-07, + "loss": 0.1506, + "step": 49768 + }, + { + "epoch": 0.9239834696628212, + "grad_norm": 0.4320261478424072, + "learning_rate": 2.8372901952947394e-07, + "loss": 0.327, + "step": 49770 + }, + { + "epoch": 0.9240205998002399, + "grad_norm": 0.3730677664279938, + "learning_rate": 2.834531895517112e-07, + "loss": 0.2633, + "step": 49772 + }, + { + "epoch": 0.9240577299376584, + "grad_norm": 0.3902011513710022, + "learning_rate": 2.831774917870478e-07, + "loss": 0.3242, + "step": 49774 + }, + { + "epoch": 0.9240948600750771, + "grad_norm": 0.37957963347435, + "learning_rate": 2.8290192623923294e-07, + "loss": 0.3149, + "step": 49776 + }, + { + "epoch": 0.9241319902124958, + "grad_norm": 0.49002882838249207, + "learning_rate": 2.8262649291201596e-07, + "loss": 0.219, + "step": 49778 + }, + { + "epoch": 0.9241691203499144, + "grad_norm": 0.6142514944076538, + "learning_rate": 2.823511918091459e-07, + "loss": 0.2665, + "step": 49780 + }, + { + "epoch": 0.9242062504873331, + "grad_norm": 0.4994458258152008, + "learning_rate": 2.820760229343689e-07, + "loss": 0.2954, + "step": 49782 + }, + { + "epoch": 0.9242433806247516, + "grad_norm": 0.5009868144989014, + "learning_rate": 2.818009862914284e-07, + "loss": 0.2169, + "step": 49784 + }, + { + "epoch": 0.9242805107621703, + "grad_norm": 0.2878788709640503, + "learning_rate": 2.8152608188406817e-07, + "loss": 0.2727, + "step": 49786 + }, + { + "epoch": 0.924317640899589, + "grad_norm": 0.456452876329422, + "learning_rate": 2.812513097160263e-07, + "loss": 0.4672, + "step": 49788 + }, + { + "epoch": 0.9243547710370076, + "grad_norm": 0.3580019474029541, + "learning_rate": 2.8097666979104323e-07, + "loss": 0.3862, + "step": 49790 + }, + { + "epoch": 0.9243919011744263, + "grad_norm": 0.5655384063720703, + "learning_rate": 2.8070216211285584e-07, + "loss": 0.3802, + "step": 49792 + }, + { + "epoch": 0.9244290313118448, + "grad_norm": 0.7612162828445435, + "learning_rate": 2.80427786685199e-07, + "loss": 0.3959, + "step": 49794 + }, + { + "epoch": 0.9244661614492635, + "grad_norm": 0.37837541103363037, + "learning_rate": 2.8015354351180634e-07, + "loss": 0.3358, + "step": 49796 + }, + { + "epoch": 0.9245032915866822, + "grad_norm": 0.39777854084968567, + "learning_rate": 2.798794325964116e-07, + "loss": 0.2923, + "step": 49798 + }, + { + "epoch": 0.9245404217241008, + "grad_norm": 0.29153814911842346, + "learning_rate": 2.7960545394274176e-07, + "loss": 0.2295, + "step": 49800 + }, + { + "epoch": 0.9245775518615195, + "grad_norm": 0.4093559682369232, + "learning_rate": 2.7933160755452495e-07, + "loss": 0.1795, + "step": 49802 + }, + { + "epoch": 0.924614681998938, + "grad_norm": 0.5435904264450073, + "learning_rate": 2.790578934354893e-07, + "loss": 0.3073, + "step": 49804 + }, + { + "epoch": 0.9246518121363567, + "grad_norm": 0.6149637699127197, + "learning_rate": 2.7878431158935735e-07, + "loss": 0.2266, + "step": 49806 + }, + { + "epoch": 0.9246889422737754, + "grad_norm": 0.269695520401001, + "learning_rate": 2.785108620198518e-07, + "loss": 0.2975, + "step": 49808 + }, + { + "epoch": 0.924726072411194, + "grad_norm": 0.3480795919895172, + "learning_rate": 2.782375447306951e-07, + "loss": 0.3703, + "step": 49810 + }, + { + "epoch": 0.9247632025486127, + "grad_norm": 0.26122185587882996, + "learning_rate": 2.7796435972560545e-07, + "loss": 0.3722, + "step": 49812 + }, + { + "epoch": 0.9248003326860312, + "grad_norm": 0.5064325332641602, + "learning_rate": 2.7769130700829983e-07, + "loss": 0.3964, + "step": 49814 + }, + { + "epoch": 0.9248374628234499, + "grad_norm": 0.334913045167923, + "learning_rate": 2.774183865824942e-07, + "loss": 0.21, + "step": 49816 + }, + { + "epoch": 0.9248745929608686, + "grad_norm": 0.25168871879577637, + "learning_rate": 2.7714559845190117e-07, + "loss": 0.1169, + "step": 49818 + }, + { + "epoch": 0.9249117230982872, + "grad_norm": 0.3868754804134369, + "learning_rate": 2.7687294262023325e-07, + "loss": 0.2624, + "step": 49820 + }, + { + "epoch": 0.9249488532357059, + "grad_norm": 0.290458083152771, + "learning_rate": 2.7660041909119973e-07, + "loss": 0.3303, + "step": 49822 + }, + { + "epoch": 0.9249859833731244, + "grad_norm": 0.2924361526966095, + "learning_rate": 2.76328027868511e-07, + "loss": 0.1211, + "step": 49824 + }, + { + "epoch": 0.9250231135105431, + "grad_norm": 0.3289566934108734, + "learning_rate": 2.760557689558729e-07, + "loss": 0.2383, + "step": 49826 + }, + { + "epoch": 0.9250602436479617, + "grad_norm": 0.3992975056171417, + "learning_rate": 2.757836423569871e-07, + "loss": 0.3275, + "step": 49828 + }, + { + "epoch": 0.9250973737853804, + "grad_norm": 0.39561134576797485, + "learning_rate": 2.755116480755593e-07, + "loss": 0.1999, + "step": 49830 + }, + { + "epoch": 0.9251345039227991, + "grad_norm": 0.3623635768890381, + "learning_rate": 2.7523978611529e-07, + "loss": 0.339, + "step": 49832 + }, + { + "epoch": 0.9251716340602176, + "grad_norm": 0.4861413538455963, + "learning_rate": 2.7496805647987733e-07, + "loss": 0.1024, + "step": 49834 + }, + { + "epoch": 0.9252087641976363, + "grad_norm": 0.4445790648460388, + "learning_rate": 2.7469645917302057e-07, + "loss": 0.2267, + "step": 49836 + }, + { + "epoch": 0.9252458943350549, + "grad_norm": 0.3816763162612915, + "learning_rate": 2.744249941984145e-07, + "loss": 0.2856, + "step": 49838 + }, + { + "epoch": 0.9252830244724736, + "grad_norm": 0.3544054627418518, + "learning_rate": 2.7415366155975177e-07, + "loss": 0.6093, + "step": 49840 + }, + { + "epoch": 0.9253201546098923, + "grad_norm": 0.23181086778640747, + "learning_rate": 2.7388246126072494e-07, + "loss": 0.2593, + "step": 49842 + }, + { + "epoch": 0.9253572847473108, + "grad_norm": 0.5426713824272156, + "learning_rate": 2.736113933050255e-07, + "loss": 0.3157, + "step": 49844 + }, + { + "epoch": 0.9253944148847295, + "grad_norm": 0.4052729904651642, + "learning_rate": 2.7334045769634277e-07, + "loss": 0.4713, + "step": 49846 + }, + { + "epoch": 0.9254315450221481, + "grad_norm": 0.3568899631500244, + "learning_rate": 2.730696544383593e-07, + "loss": 0.2084, + "step": 49848 + }, + { + "epoch": 0.9254686751595668, + "grad_norm": 0.4612375497817993, + "learning_rate": 2.7279898353476443e-07, + "loss": 0.1696, + "step": 49850 + }, + { + "epoch": 0.9255058052969855, + "grad_norm": 0.48294755816459656, + "learning_rate": 2.725284449892374e-07, + "loss": 0.2931, + "step": 49852 + }, + { + "epoch": 0.925542935434404, + "grad_norm": 0.5002955794334412, + "learning_rate": 2.7225803880546074e-07, + "loss": 0.3502, + "step": 49854 + }, + { + "epoch": 0.9255800655718227, + "grad_norm": 0.309332013130188, + "learning_rate": 2.7198776498711497e-07, + "loss": 0.1719, + "step": 49856 + }, + { + "epoch": 0.9256171957092413, + "grad_norm": 0.34766149520874023, + "learning_rate": 2.717176235378771e-07, + "loss": 0.3784, + "step": 49858 + }, + { + "epoch": 0.92565432584666, + "grad_norm": 0.42977020144462585, + "learning_rate": 2.71447614461422e-07, + "loss": 0.1214, + "step": 49860 + }, + { + "epoch": 0.9256914559840786, + "grad_norm": 0.31497064232826233, + "learning_rate": 2.711777377614255e-07, + "loss": 0.1847, + "step": 49862 + }, + { + "epoch": 0.9257285861214972, + "grad_norm": 0.30456656217575073, + "learning_rate": 2.709079934415582e-07, + "loss": 0.3104, + "step": 49864 + }, + { + "epoch": 0.9257657162589159, + "grad_norm": 0.25535306334495544, + "learning_rate": 2.7063838150549137e-07, + "loss": 0.1593, + "step": 49866 + }, + { + "epoch": 0.9258028463963345, + "grad_norm": 0.2112947702407837, + "learning_rate": 2.703689019568945e-07, + "loss": 0.2687, + "step": 49868 + }, + { + "epoch": 0.9258399765337532, + "grad_norm": 0.34325939416885376, + "learning_rate": 2.7009955479943116e-07, + "loss": 0.1482, + "step": 49870 + }, + { + "epoch": 0.9258771066711717, + "grad_norm": 0.4030865728855133, + "learning_rate": 2.698303400367697e-07, + "loss": 0.2087, + "step": 49872 + }, + { + "epoch": 0.9259142368085904, + "grad_norm": 0.5946475267410278, + "learning_rate": 2.6956125767257147e-07, + "loss": 0.2744, + "step": 49874 + }, + { + "epoch": 0.9259513669460091, + "grad_norm": 0.4652072787284851, + "learning_rate": 2.692923077104992e-07, + "loss": 0.2395, + "step": 49876 + }, + { + "epoch": 0.9259884970834277, + "grad_norm": 0.30950677394866943, + "learning_rate": 2.6902349015421105e-07, + "loss": 0.2437, + "step": 49878 + }, + { + "epoch": 0.9260256272208464, + "grad_norm": 0.41209298372268677, + "learning_rate": 2.687548050073652e-07, + "loss": 0.2616, + "step": 49880 + }, + { + "epoch": 0.9260627573582649, + "grad_norm": 0.24194031953811646, + "learning_rate": 2.684862522736187e-07, + "loss": 0.36, + "step": 49882 + }, + { + "epoch": 0.9260998874956836, + "grad_norm": 0.376455694437027, + "learning_rate": 2.682178319566242e-07, + "loss": 0.2208, + "step": 49884 + }, + { + "epoch": 0.9261370176331023, + "grad_norm": 0.6508611440658569, + "learning_rate": 2.679495440600366e-07, + "loss": 0.251, + "step": 49886 + }, + { + "epoch": 0.9261741477705209, + "grad_norm": 0.32797205448150635, + "learning_rate": 2.6768138858750404e-07, + "loss": 0.1273, + "step": 49888 + }, + { + "epoch": 0.9262112779079396, + "grad_norm": 0.31941789388656616, + "learning_rate": 2.674133655426769e-07, + "loss": 0.4069, + "step": 49890 + }, + { + "epoch": 0.9262484080453581, + "grad_norm": 0.3808702826499939, + "learning_rate": 2.6714547492920017e-07, + "loss": 0.3239, + "step": 49892 + }, + { + "epoch": 0.9262855381827768, + "grad_norm": 0.35558995604515076, + "learning_rate": 2.6687771675071973e-07, + "loss": 0.2458, + "step": 49894 + }, + { + "epoch": 0.9263226683201955, + "grad_norm": 0.5020599365234375, + "learning_rate": 2.6661009101088044e-07, + "loss": 0.2597, + "step": 49896 + }, + { + "epoch": 0.9263597984576141, + "grad_norm": 0.4423373341560364, + "learning_rate": 2.663425977133227e-07, + "loss": 0.29, + "step": 49898 + }, + { + "epoch": 0.9263969285950328, + "grad_norm": 0.24018560349941254, + "learning_rate": 2.6607523686168703e-07, + "loss": 0.1913, + "step": 49900 + }, + { + "epoch": 0.9264340587324513, + "grad_norm": 0.35924842953681946, + "learning_rate": 2.658080084596104e-07, + "loss": 0.2498, + "step": 49902 + }, + { + "epoch": 0.92647118886987, + "grad_norm": 0.2985548675060272, + "learning_rate": 2.655409125107289e-07, + "loss": 0.0891, + "step": 49904 + }, + { + "epoch": 0.9265083190072887, + "grad_norm": 0.38645586371421814, + "learning_rate": 2.652739490186784e-07, + "loss": 0.3805, + "step": 49906 + }, + { + "epoch": 0.9265454491447073, + "grad_norm": 0.26703301072120667, + "learning_rate": 2.650071179870917e-07, + "loss": 0.1515, + "step": 49908 + }, + { + "epoch": 0.926582579282126, + "grad_norm": 0.2515755891799927, + "learning_rate": 2.6474041941959684e-07, + "loss": 0.1603, + "step": 49910 + }, + { + "epoch": 0.9266197094195445, + "grad_norm": 0.44288450479507446, + "learning_rate": 2.6447385331982544e-07, + "loss": 0.3095, + "step": 49912 + }, + { + "epoch": 0.9266568395569632, + "grad_norm": 0.41956478357315063, + "learning_rate": 2.642074196914024e-07, + "loss": 0.1451, + "step": 49914 + }, + { + "epoch": 0.9266939696943819, + "grad_norm": 0.3735790252685547, + "learning_rate": 2.639411185379537e-07, + "loss": 0.3441, + "step": 49916 + }, + { + "epoch": 0.9267310998318005, + "grad_norm": 0.6341722011566162, + "learning_rate": 2.636749498631053e-07, + "loss": 0.3768, + "step": 49918 + }, + { + "epoch": 0.9267682299692191, + "grad_norm": 0.5440880656242371, + "learning_rate": 2.6340891367047536e-07, + "loss": 0.2471, + "step": 49920 + }, + { + "epoch": 0.9268053601066377, + "grad_norm": 0.3225160241127014, + "learning_rate": 2.631430099636867e-07, + "loss": 0.2332, + "step": 49922 + }, + { + "epoch": 0.9268424902440564, + "grad_norm": 0.44770368933677673, + "learning_rate": 2.628772387463574e-07, + "loss": 0.1694, + "step": 49924 + }, + { + "epoch": 0.926879620381475, + "grad_norm": 0.5047432780265808, + "learning_rate": 2.626116000221013e-07, + "loss": 0.4647, + "step": 49926 + }, + { + "epoch": 0.9269167505188937, + "grad_norm": 0.36197927594184875, + "learning_rate": 2.6234609379453547e-07, + "loss": 0.1238, + "step": 49928 + }, + { + "epoch": 0.9269538806563123, + "grad_norm": 0.22939473390579224, + "learning_rate": 2.6208072006727034e-07, + "loss": 0.194, + "step": 49930 + }, + { + "epoch": 0.9269910107937309, + "grad_norm": 0.39572539925575256, + "learning_rate": 2.618154788439198e-07, + "loss": 0.2093, + "step": 49932 + }, + { + "epoch": 0.9270281409311496, + "grad_norm": 0.38085970282554626, + "learning_rate": 2.6155037012809084e-07, + "loss": 0.1272, + "step": 49934 + }, + { + "epoch": 0.9270652710685682, + "grad_norm": 0.6949074864387512, + "learning_rate": 2.612853939233906e-07, + "loss": 0.3489, + "step": 49936 + }, + { + "epoch": 0.9271024012059869, + "grad_norm": 0.45535701513290405, + "learning_rate": 2.610205502334262e-07, + "loss": 0.1939, + "step": 49938 + }, + { + "epoch": 0.9271395313434055, + "grad_norm": 0.31363987922668457, + "learning_rate": 2.607558390618004e-07, + "loss": 0.2817, + "step": 49940 + }, + { + "epoch": 0.9271766614808241, + "grad_norm": 0.519798755645752, + "learning_rate": 2.604912604121146e-07, + "loss": 0.3837, + "step": 49942 + }, + { + "epoch": 0.9272137916182428, + "grad_norm": 0.4134761691093445, + "learning_rate": 2.6022681428796937e-07, + "loss": 0.2475, + "step": 49944 + }, + { + "epoch": 0.9272509217556614, + "grad_norm": 0.4680531322956085, + "learning_rate": 2.599625006929629e-07, + "loss": 0.2266, + "step": 49946 + }, + { + "epoch": 0.9272880518930801, + "grad_norm": 0.5666810274124146, + "learning_rate": 2.596983196306924e-07, + "loss": 0.3188, + "step": 49948 + }, + { + "epoch": 0.9273251820304987, + "grad_norm": 0.24031789600849152, + "learning_rate": 2.594342711047537e-07, + "loss": 0.2657, + "step": 49950 + }, + { + "epoch": 0.9273623121679173, + "grad_norm": 0.31892114877700806, + "learning_rate": 2.591703551187363e-07, + "loss": 0.3445, + "step": 49952 + }, + { + "epoch": 0.927399442305336, + "grad_norm": 0.3310941457748413, + "learning_rate": 2.5890657167623514e-07, + "loss": 0.2466, + "step": 49954 + }, + { + "epoch": 0.9274365724427546, + "grad_norm": 0.36471033096313477, + "learning_rate": 2.586429207808361e-07, + "loss": 0.3944, + "step": 49956 + }, + { + "epoch": 0.9274737025801733, + "grad_norm": 0.24163736402988434, + "learning_rate": 2.583794024361286e-07, + "loss": 0.2134, + "step": 49958 + }, + { + "epoch": 0.9275108327175919, + "grad_norm": 0.584182858467102, + "learning_rate": 2.5811601664569754e-07, + "loss": 0.1063, + "step": 49960 + }, + { + "epoch": 0.9275479628550105, + "grad_norm": 0.4147902727127075, + "learning_rate": 2.5785276341312784e-07, + "loss": 0.3217, + "step": 49962 + }, + { + "epoch": 0.9275850929924292, + "grad_norm": 0.3719520568847656, + "learning_rate": 2.5758964274199995e-07, + "loss": 0.1422, + "step": 49964 + }, + { + "epoch": 0.9276222231298478, + "grad_norm": 0.20314651727676392, + "learning_rate": 2.5732665463589545e-07, + "loss": 0.2562, + "step": 49966 + }, + { + "epoch": 0.9276593532672665, + "grad_norm": 0.5559768676757812, + "learning_rate": 2.5706379909839263e-07, + "loss": 0.2322, + "step": 49968 + }, + { + "epoch": 0.9276964834046851, + "grad_norm": 0.31284722685813904, + "learning_rate": 2.568010761330675e-07, + "loss": 0.1512, + "step": 49970 + }, + { + "epoch": 0.9277336135421037, + "grad_norm": 0.2678031325340271, + "learning_rate": 2.5653848574349715e-07, + "loss": 0.2352, + "step": 49972 + }, + { + "epoch": 0.9277707436795224, + "grad_norm": 0.33180689811706543, + "learning_rate": 2.56276027933251e-07, + "loss": 0.2884, + "step": 49974 + }, + { + "epoch": 0.927807873816941, + "grad_norm": 0.18834978342056274, + "learning_rate": 2.56013702705904e-07, + "loss": 0.2058, + "step": 49976 + }, + { + "epoch": 0.9278450039543596, + "grad_norm": 0.46009984612464905, + "learning_rate": 2.557515100650232e-07, + "loss": 0.2194, + "step": 49978 + }, + { + "epoch": 0.9278821340917782, + "grad_norm": 0.25312185287475586, + "learning_rate": 2.5548945001417693e-07, + "loss": 0.2198, + "step": 49980 + }, + { + "epoch": 0.9279192642291969, + "grad_norm": 0.3930930495262146, + "learning_rate": 2.5522752255693004e-07, + "loss": 0.2124, + "step": 49982 + }, + { + "epoch": 0.9279563943666156, + "grad_norm": 0.5646523237228394, + "learning_rate": 2.549657276968487e-07, + "loss": 0.1545, + "step": 49984 + }, + { + "epoch": 0.9279935245040342, + "grad_norm": 0.3822021186351776, + "learning_rate": 2.5470406543749326e-07, + "loss": 0.2593, + "step": 49986 + }, + { + "epoch": 0.9280306546414528, + "grad_norm": 0.5157206654548645, + "learning_rate": 2.544425357824265e-07, + "loss": 0.1803, + "step": 49988 + }, + { + "epoch": 0.9280677847788714, + "grad_norm": 0.4178747832775116, + "learning_rate": 2.5418113873520444e-07, + "loss": 0.1658, + "step": 49990 + }, + { + "epoch": 0.9281049149162901, + "grad_norm": 0.2807450294494629, + "learning_rate": 2.539198742993842e-07, + "loss": 0.1151, + "step": 49992 + }, + { + "epoch": 0.9281420450537088, + "grad_norm": 0.2861728072166443, + "learning_rate": 2.5365874247852306e-07, + "loss": 0.3368, + "step": 49994 + }, + { + "epoch": 0.9281791751911274, + "grad_norm": 0.4530234634876251, + "learning_rate": 2.533977432761725e-07, + "loss": 0.2256, + "step": 49996 + }, + { + "epoch": 0.928216305328546, + "grad_norm": 0.6383476257324219, + "learning_rate": 2.5313687669588304e-07, + "loss": 0.3119, + "step": 49998 + }, + { + "epoch": 0.9282534354659646, + "grad_norm": 0.3517311215400696, + "learning_rate": 2.528761427412074e-07, + "loss": 0.2339, + "step": 50000 + }, + { + "epoch": 0.9282905656033833, + "grad_norm": 0.5342261791229248, + "learning_rate": 2.5261554141568943e-07, + "loss": 0.1814, + "step": 50002 + }, + { + "epoch": 0.928327695740802, + "grad_norm": 0.46976590156555176, + "learning_rate": 2.523550727228785e-07, + "loss": 0.1672, + "step": 50004 + }, + { + "epoch": 0.9283648258782206, + "grad_norm": 0.3364795446395874, + "learning_rate": 2.5209473666631625e-07, + "loss": 0.261, + "step": 50006 + }, + { + "epoch": 0.9284019560156392, + "grad_norm": 0.3253527879714966, + "learning_rate": 2.518345332495464e-07, + "loss": 0.0617, + "step": 50008 + }, + { + "epoch": 0.9284390861530578, + "grad_norm": 0.4243648052215576, + "learning_rate": 2.515744624761096e-07, + "loss": 0.3977, + "step": 50010 + }, + { + "epoch": 0.9284762162904765, + "grad_norm": 0.5059959292411804, + "learning_rate": 2.513145243495452e-07, + "loss": 0.2348, + "step": 50012 + }, + { + "epoch": 0.9285133464278952, + "grad_norm": 0.23626206815242767, + "learning_rate": 2.5105471887338915e-07, + "loss": 0.1201, + "step": 50014 + }, + { + "epoch": 0.9285504765653138, + "grad_norm": 0.41124823689460754, + "learning_rate": 2.5079504605117765e-07, + "loss": 0.2356, + "step": 50016 + }, + { + "epoch": 0.9285876067027324, + "grad_norm": 0.4590713381767273, + "learning_rate": 2.505355058864423e-07, + "loss": 0.1258, + "step": 50018 + }, + { + "epoch": 0.928624736840151, + "grad_norm": 0.2713387608528137, + "learning_rate": 2.502760983827157e-07, + "loss": 0.265, + "step": 50020 + }, + { + "epoch": 0.9286618669775697, + "grad_norm": 0.4963763952255249, + "learning_rate": 2.500168235435274e-07, + "loss": 0.2598, + "step": 50022 + }, + { + "epoch": 0.9286989971149883, + "grad_norm": 0.2727981209754944, + "learning_rate": 2.4975768137240565e-07, + "loss": 0.2796, + "step": 50024 + }, + { + "epoch": 0.928736127252407, + "grad_norm": 0.5048683881759644, + "learning_rate": 2.4949867187287756e-07, + "loss": 0.2263, + "step": 50026 + }, + { + "epoch": 0.9287732573898256, + "grad_norm": 0.326703280210495, + "learning_rate": 2.49239795048466e-07, + "loss": 0.2829, + "step": 50028 + }, + { + "epoch": 0.9288103875272442, + "grad_norm": 0.3353484570980072, + "learning_rate": 2.4898105090269354e-07, + "loss": 0.2288, + "step": 50030 + }, + { + "epoch": 0.9288475176646629, + "grad_norm": 0.5045747756958008, + "learning_rate": 2.487224394390808e-07, + "loss": 0.4539, + "step": 50032 + }, + { + "epoch": 0.9288846478020815, + "grad_norm": 0.5396079421043396, + "learning_rate": 2.484639606611472e-07, + "loss": 0.2241, + "step": 50034 + }, + { + "epoch": 0.9289217779395001, + "grad_norm": 0.48862725496292114, + "learning_rate": 2.48205614572411e-07, + "loss": 0.2605, + "step": 50036 + }, + { + "epoch": 0.9289589080769188, + "grad_norm": 0.6225519180297852, + "learning_rate": 2.47947401176386e-07, + "loss": 0.1512, + "step": 50038 + }, + { + "epoch": 0.9289960382143374, + "grad_norm": 0.27551329135894775, + "learning_rate": 2.4768932047658625e-07, + "loss": 0.2087, + "step": 50040 + }, + { + "epoch": 0.9290331683517561, + "grad_norm": 0.3404499292373657, + "learning_rate": 2.4743137247652315e-07, + "loss": 0.2611, + "step": 50042 + }, + { + "epoch": 0.9290702984891747, + "grad_norm": 0.3554465174674988, + "learning_rate": 2.4717355717970624e-07, + "loss": 0.0476, + "step": 50044 + }, + { + "epoch": 0.9291074286265933, + "grad_norm": 0.29980018734931946, + "learning_rate": 2.4691587458964383e-07, + "loss": 0.1652, + "step": 50046 + }, + { + "epoch": 0.929144558764012, + "grad_norm": 0.3189743161201477, + "learning_rate": 2.4665832470984307e-07, + "loss": 0.2194, + "step": 50048 + }, + { + "epoch": 0.9291816889014306, + "grad_norm": 0.43185773491859436, + "learning_rate": 2.4640090754380785e-07, + "loss": 0.1238, + "step": 50050 + }, + { + "epoch": 0.9292188190388493, + "grad_norm": 0.35565221309661865, + "learning_rate": 2.4614362309504093e-07, + "loss": 0.108, + "step": 50052 + }, + { + "epoch": 0.9292559491762679, + "grad_norm": 0.4460527002811432, + "learning_rate": 2.458864713670428e-07, + "loss": 0.4787, + "step": 50054 + }, + { + "epoch": 0.9292930793136865, + "grad_norm": 0.5744699835777283, + "learning_rate": 2.456294523633129e-07, + "loss": 0.2548, + "step": 50056 + }, + { + "epoch": 0.9293302094511052, + "grad_norm": 0.5349975228309631, + "learning_rate": 2.453725660873496e-07, + "loss": 0.3363, + "step": 50058 + }, + { + "epoch": 0.9293673395885238, + "grad_norm": 0.3433665335178375, + "learning_rate": 2.451158125426456e-07, + "loss": 0.1984, + "step": 50060 + }, + { + "epoch": 0.9294044697259425, + "grad_norm": 0.33166879415512085, + "learning_rate": 2.448591917326959e-07, + "loss": 0.3093, + "step": 50062 + }, + { + "epoch": 0.9294415998633611, + "grad_norm": 0.2712383270263672, + "learning_rate": 2.4460270366099435e-07, + "loss": 0.2089, + "step": 50064 + }, + { + "epoch": 0.9294787300007797, + "grad_norm": 0.2754560708999634, + "learning_rate": 2.4434634833102823e-07, + "loss": 0.2261, + "step": 50066 + }, + { + "epoch": 0.9295158601381984, + "grad_norm": 0.418683797121048, + "learning_rate": 2.440901257462869e-07, + "loss": 0.2141, + "step": 50068 + }, + { + "epoch": 0.929552990275617, + "grad_norm": 0.3829540014266968, + "learning_rate": 2.4383403591025533e-07, + "loss": 0.1168, + "step": 50070 + }, + { + "epoch": 0.9295901204130357, + "grad_norm": 0.2854011356830597, + "learning_rate": 2.435780788264208e-07, + "loss": 0.172, + "step": 50072 + }, + { + "epoch": 0.9296272505504543, + "grad_norm": 0.38260915875434875, + "learning_rate": 2.4332225449826386e-07, + "loss": 0.1909, + "step": 50074 + }, + { + "epoch": 0.9296643806878729, + "grad_norm": 0.3819912374019623, + "learning_rate": 2.4306656292926835e-07, + "loss": 0.2626, + "step": 50076 + }, + { + "epoch": 0.9297015108252915, + "grad_norm": 0.5275866985321045, + "learning_rate": 2.428110041229104e-07, + "loss": 0.1463, + "step": 50078 + }, + { + "epoch": 0.9297386409627102, + "grad_norm": 0.2746739089488983, + "learning_rate": 2.4255557808266827e-07, + "loss": 0.2714, + "step": 50080 + }, + { + "epoch": 0.9297757711001289, + "grad_norm": 0.44585299491882324, + "learning_rate": 2.4230028481201816e-07, + "loss": 0.3139, + "step": 50082 + }, + { + "epoch": 0.9298129012375475, + "grad_norm": 0.6144194602966309, + "learning_rate": 2.4204512431443284e-07, + "loss": 0.1877, + "step": 50084 + }, + { + "epoch": 0.9298500313749661, + "grad_norm": 0.3143332600593567, + "learning_rate": 2.4179009659338504e-07, + "loss": 0.2184, + "step": 50086 + }, + { + "epoch": 0.9298871615123847, + "grad_norm": 0.47258126735687256, + "learning_rate": 2.415352016523453e-07, + "loss": 0.1942, + "step": 50088 + }, + { + "epoch": 0.9299242916498034, + "grad_norm": 0.5165525078773499, + "learning_rate": 2.4128043949478207e-07, + "loss": 0.3612, + "step": 50090 + }, + { + "epoch": 0.9299614217872221, + "grad_norm": 0.3326161801815033, + "learning_rate": 2.410258101241592e-07, + "loss": 0.3223, + "step": 50092 + }, + { + "epoch": 0.9299985519246406, + "grad_norm": 0.41989514231681824, + "learning_rate": 2.407713135439449e-07, + "loss": 0.4066, + "step": 50094 + }, + { + "epoch": 0.9300356820620593, + "grad_norm": 0.5700629949569702, + "learning_rate": 2.4051694975760097e-07, + "loss": 0.2163, + "step": 50096 + }, + { + "epoch": 0.9300728121994779, + "grad_norm": 0.30251359939575195, + "learning_rate": 2.4026271876858687e-07, + "loss": 0.2664, + "step": 50098 + }, + { + "epoch": 0.9301099423368966, + "grad_norm": 0.6367429494857788, + "learning_rate": 2.4000862058036534e-07, + "loss": 0.2941, + "step": 50100 + }, + { + "epoch": 0.9301470724743153, + "grad_norm": 0.36553218960762024, + "learning_rate": 2.3975465519639254e-07, + "loss": 0.2799, + "step": 50102 + }, + { + "epoch": 0.9301842026117338, + "grad_norm": 0.40600821375846863, + "learning_rate": 2.395008226201212e-07, + "loss": 0.2766, + "step": 50104 + }, + { + "epoch": 0.9302213327491525, + "grad_norm": 0.44274744391441345, + "learning_rate": 2.3924712285500863e-07, + "loss": 0.5092, + "step": 50106 + }, + { + "epoch": 0.9302584628865711, + "grad_norm": 0.3577493727207184, + "learning_rate": 2.3899355590450533e-07, + "loss": 0.3258, + "step": 50108 + }, + { + "epoch": 0.9302955930239898, + "grad_norm": 0.41966712474823, + "learning_rate": 2.3874012177206195e-07, + "loss": 0.1704, + "step": 50110 + }, + { + "epoch": 0.9303327231614085, + "grad_norm": 0.3496668338775635, + "learning_rate": 2.3848682046112793e-07, + "loss": 0.1664, + "step": 50112 + }, + { + "epoch": 0.930369853298827, + "grad_norm": 0.34141188859939575, + "learning_rate": 2.3823365197514937e-07, + "loss": 0.336, + "step": 50114 + }, + { + "epoch": 0.9304069834362457, + "grad_norm": 0.4654254913330078, + "learning_rate": 2.379806163175713e-07, + "loss": 0.283, + "step": 50116 + }, + { + "epoch": 0.9304441135736643, + "grad_norm": 0.5817670226097107, + "learning_rate": 2.3772771349183544e-07, + "loss": 0.3705, + "step": 50118 + }, + { + "epoch": 0.930481243711083, + "grad_norm": 0.32937487959861755, + "learning_rate": 2.3747494350138345e-07, + "loss": 0.1033, + "step": 50120 + }, + { + "epoch": 0.9305183738485017, + "grad_norm": 0.3401328921318054, + "learning_rate": 2.3722230634965703e-07, + "loss": 0.4134, + "step": 50122 + }, + { + "epoch": 0.9305555039859202, + "grad_norm": 0.2706407308578491, + "learning_rate": 2.3696980204009124e-07, + "loss": 0.3092, + "step": 50124 + }, + { + "epoch": 0.9305926341233389, + "grad_norm": 0.45439186692237854, + "learning_rate": 2.367174305761233e-07, + "loss": 0.1719, + "step": 50126 + }, + { + "epoch": 0.9306297642607575, + "grad_norm": 0.3255786895751953, + "learning_rate": 2.3646519196118601e-07, + "loss": 0.1929, + "step": 50128 + }, + { + "epoch": 0.9306668943981762, + "grad_norm": 0.39936649799346924, + "learning_rate": 2.362130861987122e-07, + "loss": 0.3221, + "step": 50130 + }, + { + "epoch": 0.9307040245355948, + "grad_norm": 0.22375917434692383, + "learning_rate": 2.3596111329213244e-07, + "loss": 0.4441, + "step": 50132 + }, + { + "epoch": 0.9307411546730134, + "grad_norm": 0.4194203019142151, + "learning_rate": 2.3570927324487513e-07, + "loss": 0.3366, + "step": 50134 + }, + { + "epoch": 0.9307782848104321, + "grad_norm": 0.35186129808425903, + "learning_rate": 2.354575660603664e-07, + "loss": 0.1242, + "step": 50136 + }, + { + "epoch": 0.9308154149478507, + "grad_norm": 0.8295299410820007, + "learning_rate": 2.3520599174203351e-07, + "loss": 0.2764, + "step": 50138 + }, + { + "epoch": 0.9308525450852694, + "grad_norm": 0.40202224254608154, + "learning_rate": 2.3495455029329817e-07, + "loss": 0.2701, + "step": 50140 + }, + { + "epoch": 0.930889675222688, + "grad_norm": 0.329073429107666, + "learning_rate": 2.3470324171758096e-07, + "loss": 0.1915, + "step": 50142 + }, + { + "epoch": 0.9309268053601066, + "grad_norm": 0.3940455913543701, + "learning_rate": 2.344520660183025e-07, + "loss": 0.1817, + "step": 50144 + }, + { + "epoch": 0.9309639354975253, + "grad_norm": 0.3769148290157318, + "learning_rate": 2.3420102319888005e-07, + "loss": 0.2202, + "step": 50146 + }, + { + "epoch": 0.9310010656349439, + "grad_norm": 0.3352530896663666, + "learning_rate": 2.3395011326272975e-07, + "loss": 0.0896, + "step": 50148 + }, + { + "epoch": 0.9310381957723626, + "grad_norm": 0.5480664372444153, + "learning_rate": 2.3369933621326557e-07, + "loss": 0.4502, + "step": 50150 + }, + { + "epoch": 0.9310753259097811, + "grad_norm": 0.48081859946250916, + "learning_rate": 2.3344869205390143e-07, + "loss": 0.3651, + "step": 50152 + }, + { + "epoch": 0.9311124560471998, + "grad_norm": 0.316737562417984, + "learning_rate": 2.3319818078804458e-07, + "loss": 0.1826, + "step": 50154 + }, + { + "epoch": 0.9311495861846185, + "grad_norm": 0.436523973941803, + "learning_rate": 2.3294780241910563e-07, + "loss": 0.1889, + "step": 50156 + }, + { + "epoch": 0.9311867163220371, + "grad_norm": 0.3270857632160187, + "learning_rate": 2.326975569504919e-07, + "loss": 0.2512, + "step": 50158 + }, + { + "epoch": 0.9312238464594558, + "grad_norm": 0.35123908519744873, + "learning_rate": 2.324474443856073e-07, + "loss": 0.0923, + "step": 50160 + }, + { + "epoch": 0.9312609765968743, + "grad_norm": 0.5527810454368591, + "learning_rate": 2.3219746472785688e-07, + "loss": 0.2976, + "step": 50162 + }, + { + "epoch": 0.931298106734293, + "grad_norm": 0.30166253447532654, + "learning_rate": 2.3194761798064013e-07, + "loss": 0.1078, + "step": 50164 + }, + { + "epoch": 0.9313352368717117, + "grad_norm": 0.42234131693840027, + "learning_rate": 2.316979041473588e-07, + "loss": 0.3371, + "step": 50166 + }, + { + "epoch": 0.9313723670091303, + "grad_norm": 0.480018675327301, + "learning_rate": 2.3144832323140909e-07, + "loss": 0.1519, + "step": 50168 + }, + { + "epoch": 0.931409497146549, + "grad_norm": 0.2924879193305969, + "learning_rate": 2.31198875236186e-07, + "loss": 0.2406, + "step": 50170 + }, + { + "epoch": 0.9314466272839675, + "grad_norm": 0.22740890085697174, + "learning_rate": 2.3094956016508686e-07, + "loss": 0.1872, + "step": 50172 + }, + { + "epoch": 0.9314837574213862, + "grad_norm": 0.3323245048522949, + "learning_rate": 2.3070037802150113e-07, + "loss": 0.2727, + "step": 50174 + }, + { + "epoch": 0.9315208875588048, + "grad_norm": 0.9086126685142517, + "learning_rate": 2.3045132880882282e-07, + "loss": 0.2833, + "step": 50176 + }, + { + "epoch": 0.9315580176962235, + "grad_norm": 0.3626006841659546, + "learning_rate": 2.3020241253043696e-07, + "loss": 0.3513, + "step": 50178 + }, + { + "epoch": 0.9315951478336422, + "grad_norm": 0.4872686564922333, + "learning_rate": 2.2995362918973308e-07, + "loss": 0.2393, + "step": 50180 + }, + { + "epoch": 0.9316322779710607, + "grad_norm": 0.42003950476646423, + "learning_rate": 2.2970497879009513e-07, + "loss": 0.2756, + "step": 50182 + }, + { + "epoch": 0.9316694081084794, + "grad_norm": 0.5618094205856323, + "learning_rate": 2.2945646133490708e-07, + "loss": 0.4697, + "step": 50184 + }, + { + "epoch": 0.931706538245898, + "grad_norm": 0.3172289729118347, + "learning_rate": 2.2920807682755175e-07, + "loss": 0.3055, + "step": 50186 + }, + { + "epoch": 0.9317436683833167, + "grad_norm": 0.3681546449661255, + "learning_rate": 2.289598252714065e-07, + "loss": 0.1397, + "step": 50188 + }, + { + "epoch": 0.9317807985207354, + "grad_norm": 0.3824702203273773, + "learning_rate": 2.2871170666985076e-07, + "loss": 0.4014, + "step": 50190 + }, + { + "epoch": 0.9318179286581539, + "grad_norm": 0.6514669060707092, + "learning_rate": 2.2846372102625968e-07, + "loss": 0.3967, + "step": 50192 + }, + { + "epoch": 0.9318550587955726, + "grad_norm": 0.3041110634803772, + "learning_rate": 2.2821586834400833e-07, + "loss": 0.3218, + "step": 50194 + }, + { + "epoch": 0.9318921889329912, + "grad_norm": 0.33173733949661255, + "learning_rate": 2.2796814862646954e-07, + "loss": 0.1871, + "step": 50196 + }, + { + "epoch": 0.9319293190704099, + "grad_norm": 0.2545624375343323, + "learning_rate": 2.2772056187701286e-07, + "loss": 0.2165, + "step": 50198 + }, + { + "epoch": 0.9319664492078286, + "grad_norm": 0.3800126016139984, + "learning_rate": 2.2747310809900892e-07, + "loss": 0.2508, + "step": 50200 + }, + { + "epoch": 0.9320035793452471, + "grad_norm": 0.47080495953559875, + "learning_rate": 2.272257872958239e-07, + "loss": 0.3466, + "step": 50202 + }, + { + "epoch": 0.9320407094826658, + "grad_norm": 0.3630296289920807, + "learning_rate": 2.269785994708218e-07, + "loss": 0.1049, + "step": 50204 + }, + { + "epoch": 0.9320778396200844, + "grad_norm": 0.3728260397911072, + "learning_rate": 2.267315446273688e-07, + "loss": 0.2794, + "step": 50206 + }, + { + "epoch": 0.9321149697575031, + "grad_norm": 0.2389519363641739, + "learning_rate": 2.2648462276882445e-07, + "loss": 0.3124, + "step": 50208 + }, + { + "epoch": 0.9321520998949218, + "grad_norm": 0.45861950516700745, + "learning_rate": 2.2623783389854935e-07, + "loss": 0.1979, + "step": 50210 + }, + { + "epoch": 0.9321892300323403, + "grad_norm": 0.9209721088409424, + "learning_rate": 2.25991178019902e-07, + "loss": 0.4842, + "step": 50212 + }, + { + "epoch": 0.932226360169759, + "grad_norm": 0.31795015931129456, + "learning_rate": 2.2574465513623856e-07, + "loss": 0.296, + "step": 50214 + }, + { + "epoch": 0.9322634903071776, + "grad_norm": 0.3507000505924225, + "learning_rate": 2.254982652509119e-07, + "loss": 0.2245, + "step": 50216 + }, + { + "epoch": 0.9323006204445963, + "grad_norm": 1.693692684173584, + "learning_rate": 2.2525200836727601e-07, + "loss": 0.245, + "step": 50218 + }, + { + "epoch": 0.932337750582015, + "grad_norm": 0.27702951431274414, + "learning_rate": 2.2500588448868155e-07, + "loss": 0.1549, + "step": 50220 + }, + { + "epoch": 0.9323748807194335, + "grad_norm": 0.3858413100242615, + "learning_rate": 2.2475989361847695e-07, + "loss": 0.3221, + "step": 50222 + }, + { + "epoch": 0.9324120108568522, + "grad_norm": 0.3768874704837799, + "learning_rate": 2.2451403576001062e-07, + "loss": 0.3847, + "step": 50224 + }, + { + "epoch": 0.9324491409942708, + "grad_norm": 0.36448800563812256, + "learning_rate": 2.242683109166277e-07, + "loss": 0.3161, + "step": 50226 + }, + { + "epoch": 0.9324862711316895, + "grad_norm": 0.3610519468784332, + "learning_rate": 2.2402271909167105e-07, + "loss": 0.3058, + "step": 50228 + }, + { + "epoch": 0.932523401269108, + "grad_norm": 0.36512672901153564, + "learning_rate": 2.2377726028848356e-07, + "loss": 0.2822, + "step": 50230 + }, + { + "epoch": 0.9325605314065267, + "grad_norm": 0.7213726043701172, + "learning_rate": 2.2353193451040368e-07, + "loss": 0.1611, + "step": 50232 + }, + { + "epoch": 0.9325976615439454, + "grad_norm": 0.3615325391292572, + "learning_rate": 2.232867417607698e-07, + "loss": 0.2929, + "step": 50234 + }, + { + "epoch": 0.932634791681364, + "grad_norm": 0.6579441428184509, + "learning_rate": 2.230416820429182e-07, + "loss": 0.1527, + "step": 50236 + }, + { + "epoch": 0.9326719218187827, + "grad_norm": 0.38399797677993774, + "learning_rate": 2.2279675536018508e-07, + "loss": 0.1879, + "step": 50238 + }, + { + "epoch": 0.9327090519562012, + "grad_norm": 0.2739553451538086, + "learning_rate": 2.225519617159022e-07, + "loss": 0.0232, + "step": 50240 + }, + { + "epoch": 0.9327461820936199, + "grad_norm": 0.4344296455383301, + "learning_rate": 2.2230730111340025e-07, + "loss": 0.3672, + "step": 50242 + }, + { + "epoch": 0.9327833122310386, + "grad_norm": 0.2888500392436981, + "learning_rate": 2.220627735560077e-07, + "loss": 0.2945, + "step": 50244 + }, + { + "epoch": 0.9328204423684572, + "grad_norm": 0.26970380544662476, + "learning_rate": 2.2181837904705184e-07, + "loss": 0.1132, + "step": 50246 + }, + { + "epoch": 0.9328575725058759, + "grad_norm": 0.3661527931690216, + "learning_rate": 2.2157411758986003e-07, + "loss": 0.2957, + "step": 50248 + }, + { + "epoch": 0.9328947026432944, + "grad_norm": 0.9967791438102722, + "learning_rate": 2.2132998918775406e-07, + "loss": 0.4025, + "step": 50250 + }, + { + "epoch": 0.9329318327807131, + "grad_norm": 0.2456521987915039, + "learning_rate": 2.2108599384405793e-07, + "loss": 0.3097, + "step": 50252 + }, + { + "epoch": 0.9329689629181318, + "grad_norm": 0.37923794984817505, + "learning_rate": 2.20842131562089e-07, + "loss": 0.1301, + "step": 50254 + }, + { + "epoch": 0.9330060930555504, + "grad_norm": 0.22194752097129822, + "learning_rate": 2.2059840234516572e-07, + "loss": 0.2078, + "step": 50256 + }, + { + "epoch": 0.9330432231929691, + "grad_norm": 0.3658318817615509, + "learning_rate": 2.2035480619660654e-07, + "loss": 0.1605, + "step": 50258 + }, + { + "epoch": 0.9330803533303876, + "grad_norm": 0.1737578958272934, + "learning_rate": 2.2011134311972438e-07, + "loss": 0.1847, + "step": 50260 + }, + { + "epoch": 0.9331174834678063, + "grad_norm": 0.621938943862915, + "learning_rate": 2.1986801311783323e-07, + "loss": 0.3555, + "step": 50262 + }, + { + "epoch": 0.933154613605225, + "grad_norm": 0.6269360780715942, + "learning_rate": 2.1962481619424268e-07, + "loss": 0.3036, + "step": 50264 + }, + { + "epoch": 0.9331917437426436, + "grad_norm": 1.773868441581726, + "learning_rate": 2.1938175235226456e-07, + "loss": 0.1978, + "step": 50266 + }, + { + "epoch": 0.9332288738800623, + "grad_norm": 0.9219141602516174, + "learning_rate": 2.1913882159520283e-07, + "loss": 0.3084, + "step": 50268 + }, + { + "epoch": 0.9332660040174808, + "grad_norm": 0.19909599423408508, + "learning_rate": 2.18896023926366e-07, + "loss": 0.1226, + "step": 50270 + }, + { + "epoch": 0.9333031341548995, + "grad_norm": 0.5098899006843567, + "learning_rate": 2.1865335934905584e-07, + "loss": 0.2954, + "step": 50272 + }, + { + "epoch": 0.9333402642923182, + "grad_norm": 0.3619163930416107, + "learning_rate": 2.184108278665753e-07, + "loss": 0.2906, + "step": 50274 + }, + { + "epoch": 0.9333773944297368, + "grad_norm": 0.32638904452323914, + "learning_rate": 2.1816842948222284e-07, + "loss": 0.2576, + "step": 50276 + }, + { + "epoch": 0.9334145245671555, + "grad_norm": 0.5350068211555481, + "learning_rate": 2.1792616419930024e-07, + "loss": 0.2234, + "step": 50278 + }, + { + "epoch": 0.933451654704574, + "grad_norm": 0.35978618264198303, + "learning_rate": 2.1768403202109934e-07, + "loss": 0.2361, + "step": 50280 + }, + { + "epoch": 0.9334887848419927, + "grad_norm": 0.24880273640155792, + "learning_rate": 2.1744203295091858e-07, + "loss": 0.297, + "step": 50282 + }, + { + "epoch": 0.9335259149794113, + "grad_norm": 0.19702662527561188, + "learning_rate": 2.172001669920498e-07, + "loss": 0.2972, + "step": 50284 + }, + { + "epoch": 0.93356304511683, + "grad_norm": 0.4363676607608795, + "learning_rate": 2.1695843414778262e-07, + "loss": 0.3394, + "step": 50286 + }, + { + "epoch": 0.9336001752542487, + "grad_norm": 0.6145041584968567, + "learning_rate": 2.167168344214088e-07, + "loss": 0.2762, + "step": 50288 + }, + { + "epoch": 0.9336373053916672, + "grad_norm": 0.48047691583633423, + "learning_rate": 2.164753678162146e-07, + "loss": 0.3471, + "step": 50290 + }, + { + "epoch": 0.9336744355290859, + "grad_norm": 0.4448888599872589, + "learning_rate": 2.1623403433548518e-07, + "loss": 0.3481, + "step": 50292 + }, + { + "epoch": 0.9337115656665045, + "grad_norm": 0.3121815621852875, + "learning_rate": 2.159928339825057e-07, + "loss": 0.3047, + "step": 50294 + }, + { + "epoch": 0.9337486958039232, + "grad_norm": 0.3360520899295807, + "learning_rate": 2.1575176676055688e-07, + "loss": 0.4464, + "step": 50296 + }, + { + "epoch": 0.9337858259413419, + "grad_norm": 0.4055921733379364, + "learning_rate": 2.1551083267291829e-07, + "loss": 0.2614, + "step": 50298 + }, + { + "epoch": 0.9338229560787604, + "grad_norm": 0.23660337924957275, + "learning_rate": 2.1527003172286954e-07, + "loss": 0.3921, + "step": 50300 + }, + { + "epoch": 0.9338600862161791, + "grad_norm": 0.30888649821281433, + "learning_rate": 2.150293639136869e-07, + "loss": 0.1719, + "step": 50302 + }, + { + "epoch": 0.9338972163535977, + "grad_norm": 0.42983853816986084, + "learning_rate": 2.1478882924864665e-07, + "loss": 0.3817, + "step": 50304 + }, + { + "epoch": 0.9339343464910164, + "grad_norm": 0.38036930561065674, + "learning_rate": 2.1454842773101946e-07, + "loss": 0.3468, + "step": 50306 + }, + { + "epoch": 0.933971476628435, + "grad_norm": 0.37760263681411743, + "learning_rate": 2.1430815936407722e-07, + "loss": 0.5105, + "step": 50308 + }, + { + "epoch": 0.9340086067658536, + "grad_norm": 0.4960090219974518, + "learning_rate": 2.140680241510884e-07, + "loss": 0.1866, + "step": 50310 + }, + { + "epoch": 0.9340457369032723, + "grad_norm": 0.4065636992454529, + "learning_rate": 2.138280220953237e-07, + "loss": 0.409, + "step": 50312 + }, + { + "epoch": 0.9340828670406909, + "grad_norm": 0.4962323009967804, + "learning_rate": 2.13588153200045e-07, + "loss": 0.4225, + "step": 50314 + }, + { + "epoch": 0.9341199971781096, + "grad_norm": 0.6142938733100891, + "learning_rate": 2.1334841746851964e-07, + "loss": 0.3435, + "step": 50316 + }, + { + "epoch": 0.9341571273155282, + "grad_norm": 0.22493356466293335, + "learning_rate": 2.1310881490400615e-07, + "loss": 0.4127, + "step": 50318 + }, + { + "epoch": 0.9341942574529468, + "grad_norm": 0.3735158443450928, + "learning_rate": 2.1286934550976635e-07, + "loss": 0.1864, + "step": 50320 + }, + { + "epoch": 0.9342313875903655, + "grad_norm": 0.4285488426685333, + "learning_rate": 2.1263000928905986e-07, + "loss": 0.4334, + "step": 50322 + }, + { + "epoch": 0.9342685177277841, + "grad_norm": 0.4024764597415924, + "learning_rate": 2.1239080624514187e-07, + "loss": 0.3155, + "step": 50324 + }, + { + "epoch": 0.9343056478652028, + "grad_norm": 0.5392751097679138, + "learning_rate": 2.121517363812675e-07, + "loss": 0.1325, + "step": 50326 + }, + { + "epoch": 0.9343427780026213, + "grad_norm": 0.46233445405960083, + "learning_rate": 2.119127997006909e-07, + "loss": 0.3244, + "step": 50328 + }, + { + "epoch": 0.93437990814004, + "grad_norm": 0.4473434090614319, + "learning_rate": 2.1167399620666162e-07, + "loss": 0.2035, + "step": 50330 + }, + { + "epoch": 0.9344170382774587, + "grad_norm": 0.5246615409851074, + "learning_rate": 2.1143532590243044e-07, + "loss": 0.4241, + "step": 50332 + }, + { + "epoch": 0.9344541684148773, + "grad_norm": 0.34634634852409363, + "learning_rate": 2.1119678879124472e-07, + "loss": 0.2863, + "step": 50334 + }, + { + "epoch": 0.934491298552296, + "grad_norm": 0.6041848063468933, + "learning_rate": 2.1095838487634856e-07, + "loss": 0.321, + "step": 50336 + }, + { + "epoch": 0.9345284286897145, + "grad_norm": 0.5718795657157898, + "learning_rate": 2.1072011416098715e-07, + "loss": 0.1721, + "step": 50338 + }, + { + "epoch": 0.9345655588271332, + "grad_norm": 0.47210752964019775, + "learning_rate": 2.1048197664840453e-07, + "loss": 0.2688, + "step": 50340 + }, + { + "epoch": 0.9346026889645519, + "grad_norm": 0.43718093633651733, + "learning_rate": 2.1024397234183702e-07, + "loss": 0.2051, + "step": 50342 + }, + { + "epoch": 0.9346398191019705, + "grad_norm": 0.3578401803970337, + "learning_rate": 2.1000610124452648e-07, + "loss": 0.2384, + "step": 50344 + }, + { + "epoch": 0.9346769492393892, + "grad_norm": 0.31479063630104065, + "learning_rate": 2.0976836335970807e-07, + "loss": 0.1628, + "step": 50346 + }, + { + "epoch": 0.9347140793768077, + "grad_norm": 0.5599633455276489, + "learning_rate": 2.09530758690617e-07, + "loss": 0.3048, + "step": 50348 + }, + { + "epoch": 0.9347512095142264, + "grad_norm": 0.3701847493648529, + "learning_rate": 2.0929328724048626e-07, + "loss": 0.1816, + "step": 50350 + }, + { + "epoch": 0.9347883396516451, + "grad_norm": 0.44199663400650024, + "learning_rate": 2.0905594901254767e-07, + "loss": 0.2723, + "step": 50352 + }, + { + "epoch": 0.9348254697890637, + "grad_norm": 0.3768458366394043, + "learning_rate": 2.08818744010032e-07, + "loss": 0.3407, + "step": 50354 + }, + { + "epoch": 0.9348625999264824, + "grad_norm": 0.364531546831131, + "learning_rate": 2.0858167223616444e-07, + "loss": 0.2374, + "step": 50356 + }, + { + "epoch": 0.9348997300639009, + "grad_norm": 0.45745018124580383, + "learning_rate": 2.083447336941702e-07, + "loss": 0.3061, + "step": 50358 + }, + { + "epoch": 0.9349368602013196, + "grad_norm": 0.3720918893814087, + "learning_rate": 2.0810792838727556e-07, + "loss": 0.2701, + "step": 50360 + }, + { + "epoch": 0.9349739903387383, + "grad_norm": 0.3258405029773712, + "learning_rate": 2.078712563187013e-07, + "loss": 0.1801, + "step": 50362 + }, + { + "epoch": 0.9350111204761569, + "grad_norm": 0.3929935097694397, + "learning_rate": 2.076347174916682e-07, + "loss": 0.2558, + "step": 50364 + }, + { + "epoch": 0.9350482506135755, + "grad_norm": 0.5458317995071411, + "learning_rate": 2.0739831190939697e-07, + "loss": 0.2046, + "step": 50366 + }, + { + "epoch": 0.9350853807509941, + "grad_norm": 0.2508763372898102, + "learning_rate": 2.071620395751006e-07, + "loss": 0.1868, + "step": 50368 + }, + { + "epoch": 0.9351225108884128, + "grad_norm": 1.228243350982666, + "learning_rate": 2.0692590049199656e-07, + "loss": 0.3273, + "step": 50370 + }, + { + "epoch": 0.9351596410258315, + "grad_norm": 0.4273628890514374, + "learning_rate": 2.066898946632978e-07, + "loss": 0.1735, + "step": 50372 + }, + { + "epoch": 0.9351967711632501, + "grad_norm": 0.18382541835308075, + "learning_rate": 2.06454022092214e-07, + "loss": 0.0686, + "step": 50374 + }, + { + "epoch": 0.9352339013006687, + "grad_norm": 0.3270367980003357, + "learning_rate": 2.062182827819581e-07, + "loss": 0.1611, + "step": 50376 + }, + { + "epoch": 0.9352710314380873, + "grad_norm": 0.4754035472869873, + "learning_rate": 2.0598267673573424e-07, + "loss": 0.2313, + "step": 50378 + }, + { + "epoch": 0.935308161575506, + "grad_norm": 0.5821787714958191, + "learning_rate": 2.0574720395674986e-07, + "loss": 0.2899, + "step": 50380 + }, + { + "epoch": 0.9353452917129246, + "grad_norm": 0.33625200390815735, + "learning_rate": 2.055118644482079e-07, + "loss": 0.3867, + "step": 50382 + }, + { + "epoch": 0.9353824218503433, + "grad_norm": 0.31917238235473633, + "learning_rate": 2.0527665821331256e-07, + "loss": 0.2235, + "step": 50384 + }, + { + "epoch": 0.9354195519877619, + "grad_norm": 0.29225248098373413, + "learning_rate": 2.050415852552623e-07, + "loss": 0.2697, + "step": 50386 + }, + { + "epoch": 0.9354566821251805, + "grad_norm": 0.3765079081058502, + "learning_rate": 2.0480664557725793e-07, + "loss": 0.2639, + "step": 50388 + }, + { + "epoch": 0.9354938122625992, + "grad_norm": 0.4467739760875702, + "learning_rate": 2.045718391824947e-07, + "loss": 0.2681, + "step": 50390 + }, + { + "epoch": 0.9355309424000178, + "grad_norm": 0.38358041644096375, + "learning_rate": 2.0433716607416886e-07, + "loss": 0.2499, + "step": 50392 + }, + { + "epoch": 0.9355680725374365, + "grad_norm": 0.39348816871643066, + "learning_rate": 2.0410262625547128e-07, + "loss": 0.1173, + "step": 50394 + }, + { + "epoch": 0.9356052026748551, + "grad_norm": 0.5106677412986755, + "learning_rate": 2.038682197295949e-07, + "loss": 0.2704, + "step": 50396 + }, + { + "epoch": 0.9356423328122737, + "grad_norm": 0.41727590560913086, + "learning_rate": 2.0363394649973056e-07, + "loss": 0.2787, + "step": 50398 + }, + { + "epoch": 0.9356794629496924, + "grad_norm": 0.36754557490348816, + "learning_rate": 2.0339980656906455e-07, + "loss": 0.2348, + "step": 50400 + }, + { + "epoch": 0.935716593087111, + "grad_norm": 0.3274444341659546, + "learning_rate": 2.0316579994078212e-07, + "loss": 0.2705, + "step": 50402 + }, + { + "epoch": 0.9357537232245297, + "grad_norm": 0.3400157392024994, + "learning_rate": 2.029319266180696e-07, + "loss": 0.3042, + "step": 50404 + }, + { + "epoch": 0.9357908533619483, + "grad_norm": 0.45816606283187866, + "learning_rate": 2.0269818660410667e-07, + "loss": 0.2454, + "step": 50406 + }, + { + "epoch": 0.9358279834993669, + "grad_norm": 0.4133220613002777, + "learning_rate": 2.0246457990207414e-07, + "loss": 0.3562, + "step": 50408 + }, + { + "epoch": 0.9358651136367856, + "grad_norm": 0.26466718316078186, + "learning_rate": 2.0223110651515275e-07, + "loss": 0.2541, + "step": 50410 + }, + { + "epoch": 0.9359022437742042, + "grad_norm": 0.40572988986968994, + "learning_rate": 2.0199776644651893e-07, + "loss": 0.3739, + "step": 50412 + }, + { + "epoch": 0.9359393739116229, + "grad_norm": 0.5456663966178894, + "learning_rate": 2.017645596993456e-07, + "loss": 0.2566, + "step": 50414 + }, + { + "epoch": 0.9359765040490415, + "grad_norm": 0.6315339207649231, + "learning_rate": 2.0153148627680917e-07, + "loss": 0.2558, + "step": 50416 + }, + { + "epoch": 0.9360136341864601, + "grad_norm": 0.29461973905563354, + "learning_rate": 2.0129854618207823e-07, + "loss": 0.2117, + "step": 50418 + }, + { + "epoch": 0.9360507643238788, + "grad_norm": 0.482860267162323, + "learning_rate": 2.0106573941832463e-07, + "loss": 0.2068, + "step": 50420 + }, + { + "epoch": 0.9360878944612974, + "grad_norm": 0.4696381986141205, + "learning_rate": 2.0083306598871478e-07, + "loss": 0.4644, + "step": 50422 + }, + { + "epoch": 0.936125024598716, + "grad_norm": 0.3810580372810364, + "learning_rate": 2.0060052589641498e-07, + "loss": 0.2406, + "step": 50424 + }, + { + "epoch": 0.9361621547361347, + "grad_norm": 0.33511000871658325, + "learning_rate": 2.003681191445883e-07, + "loss": 0.1702, + "step": 50426 + }, + { + "epoch": 0.9361992848735533, + "grad_norm": 0.7196905016899109, + "learning_rate": 2.0013584573639888e-07, + "loss": 0.2437, + "step": 50428 + }, + { + "epoch": 0.936236415010972, + "grad_norm": 0.5981458425521851, + "learning_rate": 1.999037056750075e-07, + "loss": 0.273, + "step": 50430 + }, + { + "epoch": 0.9362735451483906, + "grad_norm": 0.5226945281028748, + "learning_rate": 1.9967169896357164e-07, + "loss": 0.1275, + "step": 50432 + }, + { + "epoch": 0.9363106752858092, + "grad_norm": 0.36623862385749817, + "learning_rate": 1.9943982560524765e-07, + "loss": 0.1688, + "step": 50434 + }, + { + "epoch": 0.9363478054232278, + "grad_norm": 0.2931693494319916, + "learning_rate": 1.9920808560319194e-07, + "loss": 0.4856, + "step": 50436 + }, + { + "epoch": 0.9363849355606465, + "grad_norm": 0.505641758441925, + "learning_rate": 1.989764789605586e-07, + "loss": 0.2772, + "step": 50438 + }, + { + "epoch": 0.9364220656980652, + "grad_norm": 0.5909997820854187, + "learning_rate": 1.9874500568049625e-07, + "loss": 0.3388, + "step": 50440 + }, + { + "epoch": 0.9364591958354838, + "grad_norm": 0.4241778552532196, + "learning_rate": 1.9851366576615793e-07, + "loss": 0.4256, + "step": 50442 + }, + { + "epoch": 0.9364963259729024, + "grad_norm": 0.7040421962738037, + "learning_rate": 1.982824592206878e-07, + "loss": 0.2828, + "step": 50444 + }, + { + "epoch": 0.936533456110321, + "grad_norm": 0.459684282541275, + "learning_rate": 1.980513860472355e-07, + "loss": 0.2272, + "step": 50446 + }, + { + "epoch": 0.9365705862477397, + "grad_norm": 0.37046897411346436, + "learning_rate": 1.9782044624894193e-07, + "loss": 0.3529, + "step": 50448 + }, + { + "epoch": 0.9366077163851584, + "grad_norm": 0.4277670383453369, + "learning_rate": 1.9758963982895229e-07, + "loss": 0.1935, + "step": 50450 + }, + { + "epoch": 0.936644846522577, + "grad_norm": 0.3797200322151184, + "learning_rate": 1.9735896679040522e-07, + "loss": 0.2852, + "step": 50452 + }, + { + "epoch": 0.9366819766599956, + "grad_norm": 0.42556628584861755, + "learning_rate": 1.9712842713644153e-07, + "loss": 0.3725, + "step": 50454 + }, + { + "epoch": 0.9367191067974142, + "grad_norm": 0.2555093765258789, + "learning_rate": 1.968980208701965e-07, + "loss": 0.2586, + "step": 50456 + }, + { + "epoch": 0.9367562369348329, + "grad_norm": 0.45253226161003113, + "learning_rate": 1.9666774799480537e-07, + "loss": 0.3277, + "step": 50458 + }, + { + "epoch": 0.9367933670722516, + "grad_norm": 0.2537003457546234, + "learning_rate": 1.9643760851340232e-07, + "loss": 0.3003, + "step": 50460 + }, + { + "epoch": 0.9368304972096702, + "grad_norm": 0.256867378950119, + "learning_rate": 1.9620760242911818e-07, + "loss": 0.2997, + "step": 50462 + }, + { + "epoch": 0.9368676273470888, + "grad_norm": 0.3946177661418915, + "learning_rate": 1.9597772974508266e-07, + "loss": 0.4203, + "step": 50464 + }, + { + "epoch": 0.9369047574845074, + "grad_norm": 0.2898321747779846, + "learning_rate": 1.9574799046442438e-07, + "loss": 0.3947, + "step": 50466 + }, + { + "epoch": 0.9369418876219261, + "grad_norm": 0.30333712697029114, + "learning_rate": 1.9551838459026863e-07, + "loss": 0.3152, + "step": 50468 + }, + { + "epoch": 0.9369790177593448, + "grad_norm": 0.5426267981529236, + "learning_rate": 1.952889121257384e-07, + "loss": 0.4185, + "step": 50470 + }, + { + "epoch": 0.9370161478967634, + "grad_norm": 0.20420224964618683, + "learning_rate": 1.9505957307395907e-07, + "loss": 0.2163, + "step": 50472 + }, + { + "epoch": 0.937053278034182, + "grad_norm": 0.2776103913784027, + "learning_rate": 1.9483036743804916e-07, + "loss": 0.1981, + "step": 50474 + }, + { + "epoch": 0.9370904081716006, + "grad_norm": 0.3246515989303589, + "learning_rate": 1.9460129522112737e-07, + "loss": 0.3298, + "step": 50476 + }, + { + "epoch": 0.9371275383090193, + "grad_norm": 0.3749484419822693, + "learning_rate": 1.9437235642631225e-07, + "loss": 0.1431, + "step": 50478 + }, + { + "epoch": 0.9371646684464379, + "grad_norm": 0.314248651266098, + "learning_rate": 1.941435510567191e-07, + "loss": 0.2078, + "step": 50480 + }, + { + "epoch": 0.9372017985838565, + "grad_norm": 0.4632081389427185, + "learning_rate": 1.939148791154588e-07, + "loss": 0.2562, + "step": 50482 + }, + { + "epoch": 0.9372389287212752, + "grad_norm": 0.6664503812789917, + "learning_rate": 1.9368634060564552e-07, + "loss": 0.185, + "step": 50484 + }, + { + "epoch": 0.9372760588586938, + "grad_norm": 0.40528416633605957, + "learning_rate": 1.9345793553038671e-07, + "loss": 0.4443, + "step": 50486 + }, + { + "epoch": 0.9373131889961125, + "grad_norm": 0.3443734645843506, + "learning_rate": 1.932296638927922e-07, + "loss": 0.2818, + "step": 50488 + }, + { + "epoch": 0.9373503191335311, + "grad_norm": 0.26927846670150757, + "learning_rate": 1.930015256959661e-07, + "loss": 0.3232, + "step": 50490 + }, + { + "epoch": 0.9373874492709497, + "grad_norm": 0.283528596162796, + "learning_rate": 1.9277352094301592e-07, + "loss": 0.319, + "step": 50492 + }, + { + "epoch": 0.9374245794083684, + "grad_norm": 0.18127410113811493, + "learning_rate": 1.9254564963704036e-07, + "loss": 0.0619, + "step": 50494 + }, + { + "epoch": 0.937461709545787, + "grad_norm": 0.37580209970474243, + "learning_rate": 1.9231791178114245e-07, + "loss": 0.1107, + "step": 50496 + }, + { + "epoch": 0.9374988396832057, + "grad_norm": 0.31789058446884155, + "learning_rate": 1.9209030737841972e-07, + "loss": 0.2323, + "step": 50498 + }, + { + "epoch": 0.9375359698206243, + "grad_norm": 0.6327764987945557, + "learning_rate": 1.9186283643196967e-07, + "loss": 0.1488, + "step": 50500 + }, + { + "epoch": 0.9375730999580429, + "grad_norm": 0.5328879356384277, + "learning_rate": 1.9163549894488875e-07, + "loss": 0.1919, + "step": 50502 + }, + { + "epoch": 0.9376102300954616, + "grad_norm": 0.35823971033096313, + "learning_rate": 1.9140829492026892e-07, + "loss": 0.3331, + "step": 50504 + }, + { + "epoch": 0.9376473602328802, + "grad_norm": 0.5131212472915649, + "learning_rate": 1.9118122436120213e-07, + "loss": 0.1846, + "step": 50506 + }, + { + "epoch": 0.9376844903702989, + "grad_norm": 0.5100772976875305, + "learning_rate": 1.9095428727077813e-07, + "loss": 0.5072, + "step": 50508 + }, + { + "epoch": 0.9377216205077175, + "grad_norm": 0.36215993762016296, + "learning_rate": 1.907274836520845e-07, + "loss": 0.0539, + "step": 50510 + }, + { + "epoch": 0.9377587506451361, + "grad_norm": 0.31194761395454407, + "learning_rate": 1.905008135082076e-07, + "loss": 0.3013, + "step": 50512 + }, + { + "epoch": 0.9377958807825548, + "grad_norm": 0.36816251277923584, + "learning_rate": 1.9027427684223053e-07, + "loss": 0.3429, + "step": 50514 + }, + { + "epoch": 0.9378330109199734, + "grad_norm": 0.5024937391281128, + "learning_rate": 1.9004787365723864e-07, + "loss": 0.3006, + "step": 50516 + }, + { + "epoch": 0.9378701410573921, + "grad_norm": 0.40359485149383545, + "learning_rate": 1.8982160395631167e-07, + "loss": 0.274, + "step": 50518 + }, + { + "epoch": 0.9379072711948107, + "grad_norm": 0.27537867426872253, + "learning_rate": 1.8959546774252602e-07, + "loss": 0.1818, + "step": 50520 + }, + { + "epoch": 0.9379444013322293, + "grad_norm": 0.6612551808357239, + "learning_rate": 1.8936946501896147e-07, + "loss": 0.3089, + "step": 50522 + }, + { + "epoch": 0.937981531469648, + "grad_norm": 2.1385087966918945, + "learning_rate": 1.8914359578869223e-07, + "loss": 0.3573, + "step": 50524 + }, + { + "epoch": 0.9380186616070666, + "grad_norm": 0.3032298982143402, + "learning_rate": 1.889178600547914e-07, + "loss": 0.4587, + "step": 50526 + }, + { + "epoch": 0.9380557917444853, + "grad_norm": 0.3252864181995392, + "learning_rate": 1.8869225782033096e-07, + "loss": 0.347, + "step": 50528 + }, + { + "epoch": 0.9380929218819039, + "grad_norm": 0.3504236936569214, + "learning_rate": 1.8846678908838068e-07, + "loss": 0.3391, + "step": 50530 + }, + { + "epoch": 0.9381300520193225, + "grad_norm": 0.4330886900424957, + "learning_rate": 1.882414538620081e-07, + "loss": 0.3414, + "step": 50532 + }, + { + "epoch": 0.9381671821567411, + "grad_norm": 0.3574167490005493, + "learning_rate": 1.8801625214428076e-07, + "loss": 0.2797, + "step": 50534 + }, + { + "epoch": 0.9382043122941598, + "grad_norm": 0.35796743631362915, + "learning_rate": 1.877911839382607e-07, + "loss": 0.4347, + "step": 50536 + }, + { + "epoch": 0.9382414424315785, + "grad_norm": 0.36988940834999084, + "learning_rate": 1.8756624924701204e-07, + "loss": 0.3294, + "step": 50538 + }, + { + "epoch": 0.938278572568997, + "grad_norm": 0.2854657769203186, + "learning_rate": 1.8734144807359466e-07, + "loss": 0.3095, + "step": 50540 + }, + { + "epoch": 0.9383157027064157, + "grad_norm": 0.31911319494247437, + "learning_rate": 1.871167804210694e-07, + "loss": 0.3963, + "step": 50542 + }, + { + "epoch": 0.9383528328438343, + "grad_norm": 0.5411933064460754, + "learning_rate": 1.8689224629249048e-07, + "loss": 0.2804, + "step": 50544 + }, + { + "epoch": 0.938389962981253, + "grad_norm": 0.18840636312961578, + "learning_rate": 1.8666784569091544e-07, + "loss": 0.2103, + "step": 50546 + }, + { + "epoch": 0.9384270931186717, + "grad_norm": 0.42887693643569946, + "learning_rate": 1.864435786193963e-07, + "loss": 0.0617, + "step": 50548 + }, + { + "epoch": 0.9384642232560902, + "grad_norm": 0.49172353744506836, + "learning_rate": 1.86219445080984e-07, + "loss": 0.298, + "step": 50550 + }, + { + "epoch": 0.9385013533935089, + "grad_norm": 0.3039313852787018, + "learning_rate": 1.8599544507873046e-07, + "loss": 0.3195, + "step": 50552 + }, + { + "epoch": 0.9385384835309275, + "grad_norm": 0.3312618136405945, + "learning_rate": 1.8577157861568107e-07, + "loss": 0.3235, + "step": 50554 + }, + { + "epoch": 0.9385756136683462, + "grad_norm": 0.38901248574256897, + "learning_rate": 1.855478456948856e-07, + "loss": 0.3924, + "step": 50556 + }, + { + "epoch": 0.9386127438057649, + "grad_norm": 0.23433274030685425, + "learning_rate": 1.8532424631938496e-07, + "loss": 0.2871, + "step": 50558 + }, + { + "epoch": 0.9386498739431834, + "grad_norm": 0.41085517406463623, + "learning_rate": 1.8510078049222335e-07, + "loss": 0.3958, + "step": 50560 + }, + { + "epoch": 0.9386870040806021, + "grad_norm": 0.43302658200263977, + "learning_rate": 1.8487744821644061e-07, + "loss": 0.275, + "step": 50562 + }, + { + "epoch": 0.9387241342180207, + "grad_norm": 0.5038411021232605, + "learning_rate": 1.846542494950765e-07, + "loss": 0.2495, + "step": 50564 + }, + { + "epoch": 0.9387612643554394, + "grad_norm": 0.47242259979248047, + "learning_rate": 1.8443118433116857e-07, + "loss": 0.2778, + "step": 50566 + }, + { + "epoch": 0.9387983944928581, + "grad_norm": 0.5323737859725952, + "learning_rate": 1.8420825272775e-07, + "loss": 0.1311, + "step": 50568 + }, + { + "epoch": 0.9388355246302766, + "grad_norm": 0.5178433060646057, + "learning_rate": 1.839854546878561e-07, + "loss": 0.1369, + "step": 50570 + }, + { + "epoch": 0.9388726547676953, + "grad_norm": 0.4531875252723694, + "learning_rate": 1.8376279021451783e-07, + "loss": 0.2166, + "step": 50572 + }, + { + "epoch": 0.9389097849051139, + "grad_norm": 0.27651655673980713, + "learning_rate": 1.835402593107638e-07, + "loss": 0.3026, + "step": 50574 + }, + { + "epoch": 0.9389469150425326, + "grad_norm": 0.6184964776039124, + "learning_rate": 1.8331786197962386e-07, + "loss": 0.5048, + "step": 50576 + }, + { + "epoch": 0.9389840451799513, + "grad_norm": 0.17431627213954926, + "learning_rate": 1.8309559822412337e-07, + "loss": 0.2446, + "step": 50578 + }, + { + "epoch": 0.9390211753173698, + "grad_norm": 0.531634509563446, + "learning_rate": 1.8287346804728768e-07, + "loss": 0.396, + "step": 50580 + }, + { + "epoch": 0.9390583054547885, + "grad_norm": 0.5510864853858948, + "learning_rate": 1.826514714521377e-07, + "loss": 0.2673, + "step": 50582 + }, + { + "epoch": 0.9390954355922071, + "grad_norm": 0.27180781960487366, + "learning_rate": 1.8242960844169434e-07, + "loss": 0.175, + "step": 50584 + }, + { + "epoch": 0.9391325657296258, + "grad_norm": 0.4251702129840851, + "learning_rate": 1.822078790189774e-07, + "loss": 0.2656, + "step": 50586 + }, + { + "epoch": 0.9391696958670444, + "grad_norm": 0.35033923387527466, + "learning_rate": 1.819862831870034e-07, + "loss": 0.1701, + "step": 50588 + }, + { + "epoch": 0.939206826004463, + "grad_norm": 0.36468085646629333, + "learning_rate": 1.8176482094878767e-07, + "loss": 0.2366, + "step": 50590 + }, + { + "epoch": 0.9392439561418817, + "grad_norm": 0.3783363997936249, + "learning_rate": 1.8154349230734446e-07, + "loss": 0.4572, + "step": 50592 + }, + { + "epoch": 0.9392810862793003, + "grad_norm": 0.6165767312049866, + "learning_rate": 1.8132229726568473e-07, + "loss": 0.3959, + "step": 50594 + }, + { + "epoch": 0.939318216416719, + "grad_norm": 0.4167231023311615, + "learning_rate": 1.8110123582681716e-07, + "loss": 0.3769, + "step": 50596 + }, + { + "epoch": 0.9393553465541375, + "grad_norm": 0.31226852536201477, + "learning_rate": 1.8088030799375045e-07, + "loss": 0.3572, + "step": 50598 + }, + { + "epoch": 0.9393924766915562, + "grad_norm": 0.1898368000984192, + "learning_rate": 1.806595137694922e-07, + "loss": 0.2176, + "step": 50600 + }, + { + "epoch": 0.9394296068289749, + "grad_norm": 0.49393194913864136, + "learning_rate": 1.8043885315704445e-07, + "loss": 0.336, + "step": 50602 + }, + { + "epoch": 0.9394667369663935, + "grad_norm": 0.3919266164302826, + "learning_rate": 1.802183261594115e-07, + "loss": 0.2408, + "step": 50604 + }, + { + "epoch": 0.9395038671038122, + "grad_norm": 0.3320578634738922, + "learning_rate": 1.799979327795942e-07, + "loss": 0.1498, + "step": 50606 + }, + { + "epoch": 0.9395409972412307, + "grad_norm": 0.41016069054603577, + "learning_rate": 1.7977767302059025e-07, + "loss": 0.2311, + "step": 50608 + }, + { + "epoch": 0.9395781273786494, + "grad_norm": 0.3183784484863281, + "learning_rate": 1.7955754688539718e-07, + "loss": 0.2397, + "step": 50610 + }, + { + "epoch": 0.9396152575160681, + "grad_norm": 0.33483946323394775, + "learning_rate": 1.793375543770104e-07, + "loss": 0.2633, + "step": 50612 + }, + { + "epoch": 0.9396523876534867, + "grad_norm": 0.41786736249923706, + "learning_rate": 1.7911769549842307e-07, + "loss": 0.5234, + "step": 50614 + }, + { + "epoch": 0.9396895177909054, + "grad_norm": 0.3217369616031647, + "learning_rate": 1.788979702526261e-07, + "loss": 0.4471, + "step": 50616 + }, + { + "epoch": 0.9397266479283239, + "grad_norm": 0.37516579031944275, + "learning_rate": 1.786783786426116e-07, + "loss": 0.3092, + "step": 50618 + }, + { + "epoch": 0.9397637780657426, + "grad_norm": 0.36329740285873413, + "learning_rate": 1.7845892067136604e-07, + "loss": 0.2399, + "step": 50620 + }, + { + "epoch": 0.9398009082031613, + "grad_norm": 0.40958651900291443, + "learning_rate": 1.782395963418748e-07, + "loss": 0.2678, + "step": 50622 + }, + { + "epoch": 0.9398380383405799, + "grad_norm": 0.476159930229187, + "learning_rate": 1.780204056571222e-07, + "loss": 0.2439, + "step": 50624 + }, + { + "epoch": 0.9398751684779986, + "grad_norm": 0.6685574054718018, + "learning_rate": 1.7780134862009356e-07, + "loss": 0.2646, + "step": 50626 + }, + { + "epoch": 0.9399122986154171, + "grad_norm": 0.42609867453575134, + "learning_rate": 1.7758242523376658e-07, + "loss": 0.3007, + "step": 50628 + }, + { + "epoch": 0.9399494287528358, + "grad_norm": 0.44763118028640747, + "learning_rate": 1.7736363550112213e-07, + "loss": 0.2261, + "step": 50630 + }, + { + "epoch": 0.9399865588902544, + "grad_norm": 0.4175860583782196, + "learning_rate": 1.7714497942513565e-07, + "loss": 0.4953, + "step": 50632 + }, + { + "epoch": 0.9400236890276731, + "grad_norm": 0.37613430619239807, + "learning_rate": 1.7692645700878365e-07, + "loss": 0.38, + "step": 50634 + }, + { + "epoch": 0.9400608191650918, + "grad_norm": 0.5618124604225159, + "learning_rate": 1.7670806825503816e-07, + "loss": 0.2512, + "step": 50636 + }, + { + "epoch": 0.9400979493025103, + "grad_norm": 0.35358425974845886, + "learning_rate": 1.7648981316687242e-07, + "loss": 0.1269, + "step": 50638 + }, + { + "epoch": 0.940135079439929, + "grad_norm": 0.3483954966068268, + "learning_rate": 1.762716917472551e-07, + "loss": 0.2341, + "step": 50640 + }, + { + "epoch": 0.9401722095773476, + "grad_norm": 0.26897183060646057, + "learning_rate": 1.76053703999155e-07, + "loss": 0.2906, + "step": 50642 + }, + { + "epoch": 0.9402093397147663, + "grad_norm": 0.33310216665267944, + "learning_rate": 1.758358499255375e-07, + "loss": 0.3203, + "step": 50644 + }, + { + "epoch": 0.940246469852185, + "grad_norm": 0.3106984794139862, + "learning_rate": 1.7561812952936795e-07, + "loss": 0.0731, + "step": 50646 + }, + { + "epoch": 0.9402835999896035, + "grad_norm": 0.42390069365501404, + "learning_rate": 1.7540054281360742e-07, + "loss": 0.3323, + "step": 50648 + }, + { + "epoch": 0.9403207301270222, + "grad_norm": 0.5804970264434814, + "learning_rate": 1.751830897812179e-07, + "loss": 0.2338, + "step": 50650 + }, + { + "epoch": 0.9403578602644408, + "grad_norm": 0.39209285378456116, + "learning_rate": 1.7496577043515817e-07, + "loss": 0.4027, + "step": 50652 + }, + { + "epoch": 0.9403949904018595, + "grad_norm": 0.2924545705318451, + "learning_rate": 1.7474858477838363e-07, + "loss": 0.1227, + "step": 50654 + }, + { + "epoch": 0.9404321205392782, + "grad_norm": 0.3210468590259552, + "learning_rate": 1.7453153281385304e-07, + "loss": 0.3435, + "step": 50656 + }, + { + "epoch": 0.9404692506766967, + "grad_norm": 0.38592904806137085, + "learning_rate": 1.7431461454451514e-07, + "loss": 0.1794, + "step": 50658 + }, + { + "epoch": 0.9405063808141154, + "grad_norm": 0.5031899809837341, + "learning_rate": 1.7409782997332536e-07, + "loss": 0.2234, + "step": 50660 + }, + { + "epoch": 0.940543510951534, + "grad_norm": 0.4274146854877472, + "learning_rate": 1.7388117910323133e-07, + "loss": 0.1912, + "step": 50662 + }, + { + "epoch": 0.9405806410889527, + "grad_norm": 0.9583744406700134, + "learning_rate": 1.736646619371818e-07, + "loss": 0.1072, + "step": 50664 + }, + { + "epoch": 0.9406177712263714, + "grad_norm": 0.32100996375083923, + "learning_rate": 1.734482784781233e-07, + "loss": 0.3274, + "step": 50666 + }, + { + "epoch": 0.9406549013637899, + "grad_norm": 0.34991973638534546, + "learning_rate": 1.7323202872900126e-07, + "loss": 0.3383, + "step": 50668 + }, + { + "epoch": 0.9406920315012086, + "grad_norm": 0.4815565049648285, + "learning_rate": 1.730159126927544e-07, + "loss": 0.1787, + "step": 50670 + }, + { + "epoch": 0.9407291616386272, + "grad_norm": 0.27533143758773804, + "learning_rate": 1.7279993037232712e-07, + "loss": 0.2097, + "step": 50672 + }, + { + "epoch": 0.9407662917760459, + "grad_norm": 0.26766160130500793, + "learning_rate": 1.72584081770657e-07, + "loss": 0.286, + "step": 50674 + }, + { + "epoch": 0.9408034219134646, + "grad_norm": 0.282402902841568, + "learning_rate": 1.7236836689068058e-07, + "loss": 0.2247, + "step": 50676 + }, + { + "epoch": 0.9408405520508831, + "grad_norm": 0.523047685623169, + "learning_rate": 1.7215278573533334e-07, + "loss": 0.228, + "step": 50678 + }, + { + "epoch": 0.9408776821883018, + "grad_norm": 0.24758701026439667, + "learning_rate": 1.7193733830754845e-07, + "loss": 0.0928, + "step": 50680 + }, + { + "epoch": 0.9409148123257204, + "grad_norm": 0.5844394564628601, + "learning_rate": 1.7172202461025912e-07, + "loss": 0.1938, + "step": 50682 + }, + { + "epoch": 0.9409519424631391, + "grad_norm": 0.7414089441299438, + "learning_rate": 1.71506844646393e-07, + "loss": 0.1886, + "step": 50684 + }, + { + "epoch": 0.9409890726005576, + "grad_norm": 0.3517048954963684, + "learning_rate": 1.712917984188789e-07, + "loss": 0.1523, + "step": 50686 + }, + { + "epoch": 0.9410262027379763, + "grad_norm": 0.3593765199184418, + "learning_rate": 1.710768859306433e-07, + "loss": 0.297, + "step": 50688 + }, + { + "epoch": 0.941063332875395, + "grad_norm": 0.547726035118103, + "learning_rate": 1.708621071846106e-07, + "loss": 0.1596, + "step": 50690 + }, + { + "epoch": 0.9411004630128136, + "grad_norm": 0.3129470944404602, + "learning_rate": 1.7064746218370288e-07, + "loss": 0.2858, + "step": 50692 + }, + { + "epoch": 0.9411375931502323, + "grad_norm": 0.4534943997859955, + "learning_rate": 1.7043295093084112e-07, + "loss": 0.263, + "step": 50694 + }, + { + "epoch": 0.9411747232876508, + "grad_norm": 0.4176694452762604, + "learning_rate": 1.702185734289441e-07, + "loss": 0.3454, + "step": 50696 + }, + { + "epoch": 0.9412118534250695, + "grad_norm": 0.705500602722168, + "learning_rate": 1.700043296809273e-07, + "loss": 0.1812, + "step": 50698 + }, + { + "epoch": 0.9412489835624882, + "grad_norm": 0.28096455335617065, + "learning_rate": 1.6979021968970832e-07, + "loss": 0.2047, + "step": 50700 + }, + { + "epoch": 0.9412861136999068, + "grad_norm": 0.2851487398147583, + "learning_rate": 1.6957624345819934e-07, + "loss": 0.3721, + "step": 50702 + }, + { + "epoch": 0.9413232438373255, + "grad_norm": 0.3478700518608093, + "learning_rate": 1.6936240098931135e-07, + "loss": 0.3118, + "step": 50704 + }, + { + "epoch": 0.941360373974744, + "grad_norm": 0.35653167963027954, + "learning_rate": 1.6914869228595644e-07, + "loss": 0.2814, + "step": 50706 + }, + { + "epoch": 0.9413975041121627, + "grad_norm": 0.3799697756767273, + "learning_rate": 1.6893511735104007e-07, + "loss": 0.2944, + "step": 50708 + }, + { + "epoch": 0.9414346342495814, + "grad_norm": 0.42187803983688354, + "learning_rate": 1.6872167618746992e-07, + "loss": 0.3894, + "step": 50710 + }, + { + "epoch": 0.941471764387, + "grad_norm": 0.31701526045799255, + "learning_rate": 1.6850836879814925e-07, + "loss": 0.1939, + "step": 50712 + }, + { + "epoch": 0.9415088945244187, + "grad_norm": 0.41651391983032227, + "learning_rate": 1.6829519518598014e-07, + "loss": 0.4861, + "step": 50714 + }, + { + "epoch": 0.9415460246618372, + "grad_norm": 0.3945968747138977, + "learning_rate": 1.6808215535386584e-07, + "loss": 0.0756, + "step": 50716 + }, + { + "epoch": 0.9415831547992559, + "grad_norm": 0.48246362805366516, + "learning_rate": 1.6786924930470293e-07, + "loss": 0.1649, + "step": 50718 + }, + { + "epoch": 0.9416202849366746, + "grad_norm": 0.508870542049408, + "learning_rate": 1.6765647704138909e-07, + "loss": 0.262, + "step": 50720 + }, + { + "epoch": 0.9416574150740932, + "grad_norm": 0.4178345501422882, + "learning_rate": 1.6744383856681867e-07, + "loss": 0.2277, + "step": 50722 + }, + { + "epoch": 0.9416945452115119, + "grad_norm": 0.6024828553199768, + "learning_rate": 1.672313338838849e-07, + "loss": 0.3844, + "step": 50724 + }, + { + "epoch": 0.9417316753489304, + "grad_norm": 0.47281378507614136, + "learning_rate": 1.6701896299548105e-07, + "loss": 0.2748, + "step": 50726 + }, + { + "epoch": 0.9417688054863491, + "grad_norm": 0.9686122536659241, + "learning_rate": 1.6680672590449588e-07, + "loss": 0.2208, + "step": 50728 + }, + { + "epoch": 0.9418059356237678, + "grad_norm": 0.42115071415901184, + "learning_rate": 1.6659462261381709e-07, + "loss": 0.3366, + "step": 50730 + }, + { + "epoch": 0.9418430657611864, + "grad_norm": 0.4273863732814789, + "learning_rate": 1.6638265312633129e-07, + "loss": 0.2523, + "step": 50732 + }, + { + "epoch": 0.941880195898605, + "grad_norm": 0.4353385865688324, + "learning_rate": 1.661708174449228e-07, + "loss": 0.1577, + "step": 50734 + }, + { + "epoch": 0.9419173260360236, + "grad_norm": 0.4323591887950897, + "learning_rate": 1.6595911557247267e-07, + "loss": 0.2597, + "step": 50736 + }, + { + "epoch": 0.9419544561734423, + "grad_norm": 0.31123489141464233, + "learning_rate": 1.6574754751186416e-07, + "loss": 0.3125, + "step": 50738 + }, + { + "epoch": 0.9419915863108609, + "grad_norm": 0.29007601737976074, + "learning_rate": 1.6553611326597384e-07, + "loss": 0.1705, + "step": 50740 + }, + { + "epoch": 0.9420287164482796, + "grad_norm": 0.28883588314056396, + "learning_rate": 1.6532481283767833e-07, + "loss": 0.2308, + "step": 50742 + }, + { + "epoch": 0.9420658465856983, + "grad_norm": 0.4793853461742401, + "learning_rate": 1.6511364622985527e-07, + "loss": 0.214, + "step": 50744 + }, + { + "epoch": 0.9421029767231168, + "grad_norm": 0.5205013751983643, + "learning_rate": 1.6490261344537684e-07, + "loss": 0.2724, + "step": 50746 + }, + { + "epoch": 0.9421401068605355, + "grad_norm": 0.4871504604816437, + "learning_rate": 1.6469171448711298e-07, + "loss": 0.1824, + "step": 50748 + }, + { + "epoch": 0.9421772369979541, + "grad_norm": 0.2478114813566208, + "learning_rate": 1.6448094935793469e-07, + "loss": 0.2894, + "step": 50750 + }, + { + "epoch": 0.9422143671353728, + "grad_norm": 0.3564758598804474, + "learning_rate": 1.642703180607108e-07, + "loss": 0.2557, + "step": 50752 + }, + { + "epoch": 0.9422514972727914, + "grad_norm": 0.34629225730895996, + "learning_rate": 1.6405982059830573e-07, + "loss": 0.1682, + "step": 50754 + }, + { + "epoch": 0.94228862741021, + "grad_norm": 0.3364121913909912, + "learning_rate": 1.63849456973586e-07, + "loss": 0.2919, + "step": 50756 + }, + { + "epoch": 0.9423257575476287, + "grad_norm": 0.5090534687042236, + "learning_rate": 1.636392271894105e-07, + "loss": 0.1951, + "step": 50758 + }, + { + "epoch": 0.9423628876850473, + "grad_norm": 0.47575676441192627, + "learning_rate": 1.6342913124864356e-07, + "loss": 0.321, + "step": 50760 + }, + { + "epoch": 0.942400017822466, + "grad_norm": 0.6369500756263733, + "learning_rate": 1.6321916915414182e-07, + "loss": 0.1285, + "step": 50762 + }, + { + "epoch": 0.9424371479598846, + "grad_norm": 0.38148143887519836, + "learning_rate": 1.6300934090876187e-07, + "loss": 0.2228, + "step": 50764 + }, + { + "epoch": 0.9424742780973032, + "grad_norm": 0.5819621682167053, + "learning_rate": 1.6279964651535919e-07, + "loss": 0.4027, + "step": 50766 + }, + { + "epoch": 0.9425114082347219, + "grad_norm": 0.2398240864276886, + "learning_rate": 1.6259008597678817e-07, + "loss": 0.1611, + "step": 50768 + }, + { + "epoch": 0.9425485383721405, + "grad_norm": 0.31871211528778076, + "learning_rate": 1.6238065929589985e-07, + "loss": 0.0933, + "step": 50770 + }, + { + "epoch": 0.9425856685095592, + "grad_norm": 0.3966899812221527, + "learning_rate": 1.621713664755431e-07, + "loss": 0.3231, + "step": 50772 + }, + { + "epoch": 0.9426227986469778, + "grad_norm": 0.366228848695755, + "learning_rate": 1.6196220751856674e-07, + "loss": 0.2633, + "step": 50774 + }, + { + "epoch": 0.9426599287843964, + "grad_norm": 0.18636897206306458, + "learning_rate": 1.6175318242781623e-07, + "loss": 0.1736, + "step": 50776 + }, + { + "epoch": 0.9426970589218151, + "grad_norm": 0.49861106276512146, + "learning_rate": 1.6154429120613712e-07, + "loss": 0.2072, + "step": 50778 + }, + { + "epoch": 0.9427341890592337, + "grad_norm": 0.3963800370693207, + "learning_rate": 1.6133553385636935e-07, + "loss": 0.3982, + "step": 50780 + }, + { + "epoch": 0.9427713191966524, + "grad_norm": 1.3076131343841553, + "learning_rate": 1.611269103813551e-07, + "loss": 0.3215, + "step": 50782 + }, + { + "epoch": 0.9428084493340709, + "grad_norm": 0.4988119900226593, + "learning_rate": 1.6091842078393204e-07, + "loss": 0.267, + "step": 50784 + }, + { + "epoch": 0.9428455794714896, + "grad_norm": 0.33033621311187744, + "learning_rate": 1.607100650669391e-07, + "loss": 0.2916, + "step": 50786 + }, + { + "epoch": 0.9428827096089083, + "grad_norm": 0.40125006437301636, + "learning_rate": 1.605018432332084e-07, + "loss": 0.2322, + "step": 50788 + }, + { + "epoch": 0.9429198397463269, + "grad_norm": 0.34340158104896545, + "learning_rate": 1.6029375528557544e-07, + "loss": 0.3728, + "step": 50790 + }, + { + "epoch": 0.9429569698837456, + "grad_norm": 0.3459474742412567, + "learning_rate": 1.6008580122687244e-07, + "loss": 0.3021, + "step": 50792 + }, + { + "epoch": 0.9429941000211641, + "grad_norm": 0.4168727695941925, + "learning_rate": 1.5987798105992714e-07, + "loss": 0.2049, + "step": 50794 + }, + { + "epoch": 0.9430312301585828, + "grad_norm": 0.37913772463798523, + "learning_rate": 1.5967029478756724e-07, + "loss": 0.4085, + "step": 50796 + }, + { + "epoch": 0.9430683602960015, + "grad_norm": 0.3915462791919708, + "learning_rate": 1.594627424126194e-07, + "loss": 0.2323, + "step": 50798 + }, + { + "epoch": 0.9431054904334201, + "grad_norm": 0.4993734061717987, + "learning_rate": 1.592553239379091e-07, + "loss": 0.3659, + "step": 50800 + }, + { + "epoch": 0.9431426205708388, + "grad_norm": 1.349609136581421, + "learning_rate": 1.5904803936625635e-07, + "loss": 0.1961, + "step": 50802 + }, + { + "epoch": 0.9431797507082573, + "grad_norm": 0.3478587567806244, + "learning_rate": 1.588408887004833e-07, + "loss": 0.4316, + "step": 50804 + }, + { + "epoch": 0.943216880845676, + "grad_norm": 0.34329503774642944, + "learning_rate": 1.5863387194340663e-07, + "loss": 0.2534, + "step": 50806 + }, + { + "epoch": 0.9432540109830947, + "grad_norm": 0.3742624819278717, + "learning_rate": 1.5842698909784737e-07, + "loss": 0.0962, + "step": 50808 + }, + { + "epoch": 0.9432911411205133, + "grad_norm": 0.3517443537712097, + "learning_rate": 1.5822024016661553e-07, + "loss": 0.285, + "step": 50810 + }, + { + "epoch": 0.943328271257932, + "grad_norm": 0.36728939414024353, + "learning_rate": 1.5801362515252773e-07, + "loss": 0.252, + "step": 50812 + }, + { + "epoch": 0.9433654013953505, + "grad_norm": 0.456514835357666, + "learning_rate": 1.578071440583928e-07, + "loss": 0.2715, + "step": 50814 + }, + { + "epoch": 0.9434025315327692, + "grad_norm": 0.5580490827560425, + "learning_rate": 1.57600796887023e-07, + "loss": 0.3401, + "step": 50816 + }, + { + "epoch": 0.9434396616701879, + "grad_norm": 0.43877464532852173, + "learning_rate": 1.5739458364122494e-07, + "loss": 0.0728, + "step": 50818 + }, + { + "epoch": 0.9434767918076065, + "grad_norm": 0.258999228477478, + "learning_rate": 1.5718850432380528e-07, + "loss": 0.3801, + "step": 50820 + }, + { + "epoch": 0.9435139219450251, + "grad_norm": 0.38710400462150574, + "learning_rate": 1.569825589375673e-07, + "loss": 0.0745, + "step": 50822 + }, + { + "epoch": 0.9435510520824437, + "grad_norm": 0.4101353883743286, + "learning_rate": 1.5677674748531323e-07, + "loss": 0.1797, + "step": 50824 + }, + { + "epoch": 0.9435881822198624, + "grad_norm": 0.34796273708343506, + "learning_rate": 1.5657106996984305e-07, + "loss": 0.2001, + "step": 50826 + }, + { + "epoch": 0.9436253123572811, + "grad_norm": 0.5266385078430176, + "learning_rate": 1.5636552639395676e-07, + "loss": 0.2376, + "step": 50828 + }, + { + "epoch": 0.9436624424946997, + "grad_norm": 0.2701573967933655, + "learning_rate": 1.5616011676044984e-07, + "loss": 0.3514, + "step": 50830 + }, + { + "epoch": 0.9436995726321183, + "grad_norm": 0.376899391412735, + "learning_rate": 1.55954841072119e-07, + "loss": 0.2988, + "step": 50832 + }, + { + "epoch": 0.9437367027695369, + "grad_norm": 0.46510744094848633, + "learning_rate": 1.5574969933175532e-07, + "loss": 0.1707, + "step": 50834 + }, + { + "epoch": 0.9437738329069556, + "grad_norm": 0.392042338848114, + "learning_rate": 1.5554469154215212e-07, + "loss": 0.4621, + "step": 50836 + }, + { + "epoch": 0.9438109630443742, + "grad_norm": 0.269295871257782, + "learning_rate": 1.553398177060983e-07, + "loss": 0.3467, + "step": 50838 + }, + { + "epoch": 0.9438480931817929, + "grad_norm": 0.3690491318702698, + "learning_rate": 1.551350778263805e-07, + "loss": 0.0764, + "step": 50840 + }, + { + "epoch": 0.9438852233192115, + "grad_norm": 0.33338361978530884, + "learning_rate": 1.5493047190578647e-07, + "loss": 0.2309, + "step": 50842 + }, + { + "epoch": 0.9439223534566301, + "grad_norm": 0.4011947810649872, + "learning_rate": 1.5472599994709848e-07, + "loss": 0.145, + "step": 50844 + }, + { + "epoch": 0.9439594835940488, + "grad_norm": 0.36638596653938293, + "learning_rate": 1.5452166195310093e-07, + "loss": 0.3649, + "step": 50846 + }, + { + "epoch": 0.9439966137314674, + "grad_norm": 0.5089031457901001, + "learning_rate": 1.543174579265716e-07, + "loss": 0.5912, + "step": 50848 + }, + { + "epoch": 0.944033743868886, + "grad_norm": 0.40648385882377625, + "learning_rate": 1.541133878702905e-07, + "loss": 0.3476, + "step": 50850 + }, + { + "epoch": 0.9440708740063047, + "grad_norm": 0.3949213922023773, + "learning_rate": 1.5390945178703543e-07, + "loss": 0.2193, + "step": 50852 + }, + { + "epoch": 0.9441080041437233, + "grad_norm": 0.36781901121139526, + "learning_rate": 1.5370564967957857e-07, + "loss": 0.3288, + "step": 50854 + }, + { + "epoch": 0.944145134281142, + "grad_norm": 0.46677350997924805, + "learning_rate": 1.535019815506955e-07, + "loss": 0.2375, + "step": 50856 + }, + { + "epoch": 0.9441822644185606, + "grad_norm": 0.3283880054950714, + "learning_rate": 1.5329844740315736e-07, + "loss": 0.2453, + "step": 50858 + }, + { + "epoch": 0.9442193945559793, + "grad_norm": 0.3368111848831177, + "learning_rate": 1.53095047239733e-07, + "loss": 0.2168, + "step": 50860 + }, + { + "epoch": 0.9442565246933979, + "grad_norm": 0.44965943694114685, + "learning_rate": 1.5289178106319024e-07, + "loss": 0.3098, + "step": 50862 + }, + { + "epoch": 0.9442936548308165, + "grad_norm": 0.3287608027458191, + "learning_rate": 1.5268864887629465e-07, + "loss": 0.1679, + "step": 50864 + }, + { + "epoch": 0.9443307849682352, + "grad_norm": 0.22254541516304016, + "learning_rate": 1.5248565068180954e-07, + "loss": 0.3215, + "step": 50866 + }, + { + "epoch": 0.9443679151056538, + "grad_norm": 0.5609930157661438, + "learning_rate": 1.5228278648249938e-07, + "loss": 0.22, + "step": 50868 + }, + { + "epoch": 0.9444050452430724, + "grad_norm": 0.36397600173950195, + "learning_rate": 1.520800562811231e-07, + "loss": 0.3221, + "step": 50870 + }, + { + "epoch": 0.9444421753804911, + "grad_norm": 0.32637637853622437, + "learning_rate": 1.5187746008043846e-07, + "loss": 0.2578, + "step": 50872 + }, + { + "epoch": 0.9444793055179097, + "grad_norm": 0.37783822417259216, + "learning_rate": 1.5167499788320216e-07, + "loss": 0.2866, + "step": 50874 + }, + { + "epoch": 0.9445164356553284, + "grad_norm": 0.43859991431236267, + "learning_rate": 1.51472669692172e-07, + "loss": 0.2698, + "step": 50876 + }, + { + "epoch": 0.944553565792747, + "grad_norm": 0.47613459825515747, + "learning_rate": 1.51270475510098e-07, + "loss": 0.3631, + "step": 50878 + }, + { + "epoch": 0.9445906959301656, + "grad_norm": 0.3435044586658478, + "learning_rate": 1.5106841533973238e-07, + "loss": 0.1484, + "step": 50880 + }, + { + "epoch": 0.9446278260675843, + "grad_norm": 0.3109498620033264, + "learning_rate": 1.5086648918382407e-07, + "loss": 0.2662, + "step": 50882 + }, + { + "epoch": 0.9446649562050029, + "grad_norm": 0.21308717131614685, + "learning_rate": 1.506646970451231e-07, + "loss": 0.1752, + "step": 50884 + }, + { + "epoch": 0.9447020863424216, + "grad_norm": 0.3168817460536957, + "learning_rate": 1.5046303892637282e-07, + "loss": 0.2611, + "step": 50886 + }, + { + "epoch": 0.9447392164798402, + "grad_norm": 0.39375633001327515, + "learning_rate": 1.502615148303177e-07, + "loss": 0.2608, + "step": 50888 + }, + { + "epoch": 0.9447763466172588, + "grad_norm": 0.4844721257686615, + "learning_rate": 1.5006012475969888e-07, + "loss": 0.2319, + "step": 50890 + }, + { + "epoch": 0.9448134767546774, + "grad_norm": 0.28992506861686707, + "learning_rate": 1.4985886871725862e-07, + "loss": 0.1648, + "step": 50892 + }, + { + "epoch": 0.9448506068920961, + "grad_norm": 0.3431810438632965, + "learning_rate": 1.4965774670573473e-07, + "loss": 0.4382, + "step": 50894 + }, + { + "epoch": 0.9448877370295148, + "grad_norm": 0.42396751046180725, + "learning_rate": 1.4945675872786392e-07, + "loss": 0.3528, + "step": 50896 + }, + { + "epoch": 0.9449248671669334, + "grad_norm": 0.1901555210351944, + "learning_rate": 1.4925590478638064e-07, + "loss": 0.2131, + "step": 50898 + }, + { + "epoch": 0.944961997304352, + "grad_norm": 0.3956177532672882, + "learning_rate": 1.4905518488401714e-07, + "loss": 0.3167, + "step": 50900 + }, + { + "epoch": 0.9449991274417706, + "grad_norm": 0.4629685580730438, + "learning_rate": 1.4885459902350685e-07, + "loss": 0.2148, + "step": 50902 + }, + { + "epoch": 0.9450362575791893, + "grad_norm": 0.35121551156044006, + "learning_rate": 1.4865414720757641e-07, + "loss": 0.3034, + "step": 50904 + }, + { + "epoch": 0.945073387716608, + "grad_norm": 0.21928615868091583, + "learning_rate": 1.4845382943895704e-07, + "loss": 0.3247, + "step": 50906 + }, + { + "epoch": 0.9451105178540266, + "grad_norm": 0.2957574725151062, + "learning_rate": 1.4825364572037093e-07, + "loss": 0.1804, + "step": 50908 + }, + { + "epoch": 0.9451476479914452, + "grad_norm": 0.22765344381332397, + "learning_rate": 1.4805359605454373e-07, + "loss": 0.183, + "step": 50910 + }, + { + "epoch": 0.9451847781288638, + "grad_norm": 0.41308075189590454, + "learning_rate": 1.4785368044419657e-07, + "loss": 0.2705, + "step": 50912 + }, + { + "epoch": 0.9452219082662825, + "grad_norm": 0.5689462423324585, + "learning_rate": 1.476538988920506e-07, + "loss": 0.2481, + "step": 50914 + }, + { + "epoch": 0.9452590384037012, + "grad_norm": 0.6958836913108826, + "learning_rate": 1.4745425140082258e-07, + "loss": 0.2674, + "step": 50916 + }, + { + "epoch": 0.9452961685411198, + "grad_norm": 0.30955561995506287, + "learning_rate": 1.472547379732314e-07, + "loss": 0.1922, + "step": 50918 + }, + { + "epoch": 0.9453332986785384, + "grad_norm": 0.21716073155403137, + "learning_rate": 1.4705535861199162e-07, + "loss": 0.3639, + "step": 50920 + }, + { + "epoch": 0.945370428815957, + "grad_norm": 0.2723653018474579, + "learning_rate": 1.468561133198143e-07, + "loss": 0.3251, + "step": 50922 + }, + { + "epoch": 0.9454075589533757, + "grad_norm": 0.35107341408729553, + "learning_rate": 1.4665700209941292e-07, + "loss": 0.2167, + "step": 50924 + }, + { + "epoch": 0.9454446890907944, + "grad_norm": 0.466427743434906, + "learning_rate": 1.4645802495349414e-07, + "loss": 0.2326, + "step": 50926 + }, + { + "epoch": 0.945481819228213, + "grad_norm": 0.35235506296157837, + "learning_rate": 1.4625918188476806e-07, + "loss": 0.277, + "step": 50928 + }, + { + "epoch": 0.9455189493656316, + "grad_norm": 0.24556496739387512, + "learning_rate": 1.4606047289593805e-07, + "loss": 0.2478, + "step": 50930 + }, + { + "epoch": 0.9455560795030502, + "grad_norm": 0.41484537720680237, + "learning_rate": 1.4586189798970973e-07, + "loss": 0.2627, + "step": 50932 + }, + { + "epoch": 0.9455932096404689, + "grad_norm": 0.530802845954895, + "learning_rate": 1.4566345716878539e-07, + "loss": 0.2982, + "step": 50934 + }, + { + "epoch": 0.9456303397778875, + "grad_norm": 0.18776439130306244, + "learning_rate": 1.4546515043586285e-07, + "loss": 0.0632, + "step": 50936 + }, + { + "epoch": 0.9456674699153061, + "grad_norm": 0.4547887444496155, + "learning_rate": 1.4526697779364218e-07, + "loss": 0.1809, + "step": 50938 + }, + { + "epoch": 0.9457046000527248, + "grad_norm": 0.41720056533813477, + "learning_rate": 1.4506893924481902e-07, + "loss": 0.2367, + "step": 50940 + }, + { + "epoch": 0.9457417301901434, + "grad_norm": 0.41276952624320984, + "learning_rate": 1.4487103479208898e-07, + "loss": 0.4067, + "step": 50942 + }, + { + "epoch": 0.9457788603275621, + "grad_norm": 0.4630793631076813, + "learning_rate": 1.4467326443814433e-07, + "loss": 0.3856, + "step": 50944 + }, + { + "epoch": 0.9458159904649807, + "grad_norm": 0.42008623480796814, + "learning_rate": 1.444756281856774e-07, + "loss": 0.4544, + "step": 50946 + }, + { + "epoch": 0.9458531206023993, + "grad_norm": 0.4949977993965149, + "learning_rate": 1.4427812603737601e-07, + "loss": 0.2481, + "step": 50948 + }, + { + "epoch": 0.945890250739818, + "grad_norm": 0.35529232025146484, + "learning_rate": 1.4408075799592913e-07, + "loss": 0.4289, + "step": 50950 + }, + { + "epoch": 0.9459273808772366, + "grad_norm": 0.6116740703582764, + "learning_rate": 1.4388352406401905e-07, + "loss": 0.3819, + "step": 50952 + }, + { + "epoch": 0.9459645110146553, + "grad_norm": 0.3929576277732849, + "learning_rate": 1.4368642424433365e-07, + "loss": 0.2431, + "step": 50954 + }, + { + "epoch": 0.9460016411520739, + "grad_norm": 0.30607128143310547, + "learning_rate": 1.4348945853955188e-07, + "loss": 0.2749, + "step": 50956 + }, + { + "epoch": 0.9460387712894925, + "grad_norm": 0.32680752873420715, + "learning_rate": 1.432926269523549e-07, + "loss": 0.2426, + "step": 50958 + }, + { + "epoch": 0.9460759014269112, + "grad_norm": 0.3459329307079315, + "learning_rate": 1.4309592948542172e-07, + "loss": 0.1661, + "step": 50960 + }, + { + "epoch": 0.9461130315643298, + "grad_norm": 0.23430697619915009, + "learning_rate": 1.4289936614142686e-07, + "loss": 0.1726, + "step": 50962 + }, + { + "epoch": 0.9461501617017485, + "grad_norm": 0.5292536020278931, + "learning_rate": 1.4270293692304704e-07, + "loss": 0.3447, + "step": 50964 + }, + { + "epoch": 0.946187291839167, + "grad_norm": 0.5701496005058289, + "learning_rate": 1.4250664183295347e-07, + "loss": 0.631, + "step": 50966 + }, + { + "epoch": 0.9462244219765857, + "grad_norm": 0.31739240884780884, + "learning_rate": 1.423104808738196e-07, + "loss": 0.1677, + "step": 50968 + }, + { + "epoch": 0.9462615521140044, + "grad_norm": 0.3869602084159851, + "learning_rate": 1.4211445404831214e-07, + "loss": 0.5141, + "step": 50970 + }, + { + "epoch": 0.946298682251423, + "grad_norm": 0.3388206958770752, + "learning_rate": 1.41918561359099e-07, + "loss": 0.2797, + "step": 50972 + }, + { + "epoch": 0.9463358123888417, + "grad_norm": 0.5334675312042236, + "learning_rate": 1.4172280280884576e-07, + "loss": 0.3169, + "step": 50974 + }, + { + "epoch": 0.9463729425262603, + "grad_norm": 0.361398845911026, + "learning_rate": 1.415271784002159e-07, + "loss": 0.2522, + "step": 50976 + }, + { + "epoch": 0.9464100726636789, + "grad_norm": 0.5255299806594849, + "learning_rate": 1.4133168813587173e-07, + "loss": 0.1057, + "step": 50978 + }, + { + "epoch": 0.9464472028010976, + "grad_norm": 0.42784133553504944, + "learning_rate": 1.4113633201847333e-07, + "loss": 0.5189, + "step": 50980 + }, + { + "epoch": 0.9464843329385162, + "grad_norm": 0.33455702662467957, + "learning_rate": 1.4094111005067856e-07, + "loss": 0.3069, + "step": 50982 + }, + { + "epoch": 0.9465214630759349, + "grad_norm": 0.5092288255691528, + "learning_rate": 1.4074602223514422e-07, + "loss": 0.3683, + "step": 50984 + }, + { + "epoch": 0.9465585932133534, + "grad_norm": 0.4560586214065552, + "learning_rate": 1.4055106857452483e-07, + "loss": 0.3923, + "step": 50986 + }, + { + "epoch": 0.9465957233507721, + "grad_norm": 0.4736967086791992, + "learning_rate": 1.403562490714716e-07, + "loss": 0.2505, + "step": 50988 + }, + { + "epoch": 0.9466328534881907, + "grad_norm": 0.564443826675415, + "learning_rate": 1.4016156372863798e-07, + "loss": 0.2412, + "step": 50990 + }, + { + "epoch": 0.9466699836256094, + "grad_norm": 0.4422684609889984, + "learning_rate": 1.3996701254867185e-07, + "loss": 0.395, + "step": 50992 + }, + { + "epoch": 0.9467071137630281, + "grad_norm": 0.4116092324256897, + "learning_rate": 1.3977259553422106e-07, + "loss": 0.3189, + "step": 50994 + }, + { + "epoch": 0.9467442439004466, + "grad_norm": 0.31543976068496704, + "learning_rate": 1.395783126879302e-07, + "loss": 0.4607, + "step": 50996 + }, + { + "epoch": 0.9467813740378653, + "grad_norm": 0.2742465138435364, + "learning_rate": 1.393841640124416e-07, + "loss": 0.2619, + "step": 50998 + }, + { + "epoch": 0.9468185041752839, + "grad_norm": 0.4801630973815918, + "learning_rate": 1.3919014951039977e-07, + "loss": 0.2777, + "step": 51000 + }, + { + "epoch": 0.9468556343127026, + "grad_norm": 0.398260235786438, + "learning_rate": 1.3899626918444265e-07, + "loss": 0.1127, + "step": 51002 + }, + { + "epoch": 0.9468927644501213, + "grad_norm": 0.5946736931800842, + "learning_rate": 1.388025230372103e-07, + "loss": 0.2116, + "step": 51004 + }, + { + "epoch": 0.9469298945875398, + "grad_norm": 0.5703896880149841, + "learning_rate": 1.3860891107133734e-07, + "loss": 0.3117, + "step": 51006 + }, + { + "epoch": 0.9469670247249585, + "grad_norm": 0.3947688341140747, + "learning_rate": 1.3841543328945828e-07, + "loss": 0.2346, + "step": 51008 + }, + { + "epoch": 0.9470041548623771, + "grad_norm": 0.30989542603492737, + "learning_rate": 1.382220896942077e-07, + "loss": 0.2145, + "step": 51010 + }, + { + "epoch": 0.9470412849997958, + "grad_norm": 1.6001261472702026, + "learning_rate": 1.3802888028821348e-07, + "loss": 0.3382, + "step": 51012 + }, + { + "epoch": 0.9470784151372145, + "grad_norm": 0.5119905471801758, + "learning_rate": 1.37835805074108e-07, + "loss": 0.2928, + "step": 51014 + }, + { + "epoch": 0.947115545274633, + "grad_norm": 0.2686319351196289, + "learning_rate": 1.3764286405451578e-07, + "loss": 0.1819, + "step": 51016 + }, + { + "epoch": 0.9471526754120517, + "grad_norm": 0.421956330537796, + "learning_rate": 1.3745005723206251e-07, + "loss": 0.2522, + "step": 51018 + }, + { + "epoch": 0.9471898055494703, + "grad_norm": 0.170281320810318, + "learning_rate": 1.3725738460937166e-07, + "loss": 0.1865, + "step": 51020 + }, + { + "epoch": 0.947226935686889, + "grad_norm": 0.48961326479911804, + "learning_rate": 1.370648461890667e-07, + "loss": 0.2206, + "step": 51022 + }, + { + "epoch": 0.9472640658243077, + "grad_norm": 0.22081826627254486, + "learning_rate": 1.3687244197376548e-07, + "loss": 0.248, + "step": 51024 + }, + { + "epoch": 0.9473011959617262, + "grad_norm": 0.26116743683815, + "learning_rate": 1.3668017196608707e-07, + "loss": 0.1911, + "step": 51026 + }, + { + "epoch": 0.9473383260991449, + "grad_norm": 0.4485781192779541, + "learning_rate": 1.3648803616864825e-07, + "loss": 0.2728, + "step": 51028 + }, + { + "epoch": 0.9473754562365635, + "grad_norm": 0.3974182903766632, + "learning_rate": 1.362960345840614e-07, + "loss": 0.1765, + "step": 51030 + }, + { + "epoch": 0.9474125863739822, + "grad_norm": 0.32474008202552795, + "learning_rate": 1.3610416721494103e-07, + "loss": 0.3082, + "step": 51032 + }, + { + "epoch": 0.9474497165114009, + "grad_norm": 0.4679239094257355, + "learning_rate": 1.3591243406389733e-07, + "loss": 0.2614, + "step": 51034 + }, + { + "epoch": 0.9474868466488194, + "grad_norm": 0.5621490478515625, + "learning_rate": 1.3572083513353928e-07, + "loss": 0.3231, + "step": 51036 + }, + { + "epoch": 0.9475239767862381, + "grad_norm": 0.6725314259529114, + "learning_rate": 1.3552937042647263e-07, + "loss": 0.2987, + "step": 51038 + }, + { + "epoch": 0.9475611069236567, + "grad_norm": 0.2390737235546112, + "learning_rate": 1.3533803994530415e-07, + "loss": 0.2607, + "step": 51040 + }, + { + "epoch": 0.9475982370610754, + "grad_norm": 0.32657134532928467, + "learning_rate": 1.351468436926362e-07, + "loss": 0.141, + "step": 51042 + }, + { + "epoch": 0.947635367198494, + "grad_norm": 0.37334662675857544, + "learning_rate": 1.3495578167107226e-07, + "loss": 0.3399, + "step": 51044 + }, + { + "epoch": 0.9476724973359126, + "grad_norm": 0.31984302401542664, + "learning_rate": 1.3476485388321136e-07, + "loss": 0.3492, + "step": 51046 + }, + { + "epoch": 0.9477096274733313, + "grad_norm": 0.3173534870147705, + "learning_rate": 1.3457406033164922e-07, + "loss": 0.3486, + "step": 51048 + }, + { + "epoch": 0.9477467576107499, + "grad_norm": 0.30752885341644287, + "learning_rate": 1.3438340101898483e-07, + "loss": 0.2684, + "step": 51050 + }, + { + "epoch": 0.9477838877481686, + "grad_norm": 0.4655018150806427, + "learning_rate": 1.341928759478106e-07, + "loss": 0.1938, + "step": 51052 + }, + { + "epoch": 0.9478210178855871, + "grad_norm": 0.40786081552505493, + "learning_rate": 1.3400248512072112e-07, + "loss": 0.442, + "step": 51054 + }, + { + "epoch": 0.9478581480230058, + "grad_norm": 0.6336261630058289, + "learning_rate": 1.338122285403054e-07, + "loss": 0.3988, + "step": 51056 + }, + { + "epoch": 0.9478952781604245, + "grad_norm": 0.32837778329849243, + "learning_rate": 1.336221062091514e-07, + "loss": 0.1753, + "step": 51058 + }, + { + "epoch": 0.9479324082978431, + "grad_norm": 0.5618468523025513, + "learning_rate": 1.3343211812984923e-07, + "loss": 0.325, + "step": 51060 + }, + { + "epoch": 0.9479695384352618, + "grad_norm": 0.3489518463611603, + "learning_rate": 1.3324226430498022e-07, + "loss": 0.1879, + "step": 51062 + }, + { + "epoch": 0.9480066685726803, + "grad_norm": 0.21018348634243011, + "learning_rate": 1.3305254473713003e-07, + "loss": 0.1693, + "step": 51064 + }, + { + "epoch": 0.948043798710099, + "grad_norm": 0.8977999091148376, + "learning_rate": 1.3286295942887994e-07, + "loss": 0.3582, + "step": 51066 + }, + { + "epoch": 0.9480809288475177, + "grad_norm": 0.3504578471183777, + "learning_rate": 1.326735083828101e-07, + "loss": 0.1833, + "step": 51068 + }, + { + "epoch": 0.9481180589849363, + "grad_norm": 0.7570350170135498, + "learning_rate": 1.3248419160149738e-07, + "loss": 0.2474, + "step": 51070 + }, + { + "epoch": 0.948155189122355, + "grad_norm": 0.482299268245697, + "learning_rate": 1.3229500908751858e-07, + "loss": 0.2272, + "step": 51072 + }, + { + "epoch": 0.9481923192597735, + "grad_norm": 0.5501895546913147, + "learning_rate": 1.3210596084344718e-07, + "loss": 0.2972, + "step": 51074 + }, + { + "epoch": 0.9482294493971922, + "grad_norm": 0.5036604404449463, + "learning_rate": 1.3191704687185558e-07, + "loss": 0.2989, + "step": 51076 + }, + { + "epoch": 0.9482665795346109, + "grad_norm": 0.36826324462890625, + "learning_rate": 1.3172826717531506e-07, + "loss": 0.2826, + "step": 51078 + }, + { + "epoch": 0.9483037096720295, + "grad_norm": 0.40228116512298584, + "learning_rate": 1.3153962175639357e-07, + "loss": 0.4322, + "step": 51080 + }, + { + "epoch": 0.9483408398094482, + "grad_norm": 0.6288641691207886, + "learning_rate": 1.3135111061765793e-07, + "loss": 0.3082, + "step": 51082 + }, + { + "epoch": 0.9483779699468667, + "grad_norm": 0.45098572969436646, + "learning_rate": 1.3116273376167498e-07, + "loss": 0.3693, + "step": 51084 + }, + { + "epoch": 0.9484151000842854, + "grad_norm": 0.32466498017311096, + "learning_rate": 1.3097449119100604e-07, + "loss": 0.4027, + "step": 51086 + }, + { + "epoch": 0.948452230221704, + "grad_norm": 0.471682071685791, + "learning_rate": 1.3078638290821234e-07, + "loss": 0.3085, + "step": 51088 + }, + { + "epoch": 0.9484893603591227, + "grad_norm": 0.3796114921569824, + "learning_rate": 1.3059840891585518e-07, + "loss": 0.1774, + "step": 51090 + }, + { + "epoch": 0.9485264904965414, + "grad_norm": 0.1632833182811737, + "learning_rate": 1.3041056921649142e-07, + "loss": 0.1883, + "step": 51092 + }, + { + "epoch": 0.9485636206339599, + "grad_norm": 0.48637375235557556, + "learning_rate": 1.3022286381267567e-07, + "loss": 0.3603, + "step": 51094 + }, + { + "epoch": 0.9486007507713786, + "grad_norm": 0.322948694229126, + "learning_rate": 1.300352927069648e-07, + "loss": 0.2058, + "step": 51096 + }, + { + "epoch": 0.9486378809087972, + "grad_norm": 0.4468685984611511, + "learning_rate": 1.2984785590190897e-07, + "loss": 0.2485, + "step": 51098 + }, + { + "epoch": 0.9486750110462159, + "grad_norm": 0.3279907703399658, + "learning_rate": 1.2966055340006057e-07, + "loss": 0.1749, + "step": 51100 + }, + { + "epoch": 0.9487121411836346, + "grad_norm": 0.4309535622596741, + "learning_rate": 1.2947338520396535e-07, + "loss": 0.2039, + "step": 51102 + }, + { + "epoch": 0.9487492713210531, + "grad_norm": 0.35730135440826416, + "learning_rate": 1.292863513161724e-07, + "loss": 0.4053, + "step": 51104 + }, + { + "epoch": 0.9487864014584718, + "grad_norm": 0.40997830033302307, + "learning_rate": 1.2909945173922522e-07, + "loss": 0.2538, + "step": 51106 + }, + { + "epoch": 0.9488235315958904, + "grad_norm": 0.39124903082847595, + "learning_rate": 1.2891268647566846e-07, + "loss": 0.2043, + "step": 51108 + }, + { + "epoch": 0.9488606617333091, + "grad_norm": 0.38807371258735657, + "learning_rate": 1.287260555280434e-07, + "loss": 0.1858, + "step": 51110 + }, + { + "epoch": 0.9488977918707278, + "grad_norm": 0.29328930377960205, + "learning_rate": 1.2853955889888804e-07, + "loss": 0.0799, + "step": 51112 + }, + { + "epoch": 0.9489349220081463, + "grad_norm": 0.3937098979949951, + "learning_rate": 1.283531965907403e-07, + "loss": 0.2762, + "step": 51114 + }, + { + "epoch": 0.948972052145565, + "grad_norm": 0.4479004144668579, + "learning_rate": 1.281669686061371e-07, + "loss": 0.2098, + "step": 51116 + }, + { + "epoch": 0.9490091822829836, + "grad_norm": 0.4203970432281494, + "learning_rate": 1.2798087494761192e-07, + "loss": 0.3501, + "step": 51118 + }, + { + "epoch": 0.9490463124204023, + "grad_norm": 0.3023132085800171, + "learning_rate": 1.2779491561769719e-07, + "loss": 0.2805, + "step": 51120 + }, + { + "epoch": 0.949083442557821, + "grad_norm": 0.4120701849460602, + "learning_rate": 1.2760909061892313e-07, + "loss": 0.334, + "step": 51122 + }, + { + "epoch": 0.9491205726952395, + "grad_norm": 0.38511180877685547, + "learning_rate": 1.2742339995381768e-07, + "loss": 0.3025, + "step": 51124 + }, + { + "epoch": 0.9491577028326582, + "grad_norm": 0.39255228638648987, + "learning_rate": 1.2723784362490777e-07, + "loss": 0.257, + "step": 51126 + }, + { + "epoch": 0.9491948329700768, + "grad_norm": 0.5008650422096252, + "learning_rate": 1.2705242163471798e-07, + "loss": 0.3305, + "step": 51128 + }, + { + "epoch": 0.9492319631074955, + "grad_norm": 0.3733157217502594, + "learning_rate": 1.268671339857719e-07, + "loss": 0.0745, + "step": 51130 + }, + { + "epoch": 0.9492690932449142, + "grad_norm": 0.40562862157821655, + "learning_rate": 1.266819806805908e-07, + "loss": 0.19, + "step": 51132 + }, + { + "epoch": 0.9493062233823327, + "grad_norm": 0.47175151109695435, + "learning_rate": 1.264969617216949e-07, + "loss": 0.2528, + "step": 51134 + }, + { + "epoch": 0.9493433535197514, + "grad_norm": 0.31468597054481506, + "learning_rate": 1.263120771116011e-07, + "loss": 0.1944, + "step": 51136 + }, + { + "epoch": 0.94938048365717, + "grad_norm": 0.584264874458313, + "learning_rate": 1.2612732685282291e-07, + "loss": 0.3335, + "step": 51138 + }, + { + "epoch": 0.9494176137945887, + "grad_norm": 0.6311646699905396, + "learning_rate": 1.2594271094787835e-07, + "loss": 0.3943, + "step": 51140 + }, + { + "epoch": 0.9494547439320072, + "grad_norm": 0.38294675946235657, + "learning_rate": 1.257582293992754e-07, + "loss": 0.3575, + "step": 51142 + }, + { + "epoch": 0.9494918740694259, + "grad_norm": 0.4584772288799286, + "learning_rate": 1.255738822095265e-07, + "loss": 0.3486, + "step": 51144 + }, + { + "epoch": 0.9495290042068446, + "grad_norm": 0.4281284213066101, + "learning_rate": 1.2538966938114072e-07, + "loss": 0.1817, + "step": 51146 + }, + { + "epoch": 0.9495661343442632, + "grad_norm": 0.3020859360694885, + "learning_rate": 1.2520559091662278e-07, + "loss": 0.2531, + "step": 51148 + }, + { + "epoch": 0.9496032644816819, + "grad_norm": 0.41157469153404236, + "learning_rate": 1.250216468184784e-07, + "loss": 0.2727, + "step": 51150 + }, + { + "epoch": 0.9496403946191004, + "grad_norm": 0.5780795216560364, + "learning_rate": 1.2483783708921116e-07, + "loss": 0.2293, + "step": 51152 + }, + { + "epoch": 0.9496775247565191, + "grad_norm": 0.2992190420627594, + "learning_rate": 1.2465416173132017e-07, + "loss": 0.21, + "step": 51154 + }, + { + "epoch": 0.9497146548939378, + "grad_norm": 0.4864193797111511, + "learning_rate": 1.2447062074730676e-07, + "loss": 0.3428, + "step": 51156 + }, + { + "epoch": 0.9497517850313564, + "grad_norm": 0.4156431555747986, + "learning_rate": 1.2428721413966672e-07, + "loss": 0.2909, + "step": 51158 + }, + { + "epoch": 0.9497889151687751, + "grad_norm": 0.4218223989009857, + "learning_rate": 1.2410394191089802e-07, + "loss": 0.2617, + "step": 51160 + }, + { + "epoch": 0.9498260453061936, + "grad_norm": 0.3566693663597107, + "learning_rate": 1.2392080406349316e-07, + "loss": 0.2557, + "step": 51162 + }, + { + "epoch": 0.9498631754436123, + "grad_norm": 0.3107702136039734, + "learning_rate": 1.2373780059994233e-07, + "loss": 0.282, + "step": 51164 + }, + { + "epoch": 0.949900305581031, + "grad_norm": 0.6074591279029846, + "learning_rate": 1.23554931522738e-07, + "loss": 0.3319, + "step": 51166 + }, + { + "epoch": 0.9499374357184496, + "grad_norm": 0.5217092633247375, + "learning_rate": 1.2337219683436708e-07, + "loss": 0.3936, + "step": 51168 + }, + { + "epoch": 0.9499745658558683, + "grad_norm": 0.39786359667778015, + "learning_rate": 1.2318959653731643e-07, + "loss": 0.2049, + "step": 51170 + }, + { + "epoch": 0.9500116959932868, + "grad_norm": 0.39279642701148987, + "learning_rate": 1.2300713063407076e-07, + "loss": 0.4446, + "step": 51172 + }, + { + "epoch": 0.9500488261307055, + "grad_norm": 0.48613661527633667, + "learning_rate": 1.2282479912711477e-07, + "loss": 0.3357, + "step": 51174 + }, + { + "epoch": 0.9500859562681242, + "grad_norm": 0.3647420108318329, + "learning_rate": 1.226426020189264e-07, + "loss": 0.13, + "step": 51176 + }, + { + "epoch": 0.9501230864055428, + "grad_norm": 0.3206399381160736, + "learning_rate": 1.2246053931198597e-07, + "loss": 0.1944, + "step": 51178 + }, + { + "epoch": 0.9501602165429615, + "grad_norm": 0.3007342517375946, + "learning_rate": 1.2227861100877147e-07, + "loss": 0.2947, + "step": 51180 + }, + { + "epoch": 0.95019734668038, + "grad_norm": 0.5943436622619629, + "learning_rate": 1.220968171117576e-07, + "loss": 0.25, + "step": 51182 + }, + { + "epoch": 0.9502344768177987, + "grad_norm": 0.6613900661468506, + "learning_rate": 1.2191515762341898e-07, + "loss": 0.2655, + "step": 51184 + }, + { + "epoch": 0.9502716069552174, + "grad_norm": 0.48037734627723694, + "learning_rate": 1.2173363254622594e-07, + "loss": 0.4709, + "step": 51186 + }, + { + "epoch": 0.950308737092636, + "grad_norm": 0.3712351620197296, + "learning_rate": 1.215522418826498e-07, + "loss": 0.2338, + "step": 51188 + }, + { + "epoch": 0.9503458672300547, + "grad_norm": 1.0520695447921753, + "learning_rate": 1.2137098563515748e-07, + "loss": 0.3828, + "step": 51190 + }, + { + "epoch": 0.9503829973674732, + "grad_norm": 0.49460604786872864, + "learning_rate": 1.21189863806217e-07, + "loss": 0.2952, + "step": 51192 + }, + { + "epoch": 0.9504201275048919, + "grad_norm": 0.24679668247699738, + "learning_rate": 1.2100887639829196e-07, + "loss": 0.3813, + "step": 51194 + }, + { + "epoch": 0.9504572576423105, + "grad_norm": 0.6260696053504944, + "learning_rate": 1.2082802341384482e-07, + "loss": 0.3651, + "step": 51196 + }, + { + "epoch": 0.9504943877797292, + "grad_norm": 0.4114972949028015, + "learning_rate": 1.2064730485533694e-07, + "loss": 0.4719, + "step": 51198 + }, + { + "epoch": 0.9505315179171479, + "grad_norm": 0.5454961061477661, + "learning_rate": 1.2046672072522636e-07, + "loss": 0.1759, + "step": 51200 + }, + { + "epoch": 0.9505686480545664, + "grad_norm": 0.2732129395008087, + "learning_rate": 1.2028627102597223e-07, + "loss": 0.211, + "step": 51202 + }, + { + "epoch": 0.9506057781919851, + "grad_norm": 0.4664188325405121, + "learning_rate": 1.2010595576002816e-07, + "loss": 0.3029, + "step": 51204 + }, + { + "epoch": 0.9506429083294037, + "grad_norm": 0.3517145812511444, + "learning_rate": 1.199257749298488e-07, + "loss": 0.1972, + "step": 51206 + }, + { + "epoch": 0.9506800384668224, + "grad_norm": 0.44931256771087646, + "learning_rate": 1.1974572853788446e-07, + "loss": 0.381, + "step": 51208 + }, + { + "epoch": 0.950717168604241, + "grad_norm": 0.422444224357605, + "learning_rate": 1.1956581658658762e-07, + "loss": 0.4643, + "step": 51210 + }, + { + "epoch": 0.9507542987416596, + "grad_norm": 0.4548596143722534, + "learning_rate": 1.1938603907840295e-07, + "loss": 0.2382, + "step": 51212 + }, + { + "epoch": 0.9507914288790783, + "grad_norm": 0.42188867926597595, + "learning_rate": 1.1920639601577855e-07, + "loss": 0.363, + "step": 51214 + }, + { + "epoch": 0.9508285590164969, + "grad_norm": 0.37047120928764343, + "learning_rate": 1.1902688740115909e-07, + "loss": 0.2885, + "step": 51216 + }, + { + "epoch": 0.9508656891539156, + "grad_norm": 0.46173036098480225, + "learning_rate": 1.1884751323698706e-07, + "loss": 0.1614, + "step": 51218 + }, + { + "epoch": 0.9509028192913342, + "grad_norm": 0.4396073818206787, + "learning_rate": 1.1866827352570277e-07, + "loss": 0.2405, + "step": 51220 + }, + { + "epoch": 0.9509399494287528, + "grad_norm": 0.5908403396606445, + "learning_rate": 1.1848916826974533e-07, + "loss": 0.3247, + "step": 51222 + }, + { + "epoch": 0.9509770795661715, + "grad_norm": 0.38285836577415466, + "learning_rate": 1.183101974715517e-07, + "loss": 0.309, + "step": 51224 + }, + { + "epoch": 0.9510142097035901, + "grad_norm": 0.31784898042678833, + "learning_rate": 1.1813136113355772e-07, + "loss": 0.1693, + "step": 51226 + }, + { + "epoch": 0.9510513398410088, + "grad_norm": 0.475458562374115, + "learning_rate": 1.1795265925819588e-07, + "loss": 0.215, + "step": 51228 + }, + { + "epoch": 0.9510884699784274, + "grad_norm": 0.3930922746658325, + "learning_rate": 1.1777409184789756e-07, + "loss": 0.239, + "step": 51230 + }, + { + "epoch": 0.951125600115846, + "grad_norm": 0.2587066888809204, + "learning_rate": 1.1759565890509305e-07, + "loss": 0.2819, + "step": 51232 + }, + { + "epoch": 0.9511627302532647, + "grad_norm": 0.38370686769485474, + "learning_rate": 1.1741736043221153e-07, + "loss": 0.3256, + "step": 51234 + }, + { + "epoch": 0.9511998603906833, + "grad_norm": 0.38192781805992126, + "learning_rate": 1.172391964316777e-07, + "loss": 0.2076, + "step": 51236 + }, + { + "epoch": 0.951236990528102, + "grad_norm": 0.3160693645477295, + "learning_rate": 1.1706116690591518e-07, + "loss": 0.2255, + "step": 51238 + }, + { + "epoch": 0.9512741206655205, + "grad_norm": 0.38616102933883667, + "learning_rate": 1.168832718573476e-07, + "loss": 0.1585, + "step": 51240 + }, + { + "epoch": 0.9513112508029392, + "grad_norm": 0.40396353602409363, + "learning_rate": 1.1670551128839636e-07, + "loss": 0.3848, + "step": 51242 + }, + { + "epoch": 0.9513483809403579, + "grad_norm": 0.4451180398464203, + "learning_rate": 1.165278852014784e-07, + "loss": 0.2376, + "step": 51244 + }, + { + "epoch": 0.9513855110777765, + "grad_norm": 0.5604163408279419, + "learning_rate": 1.1635039359901179e-07, + "loss": 0.2264, + "step": 51246 + }, + { + "epoch": 0.9514226412151952, + "grad_norm": 0.31771785020828247, + "learning_rate": 1.1617303648341127e-07, + "loss": 0.1552, + "step": 51248 + }, + { + "epoch": 0.9514597713526137, + "grad_norm": 0.6490215063095093, + "learning_rate": 1.1599581385709047e-07, + "loss": 0.4153, + "step": 51250 + }, + { + "epoch": 0.9514969014900324, + "grad_norm": 0.28555601835250854, + "learning_rate": 1.1581872572245967e-07, + "loss": 0.3914, + "step": 51252 + }, + { + "epoch": 0.9515340316274511, + "grad_norm": 0.3286740779876709, + "learning_rate": 1.1564177208193029e-07, + "loss": 0.2707, + "step": 51254 + }, + { + "epoch": 0.9515711617648697, + "grad_norm": 0.5443800687789917, + "learning_rate": 1.1546495293790927e-07, + "loss": 0.3459, + "step": 51256 + }, + { + "epoch": 0.9516082919022884, + "grad_norm": 0.5195097327232361, + "learning_rate": 1.1528826829280248e-07, + "loss": 0.3158, + "step": 51258 + }, + { + "epoch": 0.9516454220397069, + "grad_norm": 0.30374205112457275, + "learning_rate": 1.1511171814901356e-07, + "loss": 0.4355, + "step": 51260 + }, + { + "epoch": 0.9516825521771256, + "grad_norm": 0.32916373014450073, + "learning_rate": 1.1493530250894724e-07, + "loss": 0.1155, + "step": 51262 + }, + { + "epoch": 0.9517196823145443, + "grad_norm": 0.42171844840049744, + "learning_rate": 1.147590213750005e-07, + "loss": 0.2959, + "step": 51264 + }, + { + "epoch": 0.9517568124519629, + "grad_norm": 0.561106264591217, + "learning_rate": 1.1458287474957475e-07, + "loss": 0.4802, + "step": 51266 + }, + { + "epoch": 0.9517939425893815, + "grad_norm": 0.18155167996883392, + "learning_rate": 1.1440686263506584e-07, + "loss": 0.2088, + "step": 51268 + }, + { + "epoch": 0.9518310727268001, + "grad_norm": 0.4434301257133484, + "learning_rate": 1.1423098503386853e-07, + "loss": 0.3417, + "step": 51270 + }, + { + "epoch": 0.9518682028642188, + "grad_norm": 0.27022165060043335, + "learning_rate": 1.1405524194837536e-07, + "loss": 0.354, + "step": 51272 + }, + { + "epoch": 0.9519053330016375, + "grad_norm": 0.3725062608718872, + "learning_rate": 1.1387963338097996e-07, + "loss": 0.2444, + "step": 51274 + }, + { + "epoch": 0.9519424631390561, + "grad_norm": 0.4949437975883484, + "learning_rate": 1.1370415933407042e-07, + "loss": 0.1128, + "step": 51276 + }, + { + "epoch": 0.9519795932764747, + "grad_norm": 0.3151581287384033, + "learning_rate": 1.135288198100326e-07, + "loss": 0.3854, + "step": 51278 + }, + { + "epoch": 0.9520167234138933, + "grad_norm": 0.6056416034698486, + "learning_rate": 1.1335361481125573e-07, + "loss": 0.4175, + "step": 51280 + }, + { + "epoch": 0.952053853551312, + "grad_norm": 0.3668951988220215, + "learning_rate": 1.1317854434012121e-07, + "loss": 0.2336, + "step": 51282 + }, + { + "epoch": 0.9520909836887307, + "grad_norm": 0.43334779143333435, + "learning_rate": 1.130036083990127e-07, + "loss": 0.3276, + "step": 51284 + }, + { + "epoch": 0.9521281138261493, + "grad_norm": 0.47040700912475586, + "learning_rate": 1.1282880699031051e-07, + "loss": 0.2232, + "step": 51286 + }, + { + "epoch": 0.9521652439635679, + "grad_norm": 0.3533232808113098, + "learning_rate": 1.1265414011639275e-07, + "loss": 0.1173, + "step": 51288 + }, + { + "epoch": 0.9522023741009865, + "grad_norm": 0.43317684531211853, + "learning_rate": 1.1247960777963529e-07, + "loss": 0.364, + "step": 51290 + }, + { + "epoch": 0.9522395042384052, + "grad_norm": 0.27443671226501465, + "learning_rate": 1.1230520998241401e-07, + "loss": 0.2527, + "step": 51292 + }, + { + "epoch": 0.9522766343758238, + "grad_norm": 0.3441377282142639, + "learning_rate": 1.1213094672710256e-07, + "loss": 0.3496, + "step": 51294 + }, + { + "epoch": 0.9523137645132425, + "grad_norm": 0.614155113697052, + "learning_rate": 1.1195681801607016e-07, + "loss": 0.2428, + "step": 51296 + }, + { + "epoch": 0.9523508946506611, + "grad_norm": 0.48285776376724243, + "learning_rate": 1.1178282385168826e-07, + "loss": 0.4336, + "step": 51298 + }, + { + "epoch": 0.9523880247880797, + "grad_norm": 0.3487530052661896, + "learning_rate": 1.1160896423632384e-07, + "loss": 0.2833, + "step": 51300 + }, + { + "epoch": 0.9524251549254984, + "grad_norm": 0.4644484221935272, + "learning_rate": 1.1143523917234056e-07, + "loss": 0.2472, + "step": 51302 + }, + { + "epoch": 0.952462285062917, + "grad_norm": 0.452174574136734, + "learning_rate": 1.1126164866210542e-07, + "loss": 0.1775, + "step": 51304 + }, + { + "epoch": 0.9524994152003357, + "grad_norm": 0.36379164457321167, + "learning_rate": 1.1108819270797877e-07, + "loss": 0.2419, + "step": 51306 + }, + { + "epoch": 0.9525365453377543, + "grad_norm": 0.42875194549560547, + "learning_rate": 1.1091487131232093e-07, + "loss": 0.4057, + "step": 51308 + }, + { + "epoch": 0.9525736754751729, + "grad_norm": 0.3376370072364807, + "learning_rate": 1.1074168447749111e-07, + "loss": 0.1452, + "step": 51310 + }, + { + "epoch": 0.9526108056125916, + "grad_norm": 0.39539510011672974, + "learning_rate": 1.1056863220584412e-07, + "loss": 0.3155, + "step": 51312 + }, + { + "epoch": 0.9526479357500102, + "grad_norm": 0.31804022192955017, + "learning_rate": 1.1039571449973696e-07, + "loss": 0.2814, + "step": 51314 + }, + { + "epoch": 0.9526850658874289, + "grad_norm": 0.5603212118148804, + "learning_rate": 1.1022293136151996e-07, + "loss": 0.439, + "step": 51316 + }, + { + "epoch": 0.9527221960248475, + "grad_norm": 0.6551733613014221, + "learning_rate": 1.1005028279354568e-07, + "loss": 0.2828, + "step": 51318 + }, + { + "epoch": 0.9527593261622661, + "grad_norm": 0.9215043783187866, + "learning_rate": 1.0987776879816447e-07, + "loss": 0.198, + "step": 51320 + }, + { + "epoch": 0.9527964562996848, + "grad_norm": 0.5769176483154297, + "learning_rate": 1.0970538937772113e-07, + "loss": 0.2796, + "step": 51322 + }, + { + "epoch": 0.9528335864371034, + "grad_norm": 0.19632776081562042, + "learning_rate": 1.0953314453456376e-07, + "loss": 0.3823, + "step": 51324 + }, + { + "epoch": 0.952870716574522, + "grad_norm": 0.5432040095329285, + "learning_rate": 1.0936103427103384e-07, + "loss": 0.4746, + "step": 51326 + }, + { + "epoch": 0.9529078467119407, + "grad_norm": 0.4054275155067444, + "learning_rate": 1.0918905858947504e-07, + "loss": 0.4682, + "step": 51328 + }, + { + "epoch": 0.9529449768493593, + "grad_norm": 0.44310373067855835, + "learning_rate": 1.0901721749222772e-07, + "loss": 0.1581, + "step": 51330 + }, + { + "epoch": 0.952982106986778, + "grad_norm": 0.2824499309062958, + "learning_rate": 1.0884551098162777e-07, + "loss": 0.0859, + "step": 51332 + }, + { + "epoch": 0.9530192371241966, + "grad_norm": 0.2441079169511795, + "learning_rate": 1.0867393906001222e-07, + "loss": 0.3679, + "step": 51334 + }, + { + "epoch": 0.9530563672616152, + "grad_norm": 0.4055043160915375, + "learning_rate": 1.085025017297181e-07, + "loss": 0.2128, + "step": 51336 + }, + { + "epoch": 0.9530934973990339, + "grad_norm": 0.2868565320968628, + "learning_rate": 1.0833119899307576e-07, + "loss": 0.3097, + "step": 51338 + }, + { + "epoch": 0.9531306275364525, + "grad_norm": 0.6610684990882874, + "learning_rate": 1.0816003085241666e-07, + "loss": 0.2934, + "step": 51340 + }, + { + "epoch": 0.9531677576738712, + "grad_norm": 0.3810104727745056, + "learning_rate": 1.0798899731007006e-07, + "loss": 0.269, + "step": 51342 + }, + { + "epoch": 0.9532048878112898, + "grad_norm": 0.288468599319458, + "learning_rate": 1.0781809836836299e-07, + "loss": 0.3354, + "step": 51344 + }, + { + "epoch": 0.9532420179487084, + "grad_norm": 0.5050945281982422, + "learning_rate": 1.0764733402962136e-07, + "loss": 0.3589, + "step": 51346 + }, + { + "epoch": 0.953279148086127, + "grad_norm": 0.5115669965744019, + "learning_rate": 1.0747670429616886e-07, + "loss": 0.4395, + "step": 51348 + }, + { + "epoch": 0.9533162782235457, + "grad_norm": 0.5838960409164429, + "learning_rate": 1.0730620917032697e-07, + "loss": 0.3846, + "step": 51350 + }, + { + "epoch": 0.9533534083609644, + "grad_norm": 0.34467118978500366, + "learning_rate": 1.0713584865441495e-07, + "loss": 0.1637, + "step": 51352 + }, + { + "epoch": 0.953390538498383, + "grad_norm": 0.3816034197807312, + "learning_rate": 1.0696562275075206e-07, + "loss": 0.3864, + "step": 51354 + }, + { + "epoch": 0.9534276686358016, + "grad_norm": 0.4820430874824524, + "learning_rate": 1.0679553146165312e-07, + "loss": 0.3548, + "step": 51356 + }, + { + "epoch": 0.9534647987732202, + "grad_norm": 0.3864564001560211, + "learning_rate": 1.0662557478943403e-07, + "loss": 0.1949, + "step": 51358 + }, + { + "epoch": 0.9535019289106389, + "grad_norm": 0.4808722734451294, + "learning_rate": 1.0645575273640629e-07, + "loss": 0.1143, + "step": 51360 + }, + { + "epoch": 0.9535390590480576, + "grad_norm": 0.38032978773117065, + "learning_rate": 1.062860653048825e-07, + "loss": 0.2929, + "step": 51362 + }, + { + "epoch": 0.9535761891854762, + "grad_norm": 0.2155487835407257, + "learning_rate": 1.061165124971686e-07, + "loss": 0.1088, + "step": 51364 + }, + { + "epoch": 0.9536133193228948, + "grad_norm": 0.36278069019317627, + "learning_rate": 1.0594709431557382e-07, + "loss": 0.2193, + "step": 51366 + }, + { + "epoch": 0.9536504494603134, + "grad_norm": 0.49232059717178345, + "learning_rate": 1.0577781076240301e-07, + "loss": 0.1183, + "step": 51368 + }, + { + "epoch": 0.9536875795977321, + "grad_norm": 0.3859732151031494, + "learning_rate": 1.0560866183995988e-07, + "loss": 0.1119, + "step": 51370 + }, + { + "epoch": 0.9537247097351508, + "grad_norm": 0.5421310663223267, + "learning_rate": 1.0543964755054593e-07, + "loss": 0.2101, + "step": 51372 + }, + { + "epoch": 0.9537618398725694, + "grad_norm": 0.33848968148231506, + "learning_rate": 1.052707678964604e-07, + "loss": 0.1709, + "step": 51374 + }, + { + "epoch": 0.953798970009988, + "grad_norm": 0.4361926317214966, + "learning_rate": 1.0510202288000259e-07, + "loss": 0.1234, + "step": 51376 + }, + { + "epoch": 0.9538361001474066, + "grad_norm": 0.32363268733024597, + "learning_rate": 1.0493341250346622e-07, + "loss": 0.2823, + "step": 51378 + }, + { + "epoch": 0.9538732302848253, + "grad_norm": 0.2282724678516388, + "learning_rate": 1.0476493676914723e-07, + "loss": 0.2135, + "step": 51380 + }, + { + "epoch": 0.953910360422244, + "grad_norm": 0.20674701035022736, + "learning_rate": 1.045965956793371e-07, + "loss": 0.2888, + "step": 51382 + }, + { + "epoch": 0.9539474905596625, + "grad_norm": 0.21321214735507965, + "learning_rate": 1.0442838923632847e-07, + "loss": 0.2131, + "step": 51384 + }, + { + "epoch": 0.9539846206970812, + "grad_norm": 0.3474956154823303, + "learning_rate": 1.0426031744240727e-07, + "loss": 0.2003, + "step": 51386 + }, + { + "epoch": 0.9540217508344998, + "grad_norm": 0.31653910875320435, + "learning_rate": 1.0409238029986391e-07, + "loss": 0.3052, + "step": 51388 + }, + { + "epoch": 0.9540588809719185, + "grad_norm": 0.49861517548561096, + "learning_rate": 1.0392457781097986e-07, + "loss": 0.1798, + "step": 51390 + }, + { + "epoch": 0.9540960111093371, + "grad_norm": 0.34396955370903015, + "learning_rate": 1.037569099780411e-07, + "loss": 0.1365, + "step": 51392 + }, + { + "epoch": 0.9541331412467557, + "grad_norm": 0.38625603914260864, + "learning_rate": 1.0358937680332804e-07, + "loss": 0.2548, + "step": 51394 + }, + { + "epoch": 0.9541702713841744, + "grad_norm": 0.37313809990882874, + "learning_rate": 1.0342197828912104e-07, + "loss": 0.3021, + "step": 51396 + }, + { + "epoch": 0.954207401521593, + "grad_norm": 0.5972605347633362, + "learning_rate": 1.0325471443769609e-07, + "loss": 0.1474, + "step": 51398 + }, + { + "epoch": 0.9542445316590117, + "grad_norm": 0.4858228862285614, + "learning_rate": 1.0308758525133022e-07, + "loss": 0.2084, + "step": 51400 + }, + { + "epoch": 0.9542816617964303, + "grad_norm": 0.40986567735671997, + "learning_rate": 1.0292059073229832e-07, + "loss": 0.2341, + "step": 51402 + }, + { + "epoch": 0.9543187919338489, + "grad_norm": 0.3499915599822998, + "learning_rate": 1.0275373088287189e-07, + "loss": 0.2386, + "step": 51404 + }, + { + "epoch": 0.9543559220712676, + "grad_norm": 0.3555004298686981, + "learning_rate": 1.0258700570532131e-07, + "loss": 0.3372, + "step": 51406 + }, + { + "epoch": 0.9543930522086862, + "grad_norm": 0.3637676537036896, + "learning_rate": 1.0242041520191481e-07, + "loss": 0.3126, + "step": 51408 + }, + { + "epoch": 0.9544301823461049, + "grad_norm": 0.32187241315841675, + "learning_rate": 1.0225395937492056e-07, + "loss": 0.3867, + "step": 51410 + }, + { + "epoch": 0.9544673124835235, + "grad_norm": 0.4091941714286804, + "learning_rate": 1.0208763822660228e-07, + "loss": 0.241, + "step": 51412 + }, + { + "epoch": 0.9545044426209421, + "grad_norm": 0.6092678308486938, + "learning_rate": 1.0192145175922374e-07, + "loss": 0.3584, + "step": 51414 + }, + { + "epoch": 0.9545415727583608, + "grad_norm": 0.5188865661621094, + "learning_rate": 1.0175539997504647e-07, + "loss": 0.2723, + "step": 51416 + }, + { + "epoch": 0.9545787028957794, + "grad_norm": 0.44235536456108093, + "learning_rate": 1.0158948287632975e-07, + "loss": 0.2762, + "step": 51418 + }, + { + "epoch": 0.9546158330331981, + "grad_norm": 0.36820608377456665, + "learning_rate": 1.0142370046533067e-07, + "loss": 0.1337, + "step": 51420 + }, + { + "epoch": 0.9546529631706167, + "grad_norm": 0.3130168318748474, + "learning_rate": 1.0125805274430411e-07, + "loss": 0.2306, + "step": 51422 + }, + { + "epoch": 0.9546900933080353, + "grad_norm": 0.502172589302063, + "learning_rate": 1.0109253971550603e-07, + "loss": 0.3917, + "step": 51424 + }, + { + "epoch": 0.954727223445454, + "grad_norm": 0.48643258213996887, + "learning_rate": 1.0092716138118908e-07, + "loss": 0.5205, + "step": 51426 + }, + { + "epoch": 0.9547643535828726, + "grad_norm": 0.5207310318946838, + "learning_rate": 1.0076191774360144e-07, + "loss": 0.2222, + "step": 51428 + }, + { + "epoch": 0.9548014837202913, + "grad_norm": 0.5100780725479126, + "learning_rate": 1.0059680880499245e-07, + "loss": 0.1614, + "step": 51430 + }, + { + "epoch": 0.9548386138577099, + "grad_norm": 0.4287956655025482, + "learning_rate": 1.0043183456760808e-07, + "loss": 0.3084, + "step": 51432 + }, + { + "epoch": 0.9548757439951285, + "grad_norm": 0.27186858654022217, + "learning_rate": 1.002669950336943e-07, + "loss": 0.466, + "step": 51434 + }, + { + "epoch": 0.9549128741325472, + "grad_norm": 0.4396379590034485, + "learning_rate": 1.0010229020549378e-07, + "loss": 0.5569, + "step": 51436 + }, + { + "epoch": 0.9549500042699658, + "grad_norm": 0.34844908118247986, + "learning_rate": 9.993772008524805e-08, + "loss": 0.1378, + "step": 51438 + }, + { + "epoch": 0.9549871344073845, + "grad_norm": 0.32082870602607727, + "learning_rate": 9.977328467519532e-08, + "loss": 0.2846, + "step": 51440 + }, + { + "epoch": 0.955024264544803, + "grad_norm": 0.4624224007129669, + "learning_rate": 9.960898397757268e-08, + "loss": 0.4315, + "step": 51442 + }, + { + "epoch": 0.9550613946822217, + "grad_norm": 0.1708763688802719, + "learning_rate": 9.944481799461725e-08, + "loss": 0.2394, + "step": 51444 + }, + { + "epoch": 0.9550985248196403, + "grad_norm": 0.28764525055885315, + "learning_rate": 9.928078672856278e-08, + "loss": 0.2519, + "step": 51446 + }, + { + "epoch": 0.955135654957059, + "grad_norm": 0.3690250813961029, + "learning_rate": 9.911689018163973e-08, + "loss": 0.2175, + "step": 51448 + }, + { + "epoch": 0.9551727850944777, + "grad_norm": 0.27418121695518494, + "learning_rate": 9.895312835608073e-08, + "loss": 0.2164, + "step": 51450 + }, + { + "epoch": 0.9552099152318962, + "grad_norm": 0.47681599855422974, + "learning_rate": 9.878950125411068e-08, + "loss": 0.142, + "step": 51452 + }, + { + "epoch": 0.9552470453693149, + "grad_norm": 0.37893256545066833, + "learning_rate": 9.862600887795893e-08, + "loss": 0.2912, + "step": 51454 + }, + { + "epoch": 0.9552841755067335, + "grad_norm": 0.5107290744781494, + "learning_rate": 9.846265122984921e-08, + "loss": 0.3965, + "step": 51456 + }, + { + "epoch": 0.9553213056441522, + "grad_norm": 0.2970569133758545, + "learning_rate": 9.829942831200423e-08, + "loss": 0.1353, + "step": 51458 + }, + { + "epoch": 0.9553584357815709, + "grad_norm": 0.3862713575363159, + "learning_rate": 9.813634012664552e-08, + "loss": 0.3067, + "step": 51460 + }, + { + "epoch": 0.9553955659189894, + "grad_norm": 0.44956332445144653, + "learning_rate": 9.797338667599132e-08, + "loss": 0.1933, + "step": 51462 + }, + { + "epoch": 0.9554326960564081, + "grad_norm": 0.2788529396057129, + "learning_rate": 9.781056796225874e-08, + "loss": 0.1533, + "step": 51464 + }, + { + "epoch": 0.9554698261938267, + "grad_norm": 0.5427239537239075, + "learning_rate": 9.764788398766489e-08, + "loss": 0.3225, + "step": 51466 + }, + { + "epoch": 0.9555069563312454, + "grad_norm": 0.42808350920677185, + "learning_rate": 9.748533475442135e-08, + "loss": 0.278, + "step": 51468 + }, + { + "epoch": 0.9555440864686641, + "grad_norm": 0.17554649710655212, + "learning_rate": 9.732292026474077e-08, + "loss": 0.0411, + "step": 51470 + }, + { + "epoch": 0.9555812166060826, + "grad_norm": 0.4540509879589081, + "learning_rate": 9.716064052083362e-08, + "loss": 0.373, + "step": 51472 + }, + { + "epoch": 0.9556183467435013, + "grad_norm": 1.3159315586090088, + "learning_rate": 9.699849552490815e-08, + "loss": 0.4413, + "step": 51474 + }, + { + "epoch": 0.9556554768809199, + "grad_norm": 0.3383181393146515, + "learning_rate": 9.683648527917034e-08, + "loss": 0.2005, + "step": 51476 + }, + { + "epoch": 0.9556926070183386, + "grad_norm": 0.4478803873062134, + "learning_rate": 9.667460978582399e-08, + "loss": 0.2232, + "step": 51478 + }, + { + "epoch": 0.9557297371557573, + "grad_norm": 0.36779308319091797, + "learning_rate": 9.651286904707292e-08, + "loss": 0.2742, + "step": 51480 + }, + { + "epoch": 0.9557668672931758, + "grad_norm": 0.44625532627105713, + "learning_rate": 9.635126306511755e-08, + "loss": 0.4092, + "step": 51482 + }, + { + "epoch": 0.9558039974305945, + "grad_norm": 0.14745530486106873, + "learning_rate": 9.618979184215504e-08, + "loss": 0.0803, + "step": 51484 + }, + { + "epoch": 0.9558411275680131, + "grad_norm": 0.5856987237930298, + "learning_rate": 9.602845538038585e-08, + "loss": 0.2287, + "step": 51486 + }, + { + "epoch": 0.9558782577054318, + "grad_norm": 0.554949164390564, + "learning_rate": 9.586725368200267e-08, + "loss": 0.2197, + "step": 51488 + }, + { + "epoch": 0.9559153878428505, + "grad_norm": 0.2984238266944885, + "learning_rate": 9.57061867492004e-08, + "loss": 0.3928, + "step": 51490 + }, + { + "epoch": 0.955952517980269, + "grad_norm": 0.6525359749794006, + "learning_rate": 9.554525458416953e-08, + "loss": 0.1239, + "step": 51492 + }, + { + "epoch": 0.9559896481176877, + "grad_norm": 0.2128794640302658, + "learning_rate": 9.53844571891005e-08, + "loss": 0.1859, + "step": 51494 + }, + { + "epoch": 0.9560267782551063, + "grad_norm": 0.36496207118034363, + "learning_rate": 9.522379456618047e-08, + "loss": 0.1817, + "step": 51496 + }, + { + "epoch": 0.956063908392525, + "grad_norm": 0.5023206472396851, + "learning_rate": 9.50632667175988e-08, + "loss": 0.3518, + "step": 51498 + }, + { + "epoch": 0.9561010385299435, + "grad_norm": 0.3819437623023987, + "learning_rate": 9.490287364553485e-08, + "loss": 0.1771, + "step": 51500 + }, + { + "epoch": 0.9561381686673622, + "grad_norm": 0.40997087955474854, + "learning_rate": 9.474261535217577e-08, + "loss": 0.2268, + "step": 51502 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.43543556332588196, + "learning_rate": 9.45824918396987e-08, + "loss": 0.3131, + "step": 51504 + }, + { + "epoch": 0.9562124289421995, + "grad_norm": 0.48614105582237244, + "learning_rate": 9.442250311028301e-08, + "loss": 0.2235, + "step": 51506 + }, + { + "epoch": 0.9562495590796182, + "grad_norm": 0.6929818391799927, + "learning_rate": 9.426264916610806e-08, + "loss": 0.4692, + "step": 51508 + }, + { + "epoch": 0.9562866892170367, + "grad_norm": 0.2409350425004959, + "learning_rate": 9.410293000934545e-08, + "loss": 0.1772, + "step": 51510 + }, + { + "epoch": 0.9563238193544554, + "grad_norm": 0.22209814190864563, + "learning_rate": 9.394334564217233e-08, + "loss": 0.2402, + "step": 51512 + }, + { + "epoch": 0.9563609494918741, + "grad_norm": 0.2895338535308838, + "learning_rate": 9.378389606675698e-08, + "loss": 0.1887, + "step": 51514 + }, + { + "epoch": 0.9563980796292927, + "grad_norm": 0.546881377696991, + "learning_rate": 9.362458128527096e-08, + "loss": 0.1819, + "step": 51516 + }, + { + "epoch": 0.9564352097667114, + "grad_norm": 0.36732521653175354, + "learning_rate": 9.346540129988035e-08, + "loss": 0.2193, + "step": 51518 + }, + { + "epoch": 0.9564723399041299, + "grad_norm": 0.3495822846889496, + "learning_rate": 9.330635611275341e-08, + "loss": 0.3648, + "step": 51520 + }, + { + "epoch": 0.9565094700415486, + "grad_norm": 0.3063301146030426, + "learning_rate": 9.314744572605283e-08, + "loss": 0.1641, + "step": 51522 + }, + { + "epoch": 0.9565466001789673, + "grad_norm": 0.40818724036216736, + "learning_rate": 9.298867014194024e-08, + "loss": 0.268, + "step": 51524 + }, + { + "epoch": 0.9565837303163859, + "grad_norm": 0.339240700006485, + "learning_rate": 9.283002936257723e-08, + "loss": 0.2768, + "step": 51526 + }, + { + "epoch": 0.9566208604538046, + "grad_norm": 0.45224931836128235, + "learning_rate": 9.267152339012208e-08, + "loss": 0.3243, + "step": 51528 + }, + { + "epoch": 0.9566579905912231, + "grad_norm": 0.501471757888794, + "learning_rate": 9.251315222673196e-08, + "loss": 0.3255, + "step": 51530 + }, + { + "epoch": 0.9566951207286418, + "grad_norm": 0.2901224195957184, + "learning_rate": 9.235491587456069e-08, + "loss": 0.3532, + "step": 51532 + }, + { + "epoch": 0.9567322508660605, + "grad_norm": 0.37131261825561523, + "learning_rate": 9.219681433576321e-08, + "loss": 0.3123, + "step": 51534 + }, + { + "epoch": 0.9567693810034791, + "grad_norm": 0.3656637966632843, + "learning_rate": 9.203884761248894e-08, + "loss": 0.2702, + "step": 51536 + }, + { + "epoch": 0.9568065111408978, + "grad_norm": 0.34161725640296936, + "learning_rate": 9.188101570688834e-08, + "loss": 0.333, + "step": 51538 + }, + { + "epoch": 0.9568436412783163, + "grad_norm": 0.29506412148475647, + "learning_rate": 9.172331862110972e-08, + "loss": 0.3374, + "step": 51540 + }, + { + "epoch": 0.956880771415735, + "grad_norm": 0.4480403661727905, + "learning_rate": 9.156575635729692e-08, + "loss": 0.3452, + "step": 51542 + }, + { + "epoch": 0.9569179015531536, + "grad_norm": 0.2707580029964447, + "learning_rate": 9.140832891759599e-08, + "loss": 0.2365, + "step": 51544 + }, + { + "epoch": 0.9569550316905723, + "grad_norm": 0.34321004152297974, + "learning_rate": 9.125103630414744e-08, + "loss": 0.2322, + "step": 51546 + }, + { + "epoch": 0.956992161827991, + "grad_norm": 0.4533778131008148, + "learning_rate": 9.10938785190918e-08, + "loss": 0.3507, + "step": 51548 + }, + { + "epoch": 0.9570292919654095, + "grad_norm": 0.5854126811027527, + "learning_rate": 9.093685556456733e-08, + "loss": 0.2008, + "step": 51550 + }, + { + "epoch": 0.9570664221028282, + "grad_norm": 0.5888938307762146, + "learning_rate": 9.077996744271345e-08, + "loss": 0.3178, + "step": 51552 + }, + { + "epoch": 0.9571035522402468, + "grad_norm": 0.3842276334762573, + "learning_rate": 9.062321415566066e-08, + "loss": 0.3804, + "step": 51554 + }, + { + "epoch": 0.9571406823776655, + "grad_norm": 0.35984310507774353, + "learning_rate": 9.046659570554395e-08, + "loss": 0.3131, + "step": 51556 + }, + { + "epoch": 0.9571778125150842, + "grad_norm": 0.4122650921344757, + "learning_rate": 9.031011209449492e-08, + "loss": 0.2193, + "step": 51558 + }, + { + "epoch": 0.9572149426525027, + "grad_norm": 0.545009970664978, + "learning_rate": 9.015376332464298e-08, + "loss": 0.4237, + "step": 51560 + }, + { + "epoch": 0.9572520727899214, + "grad_norm": 0.39067867398262024, + "learning_rate": 8.999754939811422e-08, + "loss": 0.3222, + "step": 51562 + }, + { + "epoch": 0.95728920292734, + "grad_norm": 1.3999227285385132, + "learning_rate": 8.984147031703472e-08, + "loss": 0.3103, + "step": 51564 + }, + { + "epoch": 0.9573263330647587, + "grad_norm": 0.3567414879798889, + "learning_rate": 8.968552608352943e-08, + "loss": 0.2156, + "step": 51566 + }, + { + "epoch": 0.9573634632021774, + "grad_norm": 0.47114434838294983, + "learning_rate": 8.952971669971777e-08, + "loss": 0.3099, + "step": 51568 + }, + { + "epoch": 0.9574005933395959, + "grad_norm": 0.38934656977653503, + "learning_rate": 8.93740421677225e-08, + "loss": 0.2393, + "step": 51570 + }, + { + "epoch": 0.9574377234770146, + "grad_norm": 0.4058121144771576, + "learning_rate": 8.921850248965968e-08, + "loss": 0.2452, + "step": 51572 + }, + { + "epoch": 0.9574748536144332, + "grad_norm": 0.5005676746368408, + "learning_rate": 8.906309766764765e-08, + "loss": 0.3649, + "step": 51574 + }, + { + "epoch": 0.9575119837518519, + "grad_norm": 0.48660942912101746, + "learning_rate": 8.890782770380024e-08, + "loss": 0.0588, + "step": 51576 + }, + { + "epoch": 0.9575491138892706, + "grad_norm": 0.6719896793365479, + "learning_rate": 8.875269260023023e-08, + "loss": 0.3653, + "step": 51578 + }, + { + "epoch": 0.9575862440266891, + "grad_norm": 0.39558500051498413, + "learning_rate": 8.859769235904814e-08, + "loss": 0.3686, + "step": 51580 + }, + { + "epoch": 0.9576233741641078, + "grad_norm": 0.5397294759750366, + "learning_rate": 8.84428269823634e-08, + "loss": 0.264, + "step": 51582 + }, + { + "epoch": 0.9576605043015264, + "grad_norm": 0.3553774952888489, + "learning_rate": 8.82880964722832e-08, + "loss": 0.2656, + "step": 51584 + }, + { + "epoch": 0.9576976344389451, + "grad_norm": 0.34070703387260437, + "learning_rate": 8.813350083091255e-08, + "loss": 0.2815, + "step": 51586 + }, + { + "epoch": 0.9577347645763638, + "grad_norm": 0.23878291249275208, + "learning_rate": 8.79790400603564e-08, + "loss": 0.2088, + "step": 51588 + }, + { + "epoch": 0.9577718947137823, + "grad_norm": 0.3160630464553833, + "learning_rate": 8.782471416271532e-08, + "loss": 0.2797, + "step": 51590 + }, + { + "epoch": 0.957809024851201, + "grad_norm": 0.2325183004140854, + "learning_rate": 8.767052314008872e-08, + "loss": 0.3662, + "step": 51592 + }, + { + "epoch": 0.9578461549886196, + "grad_norm": 0.546848475933075, + "learning_rate": 8.751646699457605e-08, + "loss": 0.1907, + "step": 51594 + }, + { + "epoch": 0.9578832851260383, + "grad_norm": 0.3785562813282013, + "learning_rate": 8.736254572827341e-08, + "loss": 0.2578, + "step": 51596 + }, + { + "epoch": 0.9579204152634568, + "grad_norm": 0.5302824378013611, + "learning_rate": 8.720875934327356e-08, + "loss": 0.2715, + "step": 51598 + }, + { + "epoch": 0.9579575454008755, + "grad_norm": 0.3110813498497009, + "learning_rate": 8.705510784167037e-08, + "loss": 0.0403, + "step": 51600 + }, + { + "epoch": 0.9579946755382942, + "grad_norm": 0.6449050307273865, + "learning_rate": 8.690159122555552e-08, + "loss": 0.3297, + "step": 51602 + }, + { + "epoch": 0.9580318056757128, + "grad_norm": 0.38409891724586487, + "learning_rate": 8.674820949701623e-08, + "loss": 0.4097, + "step": 51604 + }, + { + "epoch": 0.9580689358131315, + "grad_norm": 0.25078266859054565, + "learning_rate": 8.659496265814082e-08, + "loss": 0.2994, + "step": 51606 + }, + { + "epoch": 0.95810606595055, + "grad_norm": 0.39670518040657043, + "learning_rate": 8.644185071101319e-08, + "loss": 0.2524, + "step": 51608 + }, + { + "epoch": 0.9581431960879687, + "grad_norm": 0.4671781063079834, + "learning_rate": 8.628887365771721e-08, + "loss": 0.2871, + "step": 51610 + }, + { + "epoch": 0.9581803262253874, + "grad_norm": 0.5040252804756165, + "learning_rate": 8.613603150033567e-08, + "loss": 0.4066, + "step": 51612 + }, + { + "epoch": 0.958217456362806, + "grad_norm": 0.41721487045288086, + "learning_rate": 8.59833242409469e-08, + "loss": 0.296, + "step": 51614 + }, + { + "epoch": 0.9582545865002247, + "grad_norm": 0.3969128429889679, + "learning_rate": 8.583075188162815e-08, + "loss": 0.1765, + "step": 51616 + }, + { + "epoch": 0.9582917166376432, + "grad_norm": 0.4134426414966583, + "learning_rate": 8.567831442445773e-08, + "loss": 0.2495, + "step": 51618 + }, + { + "epoch": 0.9583288467750619, + "grad_norm": 0.4129183292388916, + "learning_rate": 8.552601187150734e-08, + "loss": 0.4507, + "step": 51620 + }, + { + "epoch": 0.9583659769124806, + "grad_norm": 0.38789424300193787, + "learning_rate": 8.537384422485195e-08, + "loss": 0.2077, + "step": 51622 + }, + { + "epoch": 0.9584031070498992, + "grad_norm": 0.2857956886291504, + "learning_rate": 8.522181148655994e-08, + "loss": 0.2961, + "step": 51624 + }, + { + "epoch": 0.9584402371873179, + "grad_norm": 0.4785972237586975, + "learning_rate": 8.506991365870188e-08, + "loss": 0.4389, + "step": 51626 + }, + { + "epoch": 0.9584773673247364, + "grad_norm": 0.5297247171401978, + "learning_rate": 8.491815074334276e-08, + "loss": 0.2955, + "step": 51628 + }, + { + "epoch": 0.9585144974621551, + "grad_norm": 0.4262286126613617, + "learning_rate": 8.47665227425487e-08, + "loss": 0.202, + "step": 51630 + }, + { + "epoch": 0.9585516275995738, + "grad_norm": 0.34194982051849365, + "learning_rate": 8.461502965838253e-08, + "loss": 0.0884, + "step": 51632 + }, + { + "epoch": 0.9585887577369924, + "grad_norm": 0.2540087103843689, + "learning_rate": 8.446367149290591e-08, + "loss": 0.2065, + "step": 51634 + }, + { + "epoch": 0.958625887874411, + "grad_norm": 0.2380489856004715, + "learning_rate": 8.43124482481783e-08, + "loss": 0.3655, + "step": 51636 + }, + { + "epoch": 0.9586630180118296, + "grad_norm": 0.45640599727630615, + "learning_rate": 8.416135992625696e-08, + "loss": 0.2984, + "step": 51638 + }, + { + "epoch": 0.9587001481492483, + "grad_norm": 0.42374998331069946, + "learning_rate": 8.401040652919912e-08, + "loss": 0.3952, + "step": 51640 + }, + { + "epoch": 0.958737278286667, + "grad_norm": 0.44691967964172363, + "learning_rate": 8.385958805905758e-08, + "loss": 0.299, + "step": 51642 + }, + { + "epoch": 0.9587744084240856, + "grad_norm": 0.6609805822372437, + "learning_rate": 8.370890451788405e-08, + "loss": 0.3957, + "step": 51644 + }, + { + "epoch": 0.9588115385615043, + "grad_norm": 0.3307952284812927, + "learning_rate": 8.35583559077291e-08, + "loss": 0.2903, + "step": 51646 + }, + { + "epoch": 0.9588486686989228, + "grad_norm": 0.3113168179988861, + "learning_rate": 8.34079422306433e-08, + "loss": 0.2529, + "step": 51648 + }, + { + "epoch": 0.9588857988363415, + "grad_norm": 0.7100905776023865, + "learning_rate": 8.325766348867059e-08, + "loss": 0.1601, + "step": 51650 + }, + { + "epoch": 0.9589229289737601, + "grad_norm": 0.4381139576435089, + "learning_rate": 8.310751968385821e-08, + "loss": 0.2397, + "step": 51652 + }, + { + "epoch": 0.9589600591111788, + "grad_norm": 0.35432446002960205, + "learning_rate": 8.295751081824676e-08, + "loss": 0.2826, + "step": 51654 + }, + { + "epoch": 0.9589971892485974, + "grad_norm": 0.48388224840164185, + "learning_rate": 8.280763689387906e-08, + "loss": 0.2811, + "step": 51656 + }, + { + "epoch": 0.959034319386016, + "grad_norm": 0.35934674739837646, + "learning_rate": 8.265789791279456e-08, + "loss": 0.2151, + "step": 51658 + }, + { + "epoch": 0.9590714495234347, + "grad_norm": 0.43315452337265015, + "learning_rate": 8.25082938770294e-08, + "loss": 0.2681, + "step": 51660 + }, + { + "epoch": 0.9591085796608533, + "grad_norm": 0.44586893916130066, + "learning_rate": 8.235882478861978e-08, + "loss": 0.2726, + "step": 51662 + }, + { + "epoch": 0.959145709798272, + "grad_norm": 0.2879968583583832, + "learning_rate": 8.220949064960071e-08, + "loss": 0.2467, + "step": 51664 + }, + { + "epoch": 0.9591828399356906, + "grad_norm": 0.4043121039867401, + "learning_rate": 8.206029146200278e-08, + "loss": 0.1181, + "step": 51666 + }, + { + "epoch": 0.9592199700731092, + "grad_norm": 0.3585694432258606, + "learning_rate": 8.19112272278566e-08, + "loss": 0.1599, + "step": 51668 + }, + { + "epoch": 0.9592571002105279, + "grad_norm": 0.267323762178421, + "learning_rate": 8.176229794919055e-08, + "loss": 0.3345, + "step": 51670 + }, + { + "epoch": 0.9592942303479465, + "grad_norm": 0.46867766976356506, + "learning_rate": 8.161350362803078e-08, + "loss": 0.2797, + "step": 51672 + }, + { + "epoch": 0.9593313604853652, + "grad_norm": 0.7019122838973999, + "learning_rate": 8.146484426640344e-08, + "loss": 0.4138, + "step": 51674 + }, + { + "epoch": 0.9593684906227838, + "grad_norm": 0.4788989722728729, + "learning_rate": 8.131631986632804e-08, + "loss": 0.1903, + "step": 51676 + }, + { + "epoch": 0.9594056207602024, + "grad_norm": 0.2404189109802246, + "learning_rate": 8.116793042982962e-08, + "loss": 0.2649, + "step": 51678 + }, + { + "epoch": 0.9594427508976211, + "grad_norm": 0.377534419298172, + "learning_rate": 8.101967595892435e-08, + "loss": 0.187, + "step": 51680 + }, + { + "epoch": 0.9594798810350397, + "grad_norm": 0.40210217237472534, + "learning_rate": 8.08715564556306e-08, + "loss": 0.1981, + "step": 51682 + }, + { + "epoch": 0.9595170111724584, + "grad_norm": 0.5479485988616943, + "learning_rate": 8.072357192196345e-08, + "loss": 0.1356, + "step": 51684 + }, + { + "epoch": 0.959554141309877, + "grad_norm": 0.46031203866004944, + "learning_rate": 8.057572235993682e-08, + "loss": 0.4076, + "step": 51686 + }, + { + "epoch": 0.9595912714472956, + "grad_norm": 0.29781094193458557, + "learning_rate": 8.042800777156245e-08, + "loss": 0.3117, + "step": 51688 + }, + { + "epoch": 0.9596284015847143, + "grad_norm": 0.4593803286552429, + "learning_rate": 8.028042815885206e-08, + "loss": 0.3926, + "step": 51690 + }, + { + "epoch": 0.9596655317221329, + "grad_norm": 0.55975341796875, + "learning_rate": 8.013298352381072e-08, + "loss": 0.3164, + "step": 51692 + }, + { + "epoch": 0.9597026618595516, + "grad_norm": 0.28401824831962585, + "learning_rate": 7.998567386844569e-08, + "loss": 0.1594, + "step": 51694 + }, + { + "epoch": 0.9597397919969701, + "grad_norm": 0.48278170824050903, + "learning_rate": 7.983849919476206e-08, + "loss": 0.2595, + "step": 51696 + }, + { + "epoch": 0.9597769221343888, + "grad_norm": 0.24228018522262573, + "learning_rate": 7.969145950476154e-08, + "loss": 0.4278, + "step": 51698 + }, + { + "epoch": 0.9598140522718075, + "grad_norm": 0.28728726506233215, + "learning_rate": 7.954455480044587e-08, + "loss": 0.2734, + "step": 51700 + }, + { + "epoch": 0.9598511824092261, + "grad_norm": 0.30777013301849365, + "learning_rate": 7.939778508381457e-08, + "loss": 0.3074, + "step": 51702 + }, + { + "epoch": 0.9598883125466448, + "grad_norm": 0.32437658309936523, + "learning_rate": 7.925115035686271e-08, + "loss": 0.3027, + "step": 51704 + }, + { + "epoch": 0.9599254426840633, + "grad_norm": 0.459170937538147, + "learning_rate": 7.910465062158756e-08, + "loss": 0.2865, + "step": 51706 + }, + { + "epoch": 0.959962572821482, + "grad_norm": 0.3641872704029083, + "learning_rate": 7.895828587997978e-08, + "loss": 0.3881, + "step": 51708 + }, + { + "epoch": 0.9599997029589007, + "grad_norm": 0.4682431221008301, + "learning_rate": 7.881205613403442e-08, + "loss": 0.2795, + "step": 51710 + }, + { + "epoch": 0.9600368330963193, + "grad_norm": 0.3898244798183441, + "learning_rate": 7.866596138573989e-08, + "loss": 0.3468, + "step": 51712 + }, + { + "epoch": 0.960073963233738, + "grad_norm": 0.480672687292099, + "learning_rate": 7.85200016370824e-08, + "loss": 0.4507, + "step": 51714 + }, + { + "epoch": 0.9601110933711565, + "grad_norm": 0.2736056447029114, + "learning_rate": 7.837417689005034e-08, + "loss": 0.143, + "step": 51716 + }, + { + "epoch": 0.9601482235085752, + "grad_norm": 0.5502651333808899, + "learning_rate": 7.822848714662656e-08, + "loss": 0.2166, + "step": 51718 + }, + { + "epoch": 0.9601853536459939, + "grad_norm": 0.5269626975059509, + "learning_rate": 7.808293240879395e-08, + "loss": 0.1047, + "step": 51720 + }, + { + "epoch": 0.9602224837834125, + "grad_norm": 0.48788073658943176, + "learning_rate": 7.793751267853312e-08, + "loss": 0.2866, + "step": 51722 + }, + { + "epoch": 0.9602596139208311, + "grad_norm": 0.5617185235023499, + "learning_rate": 7.779222795782249e-08, + "loss": 0.1351, + "step": 51724 + }, + { + "epoch": 0.9602967440582497, + "grad_norm": 0.38651520013809204, + "learning_rate": 7.764707824863937e-08, + "loss": 0.0833, + "step": 51726 + }, + { + "epoch": 0.9603338741956684, + "grad_norm": 0.4881591200828552, + "learning_rate": 7.750206355295886e-08, + "loss": 0.2759, + "step": 51728 + }, + { + "epoch": 0.9603710043330871, + "grad_norm": 0.3922053873538971, + "learning_rate": 7.735718387275382e-08, + "loss": 0.2945, + "step": 51730 + }, + { + "epoch": 0.9604081344705057, + "grad_norm": 0.45379650592803955, + "learning_rate": 7.721243920999599e-08, + "loss": 0.303, + "step": 51732 + }, + { + "epoch": 0.9604452646079243, + "grad_norm": 0.38330143690109253, + "learning_rate": 7.706782956665493e-08, + "loss": 0.3267, + "step": 51734 + }, + { + "epoch": 0.9604823947453429, + "grad_norm": 0.44030776619911194, + "learning_rate": 7.692335494469683e-08, + "loss": 0.3364, + "step": 51736 + }, + { + "epoch": 0.9605195248827616, + "grad_norm": 0.3758874237537384, + "learning_rate": 7.6779015346089e-08, + "loss": 0.3852, + "step": 51738 + }, + { + "epoch": 0.9605566550201803, + "grad_norm": 0.2617555260658264, + "learning_rate": 7.663481077279655e-08, + "loss": 0.227, + "step": 51740 + }, + { + "epoch": 0.9605937851575989, + "grad_norm": 0.32857197523117065, + "learning_rate": 7.649074122678013e-08, + "loss": 0.3449, + "step": 51742 + }, + { + "epoch": 0.9606309152950175, + "grad_norm": 0.3772386908531189, + "learning_rate": 7.634680670999928e-08, + "loss": 0.2439, + "step": 51744 + }, + { + "epoch": 0.9606680454324361, + "grad_norm": 0.3572908043861389, + "learning_rate": 7.620300722441465e-08, + "loss": 0.3608, + "step": 51746 + }, + { + "epoch": 0.9607051755698548, + "grad_norm": 0.39418825507164, + "learning_rate": 7.605934277198134e-08, + "loss": 0.3055, + "step": 51748 + }, + { + "epoch": 0.9607423057072734, + "grad_norm": 0.271115243434906, + "learning_rate": 7.591581335465448e-08, + "loss": 0.234, + "step": 51750 + }, + { + "epoch": 0.960779435844692, + "grad_norm": 0.3313447833061218, + "learning_rate": 7.577241897438802e-08, + "loss": 0.3719, + "step": 51752 + }, + { + "epoch": 0.9608165659821107, + "grad_norm": 0.40390798449516296, + "learning_rate": 7.562915963313156e-08, + "loss": 0.4454, + "step": 51754 + }, + { + "epoch": 0.9608536961195293, + "grad_norm": 0.1685211956501007, + "learning_rate": 7.548603533283683e-08, + "loss": 0.1217, + "step": 51756 + }, + { + "epoch": 0.960890826256948, + "grad_norm": 0.31904810667037964, + "learning_rate": 7.534304607544895e-08, + "loss": 0.2766, + "step": 51758 + }, + { + "epoch": 0.9609279563943666, + "grad_norm": 0.37015795707702637, + "learning_rate": 7.520019186291416e-08, + "loss": 0.493, + "step": 51760 + }, + { + "epoch": 0.9609650865317853, + "grad_norm": 0.7840816974639893, + "learning_rate": 7.505747269717645e-08, + "loss": 0.3699, + "step": 51762 + }, + { + "epoch": 0.9610022166692039, + "grad_norm": 0.4443550705909729, + "learning_rate": 7.49148885801776e-08, + "loss": 0.2254, + "step": 51764 + }, + { + "epoch": 0.9610393468066225, + "grad_norm": 0.3132689893245697, + "learning_rate": 7.477243951385937e-08, + "loss": 0.305, + "step": 51766 + }, + { + "epoch": 0.9610764769440412, + "grad_norm": 0.5389214754104614, + "learning_rate": 7.463012550015803e-08, + "loss": 0.4715, + "step": 51768 + }, + { + "epoch": 0.9611136070814598, + "grad_norm": 0.42372971773147583, + "learning_rate": 7.448794654100976e-08, + "loss": 0.3061, + "step": 51770 + }, + { + "epoch": 0.9611507372188784, + "grad_norm": 0.2763903737068176, + "learning_rate": 7.434590263835084e-08, + "loss": 0.1386, + "step": 51772 + }, + { + "epoch": 0.9611878673562971, + "grad_norm": 0.45145857334136963, + "learning_rate": 7.420399379411414e-08, + "loss": 0.2279, + "step": 51774 + }, + { + "epoch": 0.9612249974937157, + "grad_norm": 0.4513481557369232, + "learning_rate": 7.406222001022923e-08, + "loss": 0.3085, + "step": 51776 + }, + { + "epoch": 0.9612621276311344, + "grad_norm": 0.43919387459754944, + "learning_rate": 7.392058128862677e-08, + "loss": 0.307, + "step": 51778 + }, + { + "epoch": 0.961299257768553, + "grad_norm": 0.2974982261657715, + "learning_rate": 7.377907763123304e-08, + "loss": 0.4287, + "step": 51780 + }, + { + "epoch": 0.9613363879059716, + "grad_norm": 0.42442306876182556, + "learning_rate": 7.363770903997203e-08, + "loss": 0.1967, + "step": 51782 + }, + { + "epoch": 0.9613735180433903, + "grad_norm": 0.5041149258613586, + "learning_rate": 7.349647551676997e-08, + "loss": 0.2604, + "step": 51784 + }, + { + "epoch": 0.9614106481808089, + "grad_norm": 0.49555397033691406, + "learning_rate": 7.335537706354756e-08, + "loss": 0.3164, + "step": 51786 + }, + { + "epoch": 0.9614477783182276, + "grad_norm": 0.5422022938728333, + "learning_rate": 7.321441368222437e-08, + "loss": 0.2222, + "step": 51788 + }, + { + "epoch": 0.9614849084556462, + "grad_norm": 0.27792268991470337, + "learning_rate": 7.307358537471998e-08, + "loss": 0.2901, + "step": 51790 + }, + { + "epoch": 0.9615220385930648, + "grad_norm": 0.24670034646987915, + "learning_rate": 7.29328921429484e-08, + "loss": 0.1027, + "step": 51792 + }, + { + "epoch": 0.9615591687304835, + "grad_norm": 0.5402873158454895, + "learning_rate": 7.279233398882479e-08, + "loss": 0.1407, + "step": 51794 + }, + { + "epoch": 0.9615962988679021, + "grad_norm": 0.5390249490737915, + "learning_rate": 7.265191091426204e-08, + "loss": 0.356, + "step": 51796 + }, + { + "epoch": 0.9616334290053208, + "grad_norm": 0.36146315932273865, + "learning_rate": 7.251162292117198e-08, + "loss": 0.1821, + "step": 51798 + }, + { + "epoch": 0.9616705591427394, + "grad_norm": 0.5167868137359619, + "learning_rate": 7.237147001146195e-08, + "loss": 0.2239, + "step": 51800 + }, + { + "epoch": 0.961707689280158, + "grad_norm": 0.2698373794555664, + "learning_rate": 7.223145218704042e-08, + "loss": 0.3792, + "step": 51802 + }, + { + "epoch": 0.9617448194175766, + "grad_norm": 0.3992241621017456, + "learning_rate": 7.209156944981033e-08, + "loss": 0.079, + "step": 51804 + }, + { + "epoch": 0.9617819495549953, + "grad_norm": 0.30803990364074707, + "learning_rate": 7.195182180167793e-08, + "loss": 0.2093, + "step": 51806 + }, + { + "epoch": 0.961819079692414, + "grad_norm": 0.41861769556999207, + "learning_rate": 7.181220924454169e-08, + "loss": 0.5198, + "step": 51808 + }, + { + "epoch": 0.9618562098298326, + "grad_norm": 0.5619530081748962, + "learning_rate": 7.167273178030343e-08, + "loss": 0.1621, + "step": 51810 + }, + { + "epoch": 0.9618933399672512, + "grad_norm": 0.42366135120391846, + "learning_rate": 7.153338941086052e-08, + "loss": 0.3556, + "step": 51812 + }, + { + "epoch": 0.9619304701046698, + "grad_norm": 0.32972222566604614, + "learning_rate": 7.139418213810922e-08, + "loss": 0.2898, + "step": 51814 + }, + { + "epoch": 0.9619676002420885, + "grad_norm": 0.33899882435798645, + "learning_rate": 7.125510996394358e-08, + "loss": 0.1679, + "step": 51816 + }, + { + "epoch": 0.9620047303795072, + "grad_norm": 0.36961978673934937, + "learning_rate": 7.111617289025541e-08, + "loss": 0.2689, + "step": 51818 + }, + { + "epoch": 0.9620418605169258, + "grad_norm": 0.38777318596839905, + "learning_rate": 7.097737091893652e-08, + "loss": 0.275, + "step": 51820 + }, + { + "epoch": 0.9620789906543444, + "grad_norm": 0.49063849449157715, + "learning_rate": 7.083870405187432e-08, + "loss": 0.2872, + "step": 51822 + }, + { + "epoch": 0.962116120791763, + "grad_norm": 0.35705888271331787, + "learning_rate": 7.070017229095615e-08, + "loss": 0.3404, + "step": 51824 + }, + { + "epoch": 0.9621532509291817, + "grad_norm": 0.3247383236885071, + "learning_rate": 7.05617756380661e-08, + "loss": 0.3949, + "step": 51826 + }, + { + "epoch": 0.9621903810666004, + "grad_norm": 0.26819127798080444, + "learning_rate": 7.04235140950893e-08, + "loss": 0.3194, + "step": 51828 + }, + { + "epoch": 0.962227511204019, + "grad_norm": 0.5702958106994629, + "learning_rate": 7.028538766390536e-08, + "loss": 0.2757, + "step": 51830 + }, + { + "epoch": 0.9622646413414376, + "grad_norm": 0.30072227120399475, + "learning_rate": 7.014739634639389e-08, + "loss": 0.283, + "step": 51832 + }, + { + "epoch": 0.9623017714788562, + "grad_norm": 0.4023928642272949, + "learning_rate": 7.00095401444334e-08, + "loss": 0.2026, + "step": 51834 + }, + { + "epoch": 0.9623389016162749, + "grad_norm": 0.5951882600784302, + "learning_rate": 6.987181905990014e-08, + "loss": 0.3694, + "step": 51836 + }, + { + "epoch": 0.9623760317536936, + "grad_norm": 0.4866952896118164, + "learning_rate": 6.97342330946671e-08, + "loss": 0.1818, + "step": 51838 + }, + { + "epoch": 0.9624131618911121, + "grad_norm": 0.42780017852783203, + "learning_rate": 6.959678225060496e-08, + "loss": 0.2664, + "step": 51840 + }, + { + "epoch": 0.9624502920285308, + "grad_norm": 0.43010371923446655, + "learning_rate": 6.94594665295878e-08, + "loss": 0.119, + "step": 51842 + }, + { + "epoch": 0.9624874221659494, + "grad_norm": 0.4382522702217102, + "learning_rate": 6.932228593348078e-08, + "loss": 0.1905, + "step": 51844 + }, + { + "epoch": 0.9625245523033681, + "grad_norm": 0.5566995739936829, + "learning_rate": 6.918524046415132e-08, + "loss": 0.35, + "step": 51846 + }, + { + "epoch": 0.9625616824407867, + "grad_norm": 0.22547376155853271, + "learning_rate": 6.904833012346457e-08, + "loss": 0.1593, + "step": 51848 + }, + { + "epoch": 0.9625988125782053, + "grad_norm": 0.38808420300483704, + "learning_rate": 6.89115549132835e-08, + "loss": 0.1203, + "step": 51850 + }, + { + "epoch": 0.962635942715624, + "grad_norm": 0.40613439679145813, + "learning_rate": 6.877491483546883e-08, + "loss": 0.1904, + "step": 51852 + }, + { + "epoch": 0.9626730728530426, + "grad_norm": 0.4027932286262512, + "learning_rate": 6.863840989188131e-08, + "loss": 0.2431, + "step": 51854 + }, + { + "epoch": 0.9627102029904613, + "grad_norm": 0.3869066536426544, + "learning_rate": 6.850204008437611e-08, + "loss": 0.2266, + "step": 51856 + }, + { + "epoch": 0.9627473331278799, + "grad_norm": 0.4246046841144562, + "learning_rate": 6.836580541480953e-08, + "loss": 0.159, + "step": 51858 + }, + { + "epoch": 0.9627844632652985, + "grad_norm": 0.6201530694961548, + "learning_rate": 6.822970588503675e-08, + "loss": 0.238, + "step": 51860 + }, + { + "epoch": 0.9628215934027172, + "grad_norm": 0.39868947863578796, + "learning_rate": 6.80937414969085e-08, + "loss": 0.1883, + "step": 51862 + }, + { + "epoch": 0.9628587235401358, + "grad_norm": 0.33960285782814026, + "learning_rate": 6.795791225227443e-08, + "loss": 0.3644, + "step": 51864 + }, + { + "epoch": 0.9628958536775545, + "grad_norm": 0.1643591970205307, + "learning_rate": 6.782221815298306e-08, + "loss": 0.123, + "step": 51866 + }, + { + "epoch": 0.962932983814973, + "grad_norm": 0.46688312292099, + "learning_rate": 6.76866592008818e-08, + "loss": 0.2929, + "step": 51868 + }, + { + "epoch": 0.9629701139523917, + "grad_norm": 0.4753555655479431, + "learning_rate": 6.75512353978136e-08, + "loss": 0.3647, + "step": 51870 + }, + { + "epoch": 0.9630072440898104, + "grad_norm": 0.6376470327377319, + "learning_rate": 6.741594674562147e-08, + "loss": 0.2634, + "step": 51872 + }, + { + "epoch": 0.963044374227229, + "grad_norm": 0.5328581929206848, + "learning_rate": 6.728079324614723e-08, + "loss": 0.3076, + "step": 51874 + }, + { + "epoch": 0.9630815043646477, + "grad_norm": 0.3925354480743408, + "learning_rate": 6.714577490123053e-08, + "loss": 0.3007, + "step": 51876 + }, + { + "epoch": 0.9631186345020663, + "grad_norm": 0.2147914320230484, + "learning_rate": 6.701089171270658e-08, + "loss": 0.1747, + "step": 51878 + }, + { + "epoch": 0.9631557646394849, + "grad_norm": 0.36584752798080444, + "learning_rate": 6.687614368241168e-08, + "loss": 0.0897, + "step": 51880 + }, + { + "epoch": 0.9631928947769036, + "grad_norm": 0.41972261667251587, + "learning_rate": 6.674153081217882e-08, + "loss": 0.2913, + "step": 51882 + }, + { + "epoch": 0.9632300249143222, + "grad_norm": 0.403815895318985, + "learning_rate": 6.660705310384207e-08, + "loss": 0.3287, + "step": 51884 + }, + { + "epoch": 0.9632671550517409, + "grad_norm": 0.269770085811615, + "learning_rate": 6.647271055922777e-08, + "loss": 0.1702, + "step": 51886 + }, + { + "epoch": 0.9633042851891594, + "grad_norm": 0.38881126046180725, + "learning_rate": 6.633850318016554e-08, + "loss": 0.403, + "step": 51888 + }, + { + "epoch": 0.9633414153265781, + "grad_norm": 0.3687662184238434, + "learning_rate": 6.620443096848172e-08, + "loss": 0.2, + "step": 51890 + }, + { + "epoch": 0.9633785454639968, + "grad_norm": 0.656246542930603, + "learning_rate": 6.60704939260004e-08, + "loss": 0.1359, + "step": 51892 + }, + { + "epoch": 0.9634156756014154, + "grad_norm": 0.3333469033241272, + "learning_rate": 6.593669205454457e-08, + "loss": 0.1591, + "step": 51894 + }, + { + "epoch": 0.9634528057388341, + "grad_norm": 0.4012209177017212, + "learning_rate": 6.580302535593385e-08, + "loss": 0.3402, + "step": 51896 + }, + { + "epoch": 0.9634899358762526, + "grad_norm": 0.5394437313079834, + "learning_rate": 6.566949383198684e-08, + "loss": 0.1841, + "step": 51898 + }, + { + "epoch": 0.9635270660136713, + "grad_norm": 0.2256866991519928, + "learning_rate": 6.553609748452206e-08, + "loss": 0.323, + "step": 51900 + }, + { + "epoch": 0.9635641961510899, + "grad_norm": 0.28045791387557983, + "learning_rate": 6.540283631535471e-08, + "loss": 0.3361, + "step": 51902 + }, + { + "epoch": 0.9636013262885086, + "grad_norm": 0.39396989345550537, + "learning_rate": 6.52697103262967e-08, + "loss": 0.2667, + "step": 51904 + }, + { + "epoch": 0.9636384564259273, + "grad_norm": 0.5530263781547546, + "learning_rate": 6.51367195191599e-08, + "loss": 0.4838, + "step": 51906 + }, + { + "epoch": 0.9636755865633458, + "grad_norm": 0.27274277806282043, + "learning_rate": 6.500386389575286e-08, + "loss": 0.3169, + "step": 51908 + }, + { + "epoch": 0.9637127167007645, + "grad_norm": 0.47739526629447937, + "learning_rate": 6.487114345788526e-08, + "loss": 0.4687, + "step": 51910 + }, + { + "epoch": 0.9637498468381831, + "grad_norm": 0.47350066900253296, + "learning_rate": 6.47385582073623e-08, + "loss": 0.3993, + "step": 51912 + }, + { + "epoch": 0.9637869769756018, + "grad_norm": 0.38004085421562195, + "learning_rate": 6.460610814598811e-08, + "loss": 0.3426, + "step": 51914 + }, + { + "epoch": 0.9638241071130205, + "grad_norm": 0.23073621094226837, + "learning_rate": 6.44737932755657e-08, + "loss": 0.2882, + "step": 51916 + }, + { + "epoch": 0.963861237250439, + "grad_norm": 0.22326688468456268, + "learning_rate": 6.434161359789359e-08, + "loss": 0.1193, + "step": 51918 + }, + { + "epoch": 0.9638983673878577, + "grad_norm": 0.30562201142311096, + "learning_rate": 6.42095691147715e-08, + "loss": 0.1272, + "step": 51920 + }, + { + "epoch": 0.9639354975252763, + "grad_norm": 0.43220290541648865, + "learning_rate": 6.407765982799574e-08, + "loss": 0.1837, + "step": 51922 + }, + { + "epoch": 0.963972627662695, + "grad_norm": 0.43850141763687134, + "learning_rate": 6.394588573936156e-08, + "loss": 0.1758, + "step": 51924 + }, + { + "epoch": 0.9640097578001137, + "grad_norm": 0.5628682374954224, + "learning_rate": 6.381424685066195e-08, + "loss": 0.3684, + "step": 51926 + }, + { + "epoch": 0.9640468879375322, + "grad_norm": 0.3620678782463074, + "learning_rate": 6.368274316368884e-08, + "loss": 0.141, + "step": 51928 + }, + { + "epoch": 0.9640840180749509, + "grad_norm": 0.40990346670150757, + "learning_rate": 6.355137468023076e-08, + "loss": 0.2374, + "step": 51930 + }, + { + "epoch": 0.9641211482123695, + "grad_norm": 0.27141252160072327, + "learning_rate": 6.34201414020752e-08, + "loss": 0.1238, + "step": 51932 + }, + { + "epoch": 0.9641582783497882, + "grad_norm": 0.605760395526886, + "learning_rate": 6.328904333100739e-08, + "loss": 0.2618, + "step": 51934 + }, + { + "epoch": 0.9641954084872069, + "grad_norm": 0.47716793417930603, + "learning_rate": 6.315808046881144e-08, + "loss": 0.2998, + "step": 51936 + }, + { + "epoch": 0.9642325386246254, + "grad_norm": 0.23172542452812195, + "learning_rate": 6.30272528172704e-08, + "loss": 0.0864, + "step": 51938 + }, + { + "epoch": 0.9642696687620441, + "grad_norm": 0.48228976130485535, + "learning_rate": 6.289656037816394e-08, + "loss": 0.2307, + "step": 51940 + }, + { + "epoch": 0.9643067988994627, + "grad_norm": 0.29130199551582336, + "learning_rate": 6.276600315327064e-08, + "loss": 0.3453, + "step": 51942 + }, + { + "epoch": 0.9643439290368814, + "grad_norm": 0.41725465655326843, + "learning_rate": 6.263558114436574e-08, + "loss": 0.1707, + "step": 51944 + }, + { + "epoch": 0.9643810591743001, + "grad_norm": 0.19328583776950836, + "learning_rate": 6.250529435322561e-08, + "loss": 0.2533, + "step": 51946 + }, + { + "epoch": 0.9644181893117186, + "grad_norm": 0.3093467652797699, + "learning_rate": 6.237514278162215e-08, + "loss": 0.2097, + "step": 51948 + }, + { + "epoch": 0.9644553194491373, + "grad_norm": 0.3799770474433899, + "learning_rate": 6.224512643132508e-08, + "loss": 0.4986, + "step": 51950 + }, + { + "epoch": 0.9644924495865559, + "grad_norm": 0.3797805905342102, + "learning_rate": 6.211524530410628e-08, + "loss": 0.0691, + "step": 51952 + }, + { + "epoch": 0.9645295797239746, + "grad_norm": 0.7204617261886597, + "learning_rate": 6.198549940173104e-08, + "loss": 0.3494, + "step": 51954 + }, + { + "epoch": 0.9645667098613931, + "grad_norm": 0.41263675689697266, + "learning_rate": 6.18558887259646e-08, + "loss": 0.1385, + "step": 51956 + }, + { + "epoch": 0.9646038399988118, + "grad_norm": 0.4448762834072113, + "learning_rate": 6.17264132785722e-08, + "loss": 0.1774, + "step": 51958 + }, + { + "epoch": 0.9646409701362305, + "grad_norm": 0.23807239532470703, + "learning_rate": 6.159707306131468e-08, + "loss": 0.1604, + "step": 51960 + }, + { + "epoch": 0.9646781002736491, + "grad_norm": 0.3758525550365448, + "learning_rate": 6.146786807595173e-08, + "loss": 0.1963, + "step": 51962 + }, + { + "epoch": 0.9647152304110678, + "grad_norm": 0.32541969418525696, + "learning_rate": 6.133879832424084e-08, + "loss": 0.1494, + "step": 51964 + }, + { + "epoch": 0.9647523605484863, + "grad_norm": 0.5362234115600586, + "learning_rate": 6.120986380794059e-08, + "loss": 0.2944, + "step": 51966 + }, + { + "epoch": 0.964789490685905, + "grad_norm": 0.3478713631629944, + "learning_rate": 6.108106452880403e-08, + "loss": 0.2385, + "step": 51968 + }, + { + "epoch": 0.9648266208233237, + "grad_norm": 0.5390769243240356, + "learning_rate": 6.095240048858308e-08, + "loss": 0.3323, + "step": 51970 + }, + { + "epoch": 0.9648637509607423, + "grad_norm": 0.5467588305473328, + "learning_rate": 6.08238716890297e-08, + "loss": 0.4634, + "step": 51972 + }, + { + "epoch": 0.964900881098161, + "grad_norm": 0.4785202741622925, + "learning_rate": 6.069547813189136e-08, + "loss": 0.3048, + "step": 51974 + }, + { + "epoch": 0.9649380112355795, + "grad_norm": 0.24885335564613342, + "learning_rate": 6.056721981891556e-08, + "loss": 0.3631, + "step": 51976 + }, + { + "epoch": 0.9649751413729982, + "grad_norm": 0.3617143929004669, + "learning_rate": 6.043909675184867e-08, + "loss": 0.1883, + "step": 51978 + }, + { + "epoch": 0.9650122715104169, + "grad_norm": 0.4388920068740845, + "learning_rate": 6.031110893243375e-08, + "loss": 0.2838, + "step": 51980 + }, + { + "epoch": 0.9650494016478355, + "grad_norm": 0.23073747754096985, + "learning_rate": 6.018325636241052e-08, + "loss": 0.3341, + "step": 51982 + }, + { + "epoch": 0.9650865317852542, + "grad_norm": 0.2821613848209381, + "learning_rate": 6.005553904352091e-08, + "loss": 0.2022, + "step": 51984 + }, + { + "epoch": 0.9651236619226727, + "grad_norm": 0.6161848902702332, + "learning_rate": 5.992795697750132e-08, + "loss": 0.0772, + "step": 51986 + }, + { + "epoch": 0.9651607920600914, + "grad_norm": 0.7914007306098938, + "learning_rate": 5.980051016608812e-08, + "loss": 0.4433, + "step": 51988 + }, + { + "epoch": 0.9651979221975101, + "grad_norm": 0.3046756982803345, + "learning_rate": 5.967319861101661e-08, + "loss": 0.2908, + "step": 51990 + }, + { + "epoch": 0.9652350523349287, + "grad_norm": 0.3921298682689667, + "learning_rate": 5.954602231401763e-08, + "loss": 0.1715, + "step": 51992 + }, + { + "epoch": 0.9652721824723474, + "grad_norm": 0.44833242893218994, + "learning_rate": 5.9418981276821995e-08, + "loss": 0.2819, + "step": 51994 + }, + { + "epoch": 0.9653093126097659, + "grad_norm": 0.5143845081329346, + "learning_rate": 5.929207550115834e-08, + "loss": 0.2821, + "step": 51996 + }, + { + "epoch": 0.9653464427471846, + "grad_norm": 0.26001134514808655, + "learning_rate": 5.916530498875417e-08, + "loss": 0.3764, + "step": 51998 + }, + { + "epoch": 0.9653835728846032, + "grad_norm": 0.3874657452106476, + "learning_rate": 5.9038669741332547e-08, + "loss": 0.2241, + "step": 52000 + }, + { + "epoch": 0.9654207030220219, + "grad_norm": 0.40083709359169006, + "learning_rate": 5.8912169760619866e-08, + "loss": 0.3464, + "step": 52002 + }, + { + "epoch": 0.9654578331594406, + "grad_norm": 0.22208306193351746, + "learning_rate": 5.878580504833475e-08, + "loss": 0.1798, + "step": 52004 + }, + { + "epoch": 0.9654949632968591, + "grad_norm": 0.33477073907852173, + "learning_rate": 5.865957560619695e-08, + "loss": 0.2475, + "step": 52006 + }, + { + "epoch": 0.9655320934342778, + "grad_norm": 0.664359986782074, + "learning_rate": 5.853348143592508e-08, + "loss": 0.2207, + "step": 52008 + }, + { + "epoch": 0.9655692235716964, + "grad_norm": 0.34503164887428284, + "learning_rate": 5.8407522539235537e-08, + "loss": 0.4315, + "step": 52010 + }, + { + "epoch": 0.9656063537091151, + "grad_norm": 0.1442912518978119, + "learning_rate": 5.828169891783919e-08, + "loss": 0.1798, + "step": 52012 + }, + { + "epoch": 0.9656434838465338, + "grad_norm": 0.4738776981830597, + "learning_rate": 5.815601057345133e-08, + "loss": 0.3199, + "step": 52014 + }, + { + "epoch": 0.9656806139839523, + "grad_norm": 0.31175050139427185, + "learning_rate": 5.803045750778058e-08, + "loss": 0.2108, + "step": 52016 + }, + { + "epoch": 0.965717744121371, + "grad_norm": 0.3600679337978363, + "learning_rate": 5.7905039722535585e-08, + "loss": 0.0992, + "step": 52018 + }, + { + "epoch": 0.9657548742587896, + "grad_norm": 0.33089563250541687, + "learning_rate": 5.777975721942275e-08, + "loss": 0.0236, + "step": 52020 + }, + { + "epoch": 0.9657920043962083, + "grad_norm": 0.28495630621910095, + "learning_rate": 5.765461000014627e-08, + "loss": 0.2472, + "step": 52022 + }, + { + "epoch": 0.965829134533627, + "grad_norm": 0.28831151127815247, + "learning_rate": 5.752959806641145e-08, + "loss": 0.2628, + "step": 52024 + }, + { + "epoch": 0.9658662646710455, + "grad_norm": 0.26042693853378296, + "learning_rate": 5.740472141991582e-08, + "loss": 0.2352, + "step": 52026 + }, + { + "epoch": 0.9659033948084642, + "grad_norm": 0.30099377036094666, + "learning_rate": 5.7279980062362464e-08, + "loss": 0.3728, + "step": 52028 + }, + { + "epoch": 0.9659405249458828, + "grad_norm": 0.3281489610671997, + "learning_rate": 5.7155373995445575e-08, + "loss": 0.2401, + "step": 52030 + }, + { + "epoch": 0.9659776550833015, + "grad_norm": 0.4513225257396698, + "learning_rate": 5.703090322086158e-08, + "loss": 0.4553, + "step": 52032 + }, + { + "epoch": 0.9660147852207202, + "grad_norm": 0.48078542947769165, + "learning_rate": 5.690656774030468e-08, + "loss": 0.1003, + "step": 52034 + }, + { + "epoch": 0.9660519153581387, + "grad_norm": 0.3619637191295624, + "learning_rate": 5.678236755546684e-08, + "loss": 0.3818, + "step": 52036 + }, + { + "epoch": 0.9660890454955574, + "grad_norm": 0.592544674873352, + "learning_rate": 5.6658302668036734e-08, + "loss": 0.2069, + "step": 52038 + }, + { + "epoch": 0.966126175632976, + "grad_norm": 0.30436888337135315, + "learning_rate": 5.6534373079703e-08, + "loss": 0.1948, + "step": 52040 + }, + { + "epoch": 0.9661633057703947, + "grad_norm": 0.32228145003318787, + "learning_rate": 5.641057879215317e-08, + "loss": 0.2733, + "step": 52042 + }, + { + "epoch": 0.9662004359078133, + "grad_norm": 0.4426427185535431, + "learning_rate": 5.628691980707146e-08, + "loss": 0.3353, + "step": 52044 + }, + { + "epoch": 0.9662375660452319, + "grad_norm": 0.46166566014289856, + "learning_rate": 5.616339612613875e-08, + "loss": 0.2531, + "step": 52046 + }, + { + "epoch": 0.9662746961826506, + "grad_norm": 0.2919369637966156, + "learning_rate": 5.604000775103702e-08, + "loss": 0.4079, + "step": 52048 + }, + { + "epoch": 0.9663118263200692, + "grad_norm": 0.2622028887271881, + "learning_rate": 5.591675468344604e-08, + "loss": 0.296, + "step": 52050 + }, + { + "epoch": 0.9663489564574879, + "grad_norm": 0.2078421264886856, + "learning_rate": 5.5793636925042246e-08, + "loss": 0.4202, + "step": 52052 + }, + { + "epoch": 0.9663860865949064, + "grad_norm": 0.2534765303134918, + "learning_rate": 5.567065447749987e-08, + "loss": 0.2572, + "step": 52054 + }, + { + "epoch": 0.9664232167323251, + "grad_norm": 0.38242074847221375, + "learning_rate": 5.55478073424931e-08, + "loss": 0.2907, + "step": 52056 + }, + { + "epoch": 0.9664603468697438, + "grad_norm": 0.37709495425224304, + "learning_rate": 5.542509552169395e-08, + "loss": 0.2081, + "step": 52058 + }, + { + "epoch": 0.9664974770071624, + "grad_norm": 0.3213070034980774, + "learning_rate": 5.5302519016771085e-08, + "loss": 0.1386, + "step": 52060 + }, + { + "epoch": 0.9665346071445811, + "grad_norm": 0.4885368049144745, + "learning_rate": 5.518007782939428e-08, + "loss": 0.1624, + "step": 52062 + }, + { + "epoch": 0.9665717372819996, + "grad_norm": 0.24357494711875916, + "learning_rate": 5.5057771961227744e-08, + "loss": 0.3749, + "step": 52064 + }, + { + "epoch": 0.9666088674194183, + "grad_norm": 0.8089112639427185, + "learning_rate": 5.493560141393572e-08, + "loss": 0.1917, + "step": 52066 + }, + { + "epoch": 0.966645997556837, + "grad_norm": 0.40955623984336853, + "learning_rate": 5.481356618918243e-08, + "loss": 0.2914, + "step": 52068 + }, + { + "epoch": 0.9666831276942556, + "grad_norm": 0.2952806055545807, + "learning_rate": 5.469166628862765e-08, + "loss": 0.3548, + "step": 52070 + }, + { + "epoch": 0.9667202578316743, + "grad_norm": 0.43467700481414795, + "learning_rate": 5.4569901713927844e-08, + "loss": 0.2834, + "step": 52072 + }, + { + "epoch": 0.9667573879690928, + "grad_norm": 0.28507158160209656, + "learning_rate": 5.444827246674278e-08, + "loss": 0.2506, + "step": 52074 + }, + { + "epoch": 0.9667945181065115, + "grad_norm": 0.22088083624839783, + "learning_rate": 5.43267785487267e-08, + "loss": 0.2486, + "step": 52076 + }, + { + "epoch": 0.9668316482439302, + "grad_norm": 0.3074319064617157, + "learning_rate": 5.420541996153161e-08, + "loss": 0.2668, + "step": 52078 + }, + { + "epoch": 0.9668687783813488, + "grad_norm": 0.2029096633195877, + "learning_rate": 5.4084196706810646e-08, + "loss": 0.2417, + "step": 52080 + }, + { + "epoch": 0.9669059085187675, + "grad_norm": 0.3142991065979004, + "learning_rate": 5.396310878621136e-08, + "loss": 0.2092, + "step": 52082 + }, + { + "epoch": 0.966943038656186, + "grad_norm": 0.4453062415122986, + "learning_rate": 5.3842156201382444e-08, + "loss": 0.4936, + "step": 52084 + }, + { + "epoch": 0.9669801687936047, + "grad_norm": 0.9171554446220398, + "learning_rate": 5.3721338953969246e-08, + "loss": 0.2611, + "step": 52086 + }, + { + "epoch": 0.9670172989310234, + "grad_norm": 0.4402405321598053, + "learning_rate": 5.360065704561712e-08, + "loss": 0.3302, + "step": 52088 + }, + { + "epoch": 0.967054429068442, + "grad_norm": 0.40600696206092834, + "learning_rate": 5.3480110477965864e-08, + "loss": 0.318, + "step": 52090 + }, + { + "epoch": 0.9670915592058607, + "grad_norm": 0.4360716640949249, + "learning_rate": 5.33596992526586e-08, + "loss": 0.3749, + "step": 52092 + }, + { + "epoch": 0.9671286893432792, + "grad_norm": 0.3171249330043793, + "learning_rate": 5.32394233713307e-08, + "loss": 0.2076, + "step": 52094 + }, + { + "epoch": 0.9671658194806979, + "grad_norm": 0.3083970844745636, + "learning_rate": 5.3119282835621956e-08, + "loss": 0.1401, + "step": 52096 + }, + { + "epoch": 0.9672029496181166, + "grad_norm": 0.36607569456100464, + "learning_rate": 5.2999277647163284e-08, + "loss": 0.4236, + "step": 52098 + }, + { + "epoch": 0.9672400797555352, + "grad_norm": 0.33107033371925354, + "learning_rate": 5.2879407807591156e-08, + "loss": 0.2523, + "step": 52100 + }, + { + "epoch": 0.9672772098929538, + "grad_norm": 0.35962867736816406, + "learning_rate": 5.2759673318534266e-08, + "loss": 0.1013, + "step": 52102 + }, + { + "epoch": 0.9673143400303724, + "grad_norm": 0.290518581867218, + "learning_rate": 5.2640074181623535e-08, + "loss": 0.28, + "step": 52104 + }, + { + "epoch": 0.9673514701677911, + "grad_norm": 0.7249562740325928, + "learning_rate": 5.252061039848655e-08, + "loss": 0.1727, + "step": 52106 + }, + { + "epoch": 0.9673886003052097, + "grad_norm": 1.3534290790557861, + "learning_rate": 5.240128197074645e-08, + "loss": 0.3881, + "step": 52108 + }, + { + "epoch": 0.9674257304426284, + "grad_norm": 0.3808611333370209, + "learning_rate": 5.2282088900028615e-08, + "loss": 0.4356, + "step": 52110 + }, + { + "epoch": 0.967462860580047, + "grad_norm": 0.6429833769798279, + "learning_rate": 5.216303118795396e-08, + "loss": 0.3072, + "step": 52112 + }, + { + "epoch": 0.9674999907174656, + "grad_norm": 0.33377113938331604, + "learning_rate": 5.2044108836144525e-08, + "loss": 0.2278, + "step": 52114 + }, + { + "epoch": 0.9675371208548843, + "grad_norm": 0.25443726778030396, + "learning_rate": 5.1925321846216794e-08, + "loss": 0.2458, + "step": 52116 + }, + { + "epoch": 0.9675742509923029, + "grad_norm": 0.4367438852787018, + "learning_rate": 5.1806670219788357e-08, + "loss": 0.3858, + "step": 52118 + }, + { + "epoch": 0.9676113811297216, + "grad_norm": 0.3579873740673065, + "learning_rate": 5.168815395847127e-08, + "loss": 0.1641, + "step": 52120 + }, + { + "epoch": 0.9676485112671402, + "grad_norm": 0.3920475244522095, + "learning_rate": 5.156977306388089e-08, + "loss": 0.2737, + "step": 52122 + }, + { + "epoch": 0.9676856414045588, + "grad_norm": 0.34164249897003174, + "learning_rate": 5.145152753762595e-08, + "loss": 0.4361, + "step": 52124 + }, + { + "epoch": 0.9677227715419775, + "grad_norm": 0.6192024350166321, + "learning_rate": 5.133341738131625e-08, + "loss": 0.3249, + "step": 52126 + }, + { + "epoch": 0.9677599016793961, + "grad_norm": 0.4364921748638153, + "learning_rate": 5.121544259656053e-08, + "loss": 0.3604, + "step": 52128 + }, + { + "epoch": 0.9677970318168148, + "grad_norm": 0.2823511064052582, + "learning_rate": 5.109760318496082e-08, + "loss": 0.2594, + "step": 52130 + }, + { + "epoch": 0.9678341619542334, + "grad_norm": 0.4863722324371338, + "learning_rate": 5.0979899148123624e-08, + "loss": 0.2451, + "step": 52132 + }, + { + "epoch": 0.967871292091652, + "grad_norm": 0.3227176070213318, + "learning_rate": 5.086233048764877e-08, + "loss": 0.2854, + "step": 52134 + }, + { + "epoch": 0.9679084222290707, + "grad_norm": 0.26267120242118835, + "learning_rate": 5.07448972051372e-08, + "loss": 0.2057, + "step": 52136 + }, + { + "epoch": 0.9679455523664893, + "grad_norm": 0.42357468605041504, + "learning_rate": 5.062759930218541e-08, + "loss": 0.34, + "step": 52138 + }, + { + "epoch": 0.967982682503908, + "grad_norm": 0.35861295461654663, + "learning_rate": 5.051043678038992e-08, + "loss": 0.4334, + "step": 52140 + }, + { + "epoch": 0.9680198126413266, + "grad_norm": 0.5864925384521484, + "learning_rate": 5.039340964134498e-08, + "loss": 0.2277, + "step": 52142 + }, + { + "epoch": 0.9680569427787452, + "grad_norm": 0.5402977466583252, + "learning_rate": 5.0276517886643784e-08, + "loss": 0.2578, + "step": 52144 + }, + { + "epoch": 0.9680940729161639, + "grad_norm": 0.3470340371131897, + "learning_rate": 5.015976151787616e-08, + "loss": 0.1117, + "step": 52146 + }, + { + "epoch": 0.9681312030535825, + "grad_norm": 0.44760286808013916, + "learning_rate": 5.004314053663084e-08, + "loss": 0.1137, + "step": 52148 + }, + { + "epoch": 0.9681683331910012, + "grad_norm": 0.4367511570453644, + "learning_rate": 4.9926654944495447e-08, + "loss": 0.3033, + "step": 52150 + }, + { + "epoch": 0.9682054633284197, + "grad_norm": 0.5647957921028137, + "learning_rate": 4.981030474305426e-08, + "loss": 0.3964, + "step": 52152 + }, + { + "epoch": 0.9682425934658384, + "grad_norm": 0.32753831148147583, + "learning_rate": 4.9694089933889353e-08, + "loss": 0.2173, + "step": 52154 + }, + { + "epoch": 0.9682797236032571, + "grad_norm": 0.5231319069862366, + "learning_rate": 4.9578010518585015e-08, + "loss": 0.3118, + "step": 52156 + }, + { + "epoch": 0.9683168537406757, + "grad_norm": 0.5173414349555969, + "learning_rate": 4.9462066498717764e-08, + "loss": 0.2905, + "step": 52158 + }, + { + "epoch": 0.9683539838780943, + "grad_norm": 0.27994275093078613, + "learning_rate": 4.934625787586744e-08, + "loss": 0.1202, + "step": 52160 + }, + { + "epoch": 0.9683911140155129, + "grad_norm": 0.4744930863380432, + "learning_rate": 4.923058465160724e-08, + "loss": 0.3823, + "step": 52162 + }, + { + "epoch": 0.9684282441529316, + "grad_norm": 0.25886625051498413, + "learning_rate": 4.911504682751367e-08, + "loss": 0.31, + "step": 52164 + }, + { + "epoch": 0.9684653742903503, + "grad_norm": 0.4309345781803131, + "learning_rate": 4.899964440515881e-08, + "loss": 0.4171, + "step": 52166 + }, + { + "epoch": 0.9685025044277689, + "grad_norm": 0.2908543050289154, + "learning_rate": 4.8884377386111407e-08, + "loss": 0.2764, + "step": 52168 + }, + { + "epoch": 0.9685396345651875, + "grad_norm": 0.3983971178531647, + "learning_rate": 4.87692457719402e-08, + "loss": 0.3448, + "step": 52170 + }, + { + "epoch": 0.9685767647026061, + "grad_norm": 0.4219726622104645, + "learning_rate": 4.865424956421283e-08, + "loss": 0.3789, + "step": 52172 + }, + { + "epoch": 0.9686138948400248, + "grad_norm": 0.8485954403877258, + "learning_rate": 4.853938876449249e-08, + "loss": 0.2404, + "step": 52174 + }, + { + "epoch": 0.9686510249774435, + "grad_norm": 0.28808602690696716, + "learning_rate": 4.8424663374343485e-08, + "loss": 0.3382, + "step": 52176 + }, + { + "epoch": 0.9686881551148621, + "grad_norm": 0.3417855501174927, + "learning_rate": 4.831007339532678e-08, + "loss": 0.1123, + "step": 52178 + }, + { + "epoch": 0.9687252852522807, + "grad_norm": 0.37864014506340027, + "learning_rate": 4.8195618829001145e-08, + "loss": 0.3089, + "step": 52180 + }, + { + "epoch": 0.9687624153896993, + "grad_norm": 0.35535019636154175, + "learning_rate": 4.808129967692421e-08, + "loss": 0.2048, + "step": 52182 + }, + { + "epoch": 0.968799545527118, + "grad_norm": 0.9459232091903687, + "learning_rate": 4.7967115940650287e-08, + "loss": 0.1914, + "step": 52184 + }, + { + "epoch": 0.9688366756645367, + "grad_norm": 0.4172569215297699, + "learning_rate": 4.7853067621734806e-08, + "loss": 0.354, + "step": 52186 + }, + { + "epoch": 0.9688738058019553, + "grad_norm": 0.5325927734375, + "learning_rate": 4.773915472172874e-08, + "loss": 0.3308, + "step": 52188 + }, + { + "epoch": 0.9689109359393739, + "grad_norm": 0.7270974516868591, + "learning_rate": 4.762537724218308e-08, + "loss": 0.3195, + "step": 52190 + }, + { + "epoch": 0.9689480660767925, + "grad_norm": 0.2682584524154663, + "learning_rate": 4.751173518464436e-08, + "loss": 0.0756, + "step": 52192 + }, + { + "epoch": 0.9689851962142112, + "grad_norm": 0.2906855642795563, + "learning_rate": 4.739822855066023e-08, + "loss": 0.2218, + "step": 52194 + }, + { + "epoch": 0.9690223263516299, + "grad_norm": 0.3794834315776825, + "learning_rate": 4.7284857341773903e-08, + "loss": 0.2578, + "step": 52196 + }, + { + "epoch": 0.9690594564890485, + "grad_norm": 0.5507210493087769, + "learning_rate": 4.7171621559529704e-08, + "loss": 0.4141, + "step": 52198 + }, + { + "epoch": 0.9690965866264671, + "grad_norm": 0.3101101815700531, + "learning_rate": 4.70585212054675e-08, + "loss": 0.1847, + "step": 52200 + }, + { + "epoch": 0.9691337167638857, + "grad_norm": 0.2991110682487488, + "learning_rate": 4.694555628112607e-08, + "loss": 0.2987, + "step": 52202 + }, + { + "epoch": 0.9691708469013044, + "grad_norm": 0.6705873012542725, + "learning_rate": 4.683272678804307e-08, + "loss": 0.1893, + "step": 52204 + }, + { + "epoch": 0.969207977038723, + "grad_norm": 0.4680391550064087, + "learning_rate": 4.672003272775283e-08, + "loss": 0.1342, + "step": 52206 + }, + { + "epoch": 0.9692451071761417, + "grad_norm": 0.7091013789176941, + "learning_rate": 4.660747410178967e-08, + "loss": 0.0988, + "step": 52208 + }, + { + "epoch": 0.9692822373135603, + "grad_norm": 0.5330358147621155, + "learning_rate": 4.649505091168571e-08, + "loss": 0.2396, + "step": 52210 + }, + { + "epoch": 0.9693193674509789, + "grad_norm": 0.41658705472946167, + "learning_rate": 4.6382763158968615e-08, + "loss": 0.4732, + "step": 52212 + }, + { + "epoch": 0.9693564975883976, + "grad_norm": 0.40233591198921204, + "learning_rate": 4.627061084516826e-08, + "loss": 0.4603, + "step": 52214 + }, + { + "epoch": 0.9693936277258162, + "grad_norm": 0.44144904613494873, + "learning_rate": 4.615859397181011e-08, + "loss": 0.3123, + "step": 52216 + }, + { + "epoch": 0.9694307578632348, + "grad_norm": 0.38346484303474426, + "learning_rate": 4.604671254041848e-08, + "loss": 0.1842, + "step": 52218 + }, + { + "epoch": 0.9694678880006535, + "grad_norm": 0.38805896043777466, + "learning_rate": 4.5934966552515505e-08, + "loss": 0.2188, + "step": 52220 + }, + { + "epoch": 0.9695050181380721, + "grad_norm": 0.447449266910553, + "learning_rate": 4.582335600962107e-08, + "loss": 0.3932, + "step": 52222 + }, + { + "epoch": 0.9695421482754908, + "grad_norm": 0.3837672472000122, + "learning_rate": 4.571188091325507e-08, + "loss": 0.1795, + "step": 52224 + }, + { + "epoch": 0.9695792784129094, + "grad_norm": 0.44542455673217773, + "learning_rate": 4.5600541264934074e-08, + "loss": 0.4923, + "step": 52226 + }, + { + "epoch": 0.969616408550328, + "grad_norm": 0.617743968963623, + "learning_rate": 4.548933706617242e-08, + "loss": 0.2639, + "step": 52228 + }, + { + "epoch": 0.9696535386877467, + "grad_norm": 0.6876521110534668, + "learning_rate": 4.537826831848335e-08, + "loss": 0.2823, + "step": 52230 + }, + { + "epoch": 0.9696906688251653, + "grad_norm": 0.5498456954956055, + "learning_rate": 4.526733502337899e-08, + "loss": 0.2075, + "step": 52232 + }, + { + "epoch": 0.969727798962584, + "grad_norm": 0.4629049003124237, + "learning_rate": 4.515653718236812e-08, + "loss": 0.2346, + "step": 52234 + }, + { + "epoch": 0.9697649291000026, + "grad_norm": 0.3241240978240967, + "learning_rate": 4.5045874796958435e-08, + "loss": 0.3661, + "step": 52236 + }, + { + "epoch": 0.9698020592374212, + "grad_norm": 0.4844273626804352, + "learning_rate": 4.49353478686565e-08, + "loss": 0.23, + "step": 52238 + }, + { + "epoch": 0.9698391893748399, + "grad_norm": 0.5575023293495178, + "learning_rate": 4.482495639896445e-08, + "loss": 0.2491, + "step": 52240 + }, + { + "epoch": 0.9698763195122585, + "grad_norm": 0.5679180026054382, + "learning_rate": 4.4714700389386635e-08, + "loss": 0.2196, + "step": 52242 + }, + { + "epoch": 0.9699134496496772, + "grad_norm": 0.530428409576416, + "learning_rate": 4.4604579841422965e-08, + "loss": 0.2097, + "step": 52244 + }, + { + "epoch": 0.9699505797870958, + "grad_norm": 0.3517967462539673, + "learning_rate": 4.449459475657003e-08, + "loss": 0.4829, + "step": 52246 + }, + { + "epoch": 0.9699877099245144, + "grad_norm": 0.4275780916213989, + "learning_rate": 4.4384745136324406e-08, + "loss": 0.2136, + "step": 52248 + }, + { + "epoch": 0.9700248400619331, + "grad_norm": 0.4370857775211334, + "learning_rate": 4.427503098218378e-08, + "loss": 0.2589, + "step": 52250 + }, + { + "epoch": 0.9700619701993517, + "grad_norm": 0.38713881373405457, + "learning_rate": 4.416545229563807e-08, + "loss": 0.3396, + "step": 52252 + }, + { + "epoch": 0.9700991003367704, + "grad_norm": 0.4153943955898285, + "learning_rate": 4.405600907817942e-08, + "loss": 0.3046, + "step": 52254 + }, + { + "epoch": 0.970136230474189, + "grad_norm": 0.5087686777114868, + "learning_rate": 4.394670133129664e-08, + "loss": 0.3375, + "step": 52256 + }, + { + "epoch": 0.9701733606116076, + "grad_norm": 0.3228212296962738, + "learning_rate": 4.383752905647853e-08, + "loss": 0.0541, + "step": 52258 + }, + { + "epoch": 0.9702104907490262, + "grad_norm": 0.7833784222602844, + "learning_rate": 4.3728492255208364e-08, + "loss": 0.4716, + "step": 52260 + }, + { + "epoch": 0.9702476208864449, + "grad_norm": 0.4449281394481659, + "learning_rate": 4.361959092897161e-08, + "loss": 0.3141, + "step": 52262 + }, + { + "epoch": 0.9702847510238636, + "grad_norm": 0.4693848788738251, + "learning_rate": 4.351082507924931e-08, + "loss": 0.5738, + "step": 52264 + }, + { + "epoch": 0.9703218811612822, + "grad_norm": 0.4302957355976105, + "learning_rate": 4.340219470752138e-08, + "loss": 0.3716, + "step": 52266 + }, + { + "epoch": 0.9703590112987008, + "grad_norm": 0.4023655652999878, + "learning_rate": 4.3293699815266655e-08, + "loss": 0.2937, + "step": 52268 + }, + { + "epoch": 0.9703961414361194, + "grad_norm": 0.3765750527381897, + "learning_rate": 4.318534040396061e-08, + "loss": 0.2493, + "step": 52270 + }, + { + "epoch": 0.9704332715735381, + "grad_norm": 0.34252360463142395, + "learning_rate": 4.307711647507762e-08, + "loss": 0.2458, + "step": 52272 + }, + { + "epoch": 0.9704704017109568, + "grad_norm": 0.3828517496585846, + "learning_rate": 4.296902803009095e-08, + "loss": 0.3062, + "step": 52274 + }, + { + "epoch": 0.9705075318483753, + "grad_norm": 0.385883092880249, + "learning_rate": 4.286107507047055e-08, + "loss": 0.3781, + "step": 52276 + }, + { + "epoch": 0.970544661985794, + "grad_norm": 0.462250292301178, + "learning_rate": 4.275325759768634e-08, + "loss": 0.2805, + "step": 52278 + }, + { + "epoch": 0.9705817921232126, + "grad_norm": 0.43409058451652527, + "learning_rate": 4.2645575613204927e-08, + "loss": 0.249, + "step": 52280 + }, + { + "epoch": 0.9706189222606313, + "grad_norm": 0.44005900621414185, + "learning_rate": 4.253802911849181e-08, + "loss": 0.3421, + "step": 52282 + }, + { + "epoch": 0.97065605239805, + "grad_norm": 0.2576841413974762, + "learning_rate": 4.243061811500915e-08, + "loss": 0.2232, + "step": 52284 + }, + { + "epoch": 0.9706931825354685, + "grad_norm": 0.3891441822052002, + "learning_rate": 4.232334260422022e-08, + "loss": 0.3199, + "step": 52286 + }, + { + "epoch": 0.9707303126728872, + "grad_norm": 0.435314804315567, + "learning_rate": 4.2216202587583854e-08, + "loss": 0.2707, + "step": 52288 + }, + { + "epoch": 0.9707674428103058, + "grad_norm": 0.42135632038116455, + "learning_rate": 4.2109198066556666e-08, + "loss": 0.2789, + "step": 52290 + }, + { + "epoch": 0.9708045729477245, + "grad_norm": 0.34286144375801086, + "learning_rate": 4.200232904259749e-08, + "loss": 0.3169, + "step": 52292 + }, + { + "epoch": 0.9708417030851432, + "grad_norm": 0.6923753023147583, + "learning_rate": 4.1895595517158494e-08, + "loss": 0.3712, + "step": 52294 + }, + { + "epoch": 0.9708788332225617, + "grad_norm": 0.4062420427799225, + "learning_rate": 4.178899749169185e-08, + "loss": 0.3797, + "step": 52296 + }, + { + "epoch": 0.9709159633599804, + "grad_norm": 0.4525628387928009, + "learning_rate": 4.168253496764974e-08, + "loss": 0.2691, + "step": 52298 + }, + { + "epoch": 0.970953093497399, + "grad_norm": 0.5534800887107849, + "learning_rate": 4.157620794647876e-08, + "loss": 0.3463, + "step": 52300 + }, + { + "epoch": 0.9709902236348177, + "grad_norm": 0.37565261125564575, + "learning_rate": 4.147001642962667e-08, + "loss": 0.4954, + "step": 52302 + }, + { + "epoch": 0.9710273537722363, + "grad_norm": 0.24253802001476288, + "learning_rate": 4.1363960418538963e-08, + "loss": 0.2076, + "step": 52304 + }, + { + "epoch": 0.9710644839096549, + "grad_norm": 1.8483612537384033, + "learning_rate": 4.125803991465782e-08, + "loss": 0.2077, + "step": 52306 + }, + { + "epoch": 0.9711016140470736, + "grad_norm": 0.3528026342391968, + "learning_rate": 4.115225491942543e-08, + "loss": 0.1916, + "step": 52308 + }, + { + "epoch": 0.9711387441844922, + "grad_norm": 0.39766982197761536, + "learning_rate": 4.104660543428063e-08, + "loss": 0.2032, + "step": 52310 + }, + { + "epoch": 0.9711758743219109, + "grad_norm": 0.3801085650920868, + "learning_rate": 4.0941091460660055e-08, + "loss": 0.1953, + "step": 52312 + }, + { + "epoch": 0.9712130044593295, + "grad_norm": 0.34212860465049744, + "learning_rate": 4.083571300000144e-08, + "loss": 0.3623, + "step": 52314 + }, + { + "epoch": 0.9712501345967481, + "grad_norm": 0.3319641053676605, + "learning_rate": 4.073047005373698e-08, + "loss": 0.1739, + "step": 52316 + }, + { + "epoch": 0.9712872647341668, + "grad_norm": 0.4583672881126404, + "learning_rate": 4.062536262329997e-08, + "loss": 0.4254, + "step": 52318 + }, + { + "epoch": 0.9713243948715854, + "grad_norm": 0.47544699907302856, + "learning_rate": 4.052039071011926e-08, + "loss": 0.1431, + "step": 52320 + }, + { + "epoch": 0.9713615250090041, + "grad_norm": 0.3348153829574585, + "learning_rate": 4.041555431562372e-08, + "loss": 0.2437, + "step": 52322 + }, + { + "epoch": 0.9713986551464227, + "grad_norm": 0.5156077742576599, + "learning_rate": 4.031085344124108e-08, + "loss": 0.3041, + "step": 52324 + }, + { + "epoch": 0.9714357852838413, + "grad_norm": 0.4214031398296356, + "learning_rate": 4.020628808839466e-08, + "loss": 0.3558, + "step": 52326 + }, + { + "epoch": 0.97147291542126, + "grad_norm": 0.27523618936538696, + "learning_rate": 4.0101858258507764e-08, + "loss": 0.4049, + "step": 52328 + }, + { + "epoch": 0.9715100455586786, + "grad_norm": 0.2950456738471985, + "learning_rate": 3.9997563953000364e-08, + "loss": 0.3094, + "step": 52330 + }, + { + "epoch": 0.9715471756960973, + "grad_norm": 0.4403890073299408, + "learning_rate": 3.989340517329354e-08, + "loss": 0.35, + "step": 52332 + }, + { + "epoch": 0.9715843058335158, + "grad_norm": 0.22450010478496552, + "learning_rate": 3.978938192080284e-08, + "loss": 0.2865, + "step": 52334 + }, + { + "epoch": 0.9716214359709345, + "grad_norm": 0.34587424993515015, + "learning_rate": 3.9685494196944894e-08, + "loss": 0.1388, + "step": 52336 + }, + { + "epoch": 0.9716585661083532, + "grad_norm": 0.5718381404876709, + "learning_rate": 3.958174200313192e-08, + "loss": 0.3292, + "step": 52338 + }, + { + "epoch": 0.9716956962457718, + "grad_norm": 0.5066449642181396, + "learning_rate": 3.947812534077722e-08, + "loss": 0.2451, + "step": 52340 + }, + { + "epoch": 0.9717328263831905, + "grad_norm": 0.46121588349342346, + "learning_rate": 3.937464421128967e-08, + "loss": 0.5591, + "step": 52342 + }, + { + "epoch": 0.971769956520609, + "grad_norm": 0.4850556552410126, + "learning_rate": 3.9271298616077037e-08, + "loss": 0.2448, + "step": 52344 + }, + { + "epoch": 0.9718070866580277, + "grad_norm": 0.3433486521244049, + "learning_rate": 3.916808855654819e-08, + "loss": 0.3022, + "step": 52346 + }, + { + "epoch": 0.9718442167954464, + "grad_norm": 0.5071460008621216, + "learning_rate": 3.906501403410423e-08, + "loss": 0.2032, + "step": 52348 + }, + { + "epoch": 0.971881346932865, + "grad_norm": 0.2898614704608917, + "learning_rate": 3.896207505014848e-08, + "loss": 0.147, + "step": 52350 + }, + { + "epoch": 0.9719184770702837, + "grad_norm": 0.4271768629550934, + "learning_rate": 3.885927160608316e-08, + "loss": 0.2297, + "step": 52352 + }, + { + "epoch": 0.9719556072077022, + "grad_norm": 0.44398200511932373, + "learning_rate": 3.875660370330603e-08, + "loss": 0.1111, + "step": 52354 + }, + { + "epoch": 0.9719927373451209, + "grad_norm": 0.6286386847496033, + "learning_rate": 3.865407134321264e-08, + "loss": 0.4717, + "step": 52356 + }, + { + "epoch": 0.9720298674825395, + "grad_norm": 0.3571556806564331, + "learning_rate": 3.855167452720188e-08, + "loss": 0.3092, + "step": 52358 + }, + { + "epoch": 0.9720669976199582, + "grad_norm": 0.34171995520591736, + "learning_rate": 3.844941325666374e-08, + "loss": 0.2387, + "step": 52360 + }, + { + "epoch": 0.9721041277573769, + "grad_norm": 0.5210464000701904, + "learning_rate": 3.834728753299044e-08, + "loss": 0.2233, + "step": 52362 + }, + { + "epoch": 0.9721412578947954, + "grad_norm": 0.38443824648857117, + "learning_rate": 3.824529735757199e-08, + "loss": 0.2382, + "step": 52364 + }, + { + "epoch": 0.9721783880322141, + "grad_norm": 0.24594590067863464, + "learning_rate": 3.8143442731796154e-08, + "loss": 0.1391, + "step": 52366 + }, + { + "epoch": 0.9722155181696327, + "grad_norm": 0.27534717321395874, + "learning_rate": 3.804172365704961e-08, + "loss": 0.2746, + "step": 52368 + }, + { + "epoch": 0.9722526483070514, + "grad_norm": 0.258372038602829, + "learning_rate": 3.794014013471459e-08, + "loss": 0.3865, + "step": 52370 + }, + { + "epoch": 0.9722897784444701, + "grad_norm": 0.5005186200141907, + "learning_rate": 3.783869216617553e-08, + "loss": 0.3906, + "step": 52372 + }, + { + "epoch": 0.9723269085818886, + "grad_norm": 0.7120698690414429, + "learning_rate": 3.773737975281133e-08, + "loss": 0.3128, + "step": 52374 + }, + { + "epoch": 0.9723640387193073, + "grad_norm": 0.3213861286640167, + "learning_rate": 3.763620289600089e-08, + "loss": 0.1925, + "step": 52376 + }, + { + "epoch": 0.9724011688567259, + "grad_norm": 0.5673524737358093, + "learning_rate": 3.753516159712089e-08, + "loss": 0.4235, + "step": 52378 + }, + { + "epoch": 0.9724382989941446, + "grad_norm": 0.6401824951171875, + "learning_rate": 3.7434255857545786e-08, + "loss": 0.2199, + "step": 52380 + }, + { + "epoch": 0.9724754291315633, + "grad_norm": 0.3292693793773651, + "learning_rate": 3.733348567865003e-08, + "loss": 0.1266, + "step": 52382 + }, + { + "epoch": 0.9725125592689818, + "grad_norm": 0.4881197214126587, + "learning_rate": 3.7232851061803635e-08, + "loss": 0.307, + "step": 52384 + }, + { + "epoch": 0.9725496894064005, + "grad_norm": 0.4343830645084381, + "learning_rate": 3.713235200837662e-08, + "loss": 0.2965, + "step": 52386 + }, + { + "epoch": 0.9725868195438191, + "grad_norm": 0.20725177228450775, + "learning_rate": 3.7031988519735664e-08, + "loss": 0.3818, + "step": 52388 + }, + { + "epoch": 0.9726239496812378, + "grad_norm": 0.3907625675201416, + "learning_rate": 3.6931760597246345e-08, + "loss": 0.4011, + "step": 52390 + }, + { + "epoch": 0.9726610798186565, + "grad_norm": 0.47869133949279785, + "learning_rate": 3.683166824227313e-08, + "loss": 0.5723, + "step": 52392 + }, + { + "epoch": 0.972698209956075, + "grad_norm": 0.4184792935848236, + "learning_rate": 3.673171145617827e-08, + "loss": 0.3221, + "step": 52394 + }, + { + "epoch": 0.9727353400934937, + "grad_norm": 0.5446518063545227, + "learning_rate": 3.663189024032177e-08, + "loss": 0.1639, + "step": 52396 + }, + { + "epoch": 0.9727724702309123, + "grad_norm": 0.2734091281890869, + "learning_rate": 3.6532204596060326e-08, + "loss": 0.3196, + "step": 52398 + }, + { + "epoch": 0.972809600368331, + "grad_norm": 0.6280050277709961, + "learning_rate": 3.6432654524751755e-08, + "loss": 0.1841, + "step": 52400 + }, + { + "epoch": 0.9728467305057497, + "grad_norm": 0.39039379358291626, + "learning_rate": 3.6333240027750515e-08, + "loss": 0.18, + "step": 52402 + }, + { + "epoch": 0.9728838606431682, + "grad_norm": 0.5928041338920593, + "learning_rate": 3.623396110640887e-08, + "loss": 0.2752, + "step": 52404 + }, + { + "epoch": 0.9729209907805869, + "grad_norm": 0.46616432070732117, + "learning_rate": 3.6134817762079053e-08, + "loss": 0.3744, + "step": 52406 + }, + { + "epoch": 0.9729581209180055, + "grad_norm": 0.3793087899684906, + "learning_rate": 3.6035809996108894e-08, + "loss": 0.1487, + "step": 52408 + }, + { + "epoch": 0.9729952510554242, + "grad_norm": 0.3221988379955292, + "learning_rate": 3.5936937809845087e-08, + "loss": 0.2328, + "step": 52410 + }, + { + "epoch": 0.9730323811928427, + "grad_norm": 0.5459412932395935, + "learning_rate": 3.5838201204634327e-08, + "loss": 0.1445, + "step": 52412 + }, + { + "epoch": 0.9730695113302614, + "grad_norm": 0.43969154357910156, + "learning_rate": 3.573960018181999e-08, + "loss": 0.2808, + "step": 52414 + }, + { + "epoch": 0.9731066414676801, + "grad_norm": 0.36072394251823425, + "learning_rate": 3.564113474274211e-08, + "loss": 0.2578, + "step": 52416 + }, + { + "epoch": 0.9731437716050987, + "grad_norm": 0.35191261768341064, + "learning_rate": 3.5542804888742954e-08, + "loss": 0.1617, + "step": 52418 + }, + { + "epoch": 0.9731809017425174, + "grad_norm": 0.46096497774124146, + "learning_rate": 3.544461062115812e-08, + "loss": 0.3327, + "step": 52420 + }, + { + "epoch": 0.9732180318799359, + "grad_norm": 0.36141785979270935, + "learning_rate": 3.5346551941325414e-08, + "loss": 0.2449, + "step": 52422 + }, + { + "epoch": 0.9732551620173546, + "grad_norm": 0.30802807211875916, + "learning_rate": 3.5248628850578224e-08, + "loss": 0.2961, + "step": 52424 + }, + { + "epoch": 0.9732922921547733, + "grad_norm": 0.4033017158508301, + "learning_rate": 3.515084135024993e-08, + "loss": 0.3185, + "step": 52426 + }, + { + "epoch": 0.9733294222921919, + "grad_norm": 0.32289233803749084, + "learning_rate": 3.505318944166947e-08, + "loss": 0.1804, + "step": 52428 + }, + { + "epoch": 0.9733665524296106, + "grad_norm": 0.24146059155464172, + "learning_rate": 3.495567312616688e-08, + "loss": 0.1335, + "step": 52430 + }, + { + "epoch": 0.9734036825670291, + "grad_norm": 0.41892123222351074, + "learning_rate": 3.4858292405069996e-08, + "loss": 0.1674, + "step": 52432 + }, + { + "epoch": 0.9734408127044478, + "grad_norm": 0.34480446577072144, + "learning_rate": 3.47610472797022e-08, + "loss": 0.1479, + "step": 52434 + }, + { + "epoch": 0.9734779428418665, + "grad_norm": 0.31480976939201355, + "learning_rate": 3.4663937751386875e-08, + "loss": 0.4589, + "step": 52436 + }, + { + "epoch": 0.9735150729792851, + "grad_norm": 0.5684669017791748, + "learning_rate": 3.45669638214452e-08, + "loss": 0.5442, + "step": 52438 + }, + { + "epoch": 0.9735522031167038, + "grad_norm": 0.2442110776901245, + "learning_rate": 3.4470125491198324e-08, + "loss": 0.433, + "step": 52440 + }, + { + "epoch": 0.9735893332541223, + "grad_norm": 0.4478112757205963, + "learning_rate": 3.437342276196187e-08, + "loss": 0.2489, + "step": 52442 + }, + { + "epoch": 0.973626463391541, + "grad_norm": 0.29075488448143005, + "learning_rate": 3.427685563505256e-08, + "loss": 0.3118, + "step": 52444 + }, + { + "epoch": 0.9736635935289597, + "grad_norm": 0.5737714767456055, + "learning_rate": 3.4180424111784905e-08, + "loss": 0.2905, + "step": 52446 + }, + { + "epoch": 0.9737007236663783, + "grad_norm": 0.47367018461227417, + "learning_rate": 3.4084128193471176e-08, + "loss": 0.4879, + "step": 52448 + }, + { + "epoch": 0.973737853803797, + "grad_norm": 0.33778467774391174, + "learning_rate": 3.3987967881420333e-08, + "loss": 0.213, + "step": 52450 + }, + { + "epoch": 0.9737749839412155, + "grad_norm": 0.4188460409641266, + "learning_rate": 3.389194317694133e-08, + "loss": 0.2444, + "step": 52452 + }, + { + "epoch": 0.9738121140786342, + "grad_norm": 0.48974165320396423, + "learning_rate": 3.3796054081342014e-08, + "loss": 0.3504, + "step": 52454 + }, + { + "epoch": 0.9738492442160528, + "grad_norm": 0.3058036267757416, + "learning_rate": 3.3700300595924664e-08, + "loss": 0.1649, + "step": 52456 + }, + { + "epoch": 0.9738863743534715, + "grad_norm": 0.47437435388565063, + "learning_rate": 3.360468272199491e-08, + "loss": 0.134, + "step": 52458 + }, + { + "epoch": 0.9739235044908902, + "grad_norm": 0.3037269413471222, + "learning_rate": 3.350920046085282e-08, + "loss": 0.1854, + "step": 52460 + }, + { + "epoch": 0.9739606346283087, + "grad_norm": 0.3654553294181824, + "learning_rate": 3.341385381379625e-08, + "loss": 0.3039, + "step": 52462 + }, + { + "epoch": 0.9739977647657274, + "grad_norm": 0.36664679646492004, + "learning_rate": 3.331864278212526e-08, + "loss": 0.0863, + "step": 52464 + }, + { + "epoch": 0.974034894903146, + "grad_norm": 0.6627983450889587, + "learning_rate": 3.322356736713217e-08, + "loss": 0.2081, + "step": 52466 + }, + { + "epoch": 0.9740720250405647, + "grad_norm": 0.3373587727546692, + "learning_rate": 3.3128627570113705e-08, + "loss": 0.2321, + "step": 52468 + }, + { + "epoch": 0.9741091551779834, + "grad_norm": 0.3118228614330292, + "learning_rate": 3.3033823392359945e-08, + "loss": 0.265, + "step": 52470 + }, + { + "epoch": 0.9741462853154019, + "grad_norm": 0.4530518054962158, + "learning_rate": 3.293915483516097e-08, + "loss": 0.1735, + "step": 52472 + }, + { + "epoch": 0.9741834154528206, + "grad_norm": 0.3990160822868347, + "learning_rate": 3.284462189980575e-08, + "loss": 0.2922, + "step": 52474 + }, + { + "epoch": 0.9742205455902392, + "grad_norm": 0.5626431107521057, + "learning_rate": 3.2750224587579924e-08, + "loss": 0.3601, + "step": 52476 + }, + { + "epoch": 0.9742576757276579, + "grad_norm": 0.6035019755363464, + "learning_rate": 3.2655962899768027e-08, + "loss": 0.3731, + "step": 52478 + }, + { + "epoch": 0.9742948058650766, + "grad_norm": 0.28266674280166626, + "learning_rate": 3.256183683765346e-08, + "loss": 0.232, + "step": 52480 + }, + { + "epoch": 0.9743319360024951, + "grad_norm": 0.3526769280433655, + "learning_rate": 3.2467846402516324e-08, + "loss": 0.4141, + "step": 52482 + }, + { + "epoch": 0.9743690661399138, + "grad_norm": 0.5029996633529663, + "learning_rate": 3.237399159563448e-08, + "loss": 0.2065, + "step": 52484 + }, + { + "epoch": 0.9744061962773324, + "grad_norm": 0.47557756304740906, + "learning_rate": 3.228027241828691e-08, + "loss": 0.3551, + "step": 52486 + }, + { + "epoch": 0.9744433264147511, + "grad_norm": 0.41370537877082825, + "learning_rate": 3.218668887174814e-08, + "loss": 0.2723, + "step": 52488 + }, + { + "epoch": 0.9744804565521697, + "grad_norm": 0.36553439497947693, + "learning_rate": 3.20932409572916e-08, + "loss": 0.1588, + "step": 52490 + }, + { + "epoch": 0.9745175866895883, + "grad_norm": 0.3766074478626251, + "learning_rate": 3.1999928676188505e-08, + "loss": 0.1288, + "step": 52492 + }, + { + "epoch": 0.974554716827007, + "grad_norm": 0.5123952627182007, + "learning_rate": 3.190675202970783e-08, + "loss": 0.2449, + "step": 52494 + }, + { + "epoch": 0.9745918469644256, + "grad_norm": 0.22378191351890564, + "learning_rate": 3.181371101911968e-08, + "loss": 0.2137, + "step": 52496 + }, + { + "epoch": 0.9746289771018443, + "grad_norm": 0.3810320794582367, + "learning_rate": 3.172080564568858e-08, + "loss": 0.3039, + "step": 52498 + }, + { + "epoch": 0.974666107239263, + "grad_norm": 0.46907705068588257, + "learning_rate": 3.162803591067909e-08, + "loss": 0.1964, + "step": 52500 + }, + { + "epoch": 0.9747032373766815, + "grad_norm": 0.39222970604896545, + "learning_rate": 3.1535401815352416e-08, + "loss": 0.2044, + "step": 52502 + }, + { + "epoch": 0.9747403675141002, + "grad_norm": 0.5220727324485779, + "learning_rate": 3.144290336096978e-08, + "loss": 0.2374, + "step": 52504 + }, + { + "epoch": 0.9747774976515188, + "grad_norm": 0.2719477117061615, + "learning_rate": 3.1350540548791276e-08, + "loss": 0.1044, + "step": 52506 + }, + { + "epoch": 0.9748146277889375, + "grad_norm": 0.3455660343170166, + "learning_rate": 3.1258313380070345e-08, + "loss": 0.22, + "step": 52508 + }, + { + "epoch": 0.974851757926356, + "grad_norm": 0.34028542041778564, + "learning_rate": 3.1166221856065995e-08, + "loss": 0.2956, + "step": 52510 + }, + { + "epoch": 0.9748888880637747, + "grad_norm": 0.33966779708862305, + "learning_rate": 3.107426597802832e-08, + "loss": 0.5051, + "step": 52512 + }, + { + "epoch": 0.9749260182011934, + "grad_norm": 0.5566272139549255, + "learning_rate": 3.0982445747209656e-08, + "loss": 0.2726, + "step": 52514 + }, + { + "epoch": 0.974963148338612, + "grad_norm": 0.3141480088233948, + "learning_rate": 3.0890761164859004e-08, + "loss": 0.3139, + "step": 52516 + }, + { + "epoch": 0.9750002784760307, + "grad_norm": 0.2885169982910156, + "learning_rate": 3.0799212232224265e-08, + "loss": 0.233, + "step": 52518 + }, + { + "epoch": 0.9750374086134492, + "grad_norm": 0.38074564933776855, + "learning_rate": 3.0707798950551095e-08, + "loss": 0.1124, + "step": 52520 + }, + { + "epoch": 0.9750745387508679, + "grad_norm": 0.49517300724983215, + "learning_rate": 3.061652132108295e-08, + "loss": 0.3213, + "step": 52522 + }, + { + "epoch": 0.9751116688882866, + "grad_norm": 0.5436496734619141, + "learning_rate": 3.052537934506328e-08, + "loss": 0.3126, + "step": 52524 + }, + { + "epoch": 0.9751487990257052, + "grad_norm": 0.38221463561058044, + "learning_rate": 3.043437302372998e-08, + "loss": 0.2795, + "step": 52526 + }, + { + "epoch": 0.9751859291631239, + "grad_norm": 0.2784236967563629, + "learning_rate": 3.034350235832317e-08, + "loss": 0.3699, + "step": 52528 + }, + { + "epoch": 0.9752230593005424, + "grad_norm": 0.2580036520957947, + "learning_rate": 3.025276735007854e-08, + "loss": 0.3234, + "step": 52530 + }, + { + "epoch": 0.9752601894379611, + "grad_norm": 0.37121567130088806, + "learning_rate": 3.016216800023064e-08, + "loss": 0.0849, + "step": 52532 + }, + { + "epoch": 0.9752973195753798, + "grad_norm": 0.23808827996253967, + "learning_rate": 3.007170431001294e-08, + "loss": 0.2437, + "step": 52534 + }, + { + "epoch": 0.9753344497127984, + "grad_norm": 0.2764566242694855, + "learning_rate": 2.998137628065556e-08, + "loss": 0.2789, + "step": 52536 + }, + { + "epoch": 0.975371579850217, + "grad_norm": 0.39689087867736816, + "learning_rate": 2.9891183913387525e-08, + "loss": 0.3759, + "step": 52538 + }, + { + "epoch": 0.9754087099876356, + "grad_norm": 0.3870512545108795, + "learning_rate": 2.980112720943784e-08, + "loss": 0.1843, + "step": 52540 + }, + { + "epoch": 0.9754458401250543, + "grad_norm": 0.5041999816894531, + "learning_rate": 2.9711206170029983e-08, + "loss": 0.2334, + "step": 52542 + }, + { + "epoch": 0.975482970262473, + "grad_norm": 0.3743602931499481, + "learning_rate": 2.9621420796387414e-08, + "loss": 0.2127, + "step": 52544 + }, + { + "epoch": 0.9755201003998916, + "grad_norm": 0.215674489736557, + "learning_rate": 2.9531771089733598e-08, + "loss": 0.3453, + "step": 52546 + }, + { + "epoch": 0.9755572305373102, + "grad_norm": 0.34515440464019775, + "learning_rate": 2.944225705128756e-08, + "loss": 0.4175, + "step": 52548 + }, + { + "epoch": 0.9755943606747288, + "grad_norm": 0.25181716680526733, + "learning_rate": 2.9352878682266106e-08, + "loss": 0.1931, + "step": 52550 + }, + { + "epoch": 0.9756314908121475, + "grad_norm": 0.35617464780807495, + "learning_rate": 2.926363598388715e-08, + "loss": 0.1598, + "step": 52552 + }, + { + "epoch": 0.9756686209495661, + "grad_norm": 0.32083582878112793, + "learning_rate": 2.9174528957364167e-08, + "loss": 0.3056, + "step": 52554 + }, + { + "epoch": 0.9757057510869848, + "grad_norm": 0.3217821717262268, + "learning_rate": 2.908555760390952e-08, + "loss": 0.2996, + "step": 52556 + }, + { + "epoch": 0.9757428812244034, + "grad_norm": 0.3042164146900177, + "learning_rate": 2.899672192473446e-08, + "loss": 0.1812, + "step": 52558 + }, + { + "epoch": 0.975780011361822, + "grad_norm": 0.4111628234386444, + "learning_rate": 2.8908021921046914e-08, + "loss": 0.4655, + "step": 52560 + }, + { + "epoch": 0.9758171414992407, + "grad_norm": 0.2899168133735657, + "learning_rate": 2.88194575940548e-08, + "loss": 0.1325, + "step": 52562 + }, + { + "epoch": 0.9758542716366593, + "grad_norm": 0.34224218130111694, + "learning_rate": 2.8731028944962714e-08, + "loss": 0.2113, + "step": 52564 + }, + { + "epoch": 0.975891401774078, + "grad_norm": 0.538062334060669, + "learning_rate": 2.8642735974973024e-08, + "loss": 0.3183, + "step": 52566 + }, + { + "epoch": 0.9759285319114966, + "grad_norm": 0.3536325693130493, + "learning_rate": 2.8554578685289214e-08, + "loss": 0.294, + "step": 52568 + }, + { + "epoch": 0.9759656620489152, + "grad_norm": 0.3570454716682434, + "learning_rate": 2.8466557077108103e-08, + "loss": 0.4027, + "step": 52570 + }, + { + "epoch": 0.9760027921863339, + "grad_norm": 0.3187287747859955, + "learning_rate": 2.8378671151630954e-08, + "loss": 0.2558, + "step": 52572 + }, + { + "epoch": 0.9760399223237525, + "grad_norm": 0.40225204825401306, + "learning_rate": 2.8290920910050146e-08, + "loss": 0.2772, + "step": 52574 + }, + { + "epoch": 0.9760770524611712, + "grad_norm": 0.32322025299072266, + "learning_rate": 2.820330635356139e-08, + "loss": 0.3288, + "step": 52576 + }, + { + "epoch": 0.9761141825985898, + "grad_norm": 0.4910758435726166, + "learning_rate": 2.811582748335706e-08, + "loss": 0.2053, + "step": 52578 + }, + { + "epoch": 0.9761513127360084, + "grad_norm": 0.6543188691139221, + "learning_rate": 2.8028484300627322e-08, + "loss": 0.226, + "step": 52580 + }, + { + "epoch": 0.9761884428734271, + "grad_norm": 0.3032397925853729, + "learning_rate": 2.7941276806560112e-08, + "loss": 0.221, + "step": 52582 + }, + { + "epoch": 0.9762255730108457, + "grad_norm": 0.3602171242237091, + "learning_rate": 2.785420500234226e-08, + "loss": 0.2157, + "step": 52584 + }, + { + "epoch": 0.9762627031482644, + "grad_norm": 0.48271968960762024, + "learning_rate": 2.7767268889158373e-08, + "loss": 0.2133, + "step": 52586 + }, + { + "epoch": 0.976299833285683, + "grad_norm": 0.19905145466327667, + "learning_rate": 2.7680468468191947e-08, + "loss": 0.2804, + "step": 52588 + }, + { + "epoch": 0.9763369634231016, + "grad_norm": 0.21626682579517365, + "learning_rate": 2.759380374062426e-08, + "loss": 0.3924, + "step": 52590 + }, + { + "epoch": 0.9763740935605203, + "grad_norm": 0.35485005378723145, + "learning_rate": 2.750727470763326e-08, + "loss": 0.1978, + "step": 52592 + }, + { + "epoch": 0.9764112236979389, + "grad_norm": 0.31072521209716797, + "learning_rate": 2.742088137039689e-08, + "loss": 0.0933, + "step": 52594 + }, + { + "epoch": 0.9764483538353576, + "grad_norm": 0.5642861127853394, + "learning_rate": 2.733462373009199e-08, + "loss": 0.2589, + "step": 52596 + }, + { + "epoch": 0.9764854839727762, + "grad_norm": 0.27900439500808716, + "learning_rate": 2.7248501787890957e-08, + "loss": 0.3788, + "step": 52598 + }, + { + "epoch": 0.9765226141101948, + "grad_norm": 0.5054538249969482, + "learning_rate": 2.7162515544966182e-08, + "loss": 0.2025, + "step": 52600 + }, + { + "epoch": 0.9765597442476135, + "grad_norm": 0.32331642508506775, + "learning_rate": 2.7076665002486734e-08, + "loss": 0.4825, + "step": 52602 + }, + { + "epoch": 0.9765968743850321, + "grad_norm": 0.37884241342544556, + "learning_rate": 2.6990950161622787e-08, + "loss": 0.2509, + "step": 52604 + }, + { + "epoch": 0.9766340045224507, + "grad_norm": 0.5432556867599487, + "learning_rate": 2.6905371023537852e-08, + "loss": 0.2735, + "step": 52606 + }, + { + "epoch": 0.9766711346598693, + "grad_norm": 0.399818480014801, + "learning_rate": 2.6819927589398775e-08, + "loss": 0.2318, + "step": 52608 + }, + { + "epoch": 0.976708264797288, + "grad_norm": 0.25092265009880066, + "learning_rate": 2.673461986036796e-08, + "loss": 0.3856, + "step": 52610 + }, + { + "epoch": 0.9767453949347067, + "grad_norm": 0.48335975408554077, + "learning_rate": 2.664944783760448e-08, + "loss": 0.2507, + "step": 52612 + }, + { + "epoch": 0.9767825250721253, + "grad_norm": 0.5250675082206726, + "learning_rate": 2.656441152226852e-08, + "loss": 0.275, + "step": 52614 + }, + { + "epoch": 0.976819655209544, + "grad_norm": 0.3093653917312622, + "learning_rate": 2.6479510915516926e-08, + "loss": 0.4943, + "step": 52616 + }, + { + "epoch": 0.9768567853469625, + "grad_norm": 0.3774946630001068, + "learning_rate": 2.6394746018505447e-08, + "loss": 0.2235, + "step": 52618 + }, + { + "epoch": 0.9768939154843812, + "grad_norm": 0.4391375482082367, + "learning_rate": 2.631011683238649e-08, + "loss": 0.4435, + "step": 52620 + }, + { + "epoch": 0.9769310456217999, + "grad_norm": 0.2906727194786072, + "learning_rate": 2.622562335831358e-08, + "loss": 0.2458, + "step": 52622 + }, + { + "epoch": 0.9769681757592185, + "grad_norm": 0.5192936658859253, + "learning_rate": 2.6141265597433574e-08, + "loss": 0.2049, + "step": 52624 + }, + { + "epoch": 0.9770053058966371, + "grad_norm": 1.1134597063064575, + "learning_rate": 2.6057043550896667e-08, + "loss": 0.4149, + "step": 52626 + }, + { + "epoch": 0.9770424360340557, + "grad_norm": 0.20615707337856293, + "learning_rate": 2.5972957219847494e-08, + "loss": 0.2737, + "step": 52628 + }, + { + "epoch": 0.9770795661714744, + "grad_norm": 0.5077775120735168, + "learning_rate": 2.58890066054307e-08, + "loss": 0.1686, + "step": 52630 + }, + { + "epoch": 0.9771166963088931, + "grad_norm": 0.3013204038143158, + "learning_rate": 2.5805191708788703e-08, + "loss": 0.1169, + "step": 52632 + }, + { + "epoch": 0.9771538264463117, + "grad_norm": 0.372030645608902, + "learning_rate": 2.57215125310617e-08, + "loss": 0.4087, + "step": 52634 + }, + { + "epoch": 0.9771909565837303, + "grad_norm": 0.3397143483161926, + "learning_rate": 2.563796907338878e-08, + "loss": 0.1733, + "step": 52636 + }, + { + "epoch": 0.9772280867211489, + "grad_norm": 0.30003345012664795, + "learning_rate": 2.5554561336906814e-08, + "loss": 0.1496, + "step": 52638 + }, + { + "epoch": 0.9772652168585676, + "grad_norm": 0.2165130078792572, + "learning_rate": 2.5471289322749338e-08, + "loss": 0.0353, + "step": 52640 + }, + { + "epoch": 0.9773023469959863, + "grad_norm": 0.5763567686080933, + "learning_rate": 2.5388153032051e-08, + "loss": 0.3336, + "step": 52642 + }, + { + "epoch": 0.9773394771334049, + "grad_norm": 0.30017635226249695, + "learning_rate": 2.5305152465943116e-08, + "loss": 0.1733, + "step": 52644 + }, + { + "epoch": 0.9773766072708235, + "grad_norm": 0.511091947555542, + "learning_rate": 2.5222287625553675e-08, + "loss": 0.122, + "step": 52646 + }, + { + "epoch": 0.9774137374082421, + "grad_norm": 0.32595300674438477, + "learning_rate": 2.5139558512011776e-08, + "loss": 0.4068, + "step": 52648 + }, + { + "epoch": 0.9774508675456608, + "grad_norm": 0.2640933692455292, + "learning_rate": 2.505696512644096e-08, + "loss": 0.3098, + "step": 52650 + }, + { + "epoch": 0.9774879976830795, + "grad_norm": 0.3979596793651581, + "learning_rate": 2.497450746996699e-08, + "loss": 0.0657, + "step": 52652 + }, + { + "epoch": 0.977525127820498, + "grad_norm": 0.5992537140846252, + "learning_rate": 2.48921855437112e-08, + "loss": 0.4477, + "step": 52654 + }, + { + "epoch": 0.9775622579579167, + "grad_norm": 0.3509773910045624, + "learning_rate": 2.48099993487938e-08, + "loss": 0.3969, + "step": 52656 + }, + { + "epoch": 0.9775993880953353, + "grad_norm": 0.43008869886398315, + "learning_rate": 2.4727948886333898e-08, + "loss": 0.5364, + "step": 52658 + }, + { + "epoch": 0.977636518232754, + "grad_norm": 0.31644877791404724, + "learning_rate": 2.4646034157447264e-08, + "loss": 0.1419, + "step": 52660 + }, + { + "epoch": 0.9776736483701726, + "grad_norm": 0.5948702692985535, + "learning_rate": 2.456425516324745e-08, + "loss": 0.273, + "step": 52662 + }, + { + "epoch": 0.9777107785075912, + "grad_norm": 0.3356820046901703, + "learning_rate": 2.448261190484913e-08, + "loss": 0.2393, + "step": 52664 + }, + { + "epoch": 0.9777479086450099, + "grad_norm": 0.6614366769790649, + "learning_rate": 2.4401104383361406e-08, + "loss": 0.4151, + "step": 52666 + }, + { + "epoch": 0.9777850387824285, + "grad_norm": 0.37577515840530396, + "learning_rate": 2.431973259989562e-08, + "loss": 0.3648, + "step": 52668 + }, + { + "epoch": 0.9778221689198472, + "grad_norm": 0.43406444787979126, + "learning_rate": 2.4238496555556434e-08, + "loss": 0.281, + "step": 52670 + }, + { + "epoch": 0.9778592990572658, + "grad_norm": 0.2981327176094055, + "learning_rate": 2.415739625145075e-08, + "loss": 0.393, + "step": 52672 + }, + { + "epoch": 0.9778964291946844, + "grad_norm": 0.44977495074272156, + "learning_rate": 2.407643168868212e-08, + "loss": 0.3544, + "step": 52674 + }, + { + "epoch": 0.9779335593321031, + "grad_norm": 0.3680245280265808, + "learning_rate": 2.3995602868351897e-08, + "loss": 0.1886, + "step": 52676 + }, + { + "epoch": 0.9779706894695217, + "grad_norm": 0.2316843867301941, + "learning_rate": 2.3914909791559195e-08, + "loss": 0.364, + "step": 52678 + }, + { + "epoch": 0.9780078196069404, + "grad_norm": 0.4484330415725708, + "learning_rate": 2.3834352459403132e-08, + "loss": 0.2541, + "step": 52680 + }, + { + "epoch": 0.978044949744359, + "grad_norm": 0.42768245935440063, + "learning_rate": 2.3753930872979502e-08, + "loss": 0.2974, + "step": 52682 + }, + { + "epoch": 0.9780820798817776, + "grad_norm": 0.49718043208122253, + "learning_rate": 2.3673645033382985e-08, + "loss": 0.3228, + "step": 52684 + }, + { + "epoch": 0.9781192100191963, + "grad_norm": 0.3892648220062256, + "learning_rate": 2.359349494170493e-08, + "loss": 0.466, + "step": 52686 + }, + { + "epoch": 0.9781563401566149, + "grad_norm": 0.4572296142578125, + "learning_rate": 2.3513480599036686e-08, + "loss": 0.1825, + "step": 52688 + }, + { + "epoch": 0.9781934702940336, + "grad_norm": 0.42824915051460266, + "learning_rate": 2.3433602006467383e-08, + "loss": 0.3205, + "step": 52690 + }, + { + "epoch": 0.9782306004314522, + "grad_norm": 0.5128491520881653, + "learning_rate": 2.3353859165083925e-08, + "loss": 0.4391, + "step": 52692 + }, + { + "epoch": 0.9782677305688708, + "grad_norm": 0.35662809014320374, + "learning_rate": 2.3274252075969893e-08, + "loss": 0.2015, + "step": 52694 + }, + { + "epoch": 0.9783048607062895, + "grad_norm": 0.34031981229782104, + "learning_rate": 2.3194780740209976e-08, + "loss": 0.2894, + "step": 52696 + }, + { + "epoch": 0.9783419908437081, + "grad_norm": 0.6298449635505676, + "learning_rate": 2.311544515888553e-08, + "loss": 0.1317, + "step": 52698 + }, + { + "epoch": 0.9783791209811268, + "grad_norm": 0.4997366666793823, + "learning_rate": 2.303624533307458e-08, + "loss": 0.3311, + "step": 52700 + }, + { + "epoch": 0.9784162511185454, + "grad_norm": 0.41044628620147705, + "learning_rate": 2.2957181263856265e-08, + "loss": 0.1661, + "step": 52702 + }, + { + "epoch": 0.978453381255964, + "grad_norm": 0.352595716714859, + "learning_rate": 2.287825295230639e-08, + "loss": 0.3509, + "step": 52704 + }, + { + "epoch": 0.9784905113933826, + "grad_norm": 0.34571948647499084, + "learning_rate": 2.2799460399498542e-08, + "loss": 0.3981, + "step": 52706 + }, + { + "epoch": 0.9785276415308013, + "grad_norm": 0.3974832594394684, + "learning_rate": 2.2720803606504084e-08, + "loss": 0.2851, + "step": 52708 + }, + { + "epoch": 0.97856477166822, + "grad_norm": 0.36147740483283997, + "learning_rate": 2.264228257439549e-08, + "loss": 0.5107, + "step": 52710 + }, + { + "epoch": 0.9786019018056386, + "grad_norm": 0.41298454999923706, + "learning_rate": 2.256389730423858e-08, + "loss": 0.181, + "step": 52712 + }, + { + "epoch": 0.9786390319430572, + "grad_norm": 0.5696089863777161, + "learning_rate": 2.2485647797101384e-08, + "loss": 0.15, + "step": 52714 + }, + { + "epoch": 0.9786761620804758, + "grad_norm": 0.36442697048187256, + "learning_rate": 2.2407534054048608e-08, + "loss": 0.1328, + "step": 52716 + }, + { + "epoch": 0.9787132922178945, + "grad_norm": 0.16239385306835175, + "learning_rate": 2.2329556076142733e-08, + "loss": 0.2648, + "step": 52718 + }, + { + "epoch": 0.9787504223553132, + "grad_norm": 0.32683441042900085, + "learning_rate": 2.2251713864445133e-08, + "loss": 0.0754, + "step": 52720 + }, + { + "epoch": 0.9787875524927317, + "grad_norm": 0.3488009572029114, + "learning_rate": 2.2174007420014965e-08, + "loss": 0.3742, + "step": 52722 + }, + { + "epoch": 0.9788246826301504, + "grad_norm": 0.3816192150115967, + "learning_rate": 2.2096436743910264e-08, + "loss": 0.2999, + "step": 52724 + }, + { + "epoch": 0.978861812767569, + "grad_norm": 0.5605812072753906, + "learning_rate": 2.2019001837184638e-08, + "loss": 0.2527, + "step": 52726 + }, + { + "epoch": 0.9788989429049877, + "grad_norm": 0.5688818097114563, + "learning_rate": 2.1941702700892798e-08, + "loss": 0.3419, + "step": 52728 + }, + { + "epoch": 0.9789360730424064, + "grad_norm": 0.521503746509552, + "learning_rate": 2.186453933608723e-08, + "loss": 0.344, + "step": 52730 + }, + { + "epoch": 0.978973203179825, + "grad_norm": 0.27724796533584595, + "learning_rate": 2.17875117438171e-08, + "loss": 0.4428, + "step": 52732 + }, + { + "epoch": 0.9790103333172436, + "grad_norm": 0.4398658573627472, + "learning_rate": 2.171061992513046e-08, + "loss": 0.306, + "step": 52734 + }, + { + "epoch": 0.9790474634546622, + "grad_norm": 0.4050920605659485, + "learning_rate": 2.1633863881074245e-08, + "loss": 0.3518, + "step": 52736 + }, + { + "epoch": 0.9790845935920809, + "grad_norm": 0.4036939740180969, + "learning_rate": 2.155724361269318e-08, + "loss": 0.44, + "step": 52738 + }, + { + "epoch": 0.9791217237294996, + "grad_norm": 0.24823527038097382, + "learning_rate": 2.148075912102754e-08, + "loss": 0.2675, + "step": 52740 + }, + { + "epoch": 0.9791588538669181, + "grad_norm": 0.39661112427711487, + "learning_rate": 2.1404410407120936e-08, + "loss": 0.2928, + "step": 52742 + }, + { + "epoch": 0.9791959840043368, + "grad_norm": 0.5383175611495972, + "learning_rate": 2.1328197472010315e-08, + "loss": 0.2731, + "step": 52744 + }, + { + "epoch": 0.9792331141417554, + "grad_norm": 0.5584187507629395, + "learning_rate": 2.125212031673374e-08, + "loss": 0.118, + "step": 52746 + }, + { + "epoch": 0.9792702442791741, + "grad_norm": 0.20835137367248535, + "learning_rate": 2.117617894232482e-08, + "loss": 0.227, + "step": 52748 + }, + { + "epoch": 0.9793073744165928, + "grad_norm": 0.4613208770751953, + "learning_rate": 2.1100373349819403e-08, + "loss": 0.1014, + "step": 52750 + }, + { + "epoch": 0.9793445045540113, + "grad_norm": 0.3824054002761841, + "learning_rate": 2.102470354024777e-08, + "loss": 0.1685, + "step": 52752 + }, + { + "epoch": 0.97938163469143, + "grad_norm": 0.4447002410888672, + "learning_rate": 2.09491695146391e-08, + "loss": 0.3277, + "step": 52754 + }, + { + "epoch": 0.9794187648288486, + "grad_norm": 0.49731025099754333, + "learning_rate": 2.087377127402146e-08, + "loss": 0.31, + "step": 52756 + }, + { + "epoch": 0.9794558949662673, + "grad_norm": 0.24771015346050262, + "learning_rate": 2.0798508819419582e-08, + "loss": 0.2411, + "step": 52758 + }, + { + "epoch": 0.9794930251036859, + "grad_norm": 0.5227185487747192, + "learning_rate": 2.0723382151860427e-08, + "loss": 0.2772, + "step": 52760 + }, + { + "epoch": 0.9795301552411045, + "grad_norm": 0.2755507826805115, + "learning_rate": 2.0648391272364287e-08, + "loss": 0.2852, + "step": 52762 + }, + { + "epoch": 0.9795672853785232, + "grad_norm": 0.45438331365585327, + "learning_rate": 2.057353618195146e-08, + "loss": 0.1982, + "step": 52764 + }, + { + "epoch": 0.9796044155159418, + "grad_norm": 0.47954118251800537, + "learning_rate": 2.0498816881641127e-08, + "loss": 0.358, + "step": 52766 + }, + { + "epoch": 0.9796415456533605, + "grad_norm": 0.3554628789424896, + "learning_rate": 2.0424233372450253e-08, + "loss": 0.3661, + "step": 52768 + }, + { + "epoch": 0.979678675790779, + "grad_norm": 0.32770344614982605, + "learning_rate": 2.0349785655393584e-08, + "loss": 0.0854, + "step": 52770 + }, + { + "epoch": 0.9797158059281977, + "grad_norm": 0.4444442391395569, + "learning_rate": 2.0275473731482532e-08, + "loss": 0.2301, + "step": 52772 + }, + { + "epoch": 0.9797529360656164, + "grad_norm": 0.37271958589553833, + "learning_rate": 2.020129760173073e-08, + "loss": 0.2251, + "step": 52774 + }, + { + "epoch": 0.979790066203035, + "grad_norm": 0.6411756873130798, + "learning_rate": 2.0127257267146262e-08, + "loss": 0.1835, + "step": 52776 + }, + { + "epoch": 0.9798271963404537, + "grad_norm": 0.5964099168777466, + "learning_rate": 2.0053352728736098e-08, + "loss": 0.2327, + "step": 52778 + }, + { + "epoch": 0.9798643264778722, + "grad_norm": 0.6121138334274292, + "learning_rate": 1.9979583987506103e-08, + "loss": 0.2595, + "step": 52780 + }, + { + "epoch": 0.9799014566152909, + "grad_norm": 0.3534139394760132, + "learning_rate": 1.9905951044461025e-08, + "loss": 0.4295, + "step": 52782 + }, + { + "epoch": 0.9799385867527096, + "grad_norm": 0.4625093340873718, + "learning_rate": 1.9832453900601178e-08, + "loss": 0.4808, + "step": 52784 + }, + { + "epoch": 0.9799757168901282, + "grad_norm": 0.4284069538116455, + "learning_rate": 1.9759092556929094e-08, + "loss": 0.3214, + "step": 52786 + }, + { + "epoch": 0.9800128470275469, + "grad_norm": 0.3997967541217804, + "learning_rate": 1.968586701444064e-08, + "loss": 0.264, + "step": 52788 + }, + { + "epoch": 0.9800499771649654, + "grad_norm": 0.5932273268699646, + "learning_rate": 1.96127772741328e-08, + "loss": 0.3216, + "step": 52790 + }, + { + "epoch": 0.9800871073023841, + "grad_norm": 0.3180560767650604, + "learning_rate": 1.953982333700033e-08, + "loss": 0.3493, + "step": 52792 + }, + { + "epoch": 0.9801242374398028, + "grad_norm": 0.5678401589393616, + "learning_rate": 1.9467005204036883e-08, + "loss": 0.2271, + "step": 52794 + }, + { + "epoch": 0.9801613675772214, + "grad_norm": 0.31910276412963867, + "learning_rate": 1.9394322876231663e-08, + "loss": 0.3144, + "step": 52796 + }, + { + "epoch": 0.9801984977146401, + "grad_norm": 0.34138843417167664, + "learning_rate": 1.9321776354573885e-08, + "loss": 0.4476, + "step": 52798 + }, + { + "epoch": 0.9802356278520586, + "grad_norm": 0.2658621668815613, + "learning_rate": 1.9249365640051642e-08, + "loss": 0.1022, + "step": 52800 + }, + { + "epoch": 0.9802727579894773, + "grad_norm": 0.6638557314872742, + "learning_rate": 1.9177090733649705e-08, + "loss": 0.2759, + "step": 52802 + }, + { + "epoch": 0.980309888126896, + "grad_norm": 0.2613357901573181, + "learning_rate": 1.910495163635173e-08, + "loss": 0.1718, + "step": 52804 + }, + { + "epoch": 0.9803470182643146, + "grad_norm": 0.26341357827186584, + "learning_rate": 1.9032948349138046e-08, + "loss": 0.1402, + "step": 52806 + }, + { + "epoch": 0.9803841484017333, + "grad_norm": 0.3948315382003784, + "learning_rate": 1.896108087299009e-08, + "loss": 0.1752, + "step": 52808 + }, + { + "epoch": 0.9804212785391518, + "grad_norm": 0.6185754537582397, + "learning_rate": 1.8889349208885965e-08, + "loss": 0.3638, + "step": 52810 + }, + { + "epoch": 0.9804584086765705, + "grad_norm": 0.4181791841983795, + "learning_rate": 1.881775335780045e-08, + "loss": 0.2577, + "step": 52812 + }, + { + "epoch": 0.9804955388139891, + "grad_norm": 0.38397645950317383, + "learning_rate": 1.8746293320708318e-08, + "loss": 0.2355, + "step": 52814 + }, + { + "epoch": 0.9805326689514078, + "grad_norm": 0.312391072511673, + "learning_rate": 1.867496909858102e-08, + "loss": 0.4889, + "step": 52816 + }, + { + "epoch": 0.9805697990888265, + "grad_norm": 0.4079931974411011, + "learning_rate": 1.8603780692389995e-08, + "loss": 0.1835, + "step": 52818 + }, + { + "epoch": 0.980606929226245, + "grad_norm": 0.24754378199577332, + "learning_rate": 1.853272810310447e-08, + "loss": 0.36, + "step": 52820 + }, + { + "epoch": 0.9806440593636637, + "grad_norm": 0.3267788290977478, + "learning_rate": 1.846181133169034e-08, + "loss": 0.2409, + "step": 52822 + }, + { + "epoch": 0.9806811895010823, + "grad_norm": 0.574005663394928, + "learning_rate": 1.8391030379113493e-08, + "loss": 0.5111, + "step": 52824 + }, + { + "epoch": 0.980718319638501, + "grad_norm": 0.40877363085746765, + "learning_rate": 1.8320385246335394e-08, + "loss": 0.3059, + "step": 52826 + }, + { + "epoch": 0.9807554497759197, + "grad_norm": 0.355730265378952, + "learning_rate": 1.8249875934317483e-08, + "loss": 0.296, + "step": 52828 + }, + { + "epoch": 0.9807925799133382, + "grad_norm": 0.49034491181373596, + "learning_rate": 1.8179502444021224e-08, + "loss": 0.1473, + "step": 52830 + }, + { + "epoch": 0.9808297100507569, + "grad_norm": 0.338891863822937, + "learning_rate": 1.8109264776402514e-08, + "loss": 0.2209, + "step": 52832 + }, + { + "epoch": 0.9808668401881755, + "grad_norm": 0.3314644694328308, + "learning_rate": 1.8039162932417252e-08, + "loss": 0.2614, + "step": 52834 + }, + { + "epoch": 0.9809039703255942, + "grad_norm": 0.46912673115730286, + "learning_rate": 1.7969196913020238e-08, + "loss": 0.1514, + "step": 52836 + }, + { + "epoch": 0.9809411004630129, + "grad_norm": 0.5093889236450195, + "learning_rate": 1.7899366719162926e-08, + "loss": 0.2625, + "step": 52838 + }, + { + "epoch": 0.9809782306004314, + "grad_norm": 0.4473753273487091, + "learning_rate": 1.7829672351794557e-08, + "loss": 0.1652, + "step": 52840 + }, + { + "epoch": 0.9810153607378501, + "grad_norm": 0.2749598026275635, + "learning_rate": 1.7760113811864375e-08, + "loss": 0.3651, + "step": 52842 + }, + { + "epoch": 0.9810524908752687, + "grad_norm": 0.4179568886756897, + "learning_rate": 1.7690691100319403e-08, + "loss": 0.2564, + "step": 52844 + }, + { + "epoch": 0.9810896210126874, + "grad_norm": 0.3955458998680115, + "learning_rate": 1.7621404218103323e-08, + "loss": 0.4479, + "step": 52846 + }, + { + "epoch": 0.9811267511501061, + "grad_norm": 0.432411253452301, + "learning_rate": 1.7552253166158716e-08, + "loss": 0.2532, + "step": 52848 + }, + { + "epoch": 0.9811638812875246, + "grad_norm": 0.6299183964729309, + "learning_rate": 1.7483237945428167e-08, + "loss": 0.2844, + "step": 52850 + }, + { + "epoch": 0.9812010114249433, + "grad_norm": 0.21975034475326538, + "learning_rate": 1.74143585568487e-08, + "loss": 0.2474, + "step": 52852 + }, + { + "epoch": 0.9812381415623619, + "grad_norm": 0.3171994686126709, + "learning_rate": 1.734561500135845e-08, + "loss": 0.408, + "step": 52854 + }, + { + "epoch": 0.9812752716997806, + "grad_norm": 0.46048086881637573, + "learning_rate": 1.7277007279893344e-08, + "loss": 0.2808, + "step": 52856 + }, + { + "epoch": 0.9813124018371991, + "grad_norm": 0.7793512344360352, + "learning_rate": 1.720853539338707e-08, + "loss": 0.1515, + "step": 52858 + }, + { + "epoch": 0.9813495319746178, + "grad_norm": 0.3635629415512085, + "learning_rate": 1.7140199342770005e-08, + "loss": 0.2522, + "step": 52860 + }, + { + "epoch": 0.9813866621120365, + "grad_norm": 0.47209876775741577, + "learning_rate": 1.7071999128972504e-08, + "loss": 0.494, + "step": 52862 + }, + { + "epoch": 0.9814237922494551, + "grad_norm": 0.31801649928092957, + "learning_rate": 1.7003934752922723e-08, + "loss": 0.3408, + "step": 52864 + }, + { + "epoch": 0.9814609223868738, + "grad_norm": 0.3857996463775635, + "learning_rate": 1.6936006215546584e-08, + "loss": 0.2377, + "step": 52866 + }, + { + "epoch": 0.9814980525242923, + "grad_norm": 0.5572075843811035, + "learning_rate": 1.6868213517770016e-08, + "loss": 0.1468, + "step": 52868 + }, + { + "epoch": 0.981535182661711, + "grad_norm": 0.34970077872276306, + "learning_rate": 1.680055666051228e-08, + "loss": 0.1699, + "step": 52870 + }, + { + "epoch": 0.9815723127991297, + "grad_norm": 0.36041855812072754, + "learning_rate": 1.673303564469708e-08, + "loss": 0.2577, + "step": 52872 + }, + { + "epoch": 0.9816094429365483, + "grad_norm": 0.78822922706604, + "learning_rate": 1.666565047124147e-08, + "loss": 0.3168, + "step": 52874 + }, + { + "epoch": 0.981646573073967, + "grad_norm": 0.3355909585952759, + "learning_rate": 1.6598401141063593e-08, + "loss": 0.2424, + "step": 52876 + }, + { + "epoch": 0.9816837032113855, + "grad_norm": 0.3343003988265991, + "learning_rate": 1.6531287655077166e-08, + "loss": 0.105, + "step": 52878 + }, + { + "epoch": 0.9817208333488042, + "grad_norm": 0.40963032841682434, + "learning_rate": 1.6464310014195906e-08, + "loss": 0.1985, + "step": 52880 + }, + { + "epoch": 0.9817579634862229, + "grad_norm": 0.26679834723472595, + "learning_rate": 1.6397468219331302e-08, + "loss": 0.1181, + "step": 52882 + }, + { + "epoch": 0.9817950936236415, + "grad_norm": 0.27392667531967163, + "learning_rate": 1.633076227139263e-08, + "loss": 0.2663, + "step": 52884 + }, + { + "epoch": 0.9818322237610602, + "grad_norm": 0.2717020809650421, + "learning_rate": 1.626419217128694e-08, + "loss": 0.3258, + "step": 52886 + }, + { + "epoch": 0.9818693538984787, + "grad_norm": 0.24505992233753204, + "learning_rate": 1.6197757919922398e-08, + "loss": 0.2452, + "step": 52888 + }, + { + "epoch": 0.9819064840358974, + "grad_norm": 0.4316135346889496, + "learning_rate": 1.6131459518200498e-08, + "loss": 0.265, + "step": 52890 + }, + { + "epoch": 0.9819436141733161, + "grad_norm": 0.34182289242744446, + "learning_rate": 1.6065296967024968e-08, + "loss": 0.3385, + "step": 52892 + }, + { + "epoch": 0.9819807443107347, + "grad_norm": 0.47099146246910095, + "learning_rate": 1.5999270267295087e-08, + "loss": 0.4158, + "step": 52894 + }, + { + "epoch": 0.9820178744481534, + "grad_norm": 0.3879220485687256, + "learning_rate": 1.5933379419909023e-08, + "loss": 0.3247, + "step": 52896 + }, + { + "epoch": 0.9820550045855719, + "grad_norm": 0.5407374501228333, + "learning_rate": 1.586762442576495e-08, + "loss": 0.2354, + "step": 52898 + }, + { + "epoch": 0.9820921347229906, + "grad_norm": 0.4070351719856262, + "learning_rate": 1.5802005285755485e-08, + "loss": 0.5488, + "step": 52900 + }, + { + "epoch": 0.9821292648604093, + "grad_norm": 0.332881897687912, + "learning_rate": 1.5736522000775466e-08, + "loss": 0.3025, + "step": 52902 + }, + { + "epoch": 0.9821663949978279, + "grad_norm": 0.27456822991371155, + "learning_rate": 1.5671174571714186e-08, + "loss": 0.1363, + "step": 52904 + }, + { + "epoch": 0.9822035251352466, + "grad_norm": 0.2130303829908371, + "learning_rate": 1.5605962999462044e-08, + "loss": 0.0708, + "step": 52906 + }, + { + "epoch": 0.9822406552726651, + "grad_norm": 0.5916491150856018, + "learning_rate": 1.5540887284906103e-08, + "loss": 0.417, + "step": 52908 + }, + { + "epoch": 0.9822777854100838, + "grad_norm": 0.688385546207428, + "learning_rate": 1.5475947428932325e-08, + "loss": 0.1805, + "step": 52910 + }, + { + "epoch": 0.9823149155475024, + "grad_norm": 0.3841279149055481, + "learning_rate": 1.5411143432423337e-08, + "loss": 0.2372, + "step": 52912 + }, + { + "epoch": 0.9823520456849211, + "grad_norm": 0.27340877056121826, + "learning_rate": 1.5346475296261764e-08, + "loss": 0.2267, + "step": 52914 + }, + { + "epoch": 0.9823891758223398, + "grad_norm": 0.4170225262641907, + "learning_rate": 1.5281943021326906e-08, + "loss": 0.2333, + "step": 52916 + }, + { + "epoch": 0.9824263059597583, + "grad_norm": 0.41291022300720215, + "learning_rate": 1.5217546608495835e-08, + "loss": 0.2107, + "step": 52918 + }, + { + "epoch": 0.982463436097177, + "grad_norm": 0.3946945369243622, + "learning_rate": 1.5153286058647855e-08, + "loss": 0.1434, + "step": 52920 + }, + { + "epoch": 0.9825005662345956, + "grad_norm": 0.410515159368515, + "learning_rate": 1.5089161372654483e-08, + "loss": 0.1629, + "step": 52922 + }, + { + "epoch": 0.9825376963720143, + "grad_norm": 0.4105881452560425, + "learning_rate": 1.5025172551390575e-08, + "loss": 0.2839, + "step": 52924 + }, + { + "epoch": 0.982574826509433, + "grad_norm": 0.571586549282074, + "learning_rate": 1.4961319595724333e-08, + "loss": 0.2665, + "step": 52926 + }, + { + "epoch": 0.9826119566468515, + "grad_norm": 0.4444388151168823, + "learning_rate": 1.4897602506526165e-08, + "loss": 0.1819, + "step": 52928 + }, + { + "epoch": 0.9826490867842702, + "grad_norm": 0.2781773507595062, + "learning_rate": 1.4834021284663157e-08, + "loss": 0.2921, + "step": 52930 + }, + { + "epoch": 0.9826862169216888, + "grad_norm": 0.49465271830558777, + "learning_rate": 1.4770575931000176e-08, + "loss": 0.2521, + "step": 52932 + }, + { + "epoch": 0.9827233470591075, + "grad_norm": 0.3094932734966278, + "learning_rate": 1.4707266446399859e-08, + "loss": 0.1966, + "step": 52934 + }, + { + "epoch": 0.9827604771965261, + "grad_norm": 0.4414733946323395, + "learning_rate": 1.4644092831724855e-08, + "loss": 0.1535, + "step": 52936 + }, + { + "epoch": 0.9827976073339447, + "grad_norm": 0.5220457911491394, + "learning_rate": 1.4581055087833362e-08, + "loss": 0.3634, + "step": 52938 + }, + { + "epoch": 0.9828347374713634, + "grad_norm": 0.46621036529541016, + "learning_rate": 1.4518153215584695e-08, + "loss": 0.3282, + "step": 52940 + }, + { + "epoch": 0.982871867608782, + "grad_norm": 0.37622910737991333, + "learning_rate": 1.4455387215833728e-08, + "loss": 0.3323, + "step": 52942 + }, + { + "epoch": 0.9829089977462007, + "grad_norm": 0.3686809539794922, + "learning_rate": 1.4392757089434217e-08, + "loss": 0.3028, + "step": 52944 + }, + { + "epoch": 0.9829461278836193, + "grad_norm": 0.4885520935058594, + "learning_rate": 1.4330262837239927e-08, + "loss": 0.084, + "step": 52946 + }, + { + "epoch": 0.9829832580210379, + "grad_norm": 0.4798552691936493, + "learning_rate": 1.4267904460099069e-08, + "loss": 0.5117, + "step": 52948 + }, + { + "epoch": 0.9830203881584566, + "grad_norm": 0.2901044487953186, + "learning_rate": 1.4205681958862073e-08, + "loss": 0.302, + "step": 52950 + }, + { + "epoch": 0.9830575182958752, + "grad_norm": 0.2551848590373993, + "learning_rate": 1.4143595334374927e-08, + "loss": 0.0837, + "step": 52952 + }, + { + "epoch": 0.9830946484332939, + "grad_norm": 0.3479268252849579, + "learning_rate": 1.4081644587482513e-08, + "loss": 0.4864, + "step": 52954 + }, + { + "epoch": 0.9831317785707125, + "grad_norm": 0.5674386620521545, + "learning_rate": 1.401982971902749e-08, + "loss": 0.2726, + "step": 52956 + }, + { + "epoch": 0.9831689087081311, + "grad_norm": 0.29985716938972473, + "learning_rate": 1.3958150729850295e-08, + "loss": 0.238, + "step": 52958 + }, + { + "epoch": 0.9832060388455498, + "grad_norm": 0.26672589778900146, + "learning_rate": 1.389660762079137e-08, + "loss": 0.402, + "step": 52960 + }, + { + "epoch": 0.9832431689829684, + "grad_norm": 0.8663783669471741, + "learning_rate": 1.383520039268782e-08, + "loss": 0.2666, + "step": 52962 + }, + { + "epoch": 0.9832802991203871, + "grad_norm": 0.3186021149158478, + "learning_rate": 1.3773929046375645e-08, + "loss": 0.2136, + "step": 52964 + }, + { + "epoch": 0.9833174292578056, + "grad_norm": 0.22540484368801117, + "learning_rate": 1.371279358268751e-08, + "loss": 0.2074, + "step": 52966 + }, + { + "epoch": 0.9833545593952243, + "grad_norm": 0.4237343966960907, + "learning_rate": 1.3651794002456087e-08, + "loss": 0.2856, + "step": 52968 + }, + { + "epoch": 0.983391689532643, + "grad_norm": 0.4007505774497986, + "learning_rate": 1.3590930306511819e-08, + "loss": 0.3496, + "step": 52970 + }, + { + "epoch": 0.9834288196700616, + "grad_norm": 0.4818536937236786, + "learning_rate": 1.353020249568071e-08, + "loss": 0.2505, + "step": 52972 + }, + { + "epoch": 0.9834659498074803, + "grad_norm": 0.32615578174591064, + "learning_rate": 1.3469610570792103e-08, + "loss": 0.372, + "step": 52974 + }, + { + "epoch": 0.9835030799448988, + "grad_norm": 0.3479304015636444, + "learning_rate": 1.3409154532668667e-08, + "loss": 0.2359, + "step": 52976 + }, + { + "epoch": 0.9835402100823175, + "grad_norm": 0.4243628680706024, + "learning_rate": 1.334883438213308e-08, + "loss": 0.2649, + "step": 52978 + }, + { + "epoch": 0.9835773402197362, + "grad_norm": 0.27116644382476807, + "learning_rate": 1.3288650120005797e-08, + "loss": 0.256, + "step": 52980 + }, + { + "epoch": 0.9836144703571548, + "grad_norm": 0.4559706151485443, + "learning_rate": 1.3228601747107272e-08, + "loss": 0.2736, + "step": 52982 + }, + { + "epoch": 0.9836516004945735, + "grad_norm": 0.34350845217704773, + "learning_rate": 1.316868926425352e-08, + "loss": 0.267, + "step": 52984 + }, + { + "epoch": 0.983688730631992, + "grad_norm": 0.6091378331184387, + "learning_rate": 1.3108912672259445e-08, + "loss": 0.1935, + "step": 52986 + }, + { + "epoch": 0.9837258607694107, + "grad_norm": 0.49014905095100403, + "learning_rate": 1.3049271971938838e-08, + "loss": 0.2191, + "step": 52988 + }, + { + "epoch": 0.9837629909068294, + "grad_norm": 0.6771931648254395, + "learning_rate": 1.2989767164103273e-08, + "loss": 0.1934, + "step": 52990 + }, + { + "epoch": 0.983800121044248, + "grad_norm": 0.3722216784954071, + "learning_rate": 1.2930398249562103e-08, + "loss": 0.3798, + "step": 52992 + }, + { + "epoch": 0.9838372511816666, + "grad_norm": 0.5458692312240601, + "learning_rate": 1.2871165229124683e-08, + "loss": 0.1906, + "step": 52994 + }, + { + "epoch": 0.9838743813190852, + "grad_norm": 0.4778379797935486, + "learning_rate": 1.281206810359481e-08, + "loss": 0.2864, + "step": 52996 + }, + { + "epoch": 0.9839115114565039, + "grad_norm": 0.37623992562294006, + "learning_rate": 1.27531068737774e-08, + "loss": 0.1773, + "step": 52998 + }, + { + "epoch": 0.9839486415939226, + "grad_norm": 0.2356218695640564, + "learning_rate": 1.269428154047514e-08, + "loss": 0.1914, + "step": 53000 + }, + { + "epoch": 0.9839857717313412, + "grad_norm": 0.3305864930152893, + "learning_rate": 1.2635592104488502e-08, + "loss": 0.3145, + "step": 53002 + }, + { + "epoch": 0.9840229018687598, + "grad_norm": 0.5014298558235168, + "learning_rate": 1.257703856661574e-08, + "loss": 0.177, + "step": 53004 + }, + { + "epoch": 0.9840600320061784, + "grad_norm": 0.3790280818939209, + "learning_rate": 1.251862092765399e-08, + "loss": 0.1949, + "step": 53006 + }, + { + "epoch": 0.9840971621435971, + "grad_norm": 0.17900466918945312, + "learning_rate": 1.2460339188398174e-08, + "loss": 0.2662, + "step": 53008 + }, + { + "epoch": 0.9841342922810157, + "grad_norm": 0.353831946849823, + "learning_rate": 1.2402193349639879e-08, + "loss": 0.2156, + "step": 53010 + }, + { + "epoch": 0.9841714224184344, + "grad_norm": 0.31914785504341125, + "learning_rate": 1.2344183412171806e-08, + "loss": 0.352, + "step": 53012 + }, + { + "epoch": 0.984208552555853, + "grad_norm": 0.33785805106163025, + "learning_rate": 1.2286309376783323e-08, + "loss": 0.2257, + "step": 53014 + }, + { + "epoch": 0.9842456826932716, + "grad_norm": 0.45867738127708435, + "learning_rate": 1.2228571244261578e-08, + "loss": 0.2575, + "step": 53016 + }, + { + "epoch": 0.9842828128306903, + "grad_norm": 0.28101497888565063, + "learning_rate": 1.2170969015391499e-08, + "loss": 0.338, + "step": 53018 + }, + { + "epoch": 0.9843199429681089, + "grad_norm": 0.3125835359096527, + "learning_rate": 1.2113502690958011e-08, + "loss": 0.0865, + "step": 53020 + }, + { + "epoch": 0.9843570731055276, + "grad_norm": 0.340982586145401, + "learning_rate": 1.2056172271742717e-08, + "loss": 0.262, + "step": 53022 + }, + { + "epoch": 0.9843942032429462, + "grad_norm": 0.6768050193786621, + "learning_rate": 1.1998977758524987e-08, + "loss": 0.415, + "step": 53024 + }, + { + "epoch": 0.9844313333803648, + "grad_norm": 0.36077985167503357, + "learning_rate": 1.1941919152084203e-08, + "loss": 0.3354, + "step": 53026 + }, + { + "epoch": 0.9844684635177835, + "grad_norm": 0.43528828024864197, + "learning_rate": 1.1884996453196406e-08, + "loss": 0.2371, + "step": 53028 + }, + { + "epoch": 0.9845055936552021, + "grad_norm": 0.21479077637195587, + "learning_rate": 1.1828209662635425e-08, + "loss": 0.2092, + "step": 53030 + }, + { + "epoch": 0.9845427237926208, + "grad_norm": 0.5424080491065979, + "learning_rate": 1.1771558781173976e-08, + "loss": 0.3202, + "step": 53032 + }, + { + "epoch": 0.9845798539300394, + "grad_norm": 0.36951112747192383, + "learning_rate": 1.171504380958366e-08, + "loss": 0.2445, + "step": 53034 + }, + { + "epoch": 0.984616984067458, + "grad_norm": 0.494515061378479, + "learning_rate": 1.1658664748633863e-08, + "loss": 0.2434, + "step": 53036 + }, + { + "epoch": 0.9846541142048767, + "grad_norm": 0.30795934796333313, + "learning_rate": 1.1602421599090641e-08, + "loss": 0.3617, + "step": 53038 + }, + { + "epoch": 0.9846912443422953, + "grad_norm": 0.4298781156539917, + "learning_rate": 1.1546314361720046e-08, + "loss": 0.4055, + "step": 53040 + }, + { + "epoch": 0.984728374479714, + "grad_norm": 0.5600579380989075, + "learning_rate": 1.1490343037284801e-08, + "loss": 0.4086, + "step": 53042 + }, + { + "epoch": 0.9847655046171326, + "grad_norm": 0.432710200548172, + "learning_rate": 1.143450762654652e-08, + "loss": 0.2066, + "step": 53044 + }, + { + "epoch": 0.9848026347545512, + "grad_norm": 0.4219384789466858, + "learning_rate": 1.1378808130265705e-08, + "loss": 0.1525, + "step": 53046 + }, + { + "epoch": 0.9848397648919699, + "grad_norm": 0.4071643054485321, + "learning_rate": 1.132324454920064e-08, + "loss": 0.5499, + "step": 53048 + }, + { + "epoch": 0.9848768950293885, + "grad_norm": 0.3895339369773865, + "learning_rate": 1.1267816884106275e-08, + "loss": 0.3011, + "step": 53050 + }, + { + "epoch": 0.9849140251668071, + "grad_norm": 0.4749773144721985, + "learning_rate": 1.1212525135737563e-08, + "loss": 0.2704, + "step": 53052 + }, + { + "epoch": 0.9849511553042258, + "grad_norm": 0.560800313949585, + "learning_rate": 1.1157369304846122e-08, + "loss": 0.327, + "step": 53054 + }, + { + "epoch": 0.9849882854416444, + "grad_norm": 0.46202149987220764, + "learning_rate": 1.1102349392182466e-08, + "loss": 0.1878, + "step": 53056 + }, + { + "epoch": 0.9850254155790631, + "grad_norm": 0.2849270701408386, + "learning_rate": 1.1047465398495993e-08, + "loss": 0.2057, + "step": 53058 + }, + { + "epoch": 0.9850625457164817, + "grad_norm": 0.37653493881225586, + "learning_rate": 1.0992717324532776e-08, + "loss": 0.2182, + "step": 53060 + }, + { + "epoch": 0.9850996758539003, + "grad_norm": 0.34469857811927795, + "learning_rate": 1.0938105171038882e-08, + "loss": 0.2982, + "step": 53062 + }, + { + "epoch": 0.9851368059913189, + "grad_norm": 0.46987178921699524, + "learning_rate": 1.0883628938755942e-08, + "loss": 0.2527, + "step": 53064 + }, + { + "epoch": 0.9851739361287376, + "grad_norm": 0.36580121517181396, + "learning_rate": 1.0829288628426694e-08, + "loss": 0.2096, + "step": 53066 + }, + { + "epoch": 0.9852110662661563, + "grad_norm": 0.394609659910202, + "learning_rate": 1.0775084240789435e-08, + "loss": 0.303, + "step": 53068 + }, + { + "epoch": 0.9852481964035749, + "grad_norm": 0.5190526247024536, + "learning_rate": 1.0721015776581356e-08, + "loss": 0.276, + "step": 53070 + }, + { + "epoch": 0.9852853265409935, + "grad_norm": 0.5082761645317078, + "learning_rate": 1.0667083236539644e-08, + "loss": 0.2337, + "step": 53072 + }, + { + "epoch": 0.9853224566784121, + "grad_norm": 0.30295056104660034, + "learning_rate": 1.0613286621398156e-08, + "loss": 0.3476, + "step": 53074 + }, + { + "epoch": 0.9853595868158308, + "grad_norm": 0.5708178281784058, + "learning_rate": 1.0559625931887419e-08, + "loss": 0.2694, + "step": 53076 + }, + { + "epoch": 0.9853967169532495, + "grad_norm": 0.4554416239261627, + "learning_rate": 1.0506101168737959e-08, + "loss": 0.2682, + "step": 53078 + }, + { + "epoch": 0.9854338470906681, + "grad_norm": 0.3567994236946106, + "learning_rate": 1.0452712332678083e-08, + "loss": 0.2271, + "step": 53080 + }, + { + "epoch": 0.9854709772280867, + "grad_norm": 0.32993486523628235, + "learning_rate": 1.0399459424436098e-08, + "loss": 0.3898, + "step": 53082 + }, + { + "epoch": 0.9855081073655053, + "grad_norm": 0.49736452102661133, + "learning_rate": 1.0346342444733648e-08, + "loss": 0.2953, + "step": 53084 + }, + { + "epoch": 0.985545237502924, + "grad_norm": 0.24259555339813232, + "learning_rate": 1.0293361394295709e-08, + "loss": 0.2253, + "step": 53086 + }, + { + "epoch": 0.9855823676403427, + "grad_norm": 0.4085068106651306, + "learning_rate": 1.0240516273842815e-08, + "loss": 0.1324, + "step": 53088 + }, + { + "epoch": 0.9856194977777613, + "grad_norm": 0.4238944351673126, + "learning_rate": 1.0187807084093282e-08, + "loss": 0.319, + "step": 53090 + }, + { + "epoch": 0.9856566279151799, + "grad_norm": 0.31107038259506226, + "learning_rate": 1.0135233825765423e-08, + "loss": 0.4662, + "step": 53092 + }, + { + "epoch": 0.9856937580525985, + "grad_norm": 0.36112818121910095, + "learning_rate": 1.0082796499573112e-08, + "loss": 0.3264, + "step": 53094 + }, + { + "epoch": 0.9857308881900172, + "grad_norm": 0.3701775372028351, + "learning_rate": 1.003049510623022e-08, + "loss": 0.1673, + "step": 53096 + }, + { + "epoch": 0.9857680183274359, + "grad_norm": 0.4876464307308197, + "learning_rate": 9.978329646449514e-09, + "loss": 0.1603, + "step": 53098 + }, + { + "epoch": 0.9858051484648545, + "grad_norm": 0.47264254093170166, + "learning_rate": 9.926300120940423e-09, + "loss": 0.1892, + "step": 53100 + }, + { + "epoch": 0.9858422786022731, + "grad_norm": 0.4265842139720917, + "learning_rate": 9.874406530411274e-09, + "loss": 0.2829, + "step": 53102 + }, + { + "epoch": 0.9858794087396917, + "grad_norm": 0.40275701880455017, + "learning_rate": 9.822648875565944e-09, + "loss": 0.1599, + "step": 53104 + }, + { + "epoch": 0.9859165388771104, + "grad_norm": 0.4250245690345764, + "learning_rate": 9.771027157111645e-09, + "loss": 0.2779, + "step": 53106 + }, + { + "epoch": 0.9859536690145291, + "grad_norm": 0.559378445148468, + "learning_rate": 9.71954137574893e-09, + "loss": 0.3623, + "step": 53108 + }, + { + "epoch": 0.9859907991519477, + "grad_norm": 0.4601593017578125, + "learning_rate": 9.66819153217835e-09, + "loss": 0.3438, + "step": 53110 + }, + { + "epoch": 0.9860279292893663, + "grad_norm": 0.4573003947734833, + "learning_rate": 9.616977627099344e-09, + "loss": 0.2259, + "step": 53112 + }, + { + "epoch": 0.9860650594267849, + "grad_norm": 0.4358961582183838, + "learning_rate": 9.565899661209132e-09, + "loss": 0.3607, + "step": 53114 + }, + { + "epoch": 0.9861021895642036, + "grad_norm": 0.39182502031326294, + "learning_rate": 9.514957635202715e-09, + "loss": 0.2713, + "step": 53116 + }, + { + "epoch": 0.9861393197016222, + "grad_norm": 0.2643243372440338, + "learning_rate": 9.46415154977176e-09, + "loss": 0.1512, + "step": 53118 + }, + { + "epoch": 0.9861764498390408, + "grad_norm": 0.48572197556495667, + "learning_rate": 9.413481405609048e-09, + "loss": 0.2686, + "step": 53120 + }, + { + "epoch": 0.9862135799764595, + "grad_norm": 0.3542144000530243, + "learning_rate": 9.362947203402917e-09, + "loss": 0.2562, + "step": 53122 + }, + { + "epoch": 0.9862507101138781, + "grad_norm": 0.26186344027519226, + "learning_rate": 9.312548943842815e-09, + "loss": 0.2799, + "step": 53124 + }, + { + "epoch": 0.9862878402512968, + "grad_norm": 0.32664772868156433, + "learning_rate": 9.262286627612638e-09, + "loss": 0.2103, + "step": 53126 + }, + { + "epoch": 0.9863249703887154, + "grad_norm": 0.41710367798805237, + "learning_rate": 9.212160255397396e-09, + "loss": 0.2349, + "step": 53128 + }, + { + "epoch": 0.986362100526134, + "grad_norm": 0.27914559841156006, + "learning_rate": 9.162169827877653e-09, + "loss": 0.1334, + "step": 53130 + }, + { + "epoch": 0.9863992306635527, + "grad_norm": 0.27275288105010986, + "learning_rate": 9.1123153457362e-09, + "loss": 0.1456, + "step": 53132 + }, + { + "epoch": 0.9864363608009713, + "grad_norm": 0.26110753417015076, + "learning_rate": 9.06259680964916e-09, + "loss": 0.2433, + "step": 53134 + }, + { + "epoch": 0.98647349093839, + "grad_norm": 0.3933124840259552, + "learning_rate": 9.01301422029377e-09, + "loss": 0.2525, + "step": 53136 + }, + { + "epoch": 0.9865106210758086, + "grad_norm": 0.5826109647750854, + "learning_rate": 8.963567578345045e-09, + "loss": 0.3903, + "step": 53138 + }, + { + "epoch": 0.9865477512132272, + "grad_norm": 0.44276341795921326, + "learning_rate": 8.91425688447578e-09, + "loss": 0.4659, + "step": 53140 + }, + { + "epoch": 0.9865848813506459, + "grad_norm": 0.31747257709503174, + "learning_rate": 8.86508213935655e-09, + "loss": 0.2264, + "step": 53142 + }, + { + "epoch": 0.9866220114880645, + "grad_norm": 0.5133402943611145, + "learning_rate": 8.81604334365682e-09, + "loss": 0.4238, + "step": 53144 + }, + { + "epoch": 0.9866591416254832, + "grad_norm": 0.23044633865356445, + "learning_rate": 8.767140498042725e-09, + "loss": 0.1266, + "step": 53146 + }, + { + "epoch": 0.9866962717629018, + "grad_norm": 0.3196012079715729, + "learning_rate": 8.718373603181508e-09, + "loss": 0.3068, + "step": 53148 + }, + { + "epoch": 0.9867334019003204, + "grad_norm": 0.22003494203090668, + "learning_rate": 8.66974265973597e-09, + "loss": 0.154, + "step": 53150 + }, + { + "epoch": 0.9867705320377391, + "grad_norm": 0.17721134424209595, + "learning_rate": 8.621247668367806e-09, + "loss": 0.2136, + "step": 53152 + }, + { + "epoch": 0.9868076621751577, + "grad_norm": 0.5871778726577759, + "learning_rate": 8.572888629736487e-09, + "loss": 0.3495, + "step": 53154 + }, + { + "epoch": 0.9868447923125764, + "grad_norm": 0.39905649423599243, + "learning_rate": 8.524665544500378e-09, + "loss": 0.3881, + "step": 53156 + }, + { + "epoch": 0.986881922449995, + "grad_norm": 0.4124574363231659, + "learning_rate": 8.476578413315618e-09, + "loss": 0.2409, + "step": 53158 + }, + { + "epoch": 0.9869190525874136, + "grad_norm": 0.4230913817882538, + "learning_rate": 8.428627236836129e-09, + "loss": 0.4552, + "step": 53160 + }, + { + "epoch": 0.9869561827248322, + "grad_norm": 0.3227681815624237, + "learning_rate": 8.380812015714723e-09, + "loss": 0.2197, + "step": 53162 + }, + { + "epoch": 0.9869933128622509, + "grad_norm": 0.32631757855415344, + "learning_rate": 8.333132750603102e-09, + "loss": 0.3257, + "step": 53164 + }, + { + "epoch": 0.9870304429996696, + "grad_norm": 0.35036492347717285, + "learning_rate": 8.285589442148524e-09, + "loss": 0.1323, + "step": 53166 + }, + { + "epoch": 0.9870675731370882, + "grad_norm": 0.5013242363929749, + "learning_rate": 8.238182090998248e-09, + "loss": 0.3104, + "step": 53168 + }, + { + "epoch": 0.9871047032745068, + "grad_norm": 0.5183802843093872, + "learning_rate": 8.190910697798426e-09, + "loss": 0.4346, + "step": 53170 + }, + { + "epoch": 0.9871418334119254, + "grad_norm": 0.4218640923500061, + "learning_rate": 8.143775263189657e-09, + "loss": 0.3287, + "step": 53172 + }, + { + "epoch": 0.9871789635493441, + "grad_norm": 0.5029458999633789, + "learning_rate": 8.096775787816979e-09, + "loss": 0.3065, + "step": 53174 + }, + { + "epoch": 0.9872160936867628, + "grad_norm": 0.2588317096233368, + "learning_rate": 8.04991227231655e-09, + "loss": 0.3033, + "step": 53176 + }, + { + "epoch": 0.9872532238241813, + "grad_norm": 0.4231448173522949, + "learning_rate": 8.00318471732786e-09, + "loss": 0.2711, + "step": 53178 + }, + { + "epoch": 0.9872903539616, + "grad_norm": 0.28477081656455994, + "learning_rate": 7.956593123487066e-09, + "loss": 0.1377, + "step": 53180 + }, + { + "epoch": 0.9873274840990186, + "grad_norm": 0.40675631165504456, + "learning_rate": 7.910137491425884e-09, + "loss": 0.1686, + "step": 53182 + }, + { + "epoch": 0.9873646142364373, + "grad_norm": 0.6056632399559021, + "learning_rate": 7.863817821779363e-09, + "loss": 0.2871, + "step": 53184 + }, + { + "epoch": 0.987401744373856, + "grad_norm": 0.3164503872394562, + "learning_rate": 7.817634115175887e-09, + "loss": 0.2802, + "step": 53186 + }, + { + "epoch": 0.9874388745112745, + "grad_norm": 0.41557440161705017, + "learning_rate": 7.771586372243844e-09, + "loss": 0.2096, + "step": 53188 + }, + { + "epoch": 0.9874760046486932, + "grad_norm": 0.7443267107009888, + "learning_rate": 7.725674593611621e-09, + "loss": 0.2484, + "step": 53190 + }, + { + "epoch": 0.9875131347861118, + "grad_norm": 0.22927403450012207, + "learning_rate": 7.679898779900941e-09, + "loss": 0.3217, + "step": 53192 + }, + { + "epoch": 0.9875502649235305, + "grad_norm": 0.5478350520133972, + "learning_rate": 7.63425893173797e-09, + "loss": 0.345, + "step": 53194 + }, + { + "epoch": 0.9875873950609492, + "grad_norm": 0.3027779757976532, + "learning_rate": 7.588755049741104e-09, + "loss": 0.0598, + "step": 53196 + }, + { + "epoch": 0.9876245251983677, + "grad_norm": 0.23514460027217865, + "learning_rate": 7.543387134530955e-09, + "loss": 0.2057, + "step": 53198 + }, + { + "epoch": 0.9876616553357864, + "grad_norm": 0.31874892115592957, + "learning_rate": 7.498155186723698e-09, + "loss": 0.2605, + "step": 53200 + }, + { + "epoch": 0.987698785473205, + "grad_norm": 0.2721218466758728, + "learning_rate": 7.453059206936619e-09, + "loss": 0.1028, + "step": 53202 + }, + { + "epoch": 0.9877359156106237, + "grad_norm": 0.3757397532463074, + "learning_rate": 7.408099195781449e-09, + "loss": 0.4145, + "step": 53204 + }, + { + "epoch": 0.9877730457480424, + "grad_norm": 0.4937693774700165, + "learning_rate": 7.36327515387103e-09, + "loss": 0.3242, + "step": 53206 + }, + { + "epoch": 0.9878101758854609, + "grad_norm": 0.3928329348564148, + "learning_rate": 7.318587081815987e-09, + "loss": 0.4254, + "step": 53208 + }, + { + "epoch": 0.9878473060228796, + "grad_norm": 0.46920663118362427, + "learning_rate": 7.274034980222499e-09, + "loss": 0.2867, + "step": 53210 + }, + { + "epoch": 0.9878844361602982, + "grad_norm": 0.43465757369995117, + "learning_rate": 7.229618849696752e-09, + "loss": 0.3264, + "step": 53212 + }, + { + "epoch": 0.9879215662977169, + "grad_norm": 0.3177186846733093, + "learning_rate": 7.185338690846033e-09, + "loss": 0.304, + "step": 53214 + }, + { + "epoch": 0.9879586964351355, + "grad_norm": 0.4860471487045288, + "learning_rate": 7.141194504269866e-09, + "loss": 0.2128, + "step": 53216 + }, + { + "epoch": 0.9879958265725541, + "grad_norm": 0.36399605870246887, + "learning_rate": 7.097186290569991e-09, + "loss": 0.1092, + "step": 53218 + }, + { + "epoch": 0.9880329567099728, + "grad_norm": 0.42836377024650574, + "learning_rate": 7.053314050344817e-09, + "loss": 0.2586, + "step": 53220 + }, + { + "epoch": 0.9880700868473914, + "grad_norm": 0.47378724813461304, + "learning_rate": 7.009577784192756e-09, + "loss": 0.3456, + "step": 53222 + }, + { + "epoch": 0.9881072169848101, + "grad_norm": 0.46237489581108093, + "learning_rate": 6.965977492706666e-09, + "loss": 0.2552, + "step": 53224 + }, + { + "epoch": 0.9881443471222287, + "grad_norm": 0.28131136298179626, + "learning_rate": 6.922513176481627e-09, + "loss": 0.1465, + "step": 53226 + }, + { + "epoch": 0.9881814772596473, + "grad_norm": 0.30757683515548706, + "learning_rate": 6.879184836109387e-09, + "loss": 0.4704, + "step": 53228 + }, + { + "epoch": 0.988218607397066, + "grad_norm": 0.4576558768749237, + "learning_rate": 6.835992472177255e-09, + "loss": 0.3092, + "step": 53230 + }, + { + "epoch": 0.9882557375344846, + "grad_norm": 0.32685327529907227, + "learning_rate": 6.792936085274759e-09, + "loss": 0.2592, + "step": 53232 + }, + { + "epoch": 0.9882928676719033, + "grad_norm": 0.3320324420928955, + "learning_rate": 6.7500156759869874e-09, + "loss": 0.2922, + "step": 53234 + }, + { + "epoch": 0.9883299978093218, + "grad_norm": 0.3592033088207245, + "learning_rate": 6.707231244897916e-09, + "loss": 0.3451, + "step": 53236 + }, + { + "epoch": 0.9883671279467405, + "grad_norm": 0.37102431058883667, + "learning_rate": 6.664582792591523e-09, + "loss": 0.1934, + "step": 53238 + }, + { + "epoch": 0.9884042580841592, + "grad_norm": 0.4393404722213745, + "learning_rate": 6.622070319645124e-09, + "loss": 0.2311, + "step": 53240 + }, + { + "epoch": 0.9884413882215778, + "grad_norm": 0.4849635660648346, + "learning_rate": 6.5796938266393665e-09, + "loss": 0.3056, + "step": 53242 + }, + { + "epoch": 0.9884785183589965, + "grad_norm": 0.4947705864906311, + "learning_rate": 6.5374533141504545e-09, + "loss": 0.4123, + "step": 53244 + }, + { + "epoch": 0.988515648496415, + "grad_norm": 0.4610726535320282, + "learning_rate": 6.495348782752375e-09, + "loss": 0.2363, + "step": 53246 + }, + { + "epoch": 0.9885527786338337, + "grad_norm": 0.5574173927307129, + "learning_rate": 6.453380233019113e-09, + "loss": 0.3357, + "step": 53248 + }, + { + "epoch": 0.9885899087712524, + "grad_norm": 0.4010533094406128, + "learning_rate": 6.411547665520212e-09, + "loss": 0.1971, + "step": 53250 + }, + { + "epoch": 0.988627038908671, + "grad_norm": 0.4894147217273712, + "learning_rate": 6.369851080827438e-09, + "loss": 0.253, + "step": 53252 + }, + { + "epoch": 0.9886641690460897, + "grad_norm": 0.5542286038398743, + "learning_rate": 6.328290479505894e-09, + "loss": 0.1554, + "step": 53254 + }, + { + "epoch": 0.9887012991835082, + "grad_norm": 0.365032821893692, + "learning_rate": 6.286865862120684e-09, + "loss": 0.3762, + "step": 53256 + }, + { + "epoch": 0.9887384293209269, + "grad_norm": 0.16837038099765778, + "learning_rate": 6.24557722923802e-09, + "loss": 0.1749, + "step": 53258 + }, + { + "epoch": 0.9887755594583456, + "grad_norm": 0.4075848162174225, + "learning_rate": 6.204424581417456e-09, + "loss": 0.3186, + "step": 53260 + }, + { + "epoch": 0.9888126895957642, + "grad_norm": 0.4162130057811737, + "learning_rate": 6.1634079192207654e-09, + "loss": 0.2489, + "step": 53262 + }, + { + "epoch": 0.9888498197331829, + "grad_norm": 0.37175485491752625, + "learning_rate": 6.122527243204168e-09, + "loss": 0.2835, + "step": 53264 + }, + { + "epoch": 0.9888869498706014, + "grad_norm": 0.5033999681472778, + "learning_rate": 6.081782553926107e-09, + "loss": 0.4711, + "step": 53266 + }, + { + "epoch": 0.9889240800080201, + "grad_norm": 0.3311261534690857, + "learning_rate": 6.041173851938365e-09, + "loss": 0.1535, + "step": 53268 + }, + { + "epoch": 0.9889612101454387, + "grad_norm": 0.3732919991016388, + "learning_rate": 6.0007011377949394e-09, + "loss": 0.2485, + "step": 53270 + }, + { + "epoch": 0.9889983402828574, + "grad_norm": 0.33835986256599426, + "learning_rate": 5.960364412046505e-09, + "loss": 0.3863, + "step": 53272 + }, + { + "epoch": 0.9890354704202761, + "grad_norm": 0.19367873668670654, + "learning_rate": 5.9201636752426186e-09, + "loss": 0.1656, + "step": 53274 + }, + { + "epoch": 0.9890726005576946, + "grad_norm": 0.34048378467559814, + "learning_rate": 5.8800989279295115e-09, + "loss": 0.3059, + "step": 53276 + }, + { + "epoch": 0.9891097306951133, + "grad_norm": 0.3036670982837677, + "learning_rate": 5.840170170652304e-09, + "loss": 0.4157, + "step": 53278 + }, + { + "epoch": 0.9891468608325319, + "grad_norm": 0.4369852840900421, + "learning_rate": 5.800377403953894e-09, + "loss": 0.3206, + "step": 53280 + }, + { + "epoch": 0.9891839909699506, + "grad_norm": 0.4867478013038635, + "learning_rate": 5.760720628377181e-09, + "loss": 0.27, + "step": 53282 + }, + { + "epoch": 0.9892211211073693, + "grad_norm": 0.8104970455169678, + "learning_rate": 5.721199844459513e-09, + "loss": 0.2174, + "step": 53284 + }, + { + "epoch": 0.9892582512447878, + "grad_norm": 0.3569347560405731, + "learning_rate": 5.681815052740458e-09, + "loss": 0.1651, + "step": 53286 + }, + { + "epoch": 0.9892953813822065, + "grad_norm": 0.3604087829589844, + "learning_rate": 5.642566253756254e-09, + "loss": 0.2577, + "step": 53288 + }, + { + "epoch": 0.9893325115196251, + "grad_norm": 0.4281077980995178, + "learning_rate": 5.603453448039808e-09, + "loss": 0.4361, + "step": 53290 + }, + { + "epoch": 0.9893696416570438, + "grad_norm": 0.6063163876533508, + "learning_rate": 5.564476636122918e-09, + "loss": 0.389, + "step": 53292 + }, + { + "epoch": 0.9894067717944625, + "grad_norm": 0.28695377707481384, + "learning_rate": 5.525635818537378e-09, + "loss": 0.3937, + "step": 53294 + }, + { + "epoch": 0.989443901931881, + "grad_norm": 0.3462778627872467, + "learning_rate": 5.486930995810546e-09, + "loss": 0.1233, + "step": 53296 + }, + { + "epoch": 0.9894810320692997, + "grad_norm": 0.40961721539497375, + "learning_rate": 5.448362168469778e-09, + "loss": 0.1678, + "step": 53298 + }, + { + "epoch": 0.9895181622067183, + "grad_norm": 0.4321346580982208, + "learning_rate": 5.40992933704021e-09, + "loss": 0.2486, + "step": 53300 + }, + { + "epoch": 0.989555292344137, + "grad_norm": 0.3577726483345032, + "learning_rate": 5.371632502043644e-09, + "loss": 0.1735, + "step": 53302 + }, + { + "epoch": 0.9895924224815557, + "grad_norm": 0.6342619061470032, + "learning_rate": 5.333471664001888e-09, + "loss": 0.1127, + "step": 53304 + }, + { + "epoch": 0.9896295526189742, + "grad_norm": 0.2502640187740326, + "learning_rate": 5.2954468234345245e-09, + "loss": 0.2863, + "step": 53306 + }, + { + "epoch": 0.9896666827563929, + "grad_norm": 0.7644517421722412, + "learning_rate": 5.2575579808578085e-09, + "loss": 0.4087, + "step": 53308 + }, + { + "epoch": 0.9897038128938115, + "grad_norm": 0.39484095573425293, + "learning_rate": 5.219805136789102e-09, + "loss": 0.2468, + "step": 53310 + }, + { + "epoch": 0.9897409430312302, + "grad_norm": 0.3180004060268402, + "learning_rate": 5.1821882917391095e-09, + "loss": 0.282, + "step": 53312 + }, + { + "epoch": 0.9897780731686487, + "grad_norm": 0.3529704511165619, + "learning_rate": 5.144707446222974e-09, + "loss": 0.1592, + "step": 53314 + }, + { + "epoch": 0.9898152033060674, + "grad_norm": 0.5988825559616089, + "learning_rate": 5.107362600749177e-09, + "loss": 0.2789, + "step": 53316 + }, + { + "epoch": 0.9898523334434861, + "grad_norm": 0.22207272052764893, + "learning_rate": 5.0701537558262014e-09, + "loss": 0.1865, + "step": 53318 + }, + { + "epoch": 0.9898894635809047, + "grad_norm": 0.24557848274707794, + "learning_rate": 5.033080911959198e-09, + "loss": 0.2769, + "step": 53320 + }, + { + "epoch": 0.9899265937183234, + "grad_norm": 0.2961386740207672, + "learning_rate": 4.996144069654429e-09, + "loss": 0.3831, + "step": 53322 + }, + { + "epoch": 0.9899637238557419, + "grad_norm": 0.30986568331718445, + "learning_rate": 4.9593432294137156e-09, + "loss": 0.1994, + "step": 53324 + }, + { + "epoch": 0.9900008539931606, + "grad_norm": 0.5762717723846436, + "learning_rate": 4.922678391737767e-09, + "loss": 0.1298, + "step": 53326 + }, + { + "epoch": 0.9900379841305793, + "grad_norm": 0.49220049381256104, + "learning_rate": 4.886149557125075e-09, + "loss": 0.228, + "step": 53328 + }, + { + "epoch": 0.9900751142679979, + "grad_norm": 0.3445313572883606, + "learning_rate": 4.849756726073018e-09, + "loss": 0.1974, + "step": 53330 + }, + { + "epoch": 0.9901122444054166, + "grad_norm": 0.4992985427379608, + "learning_rate": 4.813499899076757e-09, + "loss": 0.3602, + "step": 53332 + }, + { + "epoch": 0.9901493745428351, + "grad_norm": 0.42849379777908325, + "learning_rate": 4.77737907663034e-09, + "loss": 0.2281, + "step": 53334 + }, + { + "epoch": 0.9901865046802538, + "grad_norm": 0.39814242720603943, + "learning_rate": 4.741394259223375e-09, + "loss": 0.2372, + "step": 53336 + }, + { + "epoch": 0.9902236348176725, + "grad_norm": 0.2453489601612091, + "learning_rate": 4.7055454473476925e-09, + "loss": 0.1676, + "step": 53338 + }, + { + "epoch": 0.9902607649550911, + "grad_norm": 0.41594839096069336, + "learning_rate": 4.669832641490679e-09, + "loss": 0.1983, + "step": 53340 + }, + { + "epoch": 0.9902978950925098, + "grad_norm": 0.7274026274681091, + "learning_rate": 4.634255842136393e-09, + "loss": 0.0838, + "step": 53342 + }, + { + "epoch": 0.9903350252299283, + "grad_norm": 0.290635883808136, + "learning_rate": 4.59881504977111e-09, + "loss": 0.1224, + "step": 53344 + }, + { + "epoch": 0.990372155367347, + "grad_norm": 0.26693812012672424, + "learning_rate": 4.563510264876669e-09, + "loss": 0.3341, + "step": 53346 + }, + { + "epoch": 0.9904092855047657, + "grad_norm": 0.3814639747142792, + "learning_rate": 4.528341487932686e-09, + "loss": 0.2294, + "step": 53348 + }, + { + "epoch": 0.9904464156421843, + "grad_norm": 0.3989715278148651, + "learning_rate": 4.493308719417666e-09, + "loss": 0.3172, + "step": 53350 + }, + { + "epoch": 0.990483545779603, + "grad_norm": 0.4636980891227722, + "learning_rate": 4.458411959809006e-09, + "loss": 0.2385, + "step": 53352 + }, + { + "epoch": 0.9905206759170215, + "grad_norm": 0.44374024868011475, + "learning_rate": 4.423651209580771e-09, + "loss": 0.2856, + "step": 53354 + }, + { + "epoch": 0.9905578060544402, + "grad_norm": 0.26918163895606995, + "learning_rate": 4.389026469207025e-09, + "loss": 0.1872, + "step": 53356 + }, + { + "epoch": 0.9905949361918589, + "grad_norm": 0.480447381734848, + "learning_rate": 4.354537739158504e-09, + "loss": 0.1514, + "step": 53358 + }, + { + "epoch": 0.9906320663292775, + "grad_norm": 0.27315622568130493, + "learning_rate": 4.320185019903722e-09, + "loss": 0.2533, + "step": 53360 + }, + { + "epoch": 0.9906691964666962, + "grad_norm": 0.3951440453529358, + "learning_rate": 4.2859683119111925e-09, + "loss": 0.4399, + "step": 53362 + }, + { + "epoch": 0.9907063266041147, + "grad_norm": 0.37956899404525757, + "learning_rate": 4.251887615644989e-09, + "loss": 0.2445, + "step": 53364 + }, + { + "epoch": 0.9907434567415334, + "grad_norm": 0.39813852310180664, + "learning_rate": 4.217942931571406e-09, + "loss": 0.5996, + "step": 53366 + }, + { + "epoch": 0.990780586878952, + "grad_norm": 0.46266883611679077, + "learning_rate": 4.184134260150075e-09, + "loss": 0.114, + "step": 53368 + }, + { + "epoch": 0.9908177170163707, + "grad_norm": 0.33156463503837585, + "learning_rate": 4.150461601841737e-09, + "loss": 0.2006, + "step": 53370 + }, + { + "epoch": 0.9908548471537894, + "grad_norm": 0.3152031898498535, + "learning_rate": 4.116924957103807e-09, + "loss": 0.2329, + "step": 53372 + }, + { + "epoch": 0.9908919772912079, + "grad_norm": 0.6567909121513367, + "learning_rate": 4.083524326394806e-09, + "loss": 0.261, + "step": 53374 + }, + { + "epoch": 0.9909291074286266, + "grad_norm": 0.3906242847442627, + "learning_rate": 4.050259710166593e-09, + "loss": 0.2924, + "step": 53376 + }, + { + "epoch": 0.9909662375660452, + "grad_norm": 0.34235602617263794, + "learning_rate": 4.017131108873251e-09, + "loss": 0.3029, + "step": 53378 + }, + { + "epoch": 0.9910033677034639, + "grad_norm": 0.372734934091568, + "learning_rate": 3.9841385229655305e-09, + "loss": 0.4109, + "step": 53380 + }, + { + "epoch": 0.9910404978408826, + "grad_norm": 0.5924265384674072, + "learning_rate": 3.951281952891961e-09, + "loss": 0.3896, + "step": 53382 + }, + { + "epoch": 0.9910776279783011, + "grad_norm": 0.3129676878452301, + "learning_rate": 3.918561399101073e-09, + "loss": 0.2586, + "step": 53384 + }, + { + "epoch": 0.9911147581157198, + "grad_norm": 0.2878083288669586, + "learning_rate": 3.885976862034735e-09, + "loss": 0.184, + "step": 53386 + }, + { + "epoch": 0.9911518882531384, + "grad_norm": 0.4027186632156372, + "learning_rate": 3.853528342140367e-09, + "loss": 0.4386, + "step": 53388 + }, + { + "epoch": 0.9911890183905571, + "grad_norm": 1.048187494277954, + "learning_rate": 3.821215839856507e-09, + "loss": 0.471, + "step": 53390 + }, + { + "epoch": 0.9912261485279757, + "grad_norm": 0.49501165747642517, + "learning_rate": 3.7890393556239135e-09, + "loss": 0.3035, + "step": 53392 + }, + { + "epoch": 0.9912632786653943, + "grad_norm": 0.6613175868988037, + "learning_rate": 3.756998889880015e-09, + "loss": 0.2475, + "step": 53394 + }, + { + "epoch": 0.991300408802813, + "grad_norm": 0.3849734663963318, + "learning_rate": 3.7250944430622383e-09, + "loss": 0.496, + "step": 53396 + }, + { + "epoch": 0.9913375389402316, + "grad_norm": 0.34422069787979126, + "learning_rate": 3.6933260156024607e-09, + "loss": 0.2689, + "step": 53398 + }, + { + "epoch": 0.9913746690776503, + "grad_norm": 0.4010905623435974, + "learning_rate": 3.66169360793478e-09, + "loss": 0.2522, + "step": 53400 + }, + { + "epoch": 0.991411799215069, + "grad_norm": 0.4136776030063629, + "learning_rate": 3.6301972204888515e-09, + "loss": 0.1628, + "step": 53402 + }, + { + "epoch": 0.9914489293524875, + "grad_norm": 0.39263975620269775, + "learning_rate": 3.598836853693222e-09, + "loss": 0.1642, + "step": 53404 + }, + { + "epoch": 0.9914860594899062, + "grad_norm": 0.38743874430656433, + "learning_rate": 3.5676125079753266e-09, + "loss": 0.1524, + "step": 53406 + }, + { + "epoch": 0.9915231896273248, + "grad_norm": 0.2566862404346466, + "learning_rate": 3.5365241837592713e-09, + "loss": 0.1722, + "step": 53408 + }, + { + "epoch": 0.9915603197647435, + "grad_norm": 0.2932698428630829, + "learning_rate": 3.505571881468051e-09, + "loss": 0.1563, + "step": 53410 + }, + { + "epoch": 0.9915974499021621, + "grad_norm": 0.44588708877563477, + "learning_rate": 3.474755601522439e-09, + "loss": 0.3682, + "step": 53412 + }, + { + "epoch": 0.9916345800395807, + "grad_norm": 0.7323007583618164, + "learning_rate": 3.4440753443432118e-09, + "loss": 0.2727, + "step": 53414 + }, + { + "epoch": 0.9916717101769994, + "grad_norm": 0.3405202031135559, + "learning_rate": 3.4135311103455916e-09, + "loss": 0.383, + "step": 53416 + }, + { + "epoch": 0.991708840314418, + "grad_norm": 0.41052931547164917, + "learning_rate": 3.3831228999481324e-09, + "loss": 0.333, + "step": 53418 + }, + { + "epoch": 0.9917459704518367, + "grad_norm": 0.355103075504303, + "learning_rate": 3.3528507135627274e-09, + "loss": 0.3513, + "step": 53420 + }, + { + "epoch": 0.9917831005892552, + "grad_norm": 0.4810498058795929, + "learning_rate": 3.3227145516012693e-09, + "loss": 0.3655, + "step": 53422 + }, + { + "epoch": 0.9918202307266739, + "grad_norm": 0.2912665903568268, + "learning_rate": 3.2927144144734303e-09, + "loss": 0.3988, + "step": 53424 + }, + { + "epoch": 0.9918573608640926, + "grad_norm": 0.4754859507083893, + "learning_rate": 3.2628503025899925e-09, + "loss": 0.1577, + "step": 53426 + }, + { + "epoch": 0.9918944910015112, + "grad_norm": 0.38529467582702637, + "learning_rate": 3.233122216353968e-09, + "loss": 0.3528, + "step": 53428 + }, + { + "epoch": 0.9919316211389299, + "grad_norm": 0.4235038459300995, + "learning_rate": 3.2035301561716968e-09, + "loss": 0.1733, + "step": 53430 + }, + { + "epoch": 0.9919687512763484, + "grad_norm": 0.38974523544311523, + "learning_rate": 3.174074122446191e-09, + "loss": 0.2643, + "step": 53432 + }, + { + "epoch": 0.9920058814137671, + "grad_norm": 0.6087730526924133, + "learning_rate": 3.144754115576021e-09, + "loss": 0.4278, + "step": 53434 + }, + { + "epoch": 0.9920430115511858, + "grad_norm": 0.2930710017681122, + "learning_rate": 3.1155701359630865e-09, + "loss": 0.2443, + "step": 53436 + }, + { + "epoch": 0.9920801416886044, + "grad_norm": 0.44917362928390503, + "learning_rate": 3.0865221840026273e-09, + "loss": 0.2315, + "step": 53438 + }, + { + "epoch": 0.992117271826023, + "grad_norm": 0.43266502022743225, + "learning_rate": 3.057610260090993e-09, + "loss": 0.2447, + "step": 53440 + }, + { + "epoch": 0.9921544019634416, + "grad_norm": 0.41224372386932373, + "learning_rate": 3.0288343646200926e-09, + "loss": 0.2161, + "step": 53442 + }, + { + "epoch": 0.9921915321008603, + "grad_norm": 0.40588468313217163, + "learning_rate": 3.0001944979829444e-09, + "loss": 0.2163, + "step": 53444 + }, + { + "epoch": 0.992228662238279, + "grad_norm": 0.3856228291988373, + "learning_rate": 2.9716906605681272e-09, + "loss": 0.1889, + "step": 53446 + }, + { + "epoch": 0.9922657923756976, + "grad_norm": 0.2712112069129944, + "learning_rate": 2.9433228527642186e-09, + "loss": 0.2092, + "step": 53448 + }, + { + "epoch": 0.9923029225131162, + "grad_norm": 0.4635825455188751, + "learning_rate": 2.915091074957577e-09, + "loss": 0.1971, + "step": 53450 + }, + { + "epoch": 0.9923400526505348, + "grad_norm": 0.31481078267097473, + "learning_rate": 2.886995327531228e-09, + "loss": 0.1635, + "step": 53452 + }, + { + "epoch": 0.9923771827879535, + "grad_norm": 0.40318188071250916, + "learning_rate": 2.8590356108681994e-09, + "loss": 0.2224, + "step": 53454 + }, + { + "epoch": 0.9924143129253722, + "grad_norm": 0.33646485209465027, + "learning_rate": 2.831211925348187e-09, + "loss": 0.3131, + "step": 53456 + }, + { + "epoch": 0.9924514430627908, + "grad_norm": 0.33020278811454773, + "learning_rate": 2.803524271350888e-09, + "loss": 0.2114, + "step": 53458 + }, + { + "epoch": 0.9924885732002094, + "grad_norm": 0.31008413434028625, + "learning_rate": 2.7759726492526674e-09, + "loss": 0.3573, + "step": 53460 + }, + { + "epoch": 0.992525703337628, + "grad_norm": 0.5426214933395386, + "learning_rate": 2.74855705942878e-09, + "loss": 0.4391, + "step": 53462 + }, + { + "epoch": 0.9925628334750467, + "grad_norm": 0.31837981939315796, + "learning_rate": 2.7212775022511517e-09, + "loss": 0.1023, + "step": 53464 + }, + { + "epoch": 0.9925999636124653, + "grad_norm": 0.3098362982273102, + "learning_rate": 2.6941339780917064e-09, + "loss": 0.2495, + "step": 53466 + }, + { + "epoch": 0.992637093749884, + "grad_norm": 0.4193378686904907, + "learning_rate": 2.6671264873201486e-09, + "loss": 0.3251, + "step": 53468 + }, + { + "epoch": 0.9926742238873026, + "grad_norm": 0.5505390167236328, + "learning_rate": 2.6402550303028516e-09, + "loss": 0.2582, + "step": 53470 + }, + { + "epoch": 0.9927113540247212, + "grad_norm": 0.5060679316520691, + "learning_rate": 2.61351960740619e-09, + "loss": 0.3774, + "step": 53472 + }, + { + "epoch": 0.9927484841621399, + "grad_norm": 0.24632394313812256, + "learning_rate": 2.5869202189943156e-09, + "loss": 0.1187, + "step": 53474 + }, + { + "epoch": 0.9927856142995585, + "grad_norm": 0.3460080027580261, + "learning_rate": 2.5604568654291616e-09, + "loss": 0.3076, + "step": 53476 + }, + { + "epoch": 0.9928227444369772, + "grad_norm": 0.44628074765205383, + "learning_rate": 2.5341295470693306e-09, + "loss": 0.1971, + "step": 53478 + }, + { + "epoch": 0.9928598745743958, + "grad_norm": 0.5723697543144226, + "learning_rate": 2.5079382642745344e-09, + "loss": 0.1416, + "step": 53480 + }, + { + "epoch": 0.9928970047118144, + "grad_norm": 0.31815630197525024, + "learning_rate": 2.4818830174011544e-09, + "loss": 0.2075, + "step": 53482 + }, + { + "epoch": 0.9929341348492331, + "grad_norm": 0.5790970921516418, + "learning_rate": 2.455963806802242e-09, + "loss": 0.2423, + "step": 53484 + }, + { + "epoch": 0.9929712649866517, + "grad_norm": 0.5024975538253784, + "learning_rate": 2.430180632833068e-09, + "loss": 0.2143, + "step": 53486 + }, + { + "epoch": 0.9930083951240704, + "grad_norm": 0.39349859952926636, + "learning_rate": 2.404533495841133e-09, + "loss": 0.1257, + "step": 53488 + }, + { + "epoch": 0.993045525261489, + "grad_norm": 0.36300456523895264, + "learning_rate": 2.3790223961783766e-09, + "loss": 0.2291, + "step": 53490 + }, + { + "epoch": 0.9930826553989076, + "grad_norm": 0.3271787166595459, + "learning_rate": 2.3536473341911893e-09, + "loss": 0.1167, + "step": 53492 + }, + { + "epoch": 0.9931197855363263, + "grad_norm": 0.3815661668777466, + "learning_rate": 2.32840831022374e-09, + "loss": 0.2907, + "step": 53494 + }, + { + "epoch": 0.9931569156737449, + "grad_norm": 0.26615142822265625, + "learning_rate": 2.3033053246201977e-09, + "loss": 0.1387, + "step": 53496 + }, + { + "epoch": 0.9931940458111636, + "grad_norm": 0.6048455834388733, + "learning_rate": 2.278338377722511e-09, + "loss": 0.3151, + "step": 53498 + }, + { + "epoch": 0.9932311759485822, + "grad_norm": 0.3908873498439789, + "learning_rate": 2.253507469869298e-09, + "loss": 0.2528, + "step": 53500 + }, + { + "epoch": 0.9932683060860008, + "grad_norm": 0.3220839202404022, + "learning_rate": 2.228812601399177e-09, + "loss": 0.2634, + "step": 53502 + }, + { + "epoch": 0.9933054362234195, + "grad_norm": 0.39248910546302795, + "learning_rate": 2.2042537726485456e-09, + "loss": 0.3237, + "step": 53504 + }, + { + "epoch": 0.9933425663608381, + "grad_norm": 0.372831255197525, + "learning_rate": 2.179830983951581e-09, + "loss": 0.4463, + "step": 53506 + }, + { + "epoch": 0.9933796964982567, + "grad_norm": 0.4783819615840912, + "learning_rate": 2.1555442356391287e-09, + "loss": 0.2326, + "step": 53508 + }, + { + "epoch": 0.9934168266356754, + "grad_norm": 0.37915462255477905, + "learning_rate": 2.1313935280431465e-09, + "loss": 0.2429, + "step": 53510 + }, + { + "epoch": 0.993453956773094, + "grad_norm": 0.26616522669792175, + "learning_rate": 2.1073788614922596e-09, + "loss": 0.2037, + "step": 53512 + }, + { + "epoch": 0.9934910869105127, + "grad_norm": 0.3992733657360077, + "learning_rate": 2.083500236312874e-09, + "loss": 0.2271, + "step": 53514 + }, + { + "epoch": 0.9935282170479313, + "grad_norm": 0.34670883417129517, + "learning_rate": 2.0597576528291752e-09, + "loss": 0.247, + "step": 53516 + }, + { + "epoch": 0.99356534718535, + "grad_norm": 0.2764451801776886, + "learning_rate": 2.0361511113653477e-09, + "loss": 0.2873, + "step": 53518 + }, + { + "epoch": 0.9936024773227685, + "grad_norm": 0.45024481415748596, + "learning_rate": 2.0126806122411356e-09, + "loss": 0.204, + "step": 53520 + }, + { + "epoch": 0.9936396074601872, + "grad_norm": 0.3657209575176239, + "learning_rate": 1.9893461557785042e-09, + "loss": 0.2899, + "step": 53522 + }, + { + "epoch": 0.9936767375976059, + "grad_norm": 0.43768295645713806, + "learning_rate": 1.9661477422916465e-09, + "loss": 0.3133, + "step": 53524 + }, + { + "epoch": 0.9937138677350245, + "grad_norm": 0.4510355591773987, + "learning_rate": 1.943085372099196e-09, + "loss": 0.1475, + "step": 53526 + }, + { + "epoch": 0.9937509978724431, + "grad_norm": 0.3143789768218994, + "learning_rate": 1.9201590455131257e-09, + "loss": 0.3382, + "step": 53528 + }, + { + "epoch": 0.9937881280098617, + "grad_norm": 0.2952684164047241, + "learning_rate": 1.8973687628465186e-09, + "loss": 0.2522, + "step": 53530 + }, + { + "epoch": 0.9938252581472804, + "grad_norm": 0.4724416434764862, + "learning_rate": 1.874714524408017e-09, + "loss": 0.3912, + "step": 53532 + }, + { + "epoch": 0.9938623882846991, + "grad_norm": 0.309989333152771, + "learning_rate": 1.852196330507372e-09, + "loss": 0.1172, + "step": 53534 + }, + { + "epoch": 0.9938995184221177, + "grad_norm": 0.5229268670082092, + "learning_rate": 1.8298141814510062e-09, + "loss": 0.1765, + "step": 53536 + }, + { + "epoch": 0.9939366485595363, + "grad_norm": 0.5416547060012817, + "learning_rate": 1.8075680775420102e-09, + "loss": 0.2724, + "step": 53538 + }, + { + "epoch": 0.9939737786969549, + "grad_norm": 0.3997677266597748, + "learning_rate": 1.7854580190845849e-09, + "loss": 0.2793, + "step": 53540 + }, + { + "epoch": 0.9940109088343736, + "grad_norm": 0.46541351079940796, + "learning_rate": 1.7634840063784909e-09, + "loss": 0.1859, + "step": 53542 + }, + { + "epoch": 0.9940480389717923, + "grad_norm": 0.18295122683048248, + "learning_rate": 1.741646039723488e-09, + "loss": 0.276, + "step": 53544 + }, + { + "epoch": 0.9940851691092109, + "grad_norm": 0.43300697207450867, + "learning_rate": 1.719944119416006e-09, + "loss": 0.2477, + "step": 53546 + }, + { + "epoch": 0.9941222992466295, + "grad_norm": 0.3096964955329895, + "learning_rate": 1.6983782457513642e-09, + "loss": 0.2512, + "step": 53548 + }, + { + "epoch": 0.9941594293840481, + "grad_norm": 0.3587380051612854, + "learning_rate": 1.676948419024882e-09, + "loss": 0.2363, + "step": 53550 + }, + { + "epoch": 0.9941965595214668, + "grad_norm": 0.4650644063949585, + "learning_rate": 1.655654639525217e-09, + "loss": 0.388, + "step": 53552 + }, + { + "epoch": 0.9942336896588855, + "grad_norm": 0.35090088844299316, + "learning_rate": 1.6344969075443584e-09, + "loss": 0.282, + "step": 53554 + }, + { + "epoch": 0.994270819796304, + "grad_norm": 0.39210835099220276, + "learning_rate": 1.6134752233698537e-09, + "loss": 0.3073, + "step": 53556 + }, + { + "epoch": 0.9943079499337227, + "grad_norm": 0.34158438444137573, + "learning_rate": 1.59258958728703e-09, + "loss": 0.088, + "step": 53558 + }, + { + "epoch": 0.9943450800711413, + "grad_norm": 0.4653012454509735, + "learning_rate": 1.5718399995801047e-09, + "loss": 0.6044, + "step": 53560 + }, + { + "epoch": 0.99438221020856, + "grad_norm": 0.7530316710472107, + "learning_rate": 1.5512264605310745e-09, + "loss": 0.2215, + "step": 53562 + }, + { + "epoch": 0.9944193403459787, + "grad_norm": 0.48231950402259827, + "learning_rate": 1.5307489704219358e-09, + "loss": 0.232, + "step": 53564 + }, + { + "epoch": 0.9944564704833972, + "grad_norm": 0.3443574607372284, + "learning_rate": 1.5104075295302445e-09, + "loss": 0.2529, + "step": 53566 + }, + { + "epoch": 0.9944936006208159, + "grad_norm": 0.3055831789970398, + "learning_rate": 1.4902021381324461e-09, + "loss": 0.1434, + "step": 53568 + }, + { + "epoch": 0.9945307307582345, + "grad_norm": 0.2209073007106781, + "learning_rate": 1.4701327965038759e-09, + "loss": 0.4109, + "step": 53570 + }, + { + "epoch": 0.9945678608956532, + "grad_norm": 0.41637393832206726, + "learning_rate": 1.4501995049187589e-09, + "loss": 0.3015, + "step": 53572 + }, + { + "epoch": 0.9946049910330718, + "grad_norm": 0.21818257868289948, + "learning_rate": 1.4304022636468796e-09, + "loss": 0.2275, + "step": 53574 + }, + { + "epoch": 0.9946421211704904, + "grad_norm": 0.3729708790779114, + "learning_rate": 1.4107410729580217e-09, + "loss": 0.2756, + "step": 53576 + }, + { + "epoch": 0.9946792513079091, + "grad_norm": 0.4783799350261688, + "learning_rate": 1.3912159331197495e-09, + "loss": 0.18, + "step": 53578 + }, + { + "epoch": 0.9947163814453277, + "grad_norm": 0.4182230234146118, + "learning_rate": 1.3718268443974059e-09, + "loss": 0.1816, + "step": 53580 + }, + { + "epoch": 0.9947535115827464, + "grad_norm": 0.3196980357170105, + "learning_rate": 1.352573807055224e-09, + "loss": 0.2514, + "step": 53582 + }, + { + "epoch": 0.994790641720165, + "grad_norm": 0.47197315096855164, + "learning_rate": 1.333456821355217e-09, + "loss": 0.2721, + "step": 53584 + }, + { + "epoch": 0.9948277718575836, + "grad_norm": 0.4402155578136444, + "learning_rate": 1.314475887557176e-09, + "loss": 0.2735, + "step": 53586 + }, + { + "epoch": 0.9948649019950023, + "grad_norm": 0.38109177350997925, + "learning_rate": 1.2956310059197841e-09, + "loss": 0.2884, + "step": 53588 + }, + { + "epoch": 0.9949020321324209, + "grad_norm": 0.4029310941696167, + "learning_rate": 1.2769221766995022e-09, + "loss": 0.2454, + "step": 53590 + }, + { + "epoch": 0.9949391622698396, + "grad_norm": 0.34634122252464294, + "learning_rate": 1.2583494001505714e-09, + "loss": 0.1174, + "step": 53592 + }, + { + "epoch": 0.9949762924072582, + "grad_norm": 0.2543226480484009, + "learning_rate": 1.2399126765261227e-09, + "loss": 0.3627, + "step": 53594 + }, + { + "epoch": 0.9950134225446768, + "grad_norm": 0.6224349141120911, + "learning_rate": 1.2216120060759562e-09, + "loss": 0.2763, + "step": 53596 + }, + { + "epoch": 0.9950505526820955, + "grad_norm": 0.5005269646644592, + "learning_rate": 1.2034473890498722e-09, + "loss": 0.287, + "step": 53598 + }, + { + "epoch": 0.9950876828195141, + "grad_norm": 0.32010751962661743, + "learning_rate": 1.18541882569545e-09, + "loss": 0.3269, + "step": 53600 + }, + { + "epoch": 0.9951248129569328, + "grad_norm": 0.45107343792915344, + "learning_rate": 1.1675263162580497e-09, + "loss": 0.4303, + "step": 53602 + }, + { + "epoch": 0.9951619430943514, + "grad_norm": 0.329780250787735, + "learning_rate": 1.1497698609796993e-09, + "loss": 0.2941, + "step": 53604 + }, + { + "epoch": 0.99519907323177, + "grad_norm": 0.4158528447151184, + "learning_rate": 1.1321494601035377e-09, + "loss": 0.2347, + "step": 53606 + }, + { + "epoch": 0.9952362033691887, + "grad_norm": 0.3148943781852722, + "learning_rate": 1.114665113868263e-09, + "loss": 0.2758, + "step": 53608 + }, + { + "epoch": 0.9952733335066073, + "grad_norm": 0.3847060203552246, + "learning_rate": 1.0973168225125729e-09, + "loss": 0.3269, + "step": 53610 + }, + { + "epoch": 0.995310463644026, + "grad_norm": 0.26362016797065735, + "learning_rate": 1.0801045862718351e-09, + "loss": 0.2093, + "step": 53612 + }, + { + "epoch": 0.9953475937814446, + "grad_norm": 0.3356444239616394, + "learning_rate": 1.0630284053814165e-09, + "loss": 0.2673, + "step": 53614 + }, + { + "epoch": 0.9953847239188632, + "grad_norm": 0.4901287853717804, + "learning_rate": 1.046088280072244e-09, + "loss": 0.246, + "step": 53616 + }, + { + "epoch": 0.9954218540562818, + "grad_norm": 0.3556593656539917, + "learning_rate": 1.0292842105752432e-09, + "loss": 0.2867, + "step": 53618 + }, + { + "epoch": 0.9954589841937005, + "grad_norm": 0.33632415533065796, + "learning_rate": 1.0126161971202309e-09, + "loss": 0.41, + "step": 53620 + }, + { + "epoch": 0.9954961143311192, + "grad_norm": 0.4291035532951355, + "learning_rate": 9.96084239932582e-10, + "loss": 0.2029, + "step": 53622 + }, + { + "epoch": 0.9955332444685377, + "grad_norm": 4.7542901039123535, + "learning_rate": 9.796883392376722e-10, + "loss": 0.2167, + "step": 53624 + }, + { + "epoch": 0.9955703746059564, + "grad_norm": 0.4169110655784607, + "learning_rate": 9.63428495258656e-10, + "loss": 0.3424, + "step": 53626 + }, + { + "epoch": 0.995607504743375, + "grad_norm": 0.4213502109050751, + "learning_rate": 9.473047082164676e-10, + "loss": 0.2929, + "step": 53628 + }, + { + "epoch": 0.9956446348807937, + "grad_norm": 0.3640592396259308, + "learning_rate": 9.313169783309317e-10, + "loss": 0.2291, + "step": 53630 + }, + { + "epoch": 0.9956817650182124, + "grad_norm": 0.33782774209976196, + "learning_rate": 9.154653058196516e-10, + "loss": 0.25, + "step": 53632 + }, + { + "epoch": 0.995718895155631, + "grad_norm": 0.4786086082458496, + "learning_rate": 8.99749690899121e-10, + "loss": 0.2543, + "step": 53634 + }, + { + "epoch": 0.9957560252930496, + "grad_norm": 0.44893699884414673, + "learning_rate": 8.841701337813924e-10, + "loss": 0.2986, + "step": 53636 + }, + { + "epoch": 0.9957931554304682, + "grad_norm": 0.3055330514907837, + "learning_rate": 8.687266346785184e-10, + "loss": 0.2478, + "step": 53638 + }, + { + "epoch": 0.9958302855678869, + "grad_norm": 0.4336663782596588, + "learning_rate": 8.534191938025516e-10, + "loss": 0.3425, + "step": 53640 + }, + { + "epoch": 0.9958674157053056, + "grad_norm": 0.39832034707069397, + "learning_rate": 8.382478113599934e-10, + "loss": 0.1627, + "step": 53642 + }, + { + "epoch": 0.9959045458427241, + "grad_norm": 0.4333001673221588, + "learning_rate": 8.232124875584558e-10, + "loss": 0.2044, + "step": 53644 + }, + { + "epoch": 0.9959416759801428, + "grad_norm": 0.3518376052379608, + "learning_rate": 8.083132226022195e-10, + "loss": 0.2138, + "step": 53646 + }, + { + "epoch": 0.9959788061175614, + "grad_norm": 0.31043896079063416, + "learning_rate": 7.935500166933452e-10, + "loss": 0.4797, + "step": 53648 + }, + { + "epoch": 0.9960159362549801, + "grad_norm": 0.4227985739707947, + "learning_rate": 7.789228700327833e-10, + "loss": 0.4345, + "step": 53650 + }, + { + "epoch": 0.9960530663923988, + "grad_norm": 0.42044922709465027, + "learning_rate": 7.64431782820374e-10, + "loss": 0.4083, + "step": 53652 + }, + { + "epoch": 0.9960901965298173, + "grad_norm": 0.2929927110671997, + "learning_rate": 7.500767552526267e-10, + "loss": 0.2392, + "step": 53654 + }, + { + "epoch": 0.996127326667236, + "grad_norm": 0.22848284244537354, + "learning_rate": 7.358577875249406e-10, + "loss": 0.2927, + "step": 53656 + }, + { + "epoch": 0.9961644568046546, + "grad_norm": 0.42669424414634705, + "learning_rate": 7.217748798316049e-10, + "loss": 0.1534, + "step": 53658 + }, + { + "epoch": 0.9962015869420733, + "grad_norm": 0.2893311381340027, + "learning_rate": 7.078280323635778e-10, + "loss": 0.2772, + "step": 53660 + }, + { + "epoch": 0.996238717079492, + "grad_norm": 0.45024368166923523, + "learning_rate": 6.940172453095972e-10, + "loss": 0.3999, + "step": 53662 + }, + { + "epoch": 0.9962758472169105, + "grad_norm": 0.4148610830307007, + "learning_rate": 6.803425188595114e-10, + "loss": 0.1435, + "step": 53664 + }, + { + "epoch": 0.9963129773543292, + "grad_norm": 0.5388889312744141, + "learning_rate": 6.668038531987275e-10, + "loss": 0.2778, + "step": 53666 + }, + { + "epoch": 0.9963501074917478, + "grad_norm": 0.48851341009140015, + "learning_rate": 6.534012485115426e-10, + "loss": 0.2584, + "step": 53668 + }, + { + "epoch": 0.9963872376291665, + "grad_norm": 0.4291050434112549, + "learning_rate": 6.40134704978923e-10, + "loss": 0.3036, + "step": 53670 + }, + { + "epoch": 0.996424367766585, + "grad_norm": 0.341378778219223, + "learning_rate": 6.270042227829453e-10, + "loss": 0.3362, + "step": 53672 + }, + { + "epoch": 0.9964614979040037, + "grad_norm": 0.27826619148254395, + "learning_rate": 6.140098021023555e-10, + "loss": 0.172, + "step": 53674 + }, + { + "epoch": 0.9964986280414224, + "grad_norm": 0.4631991982460022, + "learning_rate": 6.011514431136789e-10, + "loss": 0.2323, + "step": 53676 + }, + { + "epoch": 0.996535758178841, + "grad_norm": 0.33916786313056946, + "learning_rate": 5.884291459912206e-10, + "loss": 0.2308, + "step": 53678 + }, + { + "epoch": 0.9965728883162597, + "grad_norm": 0.32062771916389465, + "learning_rate": 5.758429109081753e-10, + "loss": 0.2907, + "step": 53680 + }, + { + "epoch": 0.9966100184536782, + "grad_norm": 0.24314405024051666, + "learning_rate": 5.633927380366277e-10, + "loss": 0.1704, + "step": 53682 + }, + { + "epoch": 0.9966471485910969, + "grad_norm": 0.3417761027812958, + "learning_rate": 5.510786275453317e-10, + "loss": 0.1601, + "step": 53684 + }, + { + "epoch": 0.9966842787285156, + "grad_norm": 0.33845800161361694, + "learning_rate": 5.389005796019309e-10, + "loss": 0.2226, + "step": 53686 + }, + { + "epoch": 0.9967214088659342, + "grad_norm": 0.5561087727546692, + "learning_rate": 5.268585943718485e-10, + "loss": 0.5634, + "step": 53688 + }, + { + "epoch": 0.9967585390033529, + "grad_norm": 0.5833976864814758, + "learning_rate": 5.149526720205078e-10, + "loss": 0.2883, + "step": 53690 + }, + { + "epoch": 0.9967956691407714, + "grad_norm": 0.4740462005138397, + "learning_rate": 5.031828127077809e-10, + "loss": 0.2769, + "step": 53692 + }, + { + "epoch": 0.9968327992781901, + "grad_norm": 0.5102601051330566, + "learning_rate": 4.915490165946501e-10, + "loss": 0.3672, + "step": 53694 + }, + { + "epoch": 0.9968699294156088, + "grad_norm": 0.28935521841049194, + "learning_rate": 4.800512838398774e-10, + "loss": 0.3387, + "step": 53696 + }, + { + "epoch": 0.9969070595530274, + "grad_norm": 0.3723895847797394, + "learning_rate": 4.686896146000042e-10, + "loss": 0.1884, + "step": 53698 + }, + { + "epoch": 0.9969441896904461, + "grad_norm": 0.4542303681373596, + "learning_rate": 4.5746400902824116e-10, + "loss": 0.2482, + "step": 53700 + }, + { + "epoch": 0.9969813198278646, + "grad_norm": 0.2907441556453705, + "learning_rate": 4.4637446727890944e-10, + "loss": 0.1688, + "step": 53702 + }, + { + "epoch": 0.9970184499652833, + "grad_norm": 0.4606148898601532, + "learning_rate": 4.354209895029993e-10, + "loss": 0.1532, + "step": 53704 + }, + { + "epoch": 0.997055580102702, + "grad_norm": 0.3624784052371979, + "learning_rate": 4.2460357584817034e-10, + "loss": 0.1822, + "step": 53706 + }, + { + "epoch": 0.9970927102401206, + "grad_norm": 0.5269354581832886, + "learning_rate": 4.1392222646208236e-10, + "loss": 0.3226, + "step": 53708 + }, + { + "epoch": 0.9971298403775393, + "grad_norm": 0.3577289581298828, + "learning_rate": 4.033769414901745e-10, + "loss": 0.2755, + "step": 53710 + }, + { + "epoch": 0.9971669705149578, + "grad_norm": 0.37310826778411865, + "learning_rate": 3.929677210767757e-10, + "loss": 0.458, + "step": 53712 + }, + { + "epoch": 0.9972041006523765, + "grad_norm": 0.9955070614814758, + "learning_rate": 3.826945653617742e-10, + "loss": 0.3687, + "step": 53714 + }, + { + "epoch": 0.9972412307897952, + "grad_norm": 0.4011007845401764, + "learning_rate": 3.725574744861682e-10, + "loss": 0.2351, + "step": 53716 + }, + { + "epoch": 0.9972783609272138, + "grad_norm": 0.3228796124458313, + "learning_rate": 3.625564485887356e-10, + "loss": 0.3916, + "step": 53718 + }, + { + "epoch": 0.9973154910646325, + "grad_norm": 0.465603768825531, + "learning_rate": 3.5269148780381347e-10, + "loss": 0.2046, + "step": 53720 + }, + { + "epoch": 0.997352621202051, + "grad_norm": 0.4606330692768097, + "learning_rate": 3.429625922668489e-10, + "loss": 0.2519, + "step": 53722 + }, + { + "epoch": 0.9973897513394697, + "grad_norm": 0.2891654968261719, + "learning_rate": 3.333697621088483e-10, + "loss": 0.2871, + "step": 53724 + }, + { + "epoch": 0.9974268814768883, + "grad_norm": 0.3046843409538269, + "learning_rate": 3.2391299746192817e-10, + "loss": 0.2035, + "step": 53726 + }, + { + "epoch": 0.997464011614307, + "grad_norm": 0.43812593817710876, + "learning_rate": 3.1459229845376416e-10, + "loss": 0.4661, + "step": 53728 + }, + { + "epoch": 0.9975011417517257, + "grad_norm": 0.4280153810977936, + "learning_rate": 3.0540766521092167e-10, + "loss": 0.3746, + "step": 53730 + }, + { + "epoch": 0.9975382718891442, + "grad_norm": 0.24807289242744446, + "learning_rate": 2.963590978599662e-10, + "loss": 0.1999, + "step": 53732 + }, + { + "epoch": 0.9975754020265629, + "grad_norm": 0.3842063546180725, + "learning_rate": 2.87446596521912e-10, + "loss": 0.3784, + "step": 53734 + }, + { + "epoch": 0.9976125321639815, + "grad_norm": 0.5945963263511658, + "learning_rate": 2.786701613199938e-10, + "loss": 0.3559, + "step": 53736 + }, + { + "epoch": 0.9976496623014002, + "grad_norm": 0.32852858304977417, + "learning_rate": 2.700297923718953e-10, + "loss": 0.2284, + "step": 53738 + }, + { + "epoch": 0.9976867924388189, + "grad_norm": 0.5375715494155884, + "learning_rate": 2.6152548979752055e-10, + "loss": 0.4906, + "step": 53740 + }, + { + "epoch": 0.9977239225762374, + "grad_norm": 0.34251654148101807, + "learning_rate": 2.5315725371011234e-10, + "loss": 0.1663, + "step": 53742 + }, + { + "epoch": 0.9977610527136561, + "grad_norm": 0.41482555866241455, + "learning_rate": 2.449250842240236e-10, + "loss": 0.3223, + "step": 53744 + }, + { + "epoch": 0.9977981828510747, + "grad_norm": 0.30851998925209045, + "learning_rate": 2.368289814524971e-10, + "loss": 0.3236, + "step": 53746 + }, + { + "epoch": 0.9978353129884934, + "grad_norm": 0.42594292759895325, + "learning_rate": 2.288689455043347e-10, + "loss": 0.3947, + "step": 53748 + }, + { + "epoch": 0.9978724431259121, + "grad_norm": 0.6044430732727051, + "learning_rate": 2.2104497648944845e-10, + "loss": 0.3306, + "step": 53750 + }, + { + "epoch": 0.9979095732633306, + "grad_norm": 0.719402015209198, + "learning_rate": 2.1335707451219934e-10, + "loss": 0.3729, + "step": 53752 + }, + { + "epoch": 0.9979467034007493, + "grad_norm": 0.48660847544670105, + "learning_rate": 2.0580523967916877e-10, + "loss": 0.2041, + "step": 53754 + }, + { + "epoch": 0.9979838335381679, + "grad_norm": 0.40611547231674194, + "learning_rate": 1.9838947209249725e-10, + "loss": 0.4153, + "step": 53756 + }, + { + "epoch": 0.9980209636755866, + "grad_norm": 0.46043530106544495, + "learning_rate": 1.9110977185210489e-10, + "loss": 0.3609, + "step": 53758 + }, + { + "epoch": 0.9980580938130053, + "grad_norm": 0.41980114579200745, + "learning_rate": 1.8396613905791173e-10, + "loss": 0.3069, + "step": 53760 + }, + { + "epoch": 0.9980952239504238, + "grad_norm": 0.32720959186553955, + "learning_rate": 1.7695857380761738e-10, + "loss": 0.1981, + "step": 53762 + }, + { + "epoch": 0.9981323540878425, + "grad_norm": 0.3697236180305481, + "learning_rate": 1.7008707619559083e-10, + "loss": 0.2896, + "step": 53764 + }, + { + "epoch": 0.9981694842252611, + "grad_norm": 0.6462329626083374, + "learning_rate": 1.6335164631620105e-10, + "loss": 0.2578, + "step": 53766 + }, + { + "epoch": 0.9982066143626798, + "grad_norm": 0.4427346885204315, + "learning_rate": 1.5675228425937605e-10, + "loss": 0.2741, + "step": 53768 + }, + { + "epoch": 0.9982437445000983, + "grad_norm": 0.2270592898130417, + "learning_rate": 1.5028899011726438e-10, + "loss": 0.289, + "step": 53770 + }, + { + "epoch": 0.998280874637517, + "grad_norm": 0.28622016310691833, + "learning_rate": 1.4396176397646345e-10, + "loss": 0.1953, + "step": 53772 + }, + { + "epoch": 0.9983180047749357, + "grad_norm": 0.32890117168426514, + "learning_rate": 1.377706059235706e-10, + "loss": 0.2047, + "step": 53774 + }, + { + "epoch": 0.9983551349123543, + "grad_norm": 0.4771745204925537, + "learning_rate": 1.3171551604185263e-10, + "loss": 0.2669, + "step": 53776 + }, + { + "epoch": 0.998392265049773, + "grad_norm": 0.4160178601741791, + "learning_rate": 1.257964944145762e-10, + "loss": 0.2299, + "step": 53778 + }, + { + "epoch": 0.9984293951871915, + "grad_norm": 0.27709200978279114, + "learning_rate": 1.2001354112278762e-10, + "loss": 0.1718, + "step": 53780 + }, + { + "epoch": 0.9984665253246102, + "grad_norm": 0.5008622407913208, + "learning_rate": 1.143666562442025e-10, + "loss": 0.499, + "step": 53782 + }, + { + "epoch": 0.9985036554620289, + "grad_norm": 0.4462430775165558, + "learning_rate": 1.0885583985542625e-10, + "loss": 0.4113, + "step": 53784 + }, + { + "epoch": 0.9985407855994475, + "grad_norm": 0.5909548997879028, + "learning_rate": 1.03481092031954e-10, + "loss": 0.1594, + "step": 53786 + }, + { + "epoch": 0.9985779157368662, + "grad_norm": 0.21251781284809113, + "learning_rate": 9.82424128481707e-11, + "loss": 0.2595, + "step": 53788 + }, + { + "epoch": 0.9986150458742847, + "grad_norm": 0.5942063331604004, + "learning_rate": 9.31398023729102e-11, + "loss": 0.1757, + "step": 53790 + }, + { + "epoch": 0.9986521760117034, + "grad_norm": 0.3929668068885803, + "learning_rate": 8.817326067722675e-11, + "loss": 0.151, + "step": 53792 + }, + { + "epoch": 0.9986893061491221, + "grad_norm": 0.23832613229751587, + "learning_rate": 8.334278782773376e-11, + "loss": 0.1724, + "step": 53794 + }, + { + "epoch": 0.9987264362865407, + "grad_norm": 0.4586021900177002, + "learning_rate": 7.86483838921548e-11, + "loss": 0.3743, + "step": 53796 + }, + { + "epoch": 0.9987635664239594, + "grad_norm": 0.4723970592021942, + "learning_rate": 7.409004893155214e-11, + "loss": 0.1855, + "step": 53798 + }, + { + "epoch": 0.9988006965613779, + "grad_norm": 0.3699887692928314, + "learning_rate": 6.966778301031874e-11, + "loss": 0.2648, + "step": 53800 + }, + { + "epoch": 0.9988378266987966, + "grad_norm": 0.518562912940979, + "learning_rate": 6.53815861872964e-11, + "loss": 0.0801, + "step": 53802 + }, + { + "epoch": 0.9988749568362153, + "grad_norm": 0.5363730192184448, + "learning_rate": 6.123145852021672e-11, + "loss": 0.4228, + "step": 53804 + }, + { + "epoch": 0.9989120869736339, + "grad_norm": 0.5990108251571655, + "learning_rate": 5.7217400067921534e-11, + "loss": 0.2689, + "step": 53806 + }, + { + "epoch": 0.9989492171110526, + "grad_norm": 0.5347096920013428, + "learning_rate": 5.333941088259131e-11, + "loss": 0.2134, + "step": 53808 + }, + { + "epoch": 0.9989863472484711, + "grad_norm": 0.2717122435569763, + "learning_rate": 4.9597491018626984e-11, + "loss": 0.158, + "step": 53810 + }, + { + "epoch": 0.9990234773858898, + "grad_norm": 0.43843913078308105, + "learning_rate": 4.599164052598859e-11, + "loss": 0.3426, + "step": 53812 + }, + { + "epoch": 0.9990606075233085, + "grad_norm": 0.49701523780822754, + "learning_rate": 4.252185945463616e-11, + "loss": 0.288, + "step": 53814 + }, + { + "epoch": 0.9990977376607271, + "grad_norm": 0.29901742935180664, + "learning_rate": 3.918814785008884e-11, + "loss": 0.1733, + "step": 53816 + }, + { + "epoch": 0.9991348677981458, + "grad_norm": 0.46045032143592834, + "learning_rate": 3.5990505760086226e-11, + "loss": 0.2785, + "step": 53818 + }, + { + "epoch": 0.9991719979355643, + "grad_norm": 0.22243620455265045, + "learning_rate": 3.292893322570656e-11, + "loss": 0.2369, + "step": 53820 + }, + { + "epoch": 0.999209128072983, + "grad_norm": 0.3235505223274231, + "learning_rate": 3.000343029135877e-11, + "loss": 0.1959, + "step": 53822 + }, + { + "epoch": 0.9992462582104016, + "grad_norm": 0.5686267614364624, + "learning_rate": 2.7213996993680213e-11, + "loss": 0.3031, + "step": 53824 + }, + { + "epoch": 0.9992833883478203, + "grad_norm": 0.5182155966758728, + "learning_rate": 2.456063337263892e-11, + "loss": 0.3971, + "step": 53826 + }, + { + "epoch": 0.999320518485239, + "grad_norm": 0.29292815923690796, + "learning_rate": 2.2043339464872248e-11, + "loss": 0.2562, + "step": 53828 + }, + { + "epoch": 0.9993576486226575, + "grad_norm": 0.5193690657615662, + "learning_rate": 1.9662115302576667e-11, + "loss": 0.3402, + "step": 53830 + }, + { + "epoch": 0.9993947787600762, + "grad_norm": 0.503534197807312, + "learning_rate": 1.741696092016909e-11, + "loss": 0.3503, + "step": 53832 + }, + { + "epoch": 0.9994319088974948, + "grad_norm": 0.5535789728164673, + "learning_rate": 1.5307876346515315e-11, + "loss": 0.2963, + "step": 53834 + }, + { + "epoch": 0.9994690390349135, + "grad_norm": 0.48619726300239563, + "learning_rate": 1.3334861611591365e-11, + "loss": 0.3453, + "step": 53836 + }, + { + "epoch": 0.9995061691723321, + "grad_norm": 0.45374229550361633, + "learning_rate": 1.1497916742042592e-11, + "loss": 0.3414, + "step": 53838 + }, + { + "epoch": 0.9995432993097507, + "grad_norm": 0.34518998861312866, + "learning_rate": 9.797041761183678e-12, + "loss": 0.2941, + "step": 53840 + }, + { + "epoch": 0.9995804294471694, + "grad_norm": 0.37265685200691223, + "learning_rate": 8.232236694549756e-12, + "loss": 0.2017, + "step": 53842 + }, + { + "epoch": 0.999617559584588, + "grad_norm": 0.7281001806259155, + "learning_rate": 6.80350156323506e-12, + "loss": 0.214, + "step": 53844 + }, + { + "epoch": 0.9996546897220067, + "grad_norm": 0.3920292854309082, + "learning_rate": 5.51083638500316e-12, + "loss": 0.386, + "step": 53846 + }, + { + "epoch": 0.9996918198594253, + "grad_norm": 0.46180394291877747, + "learning_rate": 4.354241177617624e-12, + "loss": 0.2327, + "step": 53848 + }, + { + "epoch": 0.9997289499968439, + "grad_norm": 0.5219993591308594, + "learning_rate": 3.333715958842021e-12, + "loss": 0.1978, + "step": 53850 + }, + { + "epoch": 0.9997660801342626, + "grad_norm": 0.420766681432724, + "learning_rate": 2.4492607408888035e-12, + "loss": 0.0943, + "step": 53852 + }, + { + "epoch": 0.9998032102716812, + "grad_norm": 0.3140062093734741, + "learning_rate": 1.7008755348602025e-12, + "loss": 0.2727, + "step": 53854 + }, + { + "epoch": 0.9998403404090999, + "grad_norm": 0.23232311010360718, + "learning_rate": 1.0885603540788936e-12, + "loss": 0.2094, + "step": 53856 + }, + { + "epoch": 0.9998774705465185, + "grad_norm": 0.4728757441043854, + "learning_rate": 6.123152040959923e-13, + "loss": 0.2055, + "step": 53858 + }, + { + "epoch": 0.9999146006839371, + "grad_norm": 0.24474041163921356, + "learning_rate": 2.7214009268305974e-13, + "loss": 0.196, + "step": 53860 + }, + { + "epoch": 0.9999517308213558, + "grad_norm": 0.3812914490699768, + "learning_rate": 6.803502317076493e-14, + "loss": 0.2148, + "step": 53862 + }, + { + "epoch": 0.9999888609587744, + "grad_norm": 0.2657621502876282, + "learning_rate": 0.0, + "loss": 0.2966, + "step": 53864 + }, + { + "epoch": 0.9999888609587744, + "step": 53864, + "total_flos": 4.172909200980178e+19, + "train_loss": 0.30097950914072474, + "train_runtime": 327326.476, + "train_samples_per_second": 3.291, + "train_steps_per_second": 0.165 + } + ], + "logging_steps": 2, + "max_steps": 53864, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 15000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.172909200980178e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}