{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.222222222222222, "eval_steps": 2250, "global_step": 9500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044444444444444447, "grad_norm": 0.7025980353355408, "learning_rate": 2e-05, "loss": 3.0289, "step": 1 }, { "epoch": 0.0008888888888888889, "grad_norm": 0.8140734434127808, "learning_rate": 4e-05, "loss": 3.1428, "step": 2 }, { "epoch": 0.0013333333333333333, "grad_norm": 0.8431426286697388, "learning_rate": 6e-05, "loss": 2.759, "step": 3 }, { "epoch": 0.0017777777777777779, "grad_norm": 0.8204770088195801, "learning_rate": 8e-05, "loss": 3.0823, "step": 4 }, { "epoch": 0.0022222222222222222, "grad_norm": 0.7714145183563232, "learning_rate": 0.0001, "loss": 2.9085, "step": 5 }, { "epoch": 0.0026666666666666666, "grad_norm": 0.8233251571655273, "learning_rate": 0.00012, "loss": 2.926, "step": 6 }, { "epoch": 0.003111111111111111, "grad_norm": 0.919024646282196, "learning_rate": 0.00014, "loss": 2.9215, "step": 7 }, { "epoch": 0.0035555555555555557, "grad_norm": 0.996406078338623, "learning_rate": 0.00016, "loss": 2.6709, "step": 8 }, { "epoch": 0.004, "grad_norm": 1.156698226928711, "learning_rate": 0.00018, "loss": 2.936, "step": 9 }, { "epoch": 0.0044444444444444444, "grad_norm": 1.6368826627731323, "learning_rate": 0.0002, "loss": 3.0452, "step": 10 }, { "epoch": 0.004888888888888889, "grad_norm": 1.3638646602630615, "learning_rate": 0.00019998220640569397, "loss": 2.7221, "step": 11 }, { "epoch": 0.005333333333333333, "grad_norm": 1.2367130517959595, "learning_rate": 0.00019996441281138792, "loss": 3.1021, "step": 12 }, { "epoch": 0.0057777777777777775, "grad_norm": 1.1429595947265625, "learning_rate": 0.00019994661921708185, "loss": 2.5428, "step": 13 }, { "epoch": 0.006222222222222222, "grad_norm": 1.3077034950256348, "learning_rate": 0.0001999288256227758, "loss": 2.7311, "step": 14 }, { "epoch": 0.006666666666666667, "grad_norm": 1.2416568994522095, "learning_rate": 0.00019991103202846976, "loss": 2.3485, "step": 15 }, { "epoch": 0.0071111111111111115, "grad_norm": 1.2917921543121338, "learning_rate": 0.00019989323843416372, "loss": 2.6843, "step": 16 }, { "epoch": 0.007555555555555556, "grad_norm": 1.388919711112976, "learning_rate": 0.00019987544483985765, "loss": 2.5847, "step": 17 }, { "epoch": 0.008, "grad_norm": 1.4038573503494263, "learning_rate": 0.0001998576512455516, "loss": 2.8317, "step": 18 }, { "epoch": 0.008444444444444444, "grad_norm": 1.3480195999145508, "learning_rate": 0.00019983985765124556, "loss": 2.9627, "step": 19 }, { "epoch": 0.008888888888888889, "grad_norm": 1.1358228921890259, "learning_rate": 0.0001998220640569395, "loss": 2.6335, "step": 20 }, { "epoch": 0.009333333333333334, "grad_norm": 1.3050692081451416, "learning_rate": 0.00019980427046263345, "loss": 2.7682, "step": 21 }, { "epoch": 0.009777777777777778, "grad_norm": 1.1249107122421265, "learning_rate": 0.0001997864768683274, "loss": 2.5659, "step": 22 }, { "epoch": 0.010222222222222223, "grad_norm": 2.0670816898345947, "learning_rate": 0.00019976868327402136, "loss": 3.6833, "step": 23 }, { "epoch": 0.010666666666666666, "grad_norm": 1.3983336687088013, "learning_rate": 0.00019975088967971532, "loss": 2.2004, "step": 24 }, { "epoch": 0.011111111111111112, "grad_norm": 1.2295548915863037, "learning_rate": 0.00019973309608540928, "loss": 2.5752, "step": 25 }, { "epoch": 0.011555555555555555, "grad_norm": 1.4657237529754639, "learning_rate": 0.0001997153024911032, "loss": 2.5437, "step": 26 }, { "epoch": 0.012, "grad_norm": 1.2808438539505005, "learning_rate": 0.00019969750889679716, "loss": 2.5708, "step": 27 }, { "epoch": 0.012444444444444444, "grad_norm": 1.811769723892212, "learning_rate": 0.00019967971530249112, "loss": 2.7687, "step": 28 }, { "epoch": 0.012888888888888889, "grad_norm": 1.4724963903427124, "learning_rate": 0.00019966192170818507, "loss": 2.4946, "step": 29 }, { "epoch": 0.013333333333333334, "grad_norm": 1.31075119972229, "learning_rate": 0.000199644128113879, "loss": 2.2154, "step": 30 }, { "epoch": 0.013777777777777778, "grad_norm": 1.5601913928985596, "learning_rate": 0.00019962633451957296, "loss": 2.9217, "step": 31 }, { "epoch": 0.014222222222222223, "grad_norm": 1.3747111558914185, "learning_rate": 0.00019960854092526692, "loss": 2.921, "step": 32 }, { "epoch": 0.014666666666666666, "grad_norm": 1.3256399631500244, "learning_rate": 0.00019959074733096085, "loss": 2.664, "step": 33 }, { "epoch": 0.015111111111111112, "grad_norm": 1.421920657157898, "learning_rate": 0.0001995729537366548, "loss": 2.4761, "step": 34 }, { "epoch": 0.015555555555555555, "grad_norm": 1.6060773134231567, "learning_rate": 0.00019955516014234876, "loss": 2.585, "step": 35 }, { "epoch": 0.016, "grad_norm": 1.2819411754608154, "learning_rate": 0.00019953736654804272, "loss": 2.4084, "step": 36 }, { "epoch": 0.016444444444444446, "grad_norm": 1.6595838069915771, "learning_rate": 0.00019951957295373667, "loss": 2.9492, "step": 37 }, { "epoch": 0.016888888888888887, "grad_norm": 1.7327772378921509, "learning_rate": 0.00019950177935943063, "loss": 2.4173, "step": 38 }, { "epoch": 0.017333333333333333, "grad_norm": 1.7617017030715942, "learning_rate": 0.00019948398576512456, "loss": 2.0839, "step": 39 }, { "epoch": 0.017777777777777778, "grad_norm": 1.677414894104004, "learning_rate": 0.00019946619217081851, "loss": 2.3126, "step": 40 }, { "epoch": 0.018222222222222223, "grad_norm": 1.3741059303283691, "learning_rate": 0.00019944839857651247, "loss": 2.5466, "step": 41 }, { "epoch": 0.018666666666666668, "grad_norm": 1.7205222845077515, "learning_rate": 0.00019943060498220643, "loss": 2.7476, "step": 42 }, { "epoch": 0.01911111111111111, "grad_norm": 1.9464102983474731, "learning_rate": 0.00019941281138790036, "loss": 2.8767, "step": 43 }, { "epoch": 0.019555555555555555, "grad_norm": 1.5541801452636719, "learning_rate": 0.00019939501779359431, "loss": 2.4521, "step": 44 }, { "epoch": 0.02, "grad_norm": 1.6664501428604126, "learning_rate": 0.00019937722419928827, "loss": 2.1682, "step": 45 }, { "epoch": 0.020444444444444446, "grad_norm": 1.555294394493103, "learning_rate": 0.0001993594306049822, "loss": 2.6263, "step": 46 }, { "epoch": 0.020888888888888887, "grad_norm": 1.8338146209716797, "learning_rate": 0.00019934163701067616, "loss": 2.6912, "step": 47 }, { "epoch": 0.021333333333333333, "grad_norm": 1.5415942668914795, "learning_rate": 0.0001993238434163701, "loss": 2.5132, "step": 48 }, { "epoch": 0.021777777777777778, "grad_norm": 1.7458958625793457, "learning_rate": 0.00019930604982206407, "loss": 2.6587, "step": 49 }, { "epoch": 0.022222222222222223, "grad_norm": 5.483662128448486, "learning_rate": 0.00019928825622775803, "loss": 2.7683, "step": 50 }, { "epoch": 0.02266666666666667, "grad_norm": 0.7813481688499451, "learning_rate": 0.00019927046263345198, "loss": 2.3085, "step": 51 }, { "epoch": 0.02311111111111111, "grad_norm": 0.9960127472877502, "learning_rate": 0.0001992526690391459, "loss": 2.4272, "step": 52 }, { "epoch": 0.023555555555555555, "grad_norm": 1.1653705835342407, "learning_rate": 0.00019923487544483987, "loss": 2.7689, "step": 53 }, { "epoch": 0.024, "grad_norm": 1.2431868314743042, "learning_rate": 0.00019921708185053382, "loss": 2.7339, "step": 54 }, { "epoch": 0.024444444444444446, "grad_norm": 1.22008216381073, "learning_rate": 0.00019919928825622778, "loss": 2.4097, "step": 55 }, { "epoch": 0.024888888888888887, "grad_norm": 1.1046444177627563, "learning_rate": 0.0001991814946619217, "loss": 2.5895, "step": 56 }, { "epoch": 0.025333333333333333, "grad_norm": 1.812303066253662, "learning_rate": 0.00019916370106761567, "loss": 2.7829, "step": 57 }, { "epoch": 0.025777777777777778, "grad_norm": 1.241114854812622, "learning_rate": 0.00019914590747330962, "loss": 2.4605, "step": 58 }, { "epoch": 0.026222222222222223, "grad_norm": 1.2486529350280762, "learning_rate": 0.00019912811387900355, "loss": 2.5066, "step": 59 }, { "epoch": 0.02666666666666667, "grad_norm": 1.2300881147384644, "learning_rate": 0.0001991103202846975, "loss": 2.676, "step": 60 }, { "epoch": 0.02711111111111111, "grad_norm": 1.840987205505371, "learning_rate": 0.00019909252669039147, "loss": 2.5316, "step": 61 }, { "epoch": 0.027555555555555555, "grad_norm": 1.1190531253814697, "learning_rate": 0.00019907473309608542, "loss": 2.3772, "step": 62 }, { "epoch": 0.028, "grad_norm": 1.255723476409912, "learning_rate": 0.00019905693950177938, "loss": 2.4208, "step": 63 }, { "epoch": 0.028444444444444446, "grad_norm": 1.0897091627120972, "learning_rate": 0.00019903914590747334, "loss": 2.5596, "step": 64 }, { "epoch": 0.028888888888888888, "grad_norm": 1.1145886182785034, "learning_rate": 0.00019902135231316726, "loss": 2.428, "step": 65 }, { "epoch": 0.029333333333333333, "grad_norm": 1.608787178993225, "learning_rate": 0.00019900355871886122, "loss": 2.956, "step": 66 }, { "epoch": 0.029777777777777778, "grad_norm": 1.2278952598571777, "learning_rate": 0.00019898576512455518, "loss": 2.9204, "step": 67 }, { "epoch": 0.030222222222222223, "grad_norm": 1.2748639583587646, "learning_rate": 0.00019896797153024913, "loss": 2.8291, "step": 68 }, { "epoch": 0.030666666666666665, "grad_norm": 1.2492533922195435, "learning_rate": 0.00019895017793594306, "loss": 2.8622, "step": 69 }, { "epoch": 0.03111111111111111, "grad_norm": 1.2926253080368042, "learning_rate": 0.00019893238434163702, "loss": 2.7478, "step": 70 }, { "epoch": 0.03155555555555556, "grad_norm": 1.654407024383545, "learning_rate": 0.00019891459074733098, "loss": 2.4137, "step": 71 }, { "epoch": 0.032, "grad_norm": 1.4059809446334839, "learning_rate": 0.0001988967971530249, "loss": 3.0662, "step": 72 }, { "epoch": 0.03244444444444444, "grad_norm": 1.5277940034866333, "learning_rate": 0.00019887900355871886, "loss": 2.4363, "step": 73 }, { "epoch": 0.03288888888888889, "grad_norm": 1.141005277633667, "learning_rate": 0.00019886120996441282, "loss": 2.4846, "step": 74 }, { "epoch": 0.03333333333333333, "grad_norm": 1.2516539096832275, "learning_rate": 0.00019884341637010678, "loss": 2.3899, "step": 75 }, { "epoch": 0.033777777777777775, "grad_norm": 1.2361774444580078, "learning_rate": 0.00019882562277580073, "loss": 2.4988, "step": 76 }, { "epoch": 0.03422222222222222, "grad_norm": 1.2464451789855957, "learning_rate": 0.0001988078291814947, "loss": 2.4915, "step": 77 }, { "epoch": 0.034666666666666665, "grad_norm": 1.488785982131958, "learning_rate": 0.00019879003558718862, "loss": 2.6796, "step": 78 }, { "epoch": 0.035111111111111114, "grad_norm": 1.3093085289001465, "learning_rate": 0.00019877224199288257, "loss": 2.6892, "step": 79 }, { "epoch": 0.035555555555555556, "grad_norm": 1.1957430839538574, "learning_rate": 0.00019875444839857653, "loss": 2.5761, "step": 80 }, { "epoch": 0.036, "grad_norm": 1.3030658960342407, "learning_rate": 0.00019873665480427046, "loss": 2.6818, "step": 81 }, { "epoch": 0.036444444444444446, "grad_norm": 1.2497376203536987, "learning_rate": 0.00019871886120996442, "loss": 2.2396, "step": 82 }, { "epoch": 0.03688888888888889, "grad_norm": 1.1968713998794556, "learning_rate": 0.00019870106761565837, "loss": 2.6936, "step": 83 }, { "epoch": 0.037333333333333336, "grad_norm": 1.5414577722549438, "learning_rate": 0.00019868327402135233, "loss": 2.4236, "step": 84 }, { "epoch": 0.03777777777777778, "grad_norm": 1.4061691761016846, "learning_rate": 0.00019866548042704626, "loss": 2.0842, "step": 85 }, { "epoch": 0.03822222222222222, "grad_norm": 1.3697423934936523, "learning_rate": 0.00019864768683274022, "loss": 2.6033, "step": 86 }, { "epoch": 0.03866666666666667, "grad_norm": 1.4249686002731323, "learning_rate": 0.00019862989323843417, "loss": 2.3001, "step": 87 }, { "epoch": 0.03911111111111111, "grad_norm": 1.449453592300415, "learning_rate": 0.00019861209964412813, "loss": 2.7934, "step": 88 }, { "epoch": 0.03955555555555555, "grad_norm": 1.6140450239181519, "learning_rate": 0.00019859430604982209, "loss": 2.9546, "step": 89 }, { "epoch": 0.04, "grad_norm": 1.3802794218063354, "learning_rate": 0.00019857651245551604, "loss": 2.6017, "step": 90 }, { "epoch": 0.04044444444444444, "grad_norm": 1.4572910070419312, "learning_rate": 0.00019855871886120997, "loss": 2.9772, "step": 91 }, { "epoch": 0.04088888888888889, "grad_norm": 1.6315029859542847, "learning_rate": 0.00019854092526690393, "loss": 3.0303, "step": 92 }, { "epoch": 0.04133333333333333, "grad_norm": 1.6254206895828247, "learning_rate": 0.00019852313167259788, "loss": 2.9715, "step": 93 }, { "epoch": 0.041777777777777775, "grad_norm": 1.2871061563491821, "learning_rate": 0.0001985053380782918, "loss": 2.3368, "step": 94 }, { "epoch": 0.042222222222222223, "grad_norm": 1.4380096197128296, "learning_rate": 0.00019848754448398577, "loss": 2.5723, "step": 95 }, { "epoch": 0.042666666666666665, "grad_norm": 1.5760232210159302, "learning_rate": 0.00019846975088967973, "loss": 2.5501, "step": 96 }, { "epoch": 0.043111111111111114, "grad_norm": 1.6527888774871826, "learning_rate": 0.00019845195729537368, "loss": 2.6406, "step": 97 }, { "epoch": 0.043555555555555556, "grad_norm": 1.8810604810714722, "learning_rate": 0.0001984341637010676, "loss": 2.8451, "step": 98 }, { "epoch": 0.044, "grad_norm": 2.4290010929107666, "learning_rate": 0.00019841637010676157, "loss": 2.3485, "step": 99 }, { "epoch": 0.044444444444444446, "grad_norm": 2.3675355911254883, "learning_rate": 0.00019839857651245553, "loss": 2.8247, "step": 100 }, { "epoch": 0.04488888888888889, "grad_norm": 0.7726467847824097, "learning_rate": 0.00019838078291814948, "loss": 2.5495, "step": 101 }, { "epoch": 0.04533333333333334, "grad_norm": 1.0265134572982788, "learning_rate": 0.00019836298932384344, "loss": 2.7777, "step": 102 }, { "epoch": 0.04577777777777778, "grad_norm": 0.9280586838722229, "learning_rate": 0.00019834519572953737, "loss": 2.8585, "step": 103 }, { "epoch": 0.04622222222222222, "grad_norm": 0.897459089756012, "learning_rate": 0.00019832740213523132, "loss": 2.5307, "step": 104 }, { "epoch": 0.04666666666666667, "grad_norm": 1.299446702003479, "learning_rate": 0.00019830960854092528, "loss": 2.6633, "step": 105 }, { "epoch": 0.04711111111111111, "grad_norm": 1.0051794052124023, "learning_rate": 0.00019829181494661924, "loss": 2.714, "step": 106 }, { "epoch": 0.04755555555555555, "grad_norm": 1.096691370010376, "learning_rate": 0.00019827402135231317, "loss": 2.5904, "step": 107 }, { "epoch": 0.048, "grad_norm": 0.9809961318969727, "learning_rate": 0.00019825622775800712, "loss": 2.7201, "step": 108 }, { "epoch": 0.04844444444444444, "grad_norm": 1.2203365564346313, "learning_rate": 0.00019823843416370108, "loss": 2.7113, "step": 109 }, { "epoch": 0.04888888888888889, "grad_norm": 1.2605012655258179, "learning_rate": 0.000198220640569395, "loss": 2.7018, "step": 110 }, { "epoch": 0.04933333333333333, "grad_norm": 1.0020304918289185, "learning_rate": 0.00019820284697508897, "loss": 2.4043, "step": 111 }, { "epoch": 0.049777777777777775, "grad_norm": 0.9287395477294922, "learning_rate": 0.00019818505338078292, "loss": 2.1282, "step": 112 }, { "epoch": 0.050222222222222224, "grad_norm": 1.0499564409255981, "learning_rate": 0.00019816725978647688, "loss": 2.724, "step": 113 }, { "epoch": 0.050666666666666665, "grad_norm": 0.9938886165618896, "learning_rate": 0.00019814946619217083, "loss": 2.8035, "step": 114 }, { "epoch": 0.051111111111111114, "grad_norm": 0.9068772196769714, "learning_rate": 0.0001981316725978648, "loss": 2.2173, "step": 115 }, { "epoch": 0.051555555555555556, "grad_norm": 0.9217369556427002, "learning_rate": 0.00019811387900355872, "loss": 2.266, "step": 116 }, { "epoch": 0.052, "grad_norm": 0.9447048306465149, "learning_rate": 0.00019809608540925268, "loss": 2.5042, "step": 117 }, { "epoch": 0.052444444444444446, "grad_norm": 1.1115142107009888, "learning_rate": 0.00019807829181494663, "loss": 2.9463, "step": 118 }, { "epoch": 0.05288888888888889, "grad_norm": 1.0305101871490479, "learning_rate": 0.0001980604982206406, "loss": 2.5201, "step": 119 }, { "epoch": 0.05333333333333334, "grad_norm": 1.2101026773452759, "learning_rate": 0.00019804270462633452, "loss": 2.6229, "step": 120 }, { "epoch": 0.05377777777777778, "grad_norm": 1.163856029510498, "learning_rate": 0.00019802491103202848, "loss": 2.8074, "step": 121 }, { "epoch": 0.05422222222222222, "grad_norm": 1.2083830833435059, "learning_rate": 0.00019800711743772243, "loss": 2.7174, "step": 122 }, { "epoch": 0.05466666666666667, "grad_norm": 1.169143795967102, "learning_rate": 0.00019798932384341636, "loss": 2.3256, "step": 123 }, { "epoch": 0.05511111111111111, "grad_norm": 1.1567578315734863, "learning_rate": 0.00019797153024911032, "loss": 2.4596, "step": 124 }, { "epoch": 0.05555555555555555, "grad_norm": 1.2948107719421387, "learning_rate": 0.00019795373665480427, "loss": 2.5357, "step": 125 }, { "epoch": 0.056, "grad_norm": 1.2189359664916992, "learning_rate": 0.00019793594306049823, "loss": 2.823, "step": 126 }, { "epoch": 0.05644444444444444, "grad_norm": 1.5137442350387573, "learning_rate": 0.0001979181494661922, "loss": 3.2187, "step": 127 }, { "epoch": 0.05688888888888889, "grad_norm": 1.2950530052185059, "learning_rate": 0.00019790035587188614, "loss": 2.6476, "step": 128 }, { "epoch": 0.05733333333333333, "grad_norm": 1.1926493644714355, "learning_rate": 0.00019788256227758007, "loss": 1.9832, "step": 129 }, { "epoch": 0.057777777777777775, "grad_norm": 1.3409109115600586, "learning_rate": 0.00019786476868327403, "loss": 2.5699, "step": 130 }, { "epoch": 0.058222222222222224, "grad_norm": 1.2740064859390259, "learning_rate": 0.000197846975088968, "loss": 2.6332, "step": 131 }, { "epoch": 0.058666666666666666, "grad_norm": 1.7328652143478394, "learning_rate": 0.00019782918149466194, "loss": 2.946, "step": 132 }, { "epoch": 0.059111111111111114, "grad_norm": 1.3805177211761475, "learning_rate": 0.00019781138790035587, "loss": 2.7012, "step": 133 }, { "epoch": 0.059555555555555556, "grad_norm": 1.3198126554489136, "learning_rate": 0.00019779359430604983, "loss": 2.6473, "step": 134 }, { "epoch": 0.06, "grad_norm": 1.4987982511520386, "learning_rate": 0.00019777580071174379, "loss": 2.8902, "step": 135 }, { "epoch": 0.060444444444444446, "grad_norm": 1.4471278190612793, "learning_rate": 0.00019775800711743772, "loss": 2.8084, "step": 136 }, { "epoch": 0.06088888888888889, "grad_norm": 1.1906874179840088, "learning_rate": 0.00019774021352313167, "loss": 2.2734, "step": 137 }, { "epoch": 0.06133333333333333, "grad_norm": 1.349488615989685, "learning_rate": 0.00019772241992882563, "loss": 2.734, "step": 138 }, { "epoch": 0.06177777777777778, "grad_norm": 1.1277025938034058, "learning_rate": 0.00019770462633451958, "loss": 1.7254, "step": 139 }, { "epoch": 0.06222222222222222, "grad_norm": 1.277053713798523, "learning_rate": 0.00019768683274021354, "loss": 2.5841, "step": 140 }, { "epoch": 0.06266666666666666, "grad_norm": 1.358282446861267, "learning_rate": 0.0001976690391459075, "loss": 2.4615, "step": 141 }, { "epoch": 0.06311111111111112, "grad_norm": 1.463334560394287, "learning_rate": 0.00019765124555160143, "loss": 2.8497, "step": 142 }, { "epoch": 0.06355555555555556, "grad_norm": 1.537904143333435, "learning_rate": 0.00019763345195729538, "loss": 2.4246, "step": 143 }, { "epoch": 0.064, "grad_norm": 1.3648548126220703, "learning_rate": 0.00019761565836298934, "loss": 2.6535, "step": 144 }, { "epoch": 0.06444444444444444, "grad_norm": 1.3705697059631348, "learning_rate": 0.0001975978647686833, "loss": 2.6365, "step": 145 }, { "epoch": 0.06488888888888888, "grad_norm": 1.5270709991455078, "learning_rate": 0.00019758007117437723, "loss": 2.7863, "step": 146 }, { "epoch": 0.06533333333333333, "grad_norm": 1.413665771484375, "learning_rate": 0.00019756227758007118, "loss": 2.4596, "step": 147 }, { "epoch": 0.06577777777777778, "grad_norm": 1.7925262451171875, "learning_rate": 0.00019754448398576514, "loss": 3.2978, "step": 148 }, { "epoch": 0.06622222222222222, "grad_norm": 1.5601551532745361, "learning_rate": 0.00019752669039145907, "loss": 2.1796, "step": 149 }, { "epoch": 0.06666666666666667, "grad_norm": 1.8019471168518066, "learning_rate": 0.00019750889679715302, "loss": 2.5945, "step": 150 }, { "epoch": 0.06711111111111111, "grad_norm": 0.816262423992157, "learning_rate": 0.00019749110320284698, "loss": 2.478, "step": 151 }, { "epoch": 0.06755555555555555, "grad_norm": 0.9055171608924866, "learning_rate": 0.00019747330960854094, "loss": 2.6226, "step": 152 }, { "epoch": 0.068, "grad_norm": 0.8419144153594971, "learning_rate": 0.0001974555160142349, "loss": 2.2478, "step": 153 }, { "epoch": 0.06844444444444445, "grad_norm": 1.01926589012146, "learning_rate": 0.00019743772241992885, "loss": 2.8489, "step": 154 }, { "epoch": 0.06888888888888889, "grad_norm": 1.3037279844284058, "learning_rate": 0.00019741992882562278, "loss": 3.1187, "step": 155 }, { "epoch": 0.06933333333333333, "grad_norm": 1.0188137292861938, "learning_rate": 0.00019740213523131674, "loss": 2.6388, "step": 156 }, { "epoch": 0.06977777777777777, "grad_norm": 1.1099557876586914, "learning_rate": 0.0001973843416370107, "loss": 2.7675, "step": 157 }, { "epoch": 0.07022222222222223, "grad_norm": 0.9259713292121887, "learning_rate": 0.00019736654804270465, "loss": 2.7232, "step": 158 }, { "epoch": 0.07066666666666667, "grad_norm": 1.0099951028823853, "learning_rate": 0.00019734875444839858, "loss": 2.3996, "step": 159 }, { "epoch": 0.07111111111111111, "grad_norm": 1.045190691947937, "learning_rate": 0.00019733096085409254, "loss": 2.5773, "step": 160 }, { "epoch": 0.07155555555555555, "grad_norm": 1.1050662994384766, "learning_rate": 0.0001973131672597865, "loss": 2.1176, "step": 161 }, { "epoch": 0.072, "grad_norm": 1.2864255905151367, "learning_rate": 0.00019729537366548042, "loss": 3.2324, "step": 162 }, { "epoch": 0.07244444444444445, "grad_norm": 1.0812265872955322, "learning_rate": 0.00019727758007117438, "loss": 2.671, "step": 163 }, { "epoch": 0.07288888888888889, "grad_norm": 1.0999687910079956, "learning_rate": 0.00019725978647686833, "loss": 2.7204, "step": 164 }, { "epoch": 0.07333333333333333, "grad_norm": 1.0504425764083862, "learning_rate": 0.0001972419928825623, "loss": 2.5729, "step": 165 }, { "epoch": 0.07377777777777778, "grad_norm": 1.059959053993225, "learning_rate": 0.00019722419928825625, "loss": 2.4222, "step": 166 }, { "epoch": 0.07422222222222222, "grad_norm": 1.0592875480651855, "learning_rate": 0.0001972064056939502, "loss": 2.4263, "step": 167 }, { "epoch": 0.07466666666666667, "grad_norm": 1.0814061164855957, "learning_rate": 0.00019718861209964413, "loss": 2.5768, "step": 168 }, { "epoch": 0.07511111111111111, "grad_norm": 1.416944980621338, "learning_rate": 0.0001971708185053381, "loss": 3.0183, "step": 169 }, { "epoch": 0.07555555555555556, "grad_norm": 1.0500316619873047, "learning_rate": 0.00019715302491103205, "loss": 2.6289, "step": 170 }, { "epoch": 0.076, "grad_norm": 1.2884352207183838, "learning_rate": 0.000197135231316726, "loss": 2.5171, "step": 171 }, { "epoch": 0.07644444444444444, "grad_norm": 1.3116530179977417, "learning_rate": 0.00019711743772241993, "loss": 3.0034, "step": 172 }, { "epoch": 0.0768888888888889, "grad_norm": 1.2504932880401611, "learning_rate": 0.0001970996441281139, "loss": 2.5675, "step": 173 }, { "epoch": 0.07733333333333334, "grad_norm": 1.1547982692718506, "learning_rate": 0.00019708185053380785, "loss": 2.301, "step": 174 }, { "epoch": 0.07777777777777778, "grad_norm": 1.1116724014282227, "learning_rate": 0.00019706405693950177, "loss": 2.4003, "step": 175 }, { "epoch": 0.07822222222222222, "grad_norm": 1.109155297279358, "learning_rate": 0.00019704626334519573, "loss": 2.2188, "step": 176 }, { "epoch": 0.07866666666666666, "grad_norm": 1.2276736497879028, "learning_rate": 0.0001970284697508897, "loss": 2.5917, "step": 177 }, { "epoch": 0.0791111111111111, "grad_norm": 1.5495067834854126, "learning_rate": 0.00019701067615658364, "loss": 2.5696, "step": 178 }, { "epoch": 0.07955555555555556, "grad_norm": 1.2796157598495483, "learning_rate": 0.0001969928825622776, "loss": 2.3558, "step": 179 }, { "epoch": 0.08, "grad_norm": 1.151888132095337, "learning_rate": 0.00019697508896797156, "loss": 1.779, "step": 180 }, { "epoch": 0.08044444444444444, "grad_norm": 1.2485597133636475, "learning_rate": 0.0001969572953736655, "loss": 2.1977, "step": 181 }, { "epoch": 0.08088888888888889, "grad_norm": 1.392452597618103, "learning_rate": 0.00019693950177935944, "loss": 2.6898, "step": 182 }, { "epoch": 0.08133333333333333, "grad_norm": 1.514426350593567, "learning_rate": 0.0001969217081850534, "loss": 2.8083, "step": 183 }, { "epoch": 0.08177777777777778, "grad_norm": 1.472489356994629, "learning_rate": 0.00019690391459074736, "loss": 2.9328, "step": 184 }, { "epoch": 0.08222222222222222, "grad_norm": 1.5749987363815308, "learning_rate": 0.00019688612099644129, "loss": 2.3176, "step": 185 }, { "epoch": 0.08266666666666667, "grad_norm": 1.5175185203552246, "learning_rate": 0.00019686832740213524, "loss": 2.7677, "step": 186 }, { "epoch": 0.08311111111111111, "grad_norm": 1.286679983139038, "learning_rate": 0.0001968505338078292, "loss": 2.08, "step": 187 }, { "epoch": 0.08355555555555555, "grad_norm": 1.4742923974990845, "learning_rate": 0.00019683274021352313, "loss": 3.0767, "step": 188 }, { "epoch": 0.084, "grad_norm": 1.3634746074676514, "learning_rate": 0.00019681494661921708, "loss": 2.423, "step": 189 }, { "epoch": 0.08444444444444445, "grad_norm": 1.4298174381256104, "learning_rate": 0.00019679715302491104, "loss": 2.1539, "step": 190 }, { "epoch": 0.08488888888888889, "grad_norm": 1.6024513244628906, "learning_rate": 0.000196779359430605, "loss": 2.868, "step": 191 }, { "epoch": 0.08533333333333333, "grad_norm": 1.4882041215896606, "learning_rate": 0.00019676156583629895, "loss": 2.4238, "step": 192 }, { "epoch": 0.08577777777777777, "grad_norm": 1.4021437168121338, "learning_rate": 0.00019674377224199288, "loss": 2.6468, "step": 193 }, { "epoch": 0.08622222222222223, "grad_norm": 1.5105438232421875, "learning_rate": 0.00019672597864768684, "loss": 2.3776, "step": 194 }, { "epoch": 0.08666666666666667, "grad_norm": 1.753899097442627, "learning_rate": 0.0001967081850533808, "loss": 2.4496, "step": 195 }, { "epoch": 0.08711111111111111, "grad_norm": 1.6667553186416626, "learning_rate": 0.00019669039145907475, "loss": 2.7937, "step": 196 }, { "epoch": 0.08755555555555555, "grad_norm": 1.4890007972717285, "learning_rate": 0.00019667259786476868, "loss": 2.7493, "step": 197 }, { "epoch": 0.088, "grad_norm": 1.8673200607299805, "learning_rate": 0.00019665480427046264, "loss": 2.7379, "step": 198 }, { "epoch": 0.08844444444444445, "grad_norm": 2.127183675765991, "learning_rate": 0.0001966370106761566, "loss": 3.239, "step": 199 }, { "epoch": 0.08888888888888889, "grad_norm": 2.974283218383789, "learning_rate": 0.00019661921708185052, "loss": 2.9018, "step": 200 }, { "epoch": 0.08933333333333333, "grad_norm": 0.7585410475730896, "learning_rate": 0.00019660142348754448, "loss": 2.1856, "step": 201 }, { "epoch": 0.08977777777777778, "grad_norm": 0.8838407397270203, "learning_rate": 0.00019658362989323844, "loss": 2.353, "step": 202 }, { "epoch": 0.09022222222222222, "grad_norm": 0.9938530325889587, "learning_rate": 0.0001965658362989324, "loss": 2.5122, "step": 203 }, { "epoch": 0.09066666666666667, "grad_norm": 0.8707981109619141, "learning_rate": 0.00019654804270462635, "loss": 2.5396, "step": 204 }, { "epoch": 0.09111111111111111, "grad_norm": 0.9297839999198914, "learning_rate": 0.0001965302491103203, "loss": 2.7153, "step": 205 }, { "epoch": 0.09155555555555556, "grad_norm": 1.176153302192688, "learning_rate": 0.00019651245551601424, "loss": 2.5667, "step": 206 }, { "epoch": 0.092, "grad_norm": 1.1235204935073853, "learning_rate": 0.0001964946619217082, "loss": 2.5297, "step": 207 }, { "epoch": 0.09244444444444444, "grad_norm": 1.123063564300537, "learning_rate": 0.00019647686832740215, "loss": 2.8088, "step": 208 }, { "epoch": 0.09288888888888888, "grad_norm": 1.0872026681900024, "learning_rate": 0.0001964590747330961, "loss": 2.5135, "step": 209 }, { "epoch": 0.09333333333333334, "grad_norm": 1.1103240251541138, "learning_rate": 0.00019644128113879004, "loss": 2.6343, "step": 210 }, { "epoch": 0.09377777777777778, "grad_norm": 0.998205840587616, "learning_rate": 0.000196423487544484, "loss": 2.5504, "step": 211 }, { "epoch": 0.09422222222222222, "grad_norm": 0.9748513698577881, "learning_rate": 0.00019640569395017795, "loss": 2.2131, "step": 212 }, { "epoch": 0.09466666666666666, "grad_norm": 1.0071059465408325, "learning_rate": 0.00019638790035587188, "loss": 2.2381, "step": 213 }, { "epoch": 0.0951111111111111, "grad_norm": 1.4692164659500122, "learning_rate": 0.00019637010676156583, "loss": 2.6001, "step": 214 }, { "epoch": 0.09555555555555556, "grad_norm": 1.2764703035354614, "learning_rate": 0.0001963523131672598, "loss": 2.7826, "step": 215 }, { "epoch": 0.096, "grad_norm": 1.0466008186340332, "learning_rate": 0.00019633451957295375, "loss": 2.7655, "step": 216 }, { "epoch": 0.09644444444444444, "grad_norm": 1.1789090633392334, "learning_rate": 0.0001963167259786477, "loss": 2.5712, "step": 217 }, { "epoch": 0.09688888888888889, "grad_norm": 1.240662932395935, "learning_rate": 0.00019629893238434166, "loss": 2.7105, "step": 218 }, { "epoch": 0.09733333333333333, "grad_norm": 1.3624532222747803, "learning_rate": 0.0001962811387900356, "loss": 2.7762, "step": 219 }, { "epoch": 0.09777777777777778, "grad_norm": 1.0563603639602661, "learning_rate": 0.00019626334519572955, "loss": 2.3591, "step": 220 }, { "epoch": 0.09822222222222222, "grad_norm": 1.0683754682540894, "learning_rate": 0.0001962455516014235, "loss": 1.9874, "step": 221 }, { "epoch": 0.09866666666666667, "grad_norm": 1.7027884721755981, "learning_rate": 0.00019622775800711746, "loss": 2.5819, "step": 222 }, { "epoch": 0.09911111111111111, "grad_norm": 1.2697211503982544, "learning_rate": 0.0001962099644128114, "loss": 2.5147, "step": 223 }, { "epoch": 0.09955555555555555, "grad_norm": 1.1141654253005981, "learning_rate": 0.00019619217081850534, "loss": 2.196, "step": 224 }, { "epoch": 0.1, "grad_norm": 1.337015986442566, "learning_rate": 0.0001961743772241993, "loss": 2.5916, "step": 225 }, { "epoch": 0.10044444444444445, "grad_norm": 1.3354969024658203, "learning_rate": 0.00019615658362989323, "loss": 2.3433, "step": 226 }, { "epoch": 0.10088888888888889, "grad_norm": 1.2302438020706177, "learning_rate": 0.0001961387900355872, "loss": 2.3541, "step": 227 }, { "epoch": 0.10133333333333333, "grad_norm": 1.8002538681030273, "learning_rate": 0.00019612099644128114, "loss": 2.6135, "step": 228 }, { "epoch": 0.10177777777777777, "grad_norm": 1.4334473609924316, "learning_rate": 0.0001961032028469751, "loss": 2.7333, "step": 229 }, { "epoch": 0.10222222222222223, "grad_norm": 1.5954945087432861, "learning_rate": 0.00019608540925266906, "loss": 2.6602, "step": 230 }, { "epoch": 0.10266666666666667, "grad_norm": 1.3982867002487183, "learning_rate": 0.000196067615658363, "loss": 2.7748, "step": 231 }, { "epoch": 0.10311111111111111, "grad_norm": 1.322675108909607, "learning_rate": 0.00019604982206405694, "loss": 2.4225, "step": 232 }, { "epoch": 0.10355555555555555, "grad_norm": 1.3092737197875977, "learning_rate": 0.0001960320284697509, "loss": 2.479, "step": 233 }, { "epoch": 0.104, "grad_norm": 1.3040847778320312, "learning_rate": 0.00019601423487544486, "loss": 2.6299, "step": 234 }, { "epoch": 0.10444444444444445, "grad_norm": 1.3706668615341187, "learning_rate": 0.0001959964412811388, "loss": 2.2323, "step": 235 }, { "epoch": 0.10488888888888889, "grad_norm": 1.5371273756027222, "learning_rate": 0.00019597864768683274, "loss": 2.8628, "step": 236 }, { "epoch": 0.10533333333333333, "grad_norm": 1.5061933994293213, "learning_rate": 0.0001959608540925267, "loss": 2.962, "step": 237 }, { "epoch": 0.10577777777777778, "grad_norm": 1.4326897859573364, "learning_rate": 0.00019594306049822065, "loss": 2.6172, "step": 238 }, { "epoch": 0.10622222222222222, "grad_norm": 1.7283401489257812, "learning_rate": 0.00019592526690391458, "loss": 3.1021, "step": 239 }, { "epoch": 0.10666666666666667, "grad_norm": 1.5328441858291626, "learning_rate": 0.00019590747330960854, "loss": 2.9195, "step": 240 }, { "epoch": 0.10711111111111112, "grad_norm": 1.4138455390930176, "learning_rate": 0.0001958896797153025, "loss": 2.7317, "step": 241 }, { "epoch": 0.10755555555555556, "grad_norm": 1.336175560951233, "learning_rate": 0.00019587188612099645, "loss": 2.6417, "step": 242 }, { "epoch": 0.108, "grad_norm": 1.5993636846542358, "learning_rate": 0.0001958540925266904, "loss": 2.5646, "step": 243 }, { "epoch": 0.10844444444444444, "grad_norm": 1.464353084564209, "learning_rate": 0.00019583629893238437, "loss": 2.6526, "step": 244 }, { "epoch": 0.10888888888888888, "grad_norm": 1.731520652770996, "learning_rate": 0.0001958185053380783, "loss": 2.5564, "step": 245 }, { "epoch": 0.10933333333333334, "grad_norm": 1.7385467290878296, "learning_rate": 0.00019580071174377225, "loss": 2.8463, "step": 246 }, { "epoch": 0.10977777777777778, "grad_norm": 1.7964988946914673, "learning_rate": 0.0001957829181494662, "loss": 2.7856, "step": 247 }, { "epoch": 0.11022222222222222, "grad_norm": 1.8664138317108154, "learning_rate": 0.00019576512455516017, "loss": 2.6548, "step": 248 }, { "epoch": 0.11066666666666666, "grad_norm": 1.7917791604995728, "learning_rate": 0.0001957473309608541, "loss": 2.5387, "step": 249 }, { "epoch": 0.1111111111111111, "grad_norm": 1.9322112798690796, "learning_rate": 0.00019572953736654805, "loss": 1.9883, "step": 250 }, { "epoch": 0.11155555555555556, "grad_norm": 0.8669712543487549, "learning_rate": 0.000195711743772242, "loss": 2.6931, "step": 251 }, { "epoch": 0.112, "grad_norm": 1.020733118057251, "learning_rate": 0.00019569395017793594, "loss": 3.1699, "step": 252 }, { "epoch": 0.11244444444444444, "grad_norm": 0.8693115711212158, "learning_rate": 0.0001956761565836299, "loss": 2.4069, "step": 253 }, { "epoch": 0.11288888888888889, "grad_norm": 1.043466567993164, "learning_rate": 0.00019565836298932385, "loss": 2.9878, "step": 254 }, { "epoch": 0.11333333333333333, "grad_norm": 1.030266284942627, "learning_rate": 0.0001956405693950178, "loss": 2.6809, "step": 255 }, { "epoch": 0.11377777777777778, "grad_norm": 0.9637128710746765, "learning_rate": 0.00019562277580071176, "loss": 2.2864, "step": 256 }, { "epoch": 0.11422222222222222, "grad_norm": 1.2764060497283936, "learning_rate": 0.00019560498220640572, "loss": 2.5282, "step": 257 }, { "epoch": 0.11466666666666667, "grad_norm": 1.0020864009857178, "learning_rate": 0.00019558718861209965, "loss": 2.6671, "step": 258 }, { "epoch": 0.11511111111111111, "grad_norm": 0.9636043310165405, "learning_rate": 0.0001955693950177936, "loss": 2.6084, "step": 259 }, { "epoch": 0.11555555555555555, "grad_norm": 0.9684137105941772, "learning_rate": 0.00019555160142348756, "loss": 2.6087, "step": 260 }, { "epoch": 0.116, "grad_norm": 0.9589288830757141, "learning_rate": 0.00019553380782918152, "loss": 2.4366, "step": 261 }, { "epoch": 0.11644444444444445, "grad_norm": 0.9377467632293701, "learning_rate": 0.00019551601423487545, "loss": 2.1606, "step": 262 }, { "epoch": 0.11688888888888889, "grad_norm": 1.1523168087005615, "learning_rate": 0.0001954982206405694, "loss": 2.8838, "step": 263 }, { "epoch": 0.11733333333333333, "grad_norm": 1.0509337186813354, "learning_rate": 0.00019548042704626336, "loss": 2.41, "step": 264 }, { "epoch": 0.11777777777777777, "grad_norm": 1.2317067384719849, "learning_rate": 0.0001954626334519573, "loss": 2.5933, "step": 265 }, { "epoch": 0.11822222222222223, "grad_norm": 1.4088350534439087, "learning_rate": 0.00019544483985765125, "loss": 2.7486, "step": 266 }, { "epoch": 0.11866666666666667, "grad_norm": 1.033850073814392, "learning_rate": 0.0001954270462633452, "loss": 2.407, "step": 267 }, { "epoch": 0.11911111111111111, "grad_norm": 1.2386589050292969, "learning_rate": 0.00019540925266903916, "loss": 2.1679, "step": 268 }, { "epoch": 0.11955555555555555, "grad_norm": 1.0948667526245117, "learning_rate": 0.00019539145907473312, "loss": 2.4034, "step": 269 }, { "epoch": 0.12, "grad_norm": 1.3226970434188843, "learning_rate": 0.00019537366548042707, "loss": 2.7977, "step": 270 }, { "epoch": 0.12044444444444445, "grad_norm": 1.3416509628295898, "learning_rate": 0.000195355871886121, "loss": 2.7933, "step": 271 }, { "epoch": 0.12088888888888889, "grad_norm": 1.142828345298767, "learning_rate": 0.00019533807829181496, "loss": 2.1703, "step": 272 }, { "epoch": 0.12133333333333333, "grad_norm": 1.4413820505142212, "learning_rate": 0.00019532028469750892, "loss": 3.1676, "step": 273 }, { "epoch": 0.12177777777777778, "grad_norm": 1.3678597211837769, "learning_rate": 0.00019530249110320287, "loss": 2.5917, "step": 274 }, { "epoch": 0.12222222222222222, "grad_norm": 1.165423035621643, "learning_rate": 0.0001952846975088968, "loss": 2.1922, "step": 275 }, { "epoch": 0.12266666666666666, "grad_norm": 1.458446741104126, "learning_rate": 0.00019526690391459076, "loss": 2.5036, "step": 276 }, { "epoch": 0.12311111111111112, "grad_norm": 1.5351003408432007, "learning_rate": 0.00019524911032028471, "loss": 2.849, "step": 277 }, { "epoch": 0.12355555555555556, "grad_norm": 1.343141794204712, "learning_rate": 0.00019523131672597864, "loss": 2.3158, "step": 278 }, { "epoch": 0.124, "grad_norm": 1.5747337341308594, "learning_rate": 0.0001952135231316726, "loss": 3.3124, "step": 279 }, { "epoch": 0.12444444444444444, "grad_norm": 1.538909912109375, "learning_rate": 0.00019519572953736656, "loss": 2.7005, "step": 280 }, { "epoch": 0.12488888888888888, "grad_norm": 1.4149315357208252, "learning_rate": 0.0001951779359430605, "loss": 1.8815, "step": 281 }, { "epoch": 0.12533333333333332, "grad_norm": 1.2315411567687988, "learning_rate": 0.00019516014234875447, "loss": 2.4296, "step": 282 }, { "epoch": 0.12577777777777777, "grad_norm": 1.5550092458724976, "learning_rate": 0.0001951423487544484, "loss": 3.0271, "step": 283 }, { "epoch": 0.12622222222222224, "grad_norm": 1.3692402839660645, "learning_rate": 0.00019512455516014236, "loss": 2.5704, "step": 284 }, { "epoch": 0.12666666666666668, "grad_norm": 1.4310396909713745, "learning_rate": 0.0001951067615658363, "loss": 2.4914, "step": 285 }, { "epoch": 0.12711111111111112, "grad_norm": 1.283097505569458, "learning_rate": 0.00019508896797153027, "loss": 2.31, "step": 286 }, { "epoch": 0.12755555555555556, "grad_norm": 1.3749858140945435, "learning_rate": 0.00019507117437722422, "loss": 2.2563, "step": 287 }, { "epoch": 0.128, "grad_norm": 1.3623768091201782, "learning_rate": 0.00019505338078291815, "loss": 2.2465, "step": 288 }, { "epoch": 0.12844444444444444, "grad_norm": 1.4308843612670898, "learning_rate": 0.0001950355871886121, "loss": 2.6771, "step": 289 }, { "epoch": 0.1288888888888889, "grad_norm": 1.568965196609497, "learning_rate": 0.00019501779359430604, "loss": 2.3809, "step": 290 }, { "epoch": 0.12933333333333333, "grad_norm": 1.429734468460083, "learning_rate": 0.000195, "loss": 2.6331, "step": 291 }, { "epoch": 0.12977777777777777, "grad_norm": 1.3805055618286133, "learning_rate": 0.00019498220640569395, "loss": 2.2138, "step": 292 }, { "epoch": 0.1302222222222222, "grad_norm": 1.5100244283676147, "learning_rate": 0.0001949644128113879, "loss": 2.1832, "step": 293 }, { "epoch": 0.13066666666666665, "grad_norm": 1.2256643772125244, "learning_rate": 0.00019494661921708187, "loss": 2.1298, "step": 294 }, { "epoch": 0.13111111111111112, "grad_norm": 1.6592442989349365, "learning_rate": 0.00019492882562277582, "loss": 2.5226, "step": 295 }, { "epoch": 0.13155555555555556, "grad_norm": 1.3523834943771362, "learning_rate": 0.00019491103202846975, "loss": 2.3038, "step": 296 }, { "epoch": 0.132, "grad_norm": 1.4921329021453857, "learning_rate": 0.0001948932384341637, "loss": 2.2571, "step": 297 }, { "epoch": 0.13244444444444445, "grad_norm": 1.7550227642059326, "learning_rate": 0.00019487544483985766, "loss": 3.1873, "step": 298 }, { "epoch": 0.1328888888888889, "grad_norm": 1.6412465572357178, "learning_rate": 0.00019485765124555162, "loss": 2.63, "step": 299 }, { "epoch": 0.13333333333333333, "grad_norm": 1.9257419109344482, "learning_rate": 0.00019483985765124558, "loss": 2.4374, "step": 300 }, { "epoch": 0.13377777777777777, "grad_norm": 0.9691830277442932, "learning_rate": 0.0001948220640569395, "loss": 3.1218, "step": 301 }, { "epoch": 0.13422222222222221, "grad_norm": 0.8326031565666199, "learning_rate": 0.00019480427046263346, "loss": 2.5508, "step": 302 }, { "epoch": 0.13466666666666666, "grad_norm": 0.9384410381317139, "learning_rate": 0.0001947864768683274, "loss": 2.5186, "step": 303 }, { "epoch": 0.1351111111111111, "grad_norm": 1.1212788820266724, "learning_rate": 0.00019476868327402135, "loss": 2.1688, "step": 304 }, { "epoch": 0.13555555555555557, "grad_norm": 0.8847819566726685, "learning_rate": 0.0001947508896797153, "loss": 2.4118, "step": 305 }, { "epoch": 0.136, "grad_norm": 1.0403448343276978, "learning_rate": 0.00019473309608540926, "loss": 2.5872, "step": 306 }, { "epoch": 0.13644444444444445, "grad_norm": 1.0033663511276245, "learning_rate": 0.00019471530249110322, "loss": 2.4532, "step": 307 }, { "epoch": 0.1368888888888889, "grad_norm": 0.9713349342346191, "learning_rate": 0.00019469750889679718, "loss": 2.6743, "step": 308 }, { "epoch": 0.13733333333333334, "grad_norm": 0.9954231381416321, "learning_rate": 0.0001946797153024911, "loss": 2.2478, "step": 309 }, { "epoch": 0.13777777777777778, "grad_norm": 1.4259557723999023, "learning_rate": 0.00019466192170818506, "loss": 2.4252, "step": 310 }, { "epoch": 0.13822222222222222, "grad_norm": 1.1478148698806763, "learning_rate": 0.00019464412811387902, "loss": 2.6959, "step": 311 }, { "epoch": 0.13866666666666666, "grad_norm": 0.9570370316505432, "learning_rate": 0.00019462633451957297, "loss": 2.4102, "step": 312 }, { "epoch": 0.1391111111111111, "grad_norm": 1.0587913990020752, "learning_rate": 0.0001946085409252669, "loss": 2.341, "step": 313 }, { "epoch": 0.13955555555555554, "grad_norm": 0.9694374203681946, "learning_rate": 0.00019459074733096086, "loss": 2.3627, "step": 314 }, { "epoch": 0.14, "grad_norm": 1.0712873935699463, "learning_rate": 0.00019457295373665482, "loss": 2.7581, "step": 315 }, { "epoch": 0.14044444444444446, "grad_norm": 1.1833367347717285, "learning_rate": 0.00019455516014234875, "loss": 2.424, "step": 316 }, { "epoch": 0.1408888888888889, "grad_norm": 1.1975206136703491, "learning_rate": 0.0001945373665480427, "loss": 2.992, "step": 317 }, { "epoch": 0.14133333333333334, "grad_norm": 1.165632724761963, "learning_rate": 0.00019451957295373666, "loss": 2.3119, "step": 318 }, { "epoch": 0.14177777777777778, "grad_norm": 1.2752189636230469, "learning_rate": 0.00019450177935943062, "loss": 2.755, "step": 319 }, { "epoch": 0.14222222222222222, "grad_norm": 1.0351862907409668, "learning_rate": 0.00019448398576512457, "loss": 2.4693, "step": 320 }, { "epoch": 0.14266666666666666, "grad_norm": 1.3102034330368042, "learning_rate": 0.00019446619217081853, "loss": 3.0192, "step": 321 }, { "epoch": 0.1431111111111111, "grad_norm": 1.2251161336898804, "learning_rate": 0.00019444839857651246, "loss": 2.6388, "step": 322 }, { "epoch": 0.14355555555555555, "grad_norm": 1.147139072418213, "learning_rate": 0.00019443060498220641, "loss": 2.1927, "step": 323 }, { "epoch": 0.144, "grad_norm": 1.6661100387573242, "learning_rate": 0.00019441281138790037, "loss": 2.3797, "step": 324 }, { "epoch": 0.14444444444444443, "grad_norm": 1.5821012258529663, "learning_rate": 0.00019439501779359433, "loss": 2.6176, "step": 325 }, { "epoch": 0.1448888888888889, "grad_norm": 1.4074416160583496, "learning_rate": 0.00019437722419928826, "loss": 2.887, "step": 326 }, { "epoch": 0.14533333333333334, "grad_norm": 1.1706616878509521, "learning_rate": 0.0001943594306049822, "loss": 2.5354, "step": 327 }, { "epoch": 0.14577777777777778, "grad_norm": 1.8788220882415771, "learning_rate": 0.00019434163701067617, "loss": 3.0704, "step": 328 }, { "epoch": 0.14622222222222223, "grad_norm": 1.3559796810150146, "learning_rate": 0.0001943238434163701, "loss": 2.5688, "step": 329 }, { "epoch": 0.14666666666666667, "grad_norm": 1.3595597743988037, "learning_rate": 0.00019430604982206406, "loss": 2.4277, "step": 330 }, { "epoch": 0.1471111111111111, "grad_norm": 1.267451286315918, "learning_rate": 0.000194288256227758, "loss": 2.6135, "step": 331 }, { "epoch": 0.14755555555555555, "grad_norm": 1.2744300365447998, "learning_rate": 0.00019427046263345197, "loss": 2.5527, "step": 332 }, { "epoch": 0.148, "grad_norm": 1.6571002006530762, "learning_rate": 0.00019425266903914593, "loss": 2.653, "step": 333 }, { "epoch": 0.14844444444444443, "grad_norm": 1.295233130455017, "learning_rate": 0.00019423487544483988, "loss": 2.3443, "step": 334 }, { "epoch": 0.14888888888888888, "grad_norm": 1.4817813634872437, "learning_rate": 0.0001942170818505338, "loss": 2.6859, "step": 335 }, { "epoch": 0.14933333333333335, "grad_norm": 1.2812329530715942, "learning_rate": 0.00019419928825622777, "loss": 2.5331, "step": 336 }, { "epoch": 0.1497777777777778, "grad_norm": 1.2227575778961182, "learning_rate": 0.00019418149466192172, "loss": 2.2328, "step": 337 }, { "epoch": 0.15022222222222223, "grad_norm": 1.2625856399536133, "learning_rate": 0.00019416370106761568, "loss": 2.3834, "step": 338 }, { "epoch": 0.15066666666666667, "grad_norm": 1.4927825927734375, "learning_rate": 0.0001941459074733096, "loss": 2.66, "step": 339 }, { "epoch": 0.1511111111111111, "grad_norm": 1.5926779508590698, "learning_rate": 0.00019412811387900357, "loss": 2.8668, "step": 340 }, { "epoch": 0.15155555555555555, "grad_norm": 1.4455138444900513, "learning_rate": 0.00019411032028469752, "loss": 2.313, "step": 341 }, { "epoch": 0.152, "grad_norm": 1.7588112354278564, "learning_rate": 0.00019409252669039145, "loss": 3.2341, "step": 342 }, { "epoch": 0.15244444444444444, "grad_norm": 1.61067795753479, "learning_rate": 0.0001940747330960854, "loss": 3.0195, "step": 343 }, { "epoch": 0.15288888888888888, "grad_norm": 1.5478051900863647, "learning_rate": 0.00019405693950177937, "loss": 2.5716, "step": 344 }, { "epoch": 0.15333333333333332, "grad_norm": 1.436558485031128, "learning_rate": 0.00019403914590747332, "loss": 2.4509, "step": 345 }, { "epoch": 0.1537777777777778, "grad_norm": 1.6531201601028442, "learning_rate": 0.00019402135231316728, "loss": 2.8739, "step": 346 }, { "epoch": 0.15422222222222223, "grad_norm": 1.6915215253829956, "learning_rate": 0.00019400355871886124, "loss": 2.6791, "step": 347 }, { "epoch": 0.15466666666666667, "grad_norm": 1.5415655374526978, "learning_rate": 0.00019398576512455516, "loss": 2.8461, "step": 348 }, { "epoch": 0.15511111111111112, "grad_norm": 1.5193134546279907, "learning_rate": 0.00019396797153024912, "loss": 2.0756, "step": 349 }, { "epoch": 0.15555555555555556, "grad_norm": 1.9580684900283813, "learning_rate": 0.00019395017793594308, "loss": 2.2343, "step": 350 }, { "epoch": 0.156, "grad_norm": 0.9276076555252075, "learning_rate": 0.00019393238434163703, "loss": 2.5678, "step": 351 }, { "epoch": 0.15644444444444444, "grad_norm": 0.8778754472732544, "learning_rate": 0.00019391459074733096, "loss": 2.2734, "step": 352 }, { "epoch": 0.15688888888888888, "grad_norm": 0.9423462152481079, "learning_rate": 0.00019389679715302492, "loss": 2.3079, "step": 353 }, { "epoch": 0.15733333333333333, "grad_norm": 1.1275615692138672, "learning_rate": 0.00019387900355871888, "loss": 3.0597, "step": 354 }, { "epoch": 0.15777777777777777, "grad_norm": 1.098436713218689, "learning_rate": 0.0001938612099644128, "loss": 2.5681, "step": 355 }, { "epoch": 0.1582222222222222, "grad_norm": 1.2952444553375244, "learning_rate": 0.00019384341637010676, "loss": 3.2901, "step": 356 }, { "epoch": 0.15866666666666668, "grad_norm": 0.9615929126739502, "learning_rate": 0.00019382562277580072, "loss": 2.1562, "step": 357 }, { "epoch": 0.15911111111111112, "grad_norm": 1.0871940851211548, "learning_rate": 0.00019380782918149468, "loss": 2.483, "step": 358 }, { "epoch": 0.15955555555555556, "grad_norm": 1.2492108345031738, "learning_rate": 0.00019379003558718863, "loss": 2.4314, "step": 359 }, { "epoch": 0.16, "grad_norm": 1.184037208557129, "learning_rate": 0.0001937722419928826, "loss": 2.5729, "step": 360 }, { "epoch": 0.16044444444444445, "grad_norm": 1.07174813747406, "learning_rate": 0.00019375444839857652, "loss": 2.4092, "step": 361 }, { "epoch": 0.1608888888888889, "grad_norm": 1.034098744392395, "learning_rate": 0.00019373665480427047, "loss": 2.3074, "step": 362 }, { "epoch": 0.16133333333333333, "grad_norm": 1.224602460861206, "learning_rate": 0.00019371886120996443, "loss": 2.5208, "step": 363 }, { "epoch": 0.16177777777777777, "grad_norm": 1.1033531427383423, "learning_rate": 0.0001937010676156584, "loss": 2.4352, "step": 364 }, { "epoch": 0.1622222222222222, "grad_norm": 1.3021239042282104, "learning_rate": 0.00019368327402135232, "loss": 3.0264, "step": 365 }, { "epoch": 0.16266666666666665, "grad_norm": 1.0758678913116455, "learning_rate": 0.00019366548042704627, "loss": 2.1807, "step": 366 }, { "epoch": 0.16311111111111112, "grad_norm": 1.262274980545044, "learning_rate": 0.00019364768683274023, "loss": 2.0729, "step": 367 }, { "epoch": 0.16355555555555557, "grad_norm": 1.1292612552642822, "learning_rate": 0.00019362989323843416, "loss": 2.3755, "step": 368 }, { "epoch": 0.164, "grad_norm": 1.226879596710205, "learning_rate": 0.00019361209964412812, "loss": 2.7226, "step": 369 }, { "epoch": 0.16444444444444445, "grad_norm": 1.2748644351959229, "learning_rate": 0.00019359430604982207, "loss": 2.8053, "step": 370 }, { "epoch": 0.1648888888888889, "grad_norm": 1.2830379009246826, "learning_rate": 0.00019357651245551603, "loss": 2.6742, "step": 371 }, { "epoch": 0.16533333333333333, "grad_norm": 1.3083009719848633, "learning_rate": 0.00019355871886120998, "loss": 2.5176, "step": 372 }, { "epoch": 0.16577777777777777, "grad_norm": 1.2429115772247314, "learning_rate": 0.00019354092526690391, "loss": 2.4358, "step": 373 }, { "epoch": 0.16622222222222222, "grad_norm": 1.3192554712295532, "learning_rate": 0.00019352313167259787, "loss": 2.3644, "step": 374 }, { "epoch": 0.16666666666666666, "grad_norm": 1.1091505289077759, "learning_rate": 0.00019350533807829183, "loss": 2.0567, "step": 375 }, { "epoch": 0.1671111111111111, "grad_norm": 1.3943992853164673, "learning_rate": 0.00019348754448398578, "loss": 2.3812, "step": 376 }, { "epoch": 0.16755555555555557, "grad_norm": 1.3048312664031982, "learning_rate": 0.00019346975088967974, "loss": 2.614, "step": 377 }, { "epoch": 0.168, "grad_norm": 1.2868915796279907, "learning_rate": 0.00019345195729537367, "loss": 2.8753, "step": 378 }, { "epoch": 0.16844444444444445, "grad_norm": 1.7796956300735474, "learning_rate": 0.00019343416370106763, "loss": 2.8598, "step": 379 }, { "epoch": 0.1688888888888889, "grad_norm": 1.2406312227249146, "learning_rate": 0.00019341637010676156, "loss": 2.3268, "step": 380 }, { "epoch": 0.16933333333333334, "grad_norm": 1.1642827987670898, "learning_rate": 0.0001933985765124555, "loss": 1.8269, "step": 381 }, { "epoch": 0.16977777777777778, "grad_norm": 1.325958251953125, "learning_rate": 0.00019338078291814947, "loss": 2.6527, "step": 382 }, { "epoch": 0.17022222222222222, "grad_norm": 1.4351950883865356, "learning_rate": 0.00019336298932384343, "loss": 2.7466, "step": 383 }, { "epoch": 0.17066666666666666, "grad_norm": 1.4853794574737549, "learning_rate": 0.00019334519572953738, "loss": 2.7099, "step": 384 }, { "epoch": 0.1711111111111111, "grad_norm": 1.2756681442260742, "learning_rate": 0.00019332740213523134, "loss": 1.9808, "step": 385 }, { "epoch": 0.17155555555555554, "grad_norm": 1.567001223564148, "learning_rate": 0.00019330960854092527, "loss": 2.6314, "step": 386 }, { "epoch": 0.172, "grad_norm": 1.4116157293319702, "learning_rate": 0.00019329181494661922, "loss": 2.3277, "step": 387 }, { "epoch": 0.17244444444444446, "grad_norm": 1.7159870862960815, "learning_rate": 0.00019327402135231318, "loss": 3.2206, "step": 388 }, { "epoch": 0.1728888888888889, "grad_norm": 1.3646256923675537, "learning_rate": 0.00019325622775800714, "loss": 2.4943, "step": 389 }, { "epoch": 0.17333333333333334, "grad_norm": 1.47772216796875, "learning_rate": 0.0001932384341637011, "loss": 2.3482, "step": 390 }, { "epoch": 0.17377777777777778, "grad_norm": 1.6584293842315674, "learning_rate": 0.00019322064056939502, "loss": 2.449, "step": 391 }, { "epoch": 0.17422222222222222, "grad_norm": 1.6674505472183228, "learning_rate": 0.00019320284697508898, "loss": 2.319, "step": 392 }, { "epoch": 0.17466666666666666, "grad_norm": 1.4847460985183716, "learning_rate": 0.0001931850533807829, "loss": 2.4977, "step": 393 }, { "epoch": 0.1751111111111111, "grad_norm": 1.5762537717819214, "learning_rate": 0.00019316725978647687, "loss": 2.5036, "step": 394 }, { "epoch": 0.17555555555555555, "grad_norm": 1.731825828552246, "learning_rate": 0.00019314946619217082, "loss": 2.9634, "step": 395 }, { "epoch": 0.176, "grad_norm": 1.5546553134918213, "learning_rate": 0.00019313167259786478, "loss": 2.9042, "step": 396 }, { "epoch": 0.17644444444444443, "grad_norm": 1.7094860076904297, "learning_rate": 0.00019311387900355873, "loss": 2.6971, "step": 397 }, { "epoch": 0.1768888888888889, "grad_norm": 1.4419265985488892, "learning_rate": 0.0001930960854092527, "loss": 2.0126, "step": 398 }, { "epoch": 0.17733333333333334, "grad_norm": 1.516395926475525, "learning_rate": 0.00019307829181494662, "loss": 1.9048, "step": 399 }, { "epoch": 0.17777777777777778, "grad_norm": 2.010030508041382, "learning_rate": 0.00019306049822064058, "loss": 2.6672, "step": 400 }, { "epoch": 0.17822222222222223, "grad_norm": 0.8796574473381042, "learning_rate": 0.00019304270462633453, "loss": 2.3451, "step": 401 }, { "epoch": 0.17866666666666667, "grad_norm": 1.060291051864624, "learning_rate": 0.0001930249110320285, "loss": 1.5039, "step": 402 }, { "epoch": 0.1791111111111111, "grad_norm": 0.9270257949829102, "learning_rate": 0.00019300711743772245, "loss": 2.3051, "step": 403 }, { "epoch": 0.17955555555555555, "grad_norm": 1.0624326467514038, "learning_rate": 0.00019298932384341638, "loss": 2.4763, "step": 404 }, { "epoch": 0.18, "grad_norm": 1.396337866783142, "learning_rate": 0.00019297153024911033, "loss": 3.0648, "step": 405 }, { "epoch": 0.18044444444444444, "grad_norm": 1.185214877128601, "learning_rate": 0.00019295373665480426, "loss": 2.5809, "step": 406 }, { "epoch": 0.18088888888888888, "grad_norm": 1.2690224647521973, "learning_rate": 0.00019293594306049822, "loss": 2.6203, "step": 407 }, { "epoch": 0.18133333333333335, "grad_norm": 1.083329439163208, "learning_rate": 0.00019291814946619217, "loss": 2.6254, "step": 408 }, { "epoch": 0.1817777777777778, "grad_norm": 1.1971805095672607, "learning_rate": 0.00019290035587188613, "loss": 2.6854, "step": 409 }, { "epoch": 0.18222222222222223, "grad_norm": 1.253471851348877, "learning_rate": 0.0001928825622775801, "loss": 2.7219, "step": 410 }, { "epoch": 0.18266666666666667, "grad_norm": 1.1367192268371582, "learning_rate": 0.00019286476868327404, "loss": 2.6906, "step": 411 }, { "epoch": 0.1831111111111111, "grad_norm": 1.1325358152389526, "learning_rate": 0.00019284697508896797, "loss": 2.6536, "step": 412 }, { "epoch": 0.18355555555555556, "grad_norm": 1.1050721406936646, "learning_rate": 0.00019282918149466193, "loss": 2.351, "step": 413 }, { "epoch": 0.184, "grad_norm": 1.1547539234161377, "learning_rate": 0.0001928113879003559, "loss": 2.3844, "step": 414 }, { "epoch": 0.18444444444444444, "grad_norm": 1.0745432376861572, "learning_rate": 0.00019279359430604984, "loss": 1.9506, "step": 415 }, { "epoch": 0.18488888888888888, "grad_norm": 1.3409316539764404, "learning_rate": 0.0001927758007117438, "loss": 3.0778, "step": 416 }, { "epoch": 0.18533333333333332, "grad_norm": 1.029353380203247, "learning_rate": 0.00019275800711743773, "loss": 2.1147, "step": 417 }, { "epoch": 0.18577777777777776, "grad_norm": 1.3205243349075317, "learning_rate": 0.00019274021352313169, "loss": 2.706, "step": 418 }, { "epoch": 0.18622222222222223, "grad_norm": 1.2287330627441406, "learning_rate": 0.00019272241992882562, "loss": 2.9919, "step": 419 }, { "epoch": 0.18666666666666668, "grad_norm": 1.2103333473205566, "learning_rate": 0.00019270462633451957, "loss": 2.5828, "step": 420 }, { "epoch": 0.18711111111111112, "grad_norm": 1.2801356315612793, "learning_rate": 0.00019268683274021353, "loss": 2.7884, "step": 421 }, { "epoch": 0.18755555555555556, "grad_norm": 1.2993440628051758, "learning_rate": 0.00019266903914590748, "loss": 2.3698, "step": 422 }, { "epoch": 0.188, "grad_norm": 1.173684000968933, "learning_rate": 0.00019265124555160144, "loss": 1.9735, "step": 423 }, { "epoch": 0.18844444444444444, "grad_norm": 1.1559852361679077, "learning_rate": 0.0001926334519572954, "loss": 2.3973, "step": 424 }, { "epoch": 0.18888888888888888, "grad_norm": 1.1459964513778687, "learning_rate": 0.00019261565836298933, "loss": 2.2315, "step": 425 }, { "epoch": 0.18933333333333333, "grad_norm": 1.2700178623199463, "learning_rate": 0.00019259786476868328, "loss": 2.0161, "step": 426 }, { "epoch": 0.18977777777777777, "grad_norm": 1.4809290170669556, "learning_rate": 0.00019258007117437724, "loss": 2.4066, "step": 427 }, { "epoch": 0.1902222222222222, "grad_norm": 1.3454186916351318, "learning_rate": 0.0001925622775800712, "loss": 2.5743, "step": 428 }, { "epoch": 0.19066666666666668, "grad_norm": 1.7205144166946411, "learning_rate": 0.00019254448398576513, "loss": 2.6614, "step": 429 }, { "epoch": 0.19111111111111112, "grad_norm": 1.3243727684020996, "learning_rate": 0.00019252669039145908, "loss": 2.5333, "step": 430 }, { "epoch": 0.19155555555555556, "grad_norm": 1.282810926437378, "learning_rate": 0.00019250889679715304, "loss": 2.4967, "step": 431 }, { "epoch": 0.192, "grad_norm": 1.4963980913162231, "learning_rate": 0.00019249110320284697, "loss": 2.5604, "step": 432 }, { "epoch": 0.19244444444444445, "grad_norm": 1.4316112995147705, "learning_rate": 0.00019247330960854092, "loss": 2.5011, "step": 433 }, { "epoch": 0.1928888888888889, "grad_norm": 1.550047755241394, "learning_rate": 0.00019245551601423488, "loss": 2.2658, "step": 434 }, { "epoch": 0.19333333333333333, "grad_norm": 1.3296672105789185, "learning_rate": 0.00019243772241992884, "loss": 2.3954, "step": 435 }, { "epoch": 0.19377777777777777, "grad_norm": 1.4095767736434937, "learning_rate": 0.0001924199288256228, "loss": 2.4617, "step": 436 }, { "epoch": 0.1942222222222222, "grad_norm": 1.5941203832626343, "learning_rate": 0.00019240213523131675, "loss": 2.4245, "step": 437 }, { "epoch": 0.19466666666666665, "grad_norm": 1.427892804145813, "learning_rate": 0.00019238434163701068, "loss": 2.6397, "step": 438 }, { "epoch": 0.19511111111111112, "grad_norm": 1.512261152267456, "learning_rate": 0.00019236654804270464, "loss": 3.0727, "step": 439 }, { "epoch": 0.19555555555555557, "grad_norm": 1.3837703466415405, "learning_rate": 0.0001923487544483986, "loss": 2.0297, "step": 440 }, { "epoch": 0.196, "grad_norm": 1.7951322793960571, "learning_rate": 0.00019233096085409255, "loss": 2.9765, "step": 441 }, { "epoch": 0.19644444444444445, "grad_norm": 1.418602705001831, "learning_rate": 0.00019231316725978648, "loss": 2.4067, "step": 442 }, { "epoch": 0.1968888888888889, "grad_norm": 1.8720040321350098, "learning_rate": 0.00019229537366548044, "loss": 2.911, "step": 443 }, { "epoch": 0.19733333333333333, "grad_norm": 1.337851881980896, "learning_rate": 0.0001922775800711744, "loss": 2.2034, "step": 444 }, { "epoch": 0.19777777777777777, "grad_norm": 1.770947813987732, "learning_rate": 0.00019225978647686832, "loss": 3.1644, "step": 445 }, { "epoch": 0.19822222222222222, "grad_norm": 1.5249742269515991, "learning_rate": 0.00019224199288256228, "loss": 3.0308, "step": 446 }, { "epoch": 0.19866666666666666, "grad_norm": 1.9548603296279907, "learning_rate": 0.00019222419928825623, "loss": 3.2289, "step": 447 }, { "epoch": 0.1991111111111111, "grad_norm": 1.8034451007843018, "learning_rate": 0.0001922064056939502, "loss": 2.9985, "step": 448 }, { "epoch": 0.19955555555555557, "grad_norm": 2.0380022525787354, "learning_rate": 0.00019218861209964415, "loss": 2.9506, "step": 449 }, { "epoch": 0.2, "grad_norm": 2.272326946258545, "learning_rate": 0.0001921708185053381, "loss": 2.9212, "step": 450 }, { "epoch": 0.20044444444444445, "grad_norm": 0.8263271450996399, "learning_rate": 0.00019215302491103203, "loss": 2.646, "step": 451 }, { "epoch": 0.2008888888888889, "grad_norm": 0.7942408919334412, "learning_rate": 0.000192135231316726, "loss": 2.1999, "step": 452 }, { "epoch": 0.20133333333333334, "grad_norm": 1.059103012084961, "learning_rate": 0.00019211743772241995, "loss": 1.499, "step": 453 }, { "epoch": 0.20177777777777778, "grad_norm": 1.2836692333221436, "learning_rate": 0.0001920996441281139, "loss": 2.1925, "step": 454 }, { "epoch": 0.20222222222222222, "grad_norm": 0.9823219776153564, "learning_rate": 0.00019208185053380783, "loss": 2.3254, "step": 455 }, { "epoch": 0.20266666666666666, "grad_norm": 1.1425485610961914, "learning_rate": 0.0001920640569395018, "loss": 2.6993, "step": 456 }, { "epoch": 0.2031111111111111, "grad_norm": 1.2006306648254395, "learning_rate": 0.00019204626334519575, "loss": 2.8467, "step": 457 }, { "epoch": 0.20355555555555555, "grad_norm": 1.0690330266952515, "learning_rate": 0.00019202846975088967, "loss": 2.5, "step": 458 }, { "epoch": 0.204, "grad_norm": 1.0791610479354858, "learning_rate": 0.00019201067615658363, "loss": 2.6261, "step": 459 }, { "epoch": 0.20444444444444446, "grad_norm": 1.281191349029541, "learning_rate": 0.0001919928825622776, "loss": 2.6089, "step": 460 }, { "epoch": 0.2048888888888889, "grad_norm": 0.9611422419548035, "learning_rate": 0.00019197508896797154, "loss": 2.6225, "step": 461 }, { "epoch": 0.20533333333333334, "grad_norm": 1.1073009967803955, "learning_rate": 0.0001919572953736655, "loss": 2.2157, "step": 462 }, { "epoch": 0.20577777777777778, "grad_norm": 1.305037498474121, "learning_rate": 0.00019193950177935943, "loss": 3.2172, "step": 463 }, { "epoch": 0.20622222222222222, "grad_norm": 1.1567325592041016, "learning_rate": 0.00019192170818505339, "loss": 2.4355, "step": 464 }, { "epoch": 0.20666666666666667, "grad_norm": 1.2676507234573364, "learning_rate": 0.00019190391459074734, "loss": 2.405, "step": 465 }, { "epoch": 0.2071111111111111, "grad_norm": 1.1165955066680908, "learning_rate": 0.0001918861209964413, "loss": 2.0347, "step": 466 }, { "epoch": 0.20755555555555555, "grad_norm": 1.3841233253479004, "learning_rate": 0.00019186832740213526, "loss": 2.7864, "step": 467 }, { "epoch": 0.208, "grad_norm": 1.4399226903915405, "learning_rate": 0.00019185053380782919, "loss": 3.6052, "step": 468 }, { "epoch": 0.20844444444444443, "grad_norm": 1.4364230632781982, "learning_rate": 0.00019183274021352314, "loss": 2.7003, "step": 469 }, { "epoch": 0.2088888888888889, "grad_norm": 1.1740696430206299, "learning_rate": 0.00019181494661921707, "loss": 2.1421, "step": 470 }, { "epoch": 0.20933333333333334, "grad_norm": 1.5531638860702515, "learning_rate": 0.00019179715302491103, "loss": 2.6817, "step": 471 }, { "epoch": 0.20977777777777779, "grad_norm": 1.4464926719665527, "learning_rate": 0.00019177935943060498, "loss": 2.7101, "step": 472 }, { "epoch": 0.21022222222222223, "grad_norm": 1.319682002067566, "learning_rate": 0.00019176156583629894, "loss": 2.3335, "step": 473 }, { "epoch": 0.21066666666666667, "grad_norm": 1.347642183303833, "learning_rate": 0.0001917437722419929, "loss": 2.6819, "step": 474 }, { "epoch": 0.2111111111111111, "grad_norm": 1.3280656337738037, "learning_rate": 0.00019172597864768685, "loss": 2.7256, "step": 475 }, { "epoch": 0.21155555555555555, "grad_norm": 1.4412258863449097, "learning_rate": 0.00019170818505338078, "loss": 2.7524, "step": 476 }, { "epoch": 0.212, "grad_norm": 1.455552577972412, "learning_rate": 0.00019169039145907474, "loss": 2.5749, "step": 477 }, { "epoch": 0.21244444444444444, "grad_norm": 1.2440650463104248, "learning_rate": 0.0001916725978647687, "loss": 2.3623, "step": 478 }, { "epoch": 0.21288888888888888, "grad_norm": 1.2427901029586792, "learning_rate": 0.00019165480427046265, "loss": 2.3891, "step": 479 }, { "epoch": 0.21333333333333335, "grad_norm": 1.2674572467803955, "learning_rate": 0.0001916370106761566, "loss": 2.2005, "step": 480 }, { "epoch": 0.2137777777777778, "grad_norm": 1.4567019939422607, "learning_rate": 0.00019161921708185054, "loss": 2.7974, "step": 481 }, { "epoch": 0.21422222222222223, "grad_norm": 1.25277578830719, "learning_rate": 0.0001916014234875445, "loss": 2.4577, "step": 482 }, { "epoch": 0.21466666666666667, "grad_norm": 1.2980494499206543, "learning_rate": 0.00019158362989323842, "loss": 2.4017, "step": 483 }, { "epoch": 0.21511111111111111, "grad_norm": 1.5980355739593506, "learning_rate": 0.00019156583629893238, "loss": 2.7121, "step": 484 }, { "epoch": 0.21555555555555556, "grad_norm": 1.3960875272750854, "learning_rate": 0.00019154804270462634, "loss": 2.4198, "step": 485 }, { "epoch": 0.216, "grad_norm": 1.5180373191833496, "learning_rate": 0.0001915302491103203, "loss": 2.8813, "step": 486 }, { "epoch": 0.21644444444444444, "grad_norm": 1.339158058166504, "learning_rate": 0.00019151245551601425, "loss": 2.4161, "step": 487 }, { "epoch": 0.21688888888888888, "grad_norm": 1.708709955215454, "learning_rate": 0.0001914946619217082, "loss": 2.6625, "step": 488 }, { "epoch": 0.21733333333333332, "grad_norm": 1.4037717580795288, "learning_rate": 0.00019147686832740214, "loss": 2.7222, "step": 489 }, { "epoch": 0.21777777777777776, "grad_norm": 1.547869324684143, "learning_rate": 0.0001914590747330961, "loss": 2.4619, "step": 490 }, { "epoch": 0.21822222222222223, "grad_norm": 2.0343785285949707, "learning_rate": 0.00019144128113879005, "loss": 2.8051, "step": 491 }, { "epoch": 0.21866666666666668, "grad_norm": 1.5703917741775513, "learning_rate": 0.000191423487544484, "loss": 2.7364, "step": 492 }, { "epoch": 0.21911111111111112, "grad_norm": 1.4888960123062134, "learning_rate": 0.00019140569395017796, "loss": 2.3952, "step": 493 }, { "epoch": 0.21955555555555556, "grad_norm": 1.9154101610183716, "learning_rate": 0.0001913879003558719, "loss": 3.4184, "step": 494 }, { "epoch": 0.22, "grad_norm": 1.8733478784561157, "learning_rate": 0.00019137010676156585, "loss": 2.0588, "step": 495 }, { "epoch": 0.22044444444444444, "grad_norm": 1.5168616771697998, "learning_rate": 0.00019135231316725978, "loss": 2.4865, "step": 496 }, { "epoch": 0.22088888888888888, "grad_norm": 1.6080540418624878, "learning_rate": 0.00019133451957295373, "loss": 2.6185, "step": 497 }, { "epoch": 0.22133333333333333, "grad_norm": 1.8169959783554077, "learning_rate": 0.0001913167259786477, "loss": 2.7514, "step": 498 }, { "epoch": 0.22177777777777777, "grad_norm": 1.903592824935913, "learning_rate": 0.00019129893238434165, "loss": 2.926, "step": 499 }, { "epoch": 0.2222222222222222, "grad_norm": 2.2263951301574707, "learning_rate": 0.0001912811387900356, "loss": 3.209, "step": 500 }, { "epoch": 0.22266666666666668, "grad_norm": 0.9277405142784119, "learning_rate": 0.00019126334519572956, "loss": 2.8295, "step": 501 }, { "epoch": 0.22311111111111112, "grad_norm": 0.7616639137268066, "learning_rate": 0.0001912455516014235, "loss": 2.3259, "step": 502 }, { "epoch": 0.22355555555555556, "grad_norm": 0.863919734954834, "learning_rate": 0.00019122775800711745, "loss": 2.6629, "step": 503 }, { "epoch": 0.224, "grad_norm": 0.9117692112922668, "learning_rate": 0.0001912099644128114, "loss": 2.8012, "step": 504 }, { "epoch": 0.22444444444444445, "grad_norm": 0.8689172863960266, "learning_rate": 0.00019119217081850536, "loss": 1.8062, "step": 505 }, { "epoch": 0.2248888888888889, "grad_norm": 0.9966077208518982, "learning_rate": 0.00019117437722419932, "loss": 2.3426, "step": 506 }, { "epoch": 0.22533333333333333, "grad_norm": 1.1402056217193604, "learning_rate": 0.00019115658362989324, "loss": 2.597, "step": 507 }, { "epoch": 0.22577777777777777, "grad_norm": 1.0207332372665405, "learning_rate": 0.0001911387900355872, "loss": 2.7771, "step": 508 }, { "epoch": 0.2262222222222222, "grad_norm": 1.0293519496917725, "learning_rate": 0.00019112099644128113, "loss": 2.2381, "step": 509 }, { "epoch": 0.22666666666666666, "grad_norm": 1.0531278848648071, "learning_rate": 0.0001911032028469751, "loss": 2.42, "step": 510 }, { "epoch": 0.22711111111111112, "grad_norm": 1.2546653747558594, "learning_rate": 0.00019108540925266904, "loss": 2.9447, "step": 511 }, { "epoch": 0.22755555555555557, "grad_norm": 1.0765845775604248, "learning_rate": 0.000191067615658363, "loss": 2.3876, "step": 512 }, { "epoch": 0.228, "grad_norm": 1.0224113464355469, "learning_rate": 0.00019104982206405696, "loss": 2.5661, "step": 513 }, { "epoch": 0.22844444444444445, "grad_norm": 1.1143425703048706, "learning_rate": 0.0001910320284697509, "loss": 2.4567, "step": 514 }, { "epoch": 0.2288888888888889, "grad_norm": 1.2478740215301514, "learning_rate": 0.00019101423487544484, "loss": 2.2325, "step": 515 }, { "epoch": 0.22933333333333333, "grad_norm": 1.5122989416122437, "learning_rate": 0.0001909964412811388, "loss": 2.7624, "step": 516 }, { "epoch": 0.22977777777777778, "grad_norm": 1.2084643840789795, "learning_rate": 0.00019097864768683276, "loss": 2.1843, "step": 517 }, { "epoch": 0.23022222222222222, "grad_norm": 2.0436813831329346, "learning_rate": 0.0001909608540925267, "loss": 2.7025, "step": 518 }, { "epoch": 0.23066666666666666, "grad_norm": 1.6114445924758911, "learning_rate": 0.00019094306049822067, "loss": 2.5077, "step": 519 }, { "epoch": 0.2311111111111111, "grad_norm": 1.163203239440918, "learning_rate": 0.0001909252669039146, "loss": 2.6062, "step": 520 }, { "epoch": 0.23155555555555554, "grad_norm": 1.3827770948410034, "learning_rate": 0.00019090747330960855, "loss": 2.565, "step": 521 }, { "epoch": 0.232, "grad_norm": 1.443726658821106, "learning_rate": 0.00019088967971530248, "loss": 2.9211, "step": 522 }, { "epoch": 0.23244444444444445, "grad_norm": 1.1512651443481445, "learning_rate": 0.00019087188612099644, "loss": 2.3893, "step": 523 }, { "epoch": 0.2328888888888889, "grad_norm": 1.3335007429122925, "learning_rate": 0.0001908540925266904, "loss": 2.8188, "step": 524 }, { "epoch": 0.23333333333333334, "grad_norm": 1.2235959768295288, "learning_rate": 0.00019083629893238435, "loss": 2.2223, "step": 525 }, { "epoch": 0.23377777777777778, "grad_norm": 1.3788108825683594, "learning_rate": 0.0001908185053380783, "loss": 2.7354, "step": 526 }, { "epoch": 0.23422222222222222, "grad_norm": 1.400914192199707, "learning_rate": 0.00019080071174377227, "loss": 2.3977, "step": 527 }, { "epoch": 0.23466666666666666, "grad_norm": 1.4983909130096436, "learning_rate": 0.0001907829181494662, "loss": 2.7346, "step": 528 }, { "epoch": 0.2351111111111111, "grad_norm": 1.453970193862915, "learning_rate": 0.00019076512455516015, "loss": 2.4243, "step": 529 }, { "epoch": 0.23555555555555555, "grad_norm": 1.6744136810302734, "learning_rate": 0.0001907473309608541, "loss": 0.8518, "step": 530 }, { "epoch": 0.236, "grad_norm": 1.4733753204345703, "learning_rate": 0.00019072953736654807, "loss": 2.5426, "step": 531 }, { "epoch": 0.23644444444444446, "grad_norm": 1.4669400453567505, "learning_rate": 0.00019071174377224202, "loss": 2.527, "step": 532 }, { "epoch": 0.2368888888888889, "grad_norm": 1.413023829460144, "learning_rate": 0.00019069395017793595, "loss": 2.5775, "step": 533 }, { "epoch": 0.23733333333333334, "grad_norm": 1.4842833280563354, "learning_rate": 0.0001906761565836299, "loss": 2.3223, "step": 534 }, { "epoch": 0.23777777777777778, "grad_norm": 1.6651334762573242, "learning_rate": 0.00019065836298932384, "loss": 2.7261, "step": 535 }, { "epoch": 0.23822222222222222, "grad_norm": 2.1817784309387207, "learning_rate": 0.0001906405693950178, "loss": 2.602, "step": 536 }, { "epoch": 0.23866666666666667, "grad_norm": 1.4185001850128174, "learning_rate": 0.00019062277580071175, "loss": 2.5273, "step": 537 }, { "epoch": 0.2391111111111111, "grad_norm": 1.6372658014297485, "learning_rate": 0.0001906049822064057, "loss": 2.7267, "step": 538 }, { "epoch": 0.23955555555555555, "grad_norm": 2.114755868911743, "learning_rate": 0.00019058718861209966, "loss": 1.3715, "step": 539 }, { "epoch": 0.24, "grad_norm": 1.5371288061141968, "learning_rate": 0.00019056939501779362, "loss": 2.5673, "step": 540 }, { "epoch": 0.24044444444444443, "grad_norm": 1.3606349229812622, "learning_rate": 0.00019055160142348755, "loss": 2.5102, "step": 541 }, { "epoch": 0.2408888888888889, "grad_norm": 1.7038285732269287, "learning_rate": 0.0001905338078291815, "loss": 2.8909, "step": 542 }, { "epoch": 0.24133333333333334, "grad_norm": 1.6660969257354736, "learning_rate": 0.00019051601423487546, "loss": 2.3934, "step": 543 }, { "epoch": 0.24177777777777779, "grad_norm": 1.4915132522583008, "learning_rate": 0.00019049822064056942, "loss": 2.6195, "step": 544 }, { "epoch": 0.24222222222222223, "grad_norm": 1.606236219406128, "learning_rate": 0.00019048042704626335, "loss": 2.5366, "step": 545 }, { "epoch": 0.24266666666666667, "grad_norm": 1.6464382410049438, "learning_rate": 0.0001904626334519573, "loss": 2.2821, "step": 546 }, { "epoch": 0.2431111111111111, "grad_norm": 1.5627448558807373, "learning_rate": 0.00019044483985765126, "loss": 2.7557, "step": 547 }, { "epoch": 0.24355555555555555, "grad_norm": 1.5537325143814087, "learning_rate": 0.0001904270462633452, "loss": 2.7917, "step": 548 }, { "epoch": 0.244, "grad_norm": 2.0328938961029053, "learning_rate": 0.00019040925266903915, "loss": 3.1031, "step": 549 }, { "epoch": 0.24444444444444444, "grad_norm": 2.5331227779388428, "learning_rate": 0.0001903914590747331, "loss": 2.0921, "step": 550 }, { "epoch": 0.24488888888888888, "grad_norm": 1.062377691268921, "learning_rate": 0.00019037366548042706, "loss": 2.7696, "step": 551 }, { "epoch": 0.24533333333333332, "grad_norm": 0.9012869596481323, "learning_rate": 0.00019035587188612102, "loss": 1.8978, "step": 552 }, { "epoch": 0.2457777777777778, "grad_norm": 0.9942989349365234, "learning_rate": 0.00019033807829181495, "loss": 1.9979, "step": 553 }, { "epoch": 0.24622222222222223, "grad_norm": 1.0721116065979004, "learning_rate": 0.0001903202846975089, "loss": 2.6083, "step": 554 }, { "epoch": 0.24666666666666667, "grad_norm": 1.0755621194839478, "learning_rate": 0.00019030249110320286, "loss": 2.7578, "step": 555 }, { "epoch": 0.24711111111111111, "grad_norm": 1.080788254737854, "learning_rate": 0.00019028469750889681, "loss": 2.2836, "step": 556 }, { "epoch": 0.24755555555555556, "grad_norm": 1.0383445024490356, "learning_rate": 0.00019026690391459077, "loss": 2.4542, "step": 557 }, { "epoch": 0.248, "grad_norm": 1.1483523845672607, "learning_rate": 0.0001902491103202847, "loss": 2.5689, "step": 558 }, { "epoch": 0.24844444444444444, "grad_norm": 1.041678547859192, "learning_rate": 0.00019023131672597866, "loss": 2.2751, "step": 559 }, { "epoch": 0.24888888888888888, "grad_norm": 1.1849489212036133, "learning_rate": 0.0001902135231316726, "loss": 2.6915, "step": 560 }, { "epoch": 0.24933333333333332, "grad_norm": 1.1290448904037476, "learning_rate": 0.00019019572953736654, "loss": 2.184, "step": 561 }, { "epoch": 0.24977777777777777, "grad_norm": 1.2467504739761353, "learning_rate": 0.0001901779359430605, "loss": 2.2676, "step": 562 }, { "epoch": 0.25022222222222223, "grad_norm": 1.1683002710342407, "learning_rate": 0.00019016014234875446, "loss": 2.4178, "step": 563 }, { "epoch": 0.25066666666666665, "grad_norm": 1.2386951446533203, "learning_rate": 0.0001901423487544484, "loss": 2.4034, "step": 564 }, { "epoch": 0.2511111111111111, "grad_norm": 1.259753704071045, "learning_rate": 0.00019012455516014237, "loss": 2.5685, "step": 565 }, { "epoch": 0.25155555555555553, "grad_norm": 1.5166339874267578, "learning_rate": 0.0001901067615658363, "loss": 2.6635, "step": 566 }, { "epoch": 0.252, "grad_norm": 1.23752760887146, "learning_rate": 0.00019008896797153026, "loss": 2.4881, "step": 567 }, { "epoch": 0.25244444444444447, "grad_norm": 1.2667707204818726, "learning_rate": 0.0001900711743772242, "loss": 2.8058, "step": 568 }, { "epoch": 0.2528888888888889, "grad_norm": 1.8489893674850464, "learning_rate": 0.00019005338078291817, "loss": 2.6723, "step": 569 }, { "epoch": 0.25333333333333335, "grad_norm": 1.2785292863845825, "learning_rate": 0.00019003558718861212, "loss": 2.8666, "step": 570 }, { "epoch": 0.25377777777777777, "grad_norm": 1.141205906867981, "learning_rate": 0.00019001779359430605, "loss": 2.1319, "step": 571 }, { "epoch": 0.25422222222222224, "grad_norm": 1.4991300106048584, "learning_rate": 0.00019, "loss": 2.3779, "step": 572 }, { "epoch": 0.25466666666666665, "grad_norm": 1.2517198324203491, "learning_rate": 0.00018998220640569394, "loss": 2.7036, "step": 573 }, { "epoch": 0.2551111111111111, "grad_norm": 1.186219334602356, "learning_rate": 0.0001899644128113879, "loss": 2.6787, "step": 574 }, { "epoch": 0.25555555555555554, "grad_norm": 1.2609152793884277, "learning_rate": 0.00018994661921708185, "loss": 2.5269, "step": 575 }, { "epoch": 0.256, "grad_norm": 1.4722431898117065, "learning_rate": 0.0001899288256227758, "loss": 2.6287, "step": 576 }, { "epoch": 0.2564444444444444, "grad_norm": 1.3302136659622192, "learning_rate": 0.00018991103202846977, "loss": 2.5439, "step": 577 }, { "epoch": 0.2568888888888889, "grad_norm": 1.270352840423584, "learning_rate": 0.00018989323843416372, "loss": 2.5176, "step": 578 }, { "epoch": 0.25733333333333336, "grad_norm": 1.2411810159683228, "learning_rate": 0.00018987544483985765, "loss": 2.423, "step": 579 }, { "epoch": 0.2577777777777778, "grad_norm": 1.3175048828125, "learning_rate": 0.0001898576512455516, "loss": 2.364, "step": 580 }, { "epoch": 0.25822222222222224, "grad_norm": 1.4399092197418213, "learning_rate": 0.00018983985765124556, "loss": 2.7466, "step": 581 }, { "epoch": 0.25866666666666666, "grad_norm": 1.234508752822876, "learning_rate": 0.00018982206405693952, "loss": 2.1863, "step": 582 }, { "epoch": 0.2591111111111111, "grad_norm": 1.6190673112869263, "learning_rate": 0.00018980427046263348, "loss": 2.4877, "step": 583 }, { "epoch": 0.25955555555555554, "grad_norm": 1.159323811531067, "learning_rate": 0.0001897864768683274, "loss": 2.2116, "step": 584 }, { "epoch": 0.26, "grad_norm": 1.284498929977417, "learning_rate": 0.00018976868327402136, "loss": 2.2746, "step": 585 }, { "epoch": 0.2604444444444444, "grad_norm": 1.366461992263794, "learning_rate": 0.0001897508896797153, "loss": 2.5624, "step": 586 }, { "epoch": 0.2608888888888889, "grad_norm": 1.4767354726791382, "learning_rate": 0.00018973309608540925, "loss": 2.5998, "step": 587 }, { "epoch": 0.2613333333333333, "grad_norm": 1.6025152206420898, "learning_rate": 0.0001897153024911032, "loss": 2.1852, "step": 588 }, { "epoch": 0.2617777777777778, "grad_norm": 1.54243803024292, "learning_rate": 0.00018969750889679716, "loss": 2.9229, "step": 589 }, { "epoch": 0.26222222222222225, "grad_norm": 1.440328598022461, "learning_rate": 0.00018967971530249112, "loss": 2.3776, "step": 590 }, { "epoch": 0.26266666666666666, "grad_norm": 1.5140371322631836, "learning_rate": 0.00018966192170818508, "loss": 3.043, "step": 591 }, { "epoch": 0.26311111111111113, "grad_norm": 1.7295174598693848, "learning_rate": 0.000189644128113879, "loss": 2.8572, "step": 592 }, { "epoch": 0.26355555555555554, "grad_norm": 1.5222134590148926, "learning_rate": 0.00018962633451957296, "loss": 2.2202, "step": 593 }, { "epoch": 0.264, "grad_norm": 1.484958529472351, "learning_rate": 0.00018960854092526692, "loss": 2.1051, "step": 594 }, { "epoch": 0.2644444444444444, "grad_norm": 1.4371466636657715, "learning_rate": 0.00018959074733096087, "loss": 2.301, "step": 595 }, { "epoch": 0.2648888888888889, "grad_norm": 1.6050223112106323, "learning_rate": 0.00018957295373665483, "loss": 2.4561, "step": 596 }, { "epoch": 0.2653333333333333, "grad_norm": 1.7809783220291138, "learning_rate": 0.00018955516014234876, "loss": 2.3243, "step": 597 }, { "epoch": 0.2657777777777778, "grad_norm": 1.850594401359558, "learning_rate": 0.00018953736654804272, "loss": 2.6873, "step": 598 }, { "epoch": 0.26622222222222225, "grad_norm": 1.9856559038162231, "learning_rate": 0.00018951957295373665, "loss": 2.3005, "step": 599 }, { "epoch": 0.26666666666666666, "grad_norm": 2.3018789291381836, "learning_rate": 0.0001895017793594306, "loss": 1.6366, "step": 600 }, { "epoch": 0.26711111111111113, "grad_norm": 0.9843171834945679, "learning_rate": 0.00018948398576512456, "loss": 2.7333, "step": 601 }, { "epoch": 0.26755555555555555, "grad_norm": 0.9362220764160156, "learning_rate": 0.00018946619217081852, "loss": 2.8176, "step": 602 }, { "epoch": 0.268, "grad_norm": 0.9775174260139465, "learning_rate": 0.00018944839857651247, "loss": 2.3173, "step": 603 }, { "epoch": 0.26844444444444443, "grad_norm": 1.0477993488311768, "learning_rate": 0.00018943060498220643, "loss": 2.0655, "step": 604 }, { "epoch": 0.2688888888888889, "grad_norm": 1.0463943481445312, "learning_rate": 0.00018941281138790036, "loss": 1.998, "step": 605 }, { "epoch": 0.2693333333333333, "grad_norm": 1.0541325807571411, "learning_rate": 0.00018939501779359431, "loss": 2.4205, "step": 606 }, { "epoch": 0.2697777777777778, "grad_norm": 1.0537536144256592, "learning_rate": 0.00018937722419928827, "loss": 2.8228, "step": 607 }, { "epoch": 0.2702222222222222, "grad_norm": 1.0244420766830444, "learning_rate": 0.00018935943060498223, "loss": 2.3281, "step": 608 }, { "epoch": 0.27066666666666667, "grad_norm": 1.3767787218093872, "learning_rate": 0.00018934163701067618, "loss": 3.1796, "step": 609 }, { "epoch": 0.27111111111111114, "grad_norm": 1.03878915309906, "learning_rate": 0.0001893238434163701, "loss": 1.968, "step": 610 }, { "epoch": 0.27155555555555555, "grad_norm": 1.1602753400802612, "learning_rate": 0.00018930604982206407, "loss": 2.6853, "step": 611 }, { "epoch": 0.272, "grad_norm": 1.0449435710906982, "learning_rate": 0.000189288256227758, "loss": 2.7955, "step": 612 }, { "epoch": 0.27244444444444443, "grad_norm": 1.095615029335022, "learning_rate": 0.00018927046263345196, "loss": 2.299, "step": 613 }, { "epoch": 0.2728888888888889, "grad_norm": 1.4768877029418945, "learning_rate": 0.0001892526690391459, "loss": 3.057, "step": 614 }, { "epoch": 0.2733333333333333, "grad_norm": 1.276252269744873, "learning_rate": 0.00018923487544483987, "loss": 2.585, "step": 615 }, { "epoch": 0.2737777777777778, "grad_norm": 1.2952446937561035, "learning_rate": 0.00018921708185053383, "loss": 2.5297, "step": 616 }, { "epoch": 0.2742222222222222, "grad_norm": 1.2312525510787964, "learning_rate": 0.00018919928825622778, "loss": 2.6543, "step": 617 }, { "epoch": 0.27466666666666667, "grad_norm": 1.368359923362732, "learning_rate": 0.0001891814946619217, "loss": 2.6951, "step": 618 }, { "epoch": 0.2751111111111111, "grad_norm": 1.50626802444458, "learning_rate": 0.00018916370106761567, "loss": 2.7336, "step": 619 }, { "epoch": 0.27555555555555555, "grad_norm": 1.2678533792495728, "learning_rate": 0.00018914590747330962, "loss": 2.4208, "step": 620 }, { "epoch": 0.276, "grad_norm": 1.167494773864746, "learning_rate": 0.00018912811387900358, "loss": 2.1091, "step": 621 }, { "epoch": 0.27644444444444444, "grad_norm": 1.12721586227417, "learning_rate": 0.00018911032028469754, "loss": 2.0721, "step": 622 }, { "epoch": 0.2768888888888889, "grad_norm": 1.306931495666504, "learning_rate": 0.00018909252669039147, "loss": 2.4418, "step": 623 }, { "epoch": 0.2773333333333333, "grad_norm": 1.8457114696502686, "learning_rate": 0.00018907473309608542, "loss": 3.0008, "step": 624 }, { "epoch": 0.2777777777777778, "grad_norm": 1.4332703351974487, "learning_rate": 0.00018905693950177935, "loss": 2.3379, "step": 625 }, { "epoch": 0.2782222222222222, "grad_norm": 1.4976214170455933, "learning_rate": 0.0001890391459074733, "loss": 2.8298, "step": 626 }, { "epoch": 0.2786666666666667, "grad_norm": 1.3851099014282227, "learning_rate": 0.00018902135231316727, "loss": 2.608, "step": 627 }, { "epoch": 0.2791111111111111, "grad_norm": 1.3901604413986206, "learning_rate": 0.00018900355871886122, "loss": 2.4981, "step": 628 }, { "epoch": 0.27955555555555556, "grad_norm": 1.5062224864959717, "learning_rate": 0.00018898576512455518, "loss": 2.4534, "step": 629 }, { "epoch": 0.28, "grad_norm": 1.3895263671875, "learning_rate": 0.00018896797153024913, "loss": 2.5953, "step": 630 }, { "epoch": 0.28044444444444444, "grad_norm": 1.5202879905700684, "learning_rate": 0.00018895017793594306, "loss": 2.2741, "step": 631 }, { "epoch": 0.2808888888888889, "grad_norm": 1.2481839656829834, "learning_rate": 0.00018893238434163702, "loss": 2.3242, "step": 632 }, { "epoch": 0.2813333333333333, "grad_norm": 1.2710275650024414, "learning_rate": 0.00018891459074733098, "loss": 2.1379, "step": 633 }, { "epoch": 0.2817777777777778, "grad_norm": 1.3879283666610718, "learning_rate": 0.00018889679715302493, "loss": 1.353, "step": 634 }, { "epoch": 0.2822222222222222, "grad_norm": 1.5457983016967773, "learning_rate": 0.0001888790035587189, "loss": 2.3432, "step": 635 }, { "epoch": 0.2826666666666667, "grad_norm": 1.5545676946640015, "learning_rate": 0.00018886120996441282, "loss": 2.6554, "step": 636 }, { "epoch": 0.2831111111111111, "grad_norm": 1.2374818325042725, "learning_rate": 0.00018884341637010678, "loss": 1.4633, "step": 637 }, { "epoch": 0.28355555555555556, "grad_norm": 1.3478444814682007, "learning_rate": 0.0001888256227758007, "loss": 2.4976, "step": 638 }, { "epoch": 0.284, "grad_norm": 1.5220305919647217, "learning_rate": 0.00018880782918149466, "loss": 2.6628, "step": 639 }, { "epoch": 0.28444444444444444, "grad_norm": 1.6763924360275269, "learning_rate": 0.00018879003558718862, "loss": 2.7547, "step": 640 }, { "epoch": 0.2848888888888889, "grad_norm": 1.5462572574615479, "learning_rate": 0.00018877224199288258, "loss": 2.6934, "step": 641 }, { "epoch": 0.2853333333333333, "grad_norm": 1.9419124126434326, "learning_rate": 0.00018875444839857653, "loss": 2.5849, "step": 642 }, { "epoch": 0.2857777777777778, "grad_norm": 1.6451220512390137, "learning_rate": 0.00018873665480427046, "loss": 2.7719, "step": 643 }, { "epoch": 0.2862222222222222, "grad_norm": 1.5274759531021118, "learning_rate": 0.00018871886120996442, "loss": 2.2116, "step": 644 }, { "epoch": 0.2866666666666667, "grad_norm": 2.1637439727783203, "learning_rate": 0.00018870106761565837, "loss": 3.0492, "step": 645 }, { "epoch": 0.2871111111111111, "grad_norm": 1.6823992729187012, "learning_rate": 0.00018868327402135233, "loss": 2.7498, "step": 646 }, { "epoch": 0.28755555555555556, "grad_norm": 1.695084810256958, "learning_rate": 0.0001886654804270463, "loss": 2.6973, "step": 647 }, { "epoch": 0.288, "grad_norm": 1.8430315256118774, "learning_rate": 0.00018864768683274024, "loss": 1.9394, "step": 648 }, { "epoch": 0.28844444444444445, "grad_norm": 2.2120563983917236, "learning_rate": 0.00018862989323843417, "loss": 1.9077, "step": 649 }, { "epoch": 0.28888888888888886, "grad_norm": 2.309023141860962, "learning_rate": 0.0001886120996441281, "loss": 2.15, "step": 650 }, { "epoch": 0.28933333333333333, "grad_norm": 1.1258161067962646, "learning_rate": 0.00018859430604982206, "loss": 2.9532, "step": 651 }, { "epoch": 0.2897777777777778, "grad_norm": 0.9455536603927612, "learning_rate": 0.00018857651245551602, "loss": 2.695, "step": 652 }, { "epoch": 0.2902222222222222, "grad_norm": 1.228993535041809, "learning_rate": 0.00018855871886120997, "loss": 1.3139, "step": 653 }, { "epoch": 0.2906666666666667, "grad_norm": 1.0988825559616089, "learning_rate": 0.00018854092526690393, "loss": 2.3994, "step": 654 }, { "epoch": 0.2911111111111111, "grad_norm": 1.0510218143463135, "learning_rate": 0.00018852313167259788, "loss": 2.9534, "step": 655 }, { "epoch": 0.29155555555555557, "grad_norm": 1.1386710405349731, "learning_rate": 0.00018850533807829181, "loss": 2.2164, "step": 656 }, { "epoch": 0.292, "grad_norm": 1.1761900186538696, "learning_rate": 0.00018848754448398577, "loss": 2.6014, "step": 657 }, { "epoch": 0.29244444444444445, "grad_norm": 1.0478448867797852, "learning_rate": 0.00018846975088967973, "loss": 2.497, "step": 658 }, { "epoch": 0.29288888888888887, "grad_norm": 1.2289860248565674, "learning_rate": 0.00018845195729537368, "loss": 3.0106, "step": 659 }, { "epoch": 0.29333333333333333, "grad_norm": 1.2881073951721191, "learning_rate": 0.00018843416370106764, "loss": 2.5109, "step": 660 }, { "epoch": 0.2937777777777778, "grad_norm": 1.2944267988204956, "learning_rate": 0.00018841637010676157, "loss": 2.3959, "step": 661 }, { "epoch": 0.2942222222222222, "grad_norm": 1.1457511186599731, "learning_rate": 0.00018839857651245553, "loss": 2.5717, "step": 662 }, { "epoch": 0.2946666666666667, "grad_norm": 1.288615345954895, "learning_rate": 0.00018838078291814946, "loss": 2.6314, "step": 663 }, { "epoch": 0.2951111111111111, "grad_norm": 1.153175711631775, "learning_rate": 0.0001883629893238434, "loss": 2.1529, "step": 664 }, { "epoch": 0.29555555555555557, "grad_norm": 1.0861622095108032, "learning_rate": 0.00018834519572953737, "loss": 2.2134, "step": 665 }, { "epoch": 0.296, "grad_norm": 1.3027865886688232, "learning_rate": 0.00018832740213523132, "loss": 2.496, "step": 666 }, { "epoch": 0.29644444444444445, "grad_norm": 1.2084095478057861, "learning_rate": 0.00018830960854092528, "loss": 2.6488, "step": 667 }, { "epoch": 0.29688888888888887, "grad_norm": 1.2126069068908691, "learning_rate": 0.00018829181494661924, "loss": 2.2976, "step": 668 }, { "epoch": 0.29733333333333334, "grad_norm": 1.392379879951477, "learning_rate": 0.00018827402135231317, "loss": 2.25, "step": 669 }, { "epoch": 0.29777777777777775, "grad_norm": 1.228171944618225, "learning_rate": 0.00018825622775800712, "loss": 2.4179, "step": 670 }, { "epoch": 0.2982222222222222, "grad_norm": 1.2194924354553223, "learning_rate": 0.00018823843416370108, "loss": 2.5822, "step": 671 }, { "epoch": 0.2986666666666667, "grad_norm": 1.2600946426391602, "learning_rate": 0.00018822064056939504, "loss": 2.3714, "step": 672 }, { "epoch": 0.2991111111111111, "grad_norm": 1.2756378650665283, "learning_rate": 0.000188202846975089, "loss": 2.3046, "step": 673 }, { "epoch": 0.2995555555555556, "grad_norm": 1.4584718942642212, "learning_rate": 0.00018818505338078292, "loss": 2.6382, "step": 674 }, { "epoch": 0.3, "grad_norm": 1.393619179725647, "learning_rate": 0.00018816725978647688, "loss": 2.7167, "step": 675 }, { "epoch": 0.30044444444444446, "grad_norm": 1.6865702867507935, "learning_rate": 0.0001881494661921708, "loss": 2.446, "step": 676 }, { "epoch": 0.3008888888888889, "grad_norm": 1.3696800470352173, "learning_rate": 0.00018813167259786477, "loss": 2.5134, "step": 677 }, { "epoch": 0.30133333333333334, "grad_norm": 1.3241018056869507, "learning_rate": 0.00018811387900355872, "loss": 2.7836, "step": 678 }, { "epoch": 0.30177777777777776, "grad_norm": 1.3688435554504395, "learning_rate": 0.00018809608540925268, "loss": 2.8028, "step": 679 }, { "epoch": 0.3022222222222222, "grad_norm": 2.019115447998047, "learning_rate": 0.00018807829181494663, "loss": 2.4958, "step": 680 }, { "epoch": 0.30266666666666664, "grad_norm": 1.3393666744232178, "learning_rate": 0.0001880604982206406, "loss": 2.4144, "step": 681 }, { "epoch": 0.3031111111111111, "grad_norm": 1.5808879137039185, "learning_rate": 0.00018804270462633452, "loss": 1.5098, "step": 682 }, { "epoch": 0.3035555555555556, "grad_norm": 1.5631835460662842, "learning_rate": 0.00018802491103202848, "loss": 2.6174, "step": 683 }, { "epoch": 0.304, "grad_norm": 1.338965892791748, "learning_rate": 0.00018800711743772243, "loss": 2.933, "step": 684 }, { "epoch": 0.30444444444444446, "grad_norm": 1.4270402193069458, "learning_rate": 0.0001879893238434164, "loss": 2.4163, "step": 685 }, { "epoch": 0.3048888888888889, "grad_norm": 1.6511561870574951, "learning_rate": 0.00018797153024911035, "loss": 2.8065, "step": 686 }, { "epoch": 0.30533333333333335, "grad_norm": 1.3582799434661865, "learning_rate": 0.00018795373665480428, "loss": 2.4156, "step": 687 }, { "epoch": 0.30577777777777776, "grad_norm": 1.3298442363739014, "learning_rate": 0.00018793594306049823, "loss": 2.7763, "step": 688 }, { "epoch": 0.30622222222222223, "grad_norm": 1.5233420133590698, "learning_rate": 0.00018791814946619216, "loss": 2.605, "step": 689 }, { "epoch": 0.30666666666666664, "grad_norm": 1.3514484167099, "learning_rate": 0.00018790035587188612, "loss": 2.6729, "step": 690 }, { "epoch": 0.3071111111111111, "grad_norm": 1.5793657302856445, "learning_rate": 0.00018788256227758007, "loss": 2.8392, "step": 691 }, { "epoch": 0.3075555555555556, "grad_norm": 1.590437889099121, "learning_rate": 0.00018786476868327403, "loss": 2.445, "step": 692 }, { "epoch": 0.308, "grad_norm": 1.5807512998580933, "learning_rate": 0.000187846975088968, "loss": 2.9341, "step": 693 }, { "epoch": 0.30844444444444447, "grad_norm": 1.5815593004226685, "learning_rate": 0.00018782918149466194, "loss": 2.5062, "step": 694 }, { "epoch": 0.3088888888888889, "grad_norm": 1.6342856884002686, "learning_rate": 0.00018781138790035587, "loss": 2.769, "step": 695 }, { "epoch": 0.30933333333333335, "grad_norm": 1.8067598342895508, "learning_rate": 0.00018779359430604983, "loss": 2.3512, "step": 696 }, { "epoch": 0.30977777777777776, "grad_norm": 1.7684136629104614, "learning_rate": 0.0001877758007117438, "loss": 3.2031, "step": 697 }, { "epoch": 0.31022222222222223, "grad_norm": 1.7312310934066772, "learning_rate": 0.00018775800711743774, "loss": 2.7118, "step": 698 }, { "epoch": 0.31066666666666665, "grad_norm": 1.7264273166656494, "learning_rate": 0.0001877402135231317, "loss": 2.7106, "step": 699 }, { "epoch": 0.3111111111111111, "grad_norm": 1.8946504592895508, "learning_rate": 0.00018772241992882563, "loss": 1.8234, "step": 700 }, { "epoch": 0.31155555555555553, "grad_norm": 0.897571325302124, "learning_rate": 0.00018770462633451959, "loss": 2.4317, "step": 701 }, { "epoch": 0.312, "grad_norm": 1.212544560432434, "learning_rate": 0.00018768683274021351, "loss": 1.817, "step": 702 }, { "epoch": 0.31244444444444447, "grad_norm": 1.4370547533035278, "learning_rate": 0.00018766903914590747, "loss": 1.6912, "step": 703 }, { "epoch": 0.3128888888888889, "grad_norm": 1.2379952669143677, "learning_rate": 0.00018765124555160143, "loss": 2.8551, "step": 704 }, { "epoch": 0.31333333333333335, "grad_norm": 1.2043800354003906, "learning_rate": 0.00018763345195729538, "loss": 2.2021, "step": 705 }, { "epoch": 0.31377777777777777, "grad_norm": 1.1653484106063843, "learning_rate": 0.00018761565836298934, "loss": 2.97, "step": 706 }, { "epoch": 0.31422222222222224, "grad_norm": 1.2504090070724487, "learning_rate": 0.0001875978647686833, "loss": 2.8417, "step": 707 }, { "epoch": 0.31466666666666665, "grad_norm": 1.186420202255249, "learning_rate": 0.00018758007117437723, "loss": 2.2231, "step": 708 }, { "epoch": 0.3151111111111111, "grad_norm": 1.3034793138504028, "learning_rate": 0.00018756227758007118, "loss": 2.7367, "step": 709 }, { "epoch": 0.31555555555555553, "grad_norm": 1.1502488851547241, "learning_rate": 0.00018754448398576514, "loss": 3.0465, "step": 710 }, { "epoch": 0.316, "grad_norm": 1.3152366876602173, "learning_rate": 0.0001875266903914591, "loss": 3.0754, "step": 711 }, { "epoch": 0.3164444444444444, "grad_norm": 1.0930118560791016, "learning_rate": 0.00018750889679715305, "loss": 2.3183, "step": 712 }, { "epoch": 0.3168888888888889, "grad_norm": 1.3411848545074463, "learning_rate": 0.00018749110320284698, "loss": 3.0055, "step": 713 }, { "epoch": 0.31733333333333336, "grad_norm": 1.222649097442627, "learning_rate": 0.00018747330960854094, "loss": 2.541, "step": 714 }, { "epoch": 0.31777777777777777, "grad_norm": 1.3231635093688965, "learning_rate": 0.00018745551601423487, "loss": 3.1424, "step": 715 }, { "epoch": 0.31822222222222224, "grad_norm": 1.3029333353042603, "learning_rate": 0.00018743772241992882, "loss": 2.736, "step": 716 }, { "epoch": 0.31866666666666665, "grad_norm": 1.1651556491851807, "learning_rate": 0.00018741992882562278, "loss": 2.4525, "step": 717 }, { "epoch": 0.3191111111111111, "grad_norm": 1.1928997039794922, "learning_rate": 0.00018740213523131674, "loss": 2.338, "step": 718 }, { "epoch": 0.31955555555555554, "grad_norm": 1.2094029188156128, "learning_rate": 0.0001873843416370107, "loss": 2.1728, "step": 719 }, { "epoch": 0.32, "grad_norm": 1.3674081563949585, "learning_rate": 0.00018736654804270465, "loss": 2.7086, "step": 720 }, { "epoch": 0.3204444444444444, "grad_norm": 1.2240111827850342, "learning_rate": 0.00018734875444839858, "loss": 2.4084, "step": 721 }, { "epoch": 0.3208888888888889, "grad_norm": 1.625939965248108, "learning_rate": 0.00018733096085409254, "loss": 2.233, "step": 722 }, { "epoch": 0.32133333333333336, "grad_norm": 1.572806477546692, "learning_rate": 0.0001873131672597865, "loss": 2.8494, "step": 723 }, { "epoch": 0.3217777777777778, "grad_norm": 1.4796736240386963, "learning_rate": 0.00018729537366548045, "loss": 2.5345, "step": 724 }, { "epoch": 0.32222222222222224, "grad_norm": 1.5258103609085083, "learning_rate": 0.0001872775800711744, "loss": 2.4471, "step": 725 }, { "epoch": 0.32266666666666666, "grad_norm": 1.5818583965301514, "learning_rate": 0.00018725978647686834, "loss": 2.9746, "step": 726 }, { "epoch": 0.3231111111111111, "grad_norm": 1.275765061378479, "learning_rate": 0.00018724199288256226, "loss": 2.4232, "step": 727 }, { "epoch": 0.32355555555555554, "grad_norm": 1.3722361326217651, "learning_rate": 0.00018722419928825622, "loss": 1.4788, "step": 728 }, { "epoch": 0.324, "grad_norm": 1.369632363319397, "learning_rate": 0.00018720640569395018, "loss": 2.5712, "step": 729 }, { "epoch": 0.3244444444444444, "grad_norm": 1.5968562364578247, "learning_rate": 0.00018718861209964413, "loss": 2.3008, "step": 730 }, { "epoch": 0.3248888888888889, "grad_norm": 1.6501327753067017, "learning_rate": 0.0001871708185053381, "loss": 2.5323, "step": 731 }, { "epoch": 0.3253333333333333, "grad_norm": 1.5736616849899292, "learning_rate": 0.00018715302491103205, "loss": 2.2086, "step": 732 }, { "epoch": 0.3257777777777778, "grad_norm": 1.4434736967086792, "learning_rate": 0.00018713523131672598, "loss": 2.613, "step": 733 }, { "epoch": 0.32622222222222225, "grad_norm": 1.5532594919204712, "learning_rate": 0.00018711743772241993, "loss": 2.4163, "step": 734 }, { "epoch": 0.32666666666666666, "grad_norm": 1.4101078510284424, "learning_rate": 0.0001870996441281139, "loss": 2.6866, "step": 735 }, { "epoch": 0.32711111111111113, "grad_norm": 1.3974218368530273, "learning_rate": 0.00018708185053380785, "loss": 2.0701, "step": 736 }, { "epoch": 0.32755555555555554, "grad_norm": 1.8705499172210693, "learning_rate": 0.0001870640569395018, "loss": 2.5967, "step": 737 }, { "epoch": 0.328, "grad_norm": 1.7035057544708252, "learning_rate": 0.00018704626334519576, "loss": 2.589, "step": 738 }, { "epoch": 0.32844444444444443, "grad_norm": 1.477556824684143, "learning_rate": 0.0001870284697508897, "loss": 2.576, "step": 739 }, { "epoch": 0.3288888888888889, "grad_norm": 1.5129868984222412, "learning_rate": 0.00018701067615658362, "loss": 2.5122, "step": 740 }, { "epoch": 0.3293333333333333, "grad_norm": 1.453865647315979, "learning_rate": 0.00018699288256227757, "loss": 2.1861, "step": 741 }, { "epoch": 0.3297777777777778, "grad_norm": 1.3834642171859741, "learning_rate": 0.00018697508896797153, "loss": 2.3745, "step": 742 }, { "epoch": 0.3302222222222222, "grad_norm": 1.610922932624817, "learning_rate": 0.0001869572953736655, "loss": 2.6936, "step": 743 }, { "epoch": 0.33066666666666666, "grad_norm": 1.550073266029358, "learning_rate": 0.00018693950177935944, "loss": 2.3292, "step": 744 }, { "epoch": 0.33111111111111113, "grad_norm": 1.5677452087402344, "learning_rate": 0.0001869217081850534, "loss": 2.8312, "step": 745 }, { "epoch": 0.33155555555555555, "grad_norm": 1.6848689317703247, "learning_rate": 0.00018690391459074733, "loss": 2.9414, "step": 746 }, { "epoch": 0.332, "grad_norm": 1.5544129610061646, "learning_rate": 0.00018688612099644129, "loss": 2.3318, "step": 747 }, { "epoch": 0.33244444444444443, "grad_norm": 1.705752968788147, "learning_rate": 0.00018686832740213524, "loss": 2.4389, "step": 748 }, { "epoch": 0.3328888888888889, "grad_norm": 1.7628095149993896, "learning_rate": 0.0001868505338078292, "loss": 2.7991, "step": 749 }, { "epoch": 0.3333333333333333, "grad_norm": 2.3342695236206055, "learning_rate": 0.00018683274021352316, "loss": 2.8554, "step": 750 }, { "epoch": 0.3337777777777778, "grad_norm": 0.9579228758811951, "learning_rate": 0.0001868149466192171, "loss": 3.0001, "step": 751 }, { "epoch": 0.3342222222222222, "grad_norm": 0.910830020904541, "learning_rate": 0.00018679715302491104, "loss": 2.7247, "step": 752 }, { "epoch": 0.33466666666666667, "grad_norm": 0.9618215560913086, "learning_rate": 0.00018677935943060497, "loss": 2.1627, "step": 753 }, { "epoch": 0.33511111111111114, "grad_norm": 1.398019790649414, "learning_rate": 0.00018676156583629893, "loss": 2.8774, "step": 754 }, { "epoch": 0.33555555555555555, "grad_norm": 0.98659348487854, "learning_rate": 0.00018674377224199288, "loss": 2.207, "step": 755 }, { "epoch": 0.336, "grad_norm": 1.0544646978378296, "learning_rate": 0.00018672597864768684, "loss": 2.5833, "step": 756 }, { "epoch": 0.33644444444444443, "grad_norm": 1.1433809995651245, "learning_rate": 0.0001867081850533808, "loss": 2.1706, "step": 757 }, { "epoch": 0.3368888888888889, "grad_norm": 1.170425295829773, "learning_rate": 0.00018669039145907475, "loss": 2.3311, "step": 758 }, { "epoch": 0.3373333333333333, "grad_norm": 1.2254000902175903, "learning_rate": 0.00018667259786476868, "loss": 2.671, "step": 759 }, { "epoch": 0.3377777777777778, "grad_norm": 1.2437927722930908, "learning_rate": 0.00018665480427046264, "loss": 2.9828, "step": 760 }, { "epoch": 0.3382222222222222, "grad_norm": 1.3588943481445312, "learning_rate": 0.0001866370106761566, "loss": 2.8734, "step": 761 }, { "epoch": 0.33866666666666667, "grad_norm": 1.5109245777130127, "learning_rate": 0.00018661921708185055, "loss": 2.0159, "step": 762 }, { "epoch": 0.3391111111111111, "grad_norm": 1.2116190195083618, "learning_rate": 0.0001866014234875445, "loss": 2.1226, "step": 763 }, { "epoch": 0.33955555555555555, "grad_norm": 1.3050971031188965, "learning_rate": 0.00018658362989323847, "loss": 2.2373, "step": 764 }, { "epoch": 0.34, "grad_norm": 1.2877930402755737, "learning_rate": 0.0001865658362989324, "loss": 2.5477, "step": 765 }, { "epoch": 0.34044444444444444, "grad_norm": 1.306307077407837, "learning_rate": 0.00018654804270462632, "loss": 2.7654, "step": 766 }, { "epoch": 0.3408888888888889, "grad_norm": 1.3450534343719482, "learning_rate": 0.00018653024911032028, "loss": 1.9589, "step": 767 }, { "epoch": 0.3413333333333333, "grad_norm": 1.574854850769043, "learning_rate": 0.00018651245551601424, "loss": 2.778, "step": 768 }, { "epoch": 0.3417777777777778, "grad_norm": 1.3935576677322388, "learning_rate": 0.0001864946619217082, "loss": 2.8835, "step": 769 }, { "epoch": 0.3422222222222222, "grad_norm": 1.169109582901001, "learning_rate": 0.00018647686832740215, "loss": 1.8586, "step": 770 }, { "epoch": 0.3426666666666667, "grad_norm": 1.6845237016677856, "learning_rate": 0.0001864590747330961, "loss": 2.3178, "step": 771 }, { "epoch": 0.3431111111111111, "grad_norm": 2.100719690322876, "learning_rate": 0.00018644128113879004, "loss": 2.5948, "step": 772 }, { "epoch": 0.34355555555555556, "grad_norm": 1.3049522638320923, "learning_rate": 0.000186423487544484, "loss": 2.2732, "step": 773 }, { "epoch": 0.344, "grad_norm": 1.610150933265686, "learning_rate": 0.00018640569395017795, "loss": 2.658, "step": 774 }, { "epoch": 0.34444444444444444, "grad_norm": 2.3013813495635986, "learning_rate": 0.0001863879003558719, "loss": 1.0201, "step": 775 }, { "epoch": 0.3448888888888889, "grad_norm": 1.3802649974822998, "learning_rate": 0.00018637010676156586, "loss": 2.4351, "step": 776 }, { "epoch": 0.3453333333333333, "grad_norm": 1.4456000328063965, "learning_rate": 0.0001863523131672598, "loss": 2.9545, "step": 777 }, { "epoch": 0.3457777777777778, "grad_norm": 1.7522518634796143, "learning_rate": 0.00018633451957295375, "loss": 2.3526, "step": 778 }, { "epoch": 0.3462222222222222, "grad_norm": 1.460486650466919, "learning_rate": 0.00018631672597864768, "loss": 2.9393, "step": 779 }, { "epoch": 0.3466666666666667, "grad_norm": 1.650462031364441, "learning_rate": 0.00018629893238434163, "loss": 2.8534, "step": 780 }, { "epoch": 0.3471111111111111, "grad_norm": 1.3861716985702515, "learning_rate": 0.0001862811387900356, "loss": 2.7778, "step": 781 }, { "epoch": 0.34755555555555556, "grad_norm": 1.4128412008285522, "learning_rate": 0.00018626334519572955, "loss": 2.5953, "step": 782 }, { "epoch": 0.348, "grad_norm": 1.5334755182266235, "learning_rate": 0.0001862455516014235, "loss": 2.4878, "step": 783 }, { "epoch": 0.34844444444444445, "grad_norm": 1.4212511777877808, "learning_rate": 0.00018622775800711746, "loss": 2.4014, "step": 784 }, { "epoch": 0.3488888888888889, "grad_norm": 1.7563406229019165, "learning_rate": 0.0001862099644128114, "loss": 2.8054, "step": 785 }, { "epoch": 0.34933333333333333, "grad_norm": 1.8490791320800781, "learning_rate": 0.00018619217081850535, "loss": 2.5487, "step": 786 }, { "epoch": 0.3497777777777778, "grad_norm": 1.55039644241333, "learning_rate": 0.0001861743772241993, "loss": 2.1236, "step": 787 }, { "epoch": 0.3502222222222222, "grad_norm": 1.187299132347107, "learning_rate": 0.00018615658362989326, "loss": 1.773, "step": 788 }, { "epoch": 0.3506666666666667, "grad_norm": 1.3866082429885864, "learning_rate": 0.00018613879003558722, "loss": 2.0484, "step": 789 }, { "epoch": 0.3511111111111111, "grad_norm": 1.5214849710464478, "learning_rate": 0.00018612099644128114, "loss": 2.5582, "step": 790 }, { "epoch": 0.35155555555555557, "grad_norm": 1.9252493381500244, "learning_rate": 0.0001861032028469751, "loss": 2.6299, "step": 791 }, { "epoch": 0.352, "grad_norm": 2.005993366241455, "learning_rate": 0.00018608540925266903, "loss": 2.9747, "step": 792 }, { "epoch": 0.35244444444444445, "grad_norm": 1.4414646625518799, "learning_rate": 0.000186067615658363, "loss": 2.3433, "step": 793 }, { "epoch": 0.35288888888888886, "grad_norm": 1.6313060522079468, "learning_rate": 0.00018604982206405694, "loss": 2.4237, "step": 794 }, { "epoch": 0.35333333333333333, "grad_norm": 1.917683720588684, "learning_rate": 0.0001860320284697509, "loss": 2.7755, "step": 795 }, { "epoch": 0.3537777777777778, "grad_norm": 1.681670069694519, "learning_rate": 0.00018601423487544486, "loss": 2.5065, "step": 796 }, { "epoch": 0.3542222222222222, "grad_norm": 1.6937282085418701, "learning_rate": 0.0001859964412811388, "loss": 2.1956, "step": 797 }, { "epoch": 0.3546666666666667, "grad_norm": 1.8412083387374878, "learning_rate": 0.00018597864768683274, "loss": 2.2921, "step": 798 }, { "epoch": 0.3551111111111111, "grad_norm": 2.1753652095794678, "learning_rate": 0.0001859608540925267, "loss": 2.0335, "step": 799 }, { "epoch": 0.35555555555555557, "grad_norm": 3.113302707672119, "learning_rate": 0.00018594306049822066, "loss": 2.3211, "step": 800 }, { "epoch": 0.356, "grad_norm": 1.0349335670471191, "learning_rate": 0.0001859252669039146, "loss": 2.5146, "step": 801 }, { "epoch": 0.35644444444444445, "grad_norm": 1.299102544784546, "learning_rate": 0.00018590747330960857, "loss": 2.537, "step": 802 }, { "epoch": 0.35688888888888887, "grad_norm": 1.0697323083877563, "learning_rate": 0.0001858896797153025, "loss": 2.2026, "step": 803 }, { "epoch": 0.35733333333333334, "grad_norm": 1.0610216856002808, "learning_rate": 0.00018587188612099645, "loss": 2.6074, "step": 804 }, { "epoch": 0.35777777777777775, "grad_norm": 1.3162498474121094, "learning_rate": 0.00018585409252669038, "loss": 3.1557, "step": 805 }, { "epoch": 0.3582222222222222, "grad_norm": 1.2941645383834839, "learning_rate": 0.00018583629893238434, "loss": 2.8547, "step": 806 }, { "epoch": 0.3586666666666667, "grad_norm": 1.1461007595062256, "learning_rate": 0.0001858185053380783, "loss": 2.7406, "step": 807 }, { "epoch": 0.3591111111111111, "grad_norm": 1.1096692085266113, "learning_rate": 0.00018580071174377225, "loss": 2.3455, "step": 808 }, { "epoch": 0.3595555555555556, "grad_norm": 1.158469796180725, "learning_rate": 0.0001857829181494662, "loss": 2.9105, "step": 809 }, { "epoch": 0.36, "grad_norm": 1.1534368991851807, "learning_rate": 0.00018576512455516017, "loss": 2.6555, "step": 810 }, { "epoch": 0.36044444444444446, "grad_norm": 1.1266659498214722, "learning_rate": 0.0001857473309608541, "loss": 2.7283, "step": 811 }, { "epoch": 0.36088888888888887, "grad_norm": 1.1437948942184448, "learning_rate": 0.00018572953736654805, "loss": 2.4803, "step": 812 }, { "epoch": 0.36133333333333334, "grad_norm": 1.182286262512207, "learning_rate": 0.000185711743772242, "loss": 2.6214, "step": 813 }, { "epoch": 0.36177777777777775, "grad_norm": 1.253722071647644, "learning_rate": 0.00018569395017793596, "loss": 2.6758, "step": 814 }, { "epoch": 0.3622222222222222, "grad_norm": 1.2334574460983276, "learning_rate": 0.00018567615658362992, "loss": 2.6005, "step": 815 }, { "epoch": 0.3626666666666667, "grad_norm": 1.1198886632919312, "learning_rate": 0.00018565836298932385, "loss": 2.4263, "step": 816 }, { "epoch": 0.3631111111111111, "grad_norm": 1.501847743988037, "learning_rate": 0.00018564056939501778, "loss": 3.3744, "step": 817 }, { "epoch": 0.3635555555555556, "grad_norm": 1.3934186697006226, "learning_rate": 0.00018562277580071174, "loss": 3.2117, "step": 818 }, { "epoch": 0.364, "grad_norm": 1.3710157871246338, "learning_rate": 0.0001856049822064057, "loss": 1.9331, "step": 819 }, { "epoch": 0.36444444444444446, "grad_norm": 1.4316257238388062, "learning_rate": 0.00018558718861209965, "loss": 1.9532, "step": 820 }, { "epoch": 0.3648888888888889, "grad_norm": 1.8586760759353638, "learning_rate": 0.0001855693950177936, "loss": 2.8284, "step": 821 }, { "epoch": 0.36533333333333334, "grad_norm": 1.5303040742874146, "learning_rate": 0.00018555160142348756, "loss": 2.0609, "step": 822 }, { "epoch": 0.36577777777777776, "grad_norm": 1.5688817501068115, "learning_rate": 0.0001855338078291815, "loss": 2.5499, "step": 823 }, { "epoch": 0.3662222222222222, "grad_norm": 1.3707939386367798, "learning_rate": 0.00018551601423487545, "loss": 2.4933, "step": 824 }, { "epoch": 0.36666666666666664, "grad_norm": 1.435909628868103, "learning_rate": 0.0001854982206405694, "loss": 2.7909, "step": 825 }, { "epoch": 0.3671111111111111, "grad_norm": 1.5045204162597656, "learning_rate": 0.00018548042704626336, "loss": 2.4644, "step": 826 }, { "epoch": 0.3675555555555556, "grad_norm": 1.3811103105545044, "learning_rate": 0.00018546263345195732, "loss": 2.1934, "step": 827 }, { "epoch": 0.368, "grad_norm": 1.7939069271087646, "learning_rate": 0.00018544483985765127, "loss": 2.2179, "step": 828 }, { "epoch": 0.36844444444444446, "grad_norm": 1.7151756286621094, "learning_rate": 0.0001854270462633452, "loss": 2.9784, "step": 829 }, { "epoch": 0.3688888888888889, "grad_norm": 1.6932202577590942, "learning_rate": 0.00018540925266903913, "loss": 2.5832, "step": 830 }, { "epoch": 0.36933333333333335, "grad_norm": 1.4874944686889648, "learning_rate": 0.0001853914590747331, "loss": 2.2795, "step": 831 }, { "epoch": 0.36977777777777776, "grad_norm": 1.5106111764907837, "learning_rate": 0.00018537366548042705, "loss": 1.6537, "step": 832 }, { "epoch": 0.37022222222222223, "grad_norm": 1.5347083806991577, "learning_rate": 0.000185355871886121, "loss": 2.5899, "step": 833 }, { "epoch": 0.37066666666666664, "grad_norm": 1.2724993228912354, "learning_rate": 0.00018533807829181496, "loss": 2.4979, "step": 834 }, { "epoch": 0.3711111111111111, "grad_norm": 1.5324300527572632, "learning_rate": 0.00018532028469750892, "loss": 2.4469, "step": 835 }, { "epoch": 0.37155555555555553, "grad_norm": 1.6249970197677612, "learning_rate": 0.00018530249110320285, "loss": 2.6245, "step": 836 }, { "epoch": 0.372, "grad_norm": 1.3731900453567505, "learning_rate": 0.0001852846975088968, "loss": 1.9859, "step": 837 }, { "epoch": 0.37244444444444447, "grad_norm": 1.437991976737976, "learning_rate": 0.00018526690391459076, "loss": 2.4142, "step": 838 }, { "epoch": 0.3728888888888889, "grad_norm": 2.2342700958251953, "learning_rate": 0.00018524911032028471, "loss": 1.0456, "step": 839 }, { "epoch": 0.37333333333333335, "grad_norm": 1.4727734327316284, "learning_rate": 0.00018523131672597867, "loss": 2.3314, "step": 840 }, { "epoch": 0.37377777777777776, "grad_norm": 1.6986255645751953, "learning_rate": 0.00018521352313167263, "loss": 2.5555, "step": 841 }, { "epoch": 0.37422222222222223, "grad_norm": 1.611127495765686, "learning_rate": 0.00018519572953736656, "loss": 2.5777, "step": 842 }, { "epoch": 0.37466666666666665, "grad_norm": 1.5206453800201416, "learning_rate": 0.0001851779359430605, "loss": 2.8207, "step": 843 }, { "epoch": 0.3751111111111111, "grad_norm": 1.5014015436172485, "learning_rate": 0.00018516014234875444, "loss": 2.2796, "step": 844 }, { "epoch": 0.37555555555555553, "grad_norm": 1.9145801067352295, "learning_rate": 0.0001851423487544484, "loss": 2.432, "step": 845 }, { "epoch": 0.376, "grad_norm": 1.7269951105117798, "learning_rate": 0.00018512455516014236, "loss": 2.8725, "step": 846 }, { "epoch": 0.37644444444444447, "grad_norm": 2.094856023788452, "learning_rate": 0.0001851067615658363, "loss": 2.5228, "step": 847 }, { "epoch": 0.3768888888888889, "grad_norm": 4.175536155700684, "learning_rate": 0.00018508896797153027, "loss": 1.4835, "step": 848 }, { "epoch": 0.37733333333333335, "grad_norm": 2.4652979373931885, "learning_rate": 0.0001850711743772242, "loss": 2.4617, "step": 849 }, { "epoch": 0.37777777777777777, "grad_norm": 2.9979593753814697, "learning_rate": 0.00018505338078291815, "loss": 2.2239, "step": 850 }, { "epoch": 0.37822222222222224, "grad_norm": 2.925299644470215, "learning_rate": 0.0001850355871886121, "loss": 1.4753, "step": 851 }, { "epoch": 0.37866666666666665, "grad_norm": 1.156390905380249, "learning_rate": 0.00018501779359430607, "loss": 2.5781, "step": 852 }, { "epoch": 0.3791111111111111, "grad_norm": 1.1416361331939697, "learning_rate": 0.00018500000000000002, "loss": 2.5683, "step": 853 }, { "epoch": 0.37955555555555553, "grad_norm": 1.1092318296432495, "learning_rate": 0.00018498220640569398, "loss": 2.6212, "step": 854 }, { "epoch": 0.38, "grad_norm": 1.1286892890930176, "learning_rate": 0.0001849644128113879, "loss": 2.3329, "step": 855 }, { "epoch": 0.3804444444444444, "grad_norm": 1.3141859769821167, "learning_rate": 0.00018494661921708184, "loss": 2.6131, "step": 856 }, { "epoch": 0.3808888888888889, "grad_norm": 1.1940083503723145, "learning_rate": 0.0001849288256227758, "loss": 2.6077, "step": 857 }, { "epoch": 0.38133333333333336, "grad_norm": 1.2545088529586792, "learning_rate": 0.00018491103202846975, "loss": 2.6905, "step": 858 }, { "epoch": 0.38177777777777777, "grad_norm": 1.0562766790390015, "learning_rate": 0.0001848932384341637, "loss": 2.1665, "step": 859 }, { "epoch": 0.38222222222222224, "grad_norm": 1.2652605772018433, "learning_rate": 0.00018487544483985767, "loss": 2.6562, "step": 860 }, { "epoch": 0.38266666666666665, "grad_norm": 1.213104486465454, "learning_rate": 0.00018485765124555162, "loss": 2.3212, "step": 861 }, { "epoch": 0.3831111111111111, "grad_norm": 1.2592909336090088, "learning_rate": 0.00018483985765124555, "loss": 2.2561, "step": 862 }, { "epoch": 0.38355555555555554, "grad_norm": 1.3437938690185547, "learning_rate": 0.0001848220640569395, "loss": 2.7051, "step": 863 }, { "epoch": 0.384, "grad_norm": 1.2356623411178589, "learning_rate": 0.00018480427046263346, "loss": 2.1958, "step": 864 }, { "epoch": 0.3844444444444444, "grad_norm": 1.6262998580932617, "learning_rate": 0.00018478647686832742, "loss": 3.2701, "step": 865 }, { "epoch": 0.3848888888888889, "grad_norm": 1.336805820465088, "learning_rate": 0.00018476868327402138, "loss": 2.8542, "step": 866 }, { "epoch": 0.38533333333333336, "grad_norm": 1.4716001749038696, "learning_rate": 0.00018475088967971533, "loss": 2.5937, "step": 867 }, { "epoch": 0.3857777777777778, "grad_norm": 1.3492522239685059, "learning_rate": 0.00018473309608540926, "loss": 2.7743, "step": 868 }, { "epoch": 0.38622222222222224, "grad_norm": 1.2297523021697998, "learning_rate": 0.0001847153024911032, "loss": 2.1033, "step": 869 }, { "epoch": 0.38666666666666666, "grad_norm": 1.3531607389450073, "learning_rate": 0.00018469750889679715, "loss": 2.3351, "step": 870 }, { "epoch": 0.38711111111111113, "grad_norm": 1.213259220123291, "learning_rate": 0.0001846797153024911, "loss": 2.422, "step": 871 }, { "epoch": 0.38755555555555554, "grad_norm": 1.6566977500915527, "learning_rate": 0.00018466192170818506, "loss": 3.0473, "step": 872 }, { "epoch": 0.388, "grad_norm": 1.389674425125122, "learning_rate": 0.00018464412811387902, "loss": 2.6186, "step": 873 }, { "epoch": 0.3884444444444444, "grad_norm": 1.4744458198547363, "learning_rate": 0.00018462633451957298, "loss": 2.5226, "step": 874 }, { "epoch": 0.3888888888888889, "grad_norm": 1.5133804082870483, "learning_rate": 0.0001846085409252669, "loss": 1.6426, "step": 875 }, { "epoch": 0.3893333333333333, "grad_norm": 1.3070919513702393, "learning_rate": 0.00018459074733096086, "loss": 2.5463, "step": 876 }, { "epoch": 0.3897777777777778, "grad_norm": 1.5536634922027588, "learning_rate": 0.00018457295373665482, "loss": 2.5429, "step": 877 }, { "epoch": 0.39022222222222225, "grad_norm": 1.3954426050186157, "learning_rate": 0.00018455516014234877, "loss": 2.1144, "step": 878 }, { "epoch": 0.39066666666666666, "grad_norm": 1.403937816619873, "learning_rate": 0.00018453736654804273, "loss": 2.4386, "step": 879 }, { "epoch": 0.39111111111111113, "grad_norm": 1.692806601524353, "learning_rate": 0.0001845195729537367, "loss": 2.7535, "step": 880 }, { "epoch": 0.39155555555555555, "grad_norm": 2.0360846519470215, "learning_rate": 0.00018450177935943062, "loss": 2.3359, "step": 881 }, { "epoch": 0.392, "grad_norm": 1.7397620677947998, "learning_rate": 0.00018448398576512455, "loss": 2.8038, "step": 882 }, { "epoch": 0.39244444444444443, "grad_norm": 1.9279683828353882, "learning_rate": 0.0001844661921708185, "loss": 2.6394, "step": 883 }, { "epoch": 0.3928888888888889, "grad_norm": 1.6761490106582642, "learning_rate": 0.00018444839857651246, "loss": 2.6922, "step": 884 }, { "epoch": 0.3933333333333333, "grad_norm": 1.7097992897033691, "learning_rate": 0.00018443060498220642, "loss": 2.8203, "step": 885 }, { "epoch": 0.3937777777777778, "grad_norm": 1.796673059463501, "learning_rate": 0.00018441281138790037, "loss": 3.291, "step": 886 }, { "epoch": 0.3942222222222222, "grad_norm": 1.646627426147461, "learning_rate": 0.00018439501779359433, "loss": 2.6629, "step": 887 }, { "epoch": 0.39466666666666667, "grad_norm": 1.7203751802444458, "learning_rate": 0.00018437722419928826, "loss": 2.3987, "step": 888 }, { "epoch": 0.39511111111111114, "grad_norm": 2.0489413738250732, "learning_rate": 0.00018435943060498221, "loss": 2.5578, "step": 889 }, { "epoch": 0.39555555555555555, "grad_norm": 1.5938488245010376, "learning_rate": 0.00018434163701067617, "loss": 2.1986, "step": 890 }, { "epoch": 0.396, "grad_norm": 1.604232907295227, "learning_rate": 0.00018432384341637013, "loss": 2.0916, "step": 891 }, { "epoch": 0.39644444444444443, "grad_norm": 1.5554643869400024, "learning_rate": 0.00018430604982206408, "loss": 2.5466, "step": 892 }, { "epoch": 0.3968888888888889, "grad_norm": 1.606425166130066, "learning_rate": 0.000184288256227758, "loss": 2.6946, "step": 893 }, { "epoch": 0.3973333333333333, "grad_norm": 1.6642916202545166, "learning_rate": 0.00018427046263345197, "loss": 2.1499, "step": 894 }, { "epoch": 0.3977777777777778, "grad_norm": 1.8029860258102417, "learning_rate": 0.0001842526690391459, "loss": 2.6755, "step": 895 }, { "epoch": 0.3982222222222222, "grad_norm": 2.077056407928467, "learning_rate": 0.00018423487544483986, "loss": 2.7935, "step": 896 }, { "epoch": 0.39866666666666667, "grad_norm": 1.7928773164749146, "learning_rate": 0.0001842170818505338, "loss": 2.8071, "step": 897 }, { "epoch": 0.39911111111111114, "grad_norm": 2.6374940872192383, "learning_rate": 0.00018419928825622777, "loss": 3.2532, "step": 898 }, { "epoch": 0.39955555555555555, "grad_norm": 2.1440799236297607, "learning_rate": 0.00018418149466192173, "loss": 1.447, "step": 899 }, { "epoch": 0.4, "grad_norm": 1.737084984779358, "learning_rate": 0.00018416370106761568, "loss": 2.3376, "step": 900 }, { "epoch": 0.40044444444444444, "grad_norm": 1.0363576412200928, "learning_rate": 0.0001841459074733096, "loss": 2.5708, "step": 901 }, { "epoch": 0.4008888888888889, "grad_norm": 0.8888387084007263, "learning_rate": 0.00018412811387900357, "loss": 2.8373, "step": 902 }, { "epoch": 0.4013333333333333, "grad_norm": 0.8826941847801208, "learning_rate": 0.00018411032028469752, "loss": 2.6704, "step": 903 }, { "epoch": 0.4017777777777778, "grad_norm": 0.9509096741676331, "learning_rate": 0.00018409252669039148, "loss": 2.6441, "step": 904 }, { "epoch": 0.4022222222222222, "grad_norm": 1.036983847618103, "learning_rate": 0.00018407473309608544, "loss": 2.3402, "step": 905 }, { "epoch": 0.4026666666666667, "grad_norm": 0.9840334057807922, "learning_rate": 0.00018405693950177937, "loss": 2.2159, "step": 906 }, { "epoch": 0.4031111111111111, "grad_norm": 1.118037223815918, "learning_rate": 0.0001840391459074733, "loss": 2.4926, "step": 907 }, { "epoch": 0.40355555555555556, "grad_norm": 1.1225322484970093, "learning_rate": 0.00018402135231316725, "loss": 2.6811, "step": 908 }, { "epoch": 0.404, "grad_norm": 1.295507788658142, "learning_rate": 0.0001840035587188612, "loss": 2.8815, "step": 909 }, { "epoch": 0.40444444444444444, "grad_norm": 1.1872442960739136, "learning_rate": 0.00018398576512455517, "loss": 3.0637, "step": 910 }, { "epoch": 0.4048888888888889, "grad_norm": 1.2855168581008911, "learning_rate": 0.00018396797153024912, "loss": 2.4883, "step": 911 }, { "epoch": 0.4053333333333333, "grad_norm": 1.232706904411316, "learning_rate": 0.00018395017793594308, "loss": 2.6065, "step": 912 }, { "epoch": 0.4057777777777778, "grad_norm": 1.326191782951355, "learning_rate": 0.000183932384341637, "loss": 2.0947, "step": 913 }, { "epoch": 0.4062222222222222, "grad_norm": 1.2210899591445923, "learning_rate": 0.00018391459074733096, "loss": 2.8879, "step": 914 }, { "epoch": 0.4066666666666667, "grad_norm": 1.358302354812622, "learning_rate": 0.00018389679715302492, "loss": 2.6797, "step": 915 }, { "epoch": 0.4071111111111111, "grad_norm": 1.2646130323410034, "learning_rate": 0.00018387900355871888, "loss": 2.2242, "step": 916 }, { "epoch": 0.40755555555555556, "grad_norm": 1.2646642923355103, "learning_rate": 0.00018386120996441283, "loss": 2.5151, "step": 917 }, { "epoch": 0.408, "grad_norm": 1.4230983257293701, "learning_rate": 0.0001838434163701068, "loss": 1.5127, "step": 918 }, { "epoch": 0.40844444444444444, "grad_norm": 1.5309816598892212, "learning_rate": 0.00018382562277580072, "loss": 2.9285, "step": 919 }, { "epoch": 0.4088888888888889, "grad_norm": 1.2716691493988037, "learning_rate": 0.00018380782918149465, "loss": 2.5965, "step": 920 }, { "epoch": 0.4093333333333333, "grad_norm": 1.433159589767456, "learning_rate": 0.0001837900355871886, "loss": 2.5312, "step": 921 }, { "epoch": 0.4097777777777778, "grad_norm": 1.3177905082702637, "learning_rate": 0.00018377224199288256, "loss": 2.5805, "step": 922 }, { "epoch": 0.4102222222222222, "grad_norm": 1.6881523132324219, "learning_rate": 0.00018375444839857652, "loss": 2.5188, "step": 923 }, { "epoch": 0.4106666666666667, "grad_norm": 1.5823473930358887, "learning_rate": 0.00018373665480427047, "loss": 2.6616, "step": 924 }, { "epoch": 0.4111111111111111, "grad_norm": 1.2907118797302246, "learning_rate": 0.00018371886120996443, "loss": 2.7342, "step": 925 }, { "epoch": 0.41155555555555556, "grad_norm": 1.569952368736267, "learning_rate": 0.00018370106761565836, "loss": 2.6745, "step": 926 }, { "epoch": 0.412, "grad_norm": 1.2594976425170898, "learning_rate": 0.00018368327402135232, "loss": 2.1675, "step": 927 }, { "epoch": 0.41244444444444445, "grad_norm": 1.449838638305664, "learning_rate": 0.00018366548042704627, "loss": 2.7803, "step": 928 }, { "epoch": 0.4128888888888889, "grad_norm": 1.5406020879745483, "learning_rate": 0.00018364768683274023, "loss": 3.0986, "step": 929 }, { "epoch": 0.41333333333333333, "grad_norm": 1.357981562614441, "learning_rate": 0.0001836298932384342, "loss": 2.2642, "step": 930 }, { "epoch": 0.4137777777777778, "grad_norm": 1.4212137460708618, "learning_rate": 0.00018361209964412814, "loss": 2.4661, "step": 931 }, { "epoch": 0.4142222222222222, "grad_norm": 1.3381963968276978, "learning_rate": 0.00018359430604982207, "loss": 2.1015, "step": 932 }, { "epoch": 0.4146666666666667, "grad_norm": 1.223344326019287, "learning_rate": 0.000183576512455516, "loss": 1.0327, "step": 933 }, { "epoch": 0.4151111111111111, "grad_norm": 1.5084744691848755, "learning_rate": 0.00018355871886120996, "loss": 3.0321, "step": 934 }, { "epoch": 0.41555555555555557, "grad_norm": 1.3545995950698853, "learning_rate": 0.00018354092526690392, "loss": 1.6716, "step": 935 }, { "epoch": 0.416, "grad_norm": 1.775868535041809, "learning_rate": 0.00018352313167259787, "loss": 1.6868, "step": 936 }, { "epoch": 0.41644444444444445, "grad_norm": 1.401804804801941, "learning_rate": 0.00018350533807829183, "loss": 2.2054, "step": 937 }, { "epoch": 0.41688888888888886, "grad_norm": 1.4973540306091309, "learning_rate": 0.00018348754448398578, "loss": 2.396, "step": 938 }, { "epoch": 0.41733333333333333, "grad_norm": 1.5535446405410767, "learning_rate": 0.00018346975088967971, "loss": 2.465, "step": 939 }, { "epoch": 0.4177777777777778, "grad_norm": 1.7133632898330688, "learning_rate": 0.00018345195729537367, "loss": 2.8616, "step": 940 }, { "epoch": 0.4182222222222222, "grad_norm": 1.5524804592132568, "learning_rate": 0.00018343416370106763, "loss": 2.1337, "step": 941 }, { "epoch": 0.4186666666666667, "grad_norm": 1.4653924703598022, "learning_rate": 0.00018341637010676158, "loss": 2.3331, "step": 942 }, { "epoch": 0.4191111111111111, "grad_norm": 1.5715734958648682, "learning_rate": 0.00018339857651245554, "loss": 2.1571, "step": 943 }, { "epoch": 0.41955555555555557, "grad_norm": 1.7893381118774414, "learning_rate": 0.0001833807829181495, "loss": 2.7177, "step": 944 }, { "epoch": 0.42, "grad_norm": 2.0075345039367676, "learning_rate": 0.00018336298932384343, "loss": 2.3181, "step": 945 }, { "epoch": 0.42044444444444445, "grad_norm": 2.4787044525146484, "learning_rate": 0.00018334519572953736, "loss": 2.8261, "step": 946 }, { "epoch": 0.42088888888888887, "grad_norm": 1.778351068496704, "learning_rate": 0.0001833274021352313, "loss": 2.7114, "step": 947 }, { "epoch": 0.42133333333333334, "grad_norm": 2.3666553497314453, "learning_rate": 0.00018330960854092527, "loss": 2.6281, "step": 948 }, { "epoch": 0.42177777777777775, "grad_norm": 2.386976718902588, "learning_rate": 0.00018329181494661922, "loss": 3.5857, "step": 949 }, { "epoch": 0.4222222222222222, "grad_norm": 1.791477918624878, "learning_rate": 0.00018327402135231318, "loss": 2.5117, "step": 950 }, { "epoch": 0.4226666666666667, "grad_norm": 1.2231998443603516, "learning_rate": 0.00018325622775800714, "loss": 2.0187, "step": 951 }, { "epoch": 0.4231111111111111, "grad_norm": 1.0212533473968506, "learning_rate": 0.00018323843416370107, "loss": 2.8211, "step": 952 }, { "epoch": 0.4235555555555556, "grad_norm": 1.1677687168121338, "learning_rate": 0.00018322064056939502, "loss": 2.5803, "step": 953 }, { "epoch": 0.424, "grad_norm": 1.0932122468948364, "learning_rate": 0.00018320284697508898, "loss": 2.5059, "step": 954 }, { "epoch": 0.42444444444444446, "grad_norm": 1.079312801361084, "learning_rate": 0.00018318505338078294, "loss": 2.4724, "step": 955 }, { "epoch": 0.42488888888888887, "grad_norm": 1.324885368347168, "learning_rate": 0.0001831672597864769, "loss": 2.8136, "step": 956 }, { "epoch": 0.42533333333333334, "grad_norm": 1.3072516918182373, "learning_rate": 0.00018314946619217085, "loss": 2.9349, "step": 957 }, { "epoch": 0.42577777777777776, "grad_norm": 1.3189862966537476, "learning_rate": 0.00018313167259786478, "loss": 2.5912, "step": 958 }, { "epoch": 0.4262222222222222, "grad_norm": 1.2304902076721191, "learning_rate": 0.0001831138790035587, "loss": 2.5022, "step": 959 }, { "epoch": 0.4266666666666667, "grad_norm": 1.3272204399108887, "learning_rate": 0.00018309608540925266, "loss": 3.0605, "step": 960 }, { "epoch": 0.4271111111111111, "grad_norm": 1.2381232976913452, "learning_rate": 0.00018307829181494662, "loss": 2.0443, "step": 961 }, { "epoch": 0.4275555555555556, "grad_norm": 1.268486499786377, "learning_rate": 0.00018306049822064058, "loss": 2.2911, "step": 962 }, { "epoch": 0.428, "grad_norm": 1.2793582677841187, "learning_rate": 0.00018304270462633453, "loss": 2.8313, "step": 963 }, { "epoch": 0.42844444444444446, "grad_norm": 1.3932663202285767, "learning_rate": 0.0001830249110320285, "loss": 2.8657, "step": 964 }, { "epoch": 0.4288888888888889, "grad_norm": 1.2110832929611206, "learning_rate": 0.00018300711743772242, "loss": 2.5214, "step": 965 }, { "epoch": 0.42933333333333334, "grad_norm": 1.253836989402771, "learning_rate": 0.00018298932384341638, "loss": 2.5128, "step": 966 }, { "epoch": 0.42977777777777776, "grad_norm": 1.33391273021698, "learning_rate": 0.00018297153024911033, "loss": 2.2437, "step": 967 }, { "epoch": 0.43022222222222223, "grad_norm": 1.2081773281097412, "learning_rate": 0.0001829537366548043, "loss": 2.5682, "step": 968 }, { "epoch": 0.43066666666666664, "grad_norm": 1.3490543365478516, "learning_rate": 0.00018293594306049825, "loss": 2.6874, "step": 969 }, { "epoch": 0.4311111111111111, "grad_norm": 1.4848097562789917, "learning_rate": 0.0001829181494661922, "loss": 2.2042, "step": 970 }, { "epoch": 0.4315555555555556, "grad_norm": 1.2465113401412964, "learning_rate": 0.00018290035587188613, "loss": 2.0922, "step": 971 }, { "epoch": 0.432, "grad_norm": 1.3515832424163818, "learning_rate": 0.00018288256227758006, "loss": 2.5685, "step": 972 }, { "epoch": 0.43244444444444446, "grad_norm": 1.4319607019424438, "learning_rate": 0.00018286476868327402, "loss": 2.0091, "step": 973 }, { "epoch": 0.4328888888888889, "grad_norm": 1.680587887763977, "learning_rate": 0.00018284697508896797, "loss": 2.6249, "step": 974 }, { "epoch": 0.43333333333333335, "grad_norm": 1.3431737422943115, "learning_rate": 0.00018282918149466193, "loss": 2.5173, "step": 975 }, { "epoch": 0.43377777777777776, "grad_norm": 1.2416516542434692, "learning_rate": 0.0001828113879003559, "loss": 2.644, "step": 976 }, { "epoch": 0.43422222222222223, "grad_norm": 1.2653284072875977, "learning_rate": 0.00018279359430604984, "loss": 2.4739, "step": 977 }, { "epoch": 0.43466666666666665, "grad_norm": 1.5131683349609375, "learning_rate": 0.00018277580071174377, "loss": 2.8553, "step": 978 }, { "epoch": 0.4351111111111111, "grad_norm": 1.4946473836898804, "learning_rate": 0.00018275800711743773, "loss": 2.6073, "step": 979 }, { "epoch": 0.43555555555555553, "grad_norm": 1.3179363012313843, "learning_rate": 0.0001827402135231317, "loss": 2.7802, "step": 980 }, { "epoch": 0.436, "grad_norm": 1.5546060800552368, "learning_rate": 0.00018272241992882564, "loss": 2.9142, "step": 981 }, { "epoch": 0.43644444444444447, "grad_norm": 1.3515474796295166, "learning_rate": 0.0001827046263345196, "loss": 2.4532, "step": 982 }, { "epoch": 0.4368888888888889, "grad_norm": 1.3547914028167725, "learning_rate": 0.00018268683274021356, "loss": 2.483, "step": 983 }, { "epoch": 0.43733333333333335, "grad_norm": 1.2733529806137085, "learning_rate": 0.00018266903914590749, "loss": 2.0126, "step": 984 }, { "epoch": 0.43777777777777777, "grad_norm": 1.6882998943328857, "learning_rate": 0.00018265124555160141, "loss": 3.432, "step": 985 }, { "epoch": 0.43822222222222224, "grad_norm": 1.3751314878463745, "learning_rate": 0.00018263345195729537, "loss": 2.4604, "step": 986 }, { "epoch": 0.43866666666666665, "grad_norm": 1.412575364112854, "learning_rate": 0.00018261565836298933, "loss": 2.4426, "step": 987 }, { "epoch": 0.4391111111111111, "grad_norm": 1.7669273614883423, "learning_rate": 0.00018259786476868328, "loss": 2.5918, "step": 988 }, { "epoch": 0.43955555555555553, "grad_norm": 1.605697751045227, "learning_rate": 0.00018258007117437724, "loss": 2.7675, "step": 989 }, { "epoch": 0.44, "grad_norm": 1.567189335823059, "learning_rate": 0.0001825622775800712, "loss": 2.9831, "step": 990 }, { "epoch": 0.44044444444444447, "grad_norm": 1.3639848232269287, "learning_rate": 0.00018254448398576513, "loss": 1.9593, "step": 991 }, { "epoch": 0.4408888888888889, "grad_norm": 1.586616039276123, "learning_rate": 0.00018252669039145908, "loss": 2.6094, "step": 992 }, { "epoch": 0.44133333333333336, "grad_norm": 1.5296803712844849, "learning_rate": 0.00018250889679715304, "loss": 2.219, "step": 993 }, { "epoch": 0.44177777777777777, "grad_norm": 1.6999601125717163, "learning_rate": 0.000182491103202847, "loss": 2.669, "step": 994 }, { "epoch": 0.44222222222222224, "grad_norm": 2.0300962924957275, "learning_rate": 0.00018247330960854095, "loss": 2.6897, "step": 995 }, { "epoch": 0.44266666666666665, "grad_norm": 1.7834362983703613, "learning_rate": 0.00018245551601423488, "loss": 2.3681, "step": 996 }, { "epoch": 0.4431111111111111, "grad_norm": 1.7336286306381226, "learning_rate": 0.0001824377224199288, "loss": 2.5579, "step": 997 }, { "epoch": 0.44355555555555554, "grad_norm": 1.8825653791427612, "learning_rate": 0.00018241992882562277, "loss": 2.8049, "step": 998 }, { "epoch": 0.444, "grad_norm": 1.9167088270187378, "learning_rate": 0.00018240213523131672, "loss": 2.9734, "step": 999 }, { "epoch": 0.4444444444444444, "grad_norm": 2.2173099517822266, "learning_rate": 0.00018238434163701068, "loss": 2.6596, "step": 1000 }, { "epoch": 0.4448888888888889, "grad_norm": 0.9079247117042542, "learning_rate": 0.00018236654804270464, "loss": 2.6596, "step": 1001 }, { "epoch": 0.44533333333333336, "grad_norm": 0.9305357336997986, "learning_rate": 0.0001823487544483986, "loss": 2.6803, "step": 1002 }, { "epoch": 0.4457777777777778, "grad_norm": 1.031278133392334, "learning_rate": 0.00018233096085409252, "loss": 2.5942, "step": 1003 }, { "epoch": 0.44622222222222224, "grad_norm": 1.5068715810775757, "learning_rate": 0.00018231316725978648, "loss": 1.6077, "step": 1004 }, { "epoch": 0.44666666666666666, "grad_norm": 1.17013680934906, "learning_rate": 0.00018229537366548044, "loss": 2.4984, "step": 1005 }, { "epoch": 0.4471111111111111, "grad_norm": 1.2330650091171265, "learning_rate": 0.0001822775800711744, "loss": 2.9625, "step": 1006 }, { "epoch": 0.44755555555555554, "grad_norm": 1.1845786571502686, "learning_rate": 0.00018225978647686835, "loss": 2.8457, "step": 1007 }, { "epoch": 0.448, "grad_norm": 1.144061803817749, "learning_rate": 0.0001822419928825623, "loss": 2.0844, "step": 1008 }, { "epoch": 0.4484444444444444, "grad_norm": 1.383981466293335, "learning_rate": 0.00018222419928825624, "loss": 2.4893, "step": 1009 }, { "epoch": 0.4488888888888889, "grad_norm": 1.125385046005249, "learning_rate": 0.00018220640569395016, "loss": 2.2932, "step": 1010 }, { "epoch": 0.4493333333333333, "grad_norm": 1.2119320631027222, "learning_rate": 0.00018218861209964412, "loss": 2.4534, "step": 1011 }, { "epoch": 0.4497777777777778, "grad_norm": 1.108948826789856, "learning_rate": 0.00018217081850533808, "loss": 2.3211, "step": 1012 }, { "epoch": 0.45022222222222225, "grad_norm": 1.3019822835922241, "learning_rate": 0.00018215302491103203, "loss": 2.3836, "step": 1013 }, { "epoch": 0.45066666666666666, "grad_norm": 1.3383592367172241, "learning_rate": 0.000182135231316726, "loss": 2.7794, "step": 1014 }, { "epoch": 0.45111111111111113, "grad_norm": 1.2700541019439697, "learning_rate": 0.00018211743772241995, "loss": 2.488, "step": 1015 }, { "epoch": 0.45155555555555554, "grad_norm": 1.280701756477356, "learning_rate": 0.00018209964412811388, "loss": 2.5005, "step": 1016 }, { "epoch": 0.452, "grad_norm": 1.1454371213912964, "learning_rate": 0.00018208185053380783, "loss": 2.3253, "step": 1017 }, { "epoch": 0.4524444444444444, "grad_norm": 1.242236614227295, "learning_rate": 0.0001820640569395018, "loss": 2.4167, "step": 1018 }, { "epoch": 0.4528888888888889, "grad_norm": 1.2990704774856567, "learning_rate": 0.00018204626334519575, "loss": 2.1872, "step": 1019 }, { "epoch": 0.4533333333333333, "grad_norm": 1.283494472503662, "learning_rate": 0.0001820284697508897, "loss": 2.3171, "step": 1020 }, { "epoch": 0.4537777777777778, "grad_norm": 2.063596487045288, "learning_rate": 0.00018201067615658366, "loss": 1.681, "step": 1021 }, { "epoch": 0.45422222222222225, "grad_norm": 1.3359391689300537, "learning_rate": 0.0001819928825622776, "loss": 2.0327, "step": 1022 }, { "epoch": 0.45466666666666666, "grad_norm": 1.263917326927185, "learning_rate": 0.00018197508896797152, "loss": 2.239, "step": 1023 }, { "epoch": 0.45511111111111113, "grad_norm": 1.8083239793777466, "learning_rate": 0.00018195729537366547, "loss": 2.7766, "step": 1024 }, { "epoch": 0.45555555555555555, "grad_norm": 1.2244737148284912, "learning_rate": 0.00018193950177935943, "loss": 2.0234, "step": 1025 }, { "epoch": 0.456, "grad_norm": 1.4717793464660645, "learning_rate": 0.0001819217081850534, "loss": 2.655, "step": 1026 }, { "epoch": 0.45644444444444443, "grad_norm": 1.5211389064788818, "learning_rate": 0.00018190391459074734, "loss": 2.335, "step": 1027 }, { "epoch": 0.4568888888888889, "grad_norm": 1.3322489261627197, "learning_rate": 0.0001818861209964413, "loss": 2.4195, "step": 1028 }, { "epoch": 0.4573333333333333, "grad_norm": 1.3629570007324219, "learning_rate": 0.00018186832740213523, "loss": 2.0856, "step": 1029 }, { "epoch": 0.4577777777777778, "grad_norm": 1.7404910326004028, "learning_rate": 0.00018185053380782919, "loss": 2.4039, "step": 1030 }, { "epoch": 0.4582222222222222, "grad_norm": 1.3935850858688354, "learning_rate": 0.00018183274021352314, "loss": 2.2005, "step": 1031 }, { "epoch": 0.45866666666666667, "grad_norm": 1.3948017358779907, "learning_rate": 0.0001818149466192171, "loss": 2.7373, "step": 1032 }, { "epoch": 0.45911111111111114, "grad_norm": 1.4649925231933594, "learning_rate": 0.00018179715302491106, "loss": 2.4669, "step": 1033 }, { "epoch": 0.45955555555555555, "grad_norm": 1.5847502946853638, "learning_rate": 0.000181779359430605, "loss": 1.9865, "step": 1034 }, { "epoch": 0.46, "grad_norm": 3.006155490875244, "learning_rate": 0.00018176156583629894, "loss": 0.4302, "step": 1035 }, { "epoch": 0.46044444444444443, "grad_norm": 1.4123693704605103, "learning_rate": 0.00018174377224199287, "loss": 2.6673, "step": 1036 }, { "epoch": 0.4608888888888889, "grad_norm": 1.4881420135498047, "learning_rate": 0.00018172597864768683, "loss": 2.4905, "step": 1037 }, { "epoch": 0.4613333333333333, "grad_norm": 1.5692585706710815, "learning_rate": 0.00018170818505338078, "loss": 2.5459, "step": 1038 }, { "epoch": 0.4617777777777778, "grad_norm": 1.4341565370559692, "learning_rate": 0.00018169039145907474, "loss": 2.3733, "step": 1039 }, { "epoch": 0.4622222222222222, "grad_norm": 1.3901523351669312, "learning_rate": 0.0001816725978647687, "loss": 2.3915, "step": 1040 }, { "epoch": 0.46266666666666667, "grad_norm": 1.5324454307556152, "learning_rate": 0.00018165480427046265, "loss": 2.3619, "step": 1041 }, { "epoch": 0.4631111111111111, "grad_norm": 1.567613959312439, "learning_rate": 0.00018163701067615658, "loss": 1.7382, "step": 1042 }, { "epoch": 0.46355555555555555, "grad_norm": 1.8476368188858032, "learning_rate": 0.00018161921708185054, "loss": 2.8568, "step": 1043 }, { "epoch": 0.464, "grad_norm": 1.7850401401519775, "learning_rate": 0.0001816014234875445, "loss": 2.773, "step": 1044 }, { "epoch": 0.46444444444444444, "grad_norm": 1.4900022745132446, "learning_rate": 0.00018158362989323845, "loss": 2.2433, "step": 1045 }, { "epoch": 0.4648888888888889, "grad_norm": 1.9751386642456055, "learning_rate": 0.0001815658362989324, "loss": 2.9478, "step": 1046 }, { "epoch": 0.4653333333333333, "grad_norm": 2.110889196395874, "learning_rate": 0.00018154804270462637, "loss": 3.1765, "step": 1047 }, { "epoch": 0.4657777777777778, "grad_norm": 1.8263683319091797, "learning_rate": 0.0001815302491103203, "loss": 2.7491, "step": 1048 }, { "epoch": 0.4662222222222222, "grad_norm": 1.875029444694519, "learning_rate": 0.00018151245551601422, "loss": 2.3277, "step": 1049 }, { "epoch": 0.4666666666666667, "grad_norm": 2.229357957839966, "learning_rate": 0.00018149466192170818, "loss": 2.8278, "step": 1050 }, { "epoch": 0.4671111111111111, "grad_norm": 1.0930174589157104, "learning_rate": 0.00018147686832740214, "loss": 2.5997, "step": 1051 }, { "epoch": 0.46755555555555556, "grad_norm": 1.0300097465515137, "learning_rate": 0.0001814590747330961, "loss": 2.647, "step": 1052 }, { "epoch": 0.468, "grad_norm": 1.4961392879486084, "learning_rate": 0.00018144128113879005, "loss": 1.1492, "step": 1053 }, { "epoch": 0.46844444444444444, "grad_norm": 1.1863456964492798, "learning_rate": 0.000181423487544484, "loss": 2.4534, "step": 1054 }, { "epoch": 0.4688888888888889, "grad_norm": 1.156611442565918, "learning_rate": 0.00018140569395017794, "loss": 2.627, "step": 1055 }, { "epoch": 0.4693333333333333, "grad_norm": 1.1836708784103394, "learning_rate": 0.0001813879003558719, "loss": 2.3464, "step": 1056 }, { "epoch": 0.4697777777777778, "grad_norm": 1.2201026678085327, "learning_rate": 0.00018137010676156585, "loss": 2.8861, "step": 1057 }, { "epoch": 0.4702222222222222, "grad_norm": 2.332244396209717, "learning_rate": 0.0001813523131672598, "loss": 1.634, "step": 1058 }, { "epoch": 0.4706666666666667, "grad_norm": 1.337121605873108, "learning_rate": 0.00018133451957295376, "loss": 1.8932, "step": 1059 }, { "epoch": 0.4711111111111111, "grad_norm": 1.2118984460830688, "learning_rate": 0.00018131672597864772, "loss": 2.2048, "step": 1060 }, { "epoch": 0.47155555555555556, "grad_norm": 1.2091714143753052, "learning_rate": 0.00018129893238434165, "loss": 2.3127, "step": 1061 }, { "epoch": 0.472, "grad_norm": 1.2021245956420898, "learning_rate": 0.00018128113879003558, "loss": 2.6434, "step": 1062 }, { "epoch": 0.47244444444444444, "grad_norm": 1.436784267425537, "learning_rate": 0.00018126334519572953, "loss": 2.3693, "step": 1063 }, { "epoch": 0.4728888888888889, "grad_norm": 2.271524429321289, "learning_rate": 0.0001812455516014235, "loss": 2.2956, "step": 1064 }, { "epoch": 0.47333333333333333, "grad_norm": 1.162920355796814, "learning_rate": 0.00018122775800711745, "loss": 2.1083, "step": 1065 }, { "epoch": 0.4737777777777778, "grad_norm": 1.3496499061584473, "learning_rate": 0.0001812099644128114, "loss": 2.7196, "step": 1066 }, { "epoch": 0.4742222222222222, "grad_norm": 1.6458051204681396, "learning_rate": 0.00018119217081850536, "loss": 2.3444, "step": 1067 }, { "epoch": 0.4746666666666667, "grad_norm": 1.4568052291870117, "learning_rate": 0.0001811743772241993, "loss": 2.3294, "step": 1068 }, { "epoch": 0.4751111111111111, "grad_norm": 1.4898393154144287, "learning_rate": 0.00018115658362989325, "loss": 2.5576, "step": 1069 }, { "epoch": 0.47555555555555556, "grad_norm": 1.3853182792663574, "learning_rate": 0.0001811387900355872, "loss": 2.4463, "step": 1070 }, { "epoch": 0.476, "grad_norm": 1.521707534790039, "learning_rate": 0.00018112099644128116, "loss": 2.7504, "step": 1071 }, { "epoch": 0.47644444444444445, "grad_norm": 1.8744828701019287, "learning_rate": 0.00018110320284697512, "loss": 2.4272, "step": 1072 }, { "epoch": 0.47688888888888886, "grad_norm": 1.4953957796096802, "learning_rate": 0.00018108540925266907, "loss": 2.2033, "step": 1073 }, { "epoch": 0.47733333333333333, "grad_norm": 1.463110327720642, "learning_rate": 0.000181067615658363, "loss": 2.3522, "step": 1074 }, { "epoch": 0.4777777777777778, "grad_norm": 1.3929156064987183, "learning_rate": 0.00018104982206405693, "loss": 2.5893, "step": 1075 }, { "epoch": 0.4782222222222222, "grad_norm": 1.3469513654708862, "learning_rate": 0.0001810320284697509, "loss": 2.1896, "step": 1076 }, { "epoch": 0.4786666666666667, "grad_norm": 1.3536866903305054, "learning_rate": 0.00018101423487544484, "loss": 2.8212, "step": 1077 }, { "epoch": 0.4791111111111111, "grad_norm": 1.3798056840896606, "learning_rate": 0.0001809964412811388, "loss": 2.7518, "step": 1078 }, { "epoch": 0.47955555555555557, "grad_norm": 1.553146243095398, "learning_rate": 0.00018097864768683276, "loss": 2.5848, "step": 1079 }, { "epoch": 0.48, "grad_norm": 1.532638669013977, "learning_rate": 0.0001809608540925267, "loss": 2.3551, "step": 1080 }, { "epoch": 0.48044444444444445, "grad_norm": 1.4585469961166382, "learning_rate": 0.00018094306049822064, "loss": 2.6695, "step": 1081 }, { "epoch": 0.48088888888888887, "grad_norm": 1.6650795936584473, "learning_rate": 0.0001809252669039146, "loss": 2.7979, "step": 1082 }, { "epoch": 0.48133333333333334, "grad_norm": 1.3776137828826904, "learning_rate": 0.00018090747330960856, "loss": 2.1564, "step": 1083 }, { "epoch": 0.4817777777777778, "grad_norm": 1.5997897386550903, "learning_rate": 0.0001808896797153025, "loss": 2.4603, "step": 1084 }, { "epoch": 0.4822222222222222, "grad_norm": 1.5599387884140015, "learning_rate": 0.00018087188612099647, "loss": 1.7888, "step": 1085 }, { "epoch": 0.4826666666666667, "grad_norm": 1.5563712120056152, "learning_rate": 0.0001808540925266904, "loss": 2.438, "step": 1086 }, { "epoch": 0.4831111111111111, "grad_norm": 1.3926085233688354, "learning_rate": 0.00018083629893238433, "loss": 2.5696, "step": 1087 }, { "epoch": 0.48355555555555557, "grad_norm": 1.7727118730545044, "learning_rate": 0.00018081850533807828, "loss": 2.2867, "step": 1088 }, { "epoch": 0.484, "grad_norm": 1.5934937000274658, "learning_rate": 0.00018080071174377224, "loss": 2.6703, "step": 1089 }, { "epoch": 0.48444444444444446, "grad_norm": 1.789147973060608, "learning_rate": 0.0001807829181494662, "loss": 2.5851, "step": 1090 }, { "epoch": 0.48488888888888887, "grad_norm": 1.4976032972335815, "learning_rate": 0.00018076512455516015, "loss": 2.1773, "step": 1091 }, { "epoch": 0.48533333333333334, "grad_norm": 1.5114315748214722, "learning_rate": 0.0001807473309608541, "loss": 2.1908, "step": 1092 }, { "epoch": 0.48577777777777775, "grad_norm": 1.5656942129135132, "learning_rate": 0.00018072953736654804, "loss": 2.5476, "step": 1093 }, { "epoch": 0.4862222222222222, "grad_norm": 1.7908459901809692, "learning_rate": 0.000180711743772242, "loss": 2.6867, "step": 1094 }, { "epoch": 0.4866666666666667, "grad_norm": 2.176135301589966, "learning_rate": 0.00018069395017793595, "loss": 2.0774, "step": 1095 }, { "epoch": 0.4871111111111111, "grad_norm": 1.5971789360046387, "learning_rate": 0.0001806761565836299, "loss": 1.9446, "step": 1096 }, { "epoch": 0.4875555555555556, "grad_norm": 1.709897518157959, "learning_rate": 0.00018065836298932386, "loss": 2.4818, "step": 1097 }, { "epoch": 0.488, "grad_norm": 1.9650827646255493, "learning_rate": 0.00018064056939501782, "loss": 2.5649, "step": 1098 }, { "epoch": 0.48844444444444446, "grad_norm": 1.6556960344314575, "learning_rate": 0.00018062277580071175, "loss": 2.4576, "step": 1099 }, { "epoch": 0.4888888888888889, "grad_norm": 2.5766797065734863, "learning_rate": 0.00018060498220640568, "loss": 2.1069, "step": 1100 }, { "epoch": 0.48933333333333334, "grad_norm": 1.075654149055481, "learning_rate": 0.00018058718861209964, "loss": 2.5237, "step": 1101 }, { "epoch": 0.48977777777777776, "grad_norm": 1.0496094226837158, "learning_rate": 0.0001805693950177936, "loss": 2.4905, "step": 1102 }, { "epoch": 0.4902222222222222, "grad_norm": 0.9558612704277039, "learning_rate": 0.00018055160142348755, "loss": 2.9379, "step": 1103 }, { "epoch": 0.49066666666666664, "grad_norm": 1.1676995754241943, "learning_rate": 0.0001805338078291815, "loss": 2.7487, "step": 1104 }, { "epoch": 0.4911111111111111, "grad_norm": 1.0585354566574097, "learning_rate": 0.00018051601423487546, "loss": 2.6956, "step": 1105 }, { "epoch": 0.4915555555555556, "grad_norm": 1.1087442636489868, "learning_rate": 0.0001804982206405694, "loss": 2.6808, "step": 1106 }, { "epoch": 0.492, "grad_norm": 1.053286075592041, "learning_rate": 0.00018048042704626335, "loss": 2.2225, "step": 1107 }, { "epoch": 0.49244444444444446, "grad_norm": 1.1056700944900513, "learning_rate": 0.0001804626334519573, "loss": 2.5699, "step": 1108 }, { "epoch": 0.4928888888888889, "grad_norm": 1.198739767074585, "learning_rate": 0.00018044483985765126, "loss": 2.552, "step": 1109 }, { "epoch": 0.49333333333333335, "grad_norm": 2.429079294204712, "learning_rate": 0.00018042704626334522, "loss": 1.7893, "step": 1110 }, { "epoch": 0.49377777777777776, "grad_norm": 1.0832264423370361, "learning_rate": 0.00018040925266903917, "loss": 2.4136, "step": 1111 }, { "epoch": 0.49422222222222223, "grad_norm": 1.0978045463562012, "learning_rate": 0.0001803914590747331, "loss": 2.1968, "step": 1112 }, { "epoch": 0.49466666666666664, "grad_norm": 1.118681788444519, "learning_rate": 0.00018037366548042703, "loss": 2.3322, "step": 1113 }, { "epoch": 0.4951111111111111, "grad_norm": 1.1858903169631958, "learning_rate": 0.000180355871886121, "loss": 2.5168, "step": 1114 }, { "epoch": 0.4955555555555556, "grad_norm": 1.3438916206359863, "learning_rate": 0.00018033807829181495, "loss": 2.7303, "step": 1115 }, { "epoch": 0.496, "grad_norm": 1.301822543144226, "learning_rate": 0.0001803202846975089, "loss": 2.824, "step": 1116 }, { "epoch": 0.49644444444444447, "grad_norm": 1.2330950498580933, "learning_rate": 0.00018030249110320286, "loss": 3.0469, "step": 1117 }, { "epoch": 0.4968888888888889, "grad_norm": 1.5200353860855103, "learning_rate": 0.00018028469750889682, "loss": 1.5884, "step": 1118 }, { "epoch": 0.49733333333333335, "grad_norm": 1.2964918613433838, "learning_rate": 0.00018026690391459075, "loss": 2.4635, "step": 1119 }, { "epoch": 0.49777777777777776, "grad_norm": 1.351252555847168, "learning_rate": 0.0001802491103202847, "loss": 2.1818, "step": 1120 }, { "epoch": 0.49822222222222223, "grad_norm": 1.2766691446304321, "learning_rate": 0.00018023131672597866, "loss": 2.387, "step": 1121 }, { "epoch": 0.49866666666666665, "grad_norm": 1.4819822311401367, "learning_rate": 0.00018021352313167261, "loss": 2.0414, "step": 1122 }, { "epoch": 0.4991111111111111, "grad_norm": 1.5996578931808472, "learning_rate": 0.00018019572953736657, "loss": 2.6798, "step": 1123 }, { "epoch": 0.49955555555555553, "grad_norm": 1.4682111740112305, "learning_rate": 0.00018017793594306053, "loss": 2.6812, "step": 1124 }, { "epoch": 0.5, "grad_norm": 1.392949104309082, "learning_rate": 0.00018016014234875446, "loss": 2.4617, "step": 1125 }, { "epoch": 0.5004444444444445, "grad_norm": 1.4642528295516968, "learning_rate": 0.0001801423487544484, "loss": 2.6291, "step": 1126 }, { "epoch": 0.5008888888888889, "grad_norm": 1.2145447731018066, "learning_rate": 0.00018012455516014234, "loss": 2.0178, "step": 1127 }, { "epoch": 0.5013333333333333, "grad_norm": 1.6017488241195679, "learning_rate": 0.0001801067615658363, "loss": 2.5999, "step": 1128 }, { "epoch": 0.5017777777777778, "grad_norm": 1.3489327430725098, "learning_rate": 0.00018008896797153026, "loss": 1.565, "step": 1129 }, { "epoch": 0.5022222222222222, "grad_norm": 1.815772533416748, "learning_rate": 0.0001800711743772242, "loss": 2.4781, "step": 1130 }, { "epoch": 0.5026666666666667, "grad_norm": 1.6084818840026855, "learning_rate": 0.00018005338078291817, "loss": 2.2115, "step": 1131 }, { "epoch": 0.5031111111111111, "grad_norm": 1.483842372894287, "learning_rate": 0.0001800355871886121, "loss": 2.4087, "step": 1132 }, { "epoch": 0.5035555555555555, "grad_norm": 1.555029273033142, "learning_rate": 0.00018001779359430605, "loss": 2.6442, "step": 1133 }, { "epoch": 0.504, "grad_norm": 1.6016467809677124, "learning_rate": 0.00018, "loss": 2.3034, "step": 1134 }, { "epoch": 0.5044444444444445, "grad_norm": 1.4886064529418945, "learning_rate": 0.00017998220640569397, "loss": 2.3137, "step": 1135 }, { "epoch": 0.5048888888888889, "grad_norm": 1.590067744255066, "learning_rate": 0.00017996441281138792, "loss": 2.507, "step": 1136 }, { "epoch": 0.5053333333333333, "grad_norm": 1.2926700115203857, "learning_rate": 0.00017994661921708188, "loss": 1.8661, "step": 1137 }, { "epoch": 0.5057777777777778, "grad_norm": 1.8830050230026245, "learning_rate": 0.0001799288256227758, "loss": 2.7748, "step": 1138 }, { "epoch": 0.5062222222222222, "grad_norm": 1.6669584512710571, "learning_rate": 0.00017991103202846974, "loss": 2.7596, "step": 1139 }, { "epoch": 0.5066666666666667, "grad_norm": 1.7653512954711914, "learning_rate": 0.0001798932384341637, "loss": 3.2697, "step": 1140 }, { "epoch": 0.5071111111111111, "grad_norm": 1.8505072593688965, "learning_rate": 0.00017987544483985765, "loss": 2.9115, "step": 1141 }, { "epoch": 0.5075555555555555, "grad_norm": 1.5989995002746582, "learning_rate": 0.0001798576512455516, "loss": 2.0211, "step": 1142 }, { "epoch": 0.508, "grad_norm": 1.929032802581787, "learning_rate": 0.00017983985765124557, "loss": 2.6159, "step": 1143 }, { "epoch": 0.5084444444444445, "grad_norm": 1.9541597366333008, "learning_rate": 0.00017982206405693952, "loss": 2.6225, "step": 1144 }, { "epoch": 0.5088888888888888, "grad_norm": 2.0774333477020264, "learning_rate": 0.00017980427046263345, "loss": 2.6007, "step": 1145 }, { "epoch": 0.5093333333333333, "grad_norm": 1.6789966821670532, "learning_rate": 0.0001797864768683274, "loss": 2.505, "step": 1146 }, { "epoch": 0.5097777777777778, "grad_norm": 1.837697148323059, "learning_rate": 0.00017976868327402136, "loss": 3.0319, "step": 1147 }, { "epoch": 0.5102222222222222, "grad_norm": 1.7084914445877075, "learning_rate": 0.00017975088967971532, "loss": 2.4174, "step": 1148 }, { "epoch": 0.5106666666666667, "grad_norm": 2.1682441234588623, "learning_rate": 0.00017973309608540928, "loss": 1.1208, "step": 1149 }, { "epoch": 0.5111111111111111, "grad_norm": 1.9390794038772583, "learning_rate": 0.00017971530249110323, "loss": 2.4768, "step": 1150 }, { "epoch": 0.5115555555555555, "grad_norm": 0.9777438044548035, "learning_rate": 0.00017969750889679716, "loss": 3.1091, "step": 1151 }, { "epoch": 0.512, "grad_norm": 0.9038203358650208, "learning_rate": 0.0001796797153024911, "loss": 2.5902, "step": 1152 }, { "epoch": 0.5124444444444445, "grad_norm": 1.2871443033218384, "learning_rate": 0.00017966192170818505, "loss": 1.4081, "step": 1153 }, { "epoch": 0.5128888888888888, "grad_norm": 1.109168529510498, "learning_rate": 0.000179644128113879, "loss": 2.5515, "step": 1154 }, { "epoch": 0.5133333333333333, "grad_norm": 1.2267260551452637, "learning_rate": 0.00017962633451957296, "loss": 2.9597, "step": 1155 }, { "epoch": 0.5137777777777778, "grad_norm": 1.304792881011963, "learning_rate": 0.00017960854092526692, "loss": 2.4844, "step": 1156 }, { "epoch": 0.5142222222222222, "grad_norm": 1.1886632442474365, "learning_rate": 0.00017959074733096088, "loss": 2.5076, "step": 1157 }, { "epoch": 0.5146666666666667, "grad_norm": 1.3380016088485718, "learning_rate": 0.0001795729537366548, "loss": 2.7098, "step": 1158 }, { "epoch": 0.5151111111111111, "grad_norm": 2.1414008140563965, "learning_rate": 0.00017955516014234876, "loss": 1.6224, "step": 1159 }, { "epoch": 0.5155555555555555, "grad_norm": 1.3274937868118286, "learning_rate": 0.00017953736654804272, "loss": 2.9532, "step": 1160 }, { "epoch": 0.516, "grad_norm": 1.297349214553833, "learning_rate": 0.00017951957295373667, "loss": 2.4212, "step": 1161 }, { "epoch": 0.5164444444444445, "grad_norm": 1.2180557250976562, "learning_rate": 0.00017950177935943063, "loss": 2.6664, "step": 1162 }, { "epoch": 0.5168888888888888, "grad_norm": 1.4957740306854248, "learning_rate": 0.0001794839857651246, "loss": 2.5514, "step": 1163 }, { "epoch": 0.5173333333333333, "grad_norm": 1.2369511127471924, "learning_rate": 0.00017946619217081852, "loss": 2.3431, "step": 1164 }, { "epoch": 0.5177777777777778, "grad_norm": 1.2156001329421997, "learning_rate": 0.00017944839857651245, "loss": 2.5481, "step": 1165 }, { "epoch": 0.5182222222222223, "grad_norm": 1.2086858749389648, "learning_rate": 0.0001794306049822064, "loss": 2.0433, "step": 1166 }, { "epoch": 0.5186666666666667, "grad_norm": 1.3301823139190674, "learning_rate": 0.00017941281138790036, "loss": 2.3485, "step": 1167 }, { "epoch": 0.5191111111111111, "grad_norm": 1.3325672149658203, "learning_rate": 0.00017939501779359432, "loss": 2.6565, "step": 1168 }, { "epoch": 0.5195555555555555, "grad_norm": 1.1277254819869995, "learning_rate": 0.00017937722419928827, "loss": 2.1358, "step": 1169 }, { "epoch": 0.52, "grad_norm": 1.3675233125686646, "learning_rate": 0.0001793594306049822, "loss": 2.7481, "step": 1170 }, { "epoch": 0.5204444444444445, "grad_norm": 1.3147132396697998, "learning_rate": 0.00017934163701067616, "loss": 2.4171, "step": 1171 }, { "epoch": 0.5208888888888888, "grad_norm": 1.2295223474502563, "learning_rate": 0.00017932384341637011, "loss": 2.4101, "step": 1172 }, { "epoch": 0.5213333333333333, "grad_norm": 1.762349009513855, "learning_rate": 0.00017930604982206407, "loss": 3.3649, "step": 1173 }, { "epoch": 0.5217777777777778, "grad_norm": 1.4821921586990356, "learning_rate": 0.00017928825622775803, "loss": 2.7119, "step": 1174 }, { "epoch": 0.5222222222222223, "grad_norm": 1.4601001739501953, "learning_rate": 0.00017927046263345198, "loss": 2.1603, "step": 1175 }, { "epoch": 0.5226666666666666, "grad_norm": 1.397454857826233, "learning_rate": 0.0001792526690391459, "loss": 2.8709, "step": 1176 }, { "epoch": 0.5231111111111111, "grad_norm": 1.5645291805267334, "learning_rate": 0.00017923487544483984, "loss": 2.264, "step": 1177 }, { "epoch": 0.5235555555555556, "grad_norm": 1.3776110410690308, "learning_rate": 0.0001792170818505338, "loss": 2.5389, "step": 1178 }, { "epoch": 0.524, "grad_norm": 1.2663601636886597, "learning_rate": 0.00017919928825622776, "loss": 2.2745, "step": 1179 }, { "epoch": 0.5244444444444445, "grad_norm": 1.4239956140518188, "learning_rate": 0.0001791814946619217, "loss": 2.7701, "step": 1180 }, { "epoch": 0.5248888888888888, "grad_norm": 1.3543609380722046, "learning_rate": 0.00017916370106761567, "loss": 2.1195, "step": 1181 }, { "epoch": 0.5253333333333333, "grad_norm": 1.3027421236038208, "learning_rate": 0.00017914590747330963, "loss": 2.1416, "step": 1182 }, { "epoch": 0.5257777777777778, "grad_norm": 1.191349744796753, "learning_rate": 0.00017912811387900355, "loss": 1.8744, "step": 1183 }, { "epoch": 0.5262222222222223, "grad_norm": 2.0700368881225586, "learning_rate": 0.0001791103202846975, "loss": 1.3923, "step": 1184 }, { "epoch": 0.5266666666666666, "grad_norm": 1.5801739692687988, "learning_rate": 0.00017909252669039147, "loss": 2.7669, "step": 1185 }, { "epoch": 0.5271111111111111, "grad_norm": 1.549028992652893, "learning_rate": 0.00017907473309608542, "loss": 2.7438, "step": 1186 }, { "epoch": 0.5275555555555556, "grad_norm": 1.8961384296417236, "learning_rate": 0.00017905693950177938, "loss": 2.4417, "step": 1187 }, { "epoch": 0.528, "grad_norm": 1.741623044013977, "learning_rate": 0.00017903914590747334, "loss": 2.5803, "step": 1188 }, { "epoch": 0.5284444444444445, "grad_norm": 1.3786072731018066, "learning_rate": 0.00017902135231316727, "loss": 2.0496, "step": 1189 }, { "epoch": 0.5288888888888889, "grad_norm": 1.7181576490402222, "learning_rate": 0.0001790035587188612, "loss": 2.5333, "step": 1190 }, { "epoch": 0.5293333333333333, "grad_norm": 1.6818015575408936, "learning_rate": 0.00017898576512455515, "loss": 1.9528, "step": 1191 }, { "epoch": 0.5297777777777778, "grad_norm": 1.7420971393585205, "learning_rate": 0.0001789679715302491, "loss": 1.8997, "step": 1192 }, { "epoch": 0.5302222222222223, "grad_norm": 1.5828181505203247, "learning_rate": 0.00017895017793594307, "loss": 2.5736, "step": 1193 }, { "epoch": 0.5306666666666666, "grad_norm": 1.89609956741333, "learning_rate": 0.00017893238434163702, "loss": 1.9595, "step": 1194 }, { "epoch": 0.5311111111111111, "grad_norm": 1.4787846803665161, "learning_rate": 0.00017891459074733098, "loss": 2.3573, "step": 1195 }, { "epoch": 0.5315555555555556, "grad_norm": 1.931437373161316, "learning_rate": 0.0001788967971530249, "loss": 3.1484, "step": 1196 }, { "epoch": 0.532, "grad_norm": 1.7234727144241333, "learning_rate": 0.00017887900355871886, "loss": 2.5473, "step": 1197 }, { "epoch": 0.5324444444444445, "grad_norm": 2.0949268341064453, "learning_rate": 0.00017886120996441282, "loss": 2.9174, "step": 1198 }, { "epoch": 0.5328888888888889, "grad_norm": 1.990982174873352, "learning_rate": 0.00017884341637010678, "loss": 2.9417, "step": 1199 }, { "epoch": 0.5333333333333333, "grad_norm": 2.504199981689453, "learning_rate": 0.00017882562277580073, "loss": 2.3845, "step": 1200 }, { "epoch": 0.5337777777777778, "grad_norm": 1.2785906791687012, "learning_rate": 0.0001788078291814947, "loss": 1.7756, "step": 1201 }, { "epoch": 0.5342222222222223, "grad_norm": 1.2264808416366577, "learning_rate": 0.00017879003558718862, "loss": 2.578, "step": 1202 }, { "epoch": 0.5346666666666666, "grad_norm": 1.7778968811035156, "learning_rate": 0.00017877224199288255, "loss": 0.1643, "step": 1203 }, { "epoch": 0.5351111111111111, "grad_norm": 0.8537470102310181, "learning_rate": 0.0001787544483985765, "loss": 1.3101, "step": 1204 }, { "epoch": 0.5355555555555556, "grad_norm": 1.3113861083984375, "learning_rate": 0.00017873665480427046, "loss": 3.1657, "step": 1205 }, { "epoch": 0.536, "grad_norm": 1.2095390558242798, "learning_rate": 0.00017871886120996442, "loss": 2.4251, "step": 1206 }, { "epoch": 0.5364444444444444, "grad_norm": 1.25002121925354, "learning_rate": 0.00017870106761565837, "loss": 2.5725, "step": 1207 }, { "epoch": 0.5368888888888889, "grad_norm": 1.3035950660705566, "learning_rate": 0.00017868327402135233, "loss": 2.8178, "step": 1208 }, { "epoch": 0.5373333333333333, "grad_norm": 1.295156717300415, "learning_rate": 0.00017866548042704626, "loss": 2.3121, "step": 1209 }, { "epoch": 0.5377777777777778, "grad_norm": 1.2773317098617554, "learning_rate": 0.00017864768683274022, "loss": 2.637, "step": 1210 }, { "epoch": 0.5382222222222223, "grad_norm": 1.2895100116729736, "learning_rate": 0.00017862989323843417, "loss": 2.5377, "step": 1211 }, { "epoch": 0.5386666666666666, "grad_norm": 1.4635089635849, "learning_rate": 0.00017861209964412813, "loss": 2.134, "step": 1212 }, { "epoch": 0.5391111111111111, "grad_norm": 1.3764005899429321, "learning_rate": 0.0001785943060498221, "loss": 2.4902, "step": 1213 }, { "epoch": 0.5395555555555556, "grad_norm": 1.3150113821029663, "learning_rate": 0.00017857651245551604, "loss": 2.8578, "step": 1214 }, { "epoch": 0.54, "grad_norm": 2.5147175788879395, "learning_rate": 0.00017855871886120997, "loss": 1.2944, "step": 1215 }, { "epoch": 0.5404444444444444, "grad_norm": 1.1644126176834106, "learning_rate": 0.0001785409252669039, "loss": 2.5019, "step": 1216 }, { "epoch": 0.5408888888888889, "grad_norm": 1.375962734222412, "learning_rate": 0.00017852313167259786, "loss": 2.5303, "step": 1217 }, { "epoch": 0.5413333333333333, "grad_norm": 1.2268140316009521, "learning_rate": 0.00017850533807829182, "loss": 2.1568, "step": 1218 }, { "epoch": 0.5417777777777778, "grad_norm": 1.3434302806854248, "learning_rate": 0.00017848754448398577, "loss": 2.1272, "step": 1219 }, { "epoch": 0.5422222222222223, "grad_norm": 1.3689292669296265, "learning_rate": 0.00017846975088967973, "loss": 2.4022, "step": 1220 }, { "epoch": 0.5426666666666666, "grad_norm": 1.6363227367401123, "learning_rate": 0.00017845195729537368, "loss": 2.8829, "step": 1221 }, { "epoch": 0.5431111111111111, "grad_norm": 1.4127588272094727, "learning_rate": 0.00017843416370106761, "loss": 2.699, "step": 1222 }, { "epoch": 0.5435555555555556, "grad_norm": 1.462015986442566, "learning_rate": 0.00017841637010676157, "loss": 2.8654, "step": 1223 }, { "epoch": 0.544, "grad_norm": 1.2841753959655762, "learning_rate": 0.00017839857651245553, "loss": 2.2034, "step": 1224 }, { "epoch": 0.5444444444444444, "grad_norm": 1.377759575843811, "learning_rate": 0.00017838078291814948, "loss": 2.5642, "step": 1225 }, { "epoch": 0.5448888888888889, "grad_norm": 1.3649755716323853, "learning_rate": 0.00017836298932384344, "loss": 2.7037, "step": 1226 }, { "epoch": 0.5453333333333333, "grad_norm": 1.399375319480896, "learning_rate": 0.0001783451957295374, "loss": 2.766, "step": 1227 }, { "epoch": 0.5457777777777778, "grad_norm": 1.3284432888031006, "learning_rate": 0.00017832740213523133, "loss": 2.6953, "step": 1228 }, { "epoch": 0.5462222222222223, "grad_norm": 1.5032292604446411, "learning_rate": 0.00017830960854092526, "loss": 2.4118, "step": 1229 }, { "epoch": 0.5466666666666666, "grad_norm": 1.4157973527908325, "learning_rate": 0.0001782918149466192, "loss": 2.6983, "step": 1230 }, { "epoch": 0.5471111111111111, "grad_norm": 1.2969857454299927, "learning_rate": 0.00017827402135231317, "loss": 2.1972, "step": 1231 }, { "epoch": 0.5475555555555556, "grad_norm": 1.4679317474365234, "learning_rate": 0.00017825622775800712, "loss": 2.6372, "step": 1232 }, { "epoch": 0.548, "grad_norm": 1.451851487159729, "learning_rate": 0.00017823843416370108, "loss": 2.5865, "step": 1233 }, { "epoch": 0.5484444444444444, "grad_norm": 1.4248473644256592, "learning_rate": 0.00017822064056939504, "loss": 2.3113, "step": 1234 }, { "epoch": 0.5488888888888889, "grad_norm": 1.5742985010147095, "learning_rate": 0.00017820284697508897, "loss": 2.6367, "step": 1235 }, { "epoch": 0.5493333333333333, "grad_norm": 1.569652795791626, "learning_rate": 0.00017818505338078292, "loss": 1.9242, "step": 1236 }, { "epoch": 0.5497777777777778, "grad_norm": 1.8335659503936768, "learning_rate": 0.00017816725978647688, "loss": 2.2898, "step": 1237 }, { "epoch": 0.5502222222222222, "grad_norm": 1.419884204864502, "learning_rate": 0.00017814946619217084, "loss": 2.3061, "step": 1238 }, { "epoch": 0.5506666666666666, "grad_norm": 1.5601950883865356, "learning_rate": 0.0001781316725978648, "loss": 2.5379, "step": 1239 }, { "epoch": 0.5511111111111111, "grad_norm": 1.4974377155303955, "learning_rate": 0.00017811387900355875, "loss": 2.5647, "step": 1240 }, { "epoch": 0.5515555555555556, "grad_norm": 1.5708105564117432, "learning_rate": 0.00017809608540925268, "loss": 2.8586, "step": 1241 }, { "epoch": 0.552, "grad_norm": 1.7998818159103394, "learning_rate": 0.0001780782918149466, "loss": 3.0485, "step": 1242 }, { "epoch": 0.5524444444444444, "grad_norm": 1.7063149213790894, "learning_rate": 0.00017806049822064056, "loss": 2.6647, "step": 1243 }, { "epoch": 0.5528888888888889, "grad_norm": 1.4614622592926025, "learning_rate": 0.00017804270462633452, "loss": 2.2354, "step": 1244 }, { "epoch": 0.5533333333333333, "grad_norm": 1.6693414449691772, "learning_rate": 0.00017802491103202848, "loss": 2.1268, "step": 1245 }, { "epoch": 0.5537777777777778, "grad_norm": 1.6453592777252197, "learning_rate": 0.00017800711743772243, "loss": 2.6987, "step": 1246 }, { "epoch": 0.5542222222222222, "grad_norm": 1.7275609970092773, "learning_rate": 0.0001779893238434164, "loss": 2.4686, "step": 1247 }, { "epoch": 0.5546666666666666, "grad_norm": 1.6857225894927979, "learning_rate": 0.00017797153024911032, "loss": 1.6319, "step": 1248 }, { "epoch": 0.5551111111111111, "grad_norm": 1.822630524635315, "learning_rate": 0.00017795373665480428, "loss": 2.215, "step": 1249 }, { "epoch": 0.5555555555555556, "grad_norm": 1.8363982439041138, "learning_rate": 0.00017793594306049823, "loss": 1.699, "step": 1250 }, { "epoch": 0.556, "grad_norm": 1.0663748979568481, "learning_rate": 0.0001779181494661922, "loss": 2.6193, "step": 1251 }, { "epoch": 0.5564444444444444, "grad_norm": 1.552035927772522, "learning_rate": 0.00017790035587188615, "loss": 1.0698, "step": 1252 }, { "epoch": 0.5568888888888889, "grad_norm": 1.1600538492202759, "learning_rate": 0.0001778825622775801, "loss": 2.5013, "step": 1253 }, { "epoch": 0.5573333333333333, "grad_norm": 1.0800617933273315, "learning_rate": 0.00017786476868327403, "loss": 2.5511, "step": 1254 }, { "epoch": 0.5577777777777778, "grad_norm": 1.054673433303833, "learning_rate": 0.00017784697508896796, "loss": 2.3978, "step": 1255 }, { "epoch": 0.5582222222222222, "grad_norm": 0.9816464185714722, "learning_rate": 0.00017782918149466192, "loss": 2.0309, "step": 1256 }, { "epoch": 0.5586666666666666, "grad_norm": 1.2822006940841675, "learning_rate": 0.00017781138790035587, "loss": 2.4745, "step": 1257 }, { "epoch": 0.5591111111111111, "grad_norm": 1.2752128839492798, "learning_rate": 0.00017779359430604983, "loss": 2.6353, "step": 1258 }, { "epoch": 0.5595555555555556, "grad_norm": 1.1391791105270386, "learning_rate": 0.0001777758007117438, "loss": 2.6778, "step": 1259 }, { "epoch": 0.56, "grad_norm": 1.1968642473220825, "learning_rate": 0.00017775800711743772, "loss": 2.9373, "step": 1260 }, { "epoch": 0.5604444444444444, "grad_norm": 1.2563437223434448, "learning_rate": 0.00017774021352313167, "loss": 2.3472, "step": 1261 }, { "epoch": 0.5608888888888889, "grad_norm": 1.2173583507537842, "learning_rate": 0.00017772241992882563, "loss": 1.8898, "step": 1262 }, { "epoch": 0.5613333333333334, "grad_norm": 1.0731583833694458, "learning_rate": 0.00017770462633451959, "loss": 2.4451, "step": 1263 }, { "epoch": 0.5617777777777778, "grad_norm": 1.0820194482803345, "learning_rate": 0.00017768683274021354, "loss": 2.7584, "step": 1264 }, { "epoch": 0.5622222222222222, "grad_norm": 1.2503118515014648, "learning_rate": 0.0001776690391459075, "loss": 2.5674, "step": 1265 }, { "epoch": 0.5626666666666666, "grad_norm": 1.265372633934021, "learning_rate": 0.00017765124555160143, "loss": 2.3988, "step": 1266 }, { "epoch": 0.5631111111111111, "grad_norm": 1.1392314434051514, "learning_rate": 0.00017763345195729536, "loss": 2.5185, "step": 1267 }, { "epoch": 0.5635555555555556, "grad_norm": 1.2027145624160767, "learning_rate": 0.00017761565836298931, "loss": 2.5246, "step": 1268 }, { "epoch": 0.564, "grad_norm": 2.07536244392395, "learning_rate": 0.00017759786476868327, "loss": 2.7272, "step": 1269 }, { "epoch": 0.5644444444444444, "grad_norm": 1.5870450735092163, "learning_rate": 0.00017758007117437723, "loss": 3.0011, "step": 1270 }, { "epoch": 0.5648888888888889, "grad_norm": 1.439990520477295, "learning_rate": 0.00017756227758007118, "loss": 2.0491, "step": 1271 }, { "epoch": 0.5653333333333334, "grad_norm": 2.632349967956543, "learning_rate": 0.00017754448398576514, "loss": 2.4149, "step": 1272 }, { "epoch": 0.5657777777777778, "grad_norm": 1.4456804990768433, "learning_rate": 0.00017752669039145907, "loss": 2.7718, "step": 1273 }, { "epoch": 0.5662222222222222, "grad_norm": 1.3915135860443115, "learning_rate": 0.00017750889679715303, "loss": 3.1489, "step": 1274 }, { "epoch": 0.5666666666666667, "grad_norm": 1.4057570695877075, "learning_rate": 0.00017749110320284698, "loss": 2.7257, "step": 1275 }, { "epoch": 0.5671111111111111, "grad_norm": 1.498278260231018, "learning_rate": 0.00017747330960854094, "loss": 2.1074, "step": 1276 }, { "epoch": 0.5675555555555556, "grad_norm": 1.3324486017227173, "learning_rate": 0.0001774555160142349, "loss": 2.5449, "step": 1277 }, { "epoch": 0.568, "grad_norm": 1.2378911972045898, "learning_rate": 0.00017743772241992885, "loss": 1.8185, "step": 1278 }, { "epoch": 0.5684444444444444, "grad_norm": 1.7115696668624878, "learning_rate": 0.00017741992882562278, "loss": 3.1832, "step": 1279 }, { "epoch": 0.5688888888888889, "grad_norm": 1.381099820137024, "learning_rate": 0.0001774021352313167, "loss": 2.3531, "step": 1280 }, { "epoch": 0.5693333333333334, "grad_norm": 1.3072692155838013, "learning_rate": 0.00017738434163701067, "loss": 2.4594, "step": 1281 }, { "epoch": 0.5697777777777778, "grad_norm": 1.5001025199890137, "learning_rate": 0.00017736654804270462, "loss": 2.822, "step": 1282 }, { "epoch": 0.5702222222222222, "grad_norm": 1.440004587173462, "learning_rate": 0.00017734875444839858, "loss": 2.6287, "step": 1283 }, { "epoch": 0.5706666666666667, "grad_norm": 1.5406244993209839, "learning_rate": 0.00017733096085409254, "loss": 2.4, "step": 1284 }, { "epoch": 0.5711111111111111, "grad_norm": 1.3207265138626099, "learning_rate": 0.0001773131672597865, "loss": 2.2423, "step": 1285 }, { "epoch": 0.5715555555555556, "grad_norm": 1.3449149131774902, "learning_rate": 0.00017729537366548042, "loss": 2.1636, "step": 1286 }, { "epoch": 0.572, "grad_norm": 1.5409855842590332, "learning_rate": 0.00017727758007117438, "loss": 2.5376, "step": 1287 }, { "epoch": 0.5724444444444444, "grad_norm": 1.6577012538909912, "learning_rate": 0.00017725978647686834, "loss": 2.1492, "step": 1288 }, { "epoch": 0.5728888888888889, "grad_norm": 1.4876697063446045, "learning_rate": 0.0001772419928825623, "loss": 2.4557, "step": 1289 }, { "epoch": 0.5733333333333334, "grad_norm": 1.600537657737732, "learning_rate": 0.00017722419928825625, "loss": 2.3974, "step": 1290 }, { "epoch": 0.5737777777777778, "grad_norm": 1.856227159500122, "learning_rate": 0.0001772064056939502, "loss": 2.8138, "step": 1291 }, { "epoch": 0.5742222222222222, "grad_norm": 1.80988609790802, "learning_rate": 0.00017718861209964414, "loss": 1.4204, "step": 1292 }, { "epoch": 0.5746666666666667, "grad_norm": 1.5674011707305908, "learning_rate": 0.00017717081850533806, "loss": 2.2016, "step": 1293 }, { "epoch": 0.5751111111111111, "grad_norm": 1.5148704051971436, "learning_rate": 0.00017715302491103202, "loss": 2.607, "step": 1294 }, { "epoch": 0.5755555555555556, "grad_norm": 1.924994945526123, "learning_rate": 0.00017713523131672598, "loss": 2.69, "step": 1295 }, { "epoch": 0.576, "grad_norm": 1.8337801694869995, "learning_rate": 0.00017711743772241993, "loss": 2.1814, "step": 1296 }, { "epoch": 0.5764444444444444, "grad_norm": 1.7834872007369995, "learning_rate": 0.0001770996441281139, "loss": 2.7517, "step": 1297 }, { "epoch": 0.5768888888888889, "grad_norm": 1.7494984865188599, "learning_rate": 0.00017708185053380785, "loss": 2.2146, "step": 1298 }, { "epoch": 0.5773333333333334, "grad_norm": 1.8861663341522217, "learning_rate": 0.00017706405693950178, "loss": 2.5008, "step": 1299 }, { "epoch": 0.5777777777777777, "grad_norm": 3.3163883686065674, "learning_rate": 0.00017704626334519573, "loss": 3.1244, "step": 1300 }, { "epoch": 0.5782222222222222, "grad_norm": 1.1315196752548218, "learning_rate": 0.0001770284697508897, "loss": 2.7789, "step": 1301 }, { "epoch": 0.5786666666666667, "grad_norm": 1.067335844039917, "learning_rate": 0.00017701067615658365, "loss": 3.2978, "step": 1302 }, { "epoch": 0.5791111111111111, "grad_norm": 1.1568325757980347, "learning_rate": 0.0001769928825622776, "loss": 2.8684, "step": 1303 }, { "epoch": 0.5795555555555556, "grad_norm": 1.2416324615478516, "learning_rate": 0.00017697508896797156, "loss": 3.2694, "step": 1304 }, { "epoch": 0.58, "grad_norm": 1.24941086769104, "learning_rate": 0.0001769572953736655, "loss": 2.8852, "step": 1305 }, { "epoch": 0.5804444444444444, "grad_norm": 1.0672186613082886, "learning_rate": 0.00017693950177935942, "loss": 2.4795, "step": 1306 }, { "epoch": 0.5808888888888889, "grad_norm": 1.0531431436538696, "learning_rate": 0.00017692170818505337, "loss": 2.5932, "step": 1307 }, { "epoch": 0.5813333333333334, "grad_norm": 1.2098814249038696, "learning_rate": 0.00017690391459074733, "loss": 2.3703, "step": 1308 }, { "epoch": 0.5817777777777777, "grad_norm": 1.1954690217971802, "learning_rate": 0.0001768861209964413, "loss": 2.5722, "step": 1309 }, { "epoch": 0.5822222222222222, "grad_norm": 1.2348884344100952, "learning_rate": 0.00017686832740213524, "loss": 2.2636, "step": 1310 }, { "epoch": 0.5826666666666667, "grad_norm": 1.145476222038269, "learning_rate": 0.0001768505338078292, "loss": 3.0043, "step": 1311 }, { "epoch": 0.5831111111111111, "grad_norm": 1.1092824935913086, "learning_rate": 0.00017683274021352313, "loss": 2.7379, "step": 1312 }, { "epoch": 0.5835555555555556, "grad_norm": 1.696060061454773, "learning_rate": 0.00017681494661921709, "loss": 1.8912, "step": 1313 }, { "epoch": 0.584, "grad_norm": 1.3610656261444092, "learning_rate": 0.00017679715302491104, "loss": 2.5365, "step": 1314 }, { "epoch": 0.5844444444444444, "grad_norm": 1.2558561563491821, "learning_rate": 0.000176779359430605, "loss": 2.5177, "step": 1315 }, { "epoch": 0.5848888888888889, "grad_norm": 1.0652177333831787, "learning_rate": 0.00017676156583629896, "loss": 2.0853, "step": 1316 }, { "epoch": 0.5853333333333334, "grad_norm": 1.3081934452056885, "learning_rate": 0.0001767437722419929, "loss": 2.6939, "step": 1317 }, { "epoch": 0.5857777777777777, "grad_norm": 1.3675099611282349, "learning_rate": 0.00017672597864768684, "loss": 2.4198, "step": 1318 }, { "epoch": 0.5862222222222222, "grad_norm": 1.2688225507736206, "learning_rate": 0.00017670818505338077, "loss": 2.472, "step": 1319 }, { "epoch": 0.5866666666666667, "grad_norm": 1.7709518671035767, "learning_rate": 0.00017669039145907473, "loss": 1.2899, "step": 1320 }, { "epoch": 0.5871111111111111, "grad_norm": 1.2735425233840942, "learning_rate": 0.00017667259786476868, "loss": 2.3649, "step": 1321 }, { "epoch": 0.5875555555555556, "grad_norm": 1.4274276494979858, "learning_rate": 0.00017665480427046264, "loss": 3.0392, "step": 1322 }, { "epoch": 0.588, "grad_norm": 1.1845803260803223, "learning_rate": 0.0001766370106761566, "loss": 2.2422, "step": 1323 }, { "epoch": 0.5884444444444444, "grad_norm": 1.505797028541565, "learning_rate": 0.00017661921708185055, "loss": 2.6997, "step": 1324 }, { "epoch": 0.5888888888888889, "grad_norm": 1.500378131866455, "learning_rate": 0.00017660142348754448, "loss": 2.4131, "step": 1325 }, { "epoch": 0.5893333333333334, "grad_norm": 1.666031837463379, "learning_rate": 0.00017658362989323844, "loss": 2.4642, "step": 1326 }, { "epoch": 0.5897777777777777, "grad_norm": 1.4224402904510498, "learning_rate": 0.0001765658362989324, "loss": 2.6931, "step": 1327 }, { "epoch": 0.5902222222222222, "grad_norm": 1.5196523666381836, "learning_rate": 0.00017654804270462635, "loss": 2.846, "step": 1328 }, { "epoch": 0.5906666666666667, "grad_norm": 1.455924391746521, "learning_rate": 0.0001765302491103203, "loss": 2.6315, "step": 1329 }, { "epoch": 0.5911111111111111, "grad_norm": 2.582533597946167, "learning_rate": 0.00017651245551601427, "loss": 1.3039, "step": 1330 }, { "epoch": 0.5915555555555555, "grad_norm": 1.7225983142852783, "learning_rate": 0.0001764946619217082, "loss": 2.5404, "step": 1331 }, { "epoch": 0.592, "grad_norm": 1.50846529006958, "learning_rate": 0.00017647686832740212, "loss": 2.2202, "step": 1332 }, { "epoch": 0.5924444444444444, "grad_norm": 1.578640103340149, "learning_rate": 0.00017645907473309608, "loss": 2.3533, "step": 1333 }, { "epoch": 0.5928888888888889, "grad_norm": 1.3282861709594727, "learning_rate": 0.00017644128113879004, "loss": 2.2071, "step": 1334 }, { "epoch": 0.5933333333333334, "grad_norm": 1.6370536088943481, "learning_rate": 0.000176423487544484, "loss": 2.9182, "step": 1335 }, { "epoch": 0.5937777777777777, "grad_norm": 1.1675159931182861, "learning_rate": 0.00017640569395017795, "loss": 1.177, "step": 1336 }, { "epoch": 0.5942222222222222, "grad_norm": 1.2795166969299316, "learning_rate": 0.0001763879003558719, "loss": 1.9403, "step": 1337 }, { "epoch": 0.5946666666666667, "grad_norm": 1.825806975364685, "learning_rate": 0.00017637010676156584, "loss": 2.655, "step": 1338 }, { "epoch": 0.5951111111111111, "grad_norm": 1.5679066181182861, "learning_rate": 0.0001763523131672598, "loss": 2.3021, "step": 1339 }, { "epoch": 0.5955555555555555, "grad_norm": 1.534218430519104, "learning_rate": 0.00017633451957295375, "loss": 2.5934, "step": 1340 }, { "epoch": 0.596, "grad_norm": 1.8854663372039795, "learning_rate": 0.0001763167259786477, "loss": 2.6493, "step": 1341 }, { "epoch": 0.5964444444444444, "grad_norm": 1.5538815259933472, "learning_rate": 0.00017629893238434166, "loss": 2.6666, "step": 1342 }, { "epoch": 0.5968888888888889, "grad_norm": 1.702937364578247, "learning_rate": 0.00017628113879003562, "loss": 2.6987, "step": 1343 }, { "epoch": 0.5973333333333334, "grad_norm": 1.9548336267471313, "learning_rate": 0.00017626334519572955, "loss": 2.4912, "step": 1344 }, { "epoch": 0.5977777777777777, "grad_norm": 1.5537859201431274, "learning_rate": 0.00017624555160142348, "loss": 2.2085, "step": 1345 }, { "epoch": 0.5982222222222222, "grad_norm": 1.6140497922897339, "learning_rate": 0.00017622775800711743, "loss": 2.6939, "step": 1346 }, { "epoch": 0.5986666666666667, "grad_norm": 2.120786666870117, "learning_rate": 0.0001762099644128114, "loss": 2.8833, "step": 1347 }, { "epoch": 0.5991111111111111, "grad_norm": 1.8668750524520874, "learning_rate": 0.00017619217081850535, "loss": 2.412, "step": 1348 }, { "epoch": 0.5995555555555555, "grad_norm": 1.6141252517700195, "learning_rate": 0.0001761743772241993, "loss": 2.3202, "step": 1349 }, { "epoch": 0.6, "grad_norm": 2.0876715183258057, "learning_rate": 0.00017615658362989323, "loss": 1.9042, "step": 1350 }, { "epoch": 0.6004444444444444, "grad_norm": 0.9330457448959351, "learning_rate": 0.0001761387900355872, "loss": 2.1474, "step": 1351 }, { "epoch": 0.6008888888888889, "grad_norm": 0.9378175139427185, "learning_rate": 0.00017612099644128115, "loss": 2.6252, "step": 1352 }, { "epoch": 0.6013333333333334, "grad_norm": 1.2432550191879272, "learning_rate": 0.0001761032028469751, "loss": 3.2561, "step": 1353 }, { "epoch": 0.6017777777777777, "grad_norm": 1.2766064405441284, "learning_rate": 0.00017608540925266906, "loss": 2.0631, "step": 1354 }, { "epoch": 0.6022222222222222, "grad_norm": 1.170651912689209, "learning_rate": 0.00017606761565836301, "loss": 3.0557, "step": 1355 }, { "epoch": 0.6026666666666667, "grad_norm": 1.724684476852417, "learning_rate": 0.00017604982206405694, "loss": 1.6797, "step": 1356 }, { "epoch": 0.6031111111111112, "grad_norm": 0.9961637258529663, "learning_rate": 0.00017603202846975087, "loss": 2.8432, "step": 1357 }, { "epoch": 0.6035555555555555, "grad_norm": 1.1883872747421265, "learning_rate": 0.00017601423487544483, "loss": 3.071, "step": 1358 }, { "epoch": 0.604, "grad_norm": 1.0519766807556152, "learning_rate": 0.0001759964412811388, "loss": 2.1003, "step": 1359 }, { "epoch": 0.6044444444444445, "grad_norm": 1.0907334089279175, "learning_rate": 0.00017597864768683274, "loss": 2.6082, "step": 1360 }, { "epoch": 0.6048888888888889, "grad_norm": 1.1930400133132935, "learning_rate": 0.0001759608540925267, "loss": 2.4901, "step": 1361 }, { "epoch": 0.6053333333333333, "grad_norm": 1.688298225402832, "learning_rate": 0.00017594306049822066, "loss": 1.7889, "step": 1362 }, { "epoch": 0.6057777777777777, "grad_norm": 1.192671537399292, "learning_rate": 0.00017592526690391459, "loss": 1.313, "step": 1363 }, { "epoch": 0.6062222222222222, "grad_norm": 1.3298887014389038, "learning_rate": 0.00017590747330960854, "loss": 3.2103, "step": 1364 }, { "epoch": 0.6066666666666667, "grad_norm": 1.1152617931365967, "learning_rate": 0.0001758896797153025, "loss": 2.4196, "step": 1365 }, { "epoch": 0.6071111111111112, "grad_norm": 1.2884933948516846, "learning_rate": 0.00017587188612099646, "loss": 2.6931, "step": 1366 }, { "epoch": 0.6075555555555555, "grad_norm": 1.1782528162002563, "learning_rate": 0.0001758540925266904, "loss": 2.0241, "step": 1367 }, { "epoch": 0.608, "grad_norm": 1.228033423423767, "learning_rate": 0.00017583629893238437, "loss": 2.1323, "step": 1368 }, { "epoch": 0.6084444444444445, "grad_norm": 1.3437577486038208, "learning_rate": 0.0001758185053380783, "loss": 1.3953, "step": 1369 }, { "epoch": 0.6088888888888889, "grad_norm": 1.3058631420135498, "learning_rate": 0.00017580071174377223, "loss": 2.6964, "step": 1370 }, { "epoch": 0.6093333333333333, "grad_norm": 1.219984531402588, "learning_rate": 0.00017578291814946618, "loss": 2.2271, "step": 1371 }, { "epoch": 0.6097777777777778, "grad_norm": 1.363558292388916, "learning_rate": 0.00017576512455516014, "loss": 2.5532, "step": 1372 }, { "epoch": 0.6102222222222222, "grad_norm": 1.8681366443634033, "learning_rate": 0.0001757473309608541, "loss": 1.3838, "step": 1373 }, { "epoch": 0.6106666666666667, "grad_norm": 1.3558433055877686, "learning_rate": 0.00017572953736654805, "loss": 2.6416, "step": 1374 }, { "epoch": 0.6111111111111112, "grad_norm": 1.3465511798858643, "learning_rate": 0.000175711743772242, "loss": 2.3744, "step": 1375 }, { "epoch": 0.6115555555555555, "grad_norm": 1.6007107496261597, "learning_rate": 0.00017569395017793594, "loss": 2.3307, "step": 1376 }, { "epoch": 0.612, "grad_norm": 1.602541208267212, "learning_rate": 0.0001756761565836299, "loss": 2.5173, "step": 1377 }, { "epoch": 0.6124444444444445, "grad_norm": 1.6473790407180786, "learning_rate": 0.00017565836298932385, "loss": 2.6707, "step": 1378 }, { "epoch": 0.6128888888888889, "grad_norm": 1.376232385635376, "learning_rate": 0.0001756405693950178, "loss": 1.9534, "step": 1379 }, { "epoch": 0.6133333333333333, "grad_norm": 1.420652151107788, "learning_rate": 0.00017562277580071176, "loss": 2.5401, "step": 1380 }, { "epoch": 0.6137777777777778, "grad_norm": 1.1767398118972778, "learning_rate": 0.00017560498220640572, "loss": 1.1765, "step": 1381 }, { "epoch": 0.6142222222222222, "grad_norm": 1.3465992212295532, "learning_rate": 0.00017558718861209965, "loss": 2.0177, "step": 1382 }, { "epoch": 0.6146666666666667, "grad_norm": 1.6473318338394165, "learning_rate": 0.00017556939501779358, "loss": 2.2605, "step": 1383 }, { "epoch": 0.6151111111111112, "grad_norm": 1.4791382551193237, "learning_rate": 0.00017555160142348754, "loss": 2.494, "step": 1384 }, { "epoch": 0.6155555555555555, "grad_norm": 1.4145492315292358, "learning_rate": 0.0001755338078291815, "loss": 2.1339, "step": 1385 }, { "epoch": 0.616, "grad_norm": 1.6023871898651123, "learning_rate": 0.00017551601423487545, "loss": 2.8089, "step": 1386 }, { "epoch": 0.6164444444444445, "grad_norm": 1.548842191696167, "learning_rate": 0.0001754982206405694, "loss": 2.2959, "step": 1387 }, { "epoch": 0.6168888888888889, "grad_norm": 1.6211026906967163, "learning_rate": 0.00017548042704626336, "loss": 2.6256, "step": 1388 }, { "epoch": 0.6173333333333333, "grad_norm": 1.508934497833252, "learning_rate": 0.0001754626334519573, "loss": 2.4794, "step": 1389 }, { "epoch": 0.6177777777777778, "grad_norm": 1.704952359199524, "learning_rate": 0.00017544483985765125, "loss": 2.6322, "step": 1390 }, { "epoch": 0.6182222222222222, "grad_norm": 1.4746431112289429, "learning_rate": 0.0001754270462633452, "loss": 2.2247, "step": 1391 }, { "epoch": 0.6186666666666667, "grad_norm": 1.5690298080444336, "learning_rate": 0.00017540925266903916, "loss": 2.1532, "step": 1392 }, { "epoch": 0.6191111111111111, "grad_norm": 1.7075462341308594, "learning_rate": 0.00017539145907473312, "loss": 2.864, "step": 1393 }, { "epoch": 0.6195555555555555, "grad_norm": 1.909990906715393, "learning_rate": 0.00017537366548042707, "loss": 2.8461, "step": 1394 }, { "epoch": 0.62, "grad_norm": 1.7880994081497192, "learning_rate": 0.000175355871886121, "loss": 2.6034, "step": 1395 }, { "epoch": 0.6204444444444445, "grad_norm": 1.8660807609558105, "learning_rate": 0.00017533807829181493, "loss": 2.372, "step": 1396 }, { "epoch": 0.6208888888888889, "grad_norm": 1.9713836908340454, "learning_rate": 0.0001753202846975089, "loss": 2.4238, "step": 1397 }, { "epoch": 0.6213333333333333, "grad_norm": 2.378556966781616, "learning_rate": 0.00017530249110320285, "loss": 2.9749, "step": 1398 }, { "epoch": 0.6217777777777778, "grad_norm": 3.1581037044525146, "learning_rate": 0.0001752846975088968, "loss": 0.1631, "step": 1399 }, { "epoch": 0.6222222222222222, "grad_norm": 2.725257396697998, "learning_rate": 0.00017526690391459076, "loss": 2.3153, "step": 1400 }, { "epoch": 0.6226666666666667, "grad_norm": 1.687172770500183, "learning_rate": 0.00017524911032028472, "loss": 1.8111, "step": 1401 }, { "epoch": 0.6231111111111111, "grad_norm": 1.2486687898635864, "learning_rate": 0.00017523131672597865, "loss": 1.6803, "step": 1402 }, { "epoch": 0.6235555555555555, "grad_norm": 0.9351308345794678, "learning_rate": 0.0001752135231316726, "loss": 2.5876, "step": 1403 }, { "epoch": 0.624, "grad_norm": 1.1229432821273804, "learning_rate": 0.00017519572953736656, "loss": 2.5936, "step": 1404 }, { "epoch": 0.6244444444444445, "grad_norm": 0.9976657629013062, "learning_rate": 0.00017517793594306051, "loss": 2.7395, "step": 1405 }, { "epoch": 0.6248888888888889, "grad_norm": 0.9780849814414978, "learning_rate": 0.00017516014234875447, "loss": 2.1686, "step": 1406 }, { "epoch": 0.6253333333333333, "grad_norm": 1.0690516233444214, "learning_rate": 0.00017514234875444843, "loss": 2.2365, "step": 1407 }, { "epoch": 0.6257777777777778, "grad_norm": 1.1127312183380127, "learning_rate": 0.00017512455516014236, "loss": 2.5104, "step": 1408 }, { "epoch": 0.6262222222222222, "grad_norm": 1.1946388483047485, "learning_rate": 0.00017510676156583629, "loss": 2.7337, "step": 1409 }, { "epoch": 0.6266666666666667, "grad_norm": 1.0641489028930664, "learning_rate": 0.00017508896797153024, "loss": 2.5558, "step": 1410 }, { "epoch": 0.6271111111111111, "grad_norm": 1.127685546875, "learning_rate": 0.0001750711743772242, "loss": 2.517, "step": 1411 }, { "epoch": 0.6275555555555555, "grad_norm": 1.2236816883087158, "learning_rate": 0.00017505338078291816, "loss": 2.6978, "step": 1412 }, { "epoch": 0.628, "grad_norm": 1.4464499950408936, "learning_rate": 0.0001750355871886121, "loss": 2.6535, "step": 1413 }, { "epoch": 0.6284444444444445, "grad_norm": 1.4766771793365479, "learning_rate": 0.00017501779359430607, "loss": 3.3998, "step": 1414 }, { "epoch": 0.6288888888888889, "grad_norm": 1.2874871492385864, "learning_rate": 0.000175, "loss": 2.3482, "step": 1415 }, { "epoch": 0.6293333333333333, "grad_norm": 1.3195867538452148, "learning_rate": 0.00017498220640569395, "loss": 2.7828, "step": 1416 }, { "epoch": 0.6297777777777778, "grad_norm": 1.2177435159683228, "learning_rate": 0.0001749644128113879, "loss": 2.3438, "step": 1417 }, { "epoch": 0.6302222222222222, "grad_norm": 1.236802577972412, "learning_rate": 0.00017494661921708187, "loss": 2.6197, "step": 1418 }, { "epoch": 0.6306666666666667, "grad_norm": 1.363099217414856, "learning_rate": 0.00017492882562277582, "loss": 2.6425, "step": 1419 }, { "epoch": 0.6311111111111111, "grad_norm": 2.4340715408325195, "learning_rate": 0.00017491103202846978, "loss": 1.5395, "step": 1420 }, { "epoch": 0.6315555555555555, "grad_norm": 1.425446629524231, "learning_rate": 0.0001748932384341637, "loss": 2.6681, "step": 1421 }, { "epoch": 0.632, "grad_norm": 1.3142110109329224, "learning_rate": 0.00017487544483985764, "loss": 2.53, "step": 1422 }, { "epoch": 0.6324444444444445, "grad_norm": 1.947712779045105, "learning_rate": 0.0001748576512455516, "loss": 2.8098, "step": 1423 }, { "epoch": 0.6328888888888888, "grad_norm": 1.202399730682373, "learning_rate": 0.00017483985765124555, "loss": 1.8831, "step": 1424 }, { "epoch": 0.6333333333333333, "grad_norm": 1.3699499368667603, "learning_rate": 0.0001748220640569395, "loss": 2.9502, "step": 1425 }, { "epoch": 0.6337777777777778, "grad_norm": 1.4097647666931152, "learning_rate": 0.00017480427046263347, "loss": 2.8757, "step": 1426 }, { "epoch": 0.6342222222222222, "grad_norm": 1.3218767642974854, "learning_rate": 0.00017478647686832742, "loss": 2.299, "step": 1427 }, { "epoch": 0.6346666666666667, "grad_norm": 1.4610822200775146, "learning_rate": 0.00017476868327402135, "loss": 2.5177, "step": 1428 }, { "epoch": 0.6351111111111111, "grad_norm": 1.65794038772583, "learning_rate": 0.0001747508896797153, "loss": 2.8784, "step": 1429 }, { "epoch": 0.6355555555555555, "grad_norm": 1.5559744834899902, "learning_rate": 0.00017473309608540926, "loss": 2.3513, "step": 1430 }, { "epoch": 0.636, "grad_norm": 1.8620177507400513, "learning_rate": 0.00017471530249110322, "loss": 0.9563, "step": 1431 }, { "epoch": 0.6364444444444445, "grad_norm": 1.3389177322387695, "learning_rate": 0.00017469750889679718, "loss": 1.5891, "step": 1432 }, { "epoch": 0.6368888888888888, "grad_norm": 1.3747659921646118, "learning_rate": 0.00017467971530249113, "loss": 1.4177, "step": 1433 }, { "epoch": 0.6373333333333333, "grad_norm": 1.6347014904022217, "learning_rate": 0.00017466192170818506, "loss": 2.359, "step": 1434 }, { "epoch": 0.6377777777777778, "grad_norm": 1.6315451860427856, "learning_rate": 0.000174644128113879, "loss": 3.0319, "step": 1435 }, { "epoch": 0.6382222222222222, "grad_norm": 1.5996496677398682, "learning_rate": 0.00017462633451957295, "loss": 2.3993, "step": 1436 }, { "epoch": 0.6386666666666667, "grad_norm": 1.7360764741897583, "learning_rate": 0.0001746085409252669, "loss": 2.3704, "step": 1437 }, { "epoch": 0.6391111111111111, "grad_norm": 1.6707364320755005, "learning_rate": 0.00017459074733096086, "loss": 2.4405, "step": 1438 }, { "epoch": 0.6395555555555555, "grad_norm": 1.5770982503890991, "learning_rate": 0.00017457295373665482, "loss": 2.4132, "step": 1439 }, { "epoch": 0.64, "grad_norm": 1.9009085893630981, "learning_rate": 0.00017455516014234875, "loss": 2.6834, "step": 1440 }, { "epoch": 0.6404444444444445, "grad_norm": 1.7474616765975952, "learning_rate": 0.0001745373665480427, "loss": 2.341, "step": 1441 }, { "epoch": 0.6408888888888888, "grad_norm": 2.131709337234497, "learning_rate": 0.00017451957295373666, "loss": 2.554, "step": 1442 }, { "epoch": 0.6413333333333333, "grad_norm": 2.146632194519043, "learning_rate": 0.00017450177935943062, "loss": 2.8733, "step": 1443 }, { "epoch": 0.6417777777777778, "grad_norm": 2.0064449310302734, "learning_rate": 0.00017448398576512457, "loss": 2.4239, "step": 1444 }, { "epoch": 0.6422222222222222, "grad_norm": 2.1821370124816895, "learning_rate": 0.00017446619217081853, "loss": 3.9109, "step": 1445 }, { "epoch": 0.6426666666666667, "grad_norm": 1.7780088186264038, "learning_rate": 0.00017444839857651246, "loss": 2.3761, "step": 1446 }, { "epoch": 0.6431111111111111, "grad_norm": 1.9316993951797485, "learning_rate": 0.00017443060498220642, "loss": 2.977, "step": 1447 }, { "epoch": 0.6435555555555555, "grad_norm": 2.113534927368164, "learning_rate": 0.00017441281138790035, "loss": 2.6501, "step": 1448 }, { "epoch": 0.644, "grad_norm": 2.1450841426849365, "learning_rate": 0.0001743950177935943, "loss": 1.5886, "step": 1449 }, { "epoch": 0.6444444444444445, "grad_norm": 2.311339855194092, "learning_rate": 0.00017437722419928826, "loss": 2.0959, "step": 1450 }, { "epoch": 0.6448888888888888, "grad_norm": 3.696924924850464, "learning_rate": 0.00017435943060498222, "loss": 1.73, "step": 1451 }, { "epoch": 0.6453333333333333, "grad_norm": 1.0169023275375366, "learning_rate": 0.00017434163701067617, "loss": 2.5696, "step": 1452 }, { "epoch": 0.6457777777777778, "grad_norm": 1.278465747833252, "learning_rate": 0.0001743238434163701, "loss": 2.876, "step": 1453 }, { "epoch": 0.6462222222222223, "grad_norm": 1.182702898979187, "learning_rate": 0.00017430604982206406, "loss": 2.4447, "step": 1454 }, { "epoch": 0.6466666666666666, "grad_norm": 1.3612101078033447, "learning_rate": 0.00017428825622775801, "loss": 2.5971, "step": 1455 }, { "epoch": 0.6471111111111111, "grad_norm": 1.2513208389282227, "learning_rate": 0.00017427046263345197, "loss": 1.7877, "step": 1456 }, { "epoch": 0.6475555555555556, "grad_norm": 1.108196496963501, "learning_rate": 0.00017425266903914593, "loss": 2.2443, "step": 1457 }, { "epoch": 0.648, "grad_norm": 1.3617606163024902, "learning_rate": 0.00017423487544483988, "loss": 2.4862, "step": 1458 }, { "epoch": 0.6484444444444445, "grad_norm": 1.2257615327835083, "learning_rate": 0.0001742170818505338, "loss": 2.6374, "step": 1459 }, { "epoch": 0.6488888888888888, "grad_norm": 1.6558245420455933, "learning_rate": 0.00017419928825622777, "loss": 1.4984, "step": 1460 }, { "epoch": 0.6493333333333333, "grad_norm": 1.308287262916565, "learning_rate": 0.0001741814946619217, "loss": 2.5151, "step": 1461 }, { "epoch": 0.6497777777777778, "grad_norm": 1.210597038269043, "learning_rate": 0.00017416370106761566, "loss": 2.0015, "step": 1462 }, { "epoch": 0.6502222222222223, "grad_norm": 1.6040871143341064, "learning_rate": 0.0001741459074733096, "loss": 2.5846, "step": 1463 }, { "epoch": 0.6506666666666666, "grad_norm": 1.5346297025680542, "learning_rate": 0.00017412811387900357, "loss": 2.6826, "step": 1464 }, { "epoch": 0.6511111111111111, "grad_norm": 1.5624873638153076, "learning_rate": 0.00017411032028469752, "loss": 2.7424, "step": 1465 }, { "epoch": 0.6515555555555556, "grad_norm": 1.3819774389266968, "learning_rate": 0.00017409252669039145, "loss": 2.7193, "step": 1466 }, { "epoch": 0.652, "grad_norm": 1.2688080072402954, "learning_rate": 0.0001740747330960854, "loss": 2.2891, "step": 1467 }, { "epoch": 0.6524444444444445, "grad_norm": 1.2661856412887573, "learning_rate": 0.00017405693950177937, "loss": 1.2741, "step": 1468 }, { "epoch": 0.6528888888888889, "grad_norm": 1.215834617614746, "learning_rate": 0.00017403914590747332, "loss": 2.3391, "step": 1469 }, { "epoch": 0.6533333333333333, "grad_norm": 1.450677752494812, "learning_rate": 0.00017402135231316728, "loss": 2.752, "step": 1470 }, { "epoch": 0.6537777777777778, "grad_norm": 1.2563883066177368, "learning_rate": 0.00017400355871886124, "loss": 2.3388, "step": 1471 }, { "epoch": 0.6542222222222223, "grad_norm": 1.5057649612426758, "learning_rate": 0.00017398576512455517, "loss": 2.4776, "step": 1472 }, { "epoch": 0.6546666666666666, "grad_norm": 1.5081661939620972, "learning_rate": 0.00017396797153024912, "loss": 2.4154, "step": 1473 }, { "epoch": 0.6551111111111111, "grad_norm": 1.3150933980941772, "learning_rate": 0.00017395017793594305, "loss": 2.4713, "step": 1474 }, { "epoch": 0.6555555555555556, "grad_norm": 1.830236554145813, "learning_rate": 0.000173932384341637, "loss": 2.4647, "step": 1475 }, { "epoch": 0.656, "grad_norm": 1.368726372718811, "learning_rate": 0.00017391459074733097, "loss": 2.1934, "step": 1476 }, { "epoch": 0.6564444444444445, "grad_norm": 1.3172972202301025, "learning_rate": 0.00017389679715302492, "loss": 2.269, "step": 1477 }, { "epoch": 0.6568888888888889, "grad_norm": 1.7238434553146362, "learning_rate": 0.00017387900355871888, "loss": 2.5588, "step": 1478 }, { "epoch": 0.6573333333333333, "grad_norm": 1.5723549127578735, "learning_rate": 0.0001738612099644128, "loss": 2.3316, "step": 1479 }, { "epoch": 0.6577777777777778, "grad_norm": 1.4798550605773926, "learning_rate": 0.00017384341637010676, "loss": 2.2944, "step": 1480 }, { "epoch": 0.6582222222222223, "grad_norm": 1.411055564880371, "learning_rate": 0.00017382562277580072, "loss": 2.5573, "step": 1481 }, { "epoch": 0.6586666666666666, "grad_norm": 2.178536891937256, "learning_rate": 0.00017380782918149468, "loss": 2.6412, "step": 1482 }, { "epoch": 0.6591111111111111, "grad_norm": 1.48294198513031, "learning_rate": 0.00017379003558718863, "loss": 2.2673, "step": 1483 }, { "epoch": 0.6595555555555556, "grad_norm": 1.597641944885254, "learning_rate": 0.0001737722419928826, "loss": 2.1974, "step": 1484 }, { "epoch": 0.66, "grad_norm": 1.7174814939498901, "learning_rate": 0.00017375444839857652, "loss": 2.7182, "step": 1485 }, { "epoch": 0.6604444444444444, "grad_norm": 2.032003879547119, "learning_rate": 0.00017373665480427045, "loss": 3.3137, "step": 1486 }, { "epoch": 0.6608888888888889, "grad_norm": 1.495666742324829, "learning_rate": 0.0001737188612099644, "loss": 1.8833, "step": 1487 }, { "epoch": 0.6613333333333333, "grad_norm": 1.6871966123580933, "learning_rate": 0.00017370106761565836, "loss": 3.0253, "step": 1488 }, { "epoch": 0.6617777777777778, "grad_norm": 1.5694763660430908, "learning_rate": 0.00017368327402135232, "loss": 1.9472, "step": 1489 }, { "epoch": 0.6622222222222223, "grad_norm": 1.8708395957946777, "learning_rate": 0.00017366548042704627, "loss": 2.8555, "step": 1490 }, { "epoch": 0.6626666666666666, "grad_norm": 1.8783323764801025, "learning_rate": 0.00017364768683274023, "loss": 2.4134, "step": 1491 }, { "epoch": 0.6631111111111111, "grad_norm": 2.046388626098633, "learning_rate": 0.00017362989323843416, "loss": 2.7402, "step": 1492 }, { "epoch": 0.6635555555555556, "grad_norm": 1.5785688161849976, "learning_rate": 0.00017361209964412812, "loss": 2.4242, "step": 1493 }, { "epoch": 0.664, "grad_norm": 1.7086628675460815, "learning_rate": 0.00017359430604982207, "loss": 2.3236, "step": 1494 }, { "epoch": 0.6644444444444444, "grad_norm": 1.5549596548080444, "learning_rate": 0.00017357651245551603, "loss": 2.23, "step": 1495 }, { "epoch": 0.6648888888888889, "grad_norm": 2.255401372909546, "learning_rate": 0.00017355871886121, "loss": 3.0353, "step": 1496 }, { "epoch": 0.6653333333333333, "grad_norm": 1.643557071685791, "learning_rate": 0.00017354092526690394, "loss": 2.4116, "step": 1497 }, { "epoch": 0.6657777777777778, "grad_norm": 2.1392343044281006, "learning_rate": 0.00017352313167259787, "loss": 3.4415, "step": 1498 }, { "epoch": 0.6662222222222223, "grad_norm": 2.6055009365081787, "learning_rate": 0.0001735053380782918, "loss": 0.1766, "step": 1499 }, { "epoch": 0.6666666666666666, "grad_norm": 2.152284622192383, "learning_rate": 0.00017348754448398576, "loss": 2.9102, "step": 1500 }, { "epoch": 0.6671111111111111, "grad_norm": 0.9273963570594788, "learning_rate": 0.00017346975088967971, "loss": 2.428, "step": 1501 }, { "epoch": 0.6675555555555556, "grad_norm": 0.9428668022155762, "learning_rate": 0.00017345195729537367, "loss": 2.6534, "step": 1502 }, { "epoch": 0.668, "grad_norm": 0.93215012550354, "learning_rate": 0.00017343416370106763, "loss": 2.351, "step": 1503 }, { "epoch": 0.6684444444444444, "grad_norm": 1.035394549369812, "learning_rate": 0.00017341637010676158, "loss": 1.1251, "step": 1504 }, { "epoch": 0.6688888888888889, "grad_norm": 1.0628288984298706, "learning_rate": 0.00017339857651245551, "loss": 2.4944, "step": 1505 }, { "epoch": 0.6693333333333333, "grad_norm": 1.3216973543167114, "learning_rate": 0.00017338078291814947, "loss": 2.5529, "step": 1506 }, { "epoch": 0.6697777777777778, "grad_norm": 1.2949331998825073, "learning_rate": 0.00017336298932384343, "loss": 2.5878, "step": 1507 }, { "epoch": 0.6702222222222223, "grad_norm": 1.3365072011947632, "learning_rate": 0.00017334519572953738, "loss": 3.059, "step": 1508 }, { "epoch": 0.6706666666666666, "grad_norm": 1.181065320968628, "learning_rate": 0.00017332740213523134, "loss": 2.6428, "step": 1509 }, { "epoch": 0.6711111111111111, "grad_norm": 1.2061887979507446, "learning_rate": 0.0001733096085409253, "loss": 1.4412, "step": 1510 }, { "epoch": 0.6715555555555556, "grad_norm": 1.4257198572158813, "learning_rate": 0.00017329181494661923, "loss": 2.9952, "step": 1511 }, { "epoch": 0.672, "grad_norm": 1.2718660831451416, "learning_rate": 0.00017327402135231316, "loss": 2.3034, "step": 1512 }, { "epoch": 0.6724444444444444, "grad_norm": 1.2620964050292969, "learning_rate": 0.0001732562277580071, "loss": 1.7634, "step": 1513 }, { "epoch": 0.6728888888888889, "grad_norm": 1.4162237644195557, "learning_rate": 0.00017323843416370107, "loss": 2.2134, "step": 1514 }, { "epoch": 0.6733333333333333, "grad_norm": 1.3909696340560913, "learning_rate": 0.00017322064056939502, "loss": 2.7267, "step": 1515 }, { "epoch": 0.6737777777777778, "grad_norm": 1.563040852546692, "learning_rate": 0.00017320284697508898, "loss": 1.83, "step": 1516 }, { "epoch": 0.6742222222222222, "grad_norm": 1.417112112045288, "learning_rate": 0.00017318505338078294, "loss": 2.1559, "step": 1517 }, { "epoch": 0.6746666666666666, "grad_norm": 1.617037057876587, "learning_rate": 0.00017316725978647687, "loss": 2.7987, "step": 1518 }, { "epoch": 0.6751111111111111, "grad_norm": 1.5133682489395142, "learning_rate": 0.00017314946619217082, "loss": 2.5342, "step": 1519 }, { "epoch": 0.6755555555555556, "grad_norm": 1.3706659078598022, "learning_rate": 0.00017313167259786478, "loss": 2.583, "step": 1520 }, { "epoch": 0.676, "grad_norm": 1.600906252861023, "learning_rate": 0.00017311387900355874, "loss": 2.7443, "step": 1521 }, { "epoch": 0.6764444444444444, "grad_norm": 1.269012689590454, "learning_rate": 0.0001730960854092527, "loss": 2.3754, "step": 1522 }, { "epoch": 0.6768888888888889, "grad_norm": 1.382144808769226, "learning_rate": 0.00017307829181494665, "loss": 2.5199, "step": 1523 }, { "epoch": 0.6773333333333333, "grad_norm": 1.71562659740448, "learning_rate": 0.00017306049822064058, "loss": 3.278, "step": 1524 }, { "epoch": 0.6777777777777778, "grad_norm": 1.2466914653778076, "learning_rate": 0.0001730427046263345, "loss": 2.3992, "step": 1525 }, { "epoch": 0.6782222222222222, "grad_norm": 1.547672986984253, "learning_rate": 0.00017302491103202846, "loss": 2.4622, "step": 1526 }, { "epoch": 0.6786666666666666, "grad_norm": 1.5382349491119385, "learning_rate": 0.00017300711743772242, "loss": 2.1739, "step": 1527 }, { "epoch": 0.6791111111111111, "grad_norm": 1.4863885641098022, "learning_rate": 0.00017298932384341638, "loss": 2.6332, "step": 1528 }, { "epoch": 0.6795555555555556, "grad_norm": 1.6594899892807007, "learning_rate": 0.00017297153024911033, "loss": 3.0222, "step": 1529 }, { "epoch": 0.68, "grad_norm": 1.72464120388031, "learning_rate": 0.00017295373665480426, "loss": 2.6409, "step": 1530 }, { "epoch": 0.6804444444444444, "grad_norm": 1.5191189050674438, "learning_rate": 0.00017293594306049822, "loss": 2.6336, "step": 1531 }, { "epoch": 0.6808888888888889, "grad_norm": 1.8320003747940063, "learning_rate": 0.00017291814946619218, "loss": 2.7605, "step": 1532 }, { "epoch": 0.6813333333333333, "grad_norm": 1.6135400533676147, "learning_rate": 0.00017290035587188613, "loss": 2.6308, "step": 1533 }, { "epoch": 0.6817777777777778, "grad_norm": 1.8295135498046875, "learning_rate": 0.0001728825622775801, "loss": 2.5421, "step": 1534 }, { "epoch": 0.6822222222222222, "grad_norm": 1.799838900566101, "learning_rate": 0.00017286476868327405, "loss": 3.0082, "step": 1535 }, { "epoch": 0.6826666666666666, "grad_norm": 1.427129864692688, "learning_rate": 0.00017284697508896798, "loss": 2.5288, "step": 1536 }, { "epoch": 0.6831111111111111, "grad_norm": 1.533745527267456, "learning_rate": 0.00017282918149466193, "loss": 2.7791, "step": 1537 }, { "epoch": 0.6835555555555556, "grad_norm": 1.6388285160064697, "learning_rate": 0.00017281138790035586, "loss": 3.325, "step": 1538 }, { "epoch": 0.684, "grad_norm": 1.5030821561813354, "learning_rate": 0.00017279359430604982, "loss": 2.0334, "step": 1539 }, { "epoch": 0.6844444444444444, "grad_norm": 1.2664015293121338, "learning_rate": 0.00017277580071174377, "loss": 2.0804, "step": 1540 }, { "epoch": 0.6848888888888889, "grad_norm": 1.496182918548584, "learning_rate": 0.00017275800711743773, "loss": 1.6106, "step": 1541 }, { "epoch": 0.6853333333333333, "grad_norm": 1.7371916770935059, "learning_rate": 0.0001727402135231317, "loss": 2.6694, "step": 1542 }, { "epoch": 0.6857777777777778, "grad_norm": 1.8223196268081665, "learning_rate": 0.00017272241992882562, "loss": 2.5409, "step": 1543 }, { "epoch": 0.6862222222222222, "grad_norm": 1.9429682493209839, "learning_rate": 0.00017270462633451957, "loss": 2.5704, "step": 1544 }, { "epoch": 0.6866666666666666, "grad_norm": 1.7982163429260254, "learning_rate": 0.00017268683274021353, "loss": 2.9834, "step": 1545 }, { "epoch": 0.6871111111111111, "grad_norm": 1.8245515823364258, "learning_rate": 0.00017266903914590749, "loss": 2.6373, "step": 1546 }, { "epoch": 0.6875555555555556, "grad_norm": 1.7472467422485352, "learning_rate": 0.00017265124555160144, "loss": 2.8218, "step": 1547 }, { "epoch": 0.688, "grad_norm": 1.9026468992233276, "learning_rate": 0.0001726334519572954, "loss": 3.0626, "step": 1548 }, { "epoch": 0.6884444444444444, "grad_norm": 2.187288522720337, "learning_rate": 0.00017261565836298933, "loss": 2.3243, "step": 1549 }, { "epoch": 0.6888888888888889, "grad_norm": 2.155287742614746, "learning_rate": 0.00017259786476868329, "loss": 3.0798, "step": 1550 }, { "epoch": 0.6893333333333334, "grad_norm": 2.2100751399993896, "learning_rate": 0.00017258007117437721, "loss": 2.1143, "step": 1551 }, { "epoch": 0.6897777777777778, "grad_norm": 1.0366290807724, "learning_rate": 0.00017256227758007117, "loss": 2.1292, "step": 1552 }, { "epoch": 0.6902222222222222, "grad_norm": 1.2031378746032715, "learning_rate": 0.00017254448398576513, "loss": 2.6824, "step": 1553 }, { "epoch": 0.6906666666666667, "grad_norm": 1.154441475868225, "learning_rate": 0.00017252669039145908, "loss": 2.203, "step": 1554 }, { "epoch": 0.6911111111111111, "grad_norm": 1.2247297763824463, "learning_rate": 0.00017250889679715304, "loss": 2.8808, "step": 1555 }, { "epoch": 0.6915555555555556, "grad_norm": 1.3467925786972046, "learning_rate": 0.00017249110320284697, "loss": 2.9562, "step": 1556 }, { "epoch": 0.692, "grad_norm": 1.1302120685577393, "learning_rate": 0.00017247330960854093, "loss": 2.485, "step": 1557 }, { "epoch": 0.6924444444444444, "grad_norm": 1.2643386125564575, "learning_rate": 0.00017245551601423488, "loss": 2.572, "step": 1558 }, { "epoch": 0.6928888888888889, "grad_norm": 1.353574514389038, "learning_rate": 0.00017243772241992884, "loss": 2.671, "step": 1559 }, { "epoch": 0.6933333333333334, "grad_norm": 1.7643210887908936, "learning_rate": 0.0001724199288256228, "loss": 2.6473, "step": 1560 }, { "epoch": 0.6937777777777778, "grad_norm": 1.5903198719024658, "learning_rate": 0.00017240213523131675, "loss": 2.2954, "step": 1561 }, { "epoch": 0.6942222222222222, "grad_norm": 1.627631425857544, "learning_rate": 0.00017238434163701068, "loss": 2.233, "step": 1562 }, { "epoch": 0.6946666666666667, "grad_norm": 1.3801125288009644, "learning_rate": 0.00017236654804270464, "loss": 2.2972, "step": 1563 }, { "epoch": 0.6951111111111111, "grad_norm": 1.451066017150879, "learning_rate": 0.00017234875444839857, "loss": 2.013, "step": 1564 }, { "epoch": 0.6955555555555556, "grad_norm": 1.316686987876892, "learning_rate": 0.00017233096085409252, "loss": 2.3651, "step": 1565 }, { "epoch": 0.696, "grad_norm": 1.300595760345459, "learning_rate": 0.00017231316725978648, "loss": 2.0519, "step": 1566 }, { "epoch": 0.6964444444444444, "grad_norm": 1.4267830848693848, "learning_rate": 0.00017229537366548044, "loss": 2.2664, "step": 1567 }, { "epoch": 0.6968888888888889, "grad_norm": 1.5179320573806763, "learning_rate": 0.0001722775800711744, "loss": 2.3228, "step": 1568 }, { "epoch": 0.6973333333333334, "grad_norm": 1.7718604803085327, "learning_rate": 0.00017225978647686832, "loss": 2.6723, "step": 1569 }, { "epoch": 0.6977777777777778, "grad_norm": 2.2856781482696533, "learning_rate": 0.00017224199288256228, "loss": 2.3014, "step": 1570 }, { "epoch": 0.6982222222222222, "grad_norm": 1.3740836381912231, "learning_rate": 0.00017222419928825624, "loss": 2.7384, "step": 1571 }, { "epoch": 0.6986666666666667, "grad_norm": 1.3104565143585205, "learning_rate": 0.0001722064056939502, "loss": 2.4267, "step": 1572 }, { "epoch": 0.6991111111111111, "grad_norm": 1.5572513341903687, "learning_rate": 0.00017218861209964415, "loss": 2.7882, "step": 1573 }, { "epoch": 0.6995555555555556, "grad_norm": 1.344378113746643, "learning_rate": 0.0001721708185053381, "loss": 2.6669, "step": 1574 }, { "epoch": 0.7, "grad_norm": 1.6647869348526, "learning_rate": 0.00017215302491103203, "loss": 2.5764, "step": 1575 }, { "epoch": 0.7004444444444444, "grad_norm": 1.4547927379608154, "learning_rate": 0.000172135231316726, "loss": 2.4882, "step": 1576 }, { "epoch": 0.7008888888888889, "grad_norm": 1.69290030002594, "learning_rate": 0.00017211743772241992, "loss": 2.6278, "step": 1577 }, { "epoch": 0.7013333333333334, "grad_norm": 1.5832207202911377, "learning_rate": 0.00017209964412811388, "loss": 2.3207, "step": 1578 }, { "epoch": 0.7017777777777777, "grad_norm": 1.6772409677505493, "learning_rate": 0.00017208185053380783, "loss": 2.9327, "step": 1579 }, { "epoch": 0.7022222222222222, "grad_norm": 1.6155133247375488, "learning_rate": 0.0001720640569395018, "loss": 2.6978, "step": 1580 }, { "epoch": 0.7026666666666667, "grad_norm": 1.2734161615371704, "learning_rate": 0.00017204626334519575, "loss": 1.9025, "step": 1581 }, { "epoch": 0.7031111111111111, "grad_norm": 1.4908726215362549, "learning_rate": 0.00017202846975088968, "loss": 2.1544, "step": 1582 }, { "epoch": 0.7035555555555556, "grad_norm": 1.7874783277511597, "learning_rate": 0.00017201067615658363, "loss": 3.0701, "step": 1583 }, { "epoch": 0.704, "grad_norm": 2.0259952545166016, "learning_rate": 0.0001719928825622776, "loss": 2.926, "step": 1584 }, { "epoch": 0.7044444444444444, "grad_norm": 1.7444260120391846, "learning_rate": 0.00017197508896797155, "loss": 2.2407, "step": 1585 }, { "epoch": 0.7048888888888889, "grad_norm": 1.7600386142730713, "learning_rate": 0.0001719572953736655, "loss": 2.6499, "step": 1586 }, { "epoch": 0.7053333333333334, "grad_norm": 1.7595195770263672, "learning_rate": 0.00017193950177935946, "loss": 2.4446, "step": 1587 }, { "epoch": 0.7057777777777777, "grad_norm": 1.9496681690216064, "learning_rate": 0.0001719217081850534, "loss": 3.0214, "step": 1588 }, { "epoch": 0.7062222222222222, "grad_norm": 1.8185930252075195, "learning_rate": 0.00017190391459074734, "loss": 2.5126, "step": 1589 }, { "epoch": 0.7066666666666667, "grad_norm": 1.7105134725570679, "learning_rate": 0.00017188612099644127, "loss": 2.6629, "step": 1590 }, { "epoch": 0.7071111111111111, "grad_norm": 1.8199312686920166, "learning_rate": 0.00017186832740213523, "loss": 2.2038, "step": 1591 }, { "epoch": 0.7075555555555556, "grad_norm": 1.6544042825698853, "learning_rate": 0.0001718505338078292, "loss": 2.4603, "step": 1592 }, { "epoch": 0.708, "grad_norm": 1.5924146175384521, "learning_rate": 0.00017183274021352314, "loss": 2.5588, "step": 1593 }, { "epoch": 0.7084444444444444, "grad_norm": 1.6457511186599731, "learning_rate": 0.0001718149466192171, "loss": 2.7249, "step": 1594 }, { "epoch": 0.7088888888888889, "grad_norm": 1.7341830730438232, "learning_rate": 0.00017179715302491103, "loss": 2.7286, "step": 1595 }, { "epoch": 0.7093333333333334, "grad_norm": 1.8553625345230103, "learning_rate": 0.00017177935943060499, "loss": 3.2606, "step": 1596 }, { "epoch": 0.7097777777777777, "grad_norm": 2.046403408050537, "learning_rate": 0.00017176156583629894, "loss": 2.7661, "step": 1597 }, { "epoch": 0.7102222222222222, "grad_norm": 1.9650744199752808, "learning_rate": 0.0001717437722419929, "loss": 2.7783, "step": 1598 }, { "epoch": 0.7106666666666667, "grad_norm": 2.4216885566711426, "learning_rate": 0.00017172597864768686, "loss": 2.6446, "step": 1599 }, { "epoch": 0.7111111111111111, "grad_norm": 2.498917818069458, "learning_rate": 0.0001717081850533808, "loss": 3.2051, "step": 1600 }, { "epoch": 0.7115555555555556, "grad_norm": 1.0230510234832764, "learning_rate": 0.00017169039145907474, "loss": 2.7331, "step": 1601 }, { "epoch": 0.712, "grad_norm": 1.6314057111740112, "learning_rate": 0.00017167259786476867, "loss": 1.3376, "step": 1602 }, { "epoch": 0.7124444444444444, "grad_norm": 1.033645510673523, "learning_rate": 0.00017165480427046263, "loss": 2.7086, "step": 1603 }, { "epoch": 0.7128888888888889, "grad_norm": 1.1229987144470215, "learning_rate": 0.00017163701067615658, "loss": 2.2418, "step": 1604 }, { "epoch": 0.7133333333333334, "grad_norm": 1.2182966470718384, "learning_rate": 0.00017161921708185054, "loss": 2.8067, "step": 1605 }, { "epoch": 0.7137777777777777, "grad_norm": 1.102390170097351, "learning_rate": 0.0001716014234875445, "loss": 3.0503, "step": 1606 }, { "epoch": 0.7142222222222222, "grad_norm": 1.0558178424835205, "learning_rate": 0.00017158362989323845, "loss": 3.2076, "step": 1607 }, { "epoch": 0.7146666666666667, "grad_norm": 1.236777424812317, "learning_rate": 0.00017156583629893238, "loss": 2.8685, "step": 1608 }, { "epoch": 0.7151111111111111, "grad_norm": 1.6383613348007202, "learning_rate": 0.00017154804270462634, "loss": 2.9057, "step": 1609 }, { "epoch": 0.7155555555555555, "grad_norm": 1.3844068050384521, "learning_rate": 0.0001715302491103203, "loss": 2.5375, "step": 1610 }, { "epoch": 0.716, "grad_norm": 1.1408178806304932, "learning_rate": 0.00017151245551601425, "loss": 2.5761, "step": 1611 }, { "epoch": 0.7164444444444444, "grad_norm": 1.1641733646392822, "learning_rate": 0.0001714946619217082, "loss": 2.6517, "step": 1612 }, { "epoch": 0.7168888888888889, "grad_norm": 1.1055686473846436, "learning_rate": 0.00017147686832740214, "loss": 2.1878, "step": 1613 }, { "epoch": 0.7173333333333334, "grad_norm": 1.1979750394821167, "learning_rate": 0.0001714590747330961, "loss": 2.4771, "step": 1614 }, { "epoch": 0.7177777777777777, "grad_norm": 1.271674633026123, "learning_rate": 0.00017144128113879002, "loss": 2.4917, "step": 1615 }, { "epoch": 0.7182222222222222, "grad_norm": 1.3329592943191528, "learning_rate": 0.00017142348754448398, "loss": 2.5694, "step": 1616 }, { "epoch": 0.7186666666666667, "grad_norm": 1.4913185834884644, "learning_rate": 0.00017140569395017794, "loss": 2.8739, "step": 1617 }, { "epoch": 0.7191111111111111, "grad_norm": 1.3278918266296387, "learning_rate": 0.0001713879003558719, "loss": 2.6695, "step": 1618 }, { "epoch": 0.7195555555555555, "grad_norm": 1.5261479616165161, "learning_rate": 0.00017137010676156585, "loss": 1.4149, "step": 1619 }, { "epoch": 0.72, "grad_norm": 1.4373037815093994, "learning_rate": 0.00017135231316725978, "loss": 2.8322, "step": 1620 }, { "epoch": 0.7204444444444444, "grad_norm": 1.2355538606643677, "learning_rate": 0.00017133451957295374, "loss": 2.1176, "step": 1621 }, { "epoch": 0.7208888888888889, "grad_norm": 1.29166579246521, "learning_rate": 0.0001713167259786477, "loss": 2.5198, "step": 1622 }, { "epoch": 0.7213333333333334, "grad_norm": 1.6880923509597778, "learning_rate": 0.00017129893238434165, "loss": 2.9581, "step": 1623 }, { "epoch": 0.7217777777777777, "grad_norm": 1.5091137886047363, "learning_rate": 0.0001712811387900356, "loss": 2.2347, "step": 1624 }, { "epoch": 0.7222222222222222, "grad_norm": 1.5347869396209717, "learning_rate": 0.00017126334519572956, "loss": 2.6514, "step": 1625 }, { "epoch": 0.7226666666666667, "grad_norm": 1.6462808847427368, "learning_rate": 0.0001712455516014235, "loss": 2.9572, "step": 1626 }, { "epoch": 0.7231111111111111, "grad_norm": 1.5206083059310913, "learning_rate": 0.00017122775800711745, "loss": 2.6225, "step": 1627 }, { "epoch": 0.7235555555555555, "grad_norm": 1.6654325723648071, "learning_rate": 0.00017120996441281138, "loss": 2.4817, "step": 1628 }, { "epoch": 0.724, "grad_norm": 1.6981536149978638, "learning_rate": 0.00017119217081850533, "loss": 2.5816, "step": 1629 }, { "epoch": 0.7244444444444444, "grad_norm": 1.5140714645385742, "learning_rate": 0.0001711743772241993, "loss": 2.6285, "step": 1630 }, { "epoch": 0.7248888888888889, "grad_norm": 1.4249933958053589, "learning_rate": 0.00017115658362989325, "loss": 1.8135, "step": 1631 }, { "epoch": 0.7253333333333334, "grad_norm": 1.72703218460083, "learning_rate": 0.0001711387900355872, "loss": 2.7962, "step": 1632 }, { "epoch": 0.7257777777777777, "grad_norm": 1.43552565574646, "learning_rate": 0.00017112099644128113, "loss": 2.0849, "step": 1633 }, { "epoch": 0.7262222222222222, "grad_norm": 1.728300929069519, "learning_rate": 0.0001711032028469751, "loss": 3.2766, "step": 1634 }, { "epoch": 0.7266666666666667, "grad_norm": 1.7892253398895264, "learning_rate": 0.00017108540925266905, "loss": 2.9754, "step": 1635 }, { "epoch": 0.7271111111111112, "grad_norm": 1.731090784072876, "learning_rate": 0.000171067615658363, "loss": 2.6769, "step": 1636 }, { "epoch": 0.7275555555555555, "grad_norm": 1.4827601909637451, "learning_rate": 0.00017104982206405696, "loss": 2.1877, "step": 1637 }, { "epoch": 0.728, "grad_norm": 1.4949201345443726, "learning_rate": 0.00017103202846975091, "loss": 2.7004, "step": 1638 }, { "epoch": 0.7284444444444444, "grad_norm": 1.5235779285430908, "learning_rate": 0.00017101423487544484, "loss": 2.451, "step": 1639 }, { "epoch": 0.7288888888888889, "grad_norm": 1.7077027559280396, "learning_rate": 0.0001709964412811388, "loss": 2.4066, "step": 1640 }, { "epoch": 0.7293333333333333, "grad_norm": 1.3838727474212646, "learning_rate": 0.00017097864768683273, "loss": 2.0036, "step": 1641 }, { "epoch": 0.7297777777777777, "grad_norm": 1.5762803554534912, "learning_rate": 0.0001709608540925267, "loss": 2.3423, "step": 1642 }, { "epoch": 0.7302222222222222, "grad_norm": 1.5735541582107544, "learning_rate": 0.00017094306049822064, "loss": 2.6597, "step": 1643 }, { "epoch": 0.7306666666666667, "grad_norm": 2.120513916015625, "learning_rate": 0.0001709252669039146, "loss": 3.0132, "step": 1644 }, { "epoch": 0.7311111111111112, "grad_norm": 1.8525890111923218, "learning_rate": 0.00017090747330960856, "loss": 2.3857, "step": 1645 }, { "epoch": 0.7315555555555555, "grad_norm": 1.7836278676986694, "learning_rate": 0.00017088967971530249, "loss": 2.9966, "step": 1646 }, { "epoch": 0.732, "grad_norm": 2.076381206512451, "learning_rate": 0.00017087188612099644, "loss": 3.142, "step": 1647 }, { "epoch": 0.7324444444444445, "grad_norm": 2.3997738361358643, "learning_rate": 0.0001708540925266904, "loss": 3.5624, "step": 1648 }, { "epoch": 0.7328888888888889, "grad_norm": 2.19384503364563, "learning_rate": 0.00017083629893238435, "loss": 2.0362, "step": 1649 }, { "epoch": 0.7333333333333333, "grad_norm": 2.8410866260528564, "learning_rate": 0.0001708185053380783, "loss": 2.3181, "step": 1650 }, { "epoch": 0.7337777777777778, "grad_norm": 1.1997778415679932, "learning_rate": 0.00017080071174377227, "loss": 1.5159, "step": 1651 }, { "epoch": 0.7342222222222222, "grad_norm": 1.0796202421188354, "learning_rate": 0.0001707829181494662, "loss": 3.136, "step": 1652 }, { "epoch": 0.7346666666666667, "grad_norm": 1.2189334630966187, "learning_rate": 0.00017076512455516015, "loss": 2.7014, "step": 1653 }, { "epoch": 0.7351111111111112, "grad_norm": 1.2097785472869873, "learning_rate": 0.00017074733096085408, "loss": 3.3386, "step": 1654 }, { "epoch": 0.7355555555555555, "grad_norm": 1.1241984367370605, "learning_rate": 0.00017072953736654804, "loss": 2.809, "step": 1655 }, { "epoch": 0.736, "grad_norm": 1.315993309020996, "learning_rate": 0.000170711743772242, "loss": 2.1582, "step": 1656 }, { "epoch": 0.7364444444444445, "grad_norm": 1.2245084047317505, "learning_rate": 0.00017069395017793595, "loss": 2.5975, "step": 1657 }, { "epoch": 0.7368888888888889, "grad_norm": 1.542858362197876, "learning_rate": 0.0001706761565836299, "loss": 1.5061, "step": 1658 }, { "epoch": 0.7373333333333333, "grad_norm": 1.0316481590270996, "learning_rate": 0.00017065836298932384, "loss": 1.3895, "step": 1659 }, { "epoch": 0.7377777777777778, "grad_norm": 1.2059721946716309, "learning_rate": 0.0001706405693950178, "loss": 2.5289, "step": 1660 }, { "epoch": 0.7382222222222222, "grad_norm": 1.303240418434143, "learning_rate": 0.00017062277580071175, "loss": 2.6186, "step": 1661 }, { "epoch": 0.7386666666666667, "grad_norm": 1.26139235496521, "learning_rate": 0.0001706049822064057, "loss": 2.7619, "step": 1662 }, { "epoch": 0.7391111111111112, "grad_norm": 1.4750614166259766, "learning_rate": 0.00017058718861209966, "loss": 2.4993, "step": 1663 }, { "epoch": 0.7395555555555555, "grad_norm": 1.3910586833953857, "learning_rate": 0.00017056939501779362, "loss": 2.2823, "step": 1664 }, { "epoch": 0.74, "grad_norm": 1.4160467386245728, "learning_rate": 0.00017055160142348755, "loss": 2.102, "step": 1665 }, { "epoch": 0.7404444444444445, "grad_norm": 1.1895157098770142, "learning_rate": 0.0001705338078291815, "loss": 1.9158, "step": 1666 }, { "epoch": 0.7408888888888889, "grad_norm": 1.5959806442260742, "learning_rate": 0.00017051601423487544, "loss": 2.5473, "step": 1667 }, { "epoch": 0.7413333333333333, "grad_norm": 1.5768413543701172, "learning_rate": 0.0001704982206405694, "loss": 1.2562, "step": 1668 }, { "epoch": 0.7417777777777778, "grad_norm": 1.4953051805496216, "learning_rate": 0.00017048042704626335, "loss": 2.5303, "step": 1669 }, { "epoch": 0.7422222222222222, "grad_norm": 1.3557592630386353, "learning_rate": 0.0001704626334519573, "loss": 2.2733, "step": 1670 }, { "epoch": 0.7426666666666667, "grad_norm": 1.5126179456710815, "learning_rate": 0.00017044483985765126, "loss": 2.7248, "step": 1671 }, { "epoch": 0.7431111111111111, "grad_norm": 1.3499473333358765, "learning_rate": 0.0001704270462633452, "loss": 2.5348, "step": 1672 }, { "epoch": 0.7435555555555555, "grad_norm": 1.2511281967163086, "learning_rate": 0.00017040925266903915, "loss": 2.2185, "step": 1673 }, { "epoch": 0.744, "grad_norm": 1.4628006219863892, "learning_rate": 0.0001703914590747331, "loss": 2.3975, "step": 1674 }, { "epoch": 0.7444444444444445, "grad_norm": 1.4203425645828247, "learning_rate": 0.00017037366548042706, "loss": 2.1031, "step": 1675 }, { "epoch": 0.7448888888888889, "grad_norm": 1.3346225023269653, "learning_rate": 0.00017035587188612102, "loss": 1.9704, "step": 1676 }, { "epoch": 0.7453333333333333, "grad_norm": 1.3518871068954468, "learning_rate": 0.00017033807829181497, "loss": 2.6569, "step": 1677 }, { "epoch": 0.7457777777777778, "grad_norm": 1.5000810623168945, "learning_rate": 0.0001703202846975089, "loss": 2.1933, "step": 1678 }, { "epoch": 0.7462222222222222, "grad_norm": 1.6626880168914795, "learning_rate": 0.00017030249110320286, "loss": 2.166, "step": 1679 }, { "epoch": 0.7466666666666667, "grad_norm": 1.3020575046539307, "learning_rate": 0.0001702846975088968, "loss": 2.4969, "step": 1680 }, { "epoch": 0.7471111111111111, "grad_norm": 1.4085545539855957, "learning_rate": 0.00017026690391459075, "loss": 2.4218, "step": 1681 }, { "epoch": 0.7475555555555555, "grad_norm": 1.861728310585022, "learning_rate": 0.0001702491103202847, "loss": 2.2863, "step": 1682 }, { "epoch": 0.748, "grad_norm": 1.688712239265442, "learning_rate": 0.00017023131672597866, "loss": 2.773, "step": 1683 }, { "epoch": 0.7484444444444445, "grad_norm": 1.4581354856491089, "learning_rate": 0.00017021352313167262, "loss": 2.4764, "step": 1684 }, { "epoch": 0.7488888888888889, "grad_norm": 1.8125518560409546, "learning_rate": 0.00017019572953736654, "loss": 2.1238, "step": 1685 }, { "epoch": 0.7493333333333333, "grad_norm": 1.5851460695266724, "learning_rate": 0.0001701779359430605, "loss": 2.2135, "step": 1686 }, { "epoch": 0.7497777777777778, "grad_norm": 1.852522611618042, "learning_rate": 0.00017016014234875446, "loss": 2.3781, "step": 1687 }, { "epoch": 0.7502222222222222, "grad_norm": 1.500272512435913, "learning_rate": 0.00017014234875444841, "loss": 2.2147, "step": 1688 }, { "epoch": 0.7506666666666667, "grad_norm": 1.4343912601470947, "learning_rate": 0.00017012455516014237, "loss": 2.3782, "step": 1689 }, { "epoch": 0.7511111111111111, "grad_norm": 1.8600046634674072, "learning_rate": 0.00017010676156583633, "loss": 2.7655, "step": 1690 }, { "epoch": 0.7515555555555555, "grad_norm": 1.5437164306640625, "learning_rate": 0.00017008896797153026, "loss": 2.3394, "step": 1691 }, { "epoch": 0.752, "grad_norm": 1.9030184745788574, "learning_rate": 0.0001700711743772242, "loss": 2.7159, "step": 1692 }, { "epoch": 0.7524444444444445, "grad_norm": 2.081378698348999, "learning_rate": 0.00017005338078291814, "loss": 2.5658, "step": 1693 }, { "epoch": 0.7528888888888889, "grad_norm": 1.6532082557678223, "learning_rate": 0.0001700355871886121, "loss": 2.3354, "step": 1694 }, { "epoch": 0.7533333333333333, "grad_norm": 2.3726096153259277, "learning_rate": 0.00017001779359430606, "loss": 3.1188, "step": 1695 }, { "epoch": 0.7537777777777778, "grad_norm": 2.014913558959961, "learning_rate": 0.00017, "loss": 3.0022, "step": 1696 }, { "epoch": 0.7542222222222222, "grad_norm": 2.250953197479248, "learning_rate": 0.00016998220640569397, "loss": 2.8569, "step": 1697 }, { "epoch": 0.7546666666666667, "grad_norm": 1.7740085124969482, "learning_rate": 0.0001699644128113879, "loss": 2.3986, "step": 1698 }, { "epoch": 0.7551111111111111, "grad_norm": 1.9951434135437012, "learning_rate": 0.00016994661921708185, "loss": 1.5462, "step": 1699 }, { "epoch": 0.7555555555555555, "grad_norm": 2.3089957237243652, "learning_rate": 0.0001699288256227758, "loss": 2.038, "step": 1700 }, { "epoch": 0.756, "grad_norm": 0.9273460507392883, "learning_rate": 0.00016991103202846977, "loss": 2.2534, "step": 1701 }, { "epoch": 0.7564444444444445, "grad_norm": 1.0741509199142456, "learning_rate": 0.00016989323843416372, "loss": 2.6395, "step": 1702 }, { "epoch": 0.7568888888888889, "grad_norm": 1.2935476303100586, "learning_rate": 0.00016987544483985765, "loss": 2.7484, "step": 1703 }, { "epoch": 0.7573333333333333, "grad_norm": 1.2503875494003296, "learning_rate": 0.0001698576512455516, "loss": 2.4681, "step": 1704 }, { "epoch": 0.7577777777777778, "grad_norm": 1.1031012535095215, "learning_rate": 0.00016983985765124557, "loss": 2.5408, "step": 1705 }, { "epoch": 0.7582222222222222, "grad_norm": 1.1890628337860107, "learning_rate": 0.0001698220640569395, "loss": 2.9032, "step": 1706 }, { "epoch": 0.7586666666666667, "grad_norm": 1.1588826179504395, "learning_rate": 0.00016980427046263345, "loss": 2.3581, "step": 1707 }, { "epoch": 0.7591111111111111, "grad_norm": 1.2451859712600708, "learning_rate": 0.0001697864768683274, "loss": 2.4299, "step": 1708 }, { "epoch": 0.7595555555555555, "grad_norm": 1.4853380918502808, "learning_rate": 0.00016976868327402137, "loss": 3.0938, "step": 1709 }, { "epoch": 0.76, "grad_norm": 1.686055302619934, "learning_rate": 0.0001697508896797153, "loss": 2.3023, "step": 1710 }, { "epoch": 0.7604444444444445, "grad_norm": 1.3711706399917603, "learning_rate": 0.00016973309608540925, "loss": 2.6126, "step": 1711 }, { "epoch": 0.7608888888888888, "grad_norm": 1.3711973428726196, "learning_rate": 0.0001697153024911032, "loss": 2.634, "step": 1712 }, { "epoch": 0.7613333333333333, "grad_norm": 1.236276388168335, "learning_rate": 0.00016969750889679716, "loss": 2.6303, "step": 1713 }, { "epoch": 0.7617777777777778, "grad_norm": 1.2677000761032104, "learning_rate": 0.00016967971530249112, "loss": 2.646, "step": 1714 }, { "epoch": 0.7622222222222222, "grad_norm": 1.2618008852005005, "learning_rate": 0.00016966192170818508, "loss": 2.3963, "step": 1715 }, { "epoch": 0.7626666666666667, "grad_norm": 1.3823282718658447, "learning_rate": 0.000169644128113879, "loss": 2.6979, "step": 1716 }, { "epoch": 0.7631111111111111, "grad_norm": 1.3976502418518066, "learning_rate": 0.00016962633451957296, "loss": 1.525, "step": 1717 }, { "epoch": 0.7635555555555555, "grad_norm": 1.343619465827942, "learning_rate": 0.0001696085409252669, "loss": 2.9, "step": 1718 }, { "epoch": 0.764, "grad_norm": 1.1925876140594482, "learning_rate": 0.00016959074733096085, "loss": 1.9166, "step": 1719 }, { "epoch": 0.7644444444444445, "grad_norm": 1.782758355140686, "learning_rate": 0.0001695729537366548, "loss": 2.5998, "step": 1720 }, { "epoch": 0.7648888888888888, "grad_norm": 1.5515620708465576, "learning_rate": 0.00016955516014234876, "loss": 2.1778, "step": 1721 }, { "epoch": 0.7653333333333333, "grad_norm": 1.3295077085494995, "learning_rate": 0.00016953736654804272, "loss": 2.0455, "step": 1722 }, { "epoch": 0.7657777777777778, "grad_norm": 1.6880308389663696, "learning_rate": 0.00016951957295373665, "loss": 2.6615, "step": 1723 }, { "epoch": 0.7662222222222222, "grad_norm": 1.8657255172729492, "learning_rate": 0.0001695017793594306, "loss": 2.5311, "step": 1724 }, { "epoch": 0.7666666666666667, "grad_norm": 1.4304696321487427, "learning_rate": 0.00016948398576512456, "loss": 1.9907, "step": 1725 }, { "epoch": 0.7671111111111111, "grad_norm": 1.3921934366226196, "learning_rate": 0.00016946619217081852, "loss": 2.5942, "step": 1726 }, { "epoch": 0.7675555555555555, "grad_norm": 1.1713439226150513, "learning_rate": 0.00016944839857651247, "loss": 1.329, "step": 1727 }, { "epoch": 0.768, "grad_norm": 1.447230577468872, "learning_rate": 0.00016943060498220643, "loss": 2.2461, "step": 1728 }, { "epoch": 0.7684444444444445, "grad_norm": 1.5526431798934937, "learning_rate": 0.00016941281138790036, "loss": 2.5692, "step": 1729 }, { "epoch": 0.7688888888888888, "grad_norm": 1.864875078201294, "learning_rate": 0.00016939501779359432, "loss": 2.3305, "step": 1730 }, { "epoch": 0.7693333333333333, "grad_norm": 1.5413137674331665, "learning_rate": 0.00016937722419928825, "loss": 2.4763, "step": 1731 }, { "epoch": 0.7697777777777778, "grad_norm": 1.5602953433990479, "learning_rate": 0.0001693594306049822, "loss": 2.3397, "step": 1732 }, { "epoch": 0.7702222222222223, "grad_norm": 1.5029823780059814, "learning_rate": 0.00016934163701067616, "loss": 2.5413, "step": 1733 }, { "epoch": 0.7706666666666667, "grad_norm": 1.9721778631210327, "learning_rate": 0.00016932384341637012, "loss": 2.3882, "step": 1734 }, { "epoch": 0.7711111111111111, "grad_norm": 1.5762044191360474, "learning_rate": 0.00016930604982206407, "loss": 2.7452, "step": 1735 }, { "epoch": 0.7715555555555556, "grad_norm": 1.6823639869689941, "learning_rate": 0.000169288256227758, "loss": 2.706, "step": 1736 }, { "epoch": 0.772, "grad_norm": 2.066340446472168, "learning_rate": 0.00016927046263345196, "loss": 2.8768, "step": 1737 }, { "epoch": 0.7724444444444445, "grad_norm": 1.7574570178985596, "learning_rate": 0.00016925266903914591, "loss": 2.9213, "step": 1738 }, { "epoch": 0.7728888888888888, "grad_norm": 1.5815603733062744, "learning_rate": 0.00016923487544483987, "loss": 2.7557, "step": 1739 }, { "epoch": 0.7733333333333333, "grad_norm": 2.00929594039917, "learning_rate": 0.00016921708185053383, "loss": 2.828, "step": 1740 }, { "epoch": 0.7737777777777778, "grad_norm": 1.5500198602676392, "learning_rate": 0.00016919928825622778, "loss": 2.8645, "step": 1741 }, { "epoch": 0.7742222222222223, "grad_norm": 1.5847936868667603, "learning_rate": 0.0001691814946619217, "loss": 2.4433, "step": 1742 }, { "epoch": 0.7746666666666666, "grad_norm": 1.5422377586364746, "learning_rate": 0.00016916370106761567, "loss": 2.7777, "step": 1743 }, { "epoch": 0.7751111111111111, "grad_norm": 1.4462459087371826, "learning_rate": 0.0001691459074733096, "loss": 2.3321, "step": 1744 }, { "epoch": 0.7755555555555556, "grad_norm": 2.3052868843078613, "learning_rate": 0.00016912811387900356, "loss": 2.3839, "step": 1745 }, { "epoch": 0.776, "grad_norm": 1.589308261871338, "learning_rate": 0.0001691103202846975, "loss": 2.2261, "step": 1746 }, { "epoch": 0.7764444444444445, "grad_norm": 1.5655573606491089, "learning_rate": 0.00016909252669039147, "loss": 2.3206, "step": 1747 }, { "epoch": 0.7768888888888889, "grad_norm": 1.4442392587661743, "learning_rate": 0.00016907473309608542, "loss": 1.8804, "step": 1748 }, { "epoch": 0.7773333333333333, "grad_norm": 2.0022120475769043, "learning_rate": 0.00016905693950177935, "loss": 3.0496, "step": 1749 }, { "epoch": 0.7777777777777778, "grad_norm": 2.1214585304260254, "learning_rate": 0.0001690391459074733, "loss": 3.1286, "step": 1750 }, { "epoch": 0.7782222222222223, "grad_norm": 1.1207834482192993, "learning_rate": 0.00016902135231316727, "loss": 1.5612, "step": 1751 }, { "epoch": 0.7786666666666666, "grad_norm": 1.0428500175476074, "learning_rate": 0.00016900355871886122, "loss": 2.3335, "step": 1752 }, { "epoch": 0.7791111111111111, "grad_norm": 1.0575370788574219, "learning_rate": 0.00016898576512455518, "loss": 1.4655, "step": 1753 }, { "epoch": 0.7795555555555556, "grad_norm": 1.2163389921188354, "learning_rate": 0.00016896797153024914, "loss": 2.3588, "step": 1754 }, { "epoch": 0.78, "grad_norm": 1.481124997138977, "learning_rate": 0.00016895017793594307, "loss": 2.8144, "step": 1755 }, { "epoch": 0.7804444444444445, "grad_norm": 1.3335241079330444, "learning_rate": 0.00016893238434163702, "loss": 3.0533, "step": 1756 }, { "epoch": 0.7808888888888889, "grad_norm": 1.2501187324523926, "learning_rate": 0.00016891459074733095, "loss": 2.7067, "step": 1757 }, { "epoch": 0.7813333333333333, "grad_norm": 1.2316926717758179, "learning_rate": 0.0001688967971530249, "loss": 3.0781, "step": 1758 }, { "epoch": 0.7817777777777778, "grad_norm": 1.4136369228363037, "learning_rate": 0.00016887900355871886, "loss": 2.4057, "step": 1759 }, { "epoch": 0.7822222222222223, "grad_norm": 1.2159130573272705, "learning_rate": 0.00016886120996441282, "loss": 2.3195, "step": 1760 }, { "epoch": 0.7826666666666666, "grad_norm": 1.3189160823822021, "learning_rate": 0.00016884341637010678, "loss": 2.6697, "step": 1761 }, { "epoch": 0.7831111111111111, "grad_norm": 1.2556674480438232, "learning_rate": 0.0001688256227758007, "loss": 2.4658, "step": 1762 }, { "epoch": 0.7835555555555556, "grad_norm": 1.4505153894424438, "learning_rate": 0.00016880782918149466, "loss": 2.4346, "step": 1763 }, { "epoch": 0.784, "grad_norm": 1.3776673078536987, "learning_rate": 0.00016879003558718862, "loss": 2.2668, "step": 1764 }, { "epoch": 0.7844444444444445, "grad_norm": 1.7608091831207275, "learning_rate": 0.00016877224199288258, "loss": 1.6074, "step": 1765 }, { "epoch": 0.7848888888888889, "grad_norm": 1.6191914081573486, "learning_rate": 0.00016875444839857653, "loss": 2.3601, "step": 1766 }, { "epoch": 0.7853333333333333, "grad_norm": 1.4360511302947998, "learning_rate": 0.0001687366548042705, "loss": 2.5914, "step": 1767 }, { "epoch": 0.7857777777777778, "grad_norm": 1.3812462091445923, "learning_rate": 0.00016871886120996442, "loss": 2.5718, "step": 1768 }, { "epoch": 0.7862222222222223, "grad_norm": 1.522873878479004, "learning_rate": 0.00016870106761565838, "loss": 2.7388, "step": 1769 }, { "epoch": 0.7866666666666666, "grad_norm": 1.7307029962539673, "learning_rate": 0.0001686832740213523, "loss": 2.0664, "step": 1770 }, { "epoch": 0.7871111111111111, "grad_norm": 1.383586049079895, "learning_rate": 0.00016866548042704626, "loss": 2.7238, "step": 1771 }, { "epoch": 0.7875555555555556, "grad_norm": 1.6344101428985596, "learning_rate": 0.00016864768683274022, "loss": 3.0245, "step": 1772 }, { "epoch": 0.788, "grad_norm": 1.4926892518997192, "learning_rate": 0.00016862989323843417, "loss": 2.4893, "step": 1773 }, { "epoch": 0.7884444444444444, "grad_norm": 1.7372292280197144, "learning_rate": 0.00016861209964412813, "loss": 2.9537, "step": 1774 }, { "epoch": 0.7888888888888889, "grad_norm": 1.3594554662704468, "learning_rate": 0.00016859430604982206, "loss": 1.9789, "step": 1775 }, { "epoch": 0.7893333333333333, "grad_norm": 1.3864822387695312, "learning_rate": 0.00016857651245551602, "loss": 2.585, "step": 1776 }, { "epoch": 0.7897777777777778, "grad_norm": 1.5721726417541504, "learning_rate": 0.00016855871886120997, "loss": 2.8356, "step": 1777 }, { "epoch": 0.7902222222222223, "grad_norm": 1.3060152530670166, "learning_rate": 0.00016854092526690393, "loss": 2.2237, "step": 1778 }, { "epoch": 0.7906666666666666, "grad_norm": 1.349345326423645, "learning_rate": 0.0001685231316725979, "loss": 2.3712, "step": 1779 }, { "epoch": 0.7911111111111111, "grad_norm": 1.4352922439575195, "learning_rate": 0.00016850533807829184, "loss": 1.6791, "step": 1780 }, { "epoch": 0.7915555555555556, "grad_norm": 1.5430026054382324, "learning_rate": 0.00016848754448398577, "loss": 2.7579, "step": 1781 }, { "epoch": 0.792, "grad_norm": 1.7820968627929688, "learning_rate": 0.00016846975088967973, "loss": 3.1443, "step": 1782 }, { "epoch": 0.7924444444444444, "grad_norm": 1.3988341093063354, "learning_rate": 0.00016845195729537366, "loss": 1.8977, "step": 1783 }, { "epoch": 0.7928888888888889, "grad_norm": 1.705902338027954, "learning_rate": 0.00016843416370106761, "loss": 2.795, "step": 1784 }, { "epoch": 0.7933333333333333, "grad_norm": 1.6256381273269653, "learning_rate": 0.00016841637010676157, "loss": 2.6093, "step": 1785 }, { "epoch": 0.7937777777777778, "grad_norm": 1.6664106845855713, "learning_rate": 0.00016839857651245553, "loss": 2.5954, "step": 1786 }, { "epoch": 0.7942222222222223, "grad_norm": 1.5220837593078613, "learning_rate": 0.00016838078291814948, "loss": 2.4056, "step": 1787 }, { "epoch": 0.7946666666666666, "grad_norm": 2.4615044593811035, "learning_rate": 0.00016836298932384341, "loss": 3.1873, "step": 1788 }, { "epoch": 0.7951111111111111, "grad_norm": 1.54017174243927, "learning_rate": 0.00016834519572953737, "loss": 2.5285, "step": 1789 }, { "epoch": 0.7955555555555556, "grad_norm": 1.5758004188537598, "learning_rate": 0.00016832740213523133, "loss": 2.4812, "step": 1790 }, { "epoch": 0.796, "grad_norm": 2.027078866958618, "learning_rate": 0.00016830960854092528, "loss": 2.5266, "step": 1791 }, { "epoch": 0.7964444444444444, "grad_norm": 1.9708378314971924, "learning_rate": 0.00016829181494661924, "loss": 2.7944, "step": 1792 }, { "epoch": 0.7968888888888889, "grad_norm": 1.4897676706314087, "learning_rate": 0.00016827402135231317, "loss": 2.1396, "step": 1793 }, { "epoch": 0.7973333333333333, "grad_norm": 1.5909985303878784, "learning_rate": 0.00016825622775800713, "loss": 2.1982, "step": 1794 }, { "epoch": 0.7977777777777778, "grad_norm": 1.780980110168457, "learning_rate": 0.00016823843416370108, "loss": 2.836, "step": 1795 }, { "epoch": 0.7982222222222223, "grad_norm": 1.9581319093704224, "learning_rate": 0.000168220640569395, "loss": 3.0858, "step": 1796 }, { "epoch": 0.7986666666666666, "grad_norm": 2.094820737838745, "learning_rate": 0.00016820284697508897, "loss": 3.2028, "step": 1797 }, { "epoch": 0.7991111111111111, "grad_norm": 2.108332872390747, "learning_rate": 0.00016818505338078292, "loss": 2.1878, "step": 1798 }, { "epoch": 0.7995555555555556, "grad_norm": 1.941762089729309, "learning_rate": 0.00016816725978647688, "loss": 2.6336, "step": 1799 }, { "epoch": 0.8, "grad_norm": 3.6256611347198486, "learning_rate": 0.0001681494661921708, "loss": 1.605, "step": 1800 }, { "epoch": 0.8004444444444444, "grad_norm": 0.9606013894081116, "learning_rate": 0.00016813167259786477, "loss": 2.5109, "step": 1801 }, { "epoch": 0.8008888888888889, "grad_norm": 1.1120461225509644, "learning_rate": 0.00016811387900355872, "loss": 3.0039, "step": 1802 }, { "epoch": 0.8013333333333333, "grad_norm": 0.9408321976661682, "learning_rate": 0.00016809608540925268, "loss": 2.1882, "step": 1803 }, { "epoch": 0.8017777777777778, "grad_norm": 1.210302472114563, "learning_rate": 0.00016807829181494664, "loss": 2.4686, "step": 1804 }, { "epoch": 0.8022222222222222, "grad_norm": 1.1656526327133179, "learning_rate": 0.0001680604982206406, "loss": 2.546, "step": 1805 }, { "epoch": 0.8026666666666666, "grad_norm": 1.2500494718551636, "learning_rate": 0.00016804270462633452, "loss": 2.7287, "step": 1806 }, { "epoch": 0.8031111111111111, "grad_norm": 1.1803468465805054, "learning_rate": 0.00016802491103202848, "loss": 2.7445, "step": 1807 }, { "epoch": 0.8035555555555556, "grad_norm": 1.7277123928070068, "learning_rate": 0.00016800711743772244, "loss": 1.3246, "step": 1808 }, { "epoch": 0.804, "grad_norm": 1.6080886125564575, "learning_rate": 0.00016798932384341636, "loss": 2.639, "step": 1809 }, { "epoch": 0.8044444444444444, "grad_norm": 1.1558185815811157, "learning_rate": 0.00016797153024911032, "loss": 2.5563, "step": 1810 }, { "epoch": 0.8048888888888889, "grad_norm": 1.3278673887252808, "learning_rate": 0.00016795373665480428, "loss": 2.1776, "step": 1811 }, { "epoch": 0.8053333333333333, "grad_norm": 1.2279390096664429, "learning_rate": 0.00016793594306049823, "loss": 2.5615, "step": 1812 }, { "epoch": 0.8057777777777778, "grad_norm": 1.3550021648406982, "learning_rate": 0.00016791814946619216, "loss": 2.432, "step": 1813 }, { "epoch": 0.8062222222222222, "grad_norm": 1.2661508321762085, "learning_rate": 0.00016790035587188612, "loss": 2.4724, "step": 1814 }, { "epoch": 0.8066666666666666, "grad_norm": 1.5162663459777832, "learning_rate": 0.00016788256227758008, "loss": 2.7064, "step": 1815 }, { "epoch": 0.8071111111111111, "grad_norm": 1.526525616645813, "learning_rate": 0.00016786476868327403, "loss": 2.7493, "step": 1816 }, { "epoch": 0.8075555555555556, "grad_norm": 1.349353551864624, "learning_rate": 0.000167846975088968, "loss": 2.175, "step": 1817 }, { "epoch": 0.808, "grad_norm": 1.518172025680542, "learning_rate": 0.00016782918149466195, "loss": 2.9974, "step": 1818 }, { "epoch": 0.8084444444444444, "grad_norm": 1.3705731630325317, "learning_rate": 0.00016781138790035588, "loss": 2.691, "step": 1819 }, { "epoch": 0.8088888888888889, "grad_norm": 1.4127305746078491, "learning_rate": 0.00016779359430604983, "loss": 2.6794, "step": 1820 }, { "epoch": 0.8093333333333333, "grad_norm": 1.3548202514648438, "learning_rate": 0.0001677758007117438, "loss": 2.3797, "step": 1821 }, { "epoch": 0.8097777777777778, "grad_norm": 2.2642502784729004, "learning_rate": 0.00016775800711743772, "loss": 2.1909, "step": 1822 }, { "epoch": 0.8102222222222222, "grad_norm": 1.6806228160858154, "learning_rate": 0.00016774021352313167, "loss": 2.8928, "step": 1823 }, { "epoch": 0.8106666666666666, "grad_norm": 1.296937346458435, "learning_rate": 0.00016772241992882563, "loss": 2.0109, "step": 1824 }, { "epoch": 0.8111111111111111, "grad_norm": 1.387312650680542, "learning_rate": 0.0001677046263345196, "loss": 2.2319, "step": 1825 }, { "epoch": 0.8115555555555556, "grad_norm": 1.6626933813095093, "learning_rate": 0.00016768683274021352, "loss": 2.6542, "step": 1826 }, { "epoch": 0.812, "grad_norm": 1.658470869064331, "learning_rate": 0.00016766903914590747, "loss": 2.3784, "step": 1827 }, { "epoch": 0.8124444444444444, "grad_norm": 1.5517417192459106, "learning_rate": 0.00016765124555160143, "loss": 2.6614, "step": 1828 }, { "epoch": 0.8128888888888889, "grad_norm": 1.6006346940994263, "learning_rate": 0.00016763345195729539, "loss": 3.0233, "step": 1829 }, { "epoch": 0.8133333333333334, "grad_norm": 1.503570795059204, "learning_rate": 0.00016761565836298934, "loss": 2.2458, "step": 1830 }, { "epoch": 0.8137777777777778, "grad_norm": 1.9097468852996826, "learning_rate": 0.0001675978647686833, "loss": 2.6981, "step": 1831 }, { "epoch": 0.8142222222222222, "grad_norm": 1.504949927330017, "learning_rate": 0.00016758007117437723, "loss": 2.8705, "step": 1832 }, { "epoch": 0.8146666666666667, "grad_norm": 1.7333391904830933, "learning_rate": 0.00016756227758007119, "loss": 2.3296, "step": 1833 }, { "epoch": 0.8151111111111111, "grad_norm": 4.087153434753418, "learning_rate": 0.00016754448398576511, "loss": 0.2551, "step": 1834 }, { "epoch": 0.8155555555555556, "grad_norm": 1.6669485569000244, "learning_rate": 0.00016752669039145907, "loss": 2.2058, "step": 1835 }, { "epoch": 0.816, "grad_norm": 1.4292755126953125, "learning_rate": 0.00016750889679715303, "loss": 2.2523, "step": 1836 }, { "epoch": 0.8164444444444444, "grad_norm": 1.538756251335144, "learning_rate": 0.00016749110320284698, "loss": 2.3624, "step": 1837 }, { "epoch": 0.8168888888888889, "grad_norm": 1.5922455787658691, "learning_rate": 0.00016747330960854094, "loss": 2.6137, "step": 1838 }, { "epoch": 0.8173333333333334, "grad_norm": 1.5305728912353516, "learning_rate": 0.00016745551601423487, "loss": 2.1719, "step": 1839 }, { "epoch": 0.8177777777777778, "grad_norm": 2.139403820037842, "learning_rate": 0.00016743772241992883, "loss": 3.0687, "step": 1840 }, { "epoch": 0.8182222222222222, "grad_norm": 1.7138938903808594, "learning_rate": 0.00016741992882562278, "loss": 2.684, "step": 1841 }, { "epoch": 0.8186666666666667, "grad_norm": 2.1638147830963135, "learning_rate": 0.00016740213523131674, "loss": 2.8127, "step": 1842 }, { "epoch": 0.8191111111111111, "grad_norm": 2.317457914352417, "learning_rate": 0.0001673843416370107, "loss": 2.7219, "step": 1843 }, { "epoch": 0.8195555555555556, "grad_norm": 1.7413108348846436, "learning_rate": 0.00016736654804270465, "loss": 2.6101, "step": 1844 }, { "epoch": 0.82, "grad_norm": 2.0976204872131348, "learning_rate": 0.00016734875444839858, "loss": 2.5319, "step": 1845 }, { "epoch": 0.8204444444444444, "grad_norm": 2.517296075820923, "learning_rate": 0.00016733096085409254, "loss": 3.3618, "step": 1846 }, { "epoch": 0.8208888888888889, "grad_norm": 2.0704150199890137, "learning_rate": 0.00016731316725978647, "loss": 2.4757, "step": 1847 }, { "epoch": 0.8213333333333334, "grad_norm": 2.702256917953491, "learning_rate": 0.00016729537366548042, "loss": 0.1476, "step": 1848 }, { "epoch": 0.8217777777777778, "grad_norm": 1.8367481231689453, "learning_rate": 0.00016727758007117438, "loss": 1.8312, "step": 1849 }, { "epoch": 0.8222222222222222, "grad_norm": 1.943381428718567, "learning_rate": 0.00016725978647686834, "loss": 0.9415, "step": 1850 }, { "epoch": 0.8226666666666667, "grad_norm": 1.0285029411315918, "learning_rate": 0.0001672419928825623, "loss": 2.5791, "step": 1851 }, { "epoch": 0.8231111111111111, "grad_norm": 1.0174617767333984, "learning_rate": 0.00016722419928825622, "loss": 2.4917, "step": 1852 }, { "epoch": 0.8235555555555556, "grad_norm": 1.1286165714263916, "learning_rate": 0.00016720640569395018, "loss": 2.7286, "step": 1853 }, { "epoch": 0.824, "grad_norm": 1.2801088094711304, "learning_rate": 0.00016718861209964414, "loss": 2.2644, "step": 1854 }, { "epoch": 0.8244444444444444, "grad_norm": 1.3122279644012451, "learning_rate": 0.0001671708185053381, "loss": 1.8628, "step": 1855 }, { "epoch": 0.8248888888888889, "grad_norm": 1.3034883737564087, "learning_rate": 0.00016715302491103205, "loss": 2.8352, "step": 1856 }, { "epoch": 0.8253333333333334, "grad_norm": 1.3606019020080566, "learning_rate": 0.000167135231316726, "loss": 3.1646, "step": 1857 }, { "epoch": 0.8257777777777778, "grad_norm": 1.503804087638855, "learning_rate": 0.00016711743772241993, "loss": 2.3705, "step": 1858 }, { "epoch": 0.8262222222222222, "grad_norm": 1.2869994640350342, "learning_rate": 0.0001670996441281139, "loss": 2.587, "step": 1859 }, { "epoch": 0.8266666666666667, "grad_norm": 1.376165747642517, "learning_rate": 0.00016708185053380782, "loss": 2.2356, "step": 1860 }, { "epoch": 0.8271111111111111, "grad_norm": 1.1255593299865723, "learning_rate": 0.00016706405693950178, "loss": 2.1244, "step": 1861 }, { "epoch": 0.8275555555555556, "grad_norm": 1.390615701675415, "learning_rate": 0.00016704626334519573, "loss": 2.6707, "step": 1862 }, { "epoch": 0.828, "grad_norm": 1.195713996887207, "learning_rate": 0.0001670284697508897, "loss": 2.1111, "step": 1863 }, { "epoch": 0.8284444444444444, "grad_norm": 1.275231957435608, "learning_rate": 0.00016701067615658365, "loss": 2.8402, "step": 1864 }, { "epoch": 0.8288888888888889, "grad_norm": 1.294203758239746, "learning_rate": 0.00016699288256227758, "loss": 2.9545, "step": 1865 }, { "epoch": 0.8293333333333334, "grad_norm": 1.4004563093185425, "learning_rate": 0.00016697508896797153, "loss": 2.9775, "step": 1866 }, { "epoch": 0.8297777777777777, "grad_norm": 1.3818445205688477, "learning_rate": 0.0001669572953736655, "loss": 2.4705, "step": 1867 }, { "epoch": 0.8302222222222222, "grad_norm": 1.381339192390442, "learning_rate": 0.00016693950177935945, "loss": 2.0993, "step": 1868 }, { "epoch": 0.8306666666666667, "grad_norm": 1.2843163013458252, "learning_rate": 0.0001669217081850534, "loss": 2.7269, "step": 1869 }, { "epoch": 0.8311111111111111, "grad_norm": 1.5089333057403564, "learning_rate": 0.00016690391459074736, "loss": 2.442, "step": 1870 }, { "epoch": 0.8315555555555556, "grad_norm": 1.5830549001693726, "learning_rate": 0.0001668861209964413, "loss": 2.5014, "step": 1871 }, { "epoch": 0.832, "grad_norm": 1.250998616218567, "learning_rate": 0.00016686832740213524, "loss": 2.3065, "step": 1872 }, { "epoch": 0.8324444444444444, "grad_norm": 1.5048855543136597, "learning_rate": 0.00016685053380782917, "loss": 2.3963, "step": 1873 }, { "epoch": 0.8328888888888889, "grad_norm": 1.3290457725524902, "learning_rate": 0.00016683274021352313, "loss": 2.3719, "step": 1874 }, { "epoch": 0.8333333333333334, "grad_norm": 1.4442025423049927, "learning_rate": 0.0001668149466192171, "loss": 2.5401, "step": 1875 }, { "epoch": 0.8337777777777777, "grad_norm": 1.5543490648269653, "learning_rate": 0.00016679715302491104, "loss": 2.6803, "step": 1876 }, { "epoch": 0.8342222222222222, "grad_norm": 1.3237760066986084, "learning_rate": 0.000166779359430605, "loss": 2.4261, "step": 1877 }, { "epoch": 0.8346666666666667, "grad_norm": 1.4572439193725586, "learning_rate": 0.00016676156583629893, "loss": 2.0907, "step": 1878 }, { "epoch": 0.8351111111111111, "grad_norm": 1.3316566944122314, "learning_rate": 0.00016674377224199289, "loss": 1.099, "step": 1879 }, { "epoch": 0.8355555555555556, "grad_norm": 1.640336513519287, "learning_rate": 0.00016672597864768684, "loss": 3.0017, "step": 1880 }, { "epoch": 0.836, "grad_norm": 1.6514651775360107, "learning_rate": 0.0001667081850533808, "loss": 2.6943, "step": 1881 }, { "epoch": 0.8364444444444444, "grad_norm": 1.7922955751419067, "learning_rate": 0.00016669039145907476, "loss": 3.0074, "step": 1882 }, { "epoch": 0.8368888888888889, "grad_norm": 1.293692708015442, "learning_rate": 0.00016667259786476868, "loss": 2.0004, "step": 1883 }, { "epoch": 0.8373333333333334, "grad_norm": 1.6649595499038696, "learning_rate": 0.00016665480427046264, "loss": 2.8518, "step": 1884 }, { "epoch": 0.8377777777777777, "grad_norm": 1.55930757522583, "learning_rate": 0.0001666370106761566, "loss": 2.5283, "step": 1885 }, { "epoch": 0.8382222222222222, "grad_norm": 1.808152198791504, "learning_rate": 0.00016661921708185053, "loss": 3.0004, "step": 1886 }, { "epoch": 0.8386666666666667, "grad_norm": 2.0772571563720703, "learning_rate": 0.00016660142348754448, "loss": 2.2274, "step": 1887 }, { "epoch": 0.8391111111111111, "grad_norm": 1.5994224548339844, "learning_rate": 0.00016658362989323844, "loss": 2.212, "step": 1888 }, { "epoch": 0.8395555555555556, "grad_norm": 1.7321178913116455, "learning_rate": 0.0001665658362989324, "loss": 2.4917, "step": 1889 }, { "epoch": 0.84, "grad_norm": 1.8322235345840454, "learning_rate": 0.00016654804270462633, "loss": 3.1259, "step": 1890 }, { "epoch": 0.8404444444444444, "grad_norm": 1.6857374906539917, "learning_rate": 0.00016653024911032028, "loss": 2.81, "step": 1891 }, { "epoch": 0.8408888888888889, "grad_norm": 1.6723679304122925, "learning_rate": 0.00016651245551601424, "loss": 2.4271, "step": 1892 }, { "epoch": 0.8413333333333334, "grad_norm": 1.2392948865890503, "learning_rate": 0.0001664946619217082, "loss": 1.6478, "step": 1893 }, { "epoch": 0.8417777777777777, "grad_norm": 1.6006635427474976, "learning_rate": 0.00016647686832740215, "loss": 2.7513, "step": 1894 }, { "epoch": 0.8422222222222222, "grad_norm": 1.623317003250122, "learning_rate": 0.0001664590747330961, "loss": 2.3341, "step": 1895 }, { "epoch": 0.8426666666666667, "grad_norm": 1.8542609214782715, "learning_rate": 0.00016644128113879004, "loss": 2.7884, "step": 1896 }, { "epoch": 0.8431111111111111, "grad_norm": 1.8695321083068848, "learning_rate": 0.000166423487544484, "loss": 2.3178, "step": 1897 }, { "epoch": 0.8435555555555555, "grad_norm": 1.759669303894043, "learning_rate": 0.00016640569395017795, "loss": 2.8503, "step": 1898 }, { "epoch": 0.844, "grad_norm": 2.0336124897003174, "learning_rate": 0.00016638790035587188, "loss": 2.4729, "step": 1899 }, { "epoch": 0.8444444444444444, "grad_norm": 2.0583572387695312, "learning_rate": 0.00016637010676156584, "loss": 2.7253, "step": 1900 }, { "epoch": 0.8448888888888889, "grad_norm": 0.9357160329818726, "learning_rate": 0.0001663523131672598, "loss": 1.1275, "step": 1901 }, { "epoch": 0.8453333333333334, "grad_norm": 1.1076958179473877, "learning_rate": 0.00016633451957295375, "loss": 2.4995, "step": 1902 }, { "epoch": 0.8457777777777777, "grad_norm": 1.0276774168014526, "learning_rate": 0.00016631672597864768, "loss": 2.427, "step": 1903 }, { "epoch": 0.8462222222222222, "grad_norm": 1.1579983234405518, "learning_rate": 0.00016629893238434164, "loss": 2.2964, "step": 1904 }, { "epoch": 0.8466666666666667, "grad_norm": 1.100016474723816, "learning_rate": 0.0001662811387900356, "loss": 2.4451, "step": 1905 }, { "epoch": 0.8471111111111111, "grad_norm": 1.2550407648086548, "learning_rate": 0.00016626334519572955, "loss": 2.985, "step": 1906 }, { "epoch": 0.8475555555555555, "grad_norm": 1.1398979425430298, "learning_rate": 0.0001662455516014235, "loss": 2.3818, "step": 1907 }, { "epoch": 0.848, "grad_norm": 1.3666727542877197, "learning_rate": 0.00016622775800711746, "loss": 2.3968, "step": 1908 }, { "epoch": 0.8484444444444444, "grad_norm": 1.4796351194381714, "learning_rate": 0.0001662099644128114, "loss": 1.2558, "step": 1909 }, { "epoch": 0.8488888888888889, "grad_norm": 1.3779754638671875, "learning_rate": 0.00016619217081850535, "loss": 3.1081, "step": 1910 }, { "epoch": 0.8493333333333334, "grad_norm": 1.4465447664260864, "learning_rate": 0.0001661743772241993, "loss": 2.4524, "step": 1911 }, { "epoch": 0.8497777777777777, "grad_norm": 1.3205504417419434, "learning_rate": 0.00016615658362989323, "loss": 2.7317, "step": 1912 }, { "epoch": 0.8502222222222222, "grad_norm": 1.286799669265747, "learning_rate": 0.0001661387900355872, "loss": 2.2615, "step": 1913 }, { "epoch": 0.8506666666666667, "grad_norm": 1.2320808172225952, "learning_rate": 0.00016612099644128115, "loss": 2.5005, "step": 1914 }, { "epoch": 0.8511111111111112, "grad_norm": 1.5686407089233398, "learning_rate": 0.0001661032028469751, "loss": 2.3068, "step": 1915 }, { "epoch": 0.8515555555555555, "grad_norm": 2.0578267574310303, "learning_rate": 0.00016608540925266903, "loss": 1.2472, "step": 1916 }, { "epoch": 0.852, "grad_norm": 1.3643602132797241, "learning_rate": 0.000166067615658363, "loss": 2.6433, "step": 1917 }, { "epoch": 0.8524444444444444, "grad_norm": 1.3753222227096558, "learning_rate": 0.00016604982206405695, "loss": 2.4551, "step": 1918 }, { "epoch": 0.8528888888888889, "grad_norm": 1.2894665002822876, "learning_rate": 0.0001660320284697509, "loss": 1.7715, "step": 1919 }, { "epoch": 0.8533333333333334, "grad_norm": 1.381399393081665, "learning_rate": 0.00016601423487544486, "loss": 2.5613, "step": 1920 }, { "epoch": 0.8537777777777777, "grad_norm": 1.5483062267303467, "learning_rate": 0.00016599644128113881, "loss": 2.9712, "step": 1921 }, { "epoch": 0.8542222222222222, "grad_norm": 1.481539011001587, "learning_rate": 0.00016597864768683274, "loss": 2.4221, "step": 1922 }, { "epoch": 0.8546666666666667, "grad_norm": 1.316893458366394, "learning_rate": 0.0001659608540925267, "loss": 2.0611, "step": 1923 }, { "epoch": 0.8551111111111112, "grad_norm": 1.398386836051941, "learning_rate": 0.00016594306049822066, "loss": 2.1331, "step": 1924 }, { "epoch": 0.8555555555555555, "grad_norm": 1.521558165550232, "learning_rate": 0.0001659252669039146, "loss": 2.6778, "step": 1925 }, { "epoch": 0.856, "grad_norm": 1.4601832628250122, "learning_rate": 0.00016590747330960854, "loss": 2.2369, "step": 1926 }, { "epoch": 0.8564444444444445, "grad_norm": 1.6191009283065796, "learning_rate": 0.0001658896797153025, "loss": 2.6541, "step": 1927 }, { "epoch": 0.8568888888888889, "grad_norm": 1.5223315954208374, "learning_rate": 0.00016587188612099646, "loss": 2.3937, "step": 1928 }, { "epoch": 0.8573333333333333, "grad_norm": 1.2509064674377441, "learning_rate": 0.00016585409252669039, "loss": 1.227, "step": 1929 }, { "epoch": 0.8577777777777778, "grad_norm": 1.5808355808258057, "learning_rate": 0.00016583629893238434, "loss": 2.3527, "step": 1930 }, { "epoch": 0.8582222222222222, "grad_norm": 1.5534776449203491, "learning_rate": 0.0001658185053380783, "loss": 2.8214, "step": 1931 }, { "epoch": 0.8586666666666667, "grad_norm": 1.675430178642273, "learning_rate": 0.00016580071174377225, "loss": 2.6934, "step": 1932 }, { "epoch": 0.8591111111111112, "grad_norm": 1.638614296913147, "learning_rate": 0.0001657829181494662, "loss": 2.6275, "step": 1933 }, { "epoch": 0.8595555555555555, "grad_norm": 1.4431530237197876, "learning_rate": 0.00016576512455516017, "loss": 2.4221, "step": 1934 }, { "epoch": 0.86, "grad_norm": 1.532965898513794, "learning_rate": 0.0001657473309608541, "loss": 2.1414, "step": 1935 }, { "epoch": 0.8604444444444445, "grad_norm": 1.8967840671539307, "learning_rate": 0.00016572953736654805, "loss": 2.2568, "step": 1936 }, { "epoch": 0.8608888888888889, "grad_norm": 1.6051034927368164, "learning_rate": 0.000165711743772242, "loss": 2.6386, "step": 1937 }, { "epoch": 0.8613333333333333, "grad_norm": 1.7966537475585938, "learning_rate": 0.00016569395017793594, "loss": 2.1053, "step": 1938 }, { "epoch": 0.8617777777777778, "grad_norm": 1.6629730463027954, "learning_rate": 0.0001656761565836299, "loss": 2.7147, "step": 1939 }, { "epoch": 0.8622222222222222, "grad_norm": 1.5632649660110474, "learning_rate": 0.00016565836298932385, "loss": 2.3997, "step": 1940 }, { "epoch": 0.8626666666666667, "grad_norm": 1.827580451965332, "learning_rate": 0.0001656405693950178, "loss": 2.118, "step": 1941 }, { "epoch": 0.8631111111111112, "grad_norm": 1.7074384689331055, "learning_rate": 0.00016562277580071174, "loss": 2.1276, "step": 1942 }, { "epoch": 0.8635555555555555, "grad_norm": 1.6580160856246948, "learning_rate": 0.0001656049822064057, "loss": 2.0692, "step": 1943 }, { "epoch": 0.864, "grad_norm": 2.2901294231414795, "learning_rate": 0.00016558718861209965, "loss": 2.8533, "step": 1944 }, { "epoch": 0.8644444444444445, "grad_norm": 1.9296009540557861, "learning_rate": 0.0001655693950177936, "loss": 2.5001, "step": 1945 }, { "epoch": 0.8648888888888889, "grad_norm": 1.7774969339370728, "learning_rate": 0.00016555160142348756, "loss": 3.0877, "step": 1946 }, { "epoch": 0.8653333333333333, "grad_norm": 1.7558300495147705, "learning_rate": 0.00016553380782918152, "loss": 2.1556, "step": 1947 }, { "epoch": 0.8657777777777778, "grad_norm": 1.870627760887146, "learning_rate": 0.00016551601423487545, "loss": 1.3417, "step": 1948 }, { "epoch": 0.8662222222222222, "grad_norm": 1.396712064743042, "learning_rate": 0.0001654982206405694, "loss": 1.0483, "step": 1949 }, { "epoch": 0.8666666666666667, "grad_norm": 1.8367741107940674, "learning_rate": 0.00016548042704626334, "loss": 2.2391, "step": 1950 }, { "epoch": 0.8671111111111112, "grad_norm": 1.0035446882247925, "learning_rate": 0.0001654626334519573, "loss": 2.7096, "step": 1951 }, { "epoch": 0.8675555555555555, "grad_norm": 1.2445075511932373, "learning_rate": 0.00016544483985765125, "loss": 2.5628, "step": 1952 }, { "epoch": 0.868, "grad_norm": 1.1393455266952515, "learning_rate": 0.0001654270462633452, "loss": 2.8044, "step": 1953 }, { "epoch": 0.8684444444444445, "grad_norm": 1.042448878288269, "learning_rate": 0.00016540925266903916, "loss": 2.183, "step": 1954 }, { "epoch": 0.8688888888888889, "grad_norm": 1.0702394247055054, "learning_rate": 0.0001653914590747331, "loss": 2.544, "step": 1955 }, { "epoch": 0.8693333333333333, "grad_norm": 1.1012481451034546, "learning_rate": 0.00016537366548042705, "loss": 2.4327, "step": 1956 }, { "epoch": 0.8697777777777778, "grad_norm": 1.0068960189819336, "learning_rate": 0.000165355871886121, "loss": 2.1235, "step": 1957 }, { "epoch": 0.8702222222222222, "grad_norm": 1.271165370941162, "learning_rate": 0.00016533807829181496, "loss": 2.3766, "step": 1958 }, { "epoch": 0.8706666666666667, "grad_norm": 1.2693040370941162, "learning_rate": 0.00016532028469750892, "loss": 2.355, "step": 1959 }, { "epoch": 0.8711111111111111, "grad_norm": 1.28933846950531, "learning_rate": 0.00016530249110320287, "loss": 2.1452, "step": 1960 }, { "epoch": 0.8715555555555555, "grad_norm": 1.3570892810821533, "learning_rate": 0.0001652846975088968, "loss": 2.8351, "step": 1961 }, { "epoch": 0.872, "grad_norm": 1.373550534248352, "learning_rate": 0.00016526690391459076, "loss": 2.1488, "step": 1962 }, { "epoch": 0.8724444444444445, "grad_norm": 1.1201701164245605, "learning_rate": 0.0001652491103202847, "loss": 2.1829, "step": 1963 }, { "epoch": 0.8728888888888889, "grad_norm": 1.28224778175354, "learning_rate": 0.00016523131672597865, "loss": 2.6936, "step": 1964 }, { "epoch": 0.8733333333333333, "grad_norm": 1.540144443511963, "learning_rate": 0.0001652135231316726, "loss": 2.5752, "step": 1965 }, { "epoch": 0.8737777777777778, "grad_norm": 1.3699182271957397, "learning_rate": 0.00016519572953736656, "loss": 2.5122, "step": 1966 }, { "epoch": 0.8742222222222222, "grad_norm": 1.5557972192764282, "learning_rate": 0.00016517793594306052, "loss": 2.2133, "step": 1967 }, { "epoch": 0.8746666666666667, "grad_norm": 1.4491480588912964, "learning_rate": 0.00016516014234875444, "loss": 2.4586, "step": 1968 }, { "epoch": 0.8751111111111111, "grad_norm": 1.3533332347869873, "learning_rate": 0.0001651423487544484, "loss": 2.1343, "step": 1969 }, { "epoch": 0.8755555555555555, "grad_norm": 1.8715101480484009, "learning_rate": 0.00016512455516014236, "loss": 2.5727, "step": 1970 }, { "epoch": 0.876, "grad_norm": 1.507683515548706, "learning_rate": 0.00016510676156583631, "loss": 1.8529, "step": 1971 }, { "epoch": 0.8764444444444445, "grad_norm": 1.381305456161499, "learning_rate": 0.00016508896797153027, "loss": 2.2016, "step": 1972 }, { "epoch": 0.8768888888888889, "grad_norm": 1.4611423015594482, "learning_rate": 0.0001650711743772242, "loss": 2.4748, "step": 1973 }, { "epoch": 0.8773333333333333, "grad_norm": 1.5445431470870972, "learning_rate": 0.00016505338078291816, "loss": 2.5277, "step": 1974 }, { "epoch": 0.8777777777777778, "grad_norm": 1.4540585279464722, "learning_rate": 0.0001650355871886121, "loss": 2.4206, "step": 1975 }, { "epoch": 0.8782222222222222, "grad_norm": 1.4331034421920776, "learning_rate": 0.00016501779359430604, "loss": 2.6362, "step": 1976 }, { "epoch": 0.8786666666666667, "grad_norm": 1.668470859527588, "learning_rate": 0.000165, "loss": 2.5848, "step": 1977 }, { "epoch": 0.8791111111111111, "grad_norm": 1.2265642881393433, "learning_rate": 0.00016498220640569396, "loss": 2.0659, "step": 1978 }, { "epoch": 0.8795555555555555, "grad_norm": 1.6252071857452393, "learning_rate": 0.0001649644128113879, "loss": 2.3045, "step": 1979 }, { "epoch": 0.88, "grad_norm": 1.3835192918777466, "learning_rate": 0.00016494661921708184, "loss": 2.3903, "step": 1980 }, { "epoch": 0.8804444444444445, "grad_norm": 1.4605368375778198, "learning_rate": 0.0001649288256227758, "loss": 2.7493, "step": 1981 }, { "epoch": 0.8808888888888889, "grad_norm": 1.863997459411621, "learning_rate": 0.00016491103202846975, "loss": 2.3149, "step": 1982 }, { "epoch": 0.8813333333333333, "grad_norm": 1.7310175895690918, "learning_rate": 0.0001648932384341637, "loss": 2.7611, "step": 1983 }, { "epoch": 0.8817777777777778, "grad_norm": 1.4304709434509277, "learning_rate": 0.00016487544483985767, "loss": 1.2887, "step": 1984 }, { "epoch": 0.8822222222222222, "grad_norm": 1.2733263969421387, "learning_rate": 0.00016485765124555162, "loss": 2.0624, "step": 1985 }, { "epoch": 0.8826666666666667, "grad_norm": 1.5163559913635254, "learning_rate": 0.00016483985765124555, "loss": 2.7885, "step": 1986 }, { "epoch": 0.8831111111111111, "grad_norm": 1.3988975286483765, "learning_rate": 0.0001648220640569395, "loss": 2.1499, "step": 1987 }, { "epoch": 0.8835555555555555, "grad_norm": 1.324731469154358, "learning_rate": 0.00016480427046263347, "loss": 2.1789, "step": 1988 }, { "epoch": 0.884, "grad_norm": 1.5569076538085938, "learning_rate": 0.0001647864768683274, "loss": 2.4129, "step": 1989 }, { "epoch": 0.8844444444444445, "grad_norm": 2.2256200313568115, "learning_rate": 0.00016476868327402135, "loss": 2.9758, "step": 1990 }, { "epoch": 0.8848888888888888, "grad_norm": 1.666374683380127, "learning_rate": 0.0001647508896797153, "loss": 2.7925, "step": 1991 }, { "epoch": 0.8853333333333333, "grad_norm": 1.6541733741760254, "learning_rate": 0.00016473309608540927, "loss": 2.4019, "step": 1992 }, { "epoch": 0.8857777777777778, "grad_norm": 1.5037000179290771, "learning_rate": 0.0001647153024911032, "loss": 2.6672, "step": 1993 }, { "epoch": 0.8862222222222222, "grad_norm": 1.7169650793075562, "learning_rate": 0.00016469750889679715, "loss": 2.7251, "step": 1994 }, { "epoch": 0.8866666666666667, "grad_norm": 1.9718469381332397, "learning_rate": 0.0001646797153024911, "loss": 2.1821, "step": 1995 }, { "epoch": 0.8871111111111111, "grad_norm": 1.751865029335022, "learning_rate": 0.00016466192170818506, "loss": 2.6323, "step": 1996 }, { "epoch": 0.8875555555555555, "grad_norm": 1.602544903755188, "learning_rate": 0.00016464412811387902, "loss": 2.6532, "step": 1997 }, { "epoch": 0.888, "grad_norm": 1.865159273147583, "learning_rate": 0.00016462633451957298, "loss": 3.023, "step": 1998 }, { "epoch": 0.8884444444444445, "grad_norm": 1.7071006298065186, "learning_rate": 0.0001646085409252669, "loss": 2.6798, "step": 1999 }, { "epoch": 0.8888888888888888, "grad_norm": 2.0052783489227295, "learning_rate": 0.00016459074733096086, "loss": 2.7293, "step": 2000 }, { "epoch": 0.8893333333333333, "grad_norm": 1.0259556770324707, "learning_rate": 0.00016457295373665482, "loss": 2.552, "step": 2001 }, { "epoch": 0.8897777777777778, "grad_norm": 1.1333271265029907, "learning_rate": 0.00016455516014234875, "loss": 2.8014, "step": 2002 }, { "epoch": 0.8902222222222222, "grad_norm": 0.973343551158905, "learning_rate": 0.0001645373665480427, "loss": 2.5627, "step": 2003 }, { "epoch": 0.8906666666666667, "grad_norm": 0.9888269901275635, "learning_rate": 0.00016451957295373666, "loss": 2.2681, "step": 2004 }, { "epoch": 0.8911111111111111, "grad_norm": 1.2307792901992798, "learning_rate": 0.00016450177935943062, "loss": 2.1229, "step": 2005 }, { "epoch": 0.8915555555555555, "grad_norm": 1.0843390226364136, "learning_rate": 0.00016448398576512455, "loss": 2.3058, "step": 2006 }, { "epoch": 0.892, "grad_norm": 1.3246184587478638, "learning_rate": 0.0001644661921708185, "loss": 2.5498, "step": 2007 }, { "epoch": 0.8924444444444445, "grad_norm": 1.156726360321045, "learning_rate": 0.00016444839857651246, "loss": 2.3375, "step": 2008 }, { "epoch": 0.8928888888888888, "grad_norm": 1.061392903327942, "learning_rate": 0.00016443060498220642, "loss": 1.2548, "step": 2009 }, { "epoch": 0.8933333333333333, "grad_norm": 1.357146978378296, "learning_rate": 0.00016441281138790037, "loss": 2.8068, "step": 2010 }, { "epoch": 0.8937777777777778, "grad_norm": 1.3615721464157104, "learning_rate": 0.00016439501779359433, "loss": 2.4011, "step": 2011 }, { "epoch": 0.8942222222222223, "grad_norm": 1.3508340120315552, "learning_rate": 0.00016437722419928826, "loss": 2.5085, "step": 2012 }, { "epoch": 0.8946666666666667, "grad_norm": 1.4771041870117188, "learning_rate": 0.00016435943060498222, "loss": 1.8489, "step": 2013 }, { "epoch": 0.8951111111111111, "grad_norm": 1.2312934398651123, "learning_rate": 0.00016434163701067617, "loss": 1.9961, "step": 2014 }, { "epoch": 0.8955555555555555, "grad_norm": 1.308119535446167, "learning_rate": 0.0001643238434163701, "loss": 2.6029, "step": 2015 }, { "epoch": 0.896, "grad_norm": 1.3931859731674194, "learning_rate": 0.00016430604982206406, "loss": 2.5537, "step": 2016 }, { "epoch": 0.8964444444444445, "grad_norm": 1.5460842847824097, "learning_rate": 0.00016428825622775802, "loss": 2.5682, "step": 2017 }, { "epoch": 0.8968888888888888, "grad_norm": 1.4253586530685425, "learning_rate": 0.00016427046263345197, "loss": 2.6531, "step": 2018 }, { "epoch": 0.8973333333333333, "grad_norm": 1.2461731433868408, "learning_rate": 0.0001642526690391459, "loss": 1.9027, "step": 2019 }, { "epoch": 0.8977777777777778, "grad_norm": 1.418392539024353, "learning_rate": 0.00016423487544483986, "loss": 2.5344, "step": 2020 }, { "epoch": 0.8982222222222223, "grad_norm": 1.5666571855545044, "learning_rate": 0.00016421708185053381, "loss": 2.4009, "step": 2021 }, { "epoch": 0.8986666666666666, "grad_norm": 1.2651710510253906, "learning_rate": 0.00016419928825622777, "loss": 2.1727, "step": 2022 }, { "epoch": 0.8991111111111111, "grad_norm": 1.4496339559555054, "learning_rate": 0.00016418149466192173, "loss": 2.3306, "step": 2023 }, { "epoch": 0.8995555555555556, "grad_norm": 1.3731813430786133, "learning_rate": 0.00016416370106761568, "loss": 2.0778, "step": 2024 }, { "epoch": 0.9, "grad_norm": 1.383135199546814, "learning_rate": 0.0001641459074733096, "loss": 2.7001, "step": 2025 }, { "epoch": 0.9004444444444445, "grad_norm": 1.2729257345199585, "learning_rate": 0.00016412811387900357, "loss": 2.1518, "step": 2026 }, { "epoch": 0.9008888888888889, "grad_norm": 1.5172004699707031, "learning_rate": 0.00016411032028469753, "loss": 2.4128, "step": 2027 }, { "epoch": 0.9013333333333333, "grad_norm": 1.3917080163955688, "learning_rate": 0.00016409252669039146, "loss": 1.2132, "step": 2028 }, { "epoch": 0.9017777777777778, "grad_norm": 1.359440565109253, "learning_rate": 0.0001640747330960854, "loss": 2.6389, "step": 2029 }, { "epoch": 0.9022222222222223, "grad_norm": 1.6984691619873047, "learning_rate": 0.00016405693950177937, "loss": 2.5598, "step": 2030 }, { "epoch": 0.9026666666666666, "grad_norm": 1.3481029272079468, "learning_rate": 0.00016403914590747332, "loss": 2.2033, "step": 2031 }, { "epoch": 0.9031111111111111, "grad_norm": 1.4901320934295654, "learning_rate": 0.00016402135231316725, "loss": 2.227, "step": 2032 }, { "epoch": 0.9035555555555556, "grad_norm": 1.4756929874420166, "learning_rate": 0.0001640035587188612, "loss": 2.3098, "step": 2033 }, { "epoch": 0.904, "grad_norm": 2.4183623790740967, "learning_rate": 0.00016398576512455517, "loss": 2.2787, "step": 2034 }, { "epoch": 0.9044444444444445, "grad_norm": 1.7991214990615845, "learning_rate": 0.00016396797153024912, "loss": 2.3717, "step": 2035 }, { "epoch": 0.9048888888888889, "grad_norm": 1.7338757514953613, "learning_rate": 0.00016395017793594308, "loss": 2.9113, "step": 2036 }, { "epoch": 0.9053333333333333, "grad_norm": 1.5208733081817627, "learning_rate": 0.00016393238434163704, "loss": 2.0556, "step": 2037 }, { "epoch": 0.9057777777777778, "grad_norm": 1.5961337089538574, "learning_rate": 0.00016391459074733097, "loss": 2.093, "step": 2038 }, { "epoch": 0.9062222222222223, "grad_norm": 1.6487394571304321, "learning_rate": 0.00016389679715302492, "loss": 2.6833, "step": 2039 }, { "epoch": 0.9066666666666666, "grad_norm": 2.891885995864868, "learning_rate": 0.00016387900355871888, "loss": 1.6798, "step": 2040 }, { "epoch": 0.9071111111111111, "grad_norm": 1.8462321758270264, "learning_rate": 0.0001638612099644128, "loss": 2.5937, "step": 2041 }, { "epoch": 0.9075555555555556, "grad_norm": 1.7322661876678467, "learning_rate": 0.00016384341637010676, "loss": 2.7494, "step": 2042 }, { "epoch": 0.908, "grad_norm": 1.7559343576431274, "learning_rate": 0.00016382562277580072, "loss": 2.7197, "step": 2043 }, { "epoch": 0.9084444444444445, "grad_norm": 1.7972688674926758, "learning_rate": 0.00016380782918149468, "loss": 2.2508, "step": 2044 }, { "epoch": 0.9088888888888889, "grad_norm": 1.5934430360794067, "learning_rate": 0.0001637900355871886, "loss": 2.5825, "step": 2045 }, { "epoch": 0.9093333333333333, "grad_norm": 2.1732285022735596, "learning_rate": 0.00016377224199288256, "loss": 2.5181, "step": 2046 }, { "epoch": 0.9097777777777778, "grad_norm": 1.8256020545959473, "learning_rate": 0.00016375444839857652, "loss": 2.4343, "step": 2047 }, { "epoch": 0.9102222222222223, "grad_norm": 1.8094704151153564, "learning_rate": 0.00016373665480427048, "loss": 2.4485, "step": 2048 }, { "epoch": 0.9106666666666666, "grad_norm": 2.246121644973755, "learning_rate": 0.00016371886120996443, "loss": 3.1074, "step": 2049 }, { "epoch": 0.9111111111111111, "grad_norm": 3.0540249347686768, "learning_rate": 0.0001637010676156584, "loss": 1.8492, "step": 2050 }, { "epoch": 0.9115555555555556, "grad_norm": 0.9966058731079102, "learning_rate": 0.00016368327402135232, "loss": 2.6141, "step": 2051 }, { "epoch": 0.912, "grad_norm": 1.0565382242202759, "learning_rate": 0.00016366548042704628, "loss": 2.5051, "step": 2052 }, { "epoch": 0.9124444444444444, "grad_norm": 0.999159574508667, "learning_rate": 0.00016364768683274023, "loss": 1.2208, "step": 2053 }, { "epoch": 0.9128888888888889, "grad_norm": 1.0446531772613525, "learning_rate": 0.00016362989323843416, "loss": 2.403, "step": 2054 }, { "epoch": 0.9133333333333333, "grad_norm": 1.7847496271133423, "learning_rate": 0.00016361209964412812, "loss": 1.4985, "step": 2055 }, { "epoch": 0.9137777777777778, "grad_norm": 1.1442463397979736, "learning_rate": 0.00016359430604982207, "loss": 2.8388, "step": 2056 }, { "epoch": 0.9142222222222223, "grad_norm": 1.1940288543701172, "learning_rate": 0.00016357651245551603, "loss": 2.7245, "step": 2057 }, { "epoch": 0.9146666666666666, "grad_norm": 1.3934929370880127, "learning_rate": 0.00016355871886120996, "loss": 2.8598, "step": 2058 }, { "epoch": 0.9151111111111111, "grad_norm": 1.361688256263733, "learning_rate": 0.00016354092526690392, "loss": 2.7173, "step": 2059 }, { "epoch": 0.9155555555555556, "grad_norm": 1.3823915719985962, "learning_rate": 0.00016352313167259787, "loss": 1.5828, "step": 2060 }, { "epoch": 0.916, "grad_norm": 1.3563628196716309, "learning_rate": 0.00016350533807829183, "loss": 2.5457, "step": 2061 }, { "epoch": 0.9164444444444444, "grad_norm": 1.2287131547927856, "learning_rate": 0.00016348754448398579, "loss": 2.9229, "step": 2062 }, { "epoch": 0.9168888888888889, "grad_norm": 1.1974775791168213, "learning_rate": 0.00016346975088967972, "loss": 2.2515, "step": 2063 }, { "epoch": 0.9173333333333333, "grad_norm": 1.5387582778930664, "learning_rate": 0.00016345195729537367, "loss": 1.6172, "step": 2064 }, { "epoch": 0.9177777777777778, "grad_norm": 1.3966673612594604, "learning_rate": 0.00016343416370106763, "loss": 2.771, "step": 2065 }, { "epoch": 0.9182222222222223, "grad_norm": 1.7705440521240234, "learning_rate": 0.00016341637010676156, "loss": 1.446, "step": 2066 }, { "epoch": 0.9186666666666666, "grad_norm": 1.4579976797103882, "learning_rate": 0.00016339857651245551, "loss": 1.8905, "step": 2067 }, { "epoch": 0.9191111111111111, "grad_norm": 1.4872655868530273, "learning_rate": 0.00016338078291814947, "loss": 2.2954, "step": 2068 }, { "epoch": 0.9195555555555556, "grad_norm": 1.5094295740127563, "learning_rate": 0.00016336298932384343, "loss": 3.1148, "step": 2069 }, { "epoch": 0.92, "grad_norm": 1.5641443729400635, "learning_rate": 0.00016334519572953736, "loss": 2.7561, "step": 2070 }, { "epoch": 0.9204444444444444, "grad_norm": 1.4170724153518677, "learning_rate": 0.00016332740213523131, "loss": 2.2233, "step": 2071 }, { "epoch": 0.9208888888888889, "grad_norm": 1.633217215538025, "learning_rate": 0.00016330960854092527, "loss": 2.9578, "step": 2072 }, { "epoch": 0.9213333333333333, "grad_norm": 1.2382259368896484, "learning_rate": 0.00016329181494661923, "loss": 1.4224, "step": 2073 }, { "epoch": 0.9217777777777778, "grad_norm": 1.9555550813674927, "learning_rate": 0.00016327402135231318, "loss": 2.4531, "step": 2074 }, { "epoch": 0.9222222222222223, "grad_norm": 1.4946098327636719, "learning_rate": 0.00016325622775800714, "loss": 1.3729, "step": 2075 }, { "epoch": 0.9226666666666666, "grad_norm": 0.9218053817749023, "learning_rate": 0.00016323843416370107, "loss": 1.1931, "step": 2076 }, { "epoch": 0.9231111111111111, "grad_norm": 1.8889461755752563, "learning_rate": 0.00016322064056939503, "loss": 2.8331, "step": 2077 }, { "epoch": 0.9235555555555556, "grad_norm": 2.0126349925994873, "learning_rate": 0.00016320284697508898, "loss": 2.0097, "step": 2078 }, { "epoch": 0.924, "grad_norm": 1.5034337043762207, "learning_rate": 0.0001631850533807829, "loss": 2.4002, "step": 2079 }, { "epoch": 0.9244444444444444, "grad_norm": 1.582135558128357, "learning_rate": 0.00016316725978647687, "loss": 2.4081, "step": 2080 }, { "epoch": 0.9248888888888889, "grad_norm": 1.4720321893692017, "learning_rate": 0.00016314946619217082, "loss": 2.0486, "step": 2081 }, { "epoch": 0.9253333333333333, "grad_norm": 1.524876594543457, "learning_rate": 0.00016313167259786478, "loss": 2.4884, "step": 2082 }, { "epoch": 0.9257777777777778, "grad_norm": 1.3611582517623901, "learning_rate": 0.0001631138790035587, "loss": 2.2896, "step": 2083 }, { "epoch": 0.9262222222222222, "grad_norm": 1.6695072650909424, "learning_rate": 0.00016309608540925267, "loss": 2.5879, "step": 2084 }, { "epoch": 0.9266666666666666, "grad_norm": 1.5357855558395386, "learning_rate": 0.00016307829181494662, "loss": 2.4094, "step": 2085 }, { "epoch": 0.9271111111111111, "grad_norm": 1.6165261268615723, "learning_rate": 0.00016306049822064058, "loss": 2.3728, "step": 2086 }, { "epoch": 0.9275555555555556, "grad_norm": 1.7907167673110962, "learning_rate": 0.00016304270462633454, "loss": 2.4651, "step": 2087 }, { "epoch": 0.928, "grad_norm": 1.6630196571350098, "learning_rate": 0.0001630249110320285, "loss": 2.2993, "step": 2088 }, { "epoch": 0.9284444444444444, "grad_norm": 1.7729859352111816, "learning_rate": 0.00016300711743772242, "loss": 3.0367, "step": 2089 }, { "epoch": 0.9288888888888889, "grad_norm": 1.6440625190734863, "learning_rate": 0.00016298932384341638, "loss": 2.4056, "step": 2090 }, { "epoch": 0.9293333333333333, "grad_norm": 1.8084213733673096, "learning_rate": 0.00016297153024911034, "loss": 2.5752, "step": 2091 }, { "epoch": 0.9297777777777778, "grad_norm": 1.6393537521362305, "learning_rate": 0.00016295373665480426, "loss": 2.2847, "step": 2092 }, { "epoch": 0.9302222222222222, "grad_norm": 1.849129319190979, "learning_rate": 0.00016293594306049822, "loss": 2.8678, "step": 2093 }, { "epoch": 0.9306666666666666, "grad_norm": 2.1448423862457275, "learning_rate": 0.00016291814946619218, "loss": 3.2427, "step": 2094 }, { "epoch": 0.9311111111111111, "grad_norm": 1.7885196208953857, "learning_rate": 0.00016290035587188613, "loss": 2.0247, "step": 2095 }, { "epoch": 0.9315555555555556, "grad_norm": 1.889359474182129, "learning_rate": 0.00016288256227758006, "loss": 2.492, "step": 2096 }, { "epoch": 0.932, "grad_norm": 1.7645171880722046, "learning_rate": 0.00016286476868327402, "loss": 2.6633, "step": 2097 }, { "epoch": 0.9324444444444444, "grad_norm": 1.768557071685791, "learning_rate": 0.00016284697508896798, "loss": 2.4075, "step": 2098 }, { "epoch": 0.9328888888888889, "grad_norm": 2.301161766052246, "learning_rate": 0.00016282918149466193, "loss": 2.7808, "step": 2099 }, { "epoch": 0.9333333333333333, "grad_norm": 2.5840353965759277, "learning_rate": 0.0001628113879003559, "loss": 3.1963, "step": 2100 }, { "epoch": 0.9337777777777778, "grad_norm": 1.097058653831482, "learning_rate": 0.00016279359430604985, "loss": 2.4867, "step": 2101 }, { "epoch": 0.9342222222222222, "grad_norm": 0.9999614357948303, "learning_rate": 0.00016277580071174378, "loss": 1.277, "step": 2102 }, { "epoch": 0.9346666666666666, "grad_norm": 0.8796145915985107, "learning_rate": 0.00016275800711743773, "loss": 1.4156, "step": 2103 }, { "epoch": 0.9351111111111111, "grad_norm": 1.1534805297851562, "learning_rate": 0.0001627402135231317, "loss": 2.3798, "step": 2104 }, { "epoch": 0.9355555555555556, "grad_norm": 1.3361473083496094, "learning_rate": 0.00016272241992882562, "loss": 2.6389, "step": 2105 }, { "epoch": 0.936, "grad_norm": 1.3026865720748901, "learning_rate": 0.00016270462633451957, "loss": 2.9566, "step": 2106 }, { "epoch": 0.9364444444444444, "grad_norm": 1.2137173414230347, "learning_rate": 0.00016268683274021353, "loss": 2.7551, "step": 2107 }, { "epoch": 0.9368888888888889, "grad_norm": 1.1544345617294312, "learning_rate": 0.0001626690391459075, "loss": 1.6071, "step": 2108 }, { "epoch": 0.9373333333333334, "grad_norm": 1.3754730224609375, "learning_rate": 0.00016265124555160142, "loss": 2.681, "step": 2109 }, { "epoch": 0.9377777777777778, "grad_norm": 1.2891576290130615, "learning_rate": 0.00016263345195729537, "loss": 2.5061, "step": 2110 }, { "epoch": 0.9382222222222222, "grad_norm": 1.6436229944229126, "learning_rate": 0.00016261565836298933, "loss": 2.7442, "step": 2111 }, { "epoch": 0.9386666666666666, "grad_norm": 1.6582046747207642, "learning_rate": 0.00016259786476868329, "loss": 2.4712, "step": 2112 }, { "epoch": 0.9391111111111111, "grad_norm": 1.338775873184204, "learning_rate": 0.00016258007117437724, "loss": 2.2104, "step": 2113 }, { "epoch": 0.9395555555555556, "grad_norm": 1.4280105829238892, "learning_rate": 0.0001625622775800712, "loss": 1.706, "step": 2114 }, { "epoch": 0.94, "grad_norm": 1.3031154870986938, "learning_rate": 0.00016254448398576513, "loss": 2.9918, "step": 2115 }, { "epoch": 0.9404444444444444, "grad_norm": 1.6146162748336792, "learning_rate": 0.00016252669039145908, "loss": 2.577, "step": 2116 }, { "epoch": 0.9408888888888889, "grad_norm": 1.5383062362670898, "learning_rate": 0.00016250889679715304, "loss": 2.38, "step": 2117 }, { "epoch": 0.9413333333333334, "grad_norm": 1.5242427587509155, "learning_rate": 0.00016249110320284697, "loss": 2.5093, "step": 2118 }, { "epoch": 0.9417777777777778, "grad_norm": 1.5081580877304077, "learning_rate": 0.00016247330960854093, "loss": 2.983, "step": 2119 }, { "epoch": 0.9422222222222222, "grad_norm": 1.362468957901001, "learning_rate": 0.00016245551601423488, "loss": 2.4969, "step": 2120 }, { "epoch": 0.9426666666666667, "grad_norm": 1.5495905876159668, "learning_rate": 0.00016243772241992884, "loss": 2.474, "step": 2121 }, { "epoch": 0.9431111111111111, "grad_norm": 1.6289684772491455, "learning_rate": 0.00016241992882562277, "loss": 2.7819, "step": 2122 }, { "epoch": 0.9435555555555556, "grad_norm": 1.3928167819976807, "learning_rate": 0.00016240213523131673, "loss": 2.3204, "step": 2123 }, { "epoch": 0.944, "grad_norm": 1.5994818210601807, "learning_rate": 0.00016238434163701068, "loss": 2.783, "step": 2124 }, { "epoch": 0.9444444444444444, "grad_norm": 1.4788901805877686, "learning_rate": 0.00016236654804270464, "loss": 2.4959, "step": 2125 }, { "epoch": 0.9448888888888889, "grad_norm": 1.7189639806747437, "learning_rate": 0.0001623487544483986, "loss": 2.7708, "step": 2126 }, { "epoch": 0.9453333333333334, "grad_norm": 1.6642398834228516, "learning_rate": 0.00016233096085409255, "loss": 1.4068, "step": 2127 }, { "epoch": 0.9457777777777778, "grad_norm": 1.9017895460128784, "learning_rate": 0.00016231316725978648, "loss": 2.0995, "step": 2128 }, { "epoch": 0.9462222222222222, "grad_norm": 1.4762321710586548, "learning_rate": 0.00016229537366548044, "loss": 2.8266, "step": 2129 }, { "epoch": 0.9466666666666667, "grad_norm": 1.1698795557022095, "learning_rate": 0.0001622775800711744, "loss": 1.2059, "step": 2130 }, { "epoch": 0.9471111111111111, "grad_norm": 1.7860333919525146, "learning_rate": 0.00016225978647686832, "loss": 3.0256, "step": 2131 }, { "epoch": 0.9475555555555556, "grad_norm": 1.6017791032791138, "learning_rate": 0.00016224199288256228, "loss": 3.0768, "step": 2132 }, { "epoch": 0.948, "grad_norm": 1.6588813066482544, "learning_rate": 0.00016222419928825624, "loss": 2.8438, "step": 2133 }, { "epoch": 0.9484444444444444, "grad_norm": 1.97148597240448, "learning_rate": 0.0001622064056939502, "loss": 2.4861, "step": 2134 }, { "epoch": 0.9488888888888889, "grad_norm": 1.5532220602035522, "learning_rate": 0.00016218861209964412, "loss": 2.999, "step": 2135 }, { "epoch": 0.9493333333333334, "grad_norm": 1.6381109952926636, "learning_rate": 0.00016217081850533808, "loss": 2.6297, "step": 2136 }, { "epoch": 0.9497777777777778, "grad_norm": 1.6247540712356567, "learning_rate": 0.00016215302491103204, "loss": 2.0808, "step": 2137 }, { "epoch": 0.9502222222222222, "grad_norm": 2.061701536178589, "learning_rate": 0.000162135231316726, "loss": 2.9111, "step": 2138 }, { "epoch": 0.9506666666666667, "grad_norm": 1.9049525260925293, "learning_rate": 0.00016211743772241995, "loss": 1.179, "step": 2139 }, { "epoch": 0.9511111111111111, "grad_norm": 1.4355841875076294, "learning_rate": 0.0001620996441281139, "loss": 2.2306, "step": 2140 }, { "epoch": 0.9515555555555556, "grad_norm": 2.1435563564300537, "learning_rate": 0.00016208185053380783, "loss": 3.1722, "step": 2141 }, { "epoch": 0.952, "grad_norm": 1.7206003665924072, "learning_rate": 0.0001620640569395018, "loss": 2.6664, "step": 2142 }, { "epoch": 0.9524444444444444, "grad_norm": 1.7475922107696533, "learning_rate": 0.00016204626334519575, "loss": 2.2984, "step": 2143 }, { "epoch": 0.9528888888888889, "grad_norm": 1.429494857788086, "learning_rate": 0.00016202846975088968, "loss": 2.0885, "step": 2144 }, { "epoch": 0.9533333333333334, "grad_norm": 1.6629289388656616, "learning_rate": 0.00016201067615658363, "loss": 2.312, "step": 2145 }, { "epoch": 0.9537777777777777, "grad_norm": 1.6623343229293823, "learning_rate": 0.0001619928825622776, "loss": 2.3646, "step": 2146 }, { "epoch": 0.9542222222222222, "grad_norm": 2.0395777225494385, "learning_rate": 0.00016197508896797155, "loss": 2.4601, "step": 2147 }, { "epoch": 0.9546666666666667, "grad_norm": 2.142592191696167, "learning_rate": 0.00016195729537366548, "loss": 2.5194, "step": 2148 }, { "epoch": 0.9551111111111111, "grad_norm": 1.8677221536636353, "learning_rate": 0.00016193950177935943, "loss": 2.3944, "step": 2149 }, { "epoch": 0.9555555555555556, "grad_norm": 1.8945562839508057, "learning_rate": 0.0001619217081850534, "loss": 2.6568, "step": 2150 }, { "epoch": 0.956, "grad_norm": 1.6855690479278564, "learning_rate": 0.00016190391459074735, "loss": 3.2555, "step": 2151 }, { "epoch": 0.9564444444444444, "grad_norm": 1.2461864948272705, "learning_rate": 0.0001618861209964413, "loss": 2.8541, "step": 2152 }, { "epoch": 0.9568888888888889, "grad_norm": 1.181875467300415, "learning_rate": 0.00016186832740213523, "loss": 2.6893, "step": 2153 }, { "epoch": 0.9573333333333334, "grad_norm": 1.1439327001571655, "learning_rate": 0.0001618505338078292, "loss": 2.8082, "step": 2154 }, { "epoch": 0.9577777777777777, "grad_norm": 1.2180849313735962, "learning_rate": 0.00016183274021352314, "loss": 2.4687, "step": 2155 }, { "epoch": 0.9582222222222222, "grad_norm": 1.2314586639404297, "learning_rate": 0.0001618149466192171, "loss": 2.1729, "step": 2156 }, { "epoch": 0.9586666666666667, "grad_norm": 1.3291667699813843, "learning_rate": 0.00016179715302491103, "loss": 2.5573, "step": 2157 }, { "epoch": 0.9591111111111111, "grad_norm": 1.359408974647522, "learning_rate": 0.000161779359430605, "loss": 2.2368, "step": 2158 }, { "epoch": 0.9595555555555556, "grad_norm": 1.2051069736480713, "learning_rate": 0.00016176156583629894, "loss": 2.2714, "step": 2159 }, { "epoch": 0.96, "grad_norm": 1.4360554218292236, "learning_rate": 0.00016174377224199287, "loss": 2.4253, "step": 2160 }, { "epoch": 0.9604444444444444, "grad_norm": 1.3218653202056885, "learning_rate": 0.00016172597864768683, "loss": 2.38, "step": 2161 }, { "epoch": 0.9608888888888889, "grad_norm": 1.658354640007019, "learning_rate": 0.00016170818505338079, "loss": 3.0823, "step": 2162 }, { "epoch": 0.9613333333333334, "grad_norm": 1.390121579170227, "learning_rate": 0.00016169039145907474, "loss": 2.6402, "step": 2163 }, { "epoch": 0.9617777777777777, "grad_norm": 1.279600977897644, "learning_rate": 0.0001616725978647687, "loss": 2.6816, "step": 2164 }, { "epoch": 0.9622222222222222, "grad_norm": 1.3409185409545898, "learning_rate": 0.00016165480427046266, "loss": 2.0361, "step": 2165 }, { "epoch": 0.9626666666666667, "grad_norm": 1.7012516260147095, "learning_rate": 0.00016163701067615658, "loss": 2.7103, "step": 2166 }, { "epoch": 0.9631111111111111, "grad_norm": 1.6114073991775513, "learning_rate": 0.00016161921708185054, "loss": 2.2603, "step": 2167 }, { "epoch": 0.9635555555555556, "grad_norm": 1.3906176090240479, "learning_rate": 0.0001616014234875445, "loss": 2.4579, "step": 2168 }, { "epoch": 0.964, "grad_norm": 1.3002898693084717, "learning_rate": 0.00016158362989323845, "loss": 2.3782, "step": 2169 }, { "epoch": 0.9644444444444444, "grad_norm": 1.4082340002059937, "learning_rate": 0.00016156583629893238, "loss": 2.6653, "step": 2170 }, { "epoch": 0.9648888888888889, "grad_norm": 1.4760489463806152, "learning_rate": 0.00016154804270462634, "loss": 2.7057, "step": 2171 }, { "epoch": 0.9653333333333334, "grad_norm": 1.411620020866394, "learning_rate": 0.0001615302491103203, "loss": 2.3573, "step": 2172 }, { "epoch": 0.9657777777777777, "grad_norm": 1.4951653480529785, "learning_rate": 0.00016151245551601423, "loss": 2.9025, "step": 2173 }, { "epoch": 0.9662222222222222, "grad_norm": 1.6565364599227905, "learning_rate": 0.00016149466192170818, "loss": 3.272, "step": 2174 }, { "epoch": 0.9666666666666667, "grad_norm": 1.4833595752716064, "learning_rate": 0.00016147686832740214, "loss": 2.6249, "step": 2175 }, { "epoch": 0.9671111111111111, "grad_norm": 1.5216375589370728, "learning_rate": 0.0001614590747330961, "loss": 2.4307, "step": 2176 }, { "epoch": 0.9675555555555555, "grad_norm": 1.8078597784042358, "learning_rate": 0.00016144128113879005, "loss": 2.629, "step": 2177 }, { "epoch": 0.968, "grad_norm": 1.560192584991455, "learning_rate": 0.000161423487544484, "loss": 2.6558, "step": 2178 }, { "epoch": 0.9684444444444444, "grad_norm": 1.6416150331497192, "learning_rate": 0.00016140569395017794, "loss": 2.3076, "step": 2179 }, { "epoch": 0.9688888888888889, "grad_norm": 1.6443932056427002, "learning_rate": 0.0001613879003558719, "loss": 2.9646, "step": 2180 }, { "epoch": 0.9693333333333334, "grad_norm": 2.160329580307007, "learning_rate": 0.00016137010676156585, "loss": 2.0787, "step": 2181 }, { "epoch": 0.9697777777777777, "grad_norm": 1.5181187391281128, "learning_rate": 0.00016135231316725978, "loss": 2.5944, "step": 2182 }, { "epoch": 0.9702222222222222, "grad_norm": 1.5956158638000488, "learning_rate": 0.00016133451957295374, "loss": 2.5744, "step": 2183 }, { "epoch": 0.9706666666666667, "grad_norm": 1.6073002815246582, "learning_rate": 0.0001613167259786477, "loss": 2.2124, "step": 2184 }, { "epoch": 0.9711111111111111, "grad_norm": 1.5423829555511475, "learning_rate": 0.00016129893238434165, "loss": 2.5337, "step": 2185 }, { "epoch": 0.9715555555555555, "grad_norm": 1.5740853548049927, "learning_rate": 0.00016128113879003558, "loss": 2.7245, "step": 2186 }, { "epoch": 0.972, "grad_norm": 1.5205440521240234, "learning_rate": 0.00016126334519572954, "loss": 2.6604, "step": 2187 }, { "epoch": 0.9724444444444444, "grad_norm": 1.425803303718567, "learning_rate": 0.0001612455516014235, "loss": 2.2333, "step": 2188 }, { "epoch": 0.9728888888888889, "grad_norm": 1.6136490106582642, "learning_rate": 0.00016122775800711745, "loss": 2.2972, "step": 2189 }, { "epoch": 0.9733333333333334, "grad_norm": 2.0137991905212402, "learning_rate": 0.0001612099644128114, "loss": 1.3431, "step": 2190 }, { "epoch": 0.9737777777777777, "grad_norm": 1.7414988279342651, "learning_rate": 0.00016119217081850536, "loss": 2.8453, "step": 2191 }, { "epoch": 0.9742222222222222, "grad_norm": 1.7436699867248535, "learning_rate": 0.0001611743772241993, "loss": 2.6342, "step": 2192 }, { "epoch": 0.9746666666666667, "grad_norm": 2.1652956008911133, "learning_rate": 0.00016115658362989325, "loss": 3.4064, "step": 2193 }, { "epoch": 0.9751111111111112, "grad_norm": 1.4634653329849243, "learning_rate": 0.0001611387900355872, "loss": 2.2654, "step": 2194 }, { "epoch": 0.9755555555555555, "grad_norm": 1.380988359451294, "learning_rate": 0.00016112099644128113, "loss": 1.9905, "step": 2195 }, { "epoch": 0.976, "grad_norm": 1.78019380569458, "learning_rate": 0.0001611032028469751, "loss": 2.7685, "step": 2196 }, { "epoch": 0.9764444444444444, "grad_norm": 2.219177722930908, "learning_rate": 0.00016108540925266905, "loss": 2.8879, "step": 2197 }, { "epoch": 0.9768888888888889, "grad_norm": 2.295215129852295, "learning_rate": 0.000161067615658363, "loss": 2.7534, "step": 2198 }, { "epoch": 0.9773333333333334, "grad_norm": 2.250352144241333, "learning_rate": 0.00016104982206405693, "loss": 2.7764, "step": 2199 }, { "epoch": 0.9777777777777777, "grad_norm": 2.36488938331604, "learning_rate": 0.0001610320284697509, "loss": 3.1503, "step": 2200 }, { "epoch": 0.9782222222222222, "grad_norm": 1.0454128980636597, "learning_rate": 0.00016101423487544485, "loss": 1.2486, "step": 2201 }, { "epoch": 0.9786666666666667, "grad_norm": 1.087558388710022, "learning_rate": 0.0001609964412811388, "loss": 2.7034, "step": 2202 }, { "epoch": 0.9791111111111112, "grad_norm": 1.1958833932876587, "learning_rate": 0.00016097864768683276, "loss": 2.5241, "step": 2203 }, { "epoch": 0.9795555555555555, "grad_norm": 1.2436286211013794, "learning_rate": 0.00016096085409252671, "loss": 2.5663, "step": 2204 }, { "epoch": 0.98, "grad_norm": 1.29501211643219, "learning_rate": 0.00016094306049822064, "loss": 2.2681, "step": 2205 }, { "epoch": 0.9804444444444445, "grad_norm": 1.4029202461242676, "learning_rate": 0.0001609252669039146, "loss": 2.6442, "step": 2206 }, { "epoch": 0.9808888888888889, "grad_norm": 1.2167294025421143, "learning_rate": 0.00016090747330960856, "loss": 2.3042, "step": 2207 }, { "epoch": 0.9813333333333333, "grad_norm": 1.1777758598327637, "learning_rate": 0.00016088967971530249, "loss": 2.3734, "step": 2208 }, { "epoch": 0.9817777777777777, "grad_norm": 1.352673053741455, "learning_rate": 0.00016087188612099644, "loss": 2.5139, "step": 2209 }, { "epoch": 0.9822222222222222, "grad_norm": 1.5402523279190063, "learning_rate": 0.0001608540925266904, "loss": 2.4843, "step": 2210 }, { "epoch": 0.9826666666666667, "grad_norm": 1.278908371925354, "learning_rate": 0.00016083629893238436, "loss": 2.6108, "step": 2211 }, { "epoch": 0.9831111111111112, "grad_norm": 1.1538258790969849, "learning_rate": 0.00016081850533807829, "loss": 2.221, "step": 2212 }, { "epoch": 0.9835555555555555, "grad_norm": 1.3440600633621216, "learning_rate": 0.00016080071174377224, "loss": 2.1903, "step": 2213 }, { "epoch": 0.984, "grad_norm": 1.4237117767333984, "learning_rate": 0.0001607829181494662, "loss": 2.6641, "step": 2214 }, { "epoch": 0.9844444444444445, "grad_norm": 1.4718806743621826, "learning_rate": 0.00016076512455516015, "loss": 1.0382, "step": 2215 }, { "epoch": 0.9848888888888889, "grad_norm": 1.376482605934143, "learning_rate": 0.0001607473309608541, "loss": 2.2212, "step": 2216 }, { "epoch": 0.9853333333333333, "grad_norm": 1.2039564847946167, "learning_rate": 0.00016072953736654807, "loss": 1.969, "step": 2217 }, { "epoch": 0.9857777777777778, "grad_norm": 1.4104335308074951, "learning_rate": 0.000160711743772242, "loss": 2.5837, "step": 2218 }, { "epoch": 0.9862222222222222, "grad_norm": 1.5590723752975464, "learning_rate": 0.00016069395017793595, "loss": 2.5401, "step": 2219 }, { "epoch": 0.9866666666666667, "grad_norm": 1.4591701030731201, "learning_rate": 0.0001606761565836299, "loss": 2.1405, "step": 2220 }, { "epoch": 0.9871111111111112, "grad_norm": 1.6289016008377075, "learning_rate": 0.00016065836298932384, "loss": 2.6935, "step": 2221 }, { "epoch": 0.9875555555555555, "grad_norm": 1.431257963180542, "learning_rate": 0.0001606405693950178, "loss": 2.7326, "step": 2222 }, { "epoch": 0.988, "grad_norm": 1.7325876951217651, "learning_rate": 0.00016062277580071175, "loss": 2.2347, "step": 2223 }, { "epoch": 0.9884444444444445, "grad_norm": 1.4045909643173218, "learning_rate": 0.0001606049822064057, "loss": 2.4795, "step": 2224 }, { "epoch": 0.9888888888888889, "grad_norm": 1.5909349918365479, "learning_rate": 0.00016058718861209964, "loss": 2.89, "step": 2225 }, { "epoch": 0.9893333333333333, "grad_norm": 1.7079018354415894, "learning_rate": 0.0001605693950177936, "loss": 2.9235, "step": 2226 }, { "epoch": 0.9897777777777778, "grad_norm": 1.401973843574524, "learning_rate": 0.00016055160142348755, "loss": 2.5211, "step": 2227 }, { "epoch": 0.9902222222222222, "grad_norm": 1.7690025568008423, "learning_rate": 0.0001605338078291815, "loss": 2.8388, "step": 2228 }, { "epoch": 0.9906666666666667, "grad_norm": 1.3721446990966797, "learning_rate": 0.00016051601423487546, "loss": 2.6306, "step": 2229 }, { "epoch": 0.9911111111111112, "grad_norm": 1.9013787508010864, "learning_rate": 0.00016049822064056942, "loss": 3.1249, "step": 2230 }, { "epoch": 0.9915555555555555, "grad_norm": 1.5793306827545166, "learning_rate": 0.00016048042704626335, "loss": 0.0805, "step": 2231 }, { "epoch": 0.992, "grad_norm": 1.6632827520370483, "learning_rate": 0.0001604626334519573, "loss": 2.1838, "step": 2232 }, { "epoch": 0.9924444444444445, "grad_norm": 1.837686538696289, "learning_rate": 0.00016044483985765126, "loss": 2.245, "step": 2233 }, { "epoch": 0.9928888888888889, "grad_norm": 2.039071559906006, "learning_rate": 0.0001604270462633452, "loss": 2.713, "step": 2234 }, { "epoch": 0.9933333333333333, "grad_norm": 1.6307156085968018, "learning_rate": 0.00016040925266903915, "loss": 3.0165, "step": 2235 }, { "epoch": 0.9937777777777778, "grad_norm": 1.4582569599151611, "learning_rate": 0.0001603914590747331, "loss": 2.3939, "step": 2236 }, { "epoch": 0.9942222222222222, "grad_norm": 1.3615283966064453, "learning_rate": 0.00016037366548042706, "loss": 1.9555, "step": 2237 }, { "epoch": 0.9946666666666667, "grad_norm": 1.5992584228515625, "learning_rate": 0.000160355871886121, "loss": 2.4956, "step": 2238 }, { "epoch": 0.9951111111111111, "grad_norm": 1.8983911275863647, "learning_rate": 0.00016033807829181495, "loss": 2.6738, "step": 2239 }, { "epoch": 0.9955555555555555, "grad_norm": 1.622286081314087, "learning_rate": 0.0001603202846975089, "loss": 2.2483, "step": 2240 }, { "epoch": 0.996, "grad_norm": 1.2114402055740356, "learning_rate": 0.00016030249110320286, "loss": 1.3111, "step": 2241 }, { "epoch": 0.9964444444444445, "grad_norm": 1.6320134401321411, "learning_rate": 0.00016028469750889682, "loss": 2.7794, "step": 2242 }, { "epoch": 0.9968888888888889, "grad_norm": 1.804682970046997, "learning_rate": 0.00016026690391459075, "loss": 2.515, "step": 2243 }, { "epoch": 0.9973333333333333, "grad_norm": 2.0232090950012207, "learning_rate": 0.0001602491103202847, "loss": 3.3673, "step": 2244 }, { "epoch": 0.9977777777777778, "grad_norm": 1.852048635482788, "learning_rate": 0.00016023131672597866, "loss": 2.7809, "step": 2245 }, { "epoch": 0.9982222222222222, "grad_norm": 1.7070189714431763, "learning_rate": 0.00016021352313167262, "loss": 2.2775, "step": 2246 }, { "epoch": 0.9986666666666667, "grad_norm": 1.6251046657562256, "learning_rate": 0.00016019572953736655, "loss": 2.3802, "step": 2247 }, { "epoch": 0.9991111111111111, "grad_norm": 1.7044910192489624, "learning_rate": 0.0001601779359430605, "loss": 2.5114, "step": 2248 }, { "epoch": 0.9995555555555555, "grad_norm": 2.653667688369751, "learning_rate": 0.00016016014234875446, "loss": 1.5299, "step": 2249 }, { "epoch": 1.0, "grad_norm": 2.5051262378692627, "learning_rate": 0.0001601423487544484, "loss": 1.7248, "step": 2250 }, { "epoch": 1.0, "eval_loss": 2.410618305206299, "eval_runtime": 47.6193, "eval_samples_per_second": 10.5, "eval_steps_per_second": 10.5, "step": 2250 }, { "epoch": 1.0004444444444445, "grad_norm": 0.9968487620353699, "learning_rate": 0.00016012455516014234, "loss": 2.3568, "step": 2251 }, { "epoch": 1.000888888888889, "grad_norm": 1.1485272645950317, "learning_rate": 0.0001601067615658363, "loss": 2.6016, "step": 2252 }, { "epoch": 1.0013333333333334, "grad_norm": 0.9918361902236938, "learning_rate": 0.00016008896797153026, "loss": 2.1317, "step": 2253 }, { "epoch": 1.0017777777777779, "grad_norm": 1.443756103515625, "learning_rate": 0.00016007117437722421, "loss": 1.3992, "step": 2254 }, { "epoch": 1.0022222222222221, "grad_norm": 1.2466870546340942, "learning_rate": 0.00016005338078291817, "loss": 2.7594, "step": 2255 }, { "epoch": 1.0026666666666666, "grad_norm": 1.1465460062026978, "learning_rate": 0.0001600355871886121, "loss": 2.3712, "step": 2256 }, { "epoch": 1.003111111111111, "grad_norm": 1.0530627965927124, "learning_rate": 0.00016001779359430606, "loss": 1.8851, "step": 2257 }, { "epoch": 1.0035555555555555, "grad_norm": 1.195264458656311, "learning_rate": 0.00016, "loss": 2.2701, "step": 2258 }, { "epoch": 1.004, "grad_norm": 1.2500135898590088, "learning_rate": 0.00015998220640569397, "loss": 2.167, "step": 2259 }, { "epoch": 1.0044444444444445, "grad_norm": 1.24211585521698, "learning_rate": 0.0001599644128113879, "loss": 2.3523, "step": 2260 }, { "epoch": 1.004888888888889, "grad_norm": 1.3692959547042847, "learning_rate": 0.00015994661921708186, "loss": 2.5483, "step": 2261 }, { "epoch": 1.0053333333333334, "grad_norm": 1.1393674612045288, "learning_rate": 0.0001599288256227758, "loss": 2.3064, "step": 2262 }, { "epoch": 1.0057777777777779, "grad_norm": 1.341461181640625, "learning_rate": 0.00015991103202846974, "loss": 2.0089, "step": 2263 }, { "epoch": 1.0062222222222221, "grad_norm": 1.434979796409607, "learning_rate": 0.0001598932384341637, "loss": 1.9006, "step": 2264 }, { "epoch": 1.0066666666666666, "grad_norm": 1.4398572444915771, "learning_rate": 0.00015987544483985765, "loss": 2.1331, "step": 2265 }, { "epoch": 1.007111111111111, "grad_norm": 1.2505806684494019, "learning_rate": 0.0001598576512455516, "loss": 2.059, "step": 2266 }, { "epoch": 1.0075555555555555, "grad_norm": 1.3361196517944336, "learning_rate": 0.00015983985765124557, "loss": 1.2448, "step": 2267 }, { "epoch": 1.008, "grad_norm": 1.3557100296020508, "learning_rate": 0.00015982206405693952, "loss": 2.2836, "step": 2268 }, { "epoch": 1.0084444444444445, "grad_norm": 1.4562923908233643, "learning_rate": 0.00015980427046263345, "loss": 2.4052, "step": 2269 }, { "epoch": 1.008888888888889, "grad_norm": 1.8324874639511108, "learning_rate": 0.0001597864768683274, "loss": 1.0363, "step": 2270 }, { "epoch": 1.0093333333333334, "grad_norm": 1.3576056957244873, "learning_rate": 0.00015976868327402137, "loss": 2.5, "step": 2271 }, { "epoch": 1.0097777777777779, "grad_norm": 1.369577407836914, "learning_rate": 0.00015975088967971532, "loss": 2.4154, "step": 2272 }, { "epoch": 1.0102222222222221, "grad_norm": 1.7282054424285889, "learning_rate": 0.00015973309608540925, "loss": 2.4238, "step": 2273 }, { "epoch": 1.0106666666666666, "grad_norm": 1.4440052509307861, "learning_rate": 0.0001597153024911032, "loss": 1.8114, "step": 2274 }, { "epoch": 1.011111111111111, "grad_norm": 1.8842155933380127, "learning_rate": 0.00015969750889679717, "loss": 2.5752, "step": 2275 }, { "epoch": 1.0115555555555555, "grad_norm": 1.6671173572540283, "learning_rate": 0.0001596797153024911, "loss": 2.3995, "step": 2276 }, { "epoch": 1.012, "grad_norm": 1.6246118545532227, "learning_rate": 0.00015966192170818505, "loss": 2.4257, "step": 2277 }, { "epoch": 1.0124444444444445, "grad_norm": 1.416971206665039, "learning_rate": 0.000159644128113879, "loss": 1.9455, "step": 2278 }, { "epoch": 1.012888888888889, "grad_norm": 1.754091739654541, "learning_rate": 0.00015962633451957296, "loss": 2.6238, "step": 2279 }, { "epoch": 1.0133333333333334, "grad_norm": 1.3849114179611206, "learning_rate": 0.00015960854092526692, "loss": 1.5943, "step": 2280 }, { "epoch": 1.0137777777777779, "grad_norm": 1.6501544713974, "learning_rate": 0.00015959074733096088, "loss": 1.9212, "step": 2281 }, { "epoch": 1.0142222222222221, "grad_norm": 1.7187528610229492, "learning_rate": 0.0001595729537366548, "loss": 1.7647, "step": 2282 }, { "epoch": 1.0146666666666666, "grad_norm": 1.8686177730560303, "learning_rate": 0.00015955516014234876, "loss": 2.4814, "step": 2283 }, { "epoch": 1.015111111111111, "grad_norm": 1.5445845127105713, "learning_rate": 0.00015953736654804272, "loss": 1.9214, "step": 2284 }, { "epoch": 1.0155555555555555, "grad_norm": 1.7692326307296753, "learning_rate": 0.00015951957295373668, "loss": 2.1594, "step": 2285 }, { "epoch": 1.016, "grad_norm": 1.6517410278320312, "learning_rate": 0.0001595017793594306, "loss": 2.1724, "step": 2286 }, { "epoch": 1.0164444444444445, "grad_norm": 1.2496143579483032, "learning_rate": 0.00015948398576512456, "loss": 0.7354, "step": 2287 }, { "epoch": 1.016888888888889, "grad_norm": 1.6711918115615845, "learning_rate": 0.00015946619217081852, "loss": 2.1376, "step": 2288 }, { "epoch": 1.0173333333333334, "grad_norm": 1.8869587182998657, "learning_rate": 0.00015944839857651245, "loss": 2.2878, "step": 2289 }, { "epoch": 1.0177777777777777, "grad_norm": 1.5782763957977295, "learning_rate": 0.0001594306049822064, "loss": 1.9263, "step": 2290 }, { "epoch": 1.0182222222222221, "grad_norm": 1.8840655088424683, "learning_rate": 0.00015941281138790036, "loss": 2.4303, "step": 2291 }, { "epoch": 1.0186666666666666, "grad_norm": 2.064854383468628, "learning_rate": 0.00015939501779359432, "loss": 2.246, "step": 2292 }, { "epoch": 1.019111111111111, "grad_norm": 1.704014539718628, "learning_rate": 0.00015937722419928827, "loss": 1.6609, "step": 2293 }, { "epoch": 1.0195555555555555, "grad_norm": 1.7326053380966187, "learning_rate": 0.00015935943060498223, "loss": 2.0684, "step": 2294 }, { "epoch": 1.02, "grad_norm": 1.9503422975540161, "learning_rate": 0.00015934163701067616, "loss": 2.1345, "step": 2295 }, { "epoch": 1.0204444444444445, "grad_norm": 2.1505517959594727, "learning_rate": 0.00015932384341637012, "loss": 2.2523, "step": 2296 }, { "epoch": 1.020888888888889, "grad_norm": 2.059180736541748, "learning_rate": 0.00015930604982206407, "loss": 1.956, "step": 2297 }, { "epoch": 1.0213333333333334, "grad_norm": 1.7710636854171753, "learning_rate": 0.000159288256227758, "loss": 1.8129, "step": 2298 }, { "epoch": 1.0217777777777777, "grad_norm": 1.9192954301834106, "learning_rate": 0.00015927046263345196, "loss": 2.004, "step": 2299 }, { "epoch": 1.0222222222222221, "grad_norm": 3.243239402770996, "learning_rate": 0.00015925266903914591, "loss": 1.0487, "step": 2300 }, { "epoch": 1.0226666666666666, "grad_norm": 1.2613039016723633, "learning_rate": 0.00015923487544483987, "loss": 2.2974, "step": 2301 }, { "epoch": 1.023111111111111, "grad_norm": 1.9154945611953735, "learning_rate": 0.0001592170818505338, "loss": 1.0126, "step": 2302 }, { "epoch": 1.0235555555555556, "grad_norm": 1.2044062614440918, "learning_rate": 0.00015919928825622776, "loss": 0.5583, "step": 2303 }, { "epoch": 1.024, "grad_norm": 1.3576868772506714, "learning_rate": 0.00015918149466192171, "loss": 2.2222, "step": 2304 }, { "epoch": 1.0244444444444445, "grad_norm": 1.372114658355713, "learning_rate": 0.00015916370106761567, "loss": 2.2053, "step": 2305 }, { "epoch": 1.024888888888889, "grad_norm": 1.9428937435150146, "learning_rate": 0.00015914590747330963, "loss": 1.9576, "step": 2306 }, { "epoch": 1.0253333333333334, "grad_norm": 1.2017688751220703, "learning_rate": 0.00015912811387900358, "loss": 2.264, "step": 2307 }, { "epoch": 1.0257777777777777, "grad_norm": 1.3299150466918945, "learning_rate": 0.0001591103202846975, "loss": 1.8914, "step": 2308 }, { "epoch": 1.0262222222222221, "grad_norm": 1.3821178674697876, "learning_rate": 0.00015909252669039147, "loss": 2.7482, "step": 2309 }, { "epoch": 1.0266666666666666, "grad_norm": 1.5535212755203247, "learning_rate": 0.00015907473309608543, "loss": 2.3124, "step": 2310 }, { "epoch": 1.027111111111111, "grad_norm": 1.2688461542129517, "learning_rate": 0.00015905693950177936, "loss": 2.0699, "step": 2311 }, { "epoch": 1.0275555555555556, "grad_norm": 1.5910311937332153, "learning_rate": 0.0001590391459074733, "loss": 2.258, "step": 2312 }, { "epoch": 1.028, "grad_norm": 1.4131362438201904, "learning_rate": 0.00015902135231316727, "loss": 1.8134, "step": 2313 }, { "epoch": 1.0284444444444445, "grad_norm": 1.4447015523910522, "learning_rate": 0.00015900355871886122, "loss": 1.4502, "step": 2314 }, { "epoch": 1.028888888888889, "grad_norm": 1.6646491289138794, "learning_rate": 0.00015898576512455515, "loss": 2.3019, "step": 2315 }, { "epoch": 1.0293333333333334, "grad_norm": 1.4541774988174438, "learning_rate": 0.0001589679715302491, "loss": 2.2436, "step": 2316 }, { "epoch": 1.0297777777777777, "grad_norm": 1.4342156648635864, "learning_rate": 0.00015895017793594307, "loss": 2.3697, "step": 2317 }, { "epoch": 1.0302222222222222, "grad_norm": 1.6885074377059937, "learning_rate": 0.00015893238434163702, "loss": 2.3556, "step": 2318 }, { "epoch": 1.0306666666666666, "grad_norm": 2.0182912349700928, "learning_rate": 0.00015891459074733098, "loss": 2.886, "step": 2319 }, { "epoch": 1.031111111111111, "grad_norm": 1.5477981567382812, "learning_rate": 0.00015889679715302494, "loss": 2.3829, "step": 2320 }, { "epoch": 1.0315555555555556, "grad_norm": 1.4278366565704346, "learning_rate": 0.00015887900355871887, "loss": 1.8171, "step": 2321 }, { "epoch": 1.032, "grad_norm": 1.5529868602752686, "learning_rate": 0.00015886120996441282, "loss": 1.9937, "step": 2322 }, { "epoch": 1.0324444444444445, "grad_norm": 1.7762391567230225, "learning_rate": 0.00015884341637010678, "loss": 1.9945, "step": 2323 }, { "epoch": 1.032888888888889, "grad_norm": 1.378474473953247, "learning_rate": 0.0001588256227758007, "loss": 1.907, "step": 2324 }, { "epoch": 1.0333333333333334, "grad_norm": 1.6256483793258667, "learning_rate": 0.00015880782918149466, "loss": 2.2908, "step": 2325 }, { "epoch": 1.0337777777777777, "grad_norm": 1.5568405389785767, "learning_rate": 0.00015879003558718862, "loss": 2.2006, "step": 2326 }, { "epoch": 1.0342222222222222, "grad_norm": 1.5443711280822754, "learning_rate": 0.00015877224199288258, "loss": 1.892, "step": 2327 }, { "epoch": 1.0346666666666666, "grad_norm": 1.584693193435669, "learning_rate": 0.0001587544483985765, "loss": 2.2243, "step": 2328 }, { "epoch": 1.035111111111111, "grad_norm": 1.7238883972167969, "learning_rate": 0.00015873665480427046, "loss": 1.8239, "step": 2329 }, { "epoch": 1.0355555555555556, "grad_norm": 1.7188372611999512, "learning_rate": 0.00015871886120996442, "loss": 2.1286, "step": 2330 }, { "epoch": 1.036, "grad_norm": 1.8142226934432983, "learning_rate": 0.00015870106761565838, "loss": 2.342, "step": 2331 }, { "epoch": 1.0364444444444445, "grad_norm": 1.9462339878082275, "learning_rate": 0.00015868327402135233, "loss": 2.3758, "step": 2332 }, { "epoch": 1.036888888888889, "grad_norm": 1.4883403778076172, "learning_rate": 0.00015866548042704626, "loss": 1.6063, "step": 2333 }, { "epoch": 1.0373333333333334, "grad_norm": 1.6989222764968872, "learning_rate": 0.00015864768683274022, "loss": 2.2398, "step": 2334 }, { "epoch": 1.0377777777777777, "grad_norm": 1.6604561805725098, "learning_rate": 0.00015862989323843418, "loss": 2.2341, "step": 2335 }, { "epoch": 1.0382222222222222, "grad_norm": 2.0556907653808594, "learning_rate": 0.00015861209964412813, "loss": 2.4467, "step": 2336 }, { "epoch": 1.0386666666666666, "grad_norm": 2.043485164642334, "learning_rate": 0.00015859430604982206, "loss": 1.7722, "step": 2337 }, { "epoch": 1.039111111111111, "grad_norm": 1.7736142873764038, "learning_rate": 0.00015857651245551602, "loss": 1.9208, "step": 2338 }, { "epoch": 1.0395555555555556, "grad_norm": 2.0610883235931396, "learning_rate": 0.00015855871886120997, "loss": 2.8929, "step": 2339 }, { "epoch": 1.04, "grad_norm": 1.8629289865493774, "learning_rate": 0.0001585409252669039, "loss": 1.9888, "step": 2340 }, { "epoch": 1.0404444444444445, "grad_norm": 2.201791286468506, "learning_rate": 0.00015852313167259786, "loss": 2.4039, "step": 2341 }, { "epoch": 1.040888888888889, "grad_norm": 2.3649518489837646, "learning_rate": 0.00015850533807829182, "loss": 2.342, "step": 2342 }, { "epoch": 1.0413333333333332, "grad_norm": 1.763653039932251, "learning_rate": 0.00015848754448398577, "loss": 1.7666, "step": 2343 }, { "epoch": 1.0417777777777777, "grad_norm": 1.721071481704712, "learning_rate": 0.00015846975088967973, "loss": 2.2088, "step": 2344 }, { "epoch": 1.0422222222222222, "grad_norm": 2.0251195430755615, "learning_rate": 0.00015845195729537369, "loss": 2.3982, "step": 2345 }, { "epoch": 1.0426666666666666, "grad_norm": 1.797646403312683, "learning_rate": 0.00015843416370106762, "loss": 2.2807, "step": 2346 }, { "epoch": 1.043111111111111, "grad_norm": 2.336357831954956, "learning_rate": 0.00015841637010676157, "loss": 2.7199, "step": 2347 }, { "epoch": 1.0435555555555556, "grad_norm": 2.0868053436279297, "learning_rate": 0.00015839857651245553, "loss": 1.9292, "step": 2348 }, { "epoch": 1.044, "grad_norm": 2.3412818908691406, "learning_rate": 0.00015838078291814949, "loss": 2.1445, "step": 2349 }, { "epoch": 1.0444444444444445, "grad_norm": 2.016115188598633, "learning_rate": 0.00015836298932384341, "loss": 1.079, "step": 2350 }, { "epoch": 1.044888888888889, "grad_norm": 1.2892199754714966, "learning_rate": 0.00015834519572953737, "loss": 1.1067, "step": 2351 }, { "epoch": 1.0453333333333332, "grad_norm": 1.37278151512146, "learning_rate": 0.00015832740213523133, "loss": 2.208, "step": 2352 }, { "epoch": 1.0457777777777777, "grad_norm": 1.2415770292282104, "learning_rate": 0.00015830960854092526, "loss": 2.2114, "step": 2353 }, { "epoch": 1.0462222222222222, "grad_norm": 1.232061505317688, "learning_rate": 0.0001582918149466192, "loss": 1.4155, "step": 2354 }, { "epoch": 1.0466666666666666, "grad_norm": 1.177029013633728, "learning_rate": 0.00015827402135231317, "loss": 2.5071, "step": 2355 }, { "epoch": 1.047111111111111, "grad_norm": 1.4585161209106445, "learning_rate": 0.00015825622775800713, "loss": 2.3271, "step": 2356 }, { "epoch": 1.0475555555555556, "grad_norm": 1.424669623374939, "learning_rate": 0.00015823843416370108, "loss": 2.1732, "step": 2357 }, { "epoch": 1.048, "grad_norm": 1.3325684070587158, "learning_rate": 0.00015822064056939504, "loss": 2.2668, "step": 2358 }, { "epoch": 1.0484444444444445, "grad_norm": 1.5595697164535522, "learning_rate": 0.00015820284697508897, "loss": 2.3379, "step": 2359 }, { "epoch": 1.048888888888889, "grad_norm": 1.6011629104614258, "learning_rate": 0.00015818505338078293, "loss": 2.8927, "step": 2360 }, { "epoch": 1.0493333333333332, "grad_norm": 1.100995421409607, "learning_rate": 0.00015816725978647688, "loss": 1.6169, "step": 2361 }, { "epoch": 1.0497777777777777, "grad_norm": 1.3697389364242554, "learning_rate": 0.00015814946619217084, "loss": 2.3766, "step": 2362 }, { "epoch": 1.0502222222222222, "grad_norm": 1.332924485206604, "learning_rate": 0.00015813167259786477, "loss": 1.7657, "step": 2363 }, { "epoch": 1.0506666666666666, "grad_norm": 1.3072422742843628, "learning_rate": 0.00015811387900355872, "loss": 2.0262, "step": 2364 }, { "epoch": 1.051111111111111, "grad_norm": 1.370421290397644, "learning_rate": 0.00015809608540925268, "loss": 1.8601, "step": 2365 }, { "epoch": 1.0515555555555556, "grad_norm": 1.580460786819458, "learning_rate": 0.0001580782918149466, "loss": 1.3883, "step": 2366 }, { "epoch": 1.052, "grad_norm": 1.478049635887146, "learning_rate": 0.00015806049822064057, "loss": 2.3788, "step": 2367 }, { "epoch": 1.0524444444444445, "grad_norm": 1.421947717666626, "learning_rate": 0.00015804270462633452, "loss": 1.4549, "step": 2368 }, { "epoch": 1.052888888888889, "grad_norm": 1.768334984779358, "learning_rate": 0.00015802491103202848, "loss": 1.0313, "step": 2369 }, { "epoch": 1.0533333333333332, "grad_norm": 1.5838056802749634, "learning_rate": 0.00015800711743772244, "loss": 2.3386, "step": 2370 }, { "epoch": 1.0537777777777777, "grad_norm": 1.6991932392120361, "learning_rate": 0.0001579893238434164, "loss": 2.5764, "step": 2371 }, { "epoch": 1.0542222222222222, "grad_norm": 1.4423344135284424, "learning_rate": 0.00015797153024911032, "loss": 1.9598, "step": 2372 }, { "epoch": 1.0546666666666666, "grad_norm": 1.508663535118103, "learning_rate": 0.00015795373665480428, "loss": 2.3974, "step": 2373 }, { "epoch": 1.055111111111111, "grad_norm": 1.6960604190826416, "learning_rate": 0.00015793594306049823, "loss": 2.2544, "step": 2374 }, { "epoch": 1.0555555555555556, "grad_norm": 1.786561131477356, "learning_rate": 0.0001579181494661922, "loss": 1.688, "step": 2375 }, { "epoch": 1.056, "grad_norm": 1.505338191986084, "learning_rate": 0.00015790035587188612, "loss": 2.2269, "step": 2376 }, { "epoch": 1.0564444444444445, "grad_norm": 1.4534296989440918, "learning_rate": 0.00015788256227758008, "loss": 1.9174, "step": 2377 }, { "epoch": 1.056888888888889, "grad_norm": 1.5155029296875, "learning_rate": 0.00015786476868327403, "loss": 2.2544, "step": 2378 }, { "epoch": 1.0573333333333332, "grad_norm": 1.8670555353164673, "learning_rate": 0.00015784697508896796, "loss": 2.4167, "step": 2379 }, { "epoch": 1.0577777777777777, "grad_norm": 1.633664846420288, "learning_rate": 0.00015782918149466192, "loss": 1.8265, "step": 2380 }, { "epoch": 1.0582222222222222, "grad_norm": 1.5484849214553833, "learning_rate": 0.00015781138790035588, "loss": 2.1692, "step": 2381 }, { "epoch": 1.0586666666666666, "grad_norm": 1.8765406608581543, "learning_rate": 0.00015779359430604983, "loss": 0.8998, "step": 2382 }, { "epoch": 1.0591111111111111, "grad_norm": 1.6616917848587036, "learning_rate": 0.0001577758007117438, "loss": 2.3774, "step": 2383 }, { "epoch": 1.0595555555555556, "grad_norm": 1.5309672355651855, "learning_rate": 0.00015775800711743775, "loss": 1.8152, "step": 2384 }, { "epoch": 1.06, "grad_norm": 1.7901145219802856, "learning_rate": 0.00015774021352313168, "loss": 2.6988, "step": 2385 }, { "epoch": 1.0604444444444445, "grad_norm": 1.5736534595489502, "learning_rate": 0.00015772241992882563, "loss": 1.9631, "step": 2386 }, { "epoch": 1.060888888888889, "grad_norm": 1.8793672323226929, "learning_rate": 0.0001577046263345196, "loss": 2.7282, "step": 2387 }, { "epoch": 1.0613333333333332, "grad_norm": 1.6914716958999634, "learning_rate": 0.00015768683274021354, "loss": 2.1709, "step": 2388 }, { "epoch": 1.0617777777777777, "grad_norm": 1.847061038017273, "learning_rate": 0.00015766903914590747, "loss": 2.1721, "step": 2389 }, { "epoch": 1.0622222222222222, "grad_norm": 1.698413610458374, "learning_rate": 0.00015765124555160143, "loss": 1.6681, "step": 2390 }, { "epoch": 1.0626666666666666, "grad_norm": 1.9005299806594849, "learning_rate": 0.0001576334519572954, "loss": 1.9767, "step": 2391 }, { "epoch": 1.0631111111111111, "grad_norm": 1.9315385818481445, "learning_rate": 0.00015761565836298932, "loss": 2.4512, "step": 2392 }, { "epoch": 1.0635555555555556, "grad_norm": 1.114691972732544, "learning_rate": 0.00015759786476868327, "loss": 0.8507, "step": 2393 }, { "epoch": 1.064, "grad_norm": 1.4050822257995605, "learning_rate": 0.00015758007117437723, "loss": 1.3638, "step": 2394 }, { "epoch": 1.0644444444444445, "grad_norm": 1.3316043615341187, "learning_rate": 0.00015756227758007119, "loss": 1.1736, "step": 2395 }, { "epoch": 1.064888888888889, "grad_norm": 1.950830340385437, "learning_rate": 0.00015754448398576514, "loss": 2.0624, "step": 2396 }, { "epoch": 1.0653333333333332, "grad_norm": 1.9094411134719849, "learning_rate": 0.0001575266903914591, "loss": 2.4114, "step": 2397 }, { "epoch": 1.0657777777777777, "grad_norm": 2.046294927597046, "learning_rate": 0.00015750889679715303, "loss": 2.32, "step": 2398 }, { "epoch": 1.0662222222222222, "grad_norm": 2.1060452461242676, "learning_rate": 0.00015749110320284698, "loss": 2.7453, "step": 2399 }, { "epoch": 1.0666666666666667, "grad_norm": 3.819446325302124, "learning_rate": 0.00015747330960854094, "loss": 0.8665, "step": 2400 }, { "epoch": 1.0671111111111111, "grad_norm": 1.0843795537948608, "learning_rate": 0.0001574555160142349, "loss": 2.5427, "step": 2401 }, { "epoch": 1.0675555555555556, "grad_norm": 1.219653606414795, "learning_rate": 0.00015743772241992883, "loss": 0.8994, "step": 2402 }, { "epoch": 1.068, "grad_norm": 1.363571047782898, "learning_rate": 0.00015741992882562278, "loss": 2.6447, "step": 2403 }, { "epoch": 1.0684444444444445, "grad_norm": 1.192108154296875, "learning_rate": 0.00015740213523131674, "loss": 2.1144, "step": 2404 }, { "epoch": 1.068888888888889, "grad_norm": 1.539057731628418, "learning_rate": 0.00015738434163701067, "loss": 2.487, "step": 2405 }, { "epoch": 1.0693333333333332, "grad_norm": 1.3476603031158447, "learning_rate": 0.00015736654804270463, "loss": 2.207, "step": 2406 }, { "epoch": 1.0697777777777777, "grad_norm": 1.304561972618103, "learning_rate": 0.00015734875444839858, "loss": 2.0535, "step": 2407 }, { "epoch": 1.0702222222222222, "grad_norm": 1.4313247203826904, "learning_rate": 0.00015733096085409254, "loss": 2.5509, "step": 2408 }, { "epoch": 1.0706666666666667, "grad_norm": 1.3336185216903687, "learning_rate": 0.0001573131672597865, "loss": 1.8019, "step": 2409 }, { "epoch": 1.0711111111111111, "grad_norm": 1.3517190217971802, "learning_rate": 0.00015729537366548045, "loss": 2.188, "step": 2410 }, { "epoch": 1.0715555555555556, "grad_norm": 1.563821792602539, "learning_rate": 0.00015727758007117438, "loss": 2.2798, "step": 2411 }, { "epoch": 1.072, "grad_norm": 1.5215498208999634, "learning_rate": 0.00015725978647686834, "loss": 2.454, "step": 2412 }, { "epoch": 1.0724444444444445, "grad_norm": 1.465469479560852, "learning_rate": 0.0001572419928825623, "loss": 1.8113, "step": 2413 }, { "epoch": 1.072888888888889, "grad_norm": 1.5329351425170898, "learning_rate": 0.00015722419928825622, "loss": 2.3624, "step": 2414 }, { "epoch": 1.0733333333333333, "grad_norm": 1.7038596868515015, "learning_rate": 0.00015720640569395018, "loss": 0.1039, "step": 2415 }, { "epoch": 1.0737777777777777, "grad_norm": 1.4838898181915283, "learning_rate": 0.00015718861209964414, "loss": 1.8683, "step": 2416 }, { "epoch": 1.0742222222222222, "grad_norm": 1.7675329446792603, "learning_rate": 0.0001571708185053381, "loss": 2.3259, "step": 2417 }, { "epoch": 1.0746666666666667, "grad_norm": 1.687468409538269, "learning_rate": 0.00015715302491103202, "loss": 2.6304, "step": 2418 }, { "epoch": 1.0751111111111111, "grad_norm": 1.5204507112503052, "learning_rate": 0.00015713523131672598, "loss": 2.5227, "step": 2419 }, { "epoch": 1.0755555555555556, "grad_norm": 2.2087671756744385, "learning_rate": 0.00015711743772241994, "loss": 1.0911, "step": 2420 }, { "epoch": 1.076, "grad_norm": 1.5000768899917603, "learning_rate": 0.0001570996441281139, "loss": 2.3532, "step": 2421 }, { "epoch": 1.0764444444444445, "grad_norm": 1.5198241472244263, "learning_rate": 0.00015708185053380785, "loss": 2.0702, "step": 2422 }, { "epoch": 1.076888888888889, "grad_norm": 1.6272002458572388, "learning_rate": 0.00015706405693950178, "loss": 2.0378, "step": 2423 }, { "epoch": 1.0773333333333333, "grad_norm": 1.5990360975265503, "learning_rate": 0.00015704626334519573, "loss": 2.2515, "step": 2424 }, { "epoch": 1.0777777777777777, "grad_norm": 1.4444339275360107, "learning_rate": 0.0001570284697508897, "loss": 2.2619, "step": 2425 }, { "epoch": 1.0782222222222222, "grad_norm": 1.8956879377365112, "learning_rate": 0.00015701067615658365, "loss": 2.8021, "step": 2426 }, { "epoch": 1.0786666666666667, "grad_norm": 1.6425714492797852, "learning_rate": 0.00015699288256227758, "loss": 2.0961, "step": 2427 }, { "epoch": 1.0791111111111111, "grad_norm": 1.7819446325302124, "learning_rate": 0.00015697508896797153, "loss": 2.2295, "step": 2428 }, { "epoch": 1.0795555555555556, "grad_norm": 1.8124161958694458, "learning_rate": 0.0001569572953736655, "loss": 2.3429, "step": 2429 }, { "epoch": 1.08, "grad_norm": 1.7560713291168213, "learning_rate": 0.00015693950177935942, "loss": 2.0773, "step": 2430 }, { "epoch": 1.0804444444444445, "grad_norm": 1.647606611251831, "learning_rate": 0.00015692170818505338, "loss": 1.9757, "step": 2431 }, { "epoch": 1.0808888888888888, "grad_norm": 1.6525201797485352, "learning_rate": 0.00015690391459074733, "loss": 1.8019, "step": 2432 }, { "epoch": 1.0813333333333333, "grad_norm": 1.7084051370620728, "learning_rate": 0.0001568861209964413, "loss": 2.6304, "step": 2433 }, { "epoch": 1.0817777777777777, "grad_norm": 1.3071404695510864, "learning_rate": 0.00015686832740213525, "loss": 1.1213, "step": 2434 }, { "epoch": 1.0822222222222222, "grad_norm": 1.7664408683776855, "learning_rate": 0.0001568505338078292, "loss": 2.335, "step": 2435 }, { "epoch": 1.0826666666666667, "grad_norm": 1.7795616388320923, "learning_rate": 0.00015683274021352313, "loss": 1.8807, "step": 2436 }, { "epoch": 1.0831111111111111, "grad_norm": 1.9509518146514893, "learning_rate": 0.0001568149466192171, "loss": 2.2775, "step": 2437 }, { "epoch": 1.0835555555555556, "grad_norm": 1.7835257053375244, "learning_rate": 0.00015679715302491104, "loss": 1.8702, "step": 2438 }, { "epoch": 1.084, "grad_norm": 1.7957788705825806, "learning_rate": 0.000156779359430605, "loss": 2.4921, "step": 2439 }, { "epoch": 1.0844444444444445, "grad_norm": 1.2208243608474731, "learning_rate": 0.00015676156583629893, "loss": 1.2614, "step": 2440 }, { "epoch": 1.0848888888888888, "grad_norm": 1.8217169046401978, "learning_rate": 0.0001567437722419929, "loss": 2.0923, "step": 2441 }, { "epoch": 1.0853333333333333, "grad_norm": 2.509866952896118, "learning_rate": 0.00015672597864768684, "loss": 2.5114, "step": 2442 }, { "epoch": 1.0857777777777777, "grad_norm": 2.0780751705169678, "learning_rate": 0.00015670818505338077, "loss": 2.5196, "step": 2443 }, { "epoch": 1.0862222222222222, "grad_norm": 1.780432105064392, "learning_rate": 0.00015669039145907473, "loss": 1.6573, "step": 2444 }, { "epoch": 1.0866666666666667, "grad_norm": 1.7413227558135986, "learning_rate": 0.00015667259786476869, "loss": 1.6994, "step": 2445 }, { "epoch": 1.0871111111111111, "grad_norm": 2.0534093379974365, "learning_rate": 0.00015665480427046264, "loss": 2.5252, "step": 2446 }, { "epoch": 1.0875555555555556, "grad_norm": 1.8891476392745972, "learning_rate": 0.0001566370106761566, "loss": 1.7454, "step": 2447 }, { "epoch": 1.088, "grad_norm": 2.5236616134643555, "learning_rate": 0.00015661921708185056, "loss": 2.7, "step": 2448 }, { "epoch": 1.0884444444444445, "grad_norm": 2.130950689315796, "learning_rate": 0.00015660142348754448, "loss": 1.9737, "step": 2449 }, { "epoch": 1.0888888888888888, "grad_norm": 3.0445713996887207, "learning_rate": 0.00015658362989323844, "loss": 1.5697, "step": 2450 }, { "epoch": 1.0893333333333333, "grad_norm": 1.1887680292129517, "learning_rate": 0.0001565658362989324, "loss": 2.4287, "step": 2451 }, { "epoch": 1.0897777777777777, "grad_norm": 1.3546632528305054, "learning_rate": 0.00015654804270462635, "loss": 1.377, "step": 2452 }, { "epoch": 1.0902222222222222, "grad_norm": 1.6304256916046143, "learning_rate": 0.00015653024911032028, "loss": 2.4908, "step": 2453 }, { "epoch": 1.0906666666666667, "grad_norm": 1.516430139541626, "learning_rate": 0.00015651245551601424, "loss": 2.0657, "step": 2454 }, { "epoch": 1.0911111111111111, "grad_norm": 1.5727593898773193, "learning_rate": 0.0001564946619217082, "loss": 2.0812, "step": 2455 }, { "epoch": 1.0915555555555556, "grad_norm": 1.2789214849472046, "learning_rate": 0.00015647686832740213, "loss": 2.01, "step": 2456 }, { "epoch": 1.092, "grad_norm": 1.4972316026687622, "learning_rate": 0.00015645907473309608, "loss": 1.0983, "step": 2457 }, { "epoch": 1.0924444444444443, "grad_norm": 1.3972692489624023, "learning_rate": 0.00015644128113879004, "loss": 2.2204, "step": 2458 }, { "epoch": 1.0928888888888888, "grad_norm": 1.5354390144348145, "learning_rate": 0.000156423487544484, "loss": 2.3677, "step": 2459 }, { "epoch": 1.0933333333333333, "grad_norm": 1.5079275369644165, "learning_rate": 0.00015640569395017795, "loss": 2.4312, "step": 2460 }, { "epoch": 1.0937777777777777, "grad_norm": 1.4273076057434082, "learning_rate": 0.0001563879003558719, "loss": 2.1158, "step": 2461 }, { "epoch": 1.0942222222222222, "grad_norm": 1.5340080261230469, "learning_rate": 0.00015637010676156584, "loss": 2.559, "step": 2462 }, { "epoch": 1.0946666666666667, "grad_norm": 1.5617725849151611, "learning_rate": 0.0001563523131672598, "loss": 2.5294, "step": 2463 }, { "epoch": 1.0951111111111111, "grad_norm": 1.6314741373062134, "learning_rate": 0.00015633451957295375, "loss": 2.1713, "step": 2464 }, { "epoch": 1.0955555555555556, "grad_norm": 1.460752010345459, "learning_rate": 0.0001563167259786477, "loss": 2.3558, "step": 2465 }, { "epoch": 1.096, "grad_norm": 1.428756833076477, "learning_rate": 0.00015629893238434164, "loss": 2.2738, "step": 2466 }, { "epoch": 1.0964444444444443, "grad_norm": 1.7158453464508057, "learning_rate": 0.0001562811387900356, "loss": 2.172, "step": 2467 }, { "epoch": 1.0968888888888888, "grad_norm": 1.5051125288009644, "learning_rate": 0.00015626334519572955, "loss": 1.9741, "step": 2468 }, { "epoch": 1.0973333333333333, "grad_norm": 1.4402563571929932, "learning_rate": 0.00015624555160142348, "loss": 2.2152, "step": 2469 }, { "epoch": 1.0977777777777777, "grad_norm": 1.6247109174728394, "learning_rate": 0.00015622775800711744, "loss": 2.1543, "step": 2470 }, { "epoch": 1.0982222222222222, "grad_norm": 1.6644169092178345, "learning_rate": 0.0001562099644128114, "loss": 2.2489, "step": 2471 }, { "epoch": 1.0986666666666667, "grad_norm": 2.04425048828125, "learning_rate": 0.00015619217081850535, "loss": 2.2436, "step": 2472 }, { "epoch": 1.0991111111111111, "grad_norm": 1.86391282081604, "learning_rate": 0.0001561743772241993, "loss": 1.8856, "step": 2473 }, { "epoch": 1.0995555555555556, "grad_norm": 1.5092231035232544, "learning_rate": 0.00015615658362989326, "loss": 2.0027, "step": 2474 }, { "epoch": 1.1, "grad_norm": 1.5313433408737183, "learning_rate": 0.0001561387900355872, "loss": 1.4636, "step": 2475 }, { "epoch": 1.1004444444444443, "grad_norm": 1.7613354921340942, "learning_rate": 0.00015612099644128115, "loss": 2.3526, "step": 2476 }, { "epoch": 1.1008888888888888, "grad_norm": 1.567148208618164, "learning_rate": 0.0001561032028469751, "loss": 1.8585, "step": 2477 }, { "epoch": 1.1013333333333333, "grad_norm": 2.1191651821136475, "learning_rate": 0.00015608540925266906, "loss": 2.188, "step": 2478 }, { "epoch": 1.1017777777777777, "grad_norm": 1.6111822128295898, "learning_rate": 0.000156067615658363, "loss": 2.0623, "step": 2479 }, { "epoch": 1.1022222222222222, "grad_norm": 1.5612345933914185, "learning_rate": 0.00015604982206405695, "loss": 1.881, "step": 2480 }, { "epoch": 1.1026666666666667, "grad_norm": 1.9181139469146729, "learning_rate": 0.0001560320284697509, "loss": 2.4053, "step": 2481 }, { "epoch": 1.1031111111111112, "grad_norm": 1.5462487936019897, "learning_rate": 0.00015601423487544483, "loss": 1.7245, "step": 2482 }, { "epoch": 1.1035555555555556, "grad_norm": 1.9112005233764648, "learning_rate": 0.0001559964412811388, "loss": 2.0651, "step": 2483 }, { "epoch": 1.104, "grad_norm": 1.8536262512207031, "learning_rate": 0.00015597864768683274, "loss": 2.2609, "step": 2484 }, { "epoch": 1.1044444444444443, "grad_norm": 1.6455966234207153, "learning_rate": 0.0001559608540925267, "loss": 1.8761, "step": 2485 }, { "epoch": 1.1048888888888888, "grad_norm": 1.9066351652145386, "learning_rate": 0.00015594306049822066, "loss": 1.9272, "step": 2486 }, { "epoch": 1.1053333333333333, "grad_norm": 1.5465588569641113, "learning_rate": 0.00015592526690391461, "loss": 1.6961, "step": 2487 }, { "epoch": 1.1057777777777777, "grad_norm": 2.180607318878174, "learning_rate": 0.00015590747330960854, "loss": 2.4785, "step": 2488 }, { "epoch": 1.1062222222222222, "grad_norm": 1.9382919073104858, "learning_rate": 0.0001558896797153025, "loss": 1.754, "step": 2489 }, { "epoch": 1.1066666666666667, "grad_norm": 1.8899612426757812, "learning_rate": 0.00015587188612099646, "loss": 2.2687, "step": 2490 }, { "epoch": 1.1071111111111112, "grad_norm": 1.633954644203186, "learning_rate": 0.0001558540925266904, "loss": 1.9051, "step": 2491 }, { "epoch": 1.1075555555555556, "grad_norm": 3.747358560562134, "learning_rate": 0.00015583629893238434, "loss": 1.9931, "step": 2492 }, { "epoch": 1.108, "grad_norm": 2.0736889839172363, "learning_rate": 0.0001558185053380783, "loss": 2.3253, "step": 2493 }, { "epoch": 1.1084444444444443, "grad_norm": 1.9722511768341064, "learning_rate": 0.00015580071174377226, "loss": 2.2268, "step": 2494 }, { "epoch": 1.1088888888888888, "grad_norm": 1.9055894613265991, "learning_rate": 0.00015578291814946619, "loss": 2.0168, "step": 2495 }, { "epoch": 1.1093333333333333, "grad_norm": 2.424161672592163, "learning_rate": 0.00015576512455516014, "loss": 2.4325, "step": 2496 }, { "epoch": 1.1097777777777778, "grad_norm": 1.9778692722320557, "learning_rate": 0.0001557473309608541, "loss": 1.8312, "step": 2497 }, { "epoch": 1.1102222222222222, "grad_norm": 2.590223550796509, "learning_rate": 0.00015572953736654805, "loss": 2.6732, "step": 2498 }, { "epoch": 1.1106666666666667, "grad_norm": 1.8567228317260742, "learning_rate": 0.000155711743772242, "loss": 0.7758, "step": 2499 }, { "epoch": 1.1111111111111112, "grad_norm": 2.436675786972046, "learning_rate": 0.00015569395017793597, "loss": 2.0896, "step": 2500 }, { "epoch": 1.1115555555555556, "grad_norm": 1.4165303707122803, "learning_rate": 0.0001556761565836299, "loss": 2.3637, "step": 2501 }, { "epoch": 1.112, "grad_norm": 1.173225998878479, "learning_rate": 0.00015565836298932385, "loss": 2.1477, "step": 2502 }, { "epoch": 1.1124444444444443, "grad_norm": 1.1812031269073486, "learning_rate": 0.0001556405693950178, "loss": 2.5036, "step": 2503 }, { "epoch": 1.1128888888888888, "grad_norm": 1.4120737314224243, "learning_rate": 0.00015562277580071177, "loss": 2.1574, "step": 2504 }, { "epoch": 1.1133333333333333, "grad_norm": 1.3986128568649292, "learning_rate": 0.0001556049822064057, "loss": 1.8443, "step": 2505 }, { "epoch": 1.1137777777777778, "grad_norm": 1.6244175434112549, "learning_rate": 0.00015558718861209965, "loss": 2.7789, "step": 2506 }, { "epoch": 1.1142222222222222, "grad_norm": 1.4272732734680176, "learning_rate": 0.0001555693950177936, "loss": 2.2718, "step": 2507 }, { "epoch": 1.1146666666666667, "grad_norm": 1.3611066341400146, "learning_rate": 0.00015555160142348754, "loss": 2.3992, "step": 2508 }, { "epoch": 1.1151111111111112, "grad_norm": 1.751434087753296, "learning_rate": 0.0001555338078291815, "loss": 2.1827, "step": 2509 }, { "epoch": 1.1155555555555556, "grad_norm": 1.3341114521026611, "learning_rate": 0.00015551601423487545, "loss": 1.985, "step": 2510 }, { "epoch": 1.116, "grad_norm": 1.3708674907684326, "learning_rate": 0.0001554982206405694, "loss": 2.0507, "step": 2511 }, { "epoch": 1.1164444444444444, "grad_norm": 1.4740220308303833, "learning_rate": 0.00015548042704626336, "loss": 2.1808, "step": 2512 }, { "epoch": 1.1168888888888888, "grad_norm": 1.5105361938476562, "learning_rate": 0.0001554626334519573, "loss": 2.055, "step": 2513 }, { "epoch": 1.1173333333333333, "grad_norm": 1.4184283018112183, "learning_rate": 0.00015544483985765125, "loss": 2.0523, "step": 2514 }, { "epoch": 1.1177777777777778, "grad_norm": 1.5358822345733643, "learning_rate": 0.0001554270462633452, "loss": 2.073, "step": 2515 }, { "epoch": 1.1182222222222222, "grad_norm": 1.636608362197876, "learning_rate": 0.00015540925266903916, "loss": 2.0907, "step": 2516 }, { "epoch": 1.1186666666666667, "grad_norm": 1.7694652080535889, "learning_rate": 0.00015539145907473312, "loss": 2.4836, "step": 2517 }, { "epoch": 1.1191111111111112, "grad_norm": 1.6159368753433228, "learning_rate": 0.00015537366548042705, "loss": 1.9878, "step": 2518 }, { "epoch": 1.1195555555555556, "grad_norm": 2.006478786468506, "learning_rate": 0.000155355871886121, "loss": 2.6112, "step": 2519 }, { "epoch": 1.12, "grad_norm": 1.530470371246338, "learning_rate": 0.00015533807829181493, "loss": 1.8775, "step": 2520 }, { "epoch": 1.1204444444444444, "grad_norm": 1.8092018365859985, "learning_rate": 0.0001553202846975089, "loss": 2.0667, "step": 2521 }, { "epoch": 1.1208888888888888, "grad_norm": 1.5595135688781738, "learning_rate": 0.00015530249110320285, "loss": 2.7584, "step": 2522 }, { "epoch": 1.1213333333333333, "grad_norm": 1.5130575895309448, "learning_rate": 0.0001552846975088968, "loss": 2.3097, "step": 2523 }, { "epoch": 1.1217777777777778, "grad_norm": 1.7195639610290527, "learning_rate": 0.00015526690391459076, "loss": 2.3219, "step": 2524 }, { "epoch": 1.1222222222222222, "grad_norm": 1.7365368604660034, "learning_rate": 0.00015524911032028472, "loss": 1.8011, "step": 2525 }, { "epoch": 1.1226666666666667, "grad_norm": 1.6279082298278809, "learning_rate": 0.00015523131672597865, "loss": 1.9524, "step": 2526 }, { "epoch": 1.1231111111111112, "grad_norm": 1.9169297218322754, "learning_rate": 0.0001552135231316726, "loss": 2.3763, "step": 2527 }, { "epoch": 1.1235555555555556, "grad_norm": 1.688704490661621, "learning_rate": 0.00015519572953736656, "loss": 2.1569, "step": 2528 }, { "epoch": 1.124, "grad_norm": 1.2597405910491943, "learning_rate": 0.00015517793594306052, "loss": 1.3028, "step": 2529 }, { "epoch": 1.1244444444444444, "grad_norm": 1.07369863986969, "learning_rate": 0.00015516014234875445, "loss": 0.8657, "step": 2530 }, { "epoch": 1.1248888888888888, "grad_norm": 1.3311814069747925, "learning_rate": 0.0001551423487544484, "loss": 1.1018, "step": 2531 }, { "epoch": 1.1253333333333333, "grad_norm": 1.7124340534210205, "learning_rate": 0.00015512455516014236, "loss": 2.2348, "step": 2532 }, { "epoch": 1.1257777777777778, "grad_norm": 1.6865593194961548, "learning_rate": 0.0001551067615658363, "loss": 2.095, "step": 2533 }, { "epoch": 1.1262222222222222, "grad_norm": 2.115900754928589, "learning_rate": 0.00015508896797153024, "loss": 2.2804, "step": 2534 }, { "epoch": 1.1266666666666667, "grad_norm": 1.720361590385437, "learning_rate": 0.0001550711743772242, "loss": 2.125, "step": 2535 }, { "epoch": 1.1271111111111112, "grad_norm": 1.8980624675750732, "learning_rate": 0.00015505338078291816, "loss": 2.2915, "step": 2536 }, { "epoch": 1.1275555555555556, "grad_norm": 2.2021002769470215, "learning_rate": 0.00015503558718861211, "loss": 2.2995, "step": 2537 }, { "epoch": 1.1280000000000001, "grad_norm": 1.804162621498108, "learning_rate": 0.00015501779359430607, "loss": 2.1668, "step": 2538 }, { "epoch": 1.1284444444444444, "grad_norm": 1.6780054569244385, "learning_rate": 0.000155, "loss": 1.7612, "step": 2539 }, { "epoch": 1.1288888888888888, "grad_norm": 2.008111000061035, "learning_rate": 0.00015498220640569396, "loss": 2.2902, "step": 2540 }, { "epoch": 1.1293333333333333, "grad_norm": 2.181689500808716, "learning_rate": 0.0001549644128113879, "loss": 2.3495, "step": 2541 }, { "epoch": 1.1297777777777778, "grad_norm": 2.064223289489746, "learning_rate": 0.00015494661921708187, "loss": 1.9178, "step": 2542 }, { "epoch": 1.1302222222222222, "grad_norm": 2.1660356521606445, "learning_rate": 0.0001549288256227758, "loss": 2.6801, "step": 2543 }, { "epoch": 1.1306666666666667, "grad_norm": 2.031355142593384, "learning_rate": 0.00015491103202846976, "loss": 2.2943, "step": 2544 }, { "epoch": 1.1311111111111112, "grad_norm": 2.2684144973754883, "learning_rate": 0.0001548932384341637, "loss": 2.5049, "step": 2545 }, { "epoch": 1.1315555555555556, "grad_norm": 2.3677666187286377, "learning_rate": 0.00015487544483985764, "loss": 2.6018, "step": 2546 }, { "epoch": 1.1320000000000001, "grad_norm": 2.2659435272216797, "learning_rate": 0.0001548576512455516, "loss": 2.3025, "step": 2547 }, { "epoch": 1.1324444444444444, "grad_norm": 2.0491015911102295, "learning_rate": 0.00015483985765124555, "loss": 2.3657, "step": 2548 }, { "epoch": 1.1328888888888888, "grad_norm": 1.3263907432556152, "learning_rate": 0.0001548220640569395, "loss": 0.88, "step": 2549 }, { "epoch": 1.1333333333333333, "grad_norm": 2.2537829875946045, "learning_rate": 0.00015480427046263347, "loss": 1.195, "step": 2550 }, { "epoch": 1.1337777777777778, "grad_norm": 1.338564395904541, "learning_rate": 0.00015478647686832742, "loss": 2.866, "step": 2551 }, { "epoch": 1.1342222222222222, "grad_norm": 1.3049834966659546, "learning_rate": 0.00015476868327402135, "loss": 2.7732, "step": 2552 }, { "epoch": 1.1346666666666667, "grad_norm": 0.8578532338142395, "learning_rate": 0.0001547508896797153, "loss": 1.0418, "step": 2553 }, { "epoch": 1.1351111111111112, "grad_norm": 1.3947099447250366, "learning_rate": 0.00015473309608540927, "loss": 2.3232, "step": 2554 }, { "epoch": 1.1355555555555557, "grad_norm": 1.4263209104537964, "learning_rate": 0.00015471530249110322, "loss": 2.4567, "step": 2555 }, { "epoch": 1.1360000000000001, "grad_norm": 1.4634780883789062, "learning_rate": 0.00015469750889679715, "loss": 2.2001, "step": 2556 }, { "epoch": 1.1364444444444444, "grad_norm": 1.4137287139892578, "learning_rate": 0.0001546797153024911, "loss": 2.3393, "step": 2557 }, { "epoch": 1.1368888888888888, "grad_norm": 1.3310433626174927, "learning_rate": 0.00015466192170818507, "loss": 2.1137, "step": 2558 }, { "epoch": 1.1373333333333333, "grad_norm": 1.6760700941085815, "learning_rate": 0.000154644128113879, "loss": 2.6479, "step": 2559 }, { "epoch": 1.1377777777777778, "grad_norm": 1.6810277700424194, "learning_rate": 0.00015462633451957295, "loss": 1.7997, "step": 2560 }, { "epoch": 1.1382222222222222, "grad_norm": 1.40380859375, "learning_rate": 0.0001546085409252669, "loss": 2.1069, "step": 2561 }, { "epoch": 1.1386666666666667, "grad_norm": 1.451357126235962, "learning_rate": 0.00015459074733096086, "loss": 2.0879, "step": 2562 }, { "epoch": 1.1391111111111112, "grad_norm": 1.7103229761123657, "learning_rate": 0.00015457295373665482, "loss": 2.1198, "step": 2563 }, { "epoch": 1.1395555555555554, "grad_norm": 1.3014283180236816, "learning_rate": 0.00015455516014234878, "loss": 1.4409, "step": 2564 }, { "epoch": 1.1400000000000001, "grad_norm": 1.7110216617584229, "learning_rate": 0.0001545373665480427, "loss": 2.064, "step": 2565 }, { "epoch": 1.1404444444444444, "grad_norm": 1.2959778308868408, "learning_rate": 0.00015451957295373666, "loss": 1.9788, "step": 2566 }, { "epoch": 1.1408888888888888, "grad_norm": 1.339138388633728, "learning_rate": 0.00015450177935943062, "loss": 1.525, "step": 2567 }, { "epoch": 1.1413333333333333, "grad_norm": 2.068941116333008, "learning_rate": 0.00015448398576512458, "loss": 2.5298, "step": 2568 }, { "epoch": 1.1417777777777778, "grad_norm": 1.6485881805419922, "learning_rate": 0.0001544661921708185, "loss": 2.0371, "step": 2569 }, { "epoch": 1.1422222222222222, "grad_norm": 1.6411560773849487, "learning_rate": 0.00015444839857651246, "loss": 2.1563, "step": 2570 }, { "epoch": 1.1426666666666667, "grad_norm": 1.8316292762756348, "learning_rate": 0.00015443060498220642, "loss": 1.9724, "step": 2571 }, { "epoch": 1.1431111111111112, "grad_norm": 1.3947020769119263, "learning_rate": 0.00015441281138790035, "loss": 2.1488, "step": 2572 }, { "epoch": 1.1435555555555554, "grad_norm": 1.722806692123413, "learning_rate": 0.0001543950177935943, "loss": 1.9657, "step": 2573 }, { "epoch": 1.144, "grad_norm": 1.944720983505249, "learning_rate": 0.00015437722419928826, "loss": 2.0344, "step": 2574 }, { "epoch": 1.1444444444444444, "grad_norm": 1.69381582736969, "learning_rate": 0.00015435943060498222, "loss": 1.6078, "step": 2575 }, { "epoch": 1.1448888888888888, "grad_norm": 1.678240180015564, "learning_rate": 0.00015434163701067617, "loss": 2.2514, "step": 2576 }, { "epoch": 1.1453333333333333, "grad_norm": 1.8483023643493652, "learning_rate": 0.00015432384341637013, "loss": 2.3157, "step": 2577 }, { "epoch": 1.1457777777777778, "grad_norm": 1.6956913471221924, "learning_rate": 0.00015430604982206406, "loss": 2.2523, "step": 2578 }, { "epoch": 1.1462222222222223, "grad_norm": 1.861174464225769, "learning_rate": 0.00015428825622775802, "loss": 2.5922, "step": 2579 }, { "epoch": 1.1466666666666667, "grad_norm": 1.6170905828475952, "learning_rate": 0.00015427046263345197, "loss": 1.7942, "step": 2580 }, { "epoch": 1.1471111111111112, "grad_norm": 1.5658963918685913, "learning_rate": 0.00015425266903914593, "loss": 2.0127, "step": 2581 }, { "epoch": 1.1475555555555554, "grad_norm": 1.7916998863220215, "learning_rate": 0.00015423487544483986, "loss": 2.2693, "step": 2582 }, { "epoch": 1.148, "grad_norm": 1.8371453285217285, "learning_rate": 0.00015421708185053381, "loss": 1.8348, "step": 2583 }, { "epoch": 1.1484444444444444, "grad_norm": 1.862358808517456, "learning_rate": 0.00015419928825622777, "loss": 2.536, "step": 2584 }, { "epoch": 1.1488888888888888, "grad_norm": 2.036752700805664, "learning_rate": 0.0001541814946619217, "loss": 2.4652, "step": 2585 }, { "epoch": 1.1493333333333333, "grad_norm": 2.268584966659546, "learning_rate": 0.00015416370106761566, "loss": 2.2486, "step": 2586 }, { "epoch": 1.1497777777777778, "grad_norm": 1.7090932130813599, "learning_rate": 0.00015414590747330961, "loss": 1.95, "step": 2587 }, { "epoch": 1.1502222222222223, "grad_norm": 1.6534048318862915, "learning_rate": 0.00015412811387900357, "loss": 1.301, "step": 2588 }, { "epoch": 1.1506666666666667, "grad_norm": 1.7545114755630493, "learning_rate": 0.00015411032028469753, "loss": 1.753, "step": 2589 }, { "epoch": 1.1511111111111112, "grad_norm": 2.4128382205963135, "learning_rate": 0.00015409252669039148, "loss": 2.5481, "step": 2590 }, { "epoch": 1.1515555555555554, "grad_norm": 1.815370798110962, "learning_rate": 0.0001540747330960854, "loss": 2.091, "step": 2591 }, { "epoch": 1.152, "grad_norm": 2.445251941680908, "learning_rate": 0.00015405693950177937, "loss": 2.1765, "step": 2592 }, { "epoch": 1.1524444444444444, "grad_norm": 1.9091752767562866, "learning_rate": 0.00015403914590747333, "loss": 1.5047, "step": 2593 }, { "epoch": 1.1528888888888889, "grad_norm": 2.3893678188323975, "learning_rate": 0.00015402135231316728, "loss": 2.2924, "step": 2594 }, { "epoch": 1.1533333333333333, "grad_norm": 2.1791248321533203, "learning_rate": 0.0001540035587188612, "loss": 2.1891, "step": 2595 }, { "epoch": 1.1537777777777778, "grad_norm": 1.9411416053771973, "learning_rate": 0.00015398576512455517, "loss": 2.2953, "step": 2596 }, { "epoch": 1.1542222222222223, "grad_norm": 2.478189468383789, "learning_rate": 0.00015396797153024912, "loss": 1.9645, "step": 2597 }, { "epoch": 1.1546666666666667, "grad_norm": 2.280930280685425, "learning_rate": 0.00015395017793594305, "loss": 2.2849, "step": 2598 }, { "epoch": 1.1551111111111112, "grad_norm": 2.5622832775115967, "learning_rate": 0.000153932384341637, "loss": 1.4624, "step": 2599 }, { "epoch": 1.1555555555555554, "grad_norm": 2.4844248294830322, "learning_rate": 0.00015391459074733097, "loss": 1.5791, "step": 2600 }, { "epoch": 1.156, "grad_norm": 1.2558754682540894, "learning_rate": 0.00015389679715302492, "loss": 2.402, "step": 2601 }, { "epoch": 1.1564444444444444, "grad_norm": 1.8104875087738037, "learning_rate": 0.00015387900355871888, "loss": 1.0876, "step": 2602 }, { "epoch": 1.1568888888888889, "grad_norm": 1.2649706602096558, "learning_rate": 0.0001538612099644128, "loss": 2.1157, "step": 2603 }, { "epoch": 1.1573333333333333, "grad_norm": 1.4647830724716187, "learning_rate": 0.00015384341637010677, "loss": 2.2213, "step": 2604 }, { "epoch": 1.1577777777777778, "grad_norm": 1.4536770582199097, "learning_rate": 0.00015382562277580072, "loss": 2.0763, "step": 2605 }, { "epoch": 1.1582222222222223, "grad_norm": 1.4016244411468506, "learning_rate": 0.00015380782918149468, "loss": 2.3909, "step": 2606 }, { "epoch": 1.1586666666666667, "grad_norm": 1.4496042728424072, "learning_rate": 0.00015379003558718864, "loss": 1.9667, "step": 2607 }, { "epoch": 1.1591111111111112, "grad_norm": 1.384210228919983, "learning_rate": 0.00015377224199288256, "loss": 2.8226, "step": 2608 }, { "epoch": 1.1595555555555555, "grad_norm": 1.5709418058395386, "learning_rate": 0.00015375444839857652, "loss": 2.7716, "step": 2609 }, { "epoch": 1.16, "grad_norm": 1.6072601079940796, "learning_rate": 0.00015373665480427045, "loss": 2.2285, "step": 2610 }, { "epoch": 1.1604444444444444, "grad_norm": 1.4224820137023926, "learning_rate": 0.0001537188612099644, "loss": 2.049, "step": 2611 }, { "epoch": 1.1608888888888889, "grad_norm": 1.5850938558578491, "learning_rate": 0.00015370106761565836, "loss": 2.326, "step": 2612 }, { "epoch": 1.1613333333333333, "grad_norm": 1.3699077367782593, "learning_rate": 0.00015368327402135232, "loss": 2.1054, "step": 2613 }, { "epoch": 1.1617777777777778, "grad_norm": 1.5815706253051758, "learning_rate": 0.00015366548042704628, "loss": 2.4236, "step": 2614 }, { "epoch": 1.1622222222222223, "grad_norm": 1.3918722867965698, "learning_rate": 0.00015364768683274023, "loss": 2.2995, "step": 2615 }, { "epoch": 1.1626666666666667, "grad_norm": 1.4517208337783813, "learning_rate": 0.00015362989323843416, "loss": 1.8738, "step": 2616 }, { "epoch": 1.1631111111111112, "grad_norm": 1.84287691116333, "learning_rate": 0.00015361209964412812, "loss": 1.3763, "step": 2617 }, { "epoch": 1.1635555555555555, "grad_norm": 1.6634745597839355, "learning_rate": 0.00015359430604982208, "loss": 2.3855, "step": 2618 }, { "epoch": 1.164, "grad_norm": 1.544952154159546, "learning_rate": 0.00015357651245551603, "loss": 1.9593, "step": 2619 }, { "epoch": 1.1644444444444444, "grad_norm": 1.4931992292404175, "learning_rate": 0.00015355871886121, "loss": 2.1309, "step": 2620 }, { "epoch": 1.1648888888888889, "grad_norm": 1.7971177101135254, "learning_rate": 0.00015354092526690392, "loss": 2.2084, "step": 2621 }, { "epoch": 1.1653333333333333, "grad_norm": 1.4012914896011353, "learning_rate": 0.00015352313167259787, "loss": 1.5844, "step": 2622 }, { "epoch": 1.1657777777777778, "grad_norm": 1.5000579357147217, "learning_rate": 0.0001535053380782918, "loss": 1.8046, "step": 2623 }, { "epoch": 1.1662222222222223, "grad_norm": 1.830424427986145, "learning_rate": 0.00015348754448398576, "loss": 2.6892, "step": 2624 }, { "epoch": 1.1666666666666667, "grad_norm": 1.5387390851974487, "learning_rate": 0.00015346975088967972, "loss": 1.9499, "step": 2625 }, { "epoch": 1.1671111111111112, "grad_norm": 1.5739384889602661, "learning_rate": 0.00015345195729537367, "loss": 2.0548, "step": 2626 }, { "epoch": 1.1675555555555555, "grad_norm": 1.7389553785324097, "learning_rate": 0.00015343416370106763, "loss": 2.3472, "step": 2627 }, { "epoch": 1.168, "grad_norm": 1.8690800666809082, "learning_rate": 0.00015341637010676159, "loss": 2.5385, "step": 2628 }, { "epoch": 1.1684444444444444, "grad_norm": 1.6705001592636108, "learning_rate": 0.00015339857651245552, "loss": 2.2232, "step": 2629 }, { "epoch": 1.1688888888888889, "grad_norm": 1.811651587486267, "learning_rate": 0.00015338078291814947, "loss": 2.3896, "step": 2630 }, { "epoch": 1.1693333333333333, "grad_norm": 2.812932252883911, "learning_rate": 0.00015336298932384343, "loss": 2.3391, "step": 2631 }, { "epoch": 1.1697777777777778, "grad_norm": 1.434435248374939, "learning_rate": 0.00015334519572953739, "loss": 0.9854, "step": 2632 }, { "epoch": 1.1702222222222223, "grad_norm": 1.7616302967071533, "learning_rate": 0.00015332740213523134, "loss": 2.0482, "step": 2633 }, { "epoch": 1.1706666666666667, "grad_norm": 2.4221534729003906, "learning_rate": 0.00015330960854092527, "loss": 2.338, "step": 2634 }, { "epoch": 1.1711111111111112, "grad_norm": 1.9369844198226929, "learning_rate": 0.00015329181494661923, "loss": 1.8924, "step": 2635 }, { "epoch": 1.1715555555555555, "grad_norm": 2.09136700630188, "learning_rate": 0.00015327402135231316, "loss": 2.5067, "step": 2636 }, { "epoch": 1.172, "grad_norm": 1.7432854175567627, "learning_rate": 0.0001532562277580071, "loss": 1.6748, "step": 2637 }, { "epoch": 1.1724444444444444, "grad_norm": 1.8707739114761353, "learning_rate": 0.00015323843416370107, "loss": 2.3218, "step": 2638 }, { "epoch": 1.1728888888888889, "grad_norm": 1.9009658098220825, "learning_rate": 0.00015322064056939503, "loss": 2.1614, "step": 2639 }, { "epoch": 1.1733333333333333, "grad_norm": 2.321162462234497, "learning_rate": 0.00015320284697508898, "loss": 2.0851, "step": 2640 }, { "epoch": 1.1737777777777778, "grad_norm": 1.7875491380691528, "learning_rate": 0.00015318505338078294, "loss": 1.7379, "step": 2641 }, { "epoch": 1.1742222222222223, "grad_norm": 1.9961577653884888, "learning_rate": 0.00015316725978647687, "loss": 2.4488, "step": 2642 }, { "epoch": 1.1746666666666667, "grad_norm": 2.089043617248535, "learning_rate": 0.00015314946619217083, "loss": 2.2587, "step": 2643 }, { "epoch": 1.1751111111111112, "grad_norm": 2.016988754272461, "learning_rate": 0.00015313167259786478, "loss": 2.3505, "step": 2644 }, { "epoch": 1.1755555555555555, "grad_norm": 3.9953866004943848, "learning_rate": 0.00015311387900355874, "loss": 2.6823, "step": 2645 }, { "epoch": 1.176, "grad_norm": 2.324265956878662, "learning_rate": 0.00015309608540925267, "loss": 2.0942, "step": 2646 }, { "epoch": 1.1764444444444444, "grad_norm": 1.8716621398925781, "learning_rate": 0.00015307829181494662, "loss": 1.8976, "step": 2647 }, { "epoch": 1.1768888888888889, "grad_norm": 2.1721549034118652, "learning_rate": 0.00015306049822064058, "loss": 2.3372, "step": 2648 }, { "epoch": 1.1773333333333333, "grad_norm": 2.4310812950134277, "learning_rate": 0.0001530427046263345, "loss": 2.3681, "step": 2649 }, { "epoch": 1.1777777777777778, "grad_norm": 2.1429104804992676, "learning_rate": 0.00015302491103202847, "loss": 1.3194, "step": 2650 }, { "epoch": 1.1782222222222223, "grad_norm": 1.4743183851242065, "learning_rate": 0.00015300711743772242, "loss": 2.7132, "step": 2651 }, { "epoch": 1.1786666666666668, "grad_norm": 1.449602723121643, "learning_rate": 0.00015298932384341638, "loss": 2.0043, "step": 2652 }, { "epoch": 1.1791111111111112, "grad_norm": 1.5418530702590942, "learning_rate": 0.00015297153024911034, "loss": 2.4505, "step": 2653 }, { "epoch": 1.1795555555555555, "grad_norm": 1.408302664756775, "learning_rate": 0.0001529537366548043, "loss": 2.1342, "step": 2654 }, { "epoch": 1.18, "grad_norm": 1.6608649492263794, "learning_rate": 0.00015293594306049822, "loss": 2.3212, "step": 2655 }, { "epoch": 1.1804444444444444, "grad_norm": 1.560037612915039, "learning_rate": 0.00015291814946619218, "loss": 2.9029, "step": 2656 }, { "epoch": 1.1808888888888889, "grad_norm": 1.5058655738830566, "learning_rate": 0.00015290035587188613, "loss": 2.5045, "step": 2657 }, { "epoch": 1.1813333333333333, "grad_norm": 1.5224006175994873, "learning_rate": 0.0001528825622775801, "loss": 2.2639, "step": 2658 }, { "epoch": 1.1817777777777778, "grad_norm": 1.611315131187439, "learning_rate": 0.00015286476868327402, "loss": 2.594, "step": 2659 }, { "epoch": 1.1822222222222223, "grad_norm": 1.6001996994018555, "learning_rate": 0.00015284697508896798, "loss": 2.6139, "step": 2660 }, { "epoch": 1.1826666666666668, "grad_norm": 1.5929144620895386, "learning_rate": 0.00015282918149466193, "loss": 2.6785, "step": 2661 }, { "epoch": 1.1831111111111112, "grad_norm": 1.6971992254257202, "learning_rate": 0.00015281138790035586, "loss": 2.364, "step": 2662 }, { "epoch": 1.1835555555555555, "grad_norm": 1.468849539756775, "learning_rate": 0.00015279359430604982, "loss": 1.773, "step": 2663 }, { "epoch": 1.184, "grad_norm": 1.352769374847412, "learning_rate": 0.00015277580071174378, "loss": 2.5955, "step": 2664 }, { "epoch": 1.1844444444444444, "grad_norm": 1.4742112159729004, "learning_rate": 0.00015275800711743773, "loss": 1.9913, "step": 2665 }, { "epoch": 1.1848888888888889, "grad_norm": 1.3926454782485962, "learning_rate": 0.0001527402135231317, "loss": 1.8751, "step": 2666 }, { "epoch": 1.1853333333333333, "grad_norm": 1.6356984376907349, "learning_rate": 0.00015272241992882565, "loss": 2.353, "step": 2667 }, { "epoch": 1.1857777777777778, "grad_norm": 1.5044867992401123, "learning_rate": 0.00015270462633451958, "loss": 1.988, "step": 2668 }, { "epoch": 1.1862222222222223, "grad_norm": 1.8624123334884644, "learning_rate": 0.00015268683274021353, "loss": 2.5895, "step": 2669 }, { "epoch": 1.1866666666666668, "grad_norm": 1.6830346584320068, "learning_rate": 0.0001526690391459075, "loss": 1.3593, "step": 2670 }, { "epoch": 1.1871111111111112, "grad_norm": 1.6659824848175049, "learning_rate": 0.00015265124555160144, "loss": 1.8984, "step": 2671 }, { "epoch": 1.1875555555555555, "grad_norm": 1.5918940305709839, "learning_rate": 0.00015263345195729537, "loss": 2.0415, "step": 2672 }, { "epoch": 1.188, "grad_norm": 1.8598551750183105, "learning_rate": 0.00015261565836298933, "loss": 2.1718, "step": 2673 }, { "epoch": 1.1884444444444444, "grad_norm": 1.764703631401062, "learning_rate": 0.0001525978647686833, "loss": 2.0413, "step": 2674 }, { "epoch": 1.1888888888888889, "grad_norm": 1.7580000162124634, "learning_rate": 0.00015258007117437722, "loss": 2.1583, "step": 2675 }, { "epoch": 1.1893333333333334, "grad_norm": 1.7123738527297974, "learning_rate": 0.00015256227758007117, "loss": 2.0192, "step": 2676 }, { "epoch": 1.1897777777777778, "grad_norm": 2.88462233543396, "learning_rate": 0.00015254448398576513, "loss": 1.9526, "step": 2677 }, { "epoch": 1.1902222222222223, "grad_norm": 1.6699835062026978, "learning_rate": 0.00015252669039145909, "loss": 2.0907, "step": 2678 }, { "epoch": 1.1906666666666668, "grad_norm": 2.0208330154418945, "learning_rate": 0.00015250889679715304, "loss": 2.3712, "step": 2679 }, { "epoch": 1.1911111111111112, "grad_norm": 1.9458266496658325, "learning_rate": 0.000152491103202847, "loss": 2.1126, "step": 2680 }, { "epoch": 1.1915555555555555, "grad_norm": 1.6024980545043945, "learning_rate": 0.00015247330960854093, "loss": 1.8988, "step": 2681 }, { "epoch": 1.192, "grad_norm": 1.7899705171585083, "learning_rate": 0.00015245551601423488, "loss": 2.0417, "step": 2682 }, { "epoch": 1.1924444444444444, "grad_norm": 1.6227293014526367, "learning_rate": 0.00015243772241992884, "loss": 2.1364, "step": 2683 }, { "epoch": 1.1928888888888889, "grad_norm": 1.7193636894226074, "learning_rate": 0.0001524199288256228, "loss": 1.9889, "step": 2684 }, { "epoch": 1.1933333333333334, "grad_norm": 1.7960073947906494, "learning_rate": 0.00015240213523131673, "loss": 2.0971, "step": 2685 }, { "epoch": 1.1937777777777778, "grad_norm": 1.8026853799819946, "learning_rate": 0.00015238434163701068, "loss": 2.0781, "step": 2686 }, { "epoch": 1.1942222222222223, "grad_norm": 1.0487536191940308, "learning_rate": 0.00015236654804270464, "loss": 0.076, "step": 2687 }, { "epoch": 1.1946666666666665, "grad_norm": 2.0273492336273193, "learning_rate": 0.00015234875444839857, "loss": 2.2794, "step": 2688 }, { "epoch": 1.1951111111111112, "grad_norm": 1.8268475532531738, "learning_rate": 0.00015233096085409253, "loss": 1.9992, "step": 2689 }, { "epoch": 1.1955555555555555, "grad_norm": 1.6929086446762085, "learning_rate": 0.00015231316725978648, "loss": 2.0807, "step": 2690 }, { "epoch": 1.196, "grad_norm": 2.158275842666626, "learning_rate": 0.00015229537366548044, "loss": 2.2911, "step": 2691 }, { "epoch": 1.1964444444444444, "grad_norm": 1.98186457157135, "learning_rate": 0.0001522775800711744, "loss": 2.0243, "step": 2692 }, { "epoch": 1.196888888888889, "grad_norm": 1.696062445640564, "learning_rate": 0.00015225978647686832, "loss": 1.8214, "step": 2693 }, { "epoch": 1.1973333333333334, "grad_norm": 2.215367078781128, "learning_rate": 0.00015224199288256228, "loss": 2.1777, "step": 2694 }, { "epoch": 1.1977777777777778, "grad_norm": 2.0742318630218506, "learning_rate": 0.00015222419928825624, "loss": 2.1899, "step": 2695 }, { "epoch": 1.1982222222222223, "grad_norm": 2.0556631088256836, "learning_rate": 0.0001522064056939502, "loss": 2.1347, "step": 2696 }, { "epoch": 1.1986666666666665, "grad_norm": 2.068554162979126, "learning_rate": 0.00015218861209964415, "loss": 1.8877, "step": 2697 }, { "epoch": 1.199111111111111, "grad_norm": 2.118912696838379, "learning_rate": 0.00015217081850533808, "loss": 2.1662, "step": 2698 }, { "epoch": 1.1995555555555555, "grad_norm": 1.5802162885665894, "learning_rate": 0.00015215302491103204, "loss": 0.963, "step": 2699 }, { "epoch": 1.2, "grad_norm": 2.575432062149048, "learning_rate": 0.00015213523131672597, "loss": 2.3078, "step": 2700 }, { "epoch": 1.2004444444444444, "grad_norm": 1.573493480682373, "learning_rate": 0.00015211743772241992, "loss": 1.3132, "step": 2701 }, { "epoch": 1.200888888888889, "grad_norm": 1.0483604669570923, "learning_rate": 0.00015209964412811388, "loss": 1.0147, "step": 2702 }, { "epoch": 1.2013333333333334, "grad_norm": 0.8470864295959473, "learning_rate": 0.00015208185053380784, "loss": 0.9255, "step": 2703 }, { "epoch": 1.2017777777777778, "grad_norm": 1.5103946924209595, "learning_rate": 0.0001520640569395018, "loss": 1.7368, "step": 2704 }, { "epoch": 1.2022222222222223, "grad_norm": 1.2146368026733398, "learning_rate": 0.00015204626334519575, "loss": 1.5886, "step": 2705 }, { "epoch": 1.2026666666666666, "grad_norm": 1.3831676244735718, "learning_rate": 0.00015202846975088968, "loss": 2.1525, "step": 2706 }, { "epoch": 1.203111111111111, "grad_norm": 1.5528510808944702, "learning_rate": 0.00015201067615658363, "loss": 2.5657, "step": 2707 }, { "epoch": 1.2035555555555555, "grad_norm": 1.551809549331665, "learning_rate": 0.0001519928825622776, "loss": 2.2547, "step": 2708 }, { "epoch": 1.204, "grad_norm": 1.305998682975769, "learning_rate": 0.00015197508896797155, "loss": 1.3105, "step": 2709 }, { "epoch": 1.2044444444444444, "grad_norm": 1.868577480316162, "learning_rate": 0.0001519572953736655, "loss": 2.6362, "step": 2710 }, { "epoch": 1.204888888888889, "grad_norm": 1.6573137044906616, "learning_rate": 0.00015193950177935943, "loss": 2.3689, "step": 2711 }, { "epoch": 1.2053333333333334, "grad_norm": 1.4828029870986938, "learning_rate": 0.0001519217081850534, "loss": 1.2541, "step": 2712 }, { "epoch": 1.2057777777777778, "grad_norm": 1.4731237888336182, "learning_rate": 0.00015190391459074732, "loss": 2.0792, "step": 2713 }, { "epoch": 1.2062222222222223, "grad_norm": 1.6513289213180542, "learning_rate": 0.00015188612099644128, "loss": 2.2811, "step": 2714 }, { "epoch": 1.2066666666666666, "grad_norm": 1.5226035118103027, "learning_rate": 0.00015186832740213523, "loss": 2.117, "step": 2715 }, { "epoch": 1.207111111111111, "grad_norm": 1.672688603401184, "learning_rate": 0.0001518505338078292, "loss": 2.0124, "step": 2716 }, { "epoch": 1.2075555555555555, "grad_norm": 1.6700776815414429, "learning_rate": 0.00015183274021352315, "loss": 2.0039, "step": 2717 }, { "epoch": 1.208, "grad_norm": 1.613197922706604, "learning_rate": 0.0001518149466192171, "loss": 2.182, "step": 2718 }, { "epoch": 1.2084444444444444, "grad_norm": 1.8841910362243652, "learning_rate": 0.00015179715302491103, "loss": 2.4719, "step": 2719 }, { "epoch": 1.208888888888889, "grad_norm": 1.5783162117004395, "learning_rate": 0.000151779359430605, "loss": 2.2804, "step": 2720 }, { "epoch": 1.2093333333333334, "grad_norm": 1.5242904424667358, "learning_rate": 0.00015176156583629894, "loss": 2.322, "step": 2721 }, { "epoch": 1.2097777777777778, "grad_norm": 1.4535586833953857, "learning_rate": 0.0001517437722419929, "loss": 1.9133, "step": 2722 }, { "epoch": 1.2102222222222223, "grad_norm": 2.0285212993621826, "learning_rate": 0.00015172597864768686, "loss": 2.0526, "step": 2723 }, { "epoch": 1.2106666666666666, "grad_norm": 1.6039782762527466, "learning_rate": 0.0001517081850533808, "loss": 1.8729, "step": 2724 }, { "epoch": 1.211111111111111, "grad_norm": 1.4324172735214233, "learning_rate": 0.00015169039145907474, "loss": 1.6432, "step": 2725 }, { "epoch": 1.2115555555555555, "grad_norm": 1.6351962089538574, "learning_rate": 0.00015167259786476867, "loss": 2.4551, "step": 2726 }, { "epoch": 1.212, "grad_norm": 1.7832766771316528, "learning_rate": 0.00015165480427046263, "loss": 2.1844, "step": 2727 }, { "epoch": 1.2124444444444444, "grad_norm": 1.8114533424377441, "learning_rate": 0.00015163701067615659, "loss": 2.1506, "step": 2728 }, { "epoch": 1.212888888888889, "grad_norm": 1.875593900680542, "learning_rate": 0.00015161921708185054, "loss": 2.0954, "step": 2729 }, { "epoch": 1.2133333333333334, "grad_norm": 1.673852562904358, "learning_rate": 0.0001516014234875445, "loss": 2.1643, "step": 2730 }, { "epoch": 1.2137777777777778, "grad_norm": 1.6588380336761475, "learning_rate": 0.00015158362989323845, "loss": 1.6013, "step": 2731 }, { "epoch": 1.2142222222222223, "grad_norm": 1.8047765493392944, "learning_rate": 0.00015156583629893238, "loss": 2.441, "step": 2732 }, { "epoch": 1.2146666666666666, "grad_norm": 1.8153445720672607, "learning_rate": 0.00015154804270462634, "loss": 2.2575, "step": 2733 }, { "epoch": 1.215111111111111, "grad_norm": 1.9475229978561401, "learning_rate": 0.0001515302491103203, "loss": 2.0499, "step": 2734 }, { "epoch": 1.2155555555555555, "grad_norm": 1.8662759065628052, "learning_rate": 0.00015151245551601425, "loss": 1.9091, "step": 2735 }, { "epoch": 1.216, "grad_norm": 1.9955482482910156, "learning_rate": 0.0001514946619217082, "loss": 2.5757, "step": 2736 }, { "epoch": 1.2164444444444444, "grad_norm": 1.897420883178711, "learning_rate": 0.00015147686832740214, "loss": 1.8366, "step": 2737 }, { "epoch": 1.216888888888889, "grad_norm": 1.6787134408950806, "learning_rate": 0.0001514590747330961, "loss": 0.962, "step": 2738 }, { "epoch": 1.2173333333333334, "grad_norm": 1.790507197380066, "learning_rate": 0.00015144128113879003, "loss": 2.1025, "step": 2739 }, { "epoch": 1.2177777777777778, "grad_norm": 2.025376558303833, "learning_rate": 0.00015142348754448398, "loss": 2.4571, "step": 2740 }, { "epoch": 1.2182222222222223, "grad_norm": 2.094409942626953, "learning_rate": 0.00015140569395017794, "loss": 2.2626, "step": 2741 }, { "epoch": 1.2186666666666666, "grad_norm": 2.1918323040008545, "learning_rate": 0.0001513879003558719, "loss": 2.4293, "step": 2742 }, { "epoch": 1.219111111111111, "grad_norm": 2.2895400524139404, "learning_rate": 0.00015137010676156585, "loss": 2.4093, "step": 2743 }, { "epoch": 1.2195555555555555, "grad_norm": 2.0072021484375, "learning_rate": 0.0001513523131672598, "loss": 2.3826, "step": 2744 }, { "epoch": 1.22, "grad_norm": 2.0313949584960938, "learning_rate": 0.00015133451957295374, "loss": 1.6368, "step": 2745 }, { "epoch": 1.2204444444444444, "grad_norm": 2.9698612689971924, "learning_rate": 0.0001513167259786477, "loss": 2.5953, "step": 2746 }, { "epoch": 1.220888888888889, "grad_norm": 2.604888439178467, "learning_rate": 0.00015129893238434165, "loss": 2.9221, "step": 2747 }, { "epoch": 1.2213333333333334, "grad_norm": 2.023358106613159, "learning_rate": 0.0001512811387900356, "loss": 1.8182, "step": 2748 }, { "epoch": 1.2217777777777779, "grad_norm": 2.1045310497283936, "learning_rate": 0.00015126334519572956, "loss": 1.6869, "step": 2749 }, { "epoch": 1.2222222222222223, "grad_norm": 2.9360511302948, "learning_rate": 0.0001512455516014235, "loss": 2.0189, "step": 2750 }, { "epoch": 1.2226666666666666, "grad_norm": 1.2831294536590576, "learning_rate": 0.00015122775800711745, "loss": 2.4483, "step": 2751 }, { "epoch": 1.223111111111111, "grad_norm": 1.1311028003692627, "learning_rate": 0.00015120996441281138, "loss": 1.2781, "step": 2752 }, { "epoch": 1.2235555555555555, "grad_norm": 1.4160290956497192, "learning_rate": 0.00015119217081850534, "loss": 2.2221, "step": 2753 }, { "epoch": 1.224, "grad_norm": 1.3835599422454834, "learning_rate": 0.0001511743772241993, "loss": 2.2433, "step": 2754 }, { "epoch": 1.2244444444444444, "grad_norm": 1.6253594160079956, "learning_rate": 0.00015115658362989325, "loss": 1.8161, "step": 2755 }, { "epoch": 1.224888888888889, "grad_norm": 1.227462649345398, "learning_rate": 0.0001511387900355872, "loss": 1.9292, "step": 2756 }, { "epoch": 1.2253333333333334, "grad_norm": 1.425794005393982, "learning_rate": 0.00015112099644128116, "loss": 1.6751, "step": 2757 }, { "epoch": 1.2257777777777779, "grad_norm": 0.8352449536323547, "learning_rate": 0.0001511032028469751, "loss": 0.042, "step": 2758 }, { "epoch": 1.2262222222222223, "grad_norm": 1.41720449924469, "learning_rate": 0.00015108540925266905, "loss": 2.2062, "step": 2759 }, { "epoch": 1.2266666666666666, "grad_norm": 1.4105916023254395, "learning_rate": 0.000151067615658363, "loss": 1.8583, "step": 2760 }, { "epoch": 1.227111111111111, "grad_norm": 1.653696060180664, "learning_rate": 0.00015104982206405696, "loss": 1.4183, "step": 2761 }, { "epoch": 1.2275555555555555, "grad_norm": 1.5550695657730103, "learning_rate": 0.0001510320284697509, "loss": 2.0958, "step": 2762 }, { "epoch": 1.228, "grad_norm": 1.5534552335739136, "learning_rate": 0.00015101423487544485, "loss": 1.1471, "step": 2763 }, { "epoch": 1.2284444444444444, "grad_norm": 1.956020474433899, "learning_rate": 0.0001509964412811388, "loss": 2.408, "step": 2764 }, { "epoch": 1.228888888888889, "grad_norm": 1.5008649826049805, "learning_rate": 0.00015097864768683273, "loss": 1.9145, "step": 2765 }, { "epoch": 1.2293333333333334, "grad_norm": 1.4730578660964966, "learning_rate": 0.0001509608540925267, "loss": 1.8439, "step": 2766 }, { "epoch": 1.2297777777777779, "grad_norm": 1.7233079671859741, "learning_rate": 0.00015094306049822064, "loss": 2.4389, "step": 2767 }, { "epoch": 1.2302222222222223, "grad_norm": 1.4450547695159912, "learning_rate": 0.0001509252669039146, "loss": 2.1717, "step": 2768 }, { "epoch": 1.2306666666666666, "grad_norm": 1.7250124216079712, "learning_rate": 0.00015090747330960856, "loss": 2.198, "step": 2769 }, { "epoch": 1.231111111111111, "grad_norm": 2.009876012802124, "learning_rate": 0.00015088967971530251, "loss": 2.6704, "step": 2770 }, { "epoch": 1.2315555555555555, "grad_norm": 1.9580446481704712, "learning_rate": 0.00015087188612099644, "loss": 1.803, "step": 2771 }, { "epoch": 1.232, "grad_norm": 1.7807945013046265, "learning_rate": 0.0001508540925266904, "loss": 2.401, "step": 2772 }, { "epoch": 1.2324444444444445, "grad_norm": 1.9631510972976685, "learning_rate": 0.00015083629893238436, "loss": 2.762, "step": 2773 }, { "epoch": 1.232888888888889, "grad_norm": 1.7688226699829102, "learning_rate": 0.0001508185053380783, "loss": 2.5009, "step": 2774 }, { "epoch": 1.2333333333333334, "grad_norm": 1.898895263671875, "learning_rate": 0.00015080071174377224, "loss": 2.1815, "step": 2775 }, { "epoch": 1.2337777777777779, "grad_norm": 1.748230218887329, "learning_rate": 0.0001507829181494662, "loss": 2.1702, "step": 2776 }, { "epoch": 1.2342222222222223, "grad_norm": 1.829336404800415, "learning_rate": 0.00015076512455516016, "loss": 2.6297, "step": 2777 }, { "epoch": 1.2346666666666666, "grad_norm": 1.7293047904968262, "learning_rate": 0.00015074733096085409, "loss": 2.0303, "step": 2778 }, { "epoch": 1.235111111111111, "grad_norm": 1.6625522375106812, "learning_rate": 0.00015072953736654804, "loss": 1.9354, "step": 2779 }, { "epoch": 1.2355555555555555, "grad_norm": 1.653939127922058, "learning_rate": 0.000150711743772242, "loss": 1.7279, "step": 2780 }, { "epoch": 1.236, "grad_norm": 1.7402019500732422, "learning_rate": 0.00015069395017793595, "loss": 1.8008, "step": 2781 }, { "epoch": 1.2364444444444445, "grad_norm": 1.5978055000305176, "learning_rate": 0.0001506761565836299, "loss": 2.0594, "step": 2782 }, { "epoch": 1.236888888888889, "grad_norm": 1.8129159212112427, "learning_rate": 0.00015065836298932384, "loss": 1.8207, "step": 2783 }, { "epoch": 1.2373333333333334, "grad_norm": 1.7434604167938232, "learning_rate": 0.0001506405693950178, "loss": 1.9957, "step": 2784 }, { "epoch": 1.2377777777777779, "grad_norm": 1.829714059829712, "learning_rate": 0.00015062277580071175, "loss": 2.2293, "step": 2785 }, { "epoch": 1.2382222222222223, "grad_norm": 2.231995105743408, "learning_rate": 0.0001506049822064057, "loss": 2.5919, "step": 2786 }, { "epoch": 1.2386666666666666, "grad_norm": 2.2278225421905518, "learning_rate": 0.00015058718861209967, "loss": 2.299, "step": 2787 }, { "epoch": 1.239111111111111, "grad_norm": 1.5640493631362915, "learning_rate": 0.0001505693950177936, "loss": 1.2429, "step": 2788 }, { "epoch": 1.2395555555555555, "grad_norm": 1.987496256828308, "learning_rate": 0.00015055160142348755, "loss": 1.1852, "step": 2789 }, { "epoch": 1.24, "grad_norm": 1.910750389099121, "learning_rate": 0.00015053380782918148, "loss": 1.7426, "step": 2790 }, { "epoch": 1.2404444444444445, "grad_norm": 1.7743321657180786, "learning_rate": 0.00015051601423487544, "loss": 1.7287, "step": 2791 }, { "epoch": 1.240888888888889, "grad_norm": 1.9322333335876465, "learning_rate": 0.0001504982206405694, "loss": 2.3926, "step": 2792 }, { "epoch": 1.2413333333333334, "grad_norm": 1.9463812112808228, "learning_rate": 0.00015048042704626335, "loss": 2.2058, "step": 2793 }, { "epoch": 1.2417777777777779, "grad_norm": 1.9254072904586792, "learning_rate": 0.0001504626334519573, "loss": 2.167, "step": 2794 }, { "epoch": 1.2422222222222223, "grad_norm": 2.315269947052002, "learning_rate": 0.00015044483985765126, "loss": 1.988, "step": 2795 }, { "epoch": 1.2426666666666666, "grad_norm": 2.6116104125976562, "learning_rate": 0.0001504270462633452, "loss": 2.5243, "step": 2796 }, { "epoch": 1.243111111111111, "grad_norm": 2.500777244567871, "learning_rate": 0.00015040925266903915, "loss": 2.4406, "step": 2797 }, { "epoch": 1.2435555555555555, "grad_norm": 2.5886335372924805, "learning_rate": 0.0001503914590747331, "loss": 1.7912, "step": 2798 }, { "epoch": 1.244, "grad_norm": 2.798053503036499, "learning_rate": 0.00015037366548042706, "loss": 1.7156, "step": 2799 }, { "epoch": 1.2444444444444445, "grad_norm": 3.800767660140991, "learning_rate": 0.00015035587188612102, "loss": 1.2951, "step": 2800 }, { "epoch": 1.244888888888889, "grad_norm": 1.2219860553741455, "learning_rate": 0.00015033807829181495, "loss": 2.3513, "step": 2801 }, { "epoch": 1.2453333333333334, "grad_norm": 1.1778558492660522, "learning_rate": 0.0001503202846975089, "loss": 2.2511, "step": 2802 }, { "epoch": 1.2457777777777779, "grad_norm": 1.369297981262207, "learning_rate": 0.00015030249110320283, "loss": 1.935, "step": 2803 }, { "epoch": 1.2462222222222223, "grad_norm": 1.6750556230545044, "learning_rate": 0.0001502846975088968, "loss": 2.3057, "step": 2804 }, { "epoch": 1.2466666666666666, "grad_norm": 1.4354381561279297, "learning_rate": 0.00015026690391459075, "loss": 2.4099, "step": 2805 }, { "epoch": 1.247111111111111, "grad_norm": 1.3664571046829224, "learning_rate": 0.0001502491103202847, "loss": 2.0848, "step": 2806 }, { "epoch": 1.2475555555555555, "grad_norm": 1.3316177129745483, "learning_rate": 0.00015023131672597866, "loss": 2.1252, "step": 2807 }, { "epoch": 1.248, "grad_norm": 1.3470760583877563, "learning_rate": 0.00015021352313167262, "loss": 2.3662, "step": 2808 }, { "epoch": 1.2484444444444445, "grad_norm": 1.4858183860778809, "learning_rate": 0.00015019572953736655, "loss": 2.4822, "step": 2809 }, { "epoch": 1.248888888888889, "grad_norm": 1.4876043796539307, "learning_rate": 0.0001501779359430605, "loss": 2.2044, "step": 2810 }, { "epoch": 1.2493333333333334, "grad_norm": 1.807070016860962, "learning_rate": 0.00015016014234875446, "loss": 2.438, "step": 2811 }, { "epoch": 1.2497777777777777, "grad_norm": 1.5529999732971191, "learning_rate": 0.00015014234875444842, "loss": 2.2415, "step": 2812 }, { "epoch": 1.2502222222222223, "grad_norm": 1.5677090883255005, "learning_rate": 0.00015012455516014237, "loss": 2.1466, "step": 2813 }, { "epoch": 1.2506666666666666, "grad_norm": 1.4437453746795654, "learning_rate": 0.0001501067615658363, "loss": 2.1075, "step": 2814 }, { "epoch": 1.251111111111111, "grad_norm": 1.7834696769714355, "learning_rate": 0.00015008896797153026, "loss": 2.6351, "step": 2815 }, { "epoch": 1.2515555555555555, "grad_norm": 1.593764305114746, "learning_rate": 0.0001500711743772242, "loss": 2.1957, "step": 2816 }, { "epoch": 1.252, "grad_norm": 1.5799425840377808, "learning_rate": 0.00015005338078291814, "loss": 2.7452, "step": 2817 }, { "epoch": 1.2524444444444445, "grad_norm": 1.6034505367279053, "learning_rate": 0.0001500355871886121, "loss": 1.7773, "step": 2818 }, { "epoch": 1.252888888888889, "grad_norm": 1.1954715251922607, "learning_rate": 0.00015001779359430606, "loss": 1.204, "step": 2819 }, { "epoch": 1.2533333333333334, "grad_norm": 1.638249158859253, "learning_rate": 0.00015000000000000001, "loss": 2.394, "step": 2820 }, { "epoch": 1.2537777777777777, "grad_norm": 1.640615463256836, "learning_rate": 0.00014998220640569397, "loss": 2.6767, "step": 2821 }, { "epoch": 1.2542222222222223, "grad_norm": 1.6124838590621948, "learning_rate": 0.0001499644128113879, "loss": 2.2059, "step": 2822 }, { "epoch": 1.2546666666666666, "grad_norm": 1.6705976724624634, "learning_rate": 0.00014994661921708186, "loss": 2.2517, "step": 2823 }, { "epoch": 1.255111111111111, "grad_norm": 1.4649361371994019, "learning_rate": 0.0001499288256227758, "loss": 1.9224, "step": 2824 }, { "epoch": 1.2555555555555555, "grad_norm": 1.701545238494873, "learning_rate": 0.00014991103202846977, "loss": 2.2677, "step": 2825 }, { "epoch": 1.256, "grad_norm": 1.726928949356079, "learning_rate": 0.00014989323843416373, "loss": 2.1658, "step": 2826 }, { "epoch": 1.2564444444444445, "grad_norm": 1.6811003684997559, "learning_rate": 0.00014987544483985766, "loss": 2.0167, "step": 2827 }, { "epoch": 1.256888888888889, "grad_norm": 1.8641170263290405, "learning_rate": 0.0001498576512455516, "loss": 1.7901, "step": 2828 }, { "epoch": 1.2573333333333334, "grad_norm": 1.6056395769119263, "learning_rate": 0.00014983985765124554, "loss": 1.6597, "step": 2829 }, { "epoch": 1.2577777777777777, "grad_norm": 1.687373399734497, "learning_rate": 0.0001498220640569395, "loss": 1.9804, "step": 2830 }, { "epoch": 1.2582222222222224, "grad_norm": 1.6241012811660767, "learning_rate": 0.00014980427046263345, "loss": 2.0725, "step": 2831 }, { "epoch": 1.2586666666666666, "grad_norm": 1.4386781454086304, "learning_rate": 0.0001497864768683274, "loss": 1.1918, "step": 2832 }, { "epoch": 1.259111111111111, "grad_norm": 1.7372790575027466, "learning_rate": 0.00014976868327402137, "loss": 2.2247, "step": 2833 }, { "epoch": 1.2595555555555555, "grad_norm": 1.8010145425796509, "learning_rate": 0.00014975088967971532, "loss": 2.1944, "step": 2834 }, { "epoch": 1.26, "grad_norm": 1.9382820129394531, "learning_rate": 0.00014973309608540925, "loss": 2.3833, "step": 2835 }, { "epoch": 1.2604444444444445, "grad_norm": 1.9000599384307861, "learning_rate": 0.0001497153024911032, "loss": 1.7084, "step": 2836 }, { "epoch": 1.260888888888889, "grad_norm": 2.1467716693878174, "learning_rate": 0.00014969750889679717, "loss": 2.0741, "step": 2837 }, { "epoch": 1.2613333333333334, "grad_norm": 2.0739872455596924, "learning_rate": 0.00014967971530249112, "loss": 2.2452, "step": 2838 }, { "epoch": 1.2617777777777777, "grad_norm": 1.7084413766860962, "learning_rate": 0.00014966192170818508, "loss": 1.3884, "step": 2839 }, { "epoch": 1.2622222222222224, "grad_norm": 1.855447769165039, "learning_rate": 0.000149644128113879, "loss": 1.9037, "step": 2840 }, { "epoch": 1.2626666666666666, "grad_norm": 1.8656028509140015, "learning_rate": 0.00014962633451957296, "loss": 1.7658, "step": 2841 }, { "epoch": 1.263111111111111, "grad_norm": 2.142399549484253, "learning_rate": 0.0001496085409252669, "loss": 1.9569, "step": 2842 }, { "epoch": 1.2635555555555555, "grad_norm": 1.9603620767593384, "learning_rate": 0.00014959074733096085, "loss": 1.9807, "step": 2843 }, { "epoch": 1.264, "grad_norm": 2.183345317840576, "learning_rate": 0.0001495729537366548, "loss": 2.4544, "step": 2844 }, { "epoch": 1.2644444444444445, "grad_norm": 2.205909490585327, "learning_rate": 0.00014955516014234876, "loss": 2.4811, "step": 2845 }, { "epoch": 1.264888888888889, "grad_norm": 2.540581226348877, "learning_rate": 0.00014953736654804272, "loss": 2.9558, "step": 2846 }, { "epoch": 1.2653333333333334, "grad_norm": 2.1151061058044434, "learning_rate": 0.00014951957295373668, "loss": 1.7681, "step": 2847 }, { "epoch": 1.2657777777777777, "grad_norm": 2.5562145709991455, "learning_rate": 0.0001495017793594306, "loss": 2.3364, "step": 2848 }, { "epoch": 1.2662222222222224, "grad_norm": 2.720233201980591, "learning_rate": 0.00014948398576512456, "loss": 1.7419, "step": 2849 }, { "epoch": 1.2666666666666666, "grad_norm": 2.397717237472534, "learning_rate": 0.00014946619217081852, "loss": 1.9456, "step": 2850 }, { "epoch": 1.267111111111111, "grad_norm": 1.2284705638885498, "learning_rate": 0.00014944839857651248, "loss": 2.54, "step": 2851 }, { "epoch": 1.2675555555555555, "grad_norm": 1.3442673683166504, "learning_rate": 0.00014943060498220643, "loss": 2.1183, "step": 2852 }, { "epoch": 1.268, "grad_norm": 1.3058741092681885, "learning_rate": 0.00014941281138790036, "loss": 2.0397, "step": 2853 }, { "epoch": 1.2684444444444445, "grad_norm": 1.4303501844406128, "learning_rate": 0.00014939501779359432, "loss": 2.9471, "step": 2854 }, { "epoch": 1.268888888888889, "grad_norm": 1.456242561340332, "learning_rate": 0.00014937722419928825, "loss": 2.1462, "step": 2855 }, { "epoch": 1.2693333333333334, "grad_norm": 1.5883921384811401, "learning_rate": 0.0001493594306049822, "loss": 2.7315, "step": 2856 }, { "epoch": 1.2697777777777777, "grad_norm": 1.4734134674072266, "learning_rate": 0.00014934163701067616, "loss": 1.95, "step": 2857 }, { "epoch": 1.2702222222222221, "grad_norm": 1.4658904075622559, "learning_rate": 0.00014932384341637012, "loss": 2.5269, "step": 2858 }, { "epoch": 1.2706666666666666, "grad_norm": 1.253811240196228, "learning_rate": 0.00014930604982206407, "loss": 1.3081, "step": 2859 }, { "epoch": 1.271111111111111, "grad_norm": 1.178534984588623, "learning_rate": 0.00014928825622775803, "loss": 1.2465, "step": 2860 }, { "epoch": 1.2715555555555556, "grad_norm": 1.6904065608978271, "learning_rate": 0.00014927046263345196, "loss": 2.314, "step": 2861 }, { "epoch": 1.272, "grad_norm": 1.5300809144973755, "learning_rate": 0.00014925266903914592, "loss": 2.0894, "step": 2862 }, { "epoch": 1.2724444444444445, "grad_norm": 1.5079879760742188, "learning_rate": 0.00014923487544483987, "loss": 2.1044, "step": 2863 }, { "epoch": 1.272888888888889, "grad_norm": 1.88065505027771, "learning_rate": 0.00014921708185053383, "loss": 2.1615, "step": 2864 }, { "epoch": 1.2733333333333334, "grad_norm": 1.3186235427856445, "learning_rate": 0.00014919928825622779, "loss": 0.7738, "step": 2865 }, { "epoch": 1.2737777777777777, "grad_norm": 2.2539889812469482, "learning_rate": 0.00014918149466192171, "loss": 2.2635, "step": 2866 }, { "epoch": 1.2742222222222221, "grad_norm": 1.5537022352218628, "learning_rate": 0.00014916370106761567, "loss": 1.9448, "step": 2867 }, { "epoch": 1.2746666666666666, "grad_norm": 1.676327109336853, "learning_rate": 0.0001491459074733096, "loss": 2.2653, "step": 2868 }, { "epoch": 1.275111111111111, "grad_norm": 1.697751760482788, "learning_rate": 0.00014912811387900356, "loss": 2.2573, "step": 2869 }, { "epoch": 1.2755555555555556, "grad_norm": 1.7857534885406494, "learning_rate": 0.00014911032028469751, "loss": 2.6627, "step": 2870 }, { "epoch": 1.276, "grad_norm": 1.6772701740264893, "learning_rate": 0.00014909252669039147, "loss": 2.1949, "step": 2871 }, { "epoch": 1.2764444444444445, "grad_norm": 1.546369194984436, "learning_rate": 0.00014907473309608543, "loss": 1.5786, "step": 2872 }, { "epoch": 1.276888888888889, "grad_norm": 1.6441593170166016, "learning_rate": 0.00014905693950177936, "loss": 1.9288, "step": 2873 }, { "epoch": 1.2773333333333334, "grad_norm": 1.613301396369934, "learning_rate": 0.0001490391459074733, "loss": 1.9617, "step": 2874 }, { "epoch": 1.2777777777777777, "grad_norm": 2.057661771774292, "learning_rate": 0.00014902135231316727, "loss": 2.3796, "step": 2875 }, { "epoch": 1.2782222222222221, "grad_norm": 2.0095505714416504, "learning_rate": 0.00014900355871886123, "loss": 2.1544, "step": 2876 }, { "epoch": 1.2786666666666666, "grad_norm": 1.9702578783035278, "learning_rate": 0.00014898576512455518, "loss": 2.562, "step": 2877 }, { "epoch": 1.279111111111111, "grad_norm": 1.7147190570831299, "learning_rate": 0.00014896797153024914, "loss": 2.1862, "step": 2878 }, { "epoch": 1.2795555555555556, "grad_norm": 1.5411655902862549, "learning_rate": 0.00014895017793594307, "loss": 2.0377, "step": 2879 }, { "epoch": 1.28, "grad_norm": 1.9299793243408203, "learning_rate": 0.000148932384341637, "loss": 2.4939, "step": 2880 }, { "epoch": 1.2804444444444445, "grad_norm": 1.9820499420166016, "learning_rate": 0.00014891459074733095, "loss": 2.4094, "step": 2881 }, { "epoch": 1.280888888888889, "grad_norm": 1.9093626737594604, "learning_rate": 0.0001488967971530249, "loss": 1.8737, "step": 2882 }, { "epoch": 1.2813333333333334, "grad_norm": 1.7828611135482788, "learning_rate": 0.00014887900355871887, "loss": 2.3426, "step": 2883 }, { "epoch": 1.2817777777777777, "grad_norm": 2.0726230144500732, "learning_rate": 0.00014886120996441282, "loss": 2.2123, "step": 2884 }, { "epoch": 1.2822222222222222, "grad_norm": 1.8538103103637695, "learning_rate": 0.00014884341637010678, "loss": 1.9047, "step": 2885 }, { "epoch": 1.2826666666666666, "grad_norm": 1.746737003326416, "learning_rate": 0.0001488256227758007, "loss": 1.693, "step": 2886 }, { "epoch": 1.283111111111111, "grad_norm": 1.8844788074493408, "learning_rate": 0.00014880782918149467, "loss": 2.0716, "step": 2887 }, { "epoch": 1.2835555555555556, "grad_norm": 1.7158288955688477, "learning_rate": 0.00014879003558718862, "loss": 1.9545, "step": 2888 }, { "epoch": 1.284, "grad_norm": 1.926275610923767, "learning_rate": 0.00014877224199288258, "loss": 1.8723, "step": 2889 }, { "epoch": 1.2844444444444445, "grad_norm": 2.2394421100616455, "learning_rate": 0.00014875444839857654, "loss": 2.3997, "step": 2890 }, { "epoch": 1.284888888888889, "grad_norm": 1.957261323928833, "learning_rate": 0.00014873665480427046, "loss": 2.1043, "step": 2891 }, { "epoch": 1.2853333333333334, "grad_norm": 2.291721820831299, "learning_rate": 0.00014871886120996442, "loss": 2.213, "step": 2892 }, { "epoch": 1.2857777777777777, "grad_norm": 1.8697887659072876, "learning_rate": 0.00014870106761565835, "loss": 1.8325, "step": 2893 }, { "epoch": 1.2862222222222222, "grad_norm": 1.8223001956939697, "learning_rate": 0.0001486832740213523, "loss": 1.8901, "step": 2894 }, { "epoch": 1.2866666666666666, "grad_norm": 1.9460232257843018, "learning_rate": 0.00014866548042704626, "loss": 2.0251, "step": 2895 }, { "epoch": 1.287111111111111, "grad_norm": 2.430386543273926, "learning_rate": 0.00014864768683274022, "loss": 2.591, "step": 2896 }, { "epoch": 1.2875555555555556, "grad_norm": 2.0571846961975098, "learning_rate": 0.00014862989323843418, "loss": 2.0458, "step": 2897 }, { "epoch": 1.288, "grad_norm": 2.013607978820801, "learning_rate": 0.00014861209964412813, "loss": 1.9527, "step": 2898 }, { "epoch": 1.2884444444444445, "grad_norm": 1.809848427772522, "learning_rate": 0.00014859430604982206, "loss": 1.2951, "step": 2899 }, { "epoch": 1.2888888888888888, "grad_norm": 2.80146861076355, "learning_rate": 0.00014857651245551602, "loss": 1.8434, "step": 2900 }, { "epoch": 1.2893333333333334, "grad_norm": 0.43238645792007446, "learning_rate": 0.00014855871886120998, "loss": 0.0395, "step": 2901 }, { "epoch": 1.2897777777777777, "grad_norm": 1.3477979898452759, "learning_rate": 0.00014854092526690393, "loss": 2.1978, "step": 2902 }, { "epoch": 1.2902222222222222, "grad_norm": 1.4748413562774658, "learning_rate": 0.0001485231316725979, "loss": 2.2269, "step": 2903 }, { "epoch": 1.2906666666666666, "grad_norm": 1.8670775890350342, "learning_rate": 0.00014850533807829182, "loss": 2.5789, "step": 2904 }, { "epoch": 1.291111111111111, "grad_norm": 1.5996267795562744, "learning_rate": 0.00014848754448398577, "loss": 2.0898, "step": 2905 }, { "epoch": 1.2915555555555556, "grad_norm": 1.5340416431427002, "learning_rate": 0.0001484697508896797, "loss": 2.0439, "step": 2906 }, { "epoch": 1.292, "grad_norm": 1.7059005498886108, "learning_rate": 0.00014845195729537366, "loss": 2.6905, "step": 2907 }, { "epoch": 1.2924444444444445, "grad_norm": 1.3028349876403809, "learning_rate": 0.00014843416370106762, "loss": 1.1318, "step": 2908 }, { "epoch": 1.2928888888888888, "grad_norm": 1.6985855102539062, "learning_rate": 0.00014841637010676157, "loss": 2.2406, "step": 2909 }, { "epoch": 1.2933333333333334, "grad_norm": 1.416417121887207, "learning_rate": 0.00014839857651245553, "loss": 2.1449, "step": 2910 }, { "epoch": 1.2937777777777777, "grad_norm": 1.791305422782898, "learning_rate": 0.00014838078291814949, "loss": 2.4623, "step": 2911 }, { "epoch": 1.2942222222222222, "grad_norm": 1.3889151811599731, "learning_rate": 0.00014836298932384342, "loss": 1.965, "step": 2912 }, { "epoch": 1.2946666666666666, "grad_norm": 1.8636940717697144, "learning_rate": 0.00014834519572953737, "loss": 2.9771, "step": 2913 }, { "epoch": 1.295111111111111, "grad_norm": 1.9207584857940674, "learning_rate": 0.00014832740213523133, "loss": 2.3587, "step": 2914 }, { "epoch": 1.2955555555555556, "grad_norm": 1.6041591167449951, "learning_rate": 0.00014830960854092528, "loss": 1.986, "step": 2915 }, { "epoch": 1.296, "grad_norm": 1.9507296085357666, "learning_rate": 0.00014829181494661924, "loss": 2.7387, "step": 2916 }, { "epoch": 1.2964444444444445, "grad_norm": 1.7080721855163574, "learning_rate": 0.00014827402135231317, "loss": 2.0368, "step": 2917 }, { "epoch": 1.2968888888888888, "grad_norm": 1.7469477653503418, "learning_rate": 0.00014825622775800713, "loss": 2.0248, "step": 2918 }, { "epoch": 1.2973333333333334, "grad_norm": 1.622348666191101, "learning_rate": 0.00014823843416370106, "loss": 2.3361, "step": 2919 }, { "epoch": 1.2977777777777777, "grad_norm": 1.9793723821640015, "learning_rate": 0.000148220640569395, "loss": 2.7984, "step": 2920 }, { "epoch": 1.2982222222222222, "grad_norm": 1.6501868963241577, "learning_rate": 0.00014820284697508897, "loss": 1.9731, "step": 2921 }, { "epoch": 1.2986666666666666, "grad_norm": 1.7134915590286255, "learning_rate": 0.00014818505338078293, "loss": 2.3017, "step": 2922 }, { "epoch": 1.299111111111111, "grad_norm": 1.6116629838943481, "learning_rate": 0.00014816725978647688, "loss": 2.0448, "step": 2923 }, { "epoch": 1.2995555555555556, "grad_norm": 1.7651207447052002, "learning_rate": 0.00014814946619217084, "loss": 2.2993, "step": 2924 }, { "epoch": 1.3, "grad_norm": 1.5447845458984375, "learning_rate": 0.00014813167259786477, "loss": 1.6308, "step": 2925 }, { "epoch": 1.3004444444444445, "grad_norm": 1.505743145942688, "learning_rate": 0.00014811387900355873, "loss": 1.54, "step": 2926 }, { "epoch": 1.3008888888888888, "grad_norm": 1.8078949451446533, "learning_rate": 0.00014809608540925268, "loss": 2.3919, "step": 2927 }, { "epoch": 1.3013333333333335, "grad_norm": 1.799320101737976, "learning_rate": 0.00014807829181494664, "loss": 2.7337, "step": 2928 }, { "epoch": 1.3017777777777777, "grad_norm": 1.7614754438400269, "learning_rate": 0.0001480604982206406, "loss": 2.1453, "step": 2929 }, { "epoch": 1.3022222222222222, "grad_norm": 1.8952438831329346, "learning_rate": 0.00014804270462633452, "loss": 2.2258, "step": 2930 }, { "epoch": 1.3026666666666666, "grad_norm": 1.7075976133346558, "learning_rate": 0.00014802491103202848, "loss": 2.0048, "step": 2931 }, { "epoch": 1.303111111111111, "grad_norm": 1.847962737083435, "learning_rate": 0.0001480071174377224, "loss": 1.7736, "step": 2932 }, { "epoch": 1.3035555555555556, "grad_norm": 1.7101470232009888, "learning_rate": 0.00014798932384341637, "loss": 2.0625, "step": 2933 }, { "epoch": 1.304, "grad_norm": 1.760359764099121, "learning_rate": 0.00014797153024911032, "loss": 1.7883, "step": 2934 }, { "epoch": 1.3044444444444445, "grad_norm": 2.244199275970459, "learning_rate": 0.00014795373665480428, "loss": 2.3975, "step": 2935 }, { "epoch": 1.3048888888888888, "grad_norm": 2.010221242904663, "learning_rate": 0.00014793594306049824, "loss": 2.2006, "step": 2936 }, { "epoch": 1.3053333333333335, "grad_norm": 1.8990859985351562, "learning_rate": 0.0001479181494661922, "loss": 2.0714, "step": 2937 }, { "epoch": 1.3057777777777777, "grad_norm": 1.8823907375335693, "learning_rate": 0.00014790035587188612, "loss": 1.9876, "step": 2938 }, { "epoch": 1.3062222222222222, "grad_norm": 1.7615153789520264, "learning_rate": 0.00014788256227758008, "loss": 2.2601, "step": 2939 }, { "epoch": 1.3066666666666666, "grad_norm": 2.3797504901885986, "learning_rate": 0.00014786476868327403, "loss": 2.3143, "step": 2940 }, { "epoch": 1.3071111111111111, "grad_norm": 2.3076117038726807, "learning_rate": 0.000147846975088968, "loss": 2.4184, "step": 2941 }, { "epoch": 1.3075555555555556, "grad_norm": 2.3064980506896973, "learning_rate": 0.00014782918149466195, "loss": 2.0238, "step": 2942 }, { "epoch": 1.308, "grad_norm": 2.125981330871582, "learning_rate": 0.00014781138790035588, "loss": 2.4634, "step": 2943 }, { "epoch": 1.3084444444444445, "grad_norm": 2.4080536365509033, "learning_rate": 0.00014779359430604983, "loss": 2.204, "step": 2944 }, { "epoch": 1.3088888888888888, "grad_norm": 2.146500587463379, "learning_rate": 0.00014777580071174376, "loss": 2.2429, "step": 2945 }, { "epoch": 1.3093333333333335, "grad_norm": 2.469111680984497, "learning_rate": 0.00014775800711743772, "loss": 2.4771, "step": 2946 }, { "epoch": 1.3097777777777777, "grad_norm": 2.244917392730713, "learning_rate": 0.00014774021352313168, "loss": 1.8265, "step": 2947 }, { "epoch": 1.3102222222222222, "grad_norm": 2.1437273025512695, "learning_rate": 0.00014772241992882563, "loss": 2.496, "step": 2948 }, { "epoch": 1.3106666666666666, "grad_norm": 2.3928580284118652, "learning_rate": 0.0001477046263345196, "loss": 1.9276, "step": 2949 }, { "epoch": 1.3111111111111111, "grad_norm": 3.0038251876831055, "learning_rate": 0.00014768683274021355, "loss": 2.0514, "step": 2950 }, { "epoch": 1.3115555555555556, "grad_norm": 1.4333771467208862, "learning_rate": 0.00014766903914590747, "loss": 2.8184, "step": 2951 }, { "epoch": 1.312, "grad_norm": 1.254792332649231, "learning_rate": 0.00014765124555160143, "loss": 2.1764, "step": 2952 }, { "epoch": 1.3124444444444445, "grad_norm": 1.2323510646820068, "learning_rate": 0.0001476334519572954, "loss": 2.4008, "step": 2953 }, { "epoch": 1.3128888888888888, "grad_norm": 1.026353120803833, "learning_rate": 0.00014761565836298934, "loss": 1.6053, "step": 2954 }, { "epoch": 1.3133333333333335, "grad_norm": 1.8161970376968384, "learning_rate": 0.0001475978647686833, "loss": 2.4387, "step": 2955 }, { "epoch": 1.3137777777777777, "grad_norm": 1.568258285522461, "learning_rate": 0.00014758007117437723, "loss": 2.1573, "step": 2956 }, { "epoch": 1.3142222222222222, "grad_norm": 1.747540831565857, "learning_rate": 0.00014756227758007116, "loss": 2.3005, "step": 2957 }, { "epoch": 1.3146666666666667, "grad_norm": 1.9220967292785645, "learning_rate": 0.00014754448398576512, "loss": 2.517, "step": 2958 }, { "epoch": 1.3151111111111111, "grad_norm": 1.7257778644561768, "learning_rate": 0.00014752669039145907, "loss": 2.576, "step": 2959 }, { "epoch": 1.3155555555555556, "grad_norm": 1.6477985382080078, "learning_rate": 0.00014750889679715303, "loss": 2.1995, "step": 2960 }, { "epoch": 1.316, "grad_norm": 1.5505272150039673, "learning_rate": 0.00014749110320284699, "loss": 2.346, "step": 2961 }, { "epoch": 1.3164444444444445, "grad_norm": 1.4658108949661255, "learning_rate": 0.00014747330960854094, "loss": 2.0652, "step": 2962 }, { "epoch": 1.3168888888888888, "grad_norm": 1.694881558418274, "learning_rate": 0.00014745551601423487, "loss": 2.1266, "step": 2963 }, { "epoch": 1.3173333333333335, "grad_norm": 1.606266736984253, "learning_rate": 0.00014743772241992883, "loss": 2.0245, "step": 2964 }, { "epoch": 1.3177777777777777, "grad_norm": 1.483372688293457, "learning_rate": 0.00014741992882562278, "loss": 2.0158, "step": 2965 }, { "epoch": 1.3182222222222222, "grad_norm": 1.510193943977356, "learning_rate": 0.00014740213523131674, "loss": 1.8094, "step": 2966 }, { "epoch": 1.3186666666666667, "grad_norm": 1.2403696775436401, "learning_rate": 0.0001473843416370107, "loss": 1.1705, "step": 2967 }, { "epoch": 1.3191111111111111, "grad_norm": 1.4912583827972412, "learning_rate": 0.00014736654804270465, "loss": 1.9255, "step": 2968 }, { "epoch": 1.3195555555555556, "grad_norm": 1.7279419898986816, "learning_rate": 0.00014734875444839858, "loss": 1.2055, "step": 2969 }, { "epoch": 1.32, "grad_norm": 1.8031781911849976, "learning_rate": 0.0001473309608540925, "loss": 2.2159, "step": 2970 }, { "epoch": 1.3204444444444445, "grad_norm": 2.015742778778076, "learning_rate": 0.00014731316725978647, "loss": 2.5995, "step": 2971 }, { "epoch": 1.3208888888888888, "grad_norm": 1.7225654125213623, "learning_rate": 0.00014729537366548043, "loss": 2.2011, "step": 2972 }, { "epoch": 1.3213333333333335, "grad_norm": 1.881436824798584, "learning_rate": 0.00014727758007117438, "loss": 2.1361, "step": 2973 }, { "epoch": 1.3217777777777777, "grad_norm": 1.8075528144836426, "learning_rate": 0.00014725978647686834, "loss": 2.4812, "step": 2974 }, { "epoch": 1.3222222222222222, "grad_norm": 1.7048109769821167, "learning_rate": 0.0001472419928825623, "loss": 2.2206, "step": 2975 }, { "epoch": 1.3226666666666667, "grad_norm": 1.8709800243377686, "learning_rate": 0.00014722419928825622, "loss": 2.0549, "step": 2976 }, { "epoch": 1.3231111111111111, "grad_norm": 1.6973004341125488, "learning_rate": 0.00014720640569395018, "loss": 2.0749, "step": 2977 }, { "epoch": 1.3235555555555556, "grad_norm": 2.0399484634399414, "learning_rate": 0.00014718861209964414, "loss": 2.59, "step": 2978 }, { "epoch": 1.324, "grad_norm": 1.7553602457046509, "learning_rate": 0.0001471708185053381, "loss": 1.4675, "step": 2979 }, { "epoch": 1.3244444444444445, "grad_norm": 1.8353267908096313, "learning_rate": 0.00014715302491103205, "loss": 2.1814, "step": 2980 }, { "epoch": 1.3248888888888888, "grad_norm": 1.942821741104126, "learning_rate": 0.000147135231316726, "loss": 1.7173, "step": 2981 }, { "epoch": 1.3253333333333333, "grad_norm": 2.025289535522461, "learning_rate": 0.00014711743772241994, "loss": 2.4813, "step": 2982 }, { "epoch": 1.3257777777777777, "grad_norm": 1.890375018119812, "learning_rate": 0.00014709964412811387, "loss": 2.2062, "step": 2983 }, { "epoch": 1.3262222222222222, "grad_norm": 1.8974454402923584, "learning_rate": 0.00014708185053380782, "loss": 2.3105, "step": 2984 }, { "epoch": 1.3266666666666667, "grad_norm": 1.7386442422866821, "learning_rate": 0.00014706405693950178, "loss": 1.9486, "step": 2985 }, { "epoch": 1.3271111111111111, "grad_norm": 1.974223017692566, "learning_rate": 0.00014704626334519574, "loss": 2.5025, "step": 2986 }, { "epoch": 1.3275555555555556, "grad_norm": 2.720777988433838, "learning_rate": 0.0001470284697508897, "loss": 2.2163, "step": 2987 }, { "epoch": 1.328, "grad_norm": 2.455169200897217, "learning_rate": 0.00014701067615658365, "loss": 2.2537, "step": 2988 }, { "epoch": 1.3284444444444445, "grad_norm": 1.8268187046051025, "learning_rate": 0.00014699288256227758, "loss": 1.3468, "step": 2989 }, { "epoch": 1.3288888888888888, "grad_norm": 1.7359619140625, "learning_rate": 0.00014697508896797153, "loss": 2.2044, "step": 2990 }, { "epoch": 1.3293333333333333, "grad_norm": 2.3060410022735596, "learning_rate": 0.0001469572953736655, "loss": 2.1335, "step": 2991 }, { "epoch": 1.3297777777777777, "grad_norm": 2.0797529220581055, "learning_rate": 0.00014693950177935945, "loss": 2.4904, "step": 2992 }, { "epoch": 1.3302222222222222, "grad_norm": 2.0389015674591064, "learning_rate": 0.0001469217081850534, "loss": 2.1793, "step": 2993 }, { "epoch": 1.3306666666666667, "grad_norm": 1.99758780002594, "learning_rate": 0.00014690391459074736, "loss": 1.7031, "step": 2994 }, { "epoch": 1.3311111111111111, "grad_norm": 2.201939582824707, "learning_rate": 0.0001468861209964413, "loss": 2.254, "step": 2995 }, { "epoch": 1.3315555555555556, "grad_norm": 2.535717010498047, "learning_rate": 0.00014686832740213522, "loss": 2.2282, "step": 2996 }, { "epoch": 1.332, "grad_norm": 2.139512062072754, "learning_rate": 0.00014685053380782918, "loss": 2.1161, "step": 2997 }, { "epoch": 1.3324444444444445, "grad_norm": 2.589766502380371, "learning_rate": 0.00014683274021352313, "loss": 2.2188, "step": 2998 }, { "epoch": 1.3328888888888888, "grad_norm": 2.3444082736968994, "learning_rate": 0.0001468149466192171, "loss": 2.3078, "step": 2999 }, { "epoch": 1.3333333333333333, "grad_norm": 2.9983866214752197, "learning_rate": 0.00014679715302491105, "loss": 1.9401, "step": 3000 }, { "epoch": 1.3337777777777777, "grad_norm": 1.0303854942321777, "learning_rate": 0.000146779359430605, "loss": 1.3335, "step": 3001 }, { "epoch": 1.3342222222222222, "grad_norm": 1.3651078939437866, "learning_rate": 0.00014676156583629893, "loss": 2.7932, "step": 3002 }, { "epoch": 1.3346666666666667, "grad_norm": 1.4741727113723755, "learning_rate": 0.0001467437722419929, "loss": 2.4145, "step": 3003 }, { "epoch": 1.3351111111111111, "grad_norm": 1.308189868927002, "learning_rate": 0.00014672597864768684, "loss": 2.492, "step": 3004 }, { "epoch": 1.3355555555555556, "grad_norm": 1.5847667455673218, "learning_rate": 0.0001467081850533808, "loss": 2.1946, "step": 3005 }, { "epoch": 1.336, "grad_norm": 1.329132318496704, "learning_rate": 0.00014669039145907476, "loss": 1.5467, "step": 3006 }, { "epoch": 1.3364444444444445, "grad_norm": 1.514951229095459, "learning_rate": 0.00014667259786476869, "loss": 2.048, "step": 3007 }, { "epoch": 1.3368888888888888, "grad_norm": 1.4534947872161865, "learning_rate": 0.00014665480427046264, "loss": 1.3235, "step": 3008 }, { "epoch": 1.3373333333333333, "grad_norm": 1.4798189401626587, "learning_rate": 0.00014663701067615657, "loss": 2.0619, "step": 3009 }, { "epoch": 1.3377777777777777, "grad_norm": 1.5956952571868896, "learning_rate": 0.00014661921708185053, "loss": 2.325, "step": 3010 }, { "epoch": 1.3382222222222222, "grad_norm": 1.8076585531234741, "learning_rate": 0.00014660142348754449, "loss": 2.8001, "step": 3011 }, { "epoch": 1.3386666666666667, "grad_norm": 1.6839090585708618, "learning_rate": 0.00014658362989323844, "loss": 2.4422, "step": 3012 }, { "epoch": 1.3391111111111111, "grad_norm": 1.4808694124221802, "learning_rate": 0.0001465658362989324, "loss": 2.2948, "step": 3013 }, { "epoch": 1.3395555555555556, "grad_norm": 1.645921230316162, "learning_rate": 0.00014654804270462635, "loss": 2.1147, "step": 3014 }, { "epoch": 1.34, "grad_norm": 1.490327000617981, "learning_rate": 0.00014653024911032028, "loss": 1.5932, "step": 3015 }, { "epoch": 1.3404444444444445, "grad_norm": 1.7233227491378784, "learning_rate": 0.00014651245551601424, "loss": 2.5041, "step": 3016 }, { "epoch": 1.3408888888888888, "grad_norm": 1.904835820198059, "learning_rate": 0.0001464946619217082, "loss": 2.4865, "step": 3017 }, { "epoch": 1.3413333333333333, "grad_norm": 1.3651695251464844, "learning_rate": 0.00014647686832740215, "loss": 1.6397, "step": 3018 }, { "epoch": 1.3417777777777777, "grad_norm": 1.8083473443984985, "learning_rate": 0.0001464590747330961, "loss": 2.0558, "step": 3019 }, { "epoch": 1.3422222222222222, "grad_norm": 1.4256452322006226, "learning_rate": 0.00014644128113879004, "loss": 1.4262, "step": 3020 }, { "epoch": 1.3426666666666667, "grad_norm": 1.686566948890686, "learning_rate": 0.000146423487544484, "loss": 2.235, "step": 3021 }, { "epoch": 1.3431111111111111, "grad_norm": 1.6832934617996216, "learning_rate": 0.00014640569395017793, "loss": 2.3787, "step": 3022 }, { "epoch": 1.3435555555555556, "grad_norm": 5.573652267456055, "learning_rate": 0.00014638790035587188, "loss": 2.2192, "step": 3023 }, { "epoch": 1.3439999999999999, "grad_norm": 1.8312151432037354, "learning_rate": 0.00014637010676156584, "loss": 2.4134, "step": 3024 }, { "epoch": 1.3444444444444446, "grad_norm": 2.171259641647339, "learning_rate": 0.0001463523131672598, "loss": 2.5597, "step": 3025 }, { "epoch": 1.3448888888888888, "grad_norm": 1.7489248514175415, "learning_rate": 0.00014633451957295375, "loss": 2.0362, "step": 3026 }, { "epoch": 1.3453333333333333, "grad_norm": 1.915249228477478, "learning_rate": 0.0001463167259786477, "loss": 2.0456, "step": 3027 }, { "epoch": 1.3457777777777777, "grad_norm": 2.186251640319824, "learning_rate": 0.00014629893238434164, "loss": 2.448, "step": 3028 }, { "epoch": 1.3462222222222222, "grad_norm": 2.2437589168548584, "learning_rate": 0.0001462811387900356, "loss": 0.093, "step": 3029 }, { "epoch": 1.3466666666666667, "grad_norm": 1.4108742475509644, "learning_rate": 0.00014626334519572955, "loss": 1.0545, "step": 3030 }, { "epoch": 1.3471111111111111, "grad_norm": 1.1790105104446411, "learning_rate": 0.0001462455516014235, "loss": 0.8722, "step": 3031 }, { "epoch": 1.3475555555555556, "grad_norm": 1.8908590078353882, "learning_rate": 0.00014622775800711746, "loss": 2.3223, "step": 3032 }, { "epoch": 1.3479999999999999, "grad_norm": 1.7565504312515259, "learning_rate": 0.0001462099644128114, "loss": 2.0448, "step": 3033 }, { "epoch": 1.3484444444444446, "grad_norm": 2.0749552249908447, "learning_rate": 0.00014619217081850535, "loss": 2.4472, "step": 3034 }, { "epoch": 1.3488888888888888, "grad_norm": 1.9328750371932983, "learning_rate": 0.00014617437722419928, "loss": 2.3066, "step": 3035 }, { "epoch": 1.3493333333333333, "grad_norm": 1.9503514766693115, "learning_rate": 0.00014615658362989324, "loss": 2.1583, "step": 3036 }, { "epoch": 1.3497777777777777, "grad_norm": 2.1976773738861084, "learning_rate": 0.0001461387900355872, "loss": 2.6388, "step": 3037 }, { "epoch": 1.3502222222222222, "grad_norm": 1.8023751974105835, "learning_rate": 0.00014612099644128115, "loss": 2.0306, "step": 3038 }, { "epoch": 1.3506666666666667, "grad_norm": 2.1533122062683105, "learning_rate": 0.0001461032028469751, "loss": 2.7645, "step": 3039 }, { "epoch": 1.3511111111111112, "grad_norm": 1.889941930770874, "learning_rate": 0.00014608540925266906, "loss": 2.2246, "step": 3040 }, { "epoch": 1.3515555555555556, "grad_norm": 2.390805244445801, "learning_rate": 0.000146067615658363, "loss": 2.8298, "step": 3041 }, { "epoch": 1.3519999999999999, "grad_norm": 2.4162545204162598, "learning_rate": 0.00014604982206405695, "loss": 2.1917, "step": 3042 }, { "epoch": 1.3524444444444446, "grad_norm": 2.0249195098876953, "learning_rate": 0.0001460320284697509, "loss": 2.6536, "step": 3043 }, { "epoch": 1.3528888888888888, "grad_norm": 1.7109678983688354, "learning_rate": 0.00014601423487544486, "loss": 1.6031, "step": 3044 }, { "epoch": 1.3533333333333333, "grad_norm": 1.919968843460083, "learning_rate": 0.00014599644128113882, "loss": 2.3276, "step": 3045 }, { "epoch": 1.3537777777777777, "grad_norm": 2.2520694732666016, "learning_rate": 0.00014597864768683275, "loss": 2.5268, "step": 3046 }, { "epoch": 1.3542222222222222, "grad_norm": 2.1359968185424805, "learning_rate": 0.00014596085409252668, "loss": 2.3626, "step": 3047 }, { "epoch": 1.3546666666666667, "grad_norm": 2.3673102855682373, "learning_rate": 0.00014594306049822063, "loss": 2.5428, "step": 3048 }, { "epoch": 1.3551111111111112, "grad_norm": 2.3437814712524414, "learning_rate": 0.0001459252669039146, "loss": 2.5333, "step": 3049 }, { "epoch": 1.3555555555555556, "grad_norm": 1.5844188928604126, "learning_rate": 0.00014590747330960854, "loss": 0.678, "step": 3050 }, { "epoch": 1.3559999999999999, "grad_norm": 0.8454328775405884, "learning_rate": 0.0001458896797153025, "loss": 0.0259, "step": 3051 }, { "epoch": 1.3564444444444446, "grad_norm": 1.2323882579803467, "learning_rate": 0.00014587188612099646, "loss": 2.408, "step": 3052 }, { "epoch": 1.3568888888888888, "grad_norm": 1.4226750135421753, "learning_rate": 0.0001458540925266904, "loss": 2.3492, "step": 3053 }, { "epoch": 1.3573333333333333, "grad_norm": 1.3918040990829468, "learning_rate": 0.00014583629893238434, "loss": 2.2151, "step": 3054 }, { "epoch": 1.3577777777777778, "grad_norm": 1.5795356035232544, "learning_rate": 0.0001458185053380783, "loss": 2.5257, "step": 3055 }, { "epoch": 1.3582222222222222, "grad_norm": 1.3857512474060059, "learning_rate": 0.00014580071174377226, "loss": 2.3031, "step": 3056 }, { "epoch": 1.3586666666666667, "grad_norm": 1.50767183303833, "learning_rate": 0.0001457829181494662, "loss": 1.606, "step": 3057 }, { "epoch": 1.3591111111111112, "grad_norm": 1.5515854358673096, "learning_rate": 0.00014576512455516017, "loss": 2.4394, "step": 3058 }, { "epoch": 1.3595555555555556, "grad_norm": 1.567515254020691, "learning_rate": 0.0001457473309608541, "loss": 2.7249, "step": 3059 }, { "epoch": 1.3599999999999999, "grad_norm": 1.5846283435821533, "learning_rate": 0.00014572953736654803, "loss": 1.8917, "step": 3060 }, { "epoch": 1.3604444444444446, "grad_norm": 1.4666316509246826, "learning_rate": 0.00014571174377224198, "loss": 2.1971, "step": 3061 }, { "epoch": 1.3608888888888888, "grad_norm": 1.708336353302002, "learning_rate": 0.00014569395017793594, "loss": 2.177, "step": 3062 }, { "epoch": 1.3613333333333333, "grad_norm": 1.7200583219528198, "learning_rate": 0.0001456761565836299, "loss": 2.3996, "step": 3063 }, { "epoch": 1.3617777777777778, "grad_norm": 1.7250936031341553, "learning_rate": 0.00014565836298932385, "loss": 2.4772, "step": 3064 }, { "epoch": 1.3622222222222222, "grad_norm": 1.6220717430114746, "learning_rate": 0.0001456405693950178, "loss": 1.7843, "step": 3065 }, { "epoch": 1.3626666666666667, "grad_norm": 1.2669525146484375, "learning_rate": 0.00014562277580071174, "loss": 1.0277, "step": 3066 }, { "epoch": 1.3631111111111112, "grad_norm": 1.7346656322479248, "learning_rate": 0.0001456049822064057, "loss": 1.7548, "step": 3067 }, { "epoch": 1.3635555555555556, "grad_norm": 1.718342900276184, "learning_rate": 0.00014558718861209965, "loss": 1.7593, "step": 3068 }, { "epoch": 1.3639999999999999, "grad_norm": 1.738747239112854, "learning_rate": 0.0001455693950177936, "loss": 2.1453, "step": 3069 }, { "epoch": 1.3644444444444446, "grad_norm": 1.6312371492385864, "learning_rate": 0.00014555160142348757, "loss": 2.1307, "step": 3070 }, { "epoch": 1.3648888888888888, "grad_norm": 1.9798495769500732, "learning_rate": 0.00014553380782918152, "loss": 1.9325, "step": 3071 }, { "epoch": 1.3653333333333333, "grad_norm": 1.8481535911560059, "learning_rate": 0.00014551601423487545, "loss": 2.0689, "step": 3072 }, { "epoch": 1.3657777777777778, "grad_norm": 1.7567691802978516, "learning_rate": 0.00014549822064056938, "loss": 1.8361, "step": 3073 }, { "epoch": 1.3662222222222222, "grad_norm": 1.9224940538406372, "learning_rate": 0.00014548042704626334, "loss": 2.2146, "step": 3074 }, { "epoch": 1.3666666666666667, "grad_norm": 2.118567705154419, "learning_rate": 0.0001454626334519573, "loss": 2.7325, "step": 3075 }, { "epoch": 1.3671111111111112, "grad_norm": 1.6111234426498413, "learning_rate": 0.00014544483985765125, "loss": 1.5926, "step": 3076 }, { "epoch": 1.3675555555555556, "grad_norm": 1.8418885469436646, "learning_rate": 0.0001454270462633452, "loss": 2.1703, "step": 3077 }, { "epoch": 1.3679999999999999, "grad_norm": 3.9272093772888184, "learning_rate": 0.00014540925266903916, "loss": 0.0625, "step": 3078 }, { "epoch": 1.3684444444444446, "grad_norm": 1.4438681602478027, "learning_rate": 0.0001453914590747331, "loss": 1.2624, "step": 3079 }, { "epoch": 1.3688888888888888, "grad_norm": 2.313140392303467, "learning_rate": 0.00014537366548042705, "loss": 2.459, "step": 3080 }, { "epoch": 1.3693333333333333, "grad_norm": 1.765426754951477, "learning_rate": 0.000145355871886121, "loss": 1.0009, "step": 3081 }, { "epoch": 1.3697777777777778, "grad_norm": 1.7897223234176636, "learning_rate": 0.00014533807829181496, "loss": 1.6343, "step": 3082 }, { "epoch": 1.3702222222222222, "grad_norm": 1.9620853662490845, "learning_rate": 0.00014532028469750892, "loss": 1.9386, "step": 3083 }, { "epoch": 1.3706666666666667, "grad_norm": 2.0378286838531494, "learning_rate": 0.00014530249110320288, "loss": 1.7725, "step": 3084 }, { "epoch": 1.3711111111111112, "grad_norm": 1.9394813776016235, "learning_rate": 0.0001452846975088968, "loss": 2.1801, "step": 3085 }, { "epoch": 1.3715555555555556, "grad_norm": 2.0343053340911865, "learning_rate": 0.00014526690391459073, "loss": 2.0472, "step": 3086 }, { "epoch": 1.3719999999999999, "grad_norm": 2.085235595703125, "learning_rate": 0.0001452491103202847, "loss": 1.969, "step": 3087 }, { "epoch": 1.3724444444444446, "grad_norm": 1.9965012073516846, "learning_rate": 0.00014523131672597865, "loss": 2.3486, "step": 3088 }, { "epoch": 1.3728888888888888, "grad_norm": 1.9986323118209839, "learning_rate": 0.0001452135231316726, "loss": 2.2368, "step": 3089 }, { "epoch": 1.3733333333333333, "grad_norm": 2.003603935241699, "learning_rate": 0.00014519572953736656, "loss": 2.0619, "step": 3090 }, { "epoch": 1.3737777777777778, "grad_norm": 2.0343897342681885, "learning_rate": 0.00014517793594306052, "loss": 2.2089, "step": 3091 }, { "epoch": 1.3742222222222222, "grad_norm": 2.0502665042877197, "learning_rate": 0.00014516014234875445, "loss": 2.3439, "step": 3092 }, { "epoch": 1.3746666666666667, "grad_norm": 2.019620895385742, "learning_rate": 0.0001451423487544484, "loss": 2.2632, "step": 3093 }, { "epoch": 1.3751111111111112, "grad_norm": 1.9464764595031738, "learning_rate": 0.00014512455516014236, "loss": 1.7843, "step": 3094 }, { "epoch": 1.3755555555555556, "grad_norm": 2.3599178791046143, "learning_rate": 0.00014510676156583632, "loss": 2.2186, "step": 3095 }, { "epoch": 1.376, "grad_norm": 2.2975494861602783, "learning_rate": 0.00014508896797153027, "loss": 2.5936, "step": 3096 }, { "epoch": 1.3764444444444446, "grad_norm": 2.219733476638794, "learning_rate": 0.00014507117437722423, "loss": 2.0997, "step": 3097 }, { "epoch": 1.3768888888888888, "grad_norm": 2.035273790359497, "learning_rate": 0.00014505338078291816, "loss": 2.0009, "step": 3098 }, { "epoch": 1.3773333333333333, "grad_norm": 2.999622106552124, "learning_rate": 0.0001450355871886121, "loss": 2.7129, "step": 3099 }, { "epoch": 1.3777777777777778, "grad_norm": 2.15091872215271, "learning_rate": 0.00014501779359430604, "loss": 2.1994, "step": 3100 }, { "epoch": 1.3782222222222222, "grad_norm": 1.0955103635787964, "learning_rate": 0.000145, "loss": 1.416, "step": 3101 }, { "epoch": 1.3786666666666667, "grad_norm": 1.3353731632232666, "learning_rate": 0.00014498220640569396, "loss": 2.4926, "step": 3102 }, { "epoch": 1.3791111111111112, "grad_norm": 1.4232149124145508, "learning_rate": 0.00014496441281138791, "loss": 2.5539, "step": 3103 }, { "epoch": 1.3795555555555556, "grad_norm": 1.4904459714889526, "learning_rate": 0.00014494661921708187, "loss": 2.5516, "step": 3104 }, { "epoch": 1.38, "grad_norm": 1.351136565208435, "learning_rate": 0.0001449288256227758, "loss": 1.9835, "step": 3105 }, { "epoch": 1.3804444444444444, "grad_norm": 1.4860031604766846, "learning_rate": 0.00014491103202846976, "loss": 1.9876, "step": 3106 }, { "epoch": 1.3808888888888888, "grad_norm": 1.582760214805603, "learning_rate": 0.0001448932384341637, "loss": 2.6782, "step": 3107 }, { "epoch": 1.3813333333333333, "grad_norm": 1.3775660991668701, "learning_rate": 0.00014487544483985767, "loss": 1.6808, "step": 3108 }, { "epoch": 1.3817777777777778, "grad_norm": 1.6874479055404663, "learning_rate": 0.00014485765124555163, "loss": 2.6314, "step": 3109 }, { "epoch": 1.3822222222222222, "grad_norm": 1.7144551277160645, "learning_rate": 0.00014483985765124558, "loss": 2.7101, "step": 3110 }, { "epoch": 1.3826666666666667, "grad_norm": 1.5098775625228882, "learning_rate": 0.0001448220640569395, "loss": 1.8929, "step": 3111 }, { "epoch": 1.3831111111111112, "grad_norm": 1.653300404548645, "learning_rate": 0.00014480427046263344, "loss": 2.1654, "step": 3112 }, { "epoch": 1.3835555555555556, "grad_norm": 1.4902502298355103, "learning_rate": 0.0001447864768683274, "loss": 2.3472, "step": 3113 }, { "epoch": 1.384, "grad_norm": 1.75850248336792, "learning_rate": 0.00014476868327402135, "loss": 1.99, "step": 3114 }, { "epoch": 1.3844444444444444, "grad_norm": 1.5602999925613403, "learning_rate": 0.0001447508896797153, "loss": 1.8835, "step": 3115 }, { "epoch": 1.3848888888888888, "grad_norm": 1.6873457431793213, "learning_rate": 0.00014473309608540927, "loss": 1.9495, "step": 3116 }, { "epoch": 1.3853333333333333, "grad_norm": 1.1590341329574585, "learning_rate": 0.00014471530249110322, "loss": 1.2124, "step": 3117 }, { "epoch": 1.3857777777777778, "grad_norm": 1.7708086967468262, "learning_rate": 0.00014469750889679715, "loss": 2.0342, "step": 3118 }, { "epoch": 1.3862222222222222, "grad_norm": 1.969315767288208, "learning_rate": 0.0001446797153024911, "loss": 2.5779, "step": 3119 }, { "epoch": 1.3866666666666667, "grad_norm": 1.6105482578277588, "learning_rate": 0.00014466192170818507, "loss": 2.2028, "step": 3120 }, { "epoch": 1.3871111111111112, "grad_norm": 1.7451056241989136, "learning_rate": 0.00014464412811387902, "loss": 2.1619, "step": 3121 }, { "epoch": 1.3875555555555557, "grad_norm": 1.5029910802841187, "learning_rate": 0.00014462633451957298, "loss": 2.2189, "step": 3122 }, { "epoch": 1.388, "grad_norm": 1.8065159320831299, "learning_rate": 0.0001446085409252669, "loss": 2.52, "step": 3123 }, { "epoch": 1.3884444444444444, "grad_norm": 1.7965675592422485, "learning_rate": 0.00014459074733096086, "loss": 2.6056, "step": 3124 }, { "epoch": 1.3888888888888888, "grad_norm": 1.6334154605865479, "learning_rate": 0.0001445729537366548, "loss": 2.2306, "step": 3125 }, { "epoch": 1.3893333333333333, "grad_norm": 2.191298484802246, "learning_rate": 0.00014455516014234875, "loss": 2.3345, "step": 3126 }, { "epoch": 1.3897777777777778, "grad_norm": 2.035778045654297, "learning_rate": 0.0001445373665480427, "loss": 2.2855, "step": 3127 }, { "epoch": 1.3902222222222222, "grad_norm": 1.8941333293914795, "learning_rate": 0.00014451957295373666, "loss": 2.0976, "step": 3128 }, { "epoch": 1.3906666666666667, "grad_norm": 1.8983358144760132, "learning_rate": 0.00014450177935943062, "loss": 2.1321, "step": 3129 }, { "epoch": 1.3911111111111112, "grad_norm": 1.8651962280273438, "learning_rate": 0.00014448398576512458, "loss": 2.2432, "step": 3130 }, { "epoch": 1.3915555555555557, "grad_norm": 1.7466819286346436, "learning_rate": 0.0001444661921708185, "loss": 1.2187, "step": 3131 }, { "epoch": 1.392, "grad_norm": 1.9504824876785278, "learning_rate": 0.00014444839857651246, "loss": 1.0134, "step": 3132 }, { "epoch": 1.3924444444444444, "grad_norm": 2.349276065826416, "learning_rate": 0.00014443060498220642, "loss": 2.3251, "step": 3133 }, { "epoch": 1.3928888888888888, "grad_norm": 2.0128836631774902, "learning_rate": 0.00014441281138790038, "loss": 1.8844, "step": 3134 }, { "epoch": 1.3933333333333333, "grad_norm": 1.8258366584777832, "learning_rate": 0.00014439501779359433, "loss": 2.1697, "step": 3135 }, { "epoch": 1.3937777777777778, "grad_norm": 1.9964505434036255, "learning_rate": 0.00014437722419928826, "loss": 2.0272, "step": 3136 }, { "epoch": 1.3942222222222223, "grad_norm": 2.2089779376983643, "learning_rate": 0.0001443594306049822, "loss": 2.4377, "step": 3137 }, { "epoch": 1.3946666666666667, "grad_norm": 1.9052916765213013, "learning_rate": 0.00014434163701067615, "loss": 1.9119, "step": 3138 }, { "epoch": 1.3951111111111112, "grad_norm": 2.0558083057403564, "learning_rate": 0.0001443238434163701, "loss": 2.0866, "step": 3139 }, { "epoch": 1.3955555555555557, "grad_norm": 1.9824244976043701, "learning_rate": 0.00014430604982206406, "loss": 2.1271, "step": 3140 }, { "epoch": 1.396, "grad_norm": 2.383279800415039, "learning_rate": 0.00014428825622775802, "loss": 2.5828, "step": 3141 }, { "epoch": 1.3964444444444444, "grad_norm": 2.1160545349121094, "learning_rate": 0.00014427046263345197, "loss": 1.8748, "step": 3142 }, { "epoch": 1.3968888888888888, "grad_norm": 1.8280696868896484, "learning_rate": 0.0001442526690391459, "loss": 2.0148, "step": 3143 }, { "epoch": 1.3973333333333333, "grad_norm": 1.842757225036621, "learning_rate": 0.00014423487544483986, "loss": 1.9273, "step": 3144 }, { "epoch": 1.3977777777777778, "grad_norm": 1.878212809562683, "learning_rate": 0.00014421708185053382, "loss": 1.8468, "step": 3145 }, { "epoch": 1.3982222222222223, "grad_norm": 2.176372766494751, "learning_rate": 0.00014419928825622777, "loss": 1.7978, "step": 3146 }, { "epoch": 1.3986666666666667, "grad_norm": 2.247149705886841, "learning_rate": 0.00014418149466192173, "loss": 2.298, "step": 3147 }, { "epoch": 1.3991111111111112, "grad_norm": 2.320523977279663, "learning_rate": 0.00014416370106761569, "loss": 2.3023, "step": 3148 }, { "epoch": 1.3995555555555557, "grad_norm": 2.5072226524353027, "learning_rate": 0.00014414590747330961, "loss": 2.0168, "step": 3149 }, { "epoch": 1.4, "grad_norm": 3.0211939811706543, "learning_rate": 0.00014412811387900354, "loss": 2.7225, "step": 3150 }, { "epoch": 1.4004444444444444, "grad_norm": 1.3240656852722168, "learning_rate": 0.0001441103202846975, "loss": 2.5294, "step": 3151 }, { "epoch": 1.4008888888888889, "grad_norm": 1.478697657585144, "learning_rate": 0.00014409252669039146, "loss": 2.2188, "step": 3152 }, { "epoch": 1.4013333333333333, "grad_norm": 1.3129013776779175, "learning_rate": 0.0001440747330960854, "loss": 2.188, "step": 3153 }, { "epoch": 1.4017777777777778, "grad_norm": 1.450279951095581, "learning_rate": 0.00014405693950177937, "loss": 2.3634, "step": 3154 }, { "epoch": 1.4022222222222223, "grad_norm": 1.3494071960449219, "learning_rate": 0.00014403914590747333, "loss": 2.2367, "step": 3155 }, { "epoch": 1.4026666666666667, "grad_norm": 1.4874467849731445, "learning_rate": 0.00014402135231316726, "loss": 2.548, "step": 3156 }, { "epoch": 1.403111111111111, "grad_norm": 1.3934712409973145, "learning_rate": 0.0001440035587188612, "loss": 1.5867, "step": 3157 }, { "epoch": 1.4035555555555557, "grad_norm": 1.5048962831497192, "learning_rate": 0.00014398576512455517, "loss": 2.5955, "step": 3158 }, { "epoch": 1.404, "grad_norm": 1.5615451335906982, "learning_rate": 0.00014396797153024913, "loss": 2.1421, "step": 3159 }, { "epoch": 1.4044444444444444, "grad_norm": 1.5293431282043457, "learning_rate": 0.00014395017793594308, "loss": 2.3808, "step": 3160 }, { "epoch": 1.4048888888888889, "grad_norm": 1.3629491329193115, "learning_rate": 0.00014393238434163704, "loss": 1.8447, "step": 3161 }, { "epoch": 1.4053333333333333, "grad_norm": 1.76398766040802, "learning_rate": 0.00014391459074733097, "loss": 1.8923, "step": 3162 }, { "epoch": 1.4057777777777778, "grad_norm": 1.6600054502487183, "learning_rate": 0.0001438967971530249, "loss": 2.0265, "step": 3163 }, { "epoch": 1.4062222222222223, "grad_norm": 1.6202727556228638, "learning_rate": 0.00014387900355871885, "loss": 2.2756, "step": 3164 }, { "epoch": 1.4066666666666667, "grad_norm": 1.749403953552246, "learning_rate": 0.0001438612099644128, "loss": 2.5024, "step": 3165 }, { "epoch": 1.407111111111111, "grad_norm": 1.7654697895050049, "learning_rate": 0.00014384341637010677, "loss": 2.4529, "step": 3166 }, { "epoch": 1.4075555555555557, "grad_norm": 1.5884429216384888, "learning_rate": 0.00014382562277580072, "loss": 1.8367, "step": 3167 }, { "epoch": 1.408, "grad_norm": 1.5916013717651367, "learning_rate": 0.00014380782918149468, "loss": 1.8217, "step": 3168 }, { "epoch": 1.4084444444444444, "grad_norm": 1.7130736112594604, "learning_rate": 0.0001437900355871886, "loss": 2.2377, "step": 3169 }, { "epoch": 1.4088888888888889, "grad_norm": 1.05029296875, "learning_rate": 0.00014377224199288257, "loss": 0.5225, "step": 3170 }, { "epoch": 1.4093333333333333, "grad_norm": 1.631998062133789, "learning_rate": 0.00014375444839857652, "loss": 2.1113, "step": 3171 }, { "epoch": 1.4097777777777778, "grad_norm": 1.6177490949630737, "learning_rate": 0.00014373665480427048, "loss": 2.0905, "step": 3172 }, { "epoch": 1.4102222222222223, "grad_norm": 1.727180004119873, "learning_rate": 0.00014371886120996443, "loss": 2.2642, "step": 3173 }, { "epoch": 1.4106666666666667, "grad_norm": 1.755303144454956, "learning_rate": 0.0001437010676156584, "loss": 2.2187, "step": 3174 }, { "epoch": 1.411111111111111, "grad_norm": 1.6000373363494873, "learning_rate": 0.00014368327402135232, "loss": 1.8188, "step": 3175 }, { "epoch": 1.4115555555555557, "grad_norm": 2.0754306316375732, "learning_rate": 0.00014366548042704625, "loss": 2.0832, "step": 3176 }, { "epoch": 1.412, "grad_norm": 1.7767425775527954, "learning_rate": 0.0001436476868327402, "loss": 2.1773, "step": 3177 }, { "epoch": 1.4124444444444444, "grad_norm": 2.1510021686553955, "learning_rate": 0.00014362989323843416, "loss": 2.1337, "step": 3178 }, { "epoch": 1.4128888888888889, "grad_norm": 1.9618239402770996, "learning_rate": 0.00014361209964412812, "loss": 2.108, "step": 3179 }, { "epoch": 1.4133333333333333, "grad_norm": 1.8377591371536255, "learning_rate": 0.00014359430604982208, "loss": 1.7457, "step": 3180 }, { "epoch": 1.4137777777777778, "grad_norm": 2.0039236545562744, "learning_rate": 0.00014357651245551603, "loss": 2.4462, "step": 3181 }, { "epoch": 1.4142222222222223, "grad_norm": 1.9533127546310425, "learning_rate": 0.00014355871886120996, "loss": 2.0373, "step": 3182 }, { "epoch": 1.4146666666666667, "grad_norm": 2.212468385696411, "learning_rate": 0.00014354092526690392, "loss": 2.2246, "step": 3183 }, { "epoch": 1.415111111111111, "grad_norm": 1.4578838348388672, "learning_rate": 0.00014352313167259788, "loss": 1.1584, "step": 3184 }, { "epoch": 1.4155555555555557, "grad_norm": 2.0764267444610596, "learning_rate": 0.00014350533807829183, "loss": 1.3613, "step": 3185 }, { "epoch": 1.416, "grad_norm": 2.053358316421509, "learning_rate": 0.0001434875444839858, "loss": 2.5823, "step": 3186 }, { "epoch": 1.4164444444444444, "grad_norm": 0.93979811668396, "learning_rate": 0.00014346975088967974, "loss": 0.0561, "step": 3187 }, { "epoch": 1.4168888888888889, "grad_norm": 1.6901674270629883, "learning_rate": 0.00014345195729537367, "loss": 1.9685, "step": 3188 }, { "epoch": 1.4173333333333333, "grad_norm": 1.8892921209335327, "learning_rate": 0.0001434341637010676, "loss": 2.1895, "step": 3189 }, { "epoch": 1.4177777777777778, "grad_norm": 2.0869650840759277, "learning_rate": 0.00014341637010676156, "loss": 2.1289, "step": 3190 }, { "epoch": 1.4182222222222223, "grad_norm": 2.154538154602051, "learning_rate": 0.00014339857651245552, "loss": 2.005, "step": 3191 }, { "epoch": 1.4186666666666667, "grad_norm": 2.2069180011749268, "learning_rate": 0.00014338078291814947, "loss": 2.2602, "step": 3192 }, { "epoch": 1.419111111111111, "grad_norm": 2.003593921661377, "learning_rate": 0.00014336298932384343, "loss": 1.9024, "step": 3193 }, { "epoch": 1.4195555555555557, "grad_norm": 2.1326658725738525, "learning_rate": 0.00014334519572953739, "loss": 2.3338, "step": 3194 }, { "epoch": 1.42, "grad_norm": 2.287719488143921, "learning_rate": 0.00014332740213523132, "loss": 2.1353, "step": 3195 }, { "epoch": 1.4204444444444444, "grad_norm": 1.7974603176116943, "learning_rate": 0.00014330960854092527, "loss": 1.9235, "step": 3196 }, { "epoch": 1.4208888888888889, "grad_norm": 2.7026476860046387, "learning_rate": 0.00014329181494661923, "loss": 1.8728, "step": 3197 }, { "epoch": 1.4213333333333333, "grad_norm": 2.0487453937530518, "learning_rate": 0.00014327402135231318, "loss": 2.3309, "step": 3198 }, { "epoch": 1.4217777777777778, "grad_norm": 2.1622159481048584, "learning_rate": 0.00014325622775800714, "loss": 2.104, "step": 3199 }, { "epoch": 1.4222222222222223, "grad_norm": 2.4706900119781494, "learning_rate": 0.0001432384341637011, "loss": 2.2835, "step": 3200 }, { "epoch": 1.4226666666666667, "grad_norm": 0.8910221457481384, "learning_rate": 0.00014322064056939503, "loss": 1.2837, "step": 3201 }, { "epoch": 1.423111111111111, "grad_norm": 1.3645347356796265, "learning_rate": 0.00014320284697508896, "loss": 2.7213, "step": 3202 }, { "epoch": 1.4235555555555557, "grad_norm": 1.3697624206542969, "learning_rate": 0.0001431850533807829, "loss": 2.6312, "step": 3203 }, { "epoch": 1.424, "grad_norm": 1.5852277278900146, "learning_rate": 0.00014316725978647687, "loss": 2.8356, "step": 3204 }, { "epoch": 1.4244444444444444, "grad_norm": 1.2539130449295044, "learning_rate": 0.00014314946619217083, "loss": 1.7608, "step": 3205 }, { "epoch": 1.4248888888888889, "grad_norm": 1.5661532878875732, "learning_rate": 0.00014313167259786478, "loss": 2.0956, "step": 3206 }, { "epoch": 1.4253333333333333, "grad_norm": 1.6269443035125732, "learning_rate": 0.00014311387900355874, "loss": 2.2012, "step": 3207 }, { "epoch": 1.4257777777777778, "grad_norm": 1.4714109897613525, "learning_rate": 0.00014309608540925267, "loss": 2.3546, "step": 3208 }, { "epoch": 1.4262222222222223, "grad_norm": 1.7439886331558228, "learning_rate": 0.00014307829181494662, "loss": 2.7, "step": 3209 }, { "epoch": 1.4266666666666667, "grad_norm": 1.4173275232315063, "learning_rate": 0.00014306049822064058, "loss": 2.0922, "step": 3210 }, { "epoch": 1.427111111111111, "grad_norm": 1.5306942462921143, "learning_rate": 0.00014304270462633454, "loss": 2.0992, "step": 3211 }, { "epoch": 1.4275555555555557, "grad_norm": 1.797987461090088, "learning_rate": 0.0001430249110320285, "loss": 2.9628, "step": 3212 }, { "epoch": 1.428, "grad_norm": 2.1177406311035156, "learning_rate": 0.00014300711743772245, "loss": 2.1528, "step": 3213 }, { "epoch": 1.4284444444444444, "grad_norm": 1.593675136566162, "learning_rate": 0.00014298932384341638, "loss": 2.5579, "step": 3214 }, { "epoch": 1.4288888888888889, "grad_norm": 1.5105654001235962, "learning_rate": 0.0001429715302491103, "loss": 2.1294, "step": 3215 }, { "epoch": 1.4293333333333333, "grad_norm": 1.520651936531067, "learning_rate": 0.00014295373665480427, "loss": 1.8752, "step": 3216 }, { "epoch": 1.4297777777777778, "grad_norm": 1.612784504890442, "learning_rate": 0.00014293594306049822, "loss": 2.0609, "step": 3217 }, { "epoch": 1.4302222222222223, "grad_norm": 1.5184054374694824, "learning_rate": 0.00014291814946619218, "loss": 1.4772, "step": 3218 }, { "epoch": 1.4306666666666668, "grad_norm": 1.46523916721344, "learning_rate": 0.00014290035587188614, "loss": 1.9029, "step": 3219 }, { "epoch": 1.431111111111111, "grad_norm": 1.4900418519973755, "learning_rate": 0.0001428825622775801, "loss": 1.5782, "step": 3220 }, { "epoch": 1.4315555555555557, "grad_norm": 1.8551801443099976, "learning_rate": 0.00014286476868327402, "loss": 2.47, "step": 3221 }, { "epoch": 1.432, "grad_norm": 1.8610374927520752, "learning_rate": 0.00014284697508896798, "loss": 2.5916, "step": 3222 }, { "epoch": 1.4324444444444444, "grad_norm": 1.73556649684906, "learning_rate": 0.00014282918149466193, "loss": 2.2463, "step": 3223 }, { "epoch": 1.4328888888888889, "grad_norm": 1.574223279953003, "learning_rate": 0.0001428113879003559, "loss": 1.7704, "step": 3224 }, { "epoch": 1.4333333333333333, "grad_norm": 2.1118242740631104, "learning_rate": 0.00014279359430604985, "loss": 1.9923, "step": 3225 }, { "epoch": 1.4337777777777778, "grad_norm": 2.1607296466827393, "learning_rate": 0.00014277580071174378, "loss": 2.2045, "step": 3226 }, { "epoch": 1.4342222222222223, "grad_norm": 1.532531499862671, "learning_rate": 0.0001427580071174377, "loss": 1.904, "step": 3227 }, { "epoch": 1.4346666666666668, "grad_norm": 2.332968235015869, "learning_rate": 0.00014274021352313166, "loss": 2.3031, "step": 3228 }, { "epoch": 1.435111111111111, "grad_norm": 1.7639070749282837, "learning_rate": 0.00014272241992882562, "loss": 2.3637, "step": 3229 }, { "epoch": 1.4355555555555555, "grad_norm": 2.296191453933716, "learning_rate": 0.00014270462633451958, "loss": 1.664, "step": 3230 }, { "epoch": 1.436, "grad_norm": 1.685143232345581, "learning_rate": 0.00014268683274021353, "loss": 1.9173, "step": 3231 }, { "epoch": 1.4364444444444444, "grad_norm": 1.8601534366607666, "learning_rate": 0.0001426690391459075, "loss": 2.0097, "step": 3232 }, { "epoch": 1.4368888888888889, "grad_norm": 1.818623423576355, "learning_rate": 0.00014265124555160142, "loss": 1.8948, "step": 3233 }, { "epoch": 1.4373333333333334, "grad_norm": 2.0175423622131348, "learning_rate": 0.00014263345195729537, "loss": 2.5014, "step": 3234 }, { "epoch": 1.4377777777777778, "grad_norm": 1.874712586402893, "learning_rate": 0.00014261565836298933, "loss": 2.4529, "step": 3235 }, { "epoch": 1.4382222222222223, "grad_norm": 2.350339889526367, "learning_rate": 0.0001425978647686833, "loss": 2.5295, "step": 3236 }, { "epoch": 1.4386666666666668, "grad_norm": 1.8386290073394775, "learning_rate": 0.00014258007117437724, "loss": 2.4049, "step": 3237 }, { "epoch": 1.439111111111111, "grad_norm": 1.9711859226226807, "learning_rate": 0.0001425622775800712, "loss": 2.1427, "step": 3238 }, { "epoch": 1.4395555555555555, "grad_norm": 1.9588954448699951, "learning_rate": 0.00014254448398576513, "loss": 2.0168, "step": 3239 }, { "epoch": 1.44, "grad_norm": 2.025226593017578, "learning_rate": 0.00014252669039145906, "loss": 1.9483, "step": 3240 }, { "epoch": 1.4404444444444444, "grad_norm": 2.1897053718566895, "learning_rate": 0.00014250889679715302, "loss": 2.3079, "step": 3241 }, { "epoch": 1.4408888888888889, "grad_norm": 1.9101537466049194, "learning_rate": 0.00014249110320284697, "loss": 1.5792, "step": 3242 }, { "epoch": 1.4413333333333334, "grad_norm": 2.3152666091918945, "learning_rate": 0.00014247330960854093, "loss": 2.6493, "step": 3243 }, { "epoch": 1.4417777777777778, "grad_norm": 3.443295478820801, "learning_rate": 0.00014245551601423489, "loss": 2.0788, "step": 3244 }, { "epoch": 1.4422222222222223, "grad_norm": 2.00852370262146, "learning_rate": 0.00014243772241992884, "loss": 2.2241, "step": 3245 }, { "epoch": 1.4426666666666668, "grad_norm": 2.1195225715637207, "learning_rate": 0.00014241992882562277, "loss": 2.2187, "step": 3246 }, { "epoch": 1.443111111111111, "grad_norm": 2.060398817062378, "learning_rate": 0.00014240213523131673, "loss": 1.73, "step": 3247 }, { "epoch": 1.4435555555555555, "grad_norm": 2.193606376647949, "learning_rate": 0.00014238434163701068, "loss": 2.5191, "step": 3248 }, { "epoch": 1.444, "grad_norm": 2.3782386779785156, "learning_rate": 0.00014236654804270464, "loss": 2.3936, "step": 3249 }, { "epoch": 1.4444444444444444, "grad_norm": 3.235896348953247, "learning_rate": 0.0001423487544483986, "loss": 2.299, "step": 3250 }, { "epoch": 1.444888888888889, "grad_norm": 1.3519978523254395, "learning_rate": 0.00014233096085409255, "loss": 2.6515, "step": 3251 }, { "epoch": 1.4453333333333334, "grad_norm": 1.7427107095718384, "learning_rate": 0.00014231316725978648, "loss": 2.758, "step": 3252 }, { "epoch": 1.4457777777777778, "grad_norm": 1.2374142408370972, "learning_rate": 0.0001422953736654804, "loss": 2.4892, "step": 3253 }, { "epoch": 1.4462222222222223, "grad_norm": 1.4171687364578247, "learning_rate": 0.00014227758007117437, "loss": 2.5246, "step": 3254 }, { "epoch": 1.4466666666666668, "grad_norm": 1.9048703908920288, "learning_rate": 0.00014225978647686833, "loss": 2.5152, "step": 3255 }, { "epoch": 1.447111111111111, "grad_norm": 1.6656217575073242, "learning_rate": 0.00014224199288256228, "loss": 2.3669, "step": 3256 }, { "epoch": 1.4475555555555555, "grad_norm": 1.9828662872314453, "learning_rate": 0.00014222419928825624, "loss": 2.5827, "step": 3257 }, { "epoch": 1.448, "grad_norm": 1.593224287033081, "learning_rate": 0.0001422064056939502, "loss": 2.3798, "step": 3258 }, { "epoch": 1.4484444444444444, "grad_norm": 1.7066659927368164, "learning_rate": 0.00014218861209964412, "loss": 2.5127, "step": 3259 }, { "epoch": 1.448888888888889, "grad_norm": 1.5722315311431885, "learning_rate": 0.00014217081850533808, "loss": 1.6095, "step": 3260 }, { "epoch": 1.4493333333333334, "grad_norm": 1.5399599075317383, "learning_rate": 0.00014215302491103204, "loss": 2.2268, "step": 3261 }, { "epoch": 1.4497777777777778, "grad_norm": 2.0712404251098633, "learning_rate": 0.000142135231316726, "loss": 1.9943, "step": 3262 }, { "epoch": 1.4502222222222223, "grad_norm": 1.9809646606445312, "learning_rate": 0.00014211743772241995, "loss": 2.8128, "step": 3263 }, { "epoch": 1.4506666666666668, "grad_norm": 1.5712720155715942, "learning_rate": 0.0001420996441281139, "loss": 2.3628, "step": 3264 }, { "epoch": 1.451111111111111, "grad_norm": 1.8636808395385742, "learning_rate": 0.00014208185053380784, "loss": 2.5732, "step": 3265 }, { "epoch": 1.4515555555555555, "grad_norm": 1.9150162935256958, "learning_rate": 0.00014206405693950177, "loss": 2.2289, "step": 3266 }, { "epoch": 1.452, "grad_norm": 1.132552146911621, "learning_rate": 0.00014204626334519572, "loss": 0.6418, "step": 3267 }, { "epoch": 1.4524444444444444, "grad_norm": 1.7738691568374634, "learning_rate": 0.00014202846975088968, "loss": 2.2081, "step": 3268 }, { "epoch": 1.452888888888889, "grad_norm": 1.6669455766677856, "learning_rate": 0.00014201067615658364, "loss": 1.912, "step": 3269 }, { "epoch": 1.4533333333333334, "grad_norm": 1.720966100692749, "learning_rate": 0.0001419928825622776, "loss": 2.7566, "step": 3270 }, { "epoch": 1.4537777777777778, "grad_norm": 1.593934178352356, "learning_rate": 0.00014197508896797155, "loss": 2.076, "step": 3271 }, { "epoch": 1.4542222222222223, "grad_norm": 1.9219107627868652, "learning_rate": 0.00014195729537366548, "loss": 2.3327, "step": 3272 }, { "epoch": 1.4546666666666668, "grad_norm": 1.694810390472412, "learning_rate": 0.00014193950177935943, "loss": 2.2732, "step": 3273 }, { "epoch": 1.455111111111111, "grad_norm": 2.009329319000244, "learning_rate": 0.0001419217081850534, "loss": 1.7474, "step": 3274 }, { "epoch": 1.4555555555555555, "grad_norm": 1.8578834533691406, "learning_rate": 0.00014190391459074735, "loss": 2.2934, "step": 3275 }, { "epoch": 1.456, "grad_norm": 1.8823623657226562, "learning_rate": 0.0001418861209964413, "loss": 2.2761, "step": 3276 }, { "epoch": 1.4564444444444444, "grad_norm": 1.817597508430481, "learning_rate": 0.00014186832740213526, "loss": 1.9425, "step": 3277 }, { "epoch": 1.456888888888889, "grad_norm": 2.0119707584381104, "learning_rate": 0.0001418505338078292, "loss": 2.2159, "step": 3278 }, { "epoch": 1.4573333333333334, "grad_norm": 1.5498830080032349, "learning_rate": 0.00014183274021352312, "loss": 1.4838, "step": 3279 }, { "epoch": 1.4577777777777778, "grad_norm": 1.7245508432388306, "learning_rate": 0.00014181494661921708, "loss": 2.362, "step": 3280 }, { "epoch": 1.458222222222222, "grad_norm": 1.6693973541259766, "learning_rate": 0.00014179715302491103, "loss": 2.0165, "step": 3281 }, { "epoch": 1.4586666666666668, "grad_norm": 1.8107409477233887, "learning_rate": 0.000141779359430605, "loss": 2.0808, "step": 3282 }, { "epoch": 1.459111111111111, "grad_norm": 1.8532699346542358, "learning_rate": 0.00014176156583629894, "loss": 2.2752, "step": 3283 }, { "epoch": 1.4595555555555555, "grad_norm": 1.8230167627334595, "learning_rate": 0.0001417437722419929, "loss": 2.1949, "step": 3284 }, { "epoch": 1.46, "grad_norm": 2.083483934402466, "learning_rate": 0.00014172597864768683, "loss": 2.4207, "step": 3285 }, { "epoch": 1.4604444444444444, "grad_norm": 2.386479377746582, "learning_rate": 0.0001417081850533808, "loss": 2.4775, "step": 3286 }, { "epoch": 1.460888888888889, "grad_norm": 1.8848226070404053, "learning_rate": 0.00014169039145907474, "loss": 2.1974, "step": 3287 }, { "epoch": 1.4613333333333334, "grad_norm": 1.7829835414886475, "learning_rate": 0.0001416725978647687, "loss": 1.9449, "step": 3288 }, { "epoch": 1.4617777777777778, "grad_norm": 1.645369529724121, "learning_rate": 0.00014165480427046266, "loss": 1.5355, "step": 3289 }, { "epoch": 1.462222222222222, "grad_norm": 2.256089687347412, "learning_rate": 0.0001416370106761566, "loss": 2.5316, "step": 3290 }, { "epoch": 1.4626666666666668, "grad_norm": 2.052887201309204, "learning_rate": 0.00014161921708185054, "loss": 2.3761, "step": 3291 }, { "epoch": 1.463111111111111, "grad_norm": 2.1068687438964844, "learning_rate": 0.00014160142348754447, "loss": 0.9723, "step": 3292 }, { "epoch": 1.4635555555555555, "grad_norm": 2.4331436157226562, "learning_rate": 0.00014158362989323843, "loss": 2.1379, "step": 3293 }, { "epoch": 1.464, "grad_norm": 1.9529526233673096, "learning_rate": 0.00014156583629893239, "loss": 2.0011, "step": 3294 }, { "epoch": 1.4644444444444444, "grad_norm": 2.1232481002807617, "learning_rate": 0.00014154804270462634, "loss": 1.8544, "step": 3295 }, { "epoch": 1.464888888888889, "grad_norm": 2.0463287830352783, "learning_rate": 0.0001415302491103203, "loss": 2.1042, "step": 3296 }, { "epoch": 1.4653333333333334, "grad_norm": 2.2398715019226074, "learning_rate": 0.00014151245551601425, "loss": 2.389, "step": 3297 }, { "epoch": 1.4657777777777778, "grad_norm": 2.3587806224823, "learning_rate": 0.00014149466192170818, "loss": 2.2351, "step": 3298 }, { "epoch": 1.466222222222222, "grad_norm": 2.272650957107544, "learning_rate": 0.00014147686832740214, "loss": 2.1772, "step": 3299 }, { "epoch": 1.4666666666666668, "grad_norm": 1.661880373954773, "learning_rate": 0.0001414590747330961, "loss": 1.2369, "step": 3300 }, { "epoch": 1.467111111111111, "grad_norm": 1.3112317323684692, "learning_rate": 0.00014144128113879005, "loss": 2.3523, "step": 3301 }, { "epoch": 1.4675555555555555, "grad_norm": 1.5207730531692505, "learning_rate": 0.000141423487544484, "loss": 2.3686, "step": 3302 }, { "epoch": 1.468, "grad_norm": 1.6390271186828613, "learning_rate": 0.00014140569395017797, "loss": 0.0503, "step": 3303 }, { "epoch": 1.4684444444444444, "grad_norm": 1.316325068473816, "learning_rate": 0.0001413879003558719, "loss": 2.2079, "step": 3304 }, { "epoch": 1.468888888888889, "grad_norm": 1.6727405786514282, "learning_rate": 0.00014137010676156583, "loss": 2.8864, "step": 3305 }, { "epoch": 1.4693333333333334, "grad_norm": 1.413974404335022, "learning_rate": 0.00014135231316725978, "loss": 1.4162, "step": 3306 }, { "epoch": 1.4697777777777778, "grad_norm": 1.4699324369430542, "learning_rate": 0.00014133451957295374, "loss": 2.0043, "step": 3307 }, { "epoch": 1.470222222222222, "grad_norm": 1.9841383695602417, "learning_rate": 0.0001413167259786477, "loss": 2.5486, "step": 3308 }, { "epoch": 1.4706666666666668, "grad_norm": 1.5038107633590698, "learning_rate": 0.00014129893238434165, "loss": 2.3013, "step": 3309 }, { "epoch": 1.471111111111111, "grad_norm": 1.5381580591201782, "learning_rate": 0.0001412811387900356, "loss": 2.1017, "step": 3310 }, { "epoch": 1.4715555555555555, "grad_norm": 1.4826030731201172, "learning_rate": 0.00014126334519572954, "loss": 2.4797, "step": 3311 }, { "epoch": 1.472, "grad_norm": 1.4599792957305908, "learning_rate": 0.0001412455516014235, "loss": 1.9834, "step": 3312 }, { "epoch": 1.4724444444444444, "grad_norm": 1.4552773237228394, "learning_rate": 0.00014122775800711745, "loss": 2.2435, "step": 3313 }, { "epoch": 1.472888888888889, "grad_norm": 1.4118120670318604, "learning_rate": 0.0001412099644128114, "loss": 1.735, "step": 3314 }, { "epoch": 1.4733333333333334, "grad_norm": 1.5949891805648804, "learning_rate": 0.00014119217081850536, "loss": 1.9896, "step": 3315 }, { "epoch": 1.4737777777777779, "grad_norm": 1.7730779647827148, "learning_rate": 0.0001411743772241993, "loss": 2.0223, "step": 3316 }, { "epoch": 1.474222222222222, "grad_norm": 1.7242622375488281, "learning_rate": 0.00014115658362989322, "loss": 2.3709, "step": 3317 }, { "epoch": 1.4746666666666668, "grad_norm": 1.8402231931686401, "learning_rate": 0.00014113879003558718, "loss": 2.3489, "step": 3318 }, { "epoch": 1.475111111111111, "grad_norm": 1.328906774520874, "learning_rate": 0.00014112099644128113, "loss": 1.0891, "step": 3319 }, { "epoch": 1.4755555555555555, "grad_norm": 1.8179643154144287, "learning_rate": 0.0001411032028469751, "loss": 2.6636, "step": 3320 }, { "epoch": 1.476, "grad_norm": 1.6425268650054932, "learning_rate": 0.00014108540925266905, "loss": 1.9906, "step": 3321 }, { "epoch": 1.4764444444444444, "grad_norm": 1.7107009887695312, "learning_rate": 0.000141067615658363, "loss": 2.066, "step": 3322 }, { "epoch": 1.476888888888889, "grad_norm": 1.7221518754959106, "learning_rate": 0.00014104982206405693, "loss": 2.4627, "step": 3323 }, { "epoch": 1.4773333333333334, "grad_norm": 1.9194080829620361, "learning_rate": 0.0001410320284697509, "loss": 2.2223, "step": 3324 }, { "epoch": 1.4777777777777779, "grad_norm": 1.7251501083374023, "learning_rate": 0.00014101423487544485, "loss": 1.902, "step": 3325 }, { "epoch": 1.478222222222222, "grad_norm": 1.7051130533218384, "learning_rate": 0.0001409964412811388, "loss": 2.2801, "step": 3326 }, { "epoch": 1.4786666666666668, "grad_norm": 2.0462424755096436, "learning_rate": 0.00014097864768683276, "loss": 2.2611, "step": 3327 }, { "epoch": 1.479111111111111, "grad_norm": 1.6997382640838623, "learning_rate": 0.00014096085409252672, "loss": 2.0672, "step": 3328 }, { "epoch": 1.4795555555555555, "grad_norm": 1.790878415107727, "learning_rate": 0.00014094306049822065, "loss": 1.866, "step": 3329 }, { "epoch": 1.48, "grad_norm": 2.1614255905151367, "learning_rate": 0.00014092526690391458, "loss": 2.691, "step": 3330 }, { "epoch": 1.4804444444444445, "grad_norm": 1.456119418144226, "learning_rate": 0.00014090747330960853, "loss": 0.8112, "step": 3331 }, { "epoch": 1.480888888888889, "grad_norm": 1.8652554750442505, "learning_rate": 0.0001408896797153025, "loss": 1.9741, "step": 3332 }, { "epoch": 1.4813333333333334, "grad_norm": 1.9750827550888062, "learning_rate": 0.00014087188612099644, "loss": 2.1358, "step": 3333 }, { "epoch": 1.4817777777777779, "grad_norm": 2.0224530696868896, "learning_rate": 0.0001408540925266904, "loss": 2.2299, "step": 3334 }, { "epoch": 1.482222222222222, "grad_norm": 2.1135432720184326, "learning_rate": 0.00014083629893238436, "loss": 2.2098, "step": 3335 }, { "epoch": 1.4826666666666668, "grad_norm": 1.8072072267532349, "learning_rate": 0.0001408185053380783, "loss": 1.9179, "step": 3336 }, { "epoch": 1.483111111111111, "grad_norm": 2.2474746704101562, "learning_rate": 0.00014080071174377224, "loss": 2.6121, "step": 3337 }, { "epoch": 1.4835555555555555, "grad_norm": 1.997774600982666, "learning_rate": 0.0001407829181494662, "loss": 1.8134, "step": 3338 }, { "epoch": 1.484, "grad_norm": 1.8671613931655884, "learning_rate": 0.00014076512455516016, "loss": 1.8315, "step": 3339 }, { "epoch": 1.4844444444444445, "grad_norm": 1.809183955192566, "learning_rate": 0.0001407473309608541, "loss": 1.9693, "step": 3340 }, { "epoch": 1.484888888888889, "grad_norm": 2.1424343585968018, "learning_rate": 0.00014072953736654807, "loss": 1.7883, "step": 3341 }, { "epoch": 1.4853333333333334, "grad_norm": 2.0938260555267334, "learning_rate": 0.000140711743772242, "loss": 2.2282, "step": 3342 }, { "epoch": 1.4857777777777779, "grad_norm": 2.3834707736968994, "learning_rate": 0.00014069395017793593, "loss": 2.1236, "step": 3343 }, { "epoch": 1.4862222222222221, "grad_norm": 2.215244770050049, "learning_rate": 0.00014067615658362988, "loss": 2.3009, "step": 3344 }, { "epoch": 1.4866666666666668, "grad_norm": 1.9744479656219482, "learning_rate": 0.00014065836298932384, "loss": 2.0397, "step": 3345 }, { "epoch": 1.487111111111111, "grad_norm": 2.0358409881591797, "learning_rate": 0.0001406405693950178, "loss": 1.7655, "step": 3346 }, { "epoch": 1.4875555555555555, "grad_norm": 2.4060535430908203, "learning_rate": 0.00014062277580071175, "loss": 2.1293, "step": 3347 }, { "epoch": 1.488, "grad_norm": 2.8066680431365967, "learning_rate": 0.0001406049822064057, "loss": 2.1486, "step": 3348 }, { "epoch": 1.4884444444444445, "grad_norm": 2.352820873260498, "learning_rate": 0.00014058718861209964, "loss": 1.4603, "step": 3349 }, { "epoch": 1.488888888888889, "grad_norm": 2.9096052646636963, "learning_rate": 0.0001405693950177936, "loss": 2.8709, "step": 3350 }, { "epoch": 1.4893333333333334, "grad_norm": 1.335901141166687, "learning_rate": 0.00014055160142348755, "loss": 2.7017, "step": 3351 }, { "epoch": 1.4897777777777779, "grad_norm": 1.4894367456436157, "learning_rate": 0.0001405338078291815, "loss": 2.3371, "step": 3352 }, { "epoch": 1.4902222222222221, "grad_norm": 1.6448569297790527, "learning_rate": 0.00014051601423487547, "loss": 2.3155, "step": 3353 }, { "epoch": 1.4906666666666666, "grad_norm": 1.5843652486801147, "learning_rate": 0.00014049822064056942, "loss": 2.0601, "step": 3354 }, { "epoch": 1.491111111111111, "grad_norm": 1.6974892616271973, "learning_rate": 0.00014048042704626335, "loss": 2.651, "step": 3355 }, { "epoch": 1.4915555555555555, "grad_norm": 1.653384804725647, "learning_rate": 0.00014046263345195728, "loss": 2.0151, "step": 3356 }, { "epoch": 1.492, "grad_norm": 1.6719948053359985, "learning_rate": 0.00014044483985765124, "loss": 2.2734, "step": 3357 }, { "epoch": 1.4924444444444445, "grad_norm": 1.239280104637146, "learning_rate": 0.0001404270462633452, "loss": 0.9993, "step": 3358 }, { "epoch": 1.492888888888889, "grad_norm": 1.8192747831344604, "learning_rate": 0.00014040925266903915, "loss": 2.4339, "step": 3359 }, { "epoch": 1.4933333333333334, "grad_norm": 1.7065457105636597, "learning_rate": 0.0001403914590747331, "loss": 2.1031, "step": 3360 }, { "epoch": 1.4937777777777779, "grad_norm": 1.8094873428344727, "learning_rate": 0.00014037366548042706, "loss": 2.3386, "step": 3361 }, { "epoch": 1.4942222222222221, "grad_norm": 1.534125566482544, "learning_rate": 0.000140355871886121, "loss": 1.7805, "step": 3362 }, { "epoch": 1.4946666666666666, "grad_norm": 1.619163990020752, "learning_rate": 0.00014033807829181495, "loss": 2.4424, "step": 3363 }, { "epoch": 1.495111111111111, "grad_norm": 1.7250571250915527, "learning_rate": 0.0001403202846975089, "loss": 2.0668, "step": 3364 }, { "epoch": 1.4955555555555555, "grad_norm": 1.4816625118255615, "learning_rate": 0.00014030249110320286, "loss": 2.0556, "step": 3365 }, { "epoch": 1.496, "grad_norm": 1.794979214668274, "learning_rate": 0.00014028469750889682, "loss": 1.9561, "step": 3366 }, { "epoch": 1.4964444444444445, "grad_norm": 1.855759859085083, "learning_rate": 0.00014026690391459078, "loss": 2.3782, "step": 3367 }, { "epoch": 1.496888888888889, "grad_norm": 1.706645131111145, "learning_rate": 0.0001402491103202847, "loss": 2.3613, "step": 3368 }, { "epoch": 1.4973333333333334, "grad_norm": 1.6514323949813843, "learning_rate": 0.00014023131672597863, "loss": 2.3841, "step": 3369 }, { "epoch": 1.4977777777777779, "grad_norm": 1.9299840927124023, "learning_rate": 0.0001402135231316726, "loss": 1.8054, "step": 3370 }, { "epoch": 1.4982222222222221, "grad_norm": 1.72597336769104, "learning_rate": 0.00014019572953736655, "loss": 2.2748, "step": 3371 }, { "epoch": 1.4986666666666666, "grad_norm": 1.791800618171692, "learning_rate": 0.0001401779359430605, "loss": 1.8032, "step": 3372 }, { "epoch": 1.499111111111111, "grad_norm": 1.779707431793213, "learning_rate": 0.00014016014234875446, "loss": 2.1661, "step": 3373 }, { "epoch": 1.4995555555555555, "grad_norm": 1.7183303833007812, "learning_rate": 0.00014014234875444842, "loss": 1.9356, "step": 3374 }, { "epoch": 1.5, "grad_norm": 1.637531042098999, "learning_rate": 0.00014012455516014235, "loss": 1.9201, "step": 3375 }, { "epoch": 1.5004444444444445, "grad_norm": 1.4911421537399292, "learning_rate": 0.0001401067615658363, "loss": 1.2015, "step": 3376 }, { "epoch": 1.500888888888889, "grad_norm": 1.6465941667556763, "learning_rate": 0.00014008896797153026, "loss": 1.8771, "step": 3377 }, { "epoch": 1.5013333333333332, "grad_norm": 1.7987436056137085, "learning_rate": 0.00014007117437722422, "loss": 2.0635, "step": 3378 }, { "epoch": 1.5017777777777779, "grad_norm": 1.6817963123321533, "learning_rate": 0.00014005338078291817, "loss": 1.8847, "step": 3379 }, { "epoch": 1.5022222222222221, "grad_norm": 1.6715887784957886, "learning_rate": 0.00014003558718861213, "loss": 1.8009, "step": 3380 }, { "epoch": 1.5026666666666668, "grad_norm": 2.189204216003418, "learning_rate": 0.00014001779359430606, "loss": 2.1705, "step": 3381 }, { "epoch": 1.503111111111111, "grad_norm": 1.7621302604675293, "learning_rate": 0.00014, "loss": 2.3973, "step": 3382 }, { "epoch": 1.5035555555555555, "grad_norm": 1.9194334745407104, "learning_rate": 0.00013998220640569394, "loss": 2.1278, "step": 3383 }, { "epoch": 1.504, "grad_norm": 2.001845121383667, "learning_rate": 0.0001399644128113879, "loss": 2.6398, "step": 3384 }, { "epoch": 1.5044444444444445, "grad_norm": 1.9402610063552856, "learning_rate": 0.00013994661921708186, "loss": 1.8872, "step": 3385 }, { "epoch": 1.504888888888889, "grad_norm": 1.9671640396118164, "learning_rate": 0.00013992882562277581, "loss": 2.1836, "step": 3386 }, { "epoch": 1.5053333333333332, "grad_norm": 2.1770575046539307, "learning_rate": 0.00013991103202846977, "loss": 2.1509, "step": 3387 }, { "epoch": 1.5057777777777779, "grad_norm": 2.059763193130493, "learning_rate": 0.0001398932384341637, "loss": 2.3197, "step": 3388 }, { "epoch": 1.5062222222222221, "grad_norm": 1.899730920791626, "learning_rate": 0.00013987544483985766, "loss": 1.9101, "step": 3389 }, { "epoch": 1.5066666666666668, "grad_norm": 1.6671397686004639, "learning_rate": 0.0001398576512455516, "loss": 1.8031, "step": 3390 }, { "epoch": 1.507111111111111, "grad_norm": 2.2725398540496826, "learning_rate": 0.00013983985765124557, "loss": 2.2501, "step": 3391 }, { "epoch": 1.5075555555555555, "grad_norm": 2.0739402770996094, "learning_rate": 0.00013982206405693953, "loss": 2.5371, "step": 3392 }, { "epoch": 1.508, "grad_norm": 2.2705914974212646, "learning_rate": 0.00013980427046263348, "loss": 2.1084, "step": 3393 }, { "epoch": 1.5084444444444445, "grad_norm": 1.981615662574768, "learning_rate": 0.0001397864768683274, "loss": 1.9832, "step": 3394 }, { "epoch": 1.508888888888889, "grad_norm": 2.1815102100372314, "learning_rate": 0.00013976868327402134, "loss": 2.1987, "step": 3395 }, { "epoch": 1.5093333333333332, "grad_norm": 1.9982526302337646, "learning_rate": 0.0001397508896797153, "loss": 2.1131, "step": 3396 }, { "epoch": 1.5097777777777779, "grad_norm": 2.5864624977111816, "learning_rate": 0.00013973309608540925, "loss": 2.7484, "step": 3397 }, { "epoch": 1.5102222222222221, "grad_norm": 2.7735939025878906, "learning_rate": 0.0001397153024911032, "loss": 2.8055, "step": 3398 }, { "epoch": 1.5106666666666668, "grad_norm": 2.547844648361206, "learning_rate": 0.00013969750889679717, "loss": 2.8584, "step": 3399 }, { "epoch": 1.511111111111111, "grad_norm": 3.193415880203247, "learning_rate": 0.0001396797153024911, "loss": 1.5981, "step": 3400 }, { "epoch": 1.5115555555555555, "grad_norm": 1.2286021709442139, "learning_rate": 0.00013966192170818505, "loss": 2.0801, "step": 3401 }, { "epoch": 1.512, "grad_norm": 1.3546948432922363, "learning_rate": 0.000139644128113879, "loss": 2.2488, "step": 3402 }, { "epoch": 1.5124444444444445, "grad_norm": 1.3589816093444824, "learning_rate": 0.00013962633451957297, "loss": 2.5642, "step": 3403 }, { "epoch": 1.512888888888889, "grad_norm": 0.9513995051383972, "learning_rate": 0.00013960854092526692, "loss": 0.5388, "step": 3404 }, { "epoch": 1.5133333333333332, "grad_norm": 1.2379084825515747, "learning_rate": 0.00013959074733096088, "loss": 1.2631, "step": 3405 }, { "epoch": 1.5137777777777779, "grad_norm": 1.4364656209945679, "learning_rate": 0.0001395729537366548, "loss": 2.2967, "step": 3406 }, { "epoch": 1.5142222222222221, "grad_norm": 1.6733119487762451, "learning_rate": 0.00013955516014234874, "loss": 2.4779, "step": 3407 }, { "epoch": 1.5146666666666668, "grad_norm": 1.715740442276001, "learning_rate": 0.0001395373665480427, "loss": 2.4899, "step": 3408 }, { "epoch": 1.515111111111111, "grad_norm": 1.6031004190444946, "learning_rate": 0.00013951957295373665, "loss": 2.3027, "step": 3409 }, { "epoch": 1.5155555555555555, "grad_norm": 1.6325712203979492, "learning_rate": 0.0001395017793594306, "loss": 1.7408, "step": 3410 }, { "epoch": 1.516, "grad_norm": 1.7524162530899048, "learning_rate": 0.00013948398576512456, "loss": 2.1623, "step": 3411 }, { "epoch": 1.5164444444444445, "grad_norm": 1.493108868598938, "learning_rate": 0.00013946619217081852, "loss": 2.1662, "step": 3412 }, { "epoch": 1.516888888888889, "grad_norm": 1.7417209148406982, "learning_rate": 0.00013944839857651245, "loss": 2.3913, "step": 3413 }, { "epoch": 1.5173333333333332, "grad_norm": 1.671183705329895, "learning_rate": 0.0001394306049822064, "loss": 2.4024, "step": 3414 }, { "epoch": 1.517777777777778, "grad_norm": 1.567742943763733, "learning_rate": 0.00013941281138790036, "loss": 1.9279, "step": 3415 }, { "epoch": 1.5182222222222221, "grad_norm": 1.7501814365386963, "learning_rate": 0.00013939501779359432, "loss": 1.9253, "step": 3416 }, { "epoch": 1.5186666666666668, "grad_norm": 1.5426925420761108, "learning_rate": 0.00013937722419928828, "loss": 2.021, "step": 3417 }, { "epoch": 1.519111111111111, "grad_norm": 1.4181113243103027, "learning_rate": 0.00013935943060498223, "loss": 1.803, "step": 3418 }, { "epoch": 1.5195555555555555, "grad_norm": 1.6199541091918945, "learning_rate": 0.00013934163701067616, "loss": 1.8904, "step": 3419 }, { "epoch": 1.52, "grad_norm": 1.6284871101379395, "learning_rate": 0.0001393238434163701, "loss": 2.1618, "step": 3420 }, { "epoch": 1.5204444444444445, "grad_norm": 2.004983425140381, "learning_rate": 0.00013930604982206405, "loss": 2.4985, "step": 3421 }, { "epoch": 1.520888888888889, "grad_norm": 1.890509843826294, "learning_rate": 0.000139288256227758, "loss": 2.0206, "step": 3422 }, { "epoch": 1.5213333333333332, "grad_norm": 2.1789512634277344, "learning_rate": 0.00013927046263345196, "loss": 1.9493, "step": 3423 }, { "epoch": 1.521777777777778, "grad_norm": 1.6540831327438354, "learning_rate": 0.00013925266903914592, "loss": 2.0598, "step": 3424 }, { "epoch": 1.5222222222222221, "grad_norm": 2.1028473377227783, "learning_rate": 0.00013923487544483987, "loss": 2.4147, "step": 3425 }, { "epoch": 1.5226666666666666, "grad_norm": 2.1696505546569824, "learning_rate": 0.0001392170818505338, "loss": 2.3895, "step": 3426 }, { "epoch": 1.523111111111111, "grad_norm": 2.0171515941619873, "learning_rate": 0.00013919928825622776, "loss": 2.0155, "step": 3427 }, { "epoch": 1.5235555555555556, "grad_norm": 2.0088951587677, "learning_rate": 0.00013918149466192172, "loss": 2.1746, "step": 3428 }, { "epoch": 1.524, "grad_norm": 1.8271888494491577, "learning_rate": 0.00013916370106761567, "loss": 2.2367, "step": 3429 }, { "epoch": 1.5244444444444445, "grad_norm": 2.0240986347198486, "learning_rate": 0.00013914590747330963, "loss": 2.5284, "step": 3430 }, { "epoch": 1.524888888888889, "grad_norm": 1.7613776922225952, "learning_rate": 0.00013912811387900359, "loss": 1.8526, "step": 3431 }, { "epoch": 1.5253333333333332, "grad_norm": 2.417299509048462, "learning_rate": 0.00013911032028469751, "loss": 2.6705, "step": 3432 }, { "epoch": 1.525777777777778, "grad_norm": 2.0129454135894775, "learning_rate": 0.00013909252669039144, "loss": 1.8315, "step": 3433 }, { "epoch": 1.5262222222222221, "grad_norm": 2.165886640548706, "learning_rate": 0.0001390747330960854, "loss": 2.5886, "step": 3434 }, { "epoch": 1.5266666666666666, "grad_norm": 2.1423709392547607, "learning_rate": 0.00013905693950177936, "loss": 1.9441, "step": 3435 }, { "epoch": 1.527111111111111, "grad_norm": 1.9875420331954956, "learning_rate": 0.0001390391459074733, "loss": 2.1317, "step": 3436 }, { "epoch": 1.5275555555555556, "grad_norm": 2.0450797080993652, "learning_rate": 0.00013902135231316727, "loss": 2.0809, "step": 3437 }, { "epoch": 1.528, "grad_norm": 2.3254358768463135, "learning_rate": 0.00013900355871886123, "loss": 2.5318, "step": 3438 }, { "epoch": 1.5284444444444445, "grad_norm": 2.041480779647827, "learning_rate": 0.00013898576512455516, "loss": 1.9015, "step": 3439 }, { "epoch": 1.528888888888889, "grad_norm": 1.6092534065246582, "learning_rate": 0.0001389679715302491, "loss": 1.5966, "step": 3440 }, { "epoch": 1.5293333333333332, "grad_norm": 2.0477304458618164, "learning_rate": 0.00013895017793594307, "loss": 2.1176, "step": 3441 }, { "epoch": 1.529777777777778, "grad_norm": 2.8084466457366943, "learning_rate": 0.00013893238434163703, "loss": 2.5453, "step": 3442 }, { "epoch": 1.5302222222222222, "grad_norm": 2.2902886867523193, "learning_rate": 0.00013891459074733098, "loss": 2.0898, "step": 3443 }, { "epoch": 1.5306666666666666, "grad_norm": 2.420135259628296, "learning_rate": 0.00013889679715302494, "loss": 2.3878, "step": 3444 }, { "epoch": 1.531111111111111, "grad_norm": 3.70841646194458, "learning_rate": 0.00013887900355871887, "loss": 2.4154, "step": 3445 }, { "epoch": 1.5315555555555556, "grad_norm": 2.6717746257781982, "learning_rate": 0.0001388612099644128, "loss": 2.1492, "step": 3446 }, { "epoch": 1.532, "grad_norm": 2.6898179054260254, "learning_rate": 0.00013884341637010675, "loss": 2.1948, "step": 3447 }, { "epoch": 1.5324444444444445, "grad_norm": 2.706569194793701, "learning_rate": 0.0001388256227758007, "loss": 2.5001, "step": 3448 }, { "epoch": 1.532888888888889, "grad_norm": 2.5174717903137207, "learning_rate": 0.00013880782918149467, "loss": 2.0004, "step": 3449 }, { "epoch": 1.5333333333333332, "grad_norm": 2.5441904067993164, "learning_rate": 0.00013879003558718862, "loss": 2.0386, "step": 3450 }, { "epoch": 1.533777777777778, "grad_norm": 1.2441266775131226, "learning_rate": 0.00013877224199288258, "loss": 2.3708, "step": 3451 }, { "epoch": 1.5342222222222222, "grad_norm": 1.6138944625854492, "learning_rate": 0.0001387544483985765, "loss": 2.2968, "step": 3452 }, { "epoch": 1.5346666666666666, "grad_norm": 1.7886635065078735, "learning_rate": 0.00013873665480427047, "loss": 2.3651, "step": 3453 }, { "epoch": 1.535111111111111, "grad_norm": 1.5253329277038574, "learning_rate": 0.00013871886120996442, "loss": 2.409, "step": 3454 }, { "epoch": 1.5355555555555556, "grad_norm": 1.6435463428497314, "learning_rate": 0.00013870106761565838, "loss": 2.1088, "step": 3455 }, { "epoch": 1.536, "grad_norm": 1.7751388549804688, "learning_rate": 0.00013868327402135233, "loss": 1.8769, "step": 3456 }, { "epoch": 1.5364444444444443, "grad_norm": 1.4366142749786377, "learning_rate": 0.0001386654804270463, "loss": 1.3134, "step": 3457 }, { "epoch": 1.536888888888889, "grad_norm": 1.9377721548080444, "learning_rate": 0.00013864768683274022, "loss": 2.3063, "step": 3458 }, { "epoch": 1.5373333333333332, "grad_norm": 2.1702098846435547, "learning_rate": 0.00013862989323843415, "loss": 1.8274, "step": 3459 }, { "epoch": 1.537777777777778, "grad_norm": 1.566851019859314, "learning_rate": 0.0001386120996441281, "loss": 2.2409, "step": 3460 }, { "epoch": 1.5382222222222222, "grad_norm": 1.5840020179748535, "learning_rate": 0.00013859430604982206, "loss": 2.676, "step": 3461 }, { "epoch": 1.5386666666666666, "grad_norm": 1.6341493129730225, "learning_rate": 0.00013857651245551602, "loss": 1.7889, "step": 3462 }, { "epoch": 1.539111111111111, "grad_norm": 1.810591220855713, "learning_rate": 0.00013855871886120998, "loss": 2.312, "step": 3463 }, { "epoch": 1.5395555555555556, "grad_norm": 1.6743979454040527, "learning_rate": 0.00013854092526690393, "loss": 2.1647, "step": 3464 }, { "epoch": 1.54, "grad_norm": 1.6081904172897339, "learning_rate": 0.00013852313167259786, "loss": 2.3812, "step": 3465 }, { "epoch": 1.5404444444444443, "grad_norm": 1.6036180257797241, "learning_rate": 0.00013850533807829182, "loss": 2.1979, "step": 3466 }, { "epoch": 1.540888888888889, "grad_norm": 1.4913078546524048, "learning_rate": 0.00013848754448398578, "loss": 2.0216, "step": 3467 }, { "epoch": 1.5413333333333332, "grad_norm": 1.6572915315628052, "learning_rate": 0.00013846975088967973, "loss": 2.0528, "step": 3468 }, { "epoch": 1.541777777777778, "grad_norm": 2.2356021404266357, "learning_rate": 0.0001384519572953737, "loss": 2.1856, "step": 3469 }, { "epoch": 1.5422222222222222, "grad_norm": 1.9612963199615479, "learning_rate": 0.00013843416370106764, "loss": 2.4308, "step": 3470 }, { "epoch": 1.5426666666666666, "grad_norm": 1.8817795515060425, "learning_rate": 0.00013841637010676157, "loss": 2.1989, "step": 3471 }, { "epoch": 1.543111111111111, "grad_norm": 1.5524616241455078, "learning_rate": 0.0001383985765124555, "loss": 1.9362, "step": 3472 }, { "epoch": 1.5435555555555556, "grad_norm": 1.8420137166976929, "learning_rate": 0.00013838078291814946, "loss": 2.0204, "step": 3473 }, { "epoch": 1.544, "grad_norm": 1.6466306447982788, "learning_rate": 0.00013836298932384342, "loss": 2.0033, "step": 3474 }, { "epoch": 1.5444444444444443, "grad_norm": 1.7015817165374756, "learning_rate": 0.00013834519572953737, "loss": 1.986, "step": 3475 }, { "epoch": 1.544888888888889, "grad_norm": 2.1093199253082275, "learning_rate": 0.00013832740213523133, "loss": 2.3153, "step": 3476 }, { "epoch": 1.5453333333333332, "grad_norm": 1.966652274131775, "learning_rate": 0.00013830960854092529, "loss": 2.2379, "step": 3477 }, { "epoch": 1.545777777777778, "grad_norm": 1.2320265769958496, "learning_rate": 0.00013829181494661922, "loss": 0.9829, "step": 3478 }, { "epoch": 1.5462222222222222, "grad_norm": 2.0316035747528076, "learning_rate": 0.00013827402135231317, "loss": 2.2463, "step": 3479 }, { "epoch": 1.5466666666666666, "grad_norm": 1.9726709127426147, "learning_rate": 0.00013825622775800713, "loss": 1.8361, "step": 3480 }, { "epoch": 1.547111111111111, "grad_norm": 1.76983642578125, "learning_rate": 0.00013823843416370108, "loss": 2.151, "step": 3481 }, { "epoch": 1.5475555555555556, "grad_norm": 1.8415701389312744, "learning_rate": 0.00013822064056939504, "loss": 2.0344, "step": 3482 }, { "epoch": 1.548, "grad_norm": 2.0059757232666016, "learning_rate": 0.000138202846975089, "loss": 2.1534, "step": 3483 }, { "epoch": 1.5484444444444443, "grad_norm": 1.8996038436889648, "learning_rate": 0.00013818505338078293, "loss": 2.3525, "step": 3484 }, { "epoch": 1.548888888888889, "grad_norm": 2.0750856399536133, "learning_rate": 0.00013816725978647686, "loss": 2.0834, "step": 3485 }, { "epoch": 1.5493333333333332, "grad_norm": 2.5925469398498535, "learning_rate": 0.0001381494661921708, "loss": 2.6209, "step": 3486 }, { "epoch": 1.549777777777778, "grad_norm": 2.2502434253692627, "learning_rate": 0.00013813167259786477, "loss": 2.4877, "step": 3487 }, { "epoch": 1.5502222222222222, "grad_norm": 2.1007752418518066, "learning_rate": 0.00013811387900355873, "loss": 2.164, "step": 3488 }, { "epoch": 1.5506666666666666, "grad_norm": 2.5511112213134766, "learning_rate": 0.00013809608540925268, "loss": 1.7419, "step": 3489 }, { "epoch": 1.551111111111111, "grad_norm": 2.076032876968384, "learning_rate": 0.0001380782918149466, "loss": 2.2016, "step": 3490 }, { "epoch": 1.5515555555555556, "grad_norm": 2.3464345932006836, "learning_rate": 0.00013806049822064057, "loss": 2.828, "step": 3491 }, { "epoch": 1.552, "grad_norm": 2.0291099548339844, "learning_rate": 0.00013804270462633452, "loss": 1.6438, "step": 3492 }, { "epoch": 1.5524444444444443, "grad_norm": 2.3025505542755127, "learning_rate": 0.00013802491103202848, "loss": 1.8687, "step": 3493 }, { "epoch": 1.552888888888889, "grad_norm": 2.7315847873687744, "learning_rate": 0.00013800711743772244, "loss": 2.6435, "step": 3494 }, { "epoch": 1.5533333333333332, "grad_norm": 2.493013620376587, "learning_rate": 0.0001379893238434164, "loss": 2.3683, "step": 3495 }, { "epoch": 1.553777777777778, "grad_norm": 2.1778078079223633, "learning_rate": 0.00013797153024911032, "loss": 2.2781, "step": 3496 }, { "epoch": 1.5542222222222222, "grad_norm": 2.7922496795654297, "learning_rate": 0.00013795373665480425, "loss": 2.0727, "step": 3497 }, { "epoch": 1.5546666666666666, "grad_norm": 2.76652193069458, "learning_rate": 0.0001379359430604982, "loss": 1.6055, "step": 3498 }, { "epoch": 1.555111111111111, "grad_norm": 2.2120234966278076, "learning_rate": 0.00013791814946619217, "loss": 1.9275, "step": 3499 }, { "epoch": 1.5555555555555556, "grad_norm": 3.5710928440093994, "learning_rate": 0.00013790035587188612, "loss": 2.7359, "step": 3500 }, { "epoch": 1.556, "grad_norm": 1.3622146844863892, "learning_rate": 0.00013788256227758008, "loss": 2.8565, "step": 3501 }, { "epoch": 1.5564444444444443, "grad_norm": 1.3293352127075195, "learning_rate": 0.00013786476868327404, "loss": 2.1254, "step": 3502 }, { "epoch": 1.556888888888889, "grad_norm": 1.4484037160873413, "learning_rate": 0.00013784697508896797, "loss": 2.5264, "step": 3503 }, { "epoch": 1.5573333333333332, "grad_norm": 1.5083986520767212, "learning_rate": 0.00013782918149466192, "loss": 2.212, "step": 3504 }, { "epoch": 1.557777777777778, "grad_norm": 1.6028261184692383, "learning_rate": 0.00013781138790035588, "loss": 2.1341, "step": 3505 }, { "epoch": 1.5582222222222222, "grad_norm": 1.4829325675964355, "learning_rate": 0.00013779359430604983, "loss": 1.8982, "step": 3506 }, { "epoch": 1.5586666666666666, "grad_norm": 1.533338189125061, "learning_rate": 0.0001377758007117438, "loss": 2.2298, "step": 3507 }, { "epoch": 1.5591111111111111, "grad_norm": 1.5170714855194092, "learning_rate": 0.00013775800711743775, "loss": 1.9902, "step": 3508 }, { "epoch": 1.5595555555555556, "grad_norm": 1.5905641317367554, "learning_rate": 0.00013774021352313168, "loss": 2.1633, "step": 3509 }, { "epoch": 1.56, "grad_norm": 1.601283311843872, "learning_rate": 0.0001377224199288256, "loss": 2.1138, "step": 3510 }, { "epoch": 1.5604444444444443, "grad_norm": 1.5424052476882935, "learning_rate": 0.00013770462633451956, "loss": 1.9056, "step": 3511 }, { "epoch": 1.560888888888889, "grad_norm": 1.7833845615386963, "learning_rate": 0.00013768683274021352, "loss": 2.0569, "step": 3512 }, { "epoch": 1.5613333333333332, "grad_norm": 1.729130506515503, "learning_rate": 0.00013766903914590748, "loss": 2.2351, "step": 3513 }, { "epoch": 1.561777777777778, "grad_norm": 1.7116620540618896, "learning_rate": 0.00013765124555160143, "loss": 2.7517, "step": 3514 }, { "epoch": 1.5622222222222222, "grad_norm": 1.9057334661483765, "learning_rate": 0.0001376334519572954, "loss": 2.7395, "step": 3515 }, { "epoch": 1.5626666666666666, "grad_norm": 1.9144783020019531, "learning_rate": 0.00013761565836298932, "loss": 2.2624, "step": 3516 }, { "epoch": 1.5631111111111111, "grad_norm": 1.756967306137085, "learning_rate": 0.00013759786476868327, "loss": 1.5217, "step": 3517 }, { "epoch": 1.5635555555555556, "grad_norm": 2.028092861175537, "learning_rate": 0.00013758007117437723, "loss": 2.6578, "step": 3518 }, { "epoch": 1.564, "grad_norm": 1.8625141382217407, "learning_rate": 0.0001375622775800712, "loss": 2.4186, "step": 3519 }, { "epoch": 1.5644444444444443, "grad_norm": 2.1919872760772705, "learning_rate": 0.00013754448398576514, "loss": 2.2043, "step": 3520 }, { "epoch": 1.564888888888889, "grad_norm": 1.709939956665039, "learning_rate": 0.0001375266903914591, "loss": 2.3172, "step": 3521 }, { "epoch": 1.5653333333333332, "grad_norm": 2.1514196395874023, "learning_rate": 0.00013750889679715303, "loss": 2.4595, "step": 3522 }, { "epoch": 1.565777777777778, "grad_norm": 1.8870500326156616, "learning_rate": 0.00013749110320284696, "loss": 2.6702, "step": 3523 }, { "epoch": 1.5662222222222222, "grad_norm": 1.7913631200790405, "learning_rate": 0.00013747330960854092, "loss": 2.3106, "step": 3524 }, { "epoch": 1.5666666666666667, "grad_norm": 1.8012170791625977, "learning_rate": 0.00013745551601423487, "loss": 1.7666, "step": 3525 }, { "epoch": 1.5671111111111111, "grad_norm": 2.001098871231079, "learning_rate": 0.00013743772241992883, "loss": 2.1891, "step": 3526 }, { "epoch": 1.5675555555555556, "grad_norm": 1.766248345375061, "learning_rate": 0.00013741992882562279, "loss": 2.1883, "step": 3527 }, { "epoch": 1.568, "grad_norm": 1.6869791746139526, "learning_rate": 0.00013740213523131674, "loss": 1.8203, "step": 3528 }, { "epoch": 1.5684444444444443, "grad_norm": 1.724373459815979, "learning_rate": 0.00013738434163701067, "loss": 1.8375, "step": 3529 }, { "epoch": 1.568888888888889, "grad_norm": 1.7977555990219116, "learning_rate": 0.00013736654804270463, "loss": 2.2153, "step": 3530 }, { "epoch": 1.5693333333333332, "grad_norm": 1.736857295036316, "learning_rate": 0.00013734875444839858, "loss": 1.7471, "step": 3531 }, { "epoch": 1.569777777777778, "grad_norm": 1.7593656778335571, "learning_rate": 0.00013733096085409254, "loss": 1.7479, "step": 3532 }, { "epoch": 1.5702222222222222, "grad_norm": 1.9257147312164307, "learning_rate": 0.0001373131672597865, "loss": 1.9489, "step": 3533 }, { "epoch": 1.5706666666666667, "grad_norm": 2.0456788539886475, "learning_rate": 0.00013729537366548045, "loss": 2.1165, "step": 3534 }, { "epoch": 1.5711111111111111, "grad_norm": 2.0241599082946777, "learning_rate": 0.00013727758007117438, "loss": 2.3391, "step": 3535 }, { "epoch": 1.5715555555555556, "grad_norm": 2.1118271350860596, "learning_rate": 0.0001372597864768683, "loss": 2.3182, "step": 3536 }, { "epoch": 1.572, "grad_norm": 2.060579776763916, "learning_rate": 0.00013724199288256227, "loss": 2.3438, "step": 3537 }, { "epoch": 1.5724444444444443, "grad_norm": 1.868186354637146, "learning_rate": 0.00013722419928825623, "loss": 1.67, "step": 3538 }, { "epoch": 1.572888888888889, "grad_norm": 2.3484630584716797, "learning_rate": 0.00013720640569395018, "loss": 2.2858, "step": 3539 }, { "epoch": 1.5733333333333333, "grad_norm": 2.5575010776519775, "learning_rate": 0.00013718861209964414, "loss": 2.4188, "step": 3540 }, { "epoch": 1.573777777777778, "grad_norm": 1.9896149635314941, "learning_rate": 0.0001371708185053381, "loss": 1.6996, "step": 3541 }, { "epoch": 1.5742222222222222, "grad_norm": 2.069122791290283, "learning_rate": 0.00013715302491103202, "loss": 1.705, "step": 3542 }, { "epoch": 1.5746666666666667, "grad_norm": 2.1964266300201416, "learning_rate": 0.00013713523131672598, "loss": 1.8919, "step": 3543 }, { "epoch": 1.5751111111111111, "grad_norm": 2.8693044185638428, "learning_rate": 0.00013711743772241994, "loss": 2.5328, "step": 3544 }, { "epoch": 1.5755555555555556, "grad_norm": 2.714355707168579, "learning_rate": 0.0001370996441281139, "loss": 2.4895, "step": 3545 }, { "epoch": 1.576, "grad_norm": 2.324647903442383, "learning_rate": 0.00013708185053380785, "loss": 2.1343, "step": 3546 }, { "epoch": 1.5764444444444443, "grad_norm": 2.0678861141204834, "learning_rate": 0.0001370640569395018, "loss": 2.1356, "step": 3547 }, { "epoch": 1.576888888888889, "grad_norm": 1.9118576049804688, "learning_rate": 0.00013704626334519574, "loss": 1.1119, "step": 3548 }, { "epoch": 1.5773333333333333, "grad_norm": 0.4530702829360962, "learning_rate": 0.00013702846975088967, "loss": 0.078, "step": 3549 }, { "epoch": 1.5777777777777777, "grad_norm": 2.177344560623169, "learning_rate": 0.00013701067615658362, "loss": 1.7089, "step": 3550 }, { "epoch": 1.5782222222222222, "grad_norm": 1.2385826110839844, "learning_rate": 0.00013699288256227758, "loss": 2.394, "step": 3551 }, { "epoch": 1.5786666666666667, "grad_norm": 1.412482500076294, "learning_rate": 0.00013697508896797154, "loss": 2.4647, "step": 3552 }, { "epoch": 1.5791111111111111, "grad_norm": 1.1037476062774658, "learning_rate": 0.0001369572953736655, "loss": 1.245, "step": 3553 }, { "epoch": 1.5795555555555556, "grad_norm": 1.6340032815933228, "learning_rate": 0.00013693950177935945, "loss": 1.5744, "step": 3554 }, { "epoch": 1.58, "grad_norm": 1.600129246711731, "learning_rate": 0.00013692170818505338, "loss": 2.1649, "step": 3555 }, { "epoch": 1.5804444444444443, "grad_norm": 1.5471206903457642, "learning_rate": 0.00013690391459074733, "loss": 1.8665, "step": 3556 }, { "epoch": 1.580888888888889, "grad_norm": 1.4040687084197998, "learning_rate": 0.0001368861209964413, "loss": 2.1955, "step": 3557 }, { "epoch": 1.5813333333333333, "grad_norm": 1.490673542022705, "learning_rate": 0.00013686832740213525, "loss": 2.0096, "step": 3558 }, { "epoch": 1.5817777777777777, "grad_norm": 1.6409083604812622, "learning_rate": 0.0001368505338078292, "loss": 2.2744, "step": 3559 }, { "epoch": 1.5822222222222222, "grad_norm": 1.574512004852295, "learning_rate": 0.00013683274021352316, "loss": 1.9902, "step": 3560 }, { "epoch": 1.5826666666666667, "grad_norm": 1.7083110809326172, "learning_rate": 0.0001368149466192171, "loss": 1.6893, "step": 3561 }, { "epoch": 1.5831111111111111, "grad_norm": 1.8733265399932861, "learning_rate": 0.00013679715302491102, "loss": 2.2119, "step": 3562 }, { "epoch": 1.5835555555555556, "grad_norm": 1.8871500492095947, "learning_rate": 0.00013677935943060498, "loss": 2.3291, "step": 3563 }, { "epoch": 1.584, "grad_norm": 1.5511809587478638, "learning_rate": 0.00013676156583629893, "loss": 2.2039, "step": 3564 }, { "epoch": 1.5844444444444443, "grad_norm": 1.8675401210784912, "learning_rate": 0.0001367437722419929, "loss": 2.0337, "step": 3565 }, { "epoch": 1.584888888888889, "grad_norm": 1.8697566986083984, "learning_rate": 0.00013672597864768684, "loss": 1.5566, "step": 3566 }, { "epoch": 1.5853333333333333, "grad_norm": 1.7386986017227173, "learning_rate": 0.0001367081850533808, "loss": 2.0596, "step": 3567 }, { "epoch": 1.5857777777777777, "grad_norm": 1.9721729755401611, "learning_rate": 0.00013669039145907473, "loss": 2.6456, "step": 3568 }, { "epoch": 1.5862222222222222, "grad_norm": 1.8694285154342651, "learning_rate": 0.0001366725978647687, "loss": 2.3915, "step": 3569 }, { "epoch": 1.5866666666666667, "grad_norm": 2.148411512374878, "learning_rate": 0.00013665480427046264, "loss": 2.6203, "step": 3570 }, { "epoch": 1.5871111111111111, "grad_norm": 1.9500958919525146, "learning_rate": 0.0001366370106761566, "loss": 2.3018, "step": 3571 }, { "epoch": 1.5875555555555556, "grad_norm": 1.888933777809143, "learning_rate": 0.00013661921708185056, "loss": 2.2713, "step": 3572 }, { "epoch": 1.588, "grad_norm": 2.1509106159210205, "learning_rate": 0.0001366014234875445, "loss": 2.2562, "step": 3573 }, { "epoch": 1.5884444444444443, "grad_norm": 1.9218084812164307, "learning_rate": 0.00013658362989323844, "loss": 1.9106, "step": 3574 }, { "epoch": 1.588888888888889, "grad_norm": 2.161742687225342, "learning_rate": 0.00013656583629893237, "loss": 2.061, "step": 3575 }, { "epoch": 1.5893333333333333, "grad_norm": 1.4747031927108765, "learning_rate": 0.00013654804270462633, "loss": 1.0917, "step": 3576 }, { "epoch": 1.5897777777777777, "grad_norm": 2.0540664196014404, "learning_rate": 0.00013653024911032029, "loss": 2.2196, "step": 3577 }, { "epoch": 1.5902222222222222, "grad_norm": 2.018321990966797, "learning_rate": 0.00013651245551601424, "loss": 2.4654, "step": 3578 }, { "epoch": 1.5906666666666667, "grad_norm": 2.201575517654419, "learning_rate": 0.0001364946619217082, "loss": 2.1311, "step": 3579 }, { "epoch": 1.5911111111111111, "grad_norm": 2.108130931854248, "learning_rate": 0.00013647686832740213, "loss": 2.4114, "step": 3580 }, { "epoch": 1.5915555555555554, "grad_norm": 1.782263159751892, "learning_rate": 0.00013645907473309608, "loss": 1.9585, "step": 3581 }, { "epoch": 1.592, "grad_norm": 2.01802134513855, "learning_rate": 0.00013644128113879004, "loss": 2.4353, "step": 3582 }, { "epoch": 1.5924444444444443, "grad_norm": 2.023480176925659, "learning_rate": 0.000136423487544484, "loss": 1.6961, "step": 3583 }, { "epoch": 1.592888888888889, "grad_norm": 2.0900518894195557, "learning_rate": 0.00013640569395017795, "loss": 2.1796, "step": 3584 }, { "epoch": 1.5933333333333333, "grad_norm": 2.238590717315674, "learning_rate": 0.0001363879003558719, "loss": 2.2932, "step": 3585 }, { "epoch": 1.5937777777777777, "grad_norm": 2.2192628383636475, "learning_rate": 0.00013637010676156584, "loss": 1.9587, "step": 3586 }, { "epoch": 1.5942222222222222, "grad_norm": 2.1162943840026855, "learning_rate": 0.00013635231316725977, "loss": 2.5576, "step": 3587 }, { "epoch": 1.5946666666666667, "grad_norm": 2.580493927001953, "learning_rate": 0.00013633451957295373, "loss": 2.4377, "step": 3588 }, { "epoch": 1.5951111111111111, "grad_norm": 2.192513942718506, "learning_rate": 0.00013631672597864768, "loss": 2.2521, "step": 3589 }, { "epoch": 1.5955555555555554, "grad_norm": 2.2190680503845215, "learning_rate": 0.00013629893238434164, "loss": 2.0167, "step": 3590 }, { "epoch": 1.596, "grad_norm": 2.1189939975738525, "learning_rate": 0.0001362811387900356, "loss": 2.1514, "step": 3591 }, { "epoch": 1.5964444444444443, "grad_norm": 1.9854724407196045, "learning_rate": 0.00013626334519572955, "loss": 2.4261, "step": 3592 }, { "epoch": 1.596888888888889, "grad_norm": 2.232863426208496, "learning_rate": 0.00013624555160142348, "loss": 2.0627, "step": 3593 }, { "epoch": 1.5973333333333333, "grad_norm": 2.1362996101379395, "learning_rate": 0.00013622775800711744, "loss": 2.0867, "step": 3594 }, { "epoch": 1.5977777777777777, "grad_norm": 2.116586923599243, "learning_rate": 0.0001362099644128114, "loss": 2.0974, "step": 3595 }, { "epoch": 1.5982222222222222, "grad_norm": 2.2818892002105713, "learning_rate": 0.00013619217081850535, "loss": 2.4957, "step": 3596 }, { "epoch": 1.5986666666666667, "grad_norm": 2.1921818256378174, "learning_rate": 0.0001361743772241993, "loss": 2.0431, "step": 3597 }, { "epoch": 1.5991111111111111, "grad_norm": 2.915421724319458, "learning_rate": 0.00013615658362989326, "loss": 2.8215, "step": 3598 }, { "epoch": 1.5995555555555554, "grad_norm": 2.097731590270996, "learning_rate": 0.0001361387900355872, "loss": 1.9497, "step": 3599 }, { "epoch": 1.6, "grad_norm": 2.766552209854126, "learning_rate": 0.00013612099644128112, "loss": 2.1285, "step": 3600 }, { "epoch": 1.6004444444444443, "grad_norm": 1.9238742589950562, "learning_rate": 0.00013610320284697508, "loss": 2.5526, "step": 3601 }, { "epoch": 1.600888888888889, "grad_norm": 1.162853717803955, "learning_rate": 0.00013608540925266903, "loss": 1.0674, "step": 3602 }, { "epoch": 1.6013333333333333, "grad_norm": 1.4226114749908447, "learning_rate": 0.000136067615658363, "loss": 2.2656, "step": 3603 }, { "epoch": 1.6017777777777777, "grad_norm": 1.5394558906555176, "learning_rate": 0.00013604982206405695, "loss": 2.1054, "step": 3604 }, { "epoch": 1.6022222222222222, "grad_norm": 1.7394784688949585, "learning_rate": 0.0001360320284697509, "loss": 2.3422, "step": 3605 }, { "epoch": 1.6026666666666667, "grad_norm": 1.7149112224578857, "learning_rate": 0.00013601423487544483, "loss": 2.4442, "step": 3606 }, { "epoch": 1.6031111111111112, "grad_norm": 1.8628418445587158, "learning_rate": 0.0001359964412811388, "loss": 2.5289, "step": 3607 }, { "epoch": 1.6035555555555554, "grad_norm": 1.5874834060668945, "learning_rate": 0.00013597864768683275, "loss": 2.4502, "step": 3608 }, { "epoch": 1.604, "grad_norm": 1.6472437381744385, "learning_rate": 0.0001359608540925267, "loss": 2.1657, "step": 3609 }, { "epoch": 1.6044444444444443, "grad_norm": 1.950655221939087, "learning_rate": 0.00013594306049822066, "loss": 1.9845, "step": 3610 }, { "epoch": 1.604888888888889, "grad_norm": 1.5699740648269653, "learning_rate": 0.00013592526690391462, "loss": 2.5056, "step": 3611 }, { "epoch": 1.6053333333333333, "grad_norm": 1.7249257564544678, "learning_rate": 0.00013590747330960855, "loss": 2.4613, "step": 3612 }, { "epoch": 1.6057777777777777, "grad_norm": 1.6163465976715088, "learning_rate": 0.00013588967971530248, "loss": 2.2756, "step": 3613 }, { "epoch": 1.6062222222222222, "grad_norm": 1.6546217203140259, "learning_rate": 0.00013587188612099643, "loss": 2.0813, "step": 3614 }, { "epoch": 1.6066666666666667, "grad_norm": 1.7059136629104614, "learning_rate": 0.0001358540925266904, "loss": 2.2557, "step": 3615 }, { "epoch": 1.6071111111111112, "grad_norm": 1.9633029699325562, "learning_rate": 0.00013583629893238434, "loss": 2.3581, "step": 3616 }, { "epoch": 1.6075555555555554, "grad_norm": 1.5104560852050781, "learning_rate": 0.0001358185053380783, "loss": 1.8789, "step": 3617 }, { "epoch": 1.608, "grad_norm": 1.7537540197372437, "learning_rate": 0.00013580071174377226, "loss": 2.1946, "step": 3618 }, { "epoch": 1.6084444444444443, "grad_norm": 1.8772600889205933, "learning_rate": 0.0001357829181494662, "loss": 2.3868, "step": 3619 }, { "epoch": 1.608888888888889, "grad_norm": 1.7722148895263672, "learning_rate": 0.00013576512455516014, "loss": 2.0009, "step": 3620 }, { "epoch": 1.6093333333333333, "grad_norm": 1.662315011024475, "learning_rate": 0.0001357473309608541, "loss": 1.721, "step": 3621 }, { "epoch": 1.6097777777777778, "grad_norm": 1.9207539558410645, "learning_rate": 0.00013572953736654806, "loss": 2.4827, "step": 3622 }, { "epoch": 1.6102222222222222, "grad_norm": 1.8683892488479614, "learning_rate": 0.000135711743772242, "loss": 2.2517, "step": 3623 }, { "epoch": 1.6106666666666667, "grad_norm": 1.8369477987289429, "learning_rate": 0.00013569395017793597, "loss": 2.0752, "step": 3624 }, { "epoch": 1.6111111111111112, "grad_norm": 1.9077363014221191, "learning_rate": 0.0001356761565836299, "loss": 1.9962, "step": 3625 }, { "epoch": 1.6115555555555554, "grad_norm": 1.9185972213745117, "learning_rate": 0.00013565836298932383, "loss": 1.8832, "step": 3626 }, { "epoch": 1.612, "grad_norm": 1.9970186948776245, "learning_rate": 0.00013564056939501778, "loss": 2.2276, "step": 3627 }, { "epoch": 1.6124444444444443, "grad_norm": 1.8423793315887451, "learning_rate": 0.00013562277580071174, "loss": 2.3652, "step": 3628 }, { "epoch": 1.612888888888889, "grad_norm": 2.101058006286621, "learning_rate": 0.0001356049822064057, "loss": 2.3492, "step": 3629 }, { "epoch": 1.6133333333333333, "grad_norm": 2.483633279800415, "learning_rate": 0.00013558718861209965, "loss": 2.4914, "step": 3630 }, { "epoch": 1.6137777777777778, "grad_norm": 2.1490073204040527, "learning_rate": 0.0001355693950177936, "loss": 1.7586, "step": 3631 }, { "epoch": 1.6142222222222222, "grad_norm": 2.103785276412964, "learning_rate": 0.00013555160142348754, "loss": 2.379, "step": 3632 }, { "epoch": 1.6146666666666667, "grad_norm": 2.0692965984344482, "learning_rate": 0.0001355338078291815, "loss": 2.0689, "step": 3633 }, { "epoch": 1.6151111111111112, "grad_norm": 2.025420665740967, "learning_rate": 0.00013551601423487545, "loss": 2.1845, "step": 3634 }, { "epoch": 1.6155555555555554, "grad_norm": 1.7488460540771484, "learning_rate": 0.0001354982206405694, "loss": 1.6538, "step": 3635 }, { "epoch": 1.616, "grad_norm": 2.308826446533203, "learning_rate": 0.00013548042704626337, "loss": 2.2989, "step": 3636 }, { "epoch": 1.6164444444444444, "grad_norm": 2.220733404159546, "learning_rate": 0.00013546263345195732, "loss": 2.6318, "step": 3637 }, { "epoch": 1.616888888888889, "grad_norm": 2.038041830062866, "learning_rate": 0.00013544483985765125, "loss": 1.8199, "step": 3638 }, { "epoch": 1.6173333333333333, "grad_norm": 2.3639976978302, "learning_rate": 0.00013542704626334518, "loss": 1.7709, "step": 3639 }, { "epoch": 1.6177777777777778, "grad_norm": 2.1797988414764404, "learning_rate": 0.00013540925266903914, "loss": 1.9726, "step": 3640 }, { "epoch": 1.6182222222222222, "grad_norm": 2.5964488983154297, "learning_rate": 0.0001353914590747331, "loss": 2.5678, "step": 3641 }, { "epoch": 1.6186666666666667, "grad_norm": 2.0529944896698, "learning_rate": 0.00013537366548042705, "loss": 1.9994, "step": 3642 }, { "epoch": 1.6191111111111112, "grad_norm": 1.9986488819122314, "learning_rate": 0.000135355871886121, "loss": 2.1701, "step": 3643 }, { "epoch": 1.6195555555555554, "grad_norm": 2.469334840774536, "learning_rate": 0.00013533807829181496, "loss": 2.5633, "step": 3644 }, { "epoch": 1.62, "grad_norm": 2.4147121906280518, "learning_rate": 0.0001353202846975089, "loss": 2.2908, "step": 3645 }, { "epoch": 1.6204444444444444, "grad_norm": 2.1340951919555664, "learning_rate": 0.00013530249110320285, "loss": 1.8365, "step": 3646 }, { "epoch": 1.620888888888889, "grad_norm": 2.6548309326171875, "learning_rate": 0.0001352846975088968, "loss": 2.6132, "step": 3647 }, { "epoch": 1.6213333333333333, "grad_norm": 2.923912286758423, "learning_rate": 0.00013526690391459076, "loss": 2.6588, "step": 3648 }, { "epoch": 1.6217777777777778, "grad_norm": 2.134868621826172, "learning_rate": 0.00013524911032028472, "loss": 0.9781, "step": 3649 }, { "epoch": 1.6222222222222222, "grad_norm": 3.1165969371795654, "learning_rate": 0.00013523131672597868, "loss": 1.6002, "step": 3650 }, { "epoch": 1.6226666666666667, "grad_norm": 1.4649471044540405, "learning_rate": 0.0001352135231316726, "loss": 0.9932, "step": 3651 }, { "epoch": 1.6231111111111112, "grad_norm": 1.4319626092910767, "learning_rate": 0.00013519572953736653, "loss": 2.754, "step": 3652 }, { "epoch": 1.6235555555555554, "grad_norm": 1.6543512344360352, "learning_rate": 0.0001351779359430605, "loss": 2.3402, "step": 3653 }, { "epoch": 1.624, "grad_norm": 1.540387511253357, "learning_rate": 0.00013516014234875445, "loss": 2.5623, "step": 3654 }, { "epoch": 1.6244444444444444, "grad_norm": 1.4166914224624634, "learning_rate": 0.0001351423487544484, "loss": 1.6858, "step": 3655 }, { "epoch": 1.624888888888889, "grad_norm": 2.0138063430786133, "learning_rate": 0.00013512455516014236, "loss": 2.6032, "step": 3656 }, { "epoch": 1.6253333333333333, "grad_norm": 1.8225046396255493, "learning_rate": 0.00013510676156583632, "loss": 2.6059, "step": 3657 }, { "epoch": 1.6257777777777778, "grad_norm": 1.8486915826797485, "learning_rate": 0.00013508896797153025, "loss": 2.212, "step": 3658 }, { "epoch": 1.6262222222222222, "grad_norm": 1.6416893005371094, "learning_rate": 0.0001350711743772242, "loss": 2.4992, "step": 3659 }, { "epoch": 1.6266666666666667, "grad_norm": 1.5142600536346436, "learning_rate": 0.00013505338078291816, "loss": 2.5075, "step": 3660 }, { "epoch": 1.6271111111111112, "grad_norm": 1.2331926822662354, "learning_rate": 0.00013503558718861212, "loss": 0.9244, "step": 3661 }, { "epoch": 1.6275555555555554, "grad_norm": 1.6997432708740234, "learning_rate": 0.00013501779359430607, "loss": 2.1417, "step": 3662 }, { "epoch": 1.6280000000000001, "grad_norm": 1.8140586614608765, "learning_rate": 0.00013500000000000003, "loss": 2.3071, "step": 3663 }, { "epoch": 1.6284444444444444, "grad_norm": 1.7522242069244385, "learning_rate": 0.00013498220640569396, "loss": 1.7328, "step": 3664 }, { "epoch": 1.628888888888889, "grad_norm": 1.7086604833602905, "learning_rate": 0.0001349644128113879, "loss": 2.15, "step": 3665 }, { "epoch": 1.6293333333333333, "grad_norm": 1.699811577796936, "learning_rate": 0.00013494661921708184, "loss": 2.1629, "step": 3666 }, { "epoch": 1.6297777777777778, "grad_norm": 1.6811929941177368, "learning_rate": 0.0001349288256227758, "loss": 1.7282, "step": 3667 }, { "epoch": 1.6302222222222222, "grad_norm": 1.8707060813903809, "learning_rate": 0.00013491103202846976, "loss": 2.4604, "step": 3668 }, { "epoch": 1.6306666666666667, "grad_norm": 1.7901276350021362, "learning_rate": 0.00013489323843416371, "loss": 2.1453, "step": 3669 }, { "epoch": 1.6311111111111112, "grad_norm": 1.7635587453842163, "learning_rate": 0.00013487544483985764, "loss": 2.688, "step": 3670 }, { "epoch": 1.6315555555555554, "grad_norm": 2.1348886489868164, "learning_rate": 0.0001348576512455516, "loss": 2.6815, "step": 3671 }, { "epoch": 1.6320000000000001, "grad_norm": 1.738974690437317, "learning_rate": 0.00013483985765124556, "loss": 2.1772, "step": 3672 }, { "epoch": 1.6324444444444444, "grad_norm": 1.7756551504135132, "learning_rate": 0.0001348220640569395, "loss": 2.3101, "step": 3673 }, { "epoch": 1.6328888888888888, "grad_norm": 1.9206149578094482, "learning_rate": 0.00013480427046263347, "loss": 2.318, "step": 3674 }, { "epoch": 1.6333333333333333, "grad_norm": 1.740354299545288, "learning_rate": 0.00013478647686832743, "loss": 1.6665, "step": 3675 }, { "epoch": 1.6337777777777778, "grad_norm": 1.8040590286254883, "learning_rate": 0.00013476868327402135, "loss": 1.9037, "step": 3676 }, { "epoch": 1.6342222222222222, "grad_norm": 1.8580551147460938, "learning_rate": 0.0001347508896797153, "loss": 1.9794, "step": 3677 }, { "epoch": 1.6346666666666667, "grad_norm": 1.9417858123779297, "learning_rate": 0.00013473309608540924, "loss": 2.0177, "step": 3678 }, { "epoch": 1.6351111111111112, "grad_norm": 1.7505145072937012, "learning_rate": 0.0001347153024911032, "loss": 2.1137, "step": 3679 }, { "epoch": 1.6355555555555554, "grad_norm": 2.044389486312866, "learning_rate": 0.00013469750889679715, "loss": 2.5816, "step": 3680 }, { "epoch": 1.6360000000000001, "grad_norm": 1.7209917306900024, "learning_rate": 0.0001346797153024911, "loss": 1.2114, "step": 3681 }, { "epoch": 1.6364444444444444, "grad_norm": 0.24984845519065857, "learning_rate": 0.00013466192170818507, "loss": 0.037, "step": 3682 }, { "epoch": 1.6368888888888888, "grad_norm": 2.0459017753601074, "learning_rate": 0.000134644128113879, "loss": 2.2279, "step": 3683 }, { "epoch": 1.6373333333333333, "grad_norm": 2.004143714904785, "learning_rate": 0.00013462633451957295, "loss": 2.1229, "step": 3684 }, { "epoch": 1.6377777777777778, "grad_norm": 1.9758665561676025, "learning_rate": 0.0001346085409252669, "loss": 2.0448, "step": 3685 }, { "epoch": 1.6382222222222222, "grad_norm": 2.130927324295044, "learning_rate": 0.00013459074733096087, "loss": 2.2224, "step": 3686 }, { "epoch": 1.6386666666666667, "grad_norm": 2.007413148880005, "learning_rate": 0.00013457295373665482, "loss": 2.2341, "step": 3687 }, { "epoch": 1.6391111111111112, "grad_norm": 2.422636032104492, "learning_rate": 0.00013455516014234878, "loss": 2.4104, "step": 3688 }, { "epoch": 1.6395555555555554, "grad_norm": 2.4192779064178467, "learning_rate": 0.0001345373665480427, "loss": 2.3782, "step": 3689 }, { "epoch": 1.6400000000000001, "grad_norm": 1.9421988725662231, "learning_rate": 0.00013451957295373666, "loss": 2.1848, "step": 3690 }, { "epoch": 1.6404444444444444, "grad_norm": 2.0658280849456787, "learning_rate": 0.0001345017793594306, "loss": 1.9037, "step": 3691 }, { "epoch": 1.6408888888888888, "grad_norm": 2.1715986728668213, "learning_rate": 0.00013448398576512455, "loss": 2.5529, "step": 3692 }, { "epoch": 1.6413333333333333, "grad_norm": 2.3295042514801025, "learning_rate": 0.0001344661921708185, "loss": 2.2945, "step": 3693 }, { "epoch": 1.6417777777777778, "grad_norm": 2.211021661758423, "learning_rate": 0.00013444839857651246, "loss": 2.2324, "step": 3694 }, { "epoch": 1.6422222222222222, "grad_norm": 2.5660417079925537, "learning_rate": 0.00013443060498220642, "loss": 2.6198, "step": 3695 }, { "epoch": 1.6426666666666667, "grad_norm": 2.0584888458251953, "learning_rate": 0.00013441281138790035, "loss": 1.9456, "step": 3696 }, { "epoch": 1.6431111111111112, "grad_norm": 2.6596102714538574, "learning_rate": 0.0001343950177935943, "loss": 3.0825, "step": 3697 }, { "epoch": 1.6435555555555554, "grad_norm": 2.368485689163208, "learning_rate": 0.00013437722419928826, "loss": 2.4511, "step": 3698 }, { "epoch": 1.6440000000000001, "grad_norm": 2.7313435077667236, "learning_rate": 0.00013435943060498222, "loss": 1.0798, "step": 3699 }, { "epoch": 1.6444444444444444, "grad_norm": 2.3031601905822754, "learning_rate": 0.00013434163701067618, "loss": 2.4437, "step": 3700 }, { "epoch": 1.6448888888888888, "grad_norm": 1.3070570230484009, "learning_rate": 0.00013432384341637013, "loss": 2.4191, "step": 3701 }, { "epoch": 1.6453333333333333, "grad_norm": 1.4076443910598755, "learning_rate": 0.00013430604982206406, "loss": 2.2877, "step": 3702 }, { "epoch": 1.6457777777777778, "grad_norm": 1.5221251249313354, "learning_rate": 0.000134288256227758, "loss": 2.6039, "step": 3703 }, { "epoch": 1.6462222222222223, "grad_norm": 1.4921314716339111, "learning_rate": 0.00013427046263345195, "loss": 2.7059, "step": 3704 }, { "epoch": 1.6466666666666665, "grad_norm": 1.7720656394958496, "learning_rate": 0.0001342526690391459, "loss": 2.9481, "step": 3705 }, { "epoch": 1.6471111111111112, "grad_norm": 1.6238889694213867, "learning_rate": 0.00013423487544483986, "loss": 1.6842, "step": 3706 }, { "epoch": 1.6475555555555554, "grad_norm": 1.5820468664169312, "learning_rate": 0.00013421708185053382, "loss": 2.4521, "step": 3707 }, { "epoch": 1.6480000000000001, "grad_norm": 1.584951639175415, "learning_rate": 0.00013419928825622777, "loss": 1.9391, "step": 3708 }, { "epoch": 1.6484444444444444, "grad_norm": 1.8263479471206665, "learning_rate": 0.0001341814946619217, "loss": 2.4315, "step": 3709 }, { "epoch": 1.6488888888888888, "grad_norm": 1.4678611755371094, "learning_rate": 0.00013416370106761566, "loss": 1.9397, "step": 3710 }, { "epoch": 1.6493333333333333, "grad_norm": 1.4494836330413818, "learning_rate": 0.00013414590747330962, "loss": 1.6722, "step": 3711 }, { "epoch": 1.6497777777777778, "grad_norm": 1.7007122039794922, "learning_rate": 0.00013412811387900357, "loss": 2.412, "step": 3712 }, { "epoch": 1.6502222222222223, "grad_norm": 1.7339609861373901, "learning_rate": 0.00013411032028469753, "loss": 2.1592, "step": 3713 }, { "epoch": 1.6506666666666665, "grad_norm": 1.8747652769088745, "learning_rate": 0.00013409252669039148, "loss": 2.8591, "step": 3714 }, { "epoch": 1.6511111111111112, "grad_norm": 1.7554471492767334, "learning_rate": 0.00013407473309608541, "loss": 2.3202, "step": 3715 }, { "epoch": 1.6515555555555554, "grad_norm": 1.5907261371612549, "learning_rate": 0.00013405693950177934, "loss": 1.7988, "step": 3716 }, { "epoch": 1.6520000000000001, "grad_norm": 1.1176869869232178, "learning_rate": 0.0001340391459074733, "loss": 0.8156, "step": 3717 }, { "epoch": 1.6524444444444444, "grad_norm": 1.6439223289489746, "learning_rate": 0.00013402135231316726, "loss": 2.4484, "step": 3718 }, { "epoch": 1.6528888888888889, "grad_norm": 1.4052650928497314, "learning_rate": 0.0001340035587188612, "loss": 1.775, "step": 3719 }, { "epoch": 1.6533333333333333, "grad_norm": 1.7691340446472168, "learning_rate": 0.00013398576512455517, "loss": 2.0681, "step": 3720 }, { "epoch": 1.6537777777777778, "grad_norm": 1.88296377658844, "learning_rate": 0.00013396797153024913, "loss": 2.1395, "step": 3721 }, { "epoch": 1.6542222222222223, "grad_norm": 1.7832571268081665, "learning_rate": 0.00013395017793594306, "loss": 1.7583, "step": 3722 }, { "epoch": 1.6546666666666665, "grad_norm": 1.8587702512741089, "learning_rate": 0.000133932384341637, "loss": 1.9886, "step": 3723 }, { "epoch": 1.6551111111111112, "grad_norm": 1.7117775678634644, "learning_rate": 0.00013391459074733097, "loss": 2.3072, "step": 3724 }, { "epoch": 1.6555555555555554, "grad_norm": 1.7575953006744385, "learning_rate": 0.00013389679715302493, "loss": 2.2971, "step": 3725 }, { "epoch": 1.6560000000000001, "grad_norm": 1.789642333984375, "learning_rate": 0.00013387900355871888, "loss": 1.8977, "step": 3726 }, { "epoch": 1.6564444444444444, "grad_norm": 1.8398528099060059, "learning_rate": 0.00013386120996441284, "loss": 2.1343, "step": 3727 }, { "epoch": 1.6568888888888889, "grad_norm": 1.2515662908554077, "learning_rate": 0.00013384341637010677, "loss": 0.9636, "step": 3728 }, { "epoch": 1.6573333333333333, "grad_norm": 2.197495937347412, "learning_rate": 0.0001338256227758007, "loss": 2.1068, "step": 3729 }, { "epoch": 1.6577777777777778, "grad_norm": 1.9065452814102173, "learning_rate": 0.00013380782918149465, "loss": 2.1467, "step": 3730 }, { "epoch": 1.6582222222222223, "grad_norm": 2.0440008640289307, "learning_rate": 0.0001337900355871886, "loss": 2.2524, "step": 3731 }, { "epoch": 1.6586666666666665, "grad_norm": 2.092015266418457, "learning_rate": 0.00013377224199288257, "loss": 1.9448, "step": 3732 }, { "epoch": 1.6591111111111112, "grad_norm": 2.170003652572632, "learning_rate": 0.00013375444839857652, "loss": 2.7868, "step": 3733 }, { "epoch": 1.6595555555555555, "grad_norm": 2.460923910140991, "learning_rate": 0.00013373665480427048, "loss": 2.7106, "step": 3734 }, { "epoch": 1.6600000000000001, "grad_norm": 2.117558717727661, "learning_rate": 0.0001337188612099644, "loss": 1.8442, "step": 3735 }, { "epoch": 1.6604444444444444, "grad_norm": 1.9140143394470215, "learning_rate": 0.00013370106761565837, "loss": 1.7407, "step": 3736 }, { "epoch": 1.6608888888888889, "grad_norm": 2.178762912750244, "learning_rate": 0.00013368327402135232, "loss": 2.0877, "step": 3737 }, { "epoch": 1.6613333333333333, "grad_norm": 1.4736453294754028, "learning_rate": 0.00013366548042704628, "loss": 1.2792, "step": 3738 }, { "epoch": 1.6617777777777778, "grad_norm": 2.096111536026001, "learning_rate": 0.00013364768683274023, "loss": 2.0112, "step": 3739 }, { "epoch": 1.6622222222222223, "grad_norm": 2.047461986541748, "learning_rate": 0.0001336298932384342, "loss": 2.578, "step": 3740 }, { "epoch": 1.6626666666666665, "grad_norm": 2.3218650817871094, "learning_rate": 0.00013361209964412812, "loss": 2.2947, "step": 3741 }, { "epoch": 1.6631111111111112, "grad_norm": 2.492847204208374, "learning_rate": 0.00013359430604982205, "loss": 2.4653, "step": 3742 }, { "epoch": 1.6635555555555555, "grad_norm": 2.1130130290985107, "learning_rate": 0.000133576512455516, "loss": 1.9604, "step": 3743 }, { "epoch": 1.6640000000000001, "grad_norm": 2.3125598430633545, "learning_rate": 0.00013355871886120996, "loss": 2.0361, "step": 3744 }, { "epoch": 1.6644444444444444, "grad_norm": 2.435710906982422, "learning_rate": 0.00013354092526690392, "loss": 2.6202, "step": 3745 }, { "epoch": 1.6648888888888889, "grad_norm": 2.7004244327545166, "learning_rate": 0.00013352313167259788, "loss": 2.2756, "step": 3746 }, { "epoch": 1.6653333333333333, "grad_norm": 1.9285310506820679, "learning_rate": 0.00013350533807829183, "loss": 1.8988, "step": 3747 }, { "epoch": 1.6657777777777778, "grad_norm": 2.2662885189056396, "learning_rate": 0.00013348754448398576, "loss": 2.2077, "step": 3748 }, { "epoch": 1.6662222222222223, "grad_norm": 2.3850321769714355, "learning_rate": 0.00013346975088967972, "loss": 2.4601, "step": 3749 }, { "epoch": 1.6666666666666665, "grad_norm": 2.6833248138427734, "learning_rate": 0.00013345195729537367, "loss": 1.1392, "step": 3750 }, { "epoch": 1.6671111111111112, "grad_norm": 1.3608639240264893, "learning_rate": 0.00013343416370106763, "loss": 2.1485, "step": 3751 }, { "epoch": 1.6675555555555555, "grad_norm": 1.7300606966018677, "learning_rate": 0.0001334163701067616, "loss": 2.5823, "step": 3752 }, { "epoch": 1.6680000000000001, "grad_norm": 1.7243146896362305, "learning_rate": 0.00013339857651245554, "loss": 2.4296, "step": 3753 }, { "epoch": 1.6684444444444444, "grad_norm": 1.6703439950942993, "learning_rate": 0.00013338078291814947, "loss": 2.1775, "step": 3754 }, { "epoch": 1.6688888888888889, "grad_norm": 1.81732976436615, "learning_rate": 0.0001333629893238434, "loss": 2.548, "step": 3755 }, { "epoch": 1.6693333333333333, "grad_norm": 1.9755940437316895, "learning_rate": 0.00013334519572953736, "loss": 2.2344, "step": 3756 }, { "epoch": 1.6697777777777778, "grad_norm": 1.8577018976211548, "learning_rate": 0.00013332740213523132, "loss": 2.247, "step": 3757 }, { "epoch": 1.6702222222222223, "grad_norm": 1.7116265296936035, "learning_rate": 0.00013330960854092527, "loss": 2.1821, "step": 3758 }, { "epoch": 1.6706666666666665, "grad_norm": 1.8977397680282593, "learning_rate": 0.00013329181494661923, "loss": 2.1784, "step": 3759 }, { "epoch": 1.6711111111111112, "grad_norm": 2.1667838096618652, "learning_rate": 0.00013327402135231316, "loss": 2.4484, "step": 3760 }, { "epoch": 1.6715555555555555, "grad_norm": 1.9198535680770874, "learning_rate": 0.00013325622775800712, "loss": 2.5515, "step": 3761 }, { "epoch": 1.6720000000000002, "grad_norm": 1.7330615520477295, "learning_rate": 0.00013323843416370107, "loss": 2.4671, "step": 3762 }, { "epoch": 1.6724444444444444, "grad_norm": 1.9315346479415894, "learning_rate": 0.00013322064056939503, "loss": 1.9458, "step": 3763 }, { "epoch": 1.6728888888888889, "grad_norm": 2.0418102741241455, "learning_rate": 0.00013320284697508898, "loss": 2.3197, "step": 3764 }, { "epoch": 1.6733333333333333, "grad_norm": 1.8241764307022095, "learning_rate": 0.00013318505338078294, "loss": 2.5491, "step": 3765 }, { "epoch": 1.6737777777777778, "grad_norm": 1.6973588466644287, "learning_rate": 0.00013316725978647687, "loss": 2.1914, "step": 3766 }, { "epoch": 1.6742222222222223, "grad_norm": 2.0757534503936768, "learning_rate": 0.00013314946619217083, "loss": 2.9653, "step": 3767 }, { "epoch": 1.6746666666666665, "grad_norm": 1.850299596786499, "learning_rate": 0.00013313167259786476, "loss": 2.241, "step": 3768 }, { "epoch": 1.6751111111111112, "grad_norm": 1.8038710355758667, "learning_rate": 0.0001331138790035587, "loss": 2.2839, "step": 3769 }, { "epoch": 1.6755555555555555, "grad_norm": 1.7989375591278076, "learning_rate": 0.00013309608540925267, "loss": 2.1376, "step": 3770 }, { "epoch": 1.6760000000000002, "grad_norm": 1.8544578552246094, "learning_rate": 0.00013307829181494663, "loss": 2.1846, "step": 3771 }, { "epoch": 1.6764444444444444, "grad_norm": 1.7627222537994385, "learning_rate": 0.00013306049822064058, "loss": 1.9189, "step": 3772 }, { "epoch": 1.6768888888888889, "grad_norm": 1.8061233758926392, "learning_rate": 0.0001330427046263345, "loss": 1.9897, "step": 3773 }, { "epoch": 1.6773333333333333, "grad_norm": 1.7060072422027588, "learning_rate": 0.00013302491103202847, "loss": 2.0127, "step": 3774 }, { "epoch": 1.6777777777777778, "grad_norm": 2.054374933242798, "learning_rate": 0.00013300711743772242, "loss": 2.2547, "step": 3775 }, { "epoch": 1.6782222222222223, "grad_norm": 1.944670557975769, "learning_rate": 0.00013298932384341638, "loss": 2.1802, "step": 3776 }, { "epoch": 1.6786666666666665, "grad_norm": 2.326748847961426, "learning_rate": 0.00013297153024911034, "loss": 2.3484, "step": 3777 }, { "epoch": 1.6791111111111112, "grad_norm": 1.707459807395935, "learning_rate": 0.0001329537366548043, "loss": 1.8628, "step": 3778 }, { "epoch": 1.6795555555555555, "grad_norm": 2.1038479804992676, "learning_rate": 0.00013293594306049822, "loss": 1.6272, "step": 3779 }, { "epoch": 1.6800000000000002, "grad_norm": 2.0666604042053223, "learning_rate": 0.00013291814946619218, "loss": 2.3157, "step": 3780 }, { "epoch": 1.6804444444444444, "grad_norm": 1.8454957008361816, "learning_rate": 0.0001329003558718861, "loss": 2.0967, "step": 3781 }, { "epoch": 1.6808888888888889, "grad_norm": 1.9296761751174927, "learning_rate": 0.00013288256227758007, "loss": 2.1812, "step": 3782 }, { "epoch": 1.6813333333333333, "grad_norm": 1.9812657833099365, "learning_rate": 0.00013286476868327402, "loss": 2.0514, "step": 3783 }, { "epoch": 1.6817777777777778, "grad_norm": 1.7936532497406006, "learning_rate": 0.00013284697508896798, "loss": 1.9351, "step": 3784 }, { "epoch": 1.6822222222222223, "grad_norm": 2.2539501190185547, "learning_rate": 0.00013282918149466194, "loss": 2.297, "step": 3785 }, { "epoch": 1.6826666666666665, "grad_norm": 2.0437049865722656, "learning_rate": 0.00013281138790035586, "loss": 1.9794, "step": 3786 }, { "epoch": 1.6831111111111112, "grad_norm": 2.1592202186584473, "learning_rate": 0.00013279359430604982, "loss": 2.2224, "step": 3787 }, { "epoch": 1.6835555555555555, "grad_norm": 2.0359416007995605, "learning_rate": 0.00013277580071174378, "loss": 2.0624, "step": 3788 }, { "epoch": 1.6840000000000002, "grad_norm": 2.133168935775757, "learning_rate": 0.00013275800711743773, "loss": 2.0215, "step": 3789 }, { "epoch": 1.6844444444444444, "grad_norm": 2.4152679443359375, "learning_rate": 0.0001327402135231317, "loss": 2.2511, "step": 3790 }, { "epoch": 1.6848888888888889, "grad_norm": 2.4660964012145996, "learning_rate": 0.00013272241992882565, "loss": 2.6581, "step": 3791 }, { "epoch": 1.6853333333333333, "grad_norm": 2.309948205947876, "learning_rate": 0.00013270462633451958, "loss": 2.3009, "step": 3792 }, { "epoch": 1.6857777777777778, "grad_norm": 2.3204185962677, "learning_rate": 0.00013268683274021353, "loss": 2.0474, "step": 3793 }, { "epoch": 1.6862222222222223, "grad_norm": 2.021592855453491, "learning_rate": 0.00013266903914590746, "loss": 1.6188, "step": 3794 }, { "epoch": 1.6866666666666665, "grad_norm": 2.1225037574768066, "learning_rate": 0.00013265124555160142, "loss": 1.7655, "step": 3795 }, { "epoch": 1.6871111111111112, "grad_norm": 2.4528112411499023, "learning_rate": 0.00013263345195729538, "loss": 2.146, "step": 3796 }, { "epoch": 1.6875555555555555, "grad_norm": 2.4885880947113037, "learning_rate": 0.00013261565836298933, "loss": 2.6485, "step": 3797 }, { "epoch": 1.688, "grad_norm": 2.7207860946655273, "learning_rate": 0.0001325978647686833, "loss": 2.7009, "step": 3798 }, { "epoch": 1.6884444444444444, "grad_norm": 3.098876476287842, "learning_rate": 0.00013258007117437722, "loss": 1.9231, "step": 3799 }, { "epoch": 1.6888888888888889, "grad_norm": 3.4644060134887695, "learning_rate": 0.00013256227758007117, "loss": 2.3836, "step": 3800 }, { "epoch": 1.6893333333333334, "grad_norm": 1.1060537099838257, "learning_rate": 0.00013254448398576513, "loss": 1.2927, "step": 3801 }, { "epoch": 1.6897777777777778, "grad_norm": 1.0716097354888916, "learning_rate": 0.0001325266903914591, "loss": 1.1274, "step": 3802 }, { "epoch": 1.6902222222222223, "grad_norm": 1.659468412399292, "learning_rate": 0.00013250889679715304, "loss": 2.5655, "step": 3803 }, { "epoch": 1.6906666666666665, "grad_norm": 1.416318416595459, "learning_rate": 0.000132491103202847, "loss": 2.0306, "step": 3804 }, { "epoch": 1.6911111111111112, "grad_norm": 2.2710070610046387, "learning_rate": 0.00013247330960854093, "loss": 2.178, "step": 3805 }, { "epoch": 1.6915555555555555, "grad_norm": 1.5939732789993286, "learning_rate": 0.00013245551601423489, "loss": 2.3397, "step": 3806 }, { "epoch": 1.692, "grad_norm": 1.590168833732605, "learning_rate": 0.00013243772241992882, "loss": 2.2752, "step": 3807 }, { "epoch": 1.6924444444444444, "grad_norm": 1.1703904867172241, "learning_rate": 0.00013241992882562277, "loss": 1.0365, "step": 3808 }, { "epoch": 1.6928888888888889, "grad_norm": 1.6481446027755737, "learning_rate": 0.00013240213523131673, "loss": 2.2892, "step": 3809 }, { "epoch": 1.6933333333333334, "grad_norm": 1.8637354373931885, "learning_rate": 0.00013238434163701069, "loss": 2.4122, "step": 3810 }, { "epoch": 1.6937777777777778, "grad_norm": 1.6943954229354858, "learning_rate": 0.00013236654804270464, "loss": 2.4287, "step": 3811 }, { "epoch": 1.6942222222222223, "grad_norm": 1.9139350652694702, "learning_rate": 0.00013234875444839857, "loss": 2.038, "step": 3812 }, { "epoch": 1.6946666666666665, "grad_norm": 1.6153117418289185, "learning_rate": 0.00013233096085409253, "loss": 1.8512, "step": 3813 }, { "epoch": 1.6951111111111112, "grad_norm": 1.473202109336853, "learning_rate": 0.00013231316725978648, "loss": 2.0833, "step": 3814 }, { "epoch": 1.6955555555555555, "grad_norm": 1.6183151006698608, "learning_rate": 0.00013229537366548044, "loss": 1.797, "step": 3815 }, { "epoch": 1.696, "grad_norm": 1.648177981376648, "learning_rate": 0.0001322775800711744, "loss": 1.9791, "step": 3816 }, { "epoch": 1.6964444444444444, "grad_norm": 1.758804202079773, "learning_rate": 0.00013225978647686835, "loss": 2.5341, "step": 3817 }, { "epoch": 1.696888888888889, "grad_norm": 1.5626602172851562, "learning_rate": 0.00013224199288256228, "loss": 1.5439, "step": 3818 }, { "epoch": 1.6973333333333334, "grad_norm": 1.9132189750671387, "learning_rate": 0.0001322241992882562, "loss": 2.4955, "step": 3819 }, { "epoch": 1.6977777777777778, "grad_norm": 1.975059986114502, "learning_rate": 0.00013220640569395017, "loss": 2.4941, "step": 3820 }, { "epoch": 1.6982222222222223, "grad_norm": 1.8108739852905273, "learning_rate": 0.00013218861209964413, "loss": 1.7566, "step": 3821 }, { "epoch": 1.6986666666666665, "grad_norm": 1.5749622583389282, "learning_rate": 0.00013217081850533808, "loss": 2.0191, "step": 3822 }, { "epoch": 1.6991111111111112, "grad_norm": 2.2732627391815186, "learning_rate": 0.00013215302491103204, "loss": 1.8025, "step": 3823 }, { "epoch": 1.6995555555555555, "grad_norm": 1.6251347064971924, "learning_rate": 0.000132135231316726, "loss": 1.8303, "step": 3824 }, { "epoch": 1.7, "grad_norm": 1.9713494777679443, "learning_rate": 0.00013211743772241992, "loss": 2.1631, "step": 3825 }, { "epoch": 1.7004444444444444, "grad_norm": 1.9133716821670532, "learning_rate": 0.00013209964412811388, "loss": 2.3184, "step": 3826 }, { "epoch": 1.700888888888889, "grad_norm": 2.2946348190307617, "learning_rate": 0.00013208185053380784, "loss": 2.5479, "step": 3827 }, { "epoch": 1.7013333333333334, "grad_norm": 1.764487862586975, "learning_rate": 0.0001320640569395018, "loss": 1.7169, "step": 3828 }, { "epoch": 1.7017777777777776, "grad_norm": 1.9306504726409912, "learning_rate": 0.00013204626334519575, "loss": 1.8107, "step": 3829 }, { "epoch": 1.7022222222222223, "grad_norm": 1.9167112112045288, "learning_rate": 0.0001320284697508897, "loss": 1.7252, "step": 3830 }, { "epoch": 1.7026666666666666, "grad_norm": 2.143211603164673, "learning_rate": 0.00013201067615658364, "loss": 2.3653, "step": 3831 }, { "epoch": 1.7031111111111112, "grad_norm": 2.1837661266326904, "learning_rate": 0.00013199288256227757, "loss": 2.3616, "step": 3832 }, { "epoch": 1.7035555555555555, "grad_norm": 2.0470423698425293, "learning_rate": 0.00013197508896797152, "loss": 1.9548, "step": 3833 }, { "epoch": 1.704, "grad_norm": 2.4029338359832764, "learning_rate": 0.00013195729537366548, "loss": 2.0182, "step": 3834 }, { "epoch": 1.7044444444444444, "grad_norm": 2.1079888343811035, "learning_rate": 0.00013193950177935944, "loss": 2.0754, "step": 3835 }, { "epoch": 1.704888888888889, "grad_norm": 2.2155072689056396, "learning_rate": 0.0001319217081850534, "loss": 2.2782, "step": 3836 }, { "epoch": 1.7053333333333334, "grad_norm": 2.1504364013671875, "learning_rate": 0.00013190391459074735, "loss": 2.1015, "step": 3837 }, { "epoch": 1.7057777777777776, "grad_norm": 2.214022159576416, "learning_rate": 0.00013188612099644128, "loss": 2.2518, "step": 3838 }, { "epoch": 1.7062222222222223, "grad_norm": 2.3961539268493652, "learning_rate": 0.00013186832740213523, "loss": 1.7802, "step": 3839 }, { "epoch": 1.7066666666666666, "grad_norm": 2.034930467605591, "learning_rate": 0.0001318505338078292, "loss": 2.2164, "step": 3840 }, { "epoch": 1.7071111111111112, "grad_norm": 2.0269289016723633, "learning_rate": 0.00013183274021352315, "loss": 1.7755, "step": 3841 }, { "epoch": 1.7075555555555555, "grad_norm": 2.0814435482025146, "learning_rate": 0.0001318149466192171, "loss": 1.9628, "step": 3842 }, { "epoch": 1.708, "grad_norm": 2.4242076873779297, "learning_rate": 0.00013179715302491106, "loss": 2.4269, "step": 3843 }, { "epoch": 1.7084444444444444, "grad_norm": 2.6244444847106934, "learning_rate": 0.000131779359430605, "loss": 2.279, "step": 3844 }, { "epoch": 1.708888888888889, "grad_norm": 2.184532403945923, "learning_rate": 0.00013176156583629892, "loss": 2.0163, "step": 3845 }, { "epoch": 1.7093333333333334, "grad_norm": 2.2187845706939697, "learning_rate": 0.00013174377224199288, "loss": 2.226, "step": 3846 }, { "epoch": 1.7097777777777776, "grad_norm": 2.7310564517974854, "learning_rate": 0.00013172597864768683, "loss": 2.1964, "step": 3847 }, { "epoch": 1.7102222222222223, "grad_norm": 2.8237147331237793, "learning_rate": 0.0001317081850533808, "loss": 2.5273, "step": 3848 }, { "epoch": 1.7106666666666666, "grad_norm": 1.9283852577209473, "learning_rate": 0.00013169039145907474, "loss": 1.3825, "step": 3849 }, { "epoch": 1.7111111111111112, "grad_norm": 2.044787883758545, "learning_rate": 0.00013167259786476867, "loss": 0.2311, "step": 3850 }, { "epoch": 1.7115555555555555, "grad_norm": 1.3595030307769775, "learning_rate": 0.00013165480427046263, "loss": 1.8332, "step": 3851 }, { "epoch": 1.712, "grad_norm": 1.7082335948944092, "learning_rate": 0.0001316370106761566, "loss": 1.3219, "step": 3852 }, { "epoch": 1.7124444444444444, "grad_norm": 1.4628937244415283, "learning_rate": 0.00013161921708185054, "loss": 2.1888, "step": 3853 }, { "epoch": 1.712888888888889, "grad_norm": 1.4949939250946045, "learning_rate": 0.0001316014234875445, "loss": 2.2021, "step": 3854 }, { "epoch": 1.7133333333333334, "grad_norm": 1.4384099245071411, "learning_rate": 0.00013158362989323846, "loss": 1.9055, "step": 3855 }, { "epoch": 1.7137777777777776, "grad_norm": 1.6663846969604492, "learning_rate": 0.00013156583629893239, "loss": 2.4469, "step": 3856 }, { "epoch": 1.7142222222222223, "grad_norm": 1.5963435173034668, "learning_rate": 0.00013154804270462634, "loss": 2.1661, "step": 3857 }, { "epoch": 1.7146666666666666, "grad_norm": 1.9488998651504517, "learning_rate": 0.00013153024911032027, "loss": 2.2052, "step": 3858 }, { "epoch": 1.7151111111111113, "grad_norm": 1.6803126335144043, "learning_rate": 0.00013151245551601423, "loss": 2.437, "step": 3859 }, { "epoch": 1.7155555555555555, "grad_norm": 1.6857129335403442, "learning_rate": 0.00013149466192170818, "loss": 1.8197, "step": 3860 }, { "epoch": 1.716, "grad_norm": 1.7986105680465698, "learning_rate": 0.00013147686832740214, "loss": 2.2436, "step": 3861 }, { "epoch": 1.7164444444444444, "grad_norm": 1.6643034219741821, "learning_rate": 0.0001314590747330961, "loss": 1.9625, "step": 3862 }, { "epoch": 1.716888888888889, "grad_norm": 1.7694244384765625, "learning_rate": 0.00013144128113879003, "loss": 2.2189, "step": 3863 }, { "epoch": 1.7173333333333334, "grad_norm": 1.951474905014038, "learning_rate": 0.00013142348754448398, "loss": 2.4388, "step": 3864 }, { "epoch": 1.7177777777777776, "grad_norm": 1.966758370399475, "learning_rate": 0.00013140569395017794, "loss": 2.5255, "step": 3865 }, { "epoch": 1.7182222222222223, "grad_norm": 1.5251381397247314, "learning_rate": 0.0001313879003558719, "loss": 1.4708, "step": 3866 }, { "epoch": 1.7186666666666666, "grad_norm": 1.851199984550476, "learning_rate": 0.00013137010676156585, "loss": 2.4586, "step": 3867 }, { "epoch": 1.7191111111111113, "grad_norm": 1.7658207416534424, "learning_rate": 0.0001313523131672598, "loss": 2.5013, "step": 3868 }, { "epoch": 1.7195555555555555, "grad_norm": 2.4436843395233154, "learning_rate": 0.00013133451957295374, "loss": 2.5335, "step": 3869 }, { "epoch": 1.72, "grad_norm": 1.8331066370010376, "learning_rate": 0.0001313167259786477, "loss": 2.3018, "step": 3870 }, { "epoch": 1.7204444444444444, "grad_norm": 1.725740671157837, "learning_rate": 0.00013129893238434163, "loss": 2.4409, "step": 3871 }, { "epoch": 1.720888888888889, "grad_norm": 2.027172088623047, "learning_rate": 0.00013128113879003558, "loss": 2.1246, "step": 3872 }, { "epoch": 1.7213333333333334, "grad_norm": 1.6568158864974976, "learning_rate": 0.00013126334519572954, "loss": 1.8242, "step": 3873 }, { "epoch": 1.7217777777777776, "grad_norm": 1.8668732643127441, "learning_rate": 0.0001312455516014235, "loss": 2.0447, "step": 3874 }, { "epoch": 1.7222222222222223, "grad_norm": 1.9975069761276245, "learning_rate": 0.00013122775800711745, "loss": 2.1747, "step": 3875 }, { "epoch": 1.7226666666666666, "grad_norm": 1.9104905128479004, "learning_rate": 0.00013120996441281138, "loss": 2.248, "step": 3876 }, { "epoch": 1.7231111111111113, "grad_norm": 2.3680083751678467, "learning_rate": 0.00013119217081850534, "loss": 2.4361, "step": 3877 }, { "epoch": 1.7235555555555555, "grad_norm": 2.0153682231903076, "learning_rate": 0.0001311743772241993, "loss": 1.9933, "step": 3878 }, { "epoch": 1.724, "grad_norm": 2.112910032272339, "learning_rate": 0.00013115658362989325, "loss": 2.4397, "step": 3879 }, { "epoch": 1.7244444444444444, "grad_norm": 1.9228010177612305, "learning_rate": 0.0001311387900355872, "loss": 2.7384, "step": 3880 }, { "epoch": 1.724888888888889, "grad_norm": 2.054799795150757, "learning_rate": 0.00013112099644128116, "loss": 2.09, "step": 3881 }, { "epoch": 1.7253333333333334, "grad_norm": 2.12286376953125, "learning_rate": 0.0001311032028469751, "loss": 2.0431, "step": 3882 }, { "epoch": 1.7257777777777776, "grad_norm": 2.27078914642334, "learning_rate": 0.00013108540925266905, "loss": 2.5563, "step": 3883 }, { "epoch": 1.7262222222222223, "grad_norm": 1.996488332748413, "learning_rate": 0.00013106761565836298, "loss": 2.4713, "step": 3884 }, { "epoch": 1.7266666666666666, "grad_norm": 2.3658390045166016, "learning_rate": 0.00013104982206405693, "loss": 2.1061, "step": 3885 }, { "epoch": 1.7271111111111113, "grad_norm": 1.9138381481170654, "learning_rate": 0.0001310320284697509, "loss": 2.6341, "step": 3886 }, { "epoch": 1.7275555555555555, "grad_norm": 1.9364850521087646, "learning_rate": 0.00013101423487544485, "loss": 2.4424, "step": 3887 }, { "epoch": 1.728, "grad_norm": 2.730329751968384, "learning_rate": 0.0001309964412811388, "loss": 2.0531, "step": 3888 }, { "epoch": 1.7284444444444444, "grad_norm": 2.1814632415771484, "learning_rate": 0.00013097864768683273, "loss": 2.3215, "step": 3889 }, { "epoch": 1.728888888888889, "grad_norm": 2.137681484222412, "learning_rate": 0.0001309608540925267, "loss": 2.4898, "step": 3890 }, { "epoch": 1.7293333333333334, "grad_norm": 2.490191698074341, "learning_rate": 0.00013094306049822065, "loss": 2.15, "step": 3891 }, { "epoch": 1.7297777777777776, "grad_norm": 2.2890820503234863, "learning_rate": 0.0001309252669039146, "loss": 1.6876, "step": 3892 }, { "epoch": 1.7302222222222223, "grad_norm": 1.9590189456939697, "learning_rate": 0.00013090747330960856, "loss": 1.9625, "step": 3893 }, { "epoch": 1.7306666666666666, "grad_norm": 2.1057655811309814, "learning_rate": 0.00013088967971530252, "loss": 2.198, "step": 3894 }, { "epoch": 1.7311111111111113, "grad_norm": 2.1267521381378174, "learning_rate": 0.00013087188612099645, "loss": 2.2824, "step": 3895 }, { "epoch": 1.7315555555555555, "grad_norm": 1.9178341627120972, "learning_rate": 0.0001308540925266904, "loss": 1.9127, "step": 3896 }, { "epoch": 1.732, "grad_norm": 2.151017189025879, "learning_rate": 0.00013083629893238433, "loss": 2.0591, "step": 3897 }, { "epoch": 1.7324444444444445, "grad_norm": 2.1666769981384277, "learning_rate": 0.0001308185053380783, "loss": 2.1367, "step": 3898 }, { "epoch": 1.732888888888889, "grad_norm": 2.2003917694091797, "learning_rate": 0.00013080071174377224, "loss": 0.9107, "step": 3899 }, { "epoch": 1.7333333333333334, "grad_norm": 2.9451475143432617, "learning_rate": 0.0001307829181494662, "loss": 2.487, "step": 3900 }, { "epoch": 1.7337777777777776, "grad_norm": 1.6602669954299927, "learning_rate": 0.00013076512455516016, "loss": 2.9994, "step": 3901 }, { "epoch": 1.7342222222222223, "grad_norm": 1.3574813604354858, "learning_rate": 0.0001307473309608541, "loss": 2.2298, "step": 3902 }, { "epoch": 1.7346666666666666, "grad_norm": 1.5164433717727661, "learning_rate": 0.00013072953736654804, "loss": 2.0965, "step": 3903 }, { "epoch": 1.7351111111111113, "grad_norm": 1.5202592611312866, "learning_rate": 0.000130711743772242, "loss": 2.3827, "step": 3904 }, { "epoch": 1.7355555555555555, "grad_norm": 1.3015217781066895, "learning_rate": 0.00013069395017793596, "loss": 1.6272, "step": 3905 }, { "epoch": 1.736, "grad_norm": 1.5526891946792603, "learning_rate": 0.0001306761565836299, "loss": 2.4231, "step": 3906 }, { "epoch": 1.7364444444444445, "grad_norm": 1.6718441247940063, "learning_rate": 0.00013065836298932387, "loss": 2.8115, "step": 3907 }, { "epoch": 1.736888888888889, "grad_norm": 1.7403420209884644, "learning_rate": 0.0001306405693950178, "loss": 2.7689, "step": 3908 }, { "epoch": 1.7373333333333334, "grad_norm": 1.7474772930145264, "learning_rate": 0.00013062277580071176, "loss": 2.6111, "step": 3909 }, { "epoch": 1.7377777777777776, "grad_norm": 1.6219923496246338, "learning_rate": 0.00013060498220640568, "loss": 2.5719, "step": 3910 }, { "epoch": 1.7382222222222223, "grad_norm": 1.893513798713684, "learning_rate": 0.00013058718861209964, "loss": 2.7495, "step": 3911 }, { "epoch": 1.7386666666666666, "grad_norm": 1.917389154434204, "learning_rate": 0.0001305693950177936, "loss": 2.4973, "step": 3912 }, { "epoch": 1.7391111111111113, "grad_norm": 1.699096918106079, "learning_rate": 0.00013055160142348755, "loss": 1.7678, "step": 3913 }, { "epoch": 1.7395555555555555, "grad_norm": 1.8528611660003662, "learning_rate": 0.0001305338078291815, "loss": 1.8641, "step": 3914 }, { "epoch": 1.74, "grad_norm": 1.741129994392395, "learning_rate": 0.00013051601423487544, "loss": 2.0575, "step": 3915 }, { "epoch": 1.7404444444444445, "grad_norm": 1.7191723585128784, "learning_rate": 0.0001304982206405694, "loss": 2.1448, "step": 3916 }, { "epoch": 1.740888888888889, "grad_norm": 1.8906670808792114, "learning_rate": 0.00013048042704626335, "loss": 2.3253, "step": 3917 }, { "epoch": 1.7413333333333334, "grad_norm": 1.799043893814087, "learning_rate": 0.0001304626334519573, "loss": 2.2234, "step": 3918 }, { "epoch": 1.7417777777777776, "grad_norm": 1.6781656742095947, "learning_rate": 0.00013044483985765127, "loss": 1.9896, "step": 3919 }, { "epoch": 1.7422222222222223, "grad_norm": 1.7165361642837524, "learning_rate": 0.00013042704626334522, "loss": 2.0458, "step": 3920 }, { "epoch": 1.7426666666666666, "grad_norm": 1.9061387777328491, "learning_rate": 0.00013040925266903915, "loss": 2.4378, "step": 3921 }, { "epoch": 1.743111111111111, "grad_norm": 2.1114823818206787, "learning_rate": 0.0001303914590747331, "loss": 2.4924, "step": 3922 }, { "epoch": 1.7435555555555555, "grad_norm": 1.9921493530273438, "learning_rate": 0.00013037366548042704, "loss": 2.5657, "step": 3923 }, { "epoch": 1.744, "grad_norm": 1.8910974264144897, "learning_rate": 0.000130355871886121, "loss": 2.0381, "step": 3924 }, { "epoch": 1.7444444444444445, "grad_norm": 2.049863815307617, "learning_rate": 0.00013033807829181495, "loss": 1.9867, "step": 3925 }, { "epoch": 1.744888888888889, "grad_norm": 1.9373703002929688, "learning_rate": 0.0001303202846975089, "loss": 2.3285, "step": 3926 }, { "epoch": 1.7453333333333334, "grad_norm": 1.5811951160430908, "learning_rate": 0.00013030249110320286, "loss": 1.1489, "step": 3927 }, { "epoch": 1.7457777777777777, "grad_norm": 2.237574577331543, "learning_rate": 0.0001302846975088968, "loss": 2.8071, "step": 3928 }, { "epoch": 1.7462222222222223, "grad_norm": 1.7456693649291992, "learning_rate": 0.00013026690391459075, "loss": 1.5628, "step": 3929 }, { "epoch": 1.7466666666666666, "grad_norm": 1.9937740564346313, "learning_rate": 0.0001302491103202847, "loss": 2.2982, "step": 3930 }, { "epoch": 1.747111111111111, "grad_norm": 1.7053123712539673, "learning_rate": 0.00013023131672597866, "loss": 1.0467, "step": 3931 }, { "epoch": 1.7475555555555555, "grad_norm": 2.3195912837982178, "learning_rate": 0.00013021352313167262, "loss": 2.5614, "step": 3932 }, { "epoch": 1.748, "grad_norm": 2.1397838592529297, "learning_rate": 0.00013019572953736655, "loss": 1.9569, "step": 3933 }, { "epoch": 1.7484444444444445, "grad_norm": 1.823757290840149, "learning_rate": 0.0001301779359430605, "loss": 1.8498, "step": 3934 }, { "epoch": 1.748888888888889, "grad_norm": 1.9765830039978027, "learning_rate": 0.00013016014234875443, "loss": 2.2709, "step": 3935 }, { "epoch": 1.7493333333333334, "grad_norm": 2.0341432094573975, "learning_rate": 0.0001301423487544484, "loss": 1.8853, "step": 3936 }, { "epoch": 1.7497777777777777, "grad_norm": 1.8640036582946777, "learning_rate": 0.00013012455516014235, "loss": 1.9018, "step": 3937 }, { "epoch": 1.7502222222222223, "grad_norm": 1.3533066511154175, "learning_rate": 0.0001301067615658363, "loss": 0.9755, "step": 3938 }, { "epoch": 1.7506666666666666, "grad_norm": 2.0747861862182617, "learning_rate": 0.00013008896797153026, "loss": 2.1518, "step": 3939 }, { "epoch": 1.751111111111111, "grad_norm": 2.1353771686553955, "learning_rate": 0.0001300711743772242, "loss": 2.3496, "step": 3940 }, { "epoch": 1.7515555555555555, "grad_norm": 2.6912460327148438, "learning_rate": 0.00013005338078291815, "loss": 2.2062, "step": 3941 }, { "epoch": 1.752, "grad_norm": 1.905840277671814, "learning_rate": 0.0001300355871886121, "loss": 2.0854, "step": 3942 }, { "epoch": 1.7524444444444445, "grad_norm": 2.2967617511749268, "learning_rate": 0.00013001779359430606, "loss": 1.8793, "step": 3943 }, { "epoch": 1.752888888888889, "grad_norm": 2.267254590988159, "learning_rate": 0.00013000000000000002, "loss": 1.8769, "step": 3944 }, { "epoch": 1.7533333333333334, "grad_norm": 2.3356738090515137, "learning_rate": 0.00012998220640569397, "loss": 1.7641, "step": 3945 }, { "epoch": 1.7537777777777777, "grad_norm": 2.722877264022827, "learning_rate": 0.0001299644128113879, "loss": 2.2982, "step": 3946 }, { "epoch": 1.7542222222222223, "grad_norm": 2.4450271129608154, "learning_rate": 0.00012994661921708186, "loss": 2.1468, "step": 3947 }, { "epoch": 1.7546666666666666, "grad_norm": 2.8750321865081787, "learning_rate": 0.0001299288256227758, "loss": 2.4269, "step": 3948 }, { "epoch": 1.755111111111111, "grad_norm": 2.6799416542053223, "learning_rate": 0.00012991103202846974, "loss": 1.6945, "step": 3949 }, { "epoch": 1.7555555555555555, "grad_norm": 3.447089433670044, "learning_rate": 0.0001298932384341637, "loss": 2.7562, "step": 3950 }, { "epoch": 1.756, "grad_norm": 1.3063669204711914, "learning_rate": 0.00012987544483985766, "loss": 2.1046, "step": 3951 }, { "epoch": 1.7564444444444445, "grad_norm": 1.4739446640014648, "learning_rate": 0.0001298576512455516, "loss": 2.3863, "step": 3952 }, { "epoch": 1.756888888888889, "grad_norm": 1.543544888496399, "learning_rate": 0.00012983985765124554, "loss": 2.3732, "step": 3953 }, { "epoch": 1.7573333333333334, "grad_norm": 1.6399084329605103, "learning_rate": 0.0001298220640569395, "loss": 2.5028, "step": 3954 }, { "epoch": 1.7577777777777777, "grad_norm": 1.5914816856384277, "learning_rate": 0.00012980427046263346, "loss": 2.2064, "step": 3955 }, { "epoch": 1.7582222222222224, "grad_norm": 1.6115366220474243, "learning_rate": 0.0001297864768683274, "loss": 2.2451, "step": 3956 }, { "epoch": 1.7586666666666666, "grad_norm": 1.5162943601608276, "learning_rate": 0.00012976868327402137, "loss": 2.1785, "step": 3957 }, { "epoch": 1.759111111111111, "grad_norm": 1.6043645143508911, "learning_rate": 0.00012975088967971533, "loss": 2.2091, "step": 3958 }, { "epoch": 1.7595555555555555, "grad_norm": 1.4509108066558838, "learning_rate": 0.00012973309608540925, "loss": 2.3536, "step": 3959 }, { "epoch": 1.76, "grad_norm": 1.6944358348846436, "learning_rate": 0.0001297153024911032, "loss": 2.5589, "step": 3960 }, { "epoch": 1.7604444444444445, "grad_norm": 1.5533844232559204, "learning_rate": 0.00012969750889679714, "loss": 2.4115, "step": 3961 }, { "epoch": 1.7608888888888887, "grad_norm": 1.4054555892944336, "learning_rate": 0.0001296797153024911, "loss": 2.1334, "step": 3962 }, { "epoch": 1.7613333333333334, "grad_norm": 1.6660244464874268, "learning_rate": 0.00012966192170818505, "loss": 2.3195, "step": 3963 }, { "epoch": 1.7617777777777777, "grad_norm": 1.5399365425109863, "learning_rate": 0.000129644128113879, "loss": 2.0814, "step": 3964 }, { "epoch": 1.7622222222222224, "grad_norm": 1.5576121807098389, "learning_rate": 0.00012962633451957297, "loss": 1.8001, "step": 3965 }, { "epoch": 1.7626666666666666, "grad_norm": 1.8270119428634644, "learning_rate": 0.0001296085409252669, "loss": 1.9568, "step": 3966 }, { "epoch": 1.763111111111111, "grad_norm": 1.5569310188293457, "learning_rate": 0.00012959074733096085, "loss": 2.1702, "step": 3967 }, { "epoch": 1.7635555555555555, "grad_norm": 1.7133228778839111, "learning_rate": 0.0001295729537366548, "loss": 2.4522, "step": 3968 }, { "epoch": 1.764, "grad_norm": 1.6561294794082642, "learning_rate": 0.00012955516014234877, "loss": 2.0854, "step": 3969 }, { "epoch": 1.7644444444444445, "grad_norm": 2.1431984901428223, "learning_rate": 0.00012953736654804272, "loss": 1.9774, "step": 3970 }, { "epoch": 1.7648888888888887, "grad_norm": 1.6542658805847168, "learning_rate": 0.00012951957295373668, "loss": 1.9308, "step": 3971 }, { "epoch": 1.7653333333333334, "grad_norm": 1.7912119626998901, "learning_rate": 0.0001295017793594306, "loss": 1.9613, "step": 3972 }, { "epoch": 1.7657777777777777, "grad_norm": 1.2500418424606323, "learning_rate": 0.00012948398576512456, "loss": 0.6403, "step": 3973 }, { "epoch": 1.7662222222222224, "grad_norm": 1.623761534690857, "learning_rate": 0.0001294661921708185, "loss": 2.2582, "step": 3974 }, { "epoch": 1.7666666666666666, "grad_norm": 1.6567007303237915, "learning_rate": 0.00012944839857651245, "loss": 1.8474, "step": 3975 }, { "epoch": 1.767111111111111, "grad_norm": 1.7294385433197021, "learning_rate": 0.0001294306049822064, "loss": 1.9536, "step": 3976 }, { "epoch": 1.7675555555555555, "grad_norm": 1.9673961400985718, "learning_rate": 0.00012941281138790036, "loss": 2.0788, "step": 3977 }, { "epoch": 1.768, "grad_norm": 2.372072696685791, "learning_rate": 0.00012939501779359432, "loss": 2.1401, "step": 3978 }, { "epoch": 1.7684444444444445, "grad_norm": 1.8470436334609985, "learning_rate": 0.00012937722419928825, "loss": 2.2509, "step": 3979 }, { "epoch": 1.7688888888888887, "grad_norm": 1.7825738191604614, "learning_rate": 0.0001293594306049822, "loss": 2.1858, "step": 3980 }, { "epoch": 1.7693333333333334, "grad_norm": 1.4077835083007812, "learning_rate": 0.00012934163701067616, "loss": 1.2364, "step": 3981 }, { "epoch": 1.7697777777777777, "grad_norm": 2.0429136753082275, "learning_rate": 0.00012932384341637012, "loss": 2.2663, "step": 3982 }, { "epoch": 1.7702222222222224, "grad_norm": 1.2380872964859009, "learning_rate": 0.00012930604982206408, "loss": 0.6942, "step": 3983 }, { "epoch": 1.7706666666666666, "grad_norm": 2.0053813457489014, "learning_rate": 0.00012928825622775803, "loss": 2.1342, "step": 3984 }, { "epoch": 1.771111111111111, "grad_norm": 1.8979686498641968, "learning_rate": 0.00012927046263345196, "loss": 2.4219, "step": 3985 }, { "epoch": 1.7715555555555556, "grad_norm": 2.177061080932617, "learning_rate": 0.00012925266903914592, "loss": 2.2856, "step": 3986 }, { "epoch": 1.772, "grad_norm": 2.1877963542938232, "learning_rate": 0.00012923487544483985, "loss": 2.0785, "step": 3987 }, { "epoch": 1.7724444444444445, "grad_norm": 1.8126049041748047, "learning_rate": 0.0001292170818505338, "loss": 1.8393, "step": 3988 }, { "epoch": 1.7728888888888887, "grad_norm": 2.09319806098938, "learning_rate": 0.00012919928825622776, "loss": 2.5437, "step": 3989 }, { "epoch": 1.7733333333333334, "grad_norm": 1.2192652225494385, "learning_rate": 0.00012918149466192172, "loss": 1.0593, "step": 3990 }, { "epoch": 1.7737777777777777, "grad_norm": 2.0721795558929443, "learning_rate": 0.00012916370106761567, "loss": 2.3111, "step": 3991 }, { "epoch": 1.7742222222222224, "grad_norm": 1.9391847848892212, "learning_rate": 0.0001291459074733096, "loss": 1.9213, "step": 3992 }, { "epoch": 1.7746666666666666, "grad_norm": 2.3300163745880127, "learning_rate": 0.00012912811387900356, "loss": 2.4885, "step": 3993 }, { "epoch": 1.775111111111111, "grad_norm": 2.414308786392212, "learning_rate": 0.00012911032028469752, "loss": 2.4184, "step": 3994 }, { "epoch": 1.7755555555555556, "grad_norm": 2.2488865852355957, "learning_rate": 0.00012909252669039147, "loss": 2.0251, "step": 3995 }, { "epoch": 1.776, "grad_norm": 3.426772117614746, "learning_rate": 0.00012907473309608543, "loss": 2.7753, "step": 3996 }, { "epoch": 1.7764444444444445, "grad_norm": 2.1739494800567627, "learning_rate": 0.00012905693950177938, "loss": 2.0393, "step": 3997 }, { "epoch": 1.7768888888888887, "grad_norm": 2.434339761734009, "learning_rate": 0.00012903914590747331, "loss": 2.2456, "step": 3998 }, { "epoch": 1.7773333333333334, "grad_norm": 2.3951499462127686, "learning_rate": 0.00012902135231316727, "loss": 2.292, "step": 3999 }, { "epoch": 1.7777777777777777, "grad_norm": 2.0735788345336914, "learning_rate": 0.0001290035587188612, "loss": 1.3612, "step": 4000 }, { "epoch": 1.7782222222222224, "grad_norm": 0.45426124334335327, "learning_rate": 0.00012898576512455516, "loss": 0.0292, "step": 4001 }, { "epoch": 1.7786666666666666, "grad_norm": 1.561895728111267, "learning_rate": 0.0001289679715302491, "loss": 2.864, "step": 4002 }, { "epoch": 1.779111111111111, "grad_norm": 1.1529381275177002, "learning_rate": 0.00012895017793594307, "loss": 1.1453, "step": 4003 }, { "epoch": 1.7795555555555556, "grad_norm": 1.7909576892852783, "learning_rate": 0.00012893238434163703, "loss": 2.1009, "step": 4004 }, { "epoch": 1.78, "grad_norm": 1.8325178623199463, "learning_rate": 0.00012891459074733096, "loss": 2.4447, "step": 4005 }, { "epoch": 1.7804444444444445, "grad_norm": 1.695643663406372, "learning_rate": 0.0001288967971530249, "loss": 2.5286, "step": 4006 }, { "epoch": 1.7808888888888887, "grad_norm": 1.6463677883148193, "learning_rate": 0.00012887900355871887, "loss": 2.2066, "step": 4007 }, { "epoch": 1.7813333333333334, "grad_norm": 1.7361533641815186, "learning_rate": 0.00012886120996441282, "loss": 2.2354, "step": 4008 }, { "epoch": 1.7817777777777777, "grad_norm": 2.212310791015625, "learning_rate": 0.00012884341637010678, "loss": 2.1203, "step": 4009 }, { "epoch": 1.7822222222222224, "grad_norm": 1.4698344469070435, "learning_rate": 0.00012882562277580074, "loss": 1.8529, "step": 4010 }, { "epoch": 1.7826666666666666, "grad_norm": 1.823663592338562, "learning_rate": 0.00012880782918149467, "loss": 2.2639, "step": 4011 }, { "epoch": 1.783111111111111, "grad_norm": 1.7825374603271484, "learning_rate": 0.00012879003558718862, "loss": 1.9933, "step": 4012 }, { "epoch": 1.7835555555555556, "grad_norm": 1.8066425323486328, "learning_rate": 0.00012877224199288255, "loss": 2.1136, "step": 4013 }, { "epoch": 1.784, "grad_norm": 2.2551443576812744, "learning_rate": 0.0001287544483985765, "loss": 1.9374, "step": 4014 }, { "epoch": 1.7844444444444445, "grad_norm": 1.7912770509719849, "learning_rate": 0.00012873665480427047, "loss": 2.2331, "step": 4015 }, { "epoch": 1.7848888888888887, "grad_norm": 1.5528522729873657, "learning_rate": 0.00012871886120996442, "loss": 1.9012, "step": 4016 }, { "epoch": 1.7853333333333334, "grad_norm": 1.6881825923919678, "learning_rate": 0.00012870106761565838, "loss": 2.061, "step": 4017 }, { "epoch": 1.7857777777777777, "grad_norm": 1.2570315599441528, "learning_rate": 0.0001286832740213523, "loss": 1.2893, "step": 4018 }, { "epoch": 1.7862222222222224, "grad_norm": 1.4894715547561646, "learning_rate": 0.00012866548042704627, "loss": 1.7158, "step": 4019 }, { "epoch": 1.7866666666666666, "grad_norm": 1.969714879989624, "learning_rate": 0.00012864768683274022, "loss": 2.3733, "step": 4020 }, { "epoch": 1.787111111111111, "grad_norm": 1.8792929649353027, "learning_rate": 0.00012862989323843418, "loss": 2.1233, "step": 4021 }, { "epoch": 1.7875555555555556, "grad_norm": 1.8268121480941772, "learning_rate": 0.00012861209964412813, "loss": 2.136, "step": 4022 }, { "epoch": 1.788, "grad_norm": 2.0907411575317383, "learning_rate": 0.00012859430604982206, "loss": 2.3892, "step": 4023 }, { "epoch": 1.7884444444444445, "grad_norm": 1.901108741760254, "learning_rate": 0.00012857651245551602, "loss": 1.938, "step": 4024 }, { "epoch": 1.7888888888888888, "grad_norm": 1.818852186203003, "learning_rate": 0.00012855871886120998, "loss": 2.5912, "step": 4025 }, { "epoch": 1.7893333333333334, "grad_norm": 1.6948503255844116, "learning_rate": 0.0001285409252669039, "loss": 1.8355, "step": 4026 }, { "epoch": 1.7897777777777777, "grad_norm": 2.053222417831421, "learning_rate": 0.00012852313167259786, "loss": 1.7958, "step": 4027 }, { "epoch": 1.7902222222222224, "grad_norm": 1.777777910232544, "learning_rate": 0.00012850533807829182, "loss": 2.276, "step": 4028 }, { "epoch": 1.7906666666666666, "grad_norm": 2.1247165203094482, "learning_rate": 0.00012848754448398578, "loss": 2.275, "step": 4029 }, { "epoch": 1.791111111111111, "grad_norm": 2.1670467853546143, "learning_rate": 0.0001284697508896797, "loss": 2.1813, "step": 4030 }, { "epoch": 1.7915555555555556, "grad_norm": 1.7745897769927979, "learning_rate": 0.00012845195729537366, "loss": 1.7079, "step": 4031 }, { "epoch": 1.792, "grad_norm": 1.98904287815094, "learning_rate": 0.00012843416370106762, "loss": 1.9803, "step": 4032 }, { "epoch": 1.7924444444444445, "grad_norm": 1.8329356908798218, "learning_rate": 0.00012841637010676157, "loss": 2.3103, "step": 4033 }, { "epoch": 1.7928888888888888, "grad_norm": 1.814437985420227, "learning_rate": 0.00012839857651245553, "loss": 2.0819, "step": 4034 }, { "epoch": 1.7933333333333334, "grad_norm": 1.9947712421417236, "learning_rate": 0.0001283807829181495, "loss": 2.1793, "step": 4035 }, { "epoch": 1.7937777777777777, "grad_norm": 1.7744113206863403, "learning_rate": 0.00012836298932384342, "loss": 1.4961, "step": 4036 }, { "epoch": 1.7942222222222224, "grad_norm": 1.941453456878662, "learning_rate": 0.00012834519572953737, "loss": 1.5453, "step": 4037 }, { "epoch": 1.7946666666666666, "grad_norm": 1.993354320526123, "learning_rate": 0.00012832740213523133, "loss": 2.5863, "step": 4038 }, { "epoch": 1.795111111111111, "grad_norm": 1.9085299968719482, "learning_rate": 0.00012830960854092526, "loss": 2.0983, "step": 4039 }, { "epoch": 1.7955555555555556, "grad_norm": 2.2730631828308105, "learning_rate": 0.00012829181494661922, "loss": 2.894, "step": 4040 }, { "epoch": 1.796, "grad_norm": 2.3365797996520996, "learning_rate": 0.00012827402135231317, "loss": 2.123, "step": 4041 }, { "epoch": 1.7964444444444445, "grad_norm": 1.8728188276290894, "learning_rate": 0.00012825622775800713, "loss": 1.4073, "step": 4042 }, { "epoch": 1.7968888888888888, "grad_norm": 2.5472781658172607, "learning_rate": 0.00012823843416370106, "loss": 2.2789, "step": 4043 }, { "epoch": 1.7973333333333334, "grad_norm": 2.269137382507324, "learning_rate": 0.00012822064056939501, "loss": 2.3703, "step": 4044 }, { "epoch": 1.7977777777777777, "grad_norm": 2.4079058170318604, "learning_rate": 0.00012820284697508897, "loss": 1.6054, "step": 4045 }, { "epoch": 1.7982222222222224, "grad_norm": 2.692018747329712, "learning_rate": 0.00012818505338078293, "loss": 2.6721, "step": 4046 }, { "epoch": 1.7986666666666666, "grad_norm": 2.388993740081787, "learning_rate": 0.00012816725978647688, "loss": 2.0374, "step": 4047 }, { "epoch": 1.799111111111111, "grad_norm": 1.563175082206726, "learning_rate": 0.00012814946619217084, "loss": 0.9111, "step": 4048 }, { "epoch": 1.7995555555555556, "grad_norm": 2.669541835784912, "learning_rate": 0.00012813167259786477, "loss": 1.4868, "step": 4049 }, { "epoch": 1.8, "grad_norm": 2.8668127059936523, "learning_rate": 0.00012811387900355873, "loss": 1.8904, "step": 4050 }, { "epoch": 1.8004444444444445, "grad_norm": 1.4953787326812744, "learning_rate": 0.00012809608540925266, "loss": 2.3084, "step": 4051 }, { "epoch": 1.8008888888888888, "grad_norm": 1.6110552549362183, "learning_rate": 0.0001280782918149466, "loss": 2.9876, "step": 4052 }, { "epoch": 1.8013333333333335, "grad_norm": 1.4815409183502197, "learning_rate": 0.00012806049822064057, "loss": 2.3245, "step": 4053 }, { "epoch": 1.8017777777777777, "grad_norm": 1.6469542980194092, "learning_rate": 0.00012804270462633453, "loss": 2.4534, "step": 4054 }, { "epoch": 1.8022222222222222, "grad_norm": 1.7720386981964111, "learning_rate": 0.00012802491103202848, "loss": 2.5591, "step": 4055 }, { "epoch": 1.8026666666666666, "grad_norm": 1.607649564743042, "learning_rate": 0.0001280071174377224, "loss": 2.3919, "step": 4056 }, { "epoch": 1.803111111111111, "grad_norm": 1.521120548248291, "learning_rate": 0.00012798932384341637, "loss": 2.292, "step": 4057 }, { "epoch": 1.8035555555555556, "grad_norm": 1.7461004257202148, "learning_rate": 0.00012797153024911032, "loss": 2.1695, "step": 4058 }, { "epoch": 1.804, "grad_norm": 1.6158878803253174, "learning_rate": 0.00012795373665480428, "loss": 1.4256, "step": 4059 }, { "epoch": 1.8044444444444445, "grad_norm": 1.6743505001068115, "learning_rate": 0.00012793594306049824, "loss": 1.9986, "step": 4060 }, { "epoch": 1.8048888888888888, "grad_norm": 1.6149520874023438, "learning_rate": 0.0001279181494661922, "loss": 2.5781, "step": 4061 }, { "epoch": 1.8053333333333335, "grad_norm": 1.5467309951782227, "learning_rate": 0.00012790035587188612, "loss": 1.7904, "step": 4062 }, { "epoch": 1.8057777777777777, "grad_norm": 1.7630541324615479, "learning_rate": 0.00012788256227758008, "loss": 2.2801, "step": 4063 }, { "epoch": 1.8062222222222222, "grad_norm": 1.8055700063705444, "learning_rate": 0.000127864768683274, "loss": 1.9704, "step": 4064 }, { "epoch": 1.8066666666666666, "grad_norm": 1.7958096265792847, "learning_rate": 0.00012784697508896797, "loss": 2.3531, "step": 4065 }, { "epoch": 1.8071111111111111, "grad_norm": 1.426638126373291, "learning_rate": 0.00012782918149466192, "loss": 1.2682, "step": 4066 }, { "epoch": 1.8075555555555556, "grad_norm": 1.6399476528167725, "learning_rate": 0.00012781138790035588, "loss": 2.1429, "step": 4067 }, { "epoch": 1.808, "grad_norm": 1.7020010948181152, "learning_rate": 0.00012779359430604984, "loss": 2.2423, "step": 4068 }, { "epoch": 1.8084444444444445, "grad_norm": 1.5817952156066895, "learning_rate": 0.00012777580071174376, "loss": 1.7548, "step": 4069 }, { "epoch": 1.8088888888888888, "grad_norm": 1.6301820278167725, "learning_rate": 0.00012775800711743772, "loss": 2.1788, "step": 4070 }, { "epoch": 1.8093333333333335, "grad_norm": 1.7955340147018433, "learning_rate": 0.00012774021352313168, "loss": 1.756, "step": 4071 }, { "epoch": 1.8097777777777777, "grad_norm": 1.5359266996383667, "learning_rate": 0.00012772241992882563, "loss": 1.714, "step": 4072 }, { "epoch": 1.8102222222222222, "grad_norm": 1.6445212364196777, "learning_rate": 0.0001277046263345196, "loss": 1.9434, "step": 4073 }, { "epoch": 1.8106666666666666, "grad_norm": 1.811740517616272, "learning_rate": 0.00012768683274021355, "loss": 1.7521, "step": 4074 }, { "epoch": 1.8111111111111111, "grad_norm": 1.9442163705825806, "learning_rate": 0.00012766903914590748, "loss": 2.284, "step": 4075 }, { "epoch": 1.8115555555555556, "grad_norm": 2.383162498474121, "learning_rate": 0.00012765124555160143, "loss": 2.1859, "step": 4076 }, { "epoch": 1.812, "grad_norm": 1.7321439981460571, "learning_rate": 0.00012763345195729536, "loss": 1.6559, "step": 4077 }, { "epoch": 1.8124444444444445, "grad_norm": 1.9843666553497314, "learning_rate": 0.00012761565836298932, "loss": 2.1682, "step": 4078 }, { "epoch": 1.8128888888888888, "grad_norm": 1.7194164991378784, "learning_rate": 0.00012759786476868328, "loss": 1.4098, "step": 4079 }, { "epoch": 1.8133333333333335, "grad_norm": 1.5130424499511719, "learning_rate": 0.00012758007117437723, "loss": 1.1193, "step": 4080 }, { "epoch": 1.8137777777777777, "grad_norm": 2.0162298679351807, "learning_rate": 0.0001275622775800712, "loss": 2.1374, "step": 4081 }, { "epoch": 1.8142222222222222, "grad_norm": 2.095752716064453, "learning_rate": 0.00012754448398576512, "loss": 1.7003, "step": 4082 }, { "epoch": 1.8146666666666667, "grad_norm": 1.4571044445037842, "learning_rate": 0.00012752669039145907, "loss": 0.9885, "step": 4083 }, { "epoch": 1.8151111111111111, "grad_norm": 2.0335910320281982, "learning_rate": 0.00012750889679715303, "loss": 1.7632, "step": 4084 }, { "epoch": 1.8155555555555556, "grad_norm": 2.1240885257720947, "learning_rate": 0.000127491103202847, "loss": 1.9801, "step": 4085 }, { "epoch": 1.8159999999999998, "grad_norm": 2.1505091190338135, "learning_rate": 0.00012747330960854094, "loss": 2.513, "step": 4086 }, { "epoch": 1.8164444444444445, "grad_norm": 2.2950072288513184, "learning_rate": 0.0001274555160142349, "loss": 1.9917, "step": 4087 }, { "epoch": 1.8168888888888888, "grad_norm": 2.2907001972198486, "learning_rate": 0.00012743772241992883, "loss": 2.3145, "step": 4088 }, { "epoch": 1.8173333333333335, "grad_norm": 2.215815782546997, "learning_rate": 0.00012741992882562279, "loss": 1.8032, "step": 4089 }, { "epoch": 1.8177777777777777, "grad_norm": 2.2718591690063477, "learning_rate": 0.00012740213523131672, "loss": 2.2278, "step": 4090 }, { "epoch": 1.8182222222222222, "grad_norm": 2.494612455368042, "learning_rate": 0.00012738434163701067, "loss": 2.8879, "step": 4091 }, { "epoch": 1.8186666666666667, "grad_norm": 2.505629539489746, "learning_rate": 0.00012736654804270463, "loss": 2.0987, "step": 4092 }, { "epoch": 1.8191111111111111, "grad_norm": 2.47916841506958, "learning_rate": 0.00012734875444839859, "loss": 2.3081, "step": 4093 }, { "epoch": 1.8195555555555556, "grad_norm": 2.0762839317321777, "learning_rate": 0.00012733096085409254, "loss": 1.929, "step": 4094 }, { "epoch": 1.8199999999999998, "grad_norm": 2.3247344493865967, "learning_rate": 0.00012731316725978647, "loss": 2.4938, "step": 4095 }, { "epoch": 1.8204444444444445, "grad_norm": 2.481536865234375, "learning_rate": 0.00012729537366548043, "loss": 2.1279, "step": 4096 }, { "epoch": 1.8208888888888888, "grad_norm": 2.1468632221221924, "learning_rate": 0.00012727758007117438, "loss": 2.2491, "step": 4097 }, { "epoch": 1.8213333333333335, "grad_norm": 2.509892702102661, "learning_rate": 0.00012725978647686834, "loss": 2.1366, "step": 4098 }, { "epoch": 1.8217777777777777, "grad_norm": 2.757918119430542, "learning_rate": 0.0001272419928825623, "loss": 2.3787, "step": 4099 }, { "epoch": 1.8222222222222222, "grad_norm": 2.95184326171875, "learning_rate": 0.00012722419928825625, "loss": 1.7402, "step": 4100 }, { "epoch": 1.8226666666666667, "grad_norm": 1.111022710800171, "learning_rate": 0.00012720640569395018, "loss": 1.3282, "step": 4101 }, { "epoch": 1.8231111111111111, "grad_norm": 1.570564866065979, "learning_rate": 0.00012718861209964414, "loss": 2.5706, "step": 4102 }, { "epoch": 1.8235555555555556, "grad_norm": 1.5850750207901, "learning_rate": 0.00012717081850533807, "loss": 2.603, "step": 4103 }, { "epoch": 1.8239999999999998, "grad_norm": 1.873236060142517, "learning_rate": 0.00012715302491103203, "loss": 2.306, "step": 4104 }, { "epoch": 1.8244444444444445, "grad_norm": 1.5911369323730469, "learning_rate": 0.00012713523131672598, "loss": 2.377, "step": 4105 }, { "epoch": 1.8248888888888888, "grad_norm": 1.6573972702026367, "learning_rate": 0.00012711743772241994, "loss": 2.1528, "step": 4106 }, { "epoch": 1.8253333333333335, "grad_norm": 1.76816987991333, "learning_rate": 0.0001270996441281139, "loss": 1.9276, "step": 4107 }, { "epoch": 1.8257777777777777, "grad_norm": 1.6484469175338745, "learning_rate": 0.00012708185053380782, "loss": 2.2868, "step": 4108 }, { "epoch": 1.8262222222222222, "grad_norm": 1.8483304977416992, "learning_rate": 0.00012706405693950178, "loss": 1.8456, "step": 4109 }, { "epoch": 1.8266666666666667, "grad_norm": 1.5337408781051636, "learning_rate": 0.00012704626334519574, "loss": 1.3085, "step": 4110 }, { "epoch": 1.8271111111111111, "grad_norm": 1.832377314567566, "learning_rate": 0.0001270284697508897, "loss": 2.0178, "step": 4111 }, { "epoch": 1.8275555555555556, "grad_norm": 1.631611943244934, "learning_rate": 0.00012701067615658365, "loss": 2.3294, "step": 4112 }, { "epoch": 1.8279999999999998, "grad_norm": 1.6167796850204468, "learning_rate": 0.00012699288256227758, "loss": 2.2195, "step": 4113 }, { "epoch": 1.8284444444444445, "grad_norm": 1.6683152914047241, "learning_rate": 0.00012697508896797154, "loss": 2.4356, "step": 4114 }, { "epoch": 1.8288888888888888, "grad_norm": 1.8200979232788086, "learning_rate": 0.0001269572953736655, "loss": 2.0209, "step": 4115 }, { "epoch": 1.8293333333333335, "grad_norm": 1.7240800857543945, "learning_rate": 0.00012693950177935942, "loss": 2.5672, "step": 4116 }, { "epoch": 1.8297777777777777, "grad_norm": 1.9097305536270142, "learning_rate": 0.00012692170818505338, "loss": 2.4372, "step": 4117 }, { "epoch": 1.8302222222222222, "grad_norm": 1.1662497520446777, "learning_rate": 0.00012690391459074733, "loss": 1.1567, "step": 4118 }, { "epoch": 1.8306666666666667, "grad_norm": 1.9544744491577148, "learning_rate": 0.0001268861209964413, "loss": 2.2734, "step": 4119 }, { "epoch": 1.8311111111111111, "grad_norm": 1.7287496328353882, "learning_rate": 0.00012686832740213522, "loss": 1.9544, "step": 4120 }, { "epoch": 1.8315555555555556, "grad_norm": 1.6461305618286133, "learning_rate": 0.00012685053380782918, "loss": 2.2844, "step": 4121 }, { "epoch": 1.8319999999999999, "grad_norm": 1.8667876720428467, "learning_rate": 0.00012683274021352313, "loss": 2.483, "step": 4122 }, { "epoch": 1.8324444444444445, "grad_norm": 1.8884638547897339, "learning_rate": 0.0001268149466192171, "loss": 2.2412, "step": 4123 }, { "epoch": 1.8328888888888888, "grad_norm": 1.9503480195999146, "learning_rate": 0.00012679715302491105, "loss": 2.3155, "step": 4124 }, { "epoch": 1.8333333333333335, "grad_norm": 1.864048719406128, "learning_rate": 0.000126779359430605, "loss": 1.9597, "step": 4125 }, { "epoch": 1.8337777777777777, "grad_norm": 1.8108760118484497, "learning_rate": 0.00012676156583629893, "loss": 2.4295, "step": 4126 }, { "epoch": 1.8342222222222222, "grad_norm": 2.0015783309936523, "learning_rate": 0.0001267437722419929, "loss": 2.2783, "step": 4127 }, { "epoch": 1.8346666666666667, "grad_norm": 1.8798251152038574, "learning_rate": 0.00012672597864768685, "loss": 1.6982, "step": 4128 }, { "epoch": 1.8351111111111111, "grad_norm": 2.052774667739868, "learning_rate": 0.00012670818505338078, "loss": 2.3289, "step": 4129 }, { "epoch": 1.8355555555555556, "grad_norm": 2.0394935607910156, "learning_rate": 0.00012669039145907473, "loss": 2.3255, "step": 4130 }, { "epoch": 1.8359999999999999, "grad_norm": 3.176072120666504, "learning_rate": 0.0001266725978647687, "loss": 2.1502, "step": 4131 }, { "epoch": 1.8364444444444445, "grad_norm": 2.176759719848633, "learning_rate": 0.00012665480427046264, "loss": 2.0181, "step": 4132 }, { "epoch": 1.8368888888888888, "grad_norm": 1.9604226350784302, "learning_rate": 0.00012663701067615657, "loss": 2.3706, "step": 4133 }, { "epoch": 1.8373333333333335, "grad_norm": 1.9100964069366455, "learning_rate": 0.00012661921708185053, "loss": 2.1983, "step": 4134 }, { "epoch": 1.8377777777777777, "grad_norm": 1.7293579578399658, "learning_rate": 0.0001266014234875445, "loss": 2.0016, "step": 4135 }, { "epoch": 1.8382222222222222, "grad_norm": 1.660583257675171, "learning_rate": 0.00012658362989323844, "loss": 1.5306, "step": 4136 }, { "epoch": 1.8386666666666667, "grad_norm": 1.858089804649353, "learning_rate": 0.0001265658362989324, "loss": 2.1515, "step": 4137 }, { "epoch": 1.8391111111111111, "grad_norm": 2.4940619468688965, "learning_rate": 0.00012654804270462636, "loss": 1.9427, "step": 4138 }, { "epoch": 1.8395555555555556, "grad_norm": 2.1511716842651367, "learning_rate": 0.00012653024911032029, "loss": 2.0729, "step": 4139 }, { "epoch": 1.8399999999999999, "grad_norm": 2.469057321548462, "learning_rate": 0.00012651245551601424, "loss": 1.8632, "step": 4140 }, { "epoch": 1.8404444444444445, "grad_norm": 2.173260450363159, "learning_rate": 0.0001264946619217082, "loss": 2.422, "step": 4141 }, { "epoch": 1.8408888888888888, "grad_norm": 2.3541669845581055, "learning_rate": 0.00012647686832740213, "loss": 2.3772, "step": 4142 }, { "epoch": 1.8413333333333335, "grad_norm": 2.6083996295928955, "learning_rate": 0.00012645907473309608, "loss": 2.3829, "step": 4143 }, { "epoch": 1.8417777777777777, "grad_norm": 2.0515520572662354, "learning_rate": 0.00012644128113879004, "loss": 2.0062, "step": 4144 }, { "epoch": 1.8422222222222222, "grad_norm": 2.331029176712036, "learning_rate": 0.000126423487544484, "loss": 2.1876, "step": 4145 }, { "epoch": 1.8426666666666667, "grad_norm": 2.6274282932281494, "learning_rate": 0.00012640569395017793, "loss": 2.344, "step": 4146 }, { "epoch": 1.8431111111111111, "grad_norm": 2.3808679580688477, "learning_rate": 0.00012638790035587188, "loss": 1.9633, "step": 4147 }, { "epoch": 1.8435555555555556, "grad_norm": 2.2616419792175293, "learning_rate": 0.00012637010676156584, "loss": 2.2526, "step": 4148 }, { "epoch": 1.8439999999999999, "grad_norm": 1.2193570137023926, "learning_rate": 0.0001263523131672598, "loss": 0.1276, "step": 4149 }, { "epoch": 1.8444444444444446, "grad_norm": 2.9776499271392822, "learning_rate": 0.00012633451957295375, "loss": 1.8211, "step": 4150 }, { "epoch": 1.8448888888888888, "grad_norm": 1.406093955039978, "learning_rate": 0.0001263167259786477, "loss": 2.3213, "step": 4151 }, { "epoch": 1.8453333333333335, "grad_norm": 1.0631603002548218, "learning_rate": 0.00012629893238434164, "loss": 1.0766, "step": 4152 }, { "epoch": 1.8457777777777777, "grad_norm": 1.5336110591888428, "learning_rate": 0.0001262811387900356, "loss": 2.4137, "step": 4153 }, { "epoch": 1.8462222222222222, "grad_norm": 1.60658597946167, "learning_rate": 0.00012626334519572955, "loss": 2.7207, "step": 4154 }, { "epoch": 1.8466666666666667, "grad_norm": 1.6107380390167236, "learning_rate": 0.00012624555160142348, "loss": 2.6709, "step": 4155 }, { "epoch": 1.8471111111111111, "grad_norm": 1.7542976140975952, "learning_rate": 0.00012622775800711744, "loss": 2.6682, "step": 4156 }, { "epoch": 1.8475555555555556, "grad_norm": 1.7753915786743164, "learning_rate": 0.0001262099644128114, "loss": 2.0563, "step": 4157 }, { "epoch": 1.8479999999999999, "grad_norm": 1.0910627841949463, "learning_rate": 0.00012619217081850535, "loss": 1.2284, "step": 4158 }, { "epoch": 1.8484444444444446, "grad_norm": 1.4845707416534424, "learning_rate": 0.00012617437722419928, "loss": 1.7377, "step": 4159 }, { "epoch": 1.8488888888888888, "grad_norm": 1.4500175714492798, "learning_rate": 0.00012615658362989324, "loss": 1.5395, "step": 4160 }, { "epoch": 1.8493333333333335, "grad_norm": 1.6412715911865234, "learning_rate": 0.0001261387900355872, "loss": 1.5817, "step": 4161 }, { "epoch": 1.8497777777777777, "grad_norm": 1.6677457094192505, "learning_rate": 0.00012612099644128115, "loss": 2.4675, "step": 4162 }, { "epoch": 1.8502222222222222, "grad_norm": 1.9067307710647583, "learning_rate": 0.0001261032028469751, "loss": 2.383, "step": 4163 }, { "epoch": 1.8506666666666667, "grad_norm": 1.6068315505981445, "learning_rate": 0.00012608540925266906, "loss": 1.9027, "step": 4164 }, { "epoch": 1.8511111111111112, "grad_norm": 1.8001701831817627, "learning_rate": 0.000126067615658363, "loss": 2.42, "step": 4165 }, { "epoch": 1.8515555555555556, "grad_norm": 2.3257319927215576, "learning_rate": 0.00012604982206405695, "loss": 2.0886, "step": 4166 }, { "epoch": 1.8519999999999999, "grad_norm": 2.088073492050171, "learning_rate": 0.00012603202846975088, "loss": 2.7623, "step": 4167 }, { "epoch": 1.8524444444444446, "grad_norm": 1.0101107358932495, "learning_rate": 0.00012601423487544483, "loss": 0.7091, "step": 4168 }, { "epoch": 1.8528888888888888, "grad_norm": 1.636488676071167, "learning_rate": 0.0001259964412811388, "loss": 2.0673, "step": 4169 }, { "epoch": 1.8533333333333335, "grad_norm": 1.791585922241211, "learning_rate": 0.00012597864768683275, "loss": 2.1496, "step": 4170 }, { "epoch": 1.8537777777777777, "grad_norm": 1.8314164876937866, "learning_rate": 0.0001259608540925267, "loss": 2.2932, "step": 4171 }, { "epoch": 1.8542222222222222, "grad_norm": 1.7913572788238525, "learning_rate": 0.00012594306049822063, "loss": 1.8432, "step": 4172 }, { "epoch": 1.8546666666666667, "grad_norm": 1.7947697639465332, "learning_rate": 0.0001259252669039146, "loss": 1.7726, "step": 4173 }, { "epoch": 1.8551111111111112, "grad_norm": 1.8670748472213745, "learning_rate": 0.00012590747330960855, "loss": 2.0514, "step": 4174 }, { "epoch": 1.8555555555555556, "grad_norm": 1.9150646924972534, "learning_rate": 0.0001258896797153025, "loss": 2.2755, "step": 4175 }, { "epoch": 1.8559999999999999, "grad_norm": 1.9982563257217407, "learning_rate": 0.00012587188612099646, "loss": 2.4126, "step": 4176 }, { "epoch": 1.8564444444444446, "grad_norm": 1.914477825164795, "learning_rate": 0.00012585409252669042, "loss": 1.7552, "step": 4177 }, { "epoch": 1.8568888888888888, "grad_norm": 2.1474571228027344, "learning_rate": 0.00012583629893238435, "loss": 2.5033, "step": 4178 }, { "epoch": 1.8573333333333333, "grad_norm": 1.845629334449768, "learning_rate": 0.0001258185053380783, "loss": 2.109, "step": 4179 }, { "epoch": 1.8577777777777778, "grad_norm": 2.0772738456726074, "learning_rate": 0.00012580071174377223, "loss": 1.5434, "step": 4180 }, { "epoch": 1.8582222222222222, "grad_norm": 1.9606627225875854, "learning_rate": 0.0001257829181494662, "loss": 2.1156, "step": 4181 }, { "epoch": 1.8586666666666667, "grad_norm": 2.0049219131469727, "learning_rate": 0.00012576512455516014, "loss": 1.9324, "step": 4182 }, { "epoch": 1.8591111111111112, "grad_norm": 1.535262107849121, "learning_rate": 0.0001257473309608541, "loss": 1.2178, "step": 4183 }, { "epoch": 1.8595555555555556, "grad_norm": 2.2102348804473877, "learning_rate": 0.00012572953736654806, "loss": 2.8349, "step": 4184 }, { "epoch": 1.8599999999999999, "grad_norm": 1.7623968124389648, "learning_rate": 0.000125711743772242, "loss": 1.5631, "step": 4185 }, { "epoch": 1.8604444444444446, "grad_norm": 2.135024070739746, "learning_rate": 0.00012569395017793594, "loss": 2.233, "step": 4186 }, { "epoch": 1.8608888888888888, "grad_norm": 2.3472445011138916, "learning_rate": 0.0001256761565836299, "loss": 2.658, "step": 4187 }, { "epoch": 1.8613333333333333, "grad_norm": 2.1595356464385986, "learning_rate": 0.00012565836298932386, "loss": 2.0647, "step": 4188 }, { "epoch": 1.8617777777777778, "grad_norm": 2.6070942878723145, "learning_rate": 0.0001256405693950178, "loss": 2.0941, "step": 4189 }, { "epoch": 1.8622222222222222, "grad_norm": 2.0253846645355225, "learning_rate": 0.00012562277580071177, "loss": 1.9656, "step": 4190 }, { "epoch": 1.8626666666666667, "grad_norm": 2.2132487297058105, "learning_rate": 0.0001256049822064057, "loss": 2.3254, "step": 4191 }, { "epoch": 1.8631111111111112, "grad_norm": 2.6254289150238037, "learning_rate": 0.00012558718861209966, "loss": 3.0945, "step": 4192 }, { "epoch": 1.8635555555555556, "grad_norm": 2.6178314685821533, "learning_rate": 0.00012556939501779358, "loss": 2.216, "step": 4193 }, { "epoch": 1.8639999999999999, "grad_norm": 2.047734498977661, "learning_rate": 0.00012555160142348754, "loss": 1.6887, "step": 4194 }, { "epoch": 1.8644444444444446, "grad_norm": 3.436713695526123, "learning_rate": 0.0001255338078291815, "loss": 1.5871, "step": 4195 }, { "epoch": 1.8648888888888888, "grad_norm": 2.280249834060669, "learning_rate": 0.00012551601423487545, "loss": 2.2639, "step": 4196 }, { "epoch": 1.8653333333333333, "grad_norm": 2.708115816116333, "learning_rate": 0.0001254982206405694, "loss": 2.4468, "step": 4197 }, { "epoch": 1.8657777777777778, "grad_norm": 2.6053898334503174, "learning_rate": 0.00012548042704626334, "loss": 2.3777, "step": 4198 }, { "epoch": 1.8662222222222222, "grad_norm": 1.6615748405456543, "learning_rate": 0.0001254626334519573, "loss": 0.9062, "step": 4199 }, { "epoch": 1.8666666666666667, "grad_norm": 2.4906649589538574, "learning_rate": 0.00012544483985765125, "loss": 0.8929, "step": 4200 }, { "epoch": 1.8671111111111112, "grad_norm": 1.4890950918197632, "learning_rate": 0.0001254270462633452, "loss": 2.7543, "step": 4201 }, { "epoch": 1.8675555555555556, "grad_norm": 1.5864791870117188, "learning_rate": 0.00012540925266903917, "loss": 2.6892, "step": 4202 }, { "epoch": 1.8679999999999999, "grad_norm": 1.612914800643921, "learning_rate": 0.0001253914590747331, "loss": 1.7013, "step": 4203 }, { "epoch": 1.8684444444444446, "grad_norm": 1.4477015733718872, "learning_rate": 0.00012537366548042705, "loss": 1.879, "step": 4204 }, { "epoch": 1.8688888888888888, "grad_norm": 1.9278892278671265, "learning_rate": 0.000125355871886121, "loss": 2.1627, "step": 4205 }, { "epoch": 1.8693333333333333, "grad_norm": 1.7229881286621094, "learning_rate": 0.00012533807829181494, "loss": 2.3671, "step": 4206 }, { "epoch": 1.8697777777777778, "grad_norm": 1.8617095947265625, "learning_rate": 0.0001253202846975089, "loss": 2.6608, "step": 4207 }, { "epoch": 1.8702222222222222, "grad_norm": 1.8363666534423828, "learning_rate": 0.00012530249110320285, "loss": 2.059, "step": 4208 }, { "epoch": 1.8706666666666667, "grad_norm": 1.920870304107666, "learning_rate": 0.0001252846975088968, "loss": 1.7415, "step": 4209 }, { "epoch": 1.871111111111111, "grad_norm": 1.610530138015747, "learning_rate": 0.00012526690391459074, "loss": 2.2202, "step": 4210 }, { "epoch": 1.8715555555555556, "grad_norm": 1.8126658201217651, "learning_rate": 0.0001252491103202847, "loss": 2.2095, "step": 4211 }, { "epoch": 1.8719999999999999, "grad_norm": 2.0825304985046387, "learning_rate": 0.00012523131672597865, "loss": 2.3952, "step": 4212 }, { "epoch": 1.8724444444444446, "grad_norm": 1.455892562866211, "learning_rate": 0.0001252135231316726, "loss": 1.4388, "step": 4213 }, { "epoch": 1.8728888888888888, "grad_norm": 1.9848651885986328, "learning_rate": 0.00012519572953736656, "loss": 2.325, "step": 4214 }, { "epoch": 1.8733333333333333, "grad_norm": 1.972327709197998, "learning_rate": 0.00012517793594306052, "loss": 1.9154, "step": 4215 }, { "epoch": 1.8737777777777778, "grad_norm": 1.8076159954071045, "learning_rate": 0.00012516014234875445, "loss": 2.4675, "step": 4216 }, { "epoch": 1.8742222222222222, "grad_norm": 1.6413958072662354, "learning_rate": 0.0001251423487544484, "loss": 1.5596, "step": 4217 }, { "epoch": 1.8746666666666667, "grad_norm": 1.9646254777908325, "learning_rate": 0.00012512455516014236, "loss": 2.1716, "step": 4218 }, { "epoch": 1.875111111111111, "grad_norm": 1.7197006940841675, "learning_rate": 0.0001251067615658363, "loss": 2.0816, "step": 4219 }, { "epoch": 1.8755555555555556, "grad_norm": 1.8920749425888062, "learning_rate": 0.00012508896797153025, "loss": 2.2979, "step": 4220 }, { "epoch": 1.876, "grad_norm": 1.8623499870300293, "learning_rate": 0.0001250711743772242, "loss": 2.5679, "step": 4221 }, { "epoch": 1.8764444444444446, "grad_norm": 2.2921416759490967, "learning_rate": 0.00012505338078291816, "loss": 2.4083, "step": 4222 }, { "epoch": 1.8768888888888888, "grad_norm": 1.984156847000122, "learning_rate": 0.0001250355871886121, "loss": 2.6118, "step": 4223 }, { "epoch": 1.8773333333333333, "grad_norm": 1.681645154953003, "learning_rate": 0.00012501779359430605, "loss": 1.4962, "step": 4224 }, { "epoch": 1.8777777777777778, "grad_norm": 2.018657684326172, "learning_rate": 0.000125, "loss": 2.2015, "step": 4225 }, { "epoch": 1.8782222222222222, "grad_norm": 1.8145815134048462, "learning_rate": 0.00012498220640569396, "loss": 1.4773, "step": 4226 }, { "epoch": 1.8786666666666667, "grad_norm": 1.8681118488311768, "learning_rate": 0.00012496441281138792, "loss": 2.0218, "step": 4227 }, { "epoch": 1.879111111111111, "grad_norm": 2.0949301719665527, "learning_rate": 0.00012494661921708187, "loss": 2.1287, "step": 4228 }, { "epoch": 1.8795555555555556, "grad_norm": 1.3150235414505005, "learning_rate": 0.0001249288256227758, "loss": 1.1873, "step": 4229 }, { "epoch": 1.88, "grad_norm": 2.1858744621276855, "learning_rate": 0.00012491103202846976, "loss": 2.2247, "step": 4230 }, { "epoch": 1.8804444444444446, "grad_norm": 2.0682497024536133, "learning_rate": 0.00012489323843416371, "loss": 2.2693, "step": 4231 }, { "epoch": 1.8808888888888888, "grad_norm": 2.008765697479248, "learning_rate": 0.00012487544483985764, "loss": 1.8305, "step": 4232 }, { "epoch": 1.8813333333333333, "grad_norm": 2.322118043899536, "learning_rate": 0.0001248576512455516, "loss": 2.6863, "step": 4233 }, { "epoch": 1.8817777777777778, "grad_norm": 2.1211628913879395, "learning_rate": 0.00012483985765124556, "loss": 2.3278, "step": 4234 }, { "epoch": 1.8822222222222222, "grad_norm": 2.126739501953125, "learning_rate": 0.0001248220640569395, "loss": 1.9615, "step": 4235 }, { "epoch": 1.8826666666666667, "grad_norm": 1.4139108657836914, "learning_rate": 0.00012480427046263344, "loss": 1.3694, "step": 4236 }, { "epoch": 1.883111111111111, "grad_norm": 2.1995489597320557, "learning_rate": 0.0001247864768683274, "loss": 2.1418, "step": 4237 }, { "epoch": 1.8835555555555556, "grad_norm": 2.4655182361602783, "learning_rate": 0.00012476868327402136, "loss": 2.497, "step": 4238 }, { "epoch": 1.884, "grad_norm": 2.2003283500671387, "learning_rate": 0.0001247508896797153, "loss": 2.6386, "step": 4239 }, { "epoch": 1.8844444444444446, "grad_norm": 2.3272600173950195, "learning_rate": 0.00012473309608540927, "loss": 2.2253, "step": 4240 }, { "epoch": 1.8848888888888888, "grad_norm": 1.7986118793487549, "learning_rate": 0.00012471530249110323, "loss": 2.112, "step": 4241 }, { "epoch": 1.8853333333333333, "grad_norm": 2.1452407836914062, "learning_rate": 0.00012469750889679715, "loss": 1.9534, "step": 4242 }, { "epoch": 1.8857777777777778, "grad_norm": 2.466782808303833, "learning_rate": 0.0001246797153024911, "loss": 2.5373, "step": 4243 }, { "epoch": 1.8862222222222222, "grad_norm": 2.473870277404785, "learning_rate": 0.00012466192170818507, "loss": 2.505, "step": 4244 }, { "epoch": 1.8866666666666667, "grad_norm": 2.5043163299560547, "learning_rate": 0.000124644128113879, "loss": 2.4174, "step": 4245 }, { "epoch": 1.887111111111111, "grad_norm": 2.2881720066070557, "learning_rate": 0.00012462633451957295, "loss": 2.4042, "step": 4246 }, { "epoch": 1.8875555555555557, "grad_norm": 2.357645034790039, "learning_rate": 0.0001246085409252669, "loss": 2.1379, "step": 4247 }, { "epoch": 1.888, "grad_norm": 2.677143096923828, "learning_rate": 0.00012459074733096087, "loss": 2.0372, "step": 4248 }, { "epoch": 1.8884444444444446, "grad_norm": 2.4684627056121826, "learning_rate": 0.0001245729537366548, "loss": 1.7057, "step": 4249 }, { "epoch": 1.8888888888888888, "grad_norm": 2.5674924850463867, "learning_rate": 0.00012455516014234875, "loss": 1.4432, "step": 4250 }, { "epoch": 1.8893333333333333, "grad_norm": 1.4329222440719604, "learning_rate": 0.0001245373665480427, "loss": 2.3156, "step": 4251 }, { "epoch": 1.8897777777777778, "grad_norm": 1.45597505569458, "learning_rate": 0.00012451957295373667, "loss": 2.4051, "step": 4252 }, { "epoch": 1.8902222222222222, "grad_norm": 1.2270104885101318, "learning_rate": 0.00012450177935943062, "loss": 1.7157, "step": 4253 }, { "epoch": 1.8906666666666667, "grad_norm": 1.1220571994781494, "learning_rate": 0.00012448398576512458, "loss": 1.2594, "step": 4254 }, { "epoch": 1.891111111111111, "grad_norm": 1.5439119338989258, "learning_rate": 0.0001244661921708185, "loss": 2.2097, "step": 4255 }, { "epoch": 1.8915555555555557, "grad_norm": 1.6956852674484253, "learning_rate": 0.00012444839857651246, "loss": 2.6823, "step": 4256 }, { "epoch": 1.892, "grad_norm": 1.640121579170227, "learning_rate": 0.00012443060498220642, "loss": 2.2303, "step": 4257 }, { "epoch": 1.8924444444444446, "grad_norm": 1.3527717590332031, "learning_rate": 0.00012441281138790035, "loss": 1.1705, "step": 4258 }, { "epoch": 1.8928888888888888, "grad_norm": 1.7290159463882446, "learning_rate": 0.0001243950177935943, "loss": 2.4764, "step": 4259 }, { "epoch": 1.8933333333333333, "grad_norm": 1.5144662857055664, "learning_rate": 0.00012437722419928826, "loss": 1.8495, "step": 4260 }, { "epoch": 1.8937777777777778, "grad_norm": 1.8776041269302368, "learning_rate": 0.00012435943060498222, "loss": 2.7901, "step": 4261 }, { "epoch": 1.8942222222222223, "grad_norm": 1.9808104038238525, "learning_rate": 0.00012434163701067615, "loss": 2.3308, "step": 4262 }, { "epoch": 1.8946666666666667, "grad_norm": 1.6954816579818726, "learning_rate": 0.0001243238434163701, "loss": 2.1066, "step": 4263 }, { "epoch": 1.895111111111111, "grad_norm": 2.016079902648926, "learning_rate": 0.00012430604982206406, "loss": 2.4023, "step": 4264 }, { "epoch": 1.8955555555555557, "grad_norm": 1.6305691003799438, "learning_rate": 0.00012428825622775802, "loss": 1.8707, "step": 4265 }, { "epoch": 1.896, "grad_norm": 1.8553330898284912, "learning_rate": 0.00012427046263345198, "loss": 2.0848, "step": 4266 }, { "epoch": 1.8964444444444446, "grad_norm": 1.7080594301223755, "learning_rate": 0.00012425266903914593, "loss": 2.0636, "step": 4267 }, { "epoch": 1.8968888888888888, "grad_norm": 1.9433776140213013, "learning_rate": 0.00012423487544483986, "loss": 2.1638, "step": 4268 }, { "epoch": 1.8973333333333333, "grad_norm": 1.7864351272583008, "learning_rate": 0.00012421708185053382, "loss": 2.1109, "step": 4269 }, { "epoch": 1.8977777777777778, "grad_norm": 1.667856216430664, "learning_rate": 0.00012419928825622777, "loss": 1.4533, "step": 4270 }, { "epoch": 1.8982222222222223, "grad_norm": 1.8229117393493652, "learning_rate": 0.0001241814946619217, "loss": 2.093, "step": 4271 }, { "epoch": 1.8986666666666667, "grad_norm": 2.1975040435791016, "learning_rate": 0.00012416370106761566, "loss": 2.6783, "step": 4272 }, { "epoch": 1.899111111111111, "grad_norm": 1.8947041034698486, "learning_rate": 0.00012414590747330962, "loss": 2.4428, "step": 4273 }, { "epoch": 1.8995555555555557, "grad_norm": 1.981955647468567, "learning_rate": 0.00012412811387900357, "loss": 1.9542, "step": 4274 }, { "epoch": 1.9, "grad_norm": 2.057514190673828, "learning_rate": 0.0001241103202846975, "loss": 2.4705, "step": 4275 }, { "epoch": 1.9004444444444446, "grad_norm": 1.9923932552337646, "learning_rate": 0.00012409252669039146, "loss": 2.1799, "step": 4276 }, { "epoch": 1.9008888888888889, "grad_norm": 1.9740736484527588, "learning_rate": 0.00012407473309608542, "loss": 2.401, "step": 4277 }, { "epoch": 1.9013333333333333, "grad_norm": 0.5957589149475098, "learning_rate": 0.00012405693950177937, "loss": 0.0373, "step": 4278 }, { "epoch": 1.9017777777777778, "grad_norm": 1.835414171218872, "learning_rate": 0.00012403914590747333, "loss": 2.2666, "step": 4279 }, { "epoch": 1.9022222222222223, "grad_norm": 1.9342777729034424, "learning_rate": 0.00012402135231316728, "loss": 2.1608, "step": 4280 }, { "epoch": 1.9026666666666667, "grad_norm": 1.760406732559204, "learning_rate": 0.00012400355871886121, "loss": 1.7764, "step": 4281 }, { "epoch": 1.903111111111111, "grad_norm": 2.0876519680023193, "learning_rate": 0.00012398576512455517, "loss": 2.6303, "step": 4282 }, { "epoch": 1.9035555555555557, "grad_norm": 2.0983736515045166, "learning_rate": 0.00012396797153024913, "loss": 2.28, "step": 4283 }, { "epoch": 1.904, "grad_norm": 2.1319432258605957, "learning_rate": 0.00012395017793594306, "loss": 2.5054, "step": 4284 }, { "epoch": 1.9044444444444446, "grad_norm": 1.9509506225585938, "learning_rate": 0.000123932384341637, "loss": 1.8634, "step": 4285 }, { "epoch": 1.9048888888888889, "grad_norm": 2.6779415607452393, "learning_rate": 0.00012391459074733097, "loss": 2.8077, "step": 4286 }, { "epoch": 1.9053333333333333, "grad_norm": 1.5328880548477173, "learning_rate": 0.00012389679715302493, "loss": 1.1721, "step": 4287 }, { "epoch": 1.9057777777777778, "grad_norm": 2.5803184509277344, "learning_rate": 0.00012387900355871886, "loss": 2.3973, "step": 4288 }, { "epoch": 1.9062222222222223, "grad_norm": 1.985290765762329, "learning_rate": 0.0001238612099644128, "loss": 2.2365, "step": 4289 }, { "epoch": 1.9066666666666667, "grad_norm": 2.207040309906006, "learning_rate": 0.00012384341637010677, "loss": 1.5546, "step": 4290 }, { "epoch": 1.907111111111111, "grad_norm": 2.6116459369659424, "learning_rate": 0.00012382562277580072, "loss": 2.4088, "step": 4291 }, { "epoch": 1.9075555555555557, "grad_norm": 1.6348541975021362, "learning_rate": 0.00012380782918149468, "loss": 1.5725, "step": 4292 }, { "epoch": 1.908, "grad_norm": 2.5747082233428955, "learning_rate": 0.0001237900355871886, "loss": 2.5069, "step": 4293 }, { "epoch": 1.9084444444444446, "grad_norm": 2.236910104751587, "learning_rate": 0.00012377224199288257, "loss": 2.2303, "step": 4294 }, { "epoch": 1.9088888888888889, "grad_norm": 2.4665608406066895, "learning_rate": 0.00012375444839857652, "loss": 2.0439, "step": 4295 }, { "epoch": 1.9093333333333333, "grad_norm": 2.041987895965576, "learning_rate": 0.00012373665480427045, "loss": 2.1251, "step": 4296 }, { "epoch": 1.9097777777777778, "grad_norm": 2.454289436340332, "learning_rate": 0.0001237188612099644, "loss": 2.5853, "step": 4297 }, { "epoch": 1.9102222222222223, "grad_norm": 2.1545984745025635, "learning_rate": 0.00012370106761565837, "loss": 2.0791, "step": 4298 }, { "epoch": 1.9106666666666667, "grad_norm": 2.7019107341766357, "learning_rate": 0.00012368327402135232, "loss": 2.4134, "step": 4299 }, { "epoch": 1.911111111111111, "grad_norm": 3.2024686336517334, "learning_rate": 0.00012366548042704625, "loss": 1.6449, "step": 4300 }, { "epoch": 1.9115555555555557, "grad_norm": 1.3786087036132812, "learning_rate": 0.0001236476868327402, "loss": 2.5061, "step": 4301 }, { "epoch": 1.912, "grad_norm": 1.5719784498214722, "learning_rate": 0.00012362989323843417, "loss": 2.3814, "step": 4302 }, { "epoch": 1.9124444444444444, "grad_norm": 1.4920408725738525, "learning_rate": 0.00012361209964412812, "loss": 2.0447, "step": 4303 }, { "epoch": 1.9128888888888889, "grad_norm": 1.747694730758667, "learning_rate": 0.00012359430604982208, "loss": 2.8776, "step": 4304 }, { "epoch": 1.9133333333333333, "grad_norm": 1.8610203266143799, "learning_rate": 0.00012357651245551603, "loss": 2.3367, "step": 4305 }, { "epoch": 1.9137777777777778, "grad_norm": 1.5181238651275635, "learning_rate": 0.00012355871886120996, "loss": 1.9326, "step": 4306 }, { "epoch": 1.9142222222222223, "grad_norm": 1.707848072052002, "learning_rate": 0.00012354092526690392, "loss": 2.3507, "step": 4307 }, { "epoch": 1.9146666666666667, "grad_norm": 1.7067019939422607, "learning_rate": 0.00012352313167259788, "loss": 2.0366, "step": 4308 }, { "epoch": 1.915111111111111, "grad_norm": 1.7387741804122925, "learning_rate": 0.0001235053380782918, "loss": 2.0162, "step": 4309 }, { "epoch": 1.9155555555555557, "grad_norm": 1.7039339542388916, "learning_rate": 0.00012348754448398576, "loss": 2.1688, "step": 4310 }, { "epoch": 1.916, "grad_norm": 1.6669498682022095, "learning_rate": 0.00012346975088967972, "loss": 2.2707, "step": 4311 }, { "epoch": 1.9164444444444444, "grad_norm": 1.6338768005371094, "learning_rate": 0.00012345195729537368, "loss": 2.0841, "step": 4312 }, { "epoch": 1.9168888888888889, "grad_norm": 1.6878036260604858, "learning_rate": 0.0001234341637010676, "loss": 2.2946, "step": 4313 }, { "epoch": 1.9173333333333333, "grad_norm": 1.7214369773864746, "learning_rate": 0.00012341637010676156, "loss": 1.9289, "step": 4314 }, { "epoch": 1.9177777777777778, "grad_norm": 1.6168575286865234, "learning_rate": 0.00012339857651245552, "loss": 1.9991, "step": 4315 }, { "epoch": 1.9182222222222223, "grad_norm": 1.788225531578064, "learning_rate": 0.00012338078291814947, "loss": 2.3562, "step": 4316 }, { "epoch": 1.9186666666666667, "grad_norm": 1.5336823463439941, "learning_rate": 0.00012336298932384343, "loss": 1.7089, "step": 4317 }, { "epoch": 1.919111111111111, "grad_norm": 1.8361992835998535, "learning_rate": 0.0001233451957295374, "loss": 2.0708, "step": 4318 }, { "epoch": 1.9195555555555557, "grad_norm": 1.857429027557373, "learning_rate": 0.00012332740213523132, "loss": 1.7238, "step": 4319 }, { "epoch": 1.92, "grad_norm": 1.6928489208221436, "learning_rate": 0.00012330960854092527, "loss": 2.0781, "step": 4320 }, { "epoch": 1.9204444444444444, "grad_norm": 1.6085439920425415, "learning_rate": 0.00012329181494661923, "loss": 1.37, "step": 4321 }, { "epoch": 1.9208888888888889, "grad_norm": 1.9759423732757568, "learning_rate": 0.00012327402135231316, "loss": 2.561, "step": 4322 }, { "epoch": 1.9213333333333333, "grad_norm": 1.8976612091064453, "learning_rate": 0.00012325622775800712, "loss": 2.1077, "step": 4323 }, { "epoch": 1.9217777777777778, "grad_norm": 2.0411934852600098, "learning_rate": 0.00012323843416370107, "loss": 2.4446, "step": 4324 }, { "epoch": 1.9222222222222223, "grad_norm": 2.429276704788208, "learning_rate": 0.00012322064056939503, "loss": 2.5635, "step": 4325 }, { "epoch": 1.9226666666666667, "grad_norm": 2.421410322189331, "learning_rate": 0.00012320284697508896, "loss": 2.9828, "step": 4326 }, { "epoch": 1.923111111111111, "grad_norm": 2.055605173110962, "learning_rate": 0.00012318505338078291, "loss": 2.5132, "step": 4327 }, { "epoch": 1.9235555555555557, "grad_norm": 1.587828278541565, "learning_rate": 0.00012316725978647687, "loss": 1.3512, "step": 4328 }, { "epoch": 1.924, "grad_norm": 2.128390312194824, "learning_rate": 0.00012314946619217083, "loss": 2.1455, "step": 4329 }, { "epoch": 1.9244444444444444, "grad_norm": 1.7967475652694702, "learning_rate": 0.00012313167259786478, "loss": 0.8184, "step": 4330 }, { "epoch": 1.9248888888888889, "grad_norm": 2.1603779792785645, "learning_rate": 0.00012311387900355874, "loss": 2.2261, "step": 4331 }, { "epoch": 1.9253333333333333, "grad_norm": 2.0636863708496094, "learning_rate": 0.00012309608540925267, "loss": 2.0121, "step": 4332 }, { "epoch": 1.9257777777777778, "grad_norm": 2.0725672245025635, "learning_rate": 0.00012307829181494663, "loss": 2.276, "step": 4333 }, { "epoch": 1.926222222222222, "grad_norm": 2.1373484134674072, "learning_rate": 0.00012306049822064058, "loss": 2.3134, "step": 4334 }, { "epoch": 1.9266666666666667, "grad_norm": 1.7675268650054932, "learning_rate": 0.0001230427046263345, "loss": 1.5304, "step": 4335 }, { "epoch": 1.927111111111111, "grad_norm": 1.9493167400360107, "learning_rate": 0.00012302491103202847, "loss": 1.8928, "step": 4336 }, { "epoch": 1.9275555555555557, "grad_norm": 2.15889835357666, "learning_rate": 0.00012300711743772243, "loss": 2.0092, "step": 4337 }, { "epoch": 1.928, "grad_norm": 2.0056865215301514, "learning_rate": 0.00012298932384341638, "loss": 2.3798, "step": 4338 }, { "epoch": 1.9284444444444444, "grad_norm": 2.266223907470703, "learning_rate": 0.0001229715302491103, "loss": 2.3884, "step": 4339 }, { "epoch": 1.9288888888888889, "grad_norm": 2.173867702484131, "learning_rate": 0.00012295373665480427, "loss": 2.5138, "step": 4340 }, { "epoch": 1.9293333333333333, "grad_norm": 2.383648633956909, "learning_rate": 0.00012293594306049822, "loss": 2.4544, "step": 4341 }, { "epoch": 1.9297777777777778, "grad_norm": 1.9824879169464111, "learning_rate": 0.00012291814946619218, "loss": 1.7544, "step": 4342 }, { "epoch": 1.930222222222222, "grad_norm": 2.24977707862854, "learning_rate": 0.00012290035587188614, "loss": 2.9336, "step": 4343 }, { "epoch": 1.9306666666666668, "grad_norm": 2.6044070720672607, "learning_rate": 0.0001228825622775801, "loss": 2.0662, "step": 4344 }, { "epoch": 1.931111111111111, "grad_norm": 2.6351845264434814, "learning_rate": 0.00012286476868327402, "loss": 2.554, "step": 4345 }, { "epoch": 1.9315555555555557, "grad_norm": 2.308326005935669, "learning_rate": 0.00012284697508896798, "loss": 2.0536, "step": 4346 }, { "epoch": 1.932, "grad_norm": 2.2656610012054443, "learning_rate": 0.00012282918149466194, "loss": 2.3636, "step": 4347 }, { "epoch": 1.9324444444444444, "grad_norm": 2.7390570640563965, "learning_rate": 0.00012281138790035587, "loss": 2.2897, "step": 4348 }, { "epoch": 1.9328888888888889, "grad_norm": 2.2677807807922363, "learning_rate": 0.00012279359430604982, "loss": 2.1377, "step": 4349 }, { "epoch": 1.9333333333333333, "grad_norm": 3.099202871322632, "learning_rate": 0.00012277580071174378, "loss": 2.5962, "step": 4350 }, { "epoch": 1.9337777777777778, "grad_norm": 0.9823408126831055, "learning_rate": 0.00012275800711743774, "loss": 1.15, "step": 4351 }, { "epoch": 1.934222222222222, "grad_norm": 1.5499428510665894, "learning_rate": 0.00012274021352313166, "loss": 1.9964, "step": 4352 }, { "epoch": 1.9346666666666668, "grad_norm": 1.5423551797866821, "learning_rate": 0.00012272241992882562, "loss": 2.1596, "step": 4353 }, { "epoch": 1.935111111111111, "grad_norm": 1.57822847366333, "learning_rate": 0.00012270462633451958, "loss": 2.2386, "step": 4354 }, { "epoch": 1.9355555555555557, "grad_norm": 1.5377904176712036, "learning_rate": 0.00012268683274021353, "loss": 2.0222, "step": 4355 }, { "epoch": 1.936, "grad_norm": 1.53287935256958, "learning_rate": 0.0001226690391459075, "loss": 1.9897, "step": 4356 }, { "epoch": 1.9364444444444444, "grad_norm": 1.51986563205719, "learning_rate": 0.00012265124555160145, "loss": 2.0826, "step": 4357 }, { "epoch": 1.9368888888888889, "grad_norm": 1.7066446542739868, "learning_rate": 0.00012263345195729538, "loss": 1.8589, "step": 4358 }, { "epoch": 1.9373333333333334, "grad_norm": 1.175493836402893, "learning_rate": 0.00012261565836298933, "loss": 1.0915, "step": 4359 }, { "epoch": 1.9377777777777778, "grad_norm": 1.7773221731185913, "learning_rate": 0.0001225978647686833, "loss": 1.8631, "step": 4360 }, { "epoch": 1.938222222222222, "grad_norm": 2.198300361633301, "learning_rate": 0.00012258007117437722, "loss": 2.6825, "step": 4361 }, { "epoch": 1.9386666666666668, "grad_norm": 1.8990510702133179, "learning_rate": 0.00012256227758007118, "loss": 2.8493, "step": 4362 }, { "epoch": 1.939111111111111, "grad_norm": 1.874263882637024, "learning_rate": 0.00012254448398576513, "loss": 2.588, "step": 4363 }, { "epoch": 1.9395555555555557, "grad_norm": 1.9859611988067627, "learning_rate": 0.0001225266903914591, "loss": 2.3982, "step": 4364 }, { "epoch": 1.94, "grad_norm": 1.6862070560455322, "learning_rate": 0.00012250889679715302, "loss": 2.0444, "step": 4365 }, { "epoch": 1.9404444444444444, "grad_norm": 1.7006193399429321, "learning_rate": 0.00012249110320284697, "loss": 1.6788, "step": 4366 }, { "epoch": 1.9408888888888889, "grad_norm": 1.6702203750610352, "learning_rate": 0.00012247330960854093, "loss": 2.0756, "step": 4367 }, { "epoch": 1.9413333333333334, "grad_norm": 2.062119483947754, "learning_rate": 0.0001224555160142349, "loss": 3.1842, "step": 4368 }, { "epoch": 1.9417777777777778, "grad_norm": 1.8628273010253906, "learning_rate": 0.00012243772241992884, "loss": 2.0465, "step": 4369 }, { "epoch": 1.942222222222222, "grad_norm": 1.9983272552490234, "learning_rate": 0.0001224199288256228, "loss": 1.8614, "step": 4370 }, { "epoch": 1.9426666666666668, "grad_norm": 1.8302124738693237, "learning_rate": 0.00012240213523131673, "loss": 2.2356, "step": 4371 }, { "epoch": 1.943111111111111, "grad_norm": 2.0615060329437256, "learning_rate": 0.00012238434163701069, "loss": 2.6518, "step": 4372 }, { "epoch": 1.9435555555555557, "grad_norm": 1.957165002822876, "learning_rate": 0.00012236654804270464, "loss": 2.5132, "step": 4373 }, { "epoch": 1.944, "grad_norm": 2.0981383323669434, "learning_rate": 0.00012234875444839857, "loss": 2.5369, "step": 4374 }, { "epoch": 1.9444444444444444, "grad_norm": 2.0047876834869385, "learning_rate": 0.00012233096085409253, "loss": 1.8372, "step": 4375 }, { "epoch": 1.944888888888889, "grad_norm": 1.8745818138122559, "learning_rate": 0.00012231316725978649, "loss": 2.0336, "step": 4376 }, { "epoch": 1.9453333333333334, "grad_norm": 1.9470521211624146, "learning_rate": 0.00012229537366548044, "loss": 2.0855, "step": 4377 }, { "epoch": 1.9457777777777778, "grad_norm": 1.992088794708252, "learning_rate": 0.00012227758007117437, "loss": 2.2125, "step": 4378 }, { "epoch": 1.946222222222222, "grad_norm": 1.8258486986160278, "learning_rate": 0.00012225978647686833, "loss": 1.8891, "step": 4379 }, { "epoch": 1.9466666666666668, "grad_norm": 1.850938081741333, "learning_rate": 0.00012224199288256228, "loss": 2.3489, "step": 4380 }, { "epoch": 1.947111111111111, "grad_norm": 2.0930421352386475, "learning_rate": 0.00012222419928825624, "loss": 2.173, "step": 4381 }, { "epoch": 1.9475555555555557, "grad_norm": 1.875168800354004, "learning_rate": 0.0001222064056939502, "loss": 1.7742, "step": 4382 }, { "epoch": 1.948, "grad_norm": 2.2607669830322266, "learning_rate": 0.00012218861209964413, "loss": 2.6387, "step": 4383 }, { "epoch": 1.9484444444444444, "grad_norm": 1.710083246231079, "learning_rate": 0.00012217081850533808, "loss": 2.132, "step": 4384 }, { "epoch": 1.948888888888889, "grad_norm": 2.2312233448028564, "learning_rate": 0.00012215302491103204, "loss": 1.6657, "step": 4385 }, { "epoch": 1.9493333333333334, "grad_norm": 2.3804073333740234, "learning_rate": 0.000122135231316726, "loss": 2.277, "step": 4386 }, { "epoch": 1.9497777777777778, "grad_norm": 2.0216658115386963, "learning_rate": 0.00012211743772241993, "loss": 2.4888, "step": 4387 }, { "epoch": 1.950222222222222, "grad_norm": 1.6631717681884766, "learning_rate": 0.00012209964412811388, "loss": 1.218, "step": 4388 }, { "epoch": 1.9506666666666668, "grad_norm": 2.17562198638916, "learning_rate": 0.00012208185053380784, "loss": 1.9006, "step": 4389 }, { "epoch": 1.951111111111111, "grad_norm": 1.9813545942306519, "learning_rate": 0.00012206405693950178, "loss": 1.9596, "step": 4390 }, { "epoch": 1.9515555555555557, "grad_norm": 1.9598819017410278, "learning_rate": 0.00012204626334519574, "loss": 2.5526, "step": 4391 }, { "epoch": 1.952, "grad_norm": 2.731766700744629, "learning_rate": 0.00012202846975088968, "loss": 2.2122, "step": 4392 }, { "epoch": 1.9524444444444444, "grad_norm": 2.2050271034240723, "learning_rate": 0.00012201067615658364, "loss": 2.17, "step": 4393 }, { "epoch": 1.952888888888889, "grad_norm": 2.014303684234619, "learning_rate": 0.0001219928825622776, "loss": 2.0293, "step": 4394 }, { "epoch": 1.9533333333333334, "grad_norm": 2.2613487243652344, "learning_rate": 0.00012197508896797154, "loss": 2.4478, "step": 4395 }, { "epoch": 1.9537777777777778, "grad_norm": 2.023850917816162, "learning_rate": 0.00012195729537366549, "loss": 1.7262, "step": 4396 }, { "epoch": 1.954222222222222, "grad_norm": 1.9727839231491089, "learning_rate": 0.00012193950177935945, "loss": 2.1561, "step": 4397 }, { "epoch": 1.9546666666666668, "grad_norm": 2.2552168369293213, "learning_rate": 0.00012192170818505339, "loss": 2.3178, "step": 4398 }, { "epoch": 1.955111111111111, "grad_norm": 2.353135108947754, "learning_rate": 0.00012190391459074735, "loss": 2.2246, "step": 4399 }, { "epoch": 1.9555555555555557, "grad_norm": 2.8717939853668213, "learning_rate": 0.00012188612099644128, "loss": 2.3769, "step": 4400 }, { "epoch": 1.956, "grad_norm": 1.117507815361023, "learning_rate": 0.00012186832740213523, "loss": 1.2481, "step": 4401 }, { "epoch": 1.9564444444444444, "grad_norm": 1.0877759456634521, "learning_rate": 0.00012185053380782918, "loss": 0.9556, "step": 4402 }, { "epoch": 1.956888888888889, "grad_norm": 1.1263483762741089, "learning_rate": 0.00012183274021352313, "loss": 1.1806, "step": 4403 }, { "epoch": 1.9573333333333334, "grad_norm": 1.5997364521026611, "learning_rate": 0.00012181494661921709, "loss": 2.2267, "step": 4404 }, { "epoch": 1.9577777777777778, "grad_norm": 1.4710240364074707, "learning_rate": 0.00012179715302491103, "loss": 2.4633, "step": 4405 }, { "epoch": 1.958222222222222, "grad_norm": 1.5954550504684448, "learning_rate": 0.00012177935943060499, "loss": 2.3169, "step": 4406 }, { "epoch": 1.9586666666666668, "grad_norm": 1.688092589378357, "learning_rate": 0.00012176156583629895, "loss": 2.6058, "step": 4407 }, { "epoch": 1.959111111111111, "grad_norm": 3.413558006286621, "learning_rate": 0.00012174377224199289, "loss": 1.9166, "step": 4408 }, { "epoch": 1.9595555555555557, "grad_norm": 1.7245512008666992, "learning_rate": 0.00012172597864768685, "loss": 2.4489, "step": 4409 }, { "epoch": 1.96, "grad_norm": 1.8946932554244995, "learning_rate": 0.0001217081850533808, "loss": 2.1216, "step": 4410 }, { "epoch": 1.9604444444444444, "grad_norm": 1.766736626625061, "learning_rate": 0.00012169039145907475, "loss": 2.4328, "step": 4411 }, { "epoch": 1.960888888888889, "grad_norm": 1.7106037139892578, "learning_rate": 0.00012167259786476868, "loss": 2.0669, "step": 4412 }, { "epoch": 1.9613333333333334, "grad_norm": 1.758056879043579, "learning_rate": 0.00012165480427046263, "loss": 2.306, "step": 4413 }, { "epoch": 1.9617777777777778, "grad_norm": 1.796230673789978, "learning_rate": 0.00012163701067615659, "loss": 2.3531, "step": 4414 }, { "epoch": 1.962222222222222, "grad_norm": 1.641489863395691, "learning_rate": 0.00012161921708185053, "loss": 1.9206, "step": 4415 }, { "epoch": 1.9626666666666668, "grad_norm": 1.558814287185669, "learning_rate": 0.00012160142348754449, "loss": 1.7989, "step": 4416 }, { "epoch": 1.963111111111111, "grad_norm": 1.724256992340088, "learning_rate": 0.00012158362989323844, "loss": 2.0509, "step": 4417 }, { "epoch": 1.9635555555555557, "grad_norm": 1.6848506927490234, "learning_rate": 0.00012156583629893239, "loss": 2.4329, "step": 4418 }, { "epoch": 1.964, "grad_norm": 1.874718427658081, "learning_rate": 0.00012154804270462634, "loss": 2.2301, "step": 4419 }, { "epoch": 1.9644444444444444, "grad_norm": 1.9206674098968506, "learning_rate": 0.0001215302491103203, "loss": 2.2543, "step": 4420 }, { "epoch": 1.964888888888889, "grad_norm": 1.9313912391662598, "learning_rate": 0.00012151245551601424, "loss": 2.0722, "step": 4421 }, { "epoch": 1.9653333333333334, "grad_norm": 2.0314974784851074, "learning_rate": 0.0001214946619217082, "loss": 2.2565, "step": 4422 }, { "epoch": 1.9657777777777778, "grad_norm": 1.886389970779419, "learning_rate": 0.00012147686832740214, "loss": 2.3392, "step": 4423 }, { "epoch": 1.966222222222222, "grad_norm": 2.023409843444824, "learning_rate": 0.0001214590747330961, "loss": 2.5448, "step": 4424 }, { "epoch": 1.9666666666666668, "grad_norm": 1.7405117750167847, "learning_rate": 0.00012144128113879003, "loss": 1.7628, "step": 4425 }, { "epoch": 1.967111111111111, "grad_norm": 1.9717170000076294, "learning_rate": 0.00012142348754448398, "loss": 2.027, "step": 4426 }, { "epoch": 1.9675555555555555, "grad_norm": 1.9732880592346191, "learning_rate": 0.00012140569395017794, "loss": 2.3524, "step": 4427 }, { "epoch": 1.968, "grad_norm": 1.9365227222442627, "learning_rate": 0.00012138790035587188, "loss": 1.9627, "step": 4428 }, { "epoch": 1.9684444444444444, "grad_norm": 2.359199285507202, "learning_rate": 0.00012137010676156584, "loss": 2.1723, "step": 4429 }, { "epoch": 1.968888888888889, "grad_norm": 2.102796792984009, "learning_rate": 0.00012135231316725978, "loss": 2.388, "step": 4430 }, { "epoch": 1.9693333333333334, "grad_norm": 2.1342241764068604, "learning_rate": 0.00012133451957295374, "loss": 2.6232, "step": 4431 }, { "epoch": 1.9697777777777778, "grad_norm": 2.0955238342285156, "learning_rate": 0.0001213167259786477, "loss": 2.0068, "step": 4432 }, { "epoch": 1.970222222222222, "grad_norm": 2.0554394721984863, "learning_rate": 0.00012129893238434164, "loss": 2.0677, "step": 4433 }, { "epoch": 1.9706666666666668, "grad_norm": 2.020298957824707, "learning_rate": 0.0001212811387900356, "loss": 1.9219, "step": 4434 }, { "epoch": 1.971111111111111, "grad_norm": 1.943292498588562, "learning_rate": 0.00012126334519572955, "loss": 2.1289, "step": 4435 }, { "epoch": 1.9715555555555555, "grad_norm": 1.9721516370773315, "learning_rate": 0.0001212455516014235, "loss": 1.7805, "step": 4436 }, { "epoch": 1.972, "grad_norm": 1.828384518623352, "learning_rate": 0.00012122775800711745, "loss": 1.8164, "step": 4437 }, { "epoch": 1.9724444444444444, "grad_norm": 2.0130600929260254, "learning_rate": 0.00012120996441281138, "loss": 1.8864, "step": 4438 }, { "epoch": 1.972888888888889, "grad_norm": 2.123643398284912, "learning_rate": 0.00012119217081850534, "loss": 1.9729, "step": 4439 }, { "epoch": 1.9733333333333334, "grad_norm": 2.224832534790039, "learning_rate": 0.00012117437722419928, "loss": 2.3063, "step": 4440 }, { "epoch": 1.9737777777777779, "grad_norm": 2.5760409832000732, "learning_rate": 0.00012115658362989324, "loss": 3.1176, "step": 4441 }, { "epoch": 1.974222222222222, "grad_norm": 1.936394214630127, "learning_rate": 0.0001211387900355872, "loss": 1.9176, "step": 4442 }, { "epoch": 1.9746666666666668, "grad_norm": 2.0854177474975586, "learning_rate": 0.00012112099644128114, "loss": 2.125, "step": 4443 }, { "epoch": 1.975111111111111, "grad_norm": 2.2257535457611084, "learning_rate": 0.00012110320284697509, "loss": 2.1138, "step": 4444 }, { "epoch": 1.9755555555555555, "grad_norm": 2.3724265098571777, "learning_rate": 0.00012108540925266905, "loss": 2.2497, "step": 4445 }, { "epoch": 1.976, "grad_norm": 1.9573043584823608, "learning_rate": 0.00012106761565836299, "loss": 1.7342, "step": 4446 }, { "epoch": 1.9764444444444444, "grad_norm": 2.2017226219177246, "learning_rate": 0.00012104982206405695, "loss": 2.292, "step": 4447 }, { "epoch": 1.976888888888889, "grad_norm": 2.3729429244995117, "learning_rate": 0.0001210320284697509, "loss": 2.853, "step": 4448 }, { "epoch": 1.9773333333333334, "grad_norm": 2.4285669326782227, "learning_rate": 0.00012101423487544485, "loss": 2.0607, "step": 4449 }, { "epoch": 1.9777777777777779, "grad_norm": 2.3270487785339355, "learning_rate": 0.0001209964412811388, "loss": 1.2632, "step": 4450 }, { "epoch": 1.978222222222222, "grad_norm": 1.3772460222244263, "learning_rate": 0.00012097864768683273, "loss": 2.7924, "step": 4451 }, { "epoch": 1.9786666666666668, "grad_norm": 1.451092004776001, "learning_rate": 0.00012096085409252669, "loss": 2.6418, "step": 4452 }, { "epoch": 1.979111111111111, "grad_norm": 1.4491418600082397, "learning_rate": 0.00012094306049822063, "loss": 2.5647, "step": 4453 }, { "epoch": 1.9795555555555555, "grad_norm": 1.4295765161514282, "learning_rate": 0.00012092526690391459, "loss": 2.092, "step": 4454 }, { "epoch": 1.98, "grad_norm": 1.1193150281906128, "learning_rate": 0.00012090747330960855, "loss": 0.9929, "step": 4455 }, { "epoch": 1.9804444444444445, "grad_norm": 1.6379578113555908, "learning_rate": 0.00012088967971530249, "loss": 2.621, "step": 4456 }, { "epoch": 1.980888888888889, "grad_norm": 1.7126623392105103, "learning_rate": 0.00012087188612099645, "loss": 1.7564, "step": 4457 }, { "epoch": 1.9813333333333332, "grad_norm": 1.670942783355713, "learning_rate": 0.0001208540925266904, "loss": 2.1591, "step": 4458 }, { "epoch": 1.9817777777777779, "grad_norm": 1.9480079412460327, "learning_rate": 0.00012083629893238435, "loss": 2.5495, "step": 4459 }, { "epoch": 1.982222222222222, "grad_norm": 1.5117992162704468, "learning_rate": 0.0001208185053380783, "loss": 2.1216, "step": 4460 }, { "epoch": 1.9826666666666668, "grad_norm": 1.5676404237747192, "learning_rate": 0.00012080071174377226, "loss": 2.2749, "step": 4461 }, { "epoch": 1.983111111111111, "grad_norm": 1.228750467300415, "learning_rate": 0.0001207829181494662, "loss": 0.9559, "step": 4462 }, { "epoch": 1.9835555555555555, "grad_norm": 1.8625293970108032, "learning_rate": 0.00012076512455516016, "loss": 2.1203, "step": 4463 }, { "epoch": 1.984, "grad_norm": 1.5544917583465576, "learning_rate": 0.00012074733096085409, "loss": 2.1986, "step": 4464 }, { "epoch": 1.9844444444444445, "grad_norm": 1.8329989910125732, "learning_rate": 0.00012072953736654804, "loss": 2.1676, "step": 4465 }, { "epoch": 1.984888888888889, "grad_norm": 1.6483327150344849, "learning_rate": 0.00012071174377224199, "loss": 2.383, "step": 4466 }, { "epoch": 1.9853333333333332, "grad_norm": 1.6952513456344604, "learning_rate": 0.00012069395017793594, "loss": 2.0415, "step": 4467 }, { "epoch": 1.9857777777777779, "grad_norm": 1.7458889484405518, "learning_rate": 0.0001206761565836299, "loss": 2.0539, "step": 4468 }, { "epoch": 1.9862222222222221, "grad_norm": 1.8751850128173828, "learning_rate": 0.00012065836298932384, "loss": 2.391, "step": 4469 }, { "epoch": 1.9866666666666668, "grad_norm": 1.6705282926559448, "learning_rate": 0.0001206405693950178, "loss": 1.8026, "step": 4470 }, { "epoch": 1.987111111111111, "grad_norm": 1.6303818225860596, "learning_rate": 0.00012062277580071176, "loss": 1.899, "step": 4471 }, { "epoch": 1.9875555555555555, "grad_norm": 1.7543621063232422, "learning_rate": 0.0001206049822064057, "loss": 1.9756, "step": 4472 }, { "epoch": 1.988, "grad_norm": 1.8945670127868652, "learning_rate": 0.00012058718861209966, "loss": 2.1054, "step": 4473 }, { "epoch": 1.9884444444444445, "grad_norm": 1.76500403881073, "learning_rate": 0.00012056939501779361, "loss": 2.2563, "step": 4474 }, { "epoch": 1.988888888888889, "grad_norm": 1.8483823537826538, "learning_rate": 0.00012055160142348755, "loss": 2.243, "step": 4475 }, { "epoch": 1.9893333333333332, "grad_norm": 1.7314825057983398, "learning_rate": 0.00012053380782918151, "loss": 2.4037, "step": 4476 }, { "epoch": 1.9897777777777779, "grad_norm": 1.545645833015442, "learning_rate": 0.00012051601423487544, "loss": 1.1406, "step": 4477 }, { "epoch": 1.9902222222222221, "grad_norm": 1.576292634010315, "learning_rate": 0.0001204982206405694, "loss": 1.7195, "step": 4478 }, { "epoch": 1.9906666666666668, "grad_norm": 1.6278116703033447, "learning_rate": 0.00012048042704626334, "loss": 1.8675, "step": 4479 }, { "epoch": 1.991111111111111, "grad_norm": 1.6416962146759033, "learning_rate": 0.0001204626334519573, "loss": 1.9954, "step": 4480 }, { "epoch": 1.9915555555555555, "grad_norm": 1.7250615358352661, "learning_rate": 0.00012044483985765125, "loss": 2.2059, "step": 4481 }, { "epoch": 1.992, "grad_norm": 1.973854899406433, "learning_rate": 0.0001204270462633452, "loss": 2.4063, "step": 4482 }, { "epoch": 1.9924444444444445, "grad_norm": 2.003932476043701, "learning_rate": 0.00012040925266903915, "loss": 1.9116, "step": 4483 }, { "epoch": 1.992888888888889, "grad_norm": 1.8206032514572144, "learning_rate": 0.00012039145907473311, "loss": 1.9458, "step": 4484 }, { "epoch": 1.9933333333333332, "grad_norm": 2.3211958408355713, "learning_rate": 0.00012037366548042705, "loss": 2.565, "step": 4485 }, { "epoch": 1.9937777777777779, "grad_norm": 2.1076741218566895, "learning_rate": 0.00012035587188612101, "loss": 2.1175, "step": 4486 }, { "epoch": 1.9942222222222221, "grad_norm": 1.6778833866119385, "learning_rate": 0.00012033807829181497, "loss": 1.5964, "step": 4487 }, { "epoch": 1.9946666666666668, "grad_norm": 2.298328399658203, "learning_rate": 0.00012032028469750891, "loss": 2.1779, "step": 4488 }, { "epoch": 1.995111111111111, "grad_norm": 1.8983428478240967, "learning_rate": 0.00012030249110320286, "loss": 2.0109, "step": 4489 }, { "epoch": 1.9955555555555555, "grad_norm": 1.9424333572387695, "learning_rate": 0.0001202846975088968, "loss": 1.6167, "step": 4490 }, { "epoch": 1.996, "grad_norm": 1.9415746927261353, "learning_rate": 0.00012026690391459075, "loss": 1.7478, "step": 4491 }, { "epoch": 1.9964444444444445, "grad_norm": 1.8782165050506592, "learning_rate": 0.0001202491103202847, "loss": 2.08, "step": 4492 }, { "epoch": 1.996888888888889, "grad_norm": 2.077409267425537, "learning_rate": 0.00012023131672597865, "loss": 1.9588, "step": 4493 }, { "epoch": 1.9973333333333332, "grad_norm": 2.3767199516296387, "learning_rate": 0.0001202135231316726, "loss": 2.4939, "step": 4494 }, { "epoch": 1.9977777777777779, "grad_norm": 2.2348458766937256, "learning_rate": 0.00012019572953736655, "loss": 2.3313, "step": 4495 }, { "epoch": 1.9982222222222221, "grad_norm": 2.509856700897217, "learning_rate": 0.0001201779359430605, "loss": 2.5201, "step": 4496 }, { "epoch": 1.9986666666666668, "grad_norm": 2.5316574573516846, "learning_rate": 0.00012016014234875446, "loss": 2.3262, "step": 4497 }, { "epoch": 1.999111111111111, "grad_norm": 2.4037058353424072, "learning_rate": 0.0001201423487544484, "loss": 1.8568, "step": 4498 }, { "epoch": 1.9995555555555555, "grad_norm": 2.8537352085113525, "learning_rate": 0.00012012455516014236, "loss": 2.8726, "step": 4499 }, { "epoch": 2.0, "grad_norm": 3.3729751110076904, "learning_rate": 0.00012010676156583632, "loss": 2.0103, "step": 4500 }, { "epoch": 2.0, "eval_loss": 2.4265496730804443, "eval_runtime": 47.2809, "eval_samples_per_second": 10.575, "eval_steps_per_second": 10.575, "step": 4500 }, { "epoch": 2.0004444444444442, "grad_norm": 0.8853614926338196, "learning_rate": 0.00012008896797153026, "loss": 0.8435, "step": 4501 }, { "epoch": 2.000888888888889, "grad_norm": 1.3319120407104492, "learning_rate": 0.00012007117437722422, "loss": 2.0388, "step": 4502 }, { "epoch": 2.001333333333333, "grad_norm": 1.6479343175888062, "learning_rate": 0.00012005338078291815, "loss": 1.9234, "step": 4503 }, { "epoch": 2.001777777777778, "grad_norm": 1.6244651079177856, "learning_rate": 0.0001200355871886121, "loss": 2.0868, "step": 4504 }, { "epoch": 2.002222222222222, "grad_norm": 1.5395774841308594, "learning_rate": 0.00012001779359430605, "loss": 2.1305, "step": 4505 }, { "epoch": 2.002666666666667, "grad_norm": 1.7427664995193481, "learning_rate": 0.00012, "loss": 2.3715, "step": 4506 }, { "epoch": 2.003111111111111, "grad_norm": 1.600896954536438, "learning_rate": 0.00011998220640569396, "loss": 1.9859, "step": 4507 }, { "epoch": 2.0035555555555558, "grad_norm": 1.6297886371612549, "learning_rate": 0.0001199644128113879, "loss": 1.4048, "step": 4508 }, { "epoch": 2.004, "grad_norm": 1.7642384767532349, "learning_rate": 0.00011994661921708186, "loss": 2.2935, "step": 4509 }, { "epoch": 2.0044444444444443, "grad_norm": 1.5079247951507568, "learning_rate": 0.00011992882562277582, "loss": 1.7682, "step": 4510 }, { "epoch": 2.004888888888889, "grad_norm": 1.8100050687789917, "learning_rate": 0.00011991103202846976, "loss": 1.9194, "step": 4511 }, { "epoch": 2.005333333333333, "grad_norm": 1.9946025609970093, "learning_rate": 0.00011989323843416371, "loss": 1.9621, "step": 4512 }, { "epoch": 2.005777777777778, "grad_norm": 1.6905927658081055, "learning_rate": 0.00011987544483985766, "loss": 2.0369, "step": 4513 }, { "epoch": 2.006222222222222, "grad_norm": 1.6684443950653076, "learning_rate": 0.00011985765124555161, "loss": 1.8459, "step": 4514 }, { "epoch": 2.006666666666667, "grad_norm": 2.0483903884887695, "learning_rate": 0.00011983985765124557, "loss": 1.9101, "step": 4515 }, { "epoch": 2.007111111111111, "grad_norm": 2.2682251930236816, "learning_rate": 0.0001198220640569395, "loss": 1.5367, "step": 4516 }, { "epoch": 2.0075555555555558, "grad_norm": 1.442750096321106, "learning_rate": 0.00011980427046263346, "loss": 0.6781, "step": 4517 }, { "epoch": 2.008, "grad_norm": 2.0190680027008057, "learning_rate": 0.0001197864768683274, "loss": 1.9999, "step": 4518 }, { "epoch": 2.0084444444444443, "grad_norm": 2.084582567214966, "learning_rate": 0.00011976868327402136, "loss": 2.1547, "step": 4519 }, { "epoch": 2.008888888888889, "grad_norm": 2.291092872619629, "learning_rate": 0.0001197508896797153, "loss": 1.982, "step": 4520 }, { "epoch": 2.009333333333333, "grad_norm": 2.4098877906799316, "learning_rate": 0.00011973309608540926, "loss": 1.9975, "step": 4521 }, { "epoch": 2.009777777777778, "grad_norm": 2.174729585647583, "learning_rate": 0.00011971530249110321, "loss": 1.8608, "step": 4522 }, { "epoch": 2.010222222222222, "grad_norm": 2.5682456493377686, "learning_rate": 0.00011969750889679716, "loss": 1.8649, "step": 4523 }, { "epoch": 2.010666666666667, "grad_norm": 1.9961142539978027, "learning_rate": 0.00011967971530249111, "loss": 1.5078, "step": 4524 }, { "epoch": 2.011111111111111, "grad_norm": 2.4178638458251953, "learning_rate": 0.00011966192170818507, "loss": 1.7468, "step": 4525 }, { "epoch": 2.0115555555555558, "grad_norm": 2.30759334564209, "learning_rate": 0.00011964412811387901, "loss": 1.8143, "step": 4526 }, { "epoch": 2.012, "grad_norm": 1.8575639724731445, "learning_rate": 0.00011962633451957297, "loss": 1.2505, "step": 4527 }, { "epoch": 2.0124444444444443, "grad_norm": 2.4838311672210693, "learning_rate": 0.0001196085409252669, "loss": 1.9343, "step": 4528 }, { "epoch": 2.012888888888889, "grad_norm": 2.2968297004699707, "learning_rate": 0.00011959074733096085, "loss": 1.8051, "step": 4529 }, { "epoch": 2.013333333333333, "grad_norm": 2.502021074295044, "learning_rate": 0.0001195729537366548, "loss": 1.5853, "step": 4530 }, { "epoch": 2.013777777777778, "grad_norm": 1.7160247564315796, "learning_rate": 0.00011955516014234875, "loss": 0.898, "step": 4531 }, { "epoch": 2.014222222222222, "grad_norm": 2.109179973602295, "learning_rate": 0.00011953736654804271, "loss": 1.3351, "step": 4532 }, { "epoch": 2.014666666666667, "grad_norm": 2.4689619541168213, "learning_rate": 0.00011951957295373665, "loss": 1.8757, "step": 4533 }, { "epoch": 2.015111111111111, "grad_norm": 3.084038257598877, "learning_rate": 0.00011950177935943061, "loss": 2.2736, "step": 4534 }, { "epoch": 2.0155555555555558, "grad_norm": 3.0082006454467773, "learning_rate": 0.00011948398576512457, "loss": 2.0318, "step": 4535 }, { "epoch": 2.016, "grad_norm": 2.722201108932495, "learning_rate": 0.00011946619217081851, "loss": 1.7501, "step": 4536 }, { "epoch": 2.0164444444444443, "grad_norm": 2.107433795928955, "learning_rate": 0.00011944839857651246, "loss": 1.4507, "step": 4537 }, { "epoch": 2.016888888888889, "grad_norm": 2.8475327491760254, "learning_rate": 0.00011943060498220642, "loss": 1.8524, "step": 4538 }, { "epoch": 2.017333333333333, "grad_norm": 2.276287078857422, "learning_rate": 0.00011941281138790036, "loss": 1.5575, "step": 4539 }, { "epoch": 2.017777777777778, "grad_norm": 2.9798543453216553, "learning_rate": 0.00011939501779359432, "loss": 1.8226, "step": 4540 }, { "epoch": 2.018222222222222, "grad_norm": 2.7004785537719727, "learning_rate": 0.00011937722419928825, "loss": 1.6149, "step": 4541 }, { "epoch": 2.018666666666667, "grad_norm": 2.5577023029327393, "learning_rate": 0.0001193594306049822, "loss": 1.6913, "step": 4542 }, { "epoch": 2.019111111111111, "grad_norm": 2.958678960800171, "learning_rate": 0.00011934163701067615, "loss": 1.9823, "step": 4543 }, { "epoch": 2.0195555555555558, "grad_norm": 2.862009048461914, "learning_rate": 0.0001193238434163701, "loss": 2.1328, "step": 4544 }, { "epoch": 2.02, "grad_norm": 2.533651113510132, "learning_rate": 0.00011930604982206406, "loss": 1.2886, "step": 4545 }, { "epoch": 2.0204444444444443, "grad_norm": 2.7307307720184326, "learning_rate": 0.000119288256227758, "loss": 1.4595, "step": 4546 }, { "epoch": 2.020888888888889, "grad_norm": 2.3221077919006348, "learning_rate": 0.00011927046263345196, "loss": 1.4627, "step": 4547 }, { "epoch": 2.021333333333333, "grad_norm": 2.621258497238159, "learning_rate": 0.00011925266903914592, "loss": 1.9821, "step": 4548 }, { "epoch": 2.021777777777778, "grad_norm": 3.0992743968963623, "learning_rate": 0.00011923487544483986, "loss": 1.9199, "step": 4549 }, { "epoch": 2.022222222222222, "grad_norm": 2.962254524230957, "learning_rate": 0.00011921708185053382, "loss": 1.2242, "step": 4550 }, { "epoch": 2.022666666666667, "grad_norm": 1.6041865348815918, "learning_rate": 0.00011919928825622777, "loss": 1.3209, "step": 4551 }, { "epoch": 2.023111111111111, "grad_norm": 1.467262864112854, "learning_rate": 0.00011918149466192172, "loss": 1.234, "step": 4552 }, { "epoch": 2.0235555555555558, "grad_norm": 1.7941569089889526, "learning_rate": 0.00011916370106761567, "loss": 2.2728, "step": 4553 }, { "epoch": 2.024, "grad_norm": 1.5521703958511353, "learning_rate": 0.0001191459074733096, "loss": 1.8525, "step": 4554 }, { "epoch": 2.0244444444444443, "grad_norm": 1.894801378250122, "learning_rate": 0.00011912811387900356, "loss": 2.1562, "step": 4555 }, { "epoch": 2.024888888888889, "grad_norm": 1.414631962776184, "learning_rate": 0.0001191103202846975, "loss": 0.9182, "step": 4556 }, { "epoch": 2.025333333333333, "grad_norm": 2.034543752670288, "learning_rate": 0.00011909252669039146, "loss": 2.1913, "step": 4557 }, { "epoch": 2.025777777777778, "grad_norm": 1.8976682424545288, "learning_rate": 0.00011907473309608542, "loss": 2.0936, "step": 4558 }, { "epoch": 2.026222222222222, "grad_norm": 1.6398429870605469, "learning_rate": 0.00011905693950177936, "loss": 1.7521, "step": 4559 }, { "epoch": 2.026666666666667, "grad_norm": 1.8975718021392822, "learning_rate": 0.00011903914590747332, "loss": 1.6575, "step": 4560 }, { "epoch": 2.027111111111111, "grad_norm": 1.5204815864562988, "learning_rate": 0.00011902135231316727, "loss": 0.9841, "step": 4561 }, { "epoch": 2.0275555555555558, "grad_norm": 1.710460901260376, "learning_rate": 0.00011900355871886121, "loss": 1.4803, "step": 4562 }, { "epoch": 2.028, "grad_norm": 1.8635836839675903, "learning_rate": 0.00011898576512455517, "loss": 2.0793, "step": 4563 }, { "epoch": 2.0284444444444443, "grad_norm": 1.871050238609314, "learning_rate": 0.00011896797153024913, "loss": 1.0645, "step": 4564 }, { "epoch": 2.028888888888889, "grad_norm": 2.071890115737915, "learning_rate": 0.00011895017793594307, "loss": 1.9001, "step": 4565 }, { "epoch": 2.029333333333333, "grad_norm": 2.0963134765625, "learning_rate": 0.00011893238434163703, "loss": 2.1881, "step": 4566 }, { "epoch": 2.029777777777778, "grad_norm": 1.9067516326904297, "learning_rate": 0.00011891459074733096, "loss": 1.7577, "step": 4567 }, { "epoch": 2.030222222222222, "grad_norm": 2.036006212234497, "learning_rate": 0.00011889679715302491, "loss": 2.1429, "step": 4568 }, { "epoch": 2.030666666666667, "grad_norm": 2.4452297687530518, "learning_rate": 0.00011887900355871886, "loss": 2.1539, "step": 4569 }, { "epoch": 2.031111111111111, "grad_norm": 2.6038591861724854, "learning_rate": 0.00011886120996441281, "loss": 2.0575, "step": 4570 }, { "epoch": 2.0315555555555553, "grad_norm": 1.8687160015106201, "learning_rate": 0.00011884341637010677, "loss": 1.633, "step": 4571 }, { "epoch": 2.032, "grad_norm": 2.6096248626708984, "learning_rate": 0.00011882562277580071, "loss": 1.9688, "step": 4572 }, { "epoch": 2.0324444444444443, "grad_norm": 1.9733986854553223, "learning_rate": 0.00011880782918149467, "loss": 2.0018, "step": 4573 }, { "epoch": 2.032888888888889, "grad_norm": 2.4421169757843018, "learning_rate": 0.00011879003558718862, "loss": 1.9457, "step": 4574 }, { "epoch": 2.033333333333333, "grad_norm": 2.1608901023864746, "learning_rate": 0.00011877224199288257, "loss": 1.5918, "step": 4575 }, { "epoch": 2.033777777777778, "grad_norm": 1.7584596872329712, "learning_rate": 0.00011875444839857652, "loss": 1.5401, "step": 4576 }, { "epoch": 2.034222222222222, "grad_norm": 3.489712953567505, "learning_rate": 0.00011873665480427048, "loss": 0.9426, "step": 4577 }, { "epoch": 2.034666666666667, "grad_norm": 2.114403486251831, "learning_rate": 0.00011871886120996442, "loss": 1.8606, "step": 4578 }, { "epoch": 2.035111111111111, "grad_norm": 2.2650794982910156, "learning_rate": 0.00011870106761565838, "loss": 1.6157, "step": 4579 }, { "epoch": 2.0355555555555553, "grad_norm": 2.404672145843506, "learning_rate": 0.00011868327402135231, "loss": 1.9152, "step": 4580 }, { "epoch": 2.036, "grad_norm": 2.237191677093506, "learning_rate": 0.00011866548042704627, "loss": 1.8395, "step": 4581 }, { "epoch": 2.0364444444444443, "grad_norm": 2.234955072402954, "learning_rate": 0.00011864768683274021, "loss": 1.7004, "step": 4582 }, { "epoch": 2.036888888888889, "grad_norm": 1.999866247177124, "learning_rate": 0.00011862989323843417, "loss": 1.33, "step": 4583 }, { "epoch": 2.037333333333333, "grad_norm": 2.5568530559539795, "learning_rate": 0.00011861209964412812, "loss": 1.7804, "step": 4584 }, { "epoch": 2.037777777777778, "grad_norm": 2.2619681358337402, "learning_rate": 0.00011859430604982206, "loss": 1.7402, "step": 4585 }, { "epoch": 2.038222222222222, "grad_norm": 2.6120476722717285, "learning_rate": 0.00011857651245551602, "loss": 1.6572, "step": 4586 }, { "epoch": 2.038666666666667, "grad_norm": 2.367854595184326, "learning_rate": 0.00011855871886120998, "loss": 1.3876, "step": 4587 }, { "epoch": 2.039111111111111, "grad_norm": 2.1282546520233154, "learning_rate": 0.00011854092526690392, "loss": 1.5421, "step": 4588 }, { "epoch": 2.0395555555555553, "grad_norm": 2.2529513835906982, "learning_rate": 0.00011852313167259788, "loss": 1.2083, "step": 4589 }, { "epoch": 2.04, "grad_norm": 2.7248787879943848, "learning_rate": 0.00011850533807829183, "loss": 1.6816, "step": 4590 }, { "epoch": 2.0404444444444443, "grad_norm": 3.0865466594696045, "learning_rate": 0.00011848754448398578, "loss": 1.5498, "step": 4591 }, { "epoch": 2.040888888888889, "grad_norm": 3.441319465637207, "learning_rate": 0.00011846975088967973, "loss": 1.5423, "step": 4592 }, { "epoch": 2.041333333333333, "grad_norm": 2.960602283477783, "learning_rate": 0.00011845195729537366, "loss": 1.7773, "step": 4593 }, { "epoch": 2.041777777777778, "grad_norm": 3.269716501235962, "learning_rate": 0.00011843416370106762, "loss": 2.1755, "step": 4594 }, { "epoch": 2.042222222222222, "grad_norm": 3.3861782550811768, "learning_rate": 0.00011841637010676156, "loss": 1.8699, "step": 4595 }, { "epoch": 2.042666666666667, "grad_norm": 2.951418399810791, "learning_rate": 0.00011839857651245552, "loss": 1.9355, "step": 4596 }, { "epoch": 2.043111111111111, "grad_norm": 2.9961771965026855, "learning_rate": 0.00011838078291814948, "loss": 2.1783, "step": 4597 }, { "epoch": 2.0435555555555553, "grad_norm": 2.833582639694214, "learning_rate": 0.00011836298932384342, "loss": 1.8636, "step": 4598 }, { "epoch": 2.044, "grad_norm": 3.0772078037261963, "learning_rate": 0.00011834519572953737, "loss": 1.4656, "step": 4599 }, { "epoch": 2.0444444444444443, "grad_norm": 3.595390558242798, "learning_rate": 0.00011832740213523133, "loss": 1.5303, "step": 4600 }, { "epoch": 2.044888888888889, "grad_norm": 1.3135249614715576, "learning_rate": 0.00011830960854092527, "loss": 1.0165, "step": 4601 }, { "epoch": 2.0453333333333332, "grad_norm": 1.4123620986938477, "learning_rate": 0.00011829181494661923, "loss": 1.1516, "step": 4602 }, { "epoch": 2.045777777777778, "grad_norm": 2.0700747966766357, "learning_rate": 0.00011827402135231317, "loss": 1.506, "step": 4603 }, { "epoch": 2.046222222222222, "grad_norm": 1.950496792793274, "learning_rate": 0.00011825622775800713, "loss": 1.8448, "step": 4604 }, { "epoch": 2.046666666666667, "grad_norm": 2.0276780128479004, "learning_rate": 0.00011823843416370109, "loss": 1.9754, "step": 4605 }, { "epoch": 2.047111111111111, "grad_norm": 2.1765434741973877, "learning_rate": 0.00011822064056939502, "loss": 2.5343, "step": 4606 }, { "epoch": 2.0475555555555554, "grad_norm": 1.9355638027191162, "learning_rate": 0.00011820284697508896, "loss": 2.0598, "step": 4607 }, { "epoch": 2.048, "grad_norm": 2.012378215789795, "learning_rate": 0.00011818505338078292, "loss": 1.7933, "step": 4608 }, { "epoch": 2.0484444444444443, "grad_norm": 2.179774045944214, "learning_rate": 0.00011816725978647687, "loss": 2.1403, "step": 4609 }, { "epoch": 2.048888888888889, "grad_norm": 2.4090864658355713, "learning_rate": 0.00011814946619217081, "loss": 1.648, "step": 4610 }, { "epoch": 2.0493333333333332, "grad_norm": 2.0735795497894287, "learning_rate": 0.00011813167259786477, "loss": 1.7304, "step": 4611 }, { "epoch": 2.049777777777778, "grad_norm": 2.167476177215576, "learning_rate": 0.00011811387900355873, "loss": 1.7997, "step": 4612 }, { "epoch": 2.050222222222222, "grad_norm": 1.9912680387496948, "learning_rate": 0.00011809608540925267, "loss": 1.4525, "step": 4613 }, { "epoch": 2.050666666666667, "grad_norm": 2.363731622695923, "learning_rate": 0.00011807829181494663, "loss": 2.1657, "step": 4614 }, { "epoch": 2.051111111111111, "grad_norm": 1.7121641635894775, "learning_rate": 0.00011806049822064058, "loss": 1.3718, "step": 4615 }, { "epoch": 2.0515555555555554, "grad_norm": 1.7621139287948608, "learning_rate": 0.00011804270462633453, "loss": 1.3004, "step": 4616 }, { "epoch": 2.052, "grad_norm": 2.833408832550049, "learning_rate": 0.00011802491103202848, "loss": 1.7775, "step": 4617 }, { "epoch": 2.0524444444444443, "grad_norm": 1.8854105472564697, "learning_rate": 0.00011800711743772244, "loss": 1.608, "step": 4618 }, { "epoch": 2.052888888888889, "grad_norm": 2.169327974319458, "learning_rate": 0.00011798932384341637, "loss": 1.9171, "step": 4619 }, { "epoch": 2.0533333333333332, "grad_norm": 2.554474115371704, "learning_rate": 0.00011797153024911031, "loss": 1.3938, "step": 4620 }, { "epoch": 2.053777777777778, "grad_norm": 1.969563364982605, "learning_rate": 0.00011795373665480427, "loss": 1.7774, "step": 4621 }, { "epoch": 2.054222222222222, "grad_norm": 1.854287028312683, "learning_rate": 0.00011793594306049822, "loss": 1.8198, "step": 4622 }, { "epoch": 2.054666666666667, "grad_norm": 2.1573734283447266, "learning_rate": 0.00011791814946619217, "loss": 1.7923, "step": 4623 }, { "epoch": 2.055111111111111, "grad_norm": 1.8452686071395874, "learning_rate": 0.00011790035587188612, "loss": 1.2323, "step": 4624 }, { "epoch": 2.0555555555555554, "grad_norm": 2.0869951248168945, "learning_rate": 0.00011788256227758008, "loss": 1.9431, "step": 4625 }, { "epoch": 2.056, "grad_norm": 1.9938613176345825, "learning_rate": 0.00011786476868327402, "loss": 1.5775, "step": 4626 }, { "epoch": 2.0564444444444443, "grad_norm": 2.0701723098754883, "learning_rate": 0.00011784697508896798, "loss": 1.465, "step": 4627 }, { "epoch": 2.056888888888889, "grad_norm": 2.5349276065826416, "learning_rate": 0.00011782918149466194, "loss": 2.0339, "step": 4628 }, { "epoch": 2.0573333333333332, "grad_norm": 1.9578278064727783, "learning_rate": 0.00011781138790035588, "loss": 1.8306, "step": 4629 }, { "epoch": 2.057777777777778, "grad_norm": 2.2757606506347656, "learning_rate": 0.00011779359430604984, "loss": 1.5594, "step": 4630 }, { "epoch": 2.058222222222222, "grad_norm": 2.5828166007995605, "learning_rate": 0.00011777580071174379, "loss": 2.4208, "step": 4631 }, { "epoch": 2.058666666666667, "grad_norm": 2.063826084136963, "learning_rate": 0.00011775800711743772, "loss": 1.3296, "step": 4632 }, { "epoch": 2.059111111111111, "grad_norm": 2.9016664028167725, "learning_rate": 0.00011774021352313167, "loss": 2.1661, "step": 4633 }, { "epoch": 2.0595555555555554, "grad_norm": 2.3844025135040283, "learning_rate": 0.00011772241992882562, "loss": 1.6781, "step": 4634 }, { "epoch": 2.06, "grad_norm": 1.9740389585494995, "learning_rate": 0.00011770462633451958, "loss": 1.4587, "step": 4635 }, { "epoch": 2.0604444444444443, "grad_norm": 2.022944688796997, "learning_rate": 0.00011768683274021352, "loss": 1.5967, "step": 4636 }, { "epoch": 2.060888888888889, "grad_norm": 2.8368794918060303, "learning_rate": 0.00011766903914590748, "loss": 1.9038, "step": 4637 }, { "epoch": 2.0613333333333332, "grad_norm": 2.7894749641418457, "learning_rate": 0.00011765124555160143, "loss": 1.5072, "step": 4638 }, { "epoch": 2.061777777777778, "grad_norm": 2.564565896987915, "learning_rate": 0.00011763345195729538, "loss": 1.4213, "step": 4639 }, { "epoch": 2.062222222222222, "grad_norm": 2.5650315284729004, "learning_rate": 0.00011761565836298933, "loss": 1.6088, "step": 4640 }, { "epoch": 2.062666666666667, "grad_norm": 1.4882378578186035, "learning_rate": 0.00011759786476868329, "loss": 0.7152, "step": 4641 }, { "epoch": 2.063111111111111, "grad_norm": 3.052170991897583, "learning_rate": 0.00011758007117437723, "loss": 1.7892, "step": 4642 }, { "epoch": 2.0635555555555554, "grad_norm": 3.016031265258789, "learning_rate": 0.00011756227758007119, "loss": 2.0574, "step": 4643 }, { "epoch": 2.064, "grad_norm": 2.477534532546997, "learning_rate": 0.00011754448398576512, "loss": 1.4391, "step": 4644 }, { "epoch": 2.0644444444444443, "grad_norm": 2.703592300415039, "learning_rate": 0.00011752669039145908, "loss": 1.782, "step": 4645 }, { "epoch": 2.064888888888889, "grad_norm": 2.8277475833892822, "learning_rate": 0.00011750889679715302, "loss": 2.0831, "step": 4646 }, { "epoch": 2.0653333333333332, "grad_norm": 2.78928542137146, "learning_rate": 0.00011749110320284697, "loss": 1.3444, "step": 4647 }, { "epoch": 2.065777777777778, "grad_norm": 2.8295578956604004, "learning_rate": 0.00011747330960854093, "loss": 1.4441, "step": 4648 }, { "epoch": 2.066222222222222, "grad_norm": 1.9804134368896484, "learning_rate": 0.00011745551601423487, "loss": 0.8381, "step": 4649 }, { "epoch": 2.066666666666667, "grad_norm": 3.729863166809082, "learning_rate": 0.00011743772241992883, "loss": 1.7128, "step": 4650 }, { "epoch": 2.067111111111111, "grad_norm": 2.222588062286377, "learning_rate": 0.00011741992882562279, "loss": 0.9648, "step": 4651 }, { "epoch": 2.0675555555555554, "grad_norm": 1.8874247074127197, "learning_rate": 0.00011740213523131673, "loss": 1.9391, "step": 4652 }, { "epoch": 2.068, "grad_norm": 1.757788062095642, "learning_rate": 0.00011738434163701069, "loss": 1.8273, "step": 4653 }, { "epoch": 2.0684444444444443, "grad_norm": 2.6937432289123535, "learning_rate": 0.00011736654804270464, "loss": 2.0317, "step": 4654 }, { "epoch": 2.068888888888889, "grad_norm": 2.1685352325439453, "learning_rate": 0.00011734875444839859, "loss": 1.957, "step": 4655 }, { "epoch": 2.0693333333333332, "grad_norm": 1.865443229675293, "learning_rate": 0.00011733096085409254, "loss": 1.8245, "step": 4656 }, { "epoch": 2.069777777777778, "grad_norm": 1.8847582340240479, "learning_rate": 0.00011731316725978647, "loss": 2.0589, "step": 4657 }, { "epoch": 2.070222222222222, "grad_norm": 2.063577890396118, "learning_rate": 0.00011729537366548043, "loss": 1.9999, "step": 4658 }, { "epoch": 2.070666666666667, "grad_norm": 2.3595352172851562, "learning_rate": 0.00011727758007117437, "loss": 1.2466, "step": 4659 }, { "epoch": 2.071111111111111, "grad_norm": 1.9096697568893433, "learning_rate": 0.00011725978647686833, "loss": 1.5026, "step": 4660 }, { "epoch": 2.0715555555555554, "grad_norm": 2.064755439758301, "learning_rate": 0.00011724199288256228, "loss": 1.7534, "step": 4661 }, { "epoch": 2.072, "grad_norm": 1.523712158203125, "learning_rate": 0.00011722419928825623, "loss": 1.2692, "step": 4662 }, { "epoch": 2.0724444444444443, "grad_norm": 2.223828077316284, "learning_rate": 0.00011720640569395018, "loss": 1.8284, "step": 4663 }, { "epoch": 2.072888888888889, "grad_norm": 2.1838390827178955, "learning_rate": 0.00011718861209964414, "loss": 1.6618, "step": 4664 }, { "epoch": 2.0733333333333333, "grad_norm": 1.8496938943862915, "learning_rate": 0.00011717081850533808, "loss": 1.9343, "step": 4665 }, { "epoch": 2.073777777777778, "grad_norm": 2.063234806060791, "learning_rate": 0.00011715302491103204, "loss": 1.4225, "step": 4666 }, { "epoch": 2.074222222222222, "grad_norm": 2.388313055038452, "learning_rate": 0.000117135231316726, "loss": 1.5016, "step": 4667 }, { "epoch": 2.074666666666667, "grad_norm": 2.2325823307037354, "learning_rate": 0.00011711743772241994, "loss": 1.7935, "step": 4668 }, { "epoch": 2.075111111111111, "grad_norm": 1.600996732711792, "learning_rate": 0.0001170996441281139, "loss": 1.0136, "step": 4669 }, { "epoch": 2.0755555555555554, "grad_norm": 2.17900013923645, "learning_rate": 0.00011708185053380783, "loss": 1.8612, "step": 4670 }, { "epoch": 2.076, "grad_norm": 2.1239895820617676, "learning_rate": 0.00011706405693950178, "loss": 2.083, "step": 4671 }, { "epoch": 2.0764444444444443, "grad_norm": 1.9274380207061768, "learning_rate": 0.00011704626334519572, "loss": 1.789, "step": 4672 }, { "epoch": 2.076888888888889, "grad_norm": 2.4279444217681885, "learning_rate": 0.00011702846975088968, "loss": 2.0962, "step": 4673 }, { "epoch": 2.0773333333333333, "grad_norm": 2.049731969833374, "learning_rate": 0.00011701067615658364, "loss": 1.529, "step": 4674 }, { "epoch": 2.077777777777778, "grad_norm": 2.452834367752075, "learning_rate": 0.00011699288256227758, "loss": 1.5339, "step": 4675 }, { "epoch": 2.078222222222222, "grad_norm": 2.3302857875823975, "learning_rate": 0.00011697508896797154, "loss": 1.8732, "step": 4676 }, { "epoch": 2.078666666666667, "grad_norm": 2.1098620891571045, "learning_rate": 0.0001169572953736655, "loss": 1.4732, "step": 4677 }, { "epoch": 2.079111111111111, "grad_norm": 2.499377489089966, "learning_rate": 0.00011693950177935944, "loss": 1.5989, "step": 4678 }, { "epoch": 2.0795555555555554, "grad_norm": 2.321789503097534, "learning_rate": 0.00011692170818505339, "loss": 1.9255, "step": 4679 }, { "epoch": 2.08, "grad_norm": 1.7443134784698486, "learning_rate": 0.00011690391459074735, "loss": 0.6187, "step": 4680 }, { "epoch": 2.0804444444444443, "grad_norm": 2.2512123584747314, "learning_rate": 0.00011688612099644129, "loss": 1.8004, "step": 4681 }, { "epoch": 2.080888888888889, "grad_norm": 2.09832763671875, "learning_rate": 0.00011686832740213525, "loss": 1.3556, "step": 4682 }, { "epoch": 2.0813333333333333, "grad_norm": 2.038501262664795, "learning_rate": 0.00011685053380782918, "loss": 1.1602, "step": 4683 }, { "epoch": 2.081777777777778, "grad_norm": 2.386502504348755, "learning_rate": 0.00011683274021352313, "loss": 1.5816, "step": 4684 }, { "epoch": 2.082222222222222, "grad_norm": 2.359687328338623, "learning_rate": 0.00011681494661921708, "loss": 1.7559, "step": 4685 }, { "epoch": 2.0826666666666664, "grad_norm": 2.628209114074707, "learning_rate": 0.00011679715302491103, "loss": 1.7236, "step": 4686 }, { "epoch": 2.083111111111111, "grad_norm": 2.598487615585327, "learning_rate": 0.00011677935943060499, "loss": 2.2871, "step": 4687 }, { "epoch": 2.0835555555555554, "grad_norm": 2.430295944213867, "learning_rate": 0.00011676156583629893, "loss": 1.3882, "step": 4688 }, { "epoch": 2.084, "grad_norm": 2.897634506225586, "learning_rate": 0.00011674377224199289, "loss": 1.9332, "step": 4689 }, { "epoch": 2.0844444444444443, "grad_norm": 3.5412042140960693, "learning_rate": 0.00011672597864768685, "loss": 1.7709, "step": 4690 }, { "epoch": 2.084888888888889, "grad_norm": 2.518477201461792, "learning_rate": 0.00011670818505338079, "loss": 1.8739, "step": 4691 }, { "epoch": 2.0853333333333333, "grad_norm": 2.6702585220336914, "learning_rate": 0.00011669039145907475, "loss": 1.5968, "step": 4692 }, { "epoch": 2.085777777777778, "grad_norm": 2.3972411155700684, "learning_rate": 0.00011667259786476869, "loss": 1.2457, "step": 4693 }, { "epoch": 2.086222222222222, "grad_norm": 3.0265438556671143, "learning_rate": 0.00011665480427046265, "loss": 2.0656, "step": 4694 }, { "epoch": 2.086666666666667, "grad_norm": 2.678575038909912, "learning_rate": 0.0001166370106761566, "loss": 1.3855, "step": 4695 }, { "epoch": 2.087111111111111, "grad_norm": 2.815953016281128, "learning_rate": 0.00011661921708185053, "loss": 1.5628, "step": 4696 }, { "epoch": 2.0875555555555554, "grad_norm": 3.07003116607666, "learning_rate": 0.00011660142348754447, "loss": 1.8748, "step": 4697 }, { "epoch": 2.088, "grad_norm": 2.8302512168884277, "learning_rate": 0.00011658362989323843, "loss": 1.9202, "step": 4698 }, { "epoch": 2.0884444444444443, "grad_norm": 2.839108467102051, "learning_rate": 0.00011656583629893239, "loss": 1.5236, "step": 4699 }, { "epoch": 2.088888888888889, "grad_norm": 2.802971124649048, "learning_rate": 0.00011654804270462633, "loss": 1.2329, "step": 4700 }, { "epoch": 2.0893333333333333, "grad_norm": 1.8777737617492676, "learning_rate": 0.00011653024911032029, "loss": 2.2772, "step": 4701 }, { "epoch": 2.089777777777778, "grad_norm": 2.0970041751861572, "learning_rate": 0.00011651245551601424, "loss": 1.5844, "step": 4702 }, { "epoch": 2.090222222222222, "grad_norm": 2.249812126159668, "learning_rate": 0.00011649466192170819, "loss": 1.9234, "step": 4703 }, { "epoch": 2.0906666666666665, "grad_norm": 2.0751559734344482, "learning_rate": 0.00011647686832740214, "loss": 1.9922, "step": 4704 }, { "epoch": 2.091111111111111, "grad_norm": 3.0285685062408447, "learning_rate": 0.0001164590747330961, "loss": 1.4631, "step": 4705 }, { "epoch": 2.0915555555555554, "grad_norm": 1.9764230251312256, "learning_rate": 0.00011644128113879004, "loss": 1.3146, "step": 4706 }, { "epoch": 2.092, "grad_norm": 2.0233638286590576, "learning_rate": 0.000116423487544484, "loss": 1.7718, "step": 4707 }, { "epoch": 2.0924444444444443, "grad_norm": 2.1841344833374023, "learning_rate": 0.00011640569395017796, "loss": 2.2122, "step": 4708 }, { "epoch": 2.092888888888889, "grad_norm": 2.00598406791687, "learning_rate": 0.00011638790035587188, "loss": 1.7922, "step": 4709 }, { "epoch": 2.0933333333333333, "grad_norm": 2.4193332195281982, "learning_rate": 0.00011637010676156583, "loss": 1.916, "step": 4710 }, { "epoch": 2.093777777777778, "grad_norm": 2.1774141788482666, "learning_rate": 0.00011635231316725978, "loss": 2.0158, "step": 4711 }, { "epoch": 2.094222222222222, "grad_norm": 2.0492541790008545, "learning_rate": 0.00011633451957295374, "loss": 2.1842, "step": 4712 }, { "epoch": 2.0946666666666665, "grad_norm": 2.008819818496704, "learning_rate": 0.00011631672597864768, "loss": 1.6566, "step": 4713 }, { "epoch": 2.095111111111111, "grad_norm": 1.877214789390564, "learning_rate": 0.00011629893238434164, "loss": 1.9144, "step": 4714 }, { "epoch": 2.0955555555555554, "grad_norm": 3.5000193119049072, "learning_rate": 0.0001162811387900356, "loss": 2.7184, "step": 4715 }, { "epoch": 2.096, "grad_norm": 2.240647554397583, "learning_rate": 0.00011626334519572954, "loss": 2.0603, "step": 4716 }, { "epoch": 2.0964444444444443, "grad_norm": 1.966137409210205, "learning_rate": 0.0001162455516014235, "loss": 1.7733, "step": 4717 }, { "epoch": 2.096888888888889, "grad_norm": 1.9724191427230835, "learning_rate": 0.00011622775800711745, "loss": 1.8949, "step": 4718 }, { "epoch": 2.0973333333333333, "grad_norm": 2.057591199874878, "learning_rate": 0.0001162099644128114, "loss": 1.943, "step": 4719 }, { "epoch": 2.097777777777778, "grad_norm": 2.395439624786377, "learning_rate": 0.00011619217081850535, "loss": 1.8616, "step": 4720 }, { "epoch": 2.098222222222222, "grad_norm": 2.374725580215454, "learning_rate": 0.00011617437722419931, "loss": 1.5945, "step": 4721 }, { "epoch": 2.0986666666666665, "grad_norm": 2.0969061851501465, "learning_rate": 0.00011615658362989324, "loss": 1.7964, "step": 4722 }, { "epoch": 2.099111111111111, "grad_norm": 2.3275258541107178, "learning_rate": 0.00011613879003558718, "loss": 1.9125, "step": 4723 }, { "epoch": 2.0995555555555554, "grad_norm": 2.1269102096557617, "learning_rate": 0.00011612099644128114, "loss": 1.2444, "step": 4724 }, { "epoch": 2.1, "grad_norm": 2.2072010040283203, "learning_rate": 0.0001161032028469751, "loss": 1.4687, "step": 4725 }, { "epoch": 2.1004444444444443, "grad_norm": 2.1687400341033936, "learning_rate": 0.00011608540925266904, "loss": 1.8108, "step": 4726 }, { "epoch": 2.100888888888889, "grad_norm": 2.358961820602417, "learning_rate": 0.00011606761565836299, "loss": 1.7005, "step": 4727 }, { "epoch": 2.1013333333333333, "grad_norm": 2.413325786590576, "learning_rate": 0.00011604982206405695, "loss": 1.9044, "step": 4728 }, { "epoch": 2.101777777777778, "grad_norm": 2.0373430252075195, "learning_rate": 0.00011603202846975089, "loss": 1.395, "step": 4729 }, { "epoch": 2.102222222222222, "grad_norm": 2.424889326095581, "learning_rate": 0.00011601423487544485, "loss": 1.7084, "step": 4730 }, { "epoch": 2.1026666666666665, "grad_norm": 2.3407599925994873, "learning_rate": 0.0001159964412811388, "loss": 1.3036, "step": 4731 }, { "epoch": 2.103111111111111, "grad_norm": 2.332714080810547, "learning_rate": 0.00011597864768683275, "loss": 1.7091, "step": 4732 }, { "epoch": 2.1035555555555554, "grad_norm": 2.412674903869629, "learning_rate": 0.0001159608540925267, "loss": 1.5195, "step": 4733 }, { "epoch": 2.104, "grad_norm": 2.552485942840576, "learning_rate": 0.00011594306049822066, "loss": 1.2986, "step": 4734 }, { "epoch": 2.1044444444444443, "grad_norm": 2.3344991207122803, "learning_rate": 0.00011592526690391459, "loss": 1.5909, "step": 4735 }, { "epoch": 2.104888888888889, "grad_norm": 2.5788843631744385, "learning_rate": 0.00011590747330960853, "loss": 1.7849, "step": 4736 }, { "epoch": 2.1053333333333333, "grad_norm": 2.2648143768310547, "learning_rate": 0.00011588967971530249, "loss": 1.688, "step": 4737 }, { "epoch": 2.105777777777778, "grad_norm": 1.9284850358963013, "learning_rate": 0.00011587188612099645, "loss": 0.7604, "step": 4738 }, { "epoch": 2.106222222222222, "grad_norm": 2.7157113552093506, "learning_rate": 0.00011585409252669039, "loss": 1.6809, "step": 4739 }, { "epoch": 2.1066666666666665, "grad_norm": 2.454627752304077, "learning_rate": 0.00011583629893238435, "loss": 1.5076, "step": 4740 }, { "epoch": 2.107111111111111, "grad_norm": 2.6363182067871094, "learning_rate": 0.0001158185053380783, "loss": 1.5113, "step": 4741 }, { "epoch": 2.1075555555555554, "grad_norm": 2.9892001152038574, "learning_rate": 0.00011580071174377225, "loss": 1.453, "step": 4742 }, { "epoch": 2.108, "grad_norm": 2.927412509918213, "learning_rate": 0.0001157829181494662, "loss": 1.7116, "step": 4743 }, { "epoch": 2.1084444444444443, "grad_norm": 2.7504935264587402, "learning_rate": 0.00011576512455516016, "loss": 1.5148, "step": 4744 }, { "epoch": 2.108888888888889, "grad_norm": 2.982175588607788, "learning_rate": 0.0001157473309608541, "loss": 1.7693, "step": 4745 }, { "epoch": 2.1093333333333333, "grad_norm": 2.9303767681121826, "learning_rate": 0.00011572953736654806, "loss": 1.6931, "step": 4746 }, { "epoch": 2.109777777777778, "grad_norm": 3.9879038333892822, "learning_rate": 0.00011571174377224201, "loss": 1.9336, "step": 4747 }, { "epoch": 2.110222222222222, "grad_norm": 3.65157413482666, "learning_rate": 0.00011569395017793594, "loss": 1.7332, "step": 4748 }, { "epoch": 2.1106666666666665, "grad_norm": 3.0835530757904053, "learning_rate": 0.00011567615658362989, "loss": 1.9205, "step": 4749 }, { "epoch": 2.111111111111111, "grad_norm": 3.5538792610168457, "learning_rate": 0.00011565836298932384, "loss": 0.9, "step": 4750 }, { "epoch": 2.1115555555555554, "grad_norm": 1.4265509843826294, "learning_rate": 0.0001156405693950178, "loss": 1.01, "step": 4751 }, { "epoch": 2.112, "grad_norm": 1.9871463775634766, "learning_rate": 0.00011562277580071174, "loss": 2.5466, "step": 4752 }, { "epoch": 2.1124444444444443, "grad_norm": 2.88714337348938, "learning_rate": 0.0001156049822064057, "loss": 1.1221, "step": 4753 }, { "epoch": 2.112888888888889, "grad_norm": 1.92192804813385, "learning_rate": 0.00011558718861209966, "loss": 1.7343, "step": 4754 }, { "epoch": 2.1133333333333333, "grad_norm": 2.3003766536712646, "learning_rate": 0.0001155693950177936, "loss": 2.0344, "step": 4755 }, { "epoch": 2.113777777777778, "grad_norm": 2.1846070289611816, "learning_rate": 0.00011555160142348756, "loss": 1.9369, "step": 4756 }, { "epoch": 2.1142222222222222, "grad_norm": 2.2077274322509766, "learning_rate": 0.00011553380782918151, "loss": 1.8763, "step": 4757 }, { "epoch": 2.1146666666666665, "grad_norm": 2.247318983078003, "learning_rate": 0.00011551601423487545, "loss": 1.6496, "step": 4758 }, { "epoch": 2.115111111111111, "grad_norm": 2.483921527862549, "learning_rate": 0.00011549822064056941, "loss": 2.0657, "step": 4759 }, { "epoch": 2.1155555555555554, "grad_norm": 2.4575605392456055, "learning_rate": 0.00011548042704626334, "loss": 1.9344, "step": 4760 }, { "epoch": 2.116, "grad_norm": 2.4088830947875977, "learning_rate": 0.0001154626334519573, "loss": 1.8533, "step": 4761 }, { "epoch": 2.1164444444444444, "grad_norm": 2.2537829875946045, "learning_rate": 0.00011544483985765124, "loss": 1.8034, "step": 4762 }, { "epoch": 2.116888888888889, "grad_norm": 2.119475841522217, "learning_rate": 0.0001154270462633452, "loss": 1.9903, "step": 4763 }, { "epoch": 2.1173333333333333, "grad_norm": 2.0730443000793457, "learning_rate": 0.00011540925266903915, "loss": 1.6459, "step": 4764 }, { "epoch": 2.117777777777778, "grad_norm": 1.8100666999816895, "learning_rate": 0.0001153914590747331, "loss": 1.7129, "step": 4765 }, { "epoch": 2.1182222222222222, "grad_norm": 1.7500770092010498, "learning_rate": 0.00011537366548042705, "loss": 1.5328, "step": 4766 }, { "epoch": 2.1186666666666665, "grad_norm": 2.3155996799468994, "learning_rate": 0.00011535587188612101, "loss": 1.3889, "step": 4767 }, { "epoch": 2.119111111111111, "grad_norm": 2.4421792030334473, "learning_rate": 0.00011533807829181495, "loss": 2.0543, "step": 4768 }, { "epoch": 2.1195555555555554, "grad_norm": 2.4871792793273926, "learning_rate": 0.00011532028469750891, "loss": 2.2072, "step": 4769 }, { "epoch": 2.12, "grad_norm": 2.081446647644043, "learning_rate": 0.00011530249110320286, "loss": 1.4294, "step": 4770 }, { "epoch": 2.1204444444444444, "grad_norm": 2.19559383392334, "learning_rate": 0.00011528469750889681, "loss": 1.9015, "step": 4771 }, { "epoch": 2.120888888888889, "grad_norm": 2.453030586242676, "learning_rate": 0.00011526690391459076, "loss": 1.8428, "step": 4772 }, { "epoch": 2.1213333333333333, "grad_norm": 2.7875795364379883, "learning_rate": 0.0001152491103202847, "loss": 1.5894, "step": 4773 }, { "epoch": 2.121777777777778, "grad_norm": 2.395458459854126, "learning_rate": 0.00011523131672597865, "loss": 1.5671, "step": 4774 }, { "epoch": 2.1222222222222222, "grad_norm": 2.4182310104370117, "learning_rate": 0.0001152135231316726, "loss": 1.6568, "step": 4775 }, { "epoch": 2.1226666666666665, "grad_norm": 2.46071195602417, "learning_rate": 0.00011519572953736655, "loss": 1.8421, "step": 4776 }, { "epoch": 2.123111111111111, "grad_norm": 2.5873522758483887, "learning_rate": 0.0001151779359430605, "loss": 1.5898, "step": 4777 }, { "epoch": 2.1235555555555554, "grad_norm": 1.7765636444091797, "learning_rate": 0.00011516014234875445, "loss": 0.7924, "step": 4778 }, { "epoch": 2.124, "grad_norm": 2.2877538204193115, "learning_rate": 0.0001151423487544484, "loss": 1.9232, "step": 4779 }, { "epoch": 2.1244444444444444, "grad_norm": 2.445138692855835, "learning_rate": 0.00011512455516014236, "loss": 2.0002, "step": 4780 }, { "epoch": 2.124888888888889, "grad_norm": 2.173436403274536, "learning_rate": 0.0001151067615658363, "loss": 1.6479, "step": 4781 }, { "epoch": 2.1253333333333333, "grad_norm": 2.5610175132751465, "learning_rate": 0.00011508896797153026, "loss": 1.4669, "step": 4782 }, { "epoch": 2.1257777777777775, "grad_norm": 2.4170966148376465, "learning_rate": 0.0001150711743772242, "loss": 1.7575, "step": 4783 }, { "epoch": 2.1262222222222222, "grad_norm": 2.7330989837646484, "learning_rate": 0.00011505338078291816, "loss": 1.8737, "step": 4784 }, { "epoch": 2.1266666666666665, "grad_norm": 2.4493215084075928, "learning_rate": 0.00011503558718861212, "loss": 1.2428, "step": 4785 }, { "epoch": 2.127111111111111, "grad_norm": 2.54807710647583, "learning_rate": 0.00011501779359430605, "loss": 1.675, "step": 4786 }, { "epoch": 2.1275555555555554, "grad_norm": 3.092026472091675, "learning_rate": 0.00011499999999999999, "loss": 1.8129, "step": 4787 }, { "epoch": 2.128, "grad_norm": 3.01973819732666, "learning_rate": 0.00011498220640569395, "loss": 1.9959, "step": 4788 }, { "epoch": 2.1284444444444444, "grad_norm": 2.2665798664093018, "learning_rate": 0.0001149644128113879, "loss": 1.1747, "step": 4789 }, { "epoch": 2.128888888888889, "grad_norm": 3.0983033180236816, "learning_rate": 0.00011494661921708185, "loss": 1.9808, "step": 4790 }, { "epoch": 2.1293333333333333, "grad_norm": 2.7604308128356934, "learning_rate": 0.0001149288256227758, "loss": 1.7185, "step": 4791 }, { "epoch": 2.129777777777778, "grad_norm": 2.712104558944702, "learning_rate": 0.00011491103202846976, "loss": 1.3634, "step": 4792 }, { "epoch": 2.1302222222222222, "grad_norm": 2.7724192142486572, "learning_rate": 0.0001148932384341637, "loss": 1.2231, "step": 4793 }, { "epoch": 2.1306666666666665, "grad_norm": 3.0472283363342285, "learning_rate": 0.00011487544483985766, "loss": 1.7844, "step": 4794 }, { "epoch": 2.131111111111111, "grad_norm": 2.9061384201049805, "learning_rate": 0.00011485765124555161, "loss": 1.7998, "step": 4795 }, { "epoch": 2.1315555555555554, "grad_norm": 3.207463502883911, "learning_rate": 0.00011483985765124556, "loss": 1.8822, "step": 4796 }, { "epoch": 2.132, "grad_norm": 3.0004780292510986, "learning_rate": 0.00011482206405693951, "loss": 1.6778, "step": 4797 }, { "epoch": 2.1324444444444444, "grad_norm": 3.055454730987549, "learning_rate": 0.00011480427046263347, "loss": 1.7763, "step": 4798 }, { "epoch": 2.132888888888889, "grad_norm": 3.3208091259002686, "learning_rate": 0.0001147864768683274, "loss": 1.8971, "step": 4799 }, { "epoch": 2.1333333333333333, "grad_norm": 2.958496570587158, "learning_rate": 0.00011476868327402134, "loss": 1.3253, "step": 4800 }, { "epoch": 2.1337777777777776, "grad_norm": 1.970202922821045, "learning_rate": 0.0001147508896797153, "loss": 2.5294, "step": 4801 }, { "epoch": 2.1342222222222222, "grad_norm": 1.807246446609497, "learning_rate": 0.00011473309608540926, "loss": 2.6724, "step": 4802 }, { "epoch": 2.1346666666666665, "grad_norm": 1.7409089803695679, "learning_rate": 0.0001147153024911032, "loss": 1.8308, "step": 4803 }, { "epoch": 2.135111111111111, "grad_norm": 1.3472347259521484, "learning_rate": 0.00011469750889679716, "loss": 1.1779, "step": 4804 }, { "epoch": 2.1355555555555554, "grad_norm": 2.0794670581817627, "learning_rate": 0.00011467971530249111, "loss": 1.8861, "step": 4805 }, { "epoch": 2.136, "grad_norm": 1.9827806949615479, "learning_rate": 0.00011466192170818505, "loss": 2.0878, "step": 4806 }, { "epoch": 2.1364444444444444, "grad_norm": 1.8904576301574707, "learning_rate": 0.00011464412811387901, "loss": 1.7428, "step": 4807 }, { "epoch": 2.136888888888889, "grad_norm": 1.7913202047348022, "learning_rate": 0.00011462633451957297, "loss": 1.2166, "step": 4808 }, { "epoch": 2.1373333333333333, "grad_norm": 2.015347957611084, "learning_rate": 0.00011460854092526691, "loss": 1.8428, "step": 4809 }, { "epoch": 2.137777777777778, "grad_norm": 2.0501961708068848, "learning_rate": 0.00011459074733096087, "loss": 2.1464, "step": 4810 }, { "epoch": 2.1382222222222222, "grad_norm": 2.050212860107422, "learning_rate": 0.00011457295373665482, "loss": 2.0002, "step": 4811 }, { "epoch": 2.1386666666666665, "grad_norm": 2.134122133255005, "learning_rate": 0.00011455516014234875, "loss": 2.1944, "step": 4812 }, { "epoch": 2.139111111111111, "grad_norm": 1.8899743556976318, "learning_rate": 0.0001145373665480427, "loss": 1.6606, "step": 4813 }, { "epoch": 2.1395555555555554, "grad_norm": 2.5020689964294434, "learning_rate": 0.00011451957295373665, "loss": 1.2598, "step": 4814 }, { "epoch": 2.14, "grad_norm": 1.9991487264633179, "learning_rate": 0.00011450177935943061, "loss": 1.9065, "step": 4815 }, { "epoch": 2.1404444444444444, "grad_norm": 2.101409912109375, "learning_rate": 0.00011448398576512455, "loss": 2.0605, "step": 4816 }, { "epoch": 2.140888888888889, "grad_norm": 1.888168454170227, "learning_rate": 0.00011446619217081851, "loss": 1.7381, "step": 4817 }, { "epoch": 2.1413333333333333, "grad_norm": 1.9398518800735474, "learning_rate": 0.00011444839857651247, "loss": 1.6413, "step": 4818 }, { "epoch": 2.1417777777777776, "grad_norm": 2.3134799003601074, "learning_rate": 0.00011443060498220641, "loss": 1.6505, "step": 4819 }, { "epoch": 2.1422222222222222, "grad_norm": 2.276531934738159, "learning_rate": 0.00011441281138790036, "loss": 2.3464, "step": 4820 }, { "epoch": 2.1426666666666665, "grad_norm": 1.9262102842330933, "learning_rate": 0.00011439501779359432, "loss": 1.7171, "step": 4821 }, { "epoch": 2.143111111111111, "grad_norm": 1.9879403114318848, "learning_rate": 0.00011437722419928826, "loss": 1.3029, "step": 4822 }, { "epoch": 2.1435555555555554, "grad_norm": 2.1766903400421143, "learning_rate": 0.00011435943060498222, "loss": 1.7259, "step": 4823 }, { "epoch": 2.144, "grad_norm": 2.2997825145721436, "learning_rate": 0.00011434163701067618, "loss": 2.3198, "step": 4824 }, { "epoch": 2.1444444444444444, "grad_norm": 2.1982157230377197, "learning_rate": 0.0001143238434163701, "loss": 1.913, "step": 4825 }, { "epoch": 2.144888888888889, "grad_norm": 2.041598320007324, "learning_rate": 0.00011430604982206405, "loss": 1.473, "step": 4826 }, { "epoch": 2.1453333333333333, "grad_norm": 2.7166502475738525, "learning_rate": 0.000114288256227758, "loss": 1.6377, "step": 4827 }, { "epoch": 2.145777777777778, "grad_norm": 2.4108223915100098, "learning_rate": 0.00011427046263345196, "loss": 1.5567, "step": 4828 }, { "epoch": 2.1462222222222223, "grad_norm": 2.168083429336548, "learning_rate": 0.0001142526690391459, "loss": 1.4504, "step": 4829 }, { "epoch": 2.1466666666666665, "grad_norm": 2.364166498184204, "learning_rate": 0.00011423487544483986, "loss": 2.1665, "step": 4830 }, { "epoch": 2.147111111111111, "grad_norm": 2.303101062774658, "learning_rate": 0.00011421708185053382, "loss": 1.7004, "step": 4831 }, { "epoch": 2.1475555555555554, "grad_norm": 2.227083921432495, "learning_rate": 0.00011419928825622776, "loss": 0.8074, "step": 4832 }, { "epoch": 2.148, "grad_norm": 2.1052846908569336, "learning_rate": 0.00011418149466192172, "loss": 1.6949, "step": 4833 }, { "epoch": 2.1484444444444444, "grad_norm": 2.4847774505615234, "learning_rate": 0.00011416370106761567, "loss": 1.601, "step": 4834 }, { "epoch": 2.148888888888889, "grad_norm": 1.6619476079940796, "learning_rate": 0.00011414590747330962, "loss": 0.8843, "step": 4835 }, { "epoch": 2.1493333333333333, "grad_norm": 2.540987491607666, "learning_rate": 0.00011412811387900357, "loss": 1.5859, "step": 4836 }, { "epoch": 2.1497777777777776, "grad_norm": 2.535325765609741, "learning_rate": 0.00011411032028469753, "loss": 1.8662, "step": 4837 }, { "epoch": 2.1502222222222223, "grad_norm": 2.593508243560791, "learning_rate": 0.00011409252669039146, "loss": 1.5427, "step": 4838 }, { "epoch": 2.1506666666666665, "grad_norm": 2.8630247116088867, "learning_rate": 0.0001140747330960854, "loss": 1.5968, "step": 4839 }, { "epoch": 2.151111111111111, "grad_norm": 2.9739978313446045, "learning_rate": 0.00011405693950177936, "loss": 1.7853, "step": 4840 }, { "epoch": 2.1515555555555554, "grad_norm": 2.3142545223236084, "learning_rate": 0.00011403914590747332, "loss": 1.4719, "step": 4841 }, { "epoch": 2.152, "grad_norm": 2.915494203567505, "learning_rate": 0.00011402135231316726, "loss": 1.4225, "step": 4842 }, { "epoch": 2.1524444444444444, "grad_norm": 2.4613630771636963, "learning_rate": 0.00011400355871886121, "loss": 1.8753, "step": 4843 }, { "epoch": 2.152888888888889, "grad_norm": 3.434368133544922, "learning_rate": 0.00011398576512455517, "loss": 1.4475, "step": 4844 }, { "epoch": 2.1533333333333333, "grad_norm": 3.4849135875701904, "learning_rate": 0.00011396797153024911, "loss": 2.0161, "step": 4845 }, { "epoch": 2.153777777777778, "grad_norm": 2.9939815998077393, "learning_rate": 0.00011395017793594307, "loss": 1.7339, "step": 4846 }, { "epoch": 2.1542222222222223, "grad_norm": 2.8146097660064697, "learning_rate": 0.00011393238434163703, "loss": 1.5392, "step": 4847 }, { "epoch": 2.1546666666666665, "grad_norm": 4.10907506942749, "learning_rate": 0.00011391459074733097, "loss": 1.2541, "step": 4848 }, { "epoch": 2.155111111111111, "grad_norm": 2.995576858520508, "learning_rate": 0.00011389679715302493, "loss": 1.668, "step": 4849 }, { "epoch": 2.1555555555555554, "grad_norm": 0.45589274168014526, "learning_rate": 0.00011387900355871888, "loss": 0.0619, "step": 4850 }, { "epoch": 2.156, "grad_norm": 1.8959424495697021, "learning_rate": 0.00011386120996441281, "loss": 2.3939, "step": 4851 }, { "epoch": 2.1564444444444444, "grad_norm": 1.9614112377166748, "learning_rate": 0.00011384341637010676, "loss": 1.8642, "step": 4852 }, { "epoch": 2.156888888888889, "grad_norm": 2.2174315452575684, "learning_rate": 0.00011382562277580071, "loss": 1.6341, "step": 4853 }, { "epoch": 2.1573333333333333, "grad_norm": 2.617152452468872, "learning_rate": 0.00011380782918149467, "loss": 1.9807, "step": 4854 }, { "epoch": 2.1577777777777776, "grad_norm": 1.9597722291946411, "learning_rate": 0.00011379003558718861, "loss": 1.5573, "step": 4855 }, { "epoch": 2.1582222222222223, "grad_norm": 2.3918566703796387, "learning_rate": 0.00011377224199288257, "loss": 2.0388, "step": 4856 }, { "epoch": 2.1586666666666665, "grad_norm": 2.0133910179138184, "learning_rate": 0.00011375444839857652, "loss": 2.0006, "step": 4857 }, { "epoch": 2.159111111111111, "grad_norm": 2.1372923851013184, "learning_rate": 0.00011373665480427047, "loss": 1.8971, "step": 4858 }, { "epoch": 2.1595555555555555, "grad_norm": 2.200821876525879, "learning_rate": 0.00011371886120996442, "loss": 1.8821, "step": 4859 }, { "epoch": 2.16, "grad_norm": 2.4604291915893555, "learning_rate": 0.00011370106761565838, "loss": 1.9079, "step": 4860 }, { "epoch": 2.1604444444444444, "grad_norm": 2.0270156860351562, "learning_rate": 0.00011368327402135232, "loss": 1.4849, "step": 4861 }, { "epoch": 2.160888888888889, "grad_norm": 2.796276807785034, "learning_rate": 0.00011366548042704628, "loss": 2.0916, "step": 4862 }, { "epoch": 2.1613333333333333, "grad_norm": 2.0537173748016357, "learning_rate": 0.00011364768683274024, "loss": 1.7261, "step": 4863 }, { "epoch": 2.1617777777777776, "grad_norm": 1.9119226932525635, "learning_rate": 0.00011362989323843417, "loss": 1.5883, "step": 4864 }, { "epoch": 2.1622222222222223, "grad_norm": 2.016460657119751, "learning_rate": 0.00011361209964412811, "loss": 2.0473, "step": 4865 }, { "epoch": 2.1626666666666665, "grad_norm": 2.056744337081909, "learning_rate": 0.00011359430604982207, "loss": 1.5667, "step": 4866 }, { "epoch": 2.163111111111111, "grad_norm": 2.0856573581695557, "learning_rate": 0.00011357651245551602, "loss": 2.1636, "step": 4867 }, { "epoch": 2.1635555555555555, "grad_norm": 2.0556750297546387, "learning_rate": 0.00011355871886120996, "loss": 1.6923, "step": 4868 }, { "epoch": 2.164, "grad_norm": 1.8435472249984741, "learning_rate": 0.00011354092526690392, "loss": 1.4836, "step": 4869 }, { "epoch": 2.1644444444444444, "grad_norm": 1.7105246782302856, "learning_rate": 0.00011352313167259788, "loss": 1.2481, "step": 4870 }, { "epoch": 2.164888888888889, "grad_norm": 1.367600679397583, "learning_rate": 0.00011350533807829182, "loss": 0.6025, "step": 4871 }, { "epoch": 2.1653333333333333, "grad_norm": 2.135531425476074, "learning_rate": 0.00011348754448398578, "loss": 1.6163, "step": 4872 }, { "epoch": 2.1657777777777776, "grad_norm": 1.9856308698654175, "learning_rate": 0.00011346975088967972, "loss": 1.6504, "step": 4873 }, { "epoch": 2.1662222222222223, "grad_norm": 2.070834159851074, "learning_rate": 0.00011345195729537368, "loss": 1.7144, "step": 4874 }, { "epoch": 2.1666666666666665, "grad_norm": 2.081918716430664, "learning_rate": 0.00011343416370106763, "loss": 1.6878, "step": 4875 }, { "epoch": 2.167111111111111, "grad_norm": 2.117114305496216, "learning_rate": 0.00011341637010676156, "loss": 1.5885, "step": 4876 }, { "epoch": 2.1675555555555555, "grad_norm": 2.200413227081299, "learning_rate": 0.0001133985765124555, "loss": 1.4892, "step": 4877 }, { "epoch": 2.168, "grad_norm": 2.2609503269195557, "learning_rate": 0.00011338078291814946, "loss": 1.5828, "step": 4878 }, { "epoch": 2.1684444444444444, "grad_norm": 2.2752559185028076, "learning_rate": 0.00011336298932384342, "loss": 1.6549, "step": 4879 }, { "epoch": 2.168888888888889, "grad_norm": 2.138413667678833, "learning_rate": 0.00011334519572953736, "loss": 1.241, "step": 4880 }, { "epoch": 2.1693333333333333, "grad_norm": 2.5910072326660156, "learning_rate": 0.00011332740213523132, "loss": 1.4842, "step": 4881 }, { "epoch": 2.1697777777777776, "grad_norm": 2.1112563610076904, "learning_rate": 0.00011330960854092527, "loss": 1.5217, "step": 4882 }, { "epoch": 2.1702222222222223, "grad_norm": 2.3650734424591064, "learning_rate": 0.00011329181494661922, "loss": 1.6714, "step": 4883 }, { "epoch": 2.1706666666666665, "grad_norm": 2.035025119781494, "learning_rate": 0.00011327402135231317, "loss": 1.2853, "step": 4884 }, { "epoch": 2.171111111111111, "grad_norm": 2.171036958694458, "learning_rate": 0.00011325622775800713, "loss": 1.4231, "step": 4885 }, { "epoch": 2.1715555555555555, "grad_norm": 3.245302200317383, "learning_rate": 0.00011323843416370107, "loss": 1.7397, "step": 4886 }, { "epoch": 2.172, "grad_norm": 3.072438955307007, "learning_rate": 0.00011322064056939503, "loss": 2.2672, "step": 4887 }, { "epoch": 2.1724444444444444, "grad_norm": 2.8883869647979736, "learning_rate": 0.00011320284697508899, "loss": 1.7494, "step": 4888 }, { "epoch": 2.172888888888889, "grad_norm": 2.5595476627349854, "learning_rate": 0.00011318505338078292, "loss": 1.3625, "step": 4889 }, { "epoch": 2.1733333333333333, "grad_norm": 2.532860517501831, "learning_rate": 0.00011316725978647686, "loss": 1.0884, "step": 4890 }, { "epoch": 2.1737777777777776, "grad_norm": 3.10447359085083, "learning_rate": 0.00011314946619217082, "loss": 2.1665, "step": 4891 }, { "epoch": 2.1742222222222223, "grad_norm": 2.6869397163391113, "learning_rate": 0.00011313167259786477, "loss": 1.644, "step": 4892 }, { "epoch": 2.1746666666666665, "grad_norm": 2.8844075202941895, "learning_rate": 0.00011311387900355871, "loss": 1.5886, "step": 4893 }, { "epoch": 2.175111111111111, "grad_norm": 2.6155810356140137, "learning_rate": 0.00011309608540925267, "loss": 1.4173, "step": 4894 }, { "epoch": 2.1755555555555555, "grad_norm": 3.163278341293335, "learning_rate": 0.00011307829181494663, "loss": 1.5141, "step": 4895 }, { "epoch": 2.176, "grad_norm": 2.6014788150787354, "learning_rate": 0.00011306049822064057, "loss": 1.3882, "step": 4896 }, { "epoch": 2.1764444444444444, "grad_norm": 3.472792387008667, "learning_rate": 0.00011304270462633453, "loss": 1.8972, "step": 4897 }, { "epoch": 2.176888888888889, "grad_norm": 2.989288568496704, "learning_rate": 0.00011302491103202848, "loss": 1.7598, "step": 4898 }, { "epoch": 2.1773333333333333, "grad_norm": 3.674435615539551, "learning_rate": 0.00011300711743772243, "loss": 1.5909, "step": 4899 }, { "epoch": 2.1777777777777776, "grad_norm": 2.7188007831573486, "learning_rate": 0.00011298932384341638, "loss": 0.8713, "step": 4900 }, { "epoch": 2.1782222222222223, "grad_norm": 1.8986291885375977, "learning_rate": 0.00011297153024911034, "loss": 2.2709, "step": 4901 }, { "epoch": 2.1786666666666665, "grad_norm": 1.9453548192977905, "learning_rate": 0.00011295373665480427, "loss": 2.0065, "step": 4902 }, { "epoch": 2.179111111111111, "grad_norm": 1.927510380744934, "learning_rate": 0.00011293594306049821, "loss": 2.1494, "step": 4903 }, { "epoch": 2.1795555555555555, "grad_norm": 2.2795588970184326, "learning_rate": 0.00011291814946619217, "loss": 1.9723, "step": 4904 }, { "epoch": 2.18, "grad_norm": 2.4390902519226074, "learning_rate": 0.00011290035587188612, "loss": 2.4457, "step": 4905 }, { "epoch": 2.1804444444444444, "grad_norm": 2.1050994396209717, "learning_rate": 0.00011288256227758007, "loss": 1.3167, "step": 4906 }, { "epoch": 2.180888888888889, "grad_norm": 2.186591386795044, "learning_rate": 0.00011286476868327402, "loss": 1.9273, "step": 4907 }, { "epoch": 2.1813333333333333, "grad_norm": 2.417327404022217, "learning_rate": 0.00011284697508896798, "loss": 1.9586, "step": 4908 }, { "epoch": 2.1817777777777776, "grad_norm": 2.4069387912750244, "learning_rate": 0.00011282918149466192, "loss": 1.3856, "step": 4909 }, { "epoch": 2.1822222222222223, "grad_norm": 2.5273990631103516, "learning_rate": 0.00011281138790035588, "loss": 1.8896, "step": 4910 }, { "epoch": 2.1826666666666665, "grad_norm": 2.1475119590759277, "learning_rate": 0.00011279359430604984, "loss": 1.9951, "step": 4911 }, { "epoch": 2.1831111111111112, "grad_norm": 2.5976321697235107, "learning_rate": 0.00011277580071174378, "loss": 1.6981, "step": 4912 }, { "epoch": 2.1835555555555555, "grad_norm": 1.4629095792770386, "learning_rate": 0.00011275800711743774, "loss": 0.9384, "step": 4913 }, { "epoch": 2.184, "grad_norm": 2.086716413497925, "learning_rate": 0.00011274021352313169, "loss": 1.5494, "step": 4914 }, { "epoch": 2.1844444444444444, "grad_norm": 1.9124023914337158, "learning_rate": 0.00011272241992882562, "loss": 1.5172, "step": 4915 }, { "epoch": 2.1848888888888887, "grad_norm": 2.046513080596924, "learning_rate": 0.00011270462633451956, "loss": 1.8116, "step": 4916 }, { "epoch": 2.1853333333333333, "grad_norm": 2.0074009895324707, "learning_rate": 0.00011268683274021352, "loss": 1.2871, "step": 4917 }, { "epoch": 2.1857777777777776, "grad_norm": 2.108560800552368, "learning_rate": 0.00011266903914590748, "loss": 1.8694, "step": 4918 }, { "epoch": 2.1862222222222223, "grad_norm": 2.6234912872314453, "learning_rate": 0.00011265124555160142, "loss": 1.577, "step": 4919 }, { "epoch": 2.1866666666666665, "grad_norm": 2.239198684692383, "learning_rate": 0.00011263345195729538, "loss": 2.0347, "step": 4920 }, { "epoch": 2.1871111111111112, "grad_norm": 2.2819833755493164, "learning_rate": 0.00011261565836298933, "loss": 2.3103, "step": 4921 }, { "epoch": 2.1875555555555555, "grad_norm": 2.2331910133361816, "learning_rate": 0.00011259786476868328, "loss": 1.6046, "step": 4922 }, { "epoch": 2.188, "grad_norm": 2.1115944385528564, "learning_rate": 0.00011258007117437723, "loss": 1.8757, "step": 4923 }, { "epoch": 2.1884444444444444, "grad_norm": 2.104067087173462, "learning_rate": 0.00011256227758007119, "loss": 1.6774, "step": 4924 }, { "epoch": 2.188888888888889, "grad_norm": 2.385721206665039, "learning_rate": 0.00011254448398576513, "loss": 2.1468, "step": 4925 }, { "epoch": 2.1893333333333334, "grad_norm": 2.554507255554199, "learning_rate": 0.00011252669039145909, "loss": 2.0144, "step": 4926 }, { "epoch": 2.1897777777777776, "grad_norm": 2.23824143409729, "learning_rate": 0.00011250889679715305, "loss": 1.7106, "step": 4927 }, { "epoch": 2.1902222222222223, "grad_norm": 2.3488128185272217, "learning_rate": 0.00011249110320284698, "loss": 1.8532, "step": 4928 }, { "epoch": 2.1906666666666665, "grad_norm": 2.960286855697632, "learning_rate": 0.00011247330960854092, "loss": 1.6043, "step": 4929 }, { "epoch": 2.1911111111111112, "grad_norm": 2.2605531215667725, "learning_rate": 0.00011245551601423487, "loss": 1.4308, "step": 4930 }, { "epoch": 2.1915555555555555, "grad_norm": 2.7191028594970703, "learning_rate": 0.00011243772241992883, "loss": 1.4713, "step": 4931 }, { "epoch": 2.192, "grad_norm": 2.499677896499634, "learning_rate": 0.00011241992882562277, "loss": 1.4607, "step": 4932 }, { "epoch": 2.1924444444444444, "grad_norm": 2.556260347366333, "learning_rate": 0.00011240213523131673, "loss": 1.4645, "step": 4933 }, { "epoch": 2.1928888888888887, "grad_norm": 2.443964958190918, "learning_rate": 0.00011238434163701069, "loss": 1.3057, "step": 4934 }, { "epoch": 2.1933333333333334, "grad_norm": 2.543210744857788, "learning_rate": 0.00011236654804270463, "loss": 2.0736, "step": 4935 }, { "epoch": 2.1937777777777776, "grad_norm": 2.3704352378845215, "learning_rate": 0.00011234875444839859, "loss": 1.3827, "step": 4936 }, { "epoch": 2.1942222222222223, "grad_norm": 2.49611234664917, "learning_rate": 0.00011233096085409254, "loss": 1.7015, "step": 4937 }, { "epoch": 2.1946666666666665, "grad_norm": 2.5088584423065186, "learning_rate": 0.00011231316725978649, "loss": 1.7995, "step": 4938 }, { "epoch": 2.1951111111111112, "grad_norm": 2.3908095359802246, "learning_rate": 0.00011229537366548044, "loss": 1.5912, "step": 4939 }, { "epoch": 2.1955555555555555, "grad_norm": 2.597078323364258, "learning_rate": 0.0001122775800711744, "loss": 1.5497, "step": 4940 }, { "epoch": 2.196, "grad_norm": 2.943357467651367, "learning_rate": 0.00011225978647686833, "loss": 2.0209, "step": 4941 }, { "epoch": 2.1964444444444444, "grad_norm": 3.4762535095214844, "learning_rate": 0.00011224199288256227, "loss": 2.0581, "step": 4942 }, { "epoch": 2.196888888888889, "grad_norm": 2.8447139263153076, "learning_rate": 0.00011222419928825623, "loss": 1.42, "step": 4943 }, { "epoch": 2.1973333333333334, "grad_norm": 2.7726380825042725, "learning_rate": 0.00011220640569395018, "loss": 1.608, "step": 4944 }, { "epoch": 2.1977777777777776, "grad_norm": 2.942455768585205, "learning_rate": 0.00011218861209964413, "loss": 1.6582, "step": 4945 }, { "epoch": 2.1982222222222223, "grad_norm": 3.3560426235198975, "learning_rate": 0.00011217081850533808, "loss": 1.897, "step": 4946 }, { "epoch": 2.1986666666666665, "grad_norm": 3.4621267318725586, "learning_rate": 0.00011215302491103204, "loss": 2.2004, "step": 4947 }, { "epoch": 2.1991111111111112, "grad_norm": 2.5123441219329834, "learning_rate": 0.00011213523131672598, "loss": 1.0752, "step": 4948 }, { "epoch": 2.1995555555555555, "grad_norm": 1.6749694347381592, "learning_rate": 0.00011211743772241994, "loss": 0.6226, "step": 4949 }, { "epoch": 2.2, "grad_norm": 3.319334030151367, "learning_rate": 0.0001120996441281139, "loss": 1.4356, "step": 4950 }, { "epoch": 2.2004444444444444, "grad_norm": 1.5259418487548828, "learning_rate": 0.00011208185053380784, "loss": 1.6631, "step": 4951 }, { "epoch": 2.2008888888888887, "grad_norm": 1.3049883842468262, "learning_rate": 0.0001120640569395018, "loss": 0.9819, "step": 4952 }, { "epoch": 2.2013333333333334, "grad_norm": 2.1342673301696777, "learning_rate": 0.00011204626334519575, "loss": 1.929, "step": 4953 }, { "epoch": 2.2017777777777776, "grad_norm": 1.989028811454773, "learning_rate": 0.00011202846975088968, "loss": 2.0505, "step": 4954 }, { "epoch": 2.2022222222222223, "grad_norm": 2.049135208129883, "learning_rate": 0.00011201067615658362, "loss": 1.8818, "step": 4955 }, { "epoch": 2.2026666666666666, "grad_norm": 2.263387441635132, "learning_rate": 0.00011199288256227758, "loss": 2.0014, "step": 4956 }, { "epoch": 2.2031111111111112, "grad_norm": 2.11501407623291, "learning_rate": 0.00011197508896797154, "loss": 1.9057, "step": 4957 }, { "epoch": 2.2035555555555555, "grad_norm": 1.9304909706115723, "learning_rate": 0.00011195729537366548, "loss": 1.867, "step": 4958 }, { "epoch": 2.204, "grad_norm": 2.2471048831939697, "learning_rate": 0.00011193950177935944, "loss": 1.7576, "step": 4959 }, { "epoch": 2.2044444444444444, "grad_norm": 2.4755563735961914, "learning_rate": 0.0001119217081850534, "loss": 1.6393, "step": 4960 }, { "epoch": 2.204888888888889, "grad_norm": 2.0645923614501953, "learning_rate": 0.00011190391459074734, "loss": 1.4799, "step": 4961 }, { "epoch": 2.2053333333333334, "grad_norm": 2.2446157932281494, "learning_rate": 0.00011188612099644129, "loss": 2.2068, "step": 4962 }, { "epoch": 2.2057777777777776, "grad_norm": 2.309330701828003, "learning_rate": 0.00011186832740213524, "loss": 2.1778, "step": 4963 }, { "epoch": 2.2062222222222223, "grad_norm": 1.9780677556991577, "learning_rate": 0.00011185053380782919, "loss": 1.8793, "step": 4964 }, { "epoch": 2.2066666666666666, "grad_norm": 2.061521530151367, "learning_rate": 0.00011183274021352315, "loss": 1.2263, "step": 4965 }, { "epoch": 2.2071111111111112, "grad_norm": 2.245497703552246, "learning_rate": 0.00011181494661921709, "loss": 1.9673, "step": 4966 }, { "epoch": 2.2075555555555555, "grad_norm": 2.020643472671509, "learning_rate": 0.00011179715302491102, "loss": 1.9154, "step": 4967 }, { "epoch": 2.208, "grad_norm": 2.15903902053833, "learning_rate": 0.00011177935943060498, "loss": 1.8257, "step": 4968 }, { "epoch": 2.2084444444444444, "grad_norm": 2.490280866622925, "learning_rate": 0.00011176156583629893, "loss": 1.9819, "step": 4969 }, { "epoch": 2.2088888888888887, "grad_norm": 2.4871490001678467, "learning_rate": 0.00011174377224199288, "loss": 1.946, "step": 4970 }, { "epoch": 2.2093333333333334, "grad_norm": 2.053659200668335, "learning_rate": 0.00011172597864768683, "loss": 1.8871, "step": 4971 }, { "epoch": 2.2097777777777776, "grad_norm": 2.2057316303253174, "learning_rate": 0.00011170818505338079, "loss": 1.5619, "step": 4972 }, { "epoch": 2.2102222222222223, "grad_norm": 1.9269914627075195, "learning_rate": 0.00011169039145907473, "loss": 1.3407, "step": 4973 }, { "epoch": 2.2106666666666666, "grad_norm": 2.1930150985717773, "learning_rate": 0.00011167259786476869, "loss": 1.7184, "step": 4974 }, { "epoch": 2.2111111111111112, "grad_norm": 2.1484861373901367, "learning_rate": 0.00011165480427046265, "loss": 1.8792, "step": 4975 }, { "epoch": 2.2115555555555555, "grad_norm": 2.3635623455047607, "learning_rate": 0.00011163701067615659, "loss": 1.8599, "step": 4976 }, { "epoch": 2.212, "grad_norm": 2.525038003921509, "learning_rate": 0.00011161921708185055, "loss": 1.9892, "step": 4977 }, { "epoch": 2.2124444444444444, "grad_norm": 2.2718634605407715, "learning_rate": 0.0001116014234875445, "loss": 1.6329, "step": 4978 }, { "epoch": 2.2128888888888887, "grad_norm": 1.9843499660491943, "learning_rate": 0.00011158362989323844, "loss": 1.2032, "step": 4979 }, { "epoch": 2.2133333333333334, "grad_norm": 2.85638165473938, "learning_rate": 0.00011156583629893237, "loss": 2.0707, "step": 4980 }, { "epoch": 2.2137777777777776, "grad_norm": 2.3343589305877686, "learning_rate": 0.00011154804270462633, "loss": 1.3783, "step": 4981 }, { "epoch": 2.2142222222222223, "grad_norm": 2.3680782318115234, "learning_rate": 0.00011153024911032029, "loss": 1.4898, "step": 4982 }, { "epoch": 2.2146666666666666, "grad_norm": 1.55228853225708, "learning_rate": 0.00011151245551601423, "loss": 0.6674, "step": 4983 }, { "epoch": 2.2151111111111113, "grad_norm": 2.7255640029907227, "learning_rate": 0.00011149466192170819, "loss": 1.7476, "step": 4984 }, { "epoch": 2.2155555555555555, "grad_norm": 2.6943490505218506, "learning_rate": 0.00011147686832740214, "loss": 1.5847, "step": 4985 }, { "epoch": 2.216, "grad_norm": 2.6895389556884766, "learning_rate": 0.00011145907473309609, "loss": 1.8233, "step": 4986 }, { "epoch": 2.2164444444444444, "grad_norm": 2.645171642303467, "learning_rate": 0.00011144128113879004, "loss": 1.7922, "step": 4987 }, { "epoch": 2.2168888888888887, "grad_norm": 3.0169036388397217, "learning_rate": 0.000111423487544484, "loss": 1.8521, "step": 4988 }, { "epoch": 2.2173333333333334, "grad_norm": 2.563162088394165, "learning_rate": 0.00011140569395017794, "loss": 1.654, "step": 4989 }, { "epoch": 2.2177777777777776, "grad_norm": 2.9162724018096924, "learning_rate": 0.0001113879003558719, "loss": 1.8161, "step": 4990 }, { "epoch": 2.2182222222222223, "grad_norm": 2.8812615871429443, "learning_rate": 0.00011137010676156586, "loss": 1.8481, "step": 4991 }, { "epoch": 2.2186666666666666, "grad_norm": 3.00016188621521, "learning_rate": 0.00011135231316725978, "loss": 1.7091, "step": 4992 }, { "epoch": 2.2191111111111113, "grad_norm": 2.6278302669525146, "learning_rate": 0.00011133451957295373, "loss": 1.7687, "step": 4993 }, { "epoch": 2.2195555555555555, "grad_norm": 2.530658006668091, "learning_rate": 0.00011131672597864768, "loss": 1.6654, "step": 4994 }, { "epoch": 2.22, "grad_norm": 2.8739798069000244, "learning_rate": 0.00011129893238434164, "loss": 1.3435, "step": 4995 }, { "epoch": 2.2204444444444444, "grad_norm": 2.9799602031707764, "learning_rate": 0.00011128113879003558, "loss": 1.642, "step": 4996 }, { "epoch": 2.2208888888888887, "grad_norm": 2.8478050231933594, "learning_rate": 0.00011126334519572954, "loss": 1.6135, "step": 4997 }, { "epoch": 2.2213333333333334, "grad_norm": 3.293814182281494, "learning_rate": 0.0001112455516014235, "loss": 1.8149, "step": 4998 }, { "epoch": 2.2217777777777776, "grad_norm": 2.8683934211730957, "learning_rate": 0.00011122775800711744, "loss": 1.3027, "step": 4999 }, { "epoch": 2.2222222222222223, "grad_norm": 3.7593607902526855, "learning_rate": 0.0001112099644128114, "loss": 1.6055, "step": 5000 }, { "epoch": 2.2226666666666666, "grad_norm": 1.727084755897522, "learning_rate": 0.00011119217081850535, "loss": 1.8026, "step": 5001 }, { "epoch": 2.2231111111111113, "grad_norm": 1.792447805404663, "learning_rate": 0.0001111743772241993, "loss": 1.9238, "step": 5002 }, { "epoch": 2.2235555555555555, "grad_norm": 1.3714830875396729, "learning_rate": 0.00011115658362989325, "loss": 0.1835, "step": 5003 }, { "epoch": 2.224, "grad_norm": 2.177612066268921, "learning_rate": 0.00011113879003558721, "loss": 2.0674, "step": 5004 }, { "epoch": 2.2244444444444444, "grad_norm": 2.2636287212371826, "learning_rate": 0.00011112099644128114, "loss": 2.0529, "step": 5005 }, { "epoch": 2.2248888888888887, "grad_norm": 2.174473285675049, "learning_rate": 0.00011110320284697508, "loss": 1.7144, "step": 5006 }, { "epoch": 2.2253333333333334, "grad_norm": 2.079563617706299, "learning_rate": 0.00011108540925266904, "loss": 1.7053, "step": 5007 }, { "epoch": 2.2257777777777776, "grad_norm": 2.0495669841766357, "learning_rate": 0.000111067615658363, "loss": 1.4467, "step": 5008 }, { "epoch": 2.2262222222222223, "grad_norm": 2.1744115352630615, "learning_rate": 0.00011104982206405694, "loss": 1.6289, "step": 5009 }, { "epoch": 2.2266666666666666, "grad_norm": 2.245788335800171, "learning_rate": 0.00011103202846975089, "loss": 2.0664, "step": 5010 }, { "epoch": 2.2271111111111113, "grad_norm": 2.4176290035247803, "learning_rate": 0.00011101423487544485, "loss": 1.986, "step": 5011 }, { "epoch": 2.2275555555555555, "grad_norm": 2.397927761077881, "learning_rate": 0.00011099644128113879, "loss": 1.8439, "step": 5012 }, { "epoch": 2.228, "grad_norm": 2.479949951171875, "learning_rate": 0.00011097864768683275, "loss": 1.7942, "step": 5013 }, { "epoch": 2.2284444444444444, "grad_norm": 2.213899850845337, "learning_rate": 0.0001109608540925267, "loss": 1.8292, "step": 5014 }, { "epoch": 2.2288888888888887, "grad_norm": 2.117074728012085, "learning_rate": 0.00011094306049822065, "loss": 1.8464, "step": 5015 }, { "epoch": 2.2293333333333334, "grad_norm": 2.4568095207214355, "learning_rate": 0.0001109252669039146, "loss": 1.7872, "step": 5016 }, { "epoch": 2.2297777777777776, "grad_norm": 2.0903635025024414, "learning_rate": 0.00011090747330960856, "loss": 1.4369, "step": 5017 }, { "epoch": 2.2302222222222223, "grad_norm": 2.1136090755462646, "learning_rate": 0.00011088967971530249, "loss": 1.8091, "step": 5018 }, { "epoch": 2.2306666666666666, "grad_norm": 1.9191817045211792, "learning_rate": 0.00011087188612099643, "loss": 1.9579, "step": 5019 }, { "epoch": 2.2311111111111113, "grad_norm": 2.3440165519714355, "learning_rate": 0.00011085409252669039, "loss": 1.9823, "step": 5020 }, { "epoch": 2.2315555555555555, "grad_norm": 2.085297107696533, "learning_rate": 0.00011083629893238435, "loss": 1.8584, "step": 5021 }, { "epoch": 2.232, "grad_norm": 2.513420820236206, "learning_rate": 0.00011081850533807829, "loss": 2.2256, "step": 5022 }, { "epoch": 2.2324444444444445, "grad_norm": 2.3045380115509033, "learning_rate": 0.00011080071174377225, "loss": 1.4638, "step": 5023 }, { "epoch": 2.2328888888888887, "grad_norm": 2.3060178756713867, "learning_rate": 0.0001107829181494662, "loss": 1.8818, "step": 5024 }, { "epoch": 2.2333333333333334, "grad_norm": 2.0273396968841553, "learning_rate": 0.00011076512455516015, "loss": 1.5125, "step": 5025 }, { "epoch": 2.2337777777777776, "grad_norm": 2.183797597885132, "learning_rate": 0.0001107473309608541, "loss": 1.4318, "step": 5026 }, { "epoch": 2.2342222222222223, "grad_norm": 2.43717098236084, "learning_rate": 0.00011072953736654806, "loss": 1.7804, "step": 5027 }, { "epoch": 2.2346666666666666, "grad_norm": 2.276435375213623, "learning_rate": 0.000110711743772242, "loss": 1.5594, "step": 5028 }, { "epoch": 2.2351111111111113, "grad_norm": 2.2554962635040283, "learning_rate": 0.00011069395017793596, "loss": 1.4094, "step": 5029 }, { "epoch": 2.2355555555555555, "grad_norm": 2.3153090476989746, "learning_rate": 0.00011067615658362991, "loss": 1.3073, "step": 5030 }, { "epoch": 2.2359999999999998, "grad_norm": 2.353789806365967, "learning_rate": 0.00011065836298932384, "loss": 1.5179, "step": 5031 }, { "epoch": 2.2364444444444445, "grad_norm": 2.6622314453125, "learning_rate": 0.00011064056939501779, "loss": 1.9487, "step": 5032 }, { "epoch": 2.2368888888888887, "grad_norm": 2.1066958904266357, "learning_rate": 0.00011062277580071174, "loss": 1.3317, "step": 5033 }, { "epoch": 2.2373333333333334, "grad_norm": 2.2060763835906982, "learning_rate": 0.0001106049822064057, "loss": 0.9029, "step": 5034 }, { "epoch": 2.2377777777777776, "grad_norm": 0.20760299265384674, "learning_rate": 0.00011058718861209964, "loss": 0.037, "step": 5035 }, { "epoch": 2.2382222222222223, "grad_norm": 2.2269294261932373, "learning_rate": 0.0001105693950177936, "loss": 1.4259, "step": 5036 }, { "epoch": 2.2386666666666666, "grad_norm": 2.511350631713867, "learning_rate": 0.00011055160142348756, "loss": 2.0862, "step": 5037 }, { "epoch": 2.2391111111111113, "grad_norm": 2.4969749450683594, "learning_rate": 0.0001105338078291815, "loss": 1.4302, "step": 5038 }, { "epoch": 2.2395555555555555, "grad_norm": 2.8733372688293457, "learning_rate": 0.00011051601423487546, "loss": 1.4739, "step": 5039 }, { "epoch": 2.24, "grad_norm": 2.5586845874786377, "learning_rate": 0.00011049822064056941, "loss": 1.6102, "step": 5040 }, { "epoch": 2.2404444444444445, "grad_norm": 3.037733554840088, "learning_rate": 0.00011048042704626335, "loss": 1.6267, "step": 5041 }, { "epoch": 2.2408888888888887, "grad_norm": 2.6196389198303223, "learning_rate": 0.00011046263345195731, "loss": 1.3955, "step": 5042 }, { "epoch": 2.2413333333333334, "grad_norm": 2.608210325241089, "learning_rate": 0.00011044483985765127, "loss": 1.8584, "step": 5043 }, { "epoch": 2.2417777777777776, "grad_norm": 2.4374005794525146, "learning_rate": 0.0001104270462633452, "loss": 1.1662, "step": 5044 }, { "epoch": 2.2422222222222223, "grad_norm": 2.6647913455963135, "learning_rate": 0.00011040925266903914, "loss": 1.7048, "step": 5045 }, { "epoch": 2.2426666666666666, "grad_norm": 3.091082811355591, "learning_rate": 0.0001103914590747331, "loss": 1.6592, "step": 5046 }, { "epoch": 2.2431111111111113, "grad_norm": 3.078878164291382, "learning_rate": 0.00011037366548042705, "loss": 1.6601, "step": 5047 }, { "epoch": 2.2435555555555555, "grad_norm": 2.757037401199341, "learning_rate": 0.000110355871886121, "loss": 1.3657, "step": 5048 }, { "epoch": 2.2439999999999998, "grad_norm": 4.261408805847168, "learning_rate": 0.00011033807829181495, "loss": 1.2643, "step": 5049 }, { "epoch": 2.2444444444444445, "grad_norm": 4.232182025909424, "learning_rate": 0.00011032028469750891, "loss": 1.125, "step": 5050 }, { "epoch": 2.2448888888888887, "grad_norm": 1.8265682458877563, "learning_rate": 0.00011030249110320285, "loss": 2.396, "step": 5051 }, { "epoch": 2.2453333333333334, "grad_norm": 2.1252670288085938, "learning_rate": 0.00011028469750889681, "loss": 2.0516, "step": 5052 }, { "epoch": 2.2457777777777777, "grad_norm": 2.073094129562378, "learning_rate": 0.00011026690391459075, "loss": 2.2738, "step": 5053 }, { "epoch": 2.2462222222222223, "grad_norm": 2.2928173542022705, "learning_rate": 0.00011024911032028471, "loss": 1.8305, "step": 5054 }, { "epoch": 2.2466666666666666, "grad_norm": 2.276385545730591, "learning_rate": 0.00011023131672597866, "loss": 2.1467, "step": 5055 }, { "epoch": 2.2471111111111113, "grad_norm": 1.8952126502990723, "learning_rate": 0.00011021352313167261, "loss": 1.6379, "step": 5056 }, { "epoch": 2.2475555555555555, "grad_norm": 2.017606735229492, "learning_rate": 0.00011019572953736654, "loss": 2.0341, "step": 5057 }, { "epoch": 2.248, "grad_norm": 2.055772542953491, "learning_rate": 0.00011017793594306049, "loss": 2.0079, "step": 5058 }, { "epoch": 2.2484444444444445, "grad_norm": 2.386974334716797, "learning_rate": 0.00011016014234875445, "loss": 2.0906, "step": 5059 }, { "epoch": 2.2488888888888887, "grad_norm": 2.5190494060516357, "learning_rate": 0.00011014234875444839, "loss": 0.848, "step": 5060 }, { "epoch": 2.2493333333333334, "grad_norm": 1.9555598497390747, "learning_rate": 0.00011012455516014235, "loss": 1.8228, "step": 5061 }, { "epoch": 2.2497777777777777, "grad_norm": 2.0160269737243652, "learning_rate": 0.0001101067615658363, "loss": 1.8248, "step": 5062 }, { "epoch": 2.2502222222222223, "grad_norm": 1.9360467195510864, "learning_rate": 0.00011008896797153025, "loss": 1.5145, "step": 5063 }, { "epoch": 2.2506666666666666, "grad_norm": 2.0160787105560303, "learning_rate": 0.0001100711743772242, "loss": 1.72, "step": 5064 }, { "epoch": 2.2511111111111113, "grad_norm": 2.07167649269104, "learning_rate": 0.00011005338078291816, "loss": 1.5055, "step": 5065 }, { "epoch": 2.2515555555555555, "grad_norm": 2.21230149269104, "learning_rate": 0.0001100355871886121, "loss": 1.9066, "step": 5066 }, { "epoch": 2.252, "grad_norm": 1.537184476852417, "learning_rate": 0.00011001779359430606, "loss": 0.8466, "step": 5067 }, { "epoch": 2.2524444444444445, "grad_norm": 1.7883505821228027, "learning_rate": 0.00011000000000000002, "loss": 1.1587, "step": 5068 }, { "epoch": 2.2528888888888887, "grad_norm": 2.396481990814209, "learning_rate": 0.00010998220640569396, "loss": 2.3124, "step": 5069 }, { "epoch": 2.2533333333333334, "grad_norm": 2.4314522743225098, "learning_rate": 0.00010996441281138789, "loss": 1.9213, "step": 5070 }, { "epoch": 2.2537777777777777, "grad_norm": 2.3176066875457764, "learning_rate": 0.00010994661921708185, "loss": 1.7673, "step": 5071 }, { "epoch": 2.2542222222222223, "grad_norm": 2.6162097454071045, "learning_rate": 0.0001099288256227758, "loss": 1.8331, "step": 5072 }, { "epoch": 2.2546666666666666, "grad_norm": 2.079402446746826, "learning_rate": 0.00010991103202846975, "loss": 1.7033, "step": 5073 }, { "epoch": 2.2551111111111113, "grad_norm": 2.6413557529449463, "learning_rate": 0.0001098932384341637, "loss": 2.1742, "step": 5074 }, { "epoch": 2.2555555555555555, "grad_norm": 2.4262351989746094, "learning_rate": 0.00010987544483985766, "loss": 1.8914, "step": 5075 }, { "epoch": 2.2560000000000002, "grad_norm": 2.247837543487549, "learning_rate": 0.0001098576512455516, "loss": 1.7544, "step": 5076 }, { "epoch": 2.2564444444444445, "grad_norm": 2.4542994499206543, "learning_rate": 0.00010983985765124556, "loss": 1.5624, "step": 5077 }, { "epoch": 2.2568888888888887, "grad_norm": 2.1061196327209473, "learning_rate": 0.00010982206405693951, "loss": 1.4963, "step": 5078 }, { "epoch": 2.2573333333333334, "grad_norm": 2.599663257598877, "learning_rate": 0.00010980427046263346, "loss": 2.1344, "step": 5079 }, { "epoch": 2.2577777777777777, "grad_norm": 2.2885448932647705, "learning_rate": 0.00010978647686832741, "loss": 1.6295, "step": 5080 }, { "epoch": 2.2582222222222224, "grad_norm": 2.3721494674682617, "learning_rate": 0.00010976868327402137, "loss": 1.7371, "step": 5081 }, { "epoch": 2.2586666666666666, "grad_norm": 2.5894291400909424, "learning_rate": 0.00010975088967971531, "loss": 1.7291, "step": 5082 }, { "epoch": 2.2591111111111113, "grad_norm": 2.5275352001190186, "learning_rate": 0.00010973309608540924, "loss": 1.6438, "step": 5083 }, { "epoch": 2.2595555555555555, "grad_norm": 2.664001941680908, "learning_rate": 0.0001097153024911032, "loss": 1.561, "step": 5084 }, { "epoch": 2.26, "grad_norm": 2.6688332557678223, "learning_rate": 0.00010969750889679716, "loss": 1.6883, "step": 5085 }, { "epoch": 2.2604444444444445, "grad_norm": 3.350391387939453, "learning_rate": 0.0001096797153024911, "loss": 1.7163, "step": 5086 }, { "epoch": 2.2608888888888887, "grad_norm": 2.204253911972046, "learning_rate": 0.00010966192170818506, "loss": 1.3111, "step": 5087 }, { "epoch": 2.2613333333333334, "grad_norm": 2.883998394012451, "learning_rate": 0.00010964412811387901, "loss": 2.3049, "step": 5088 }, { "epoch": 2.2617777777777777, "grad_norm": 3.0937411785125732, "learning_rate": 0.00010962633451957295, "loss": 2.2407, "step": 5089 }, { "epoch": 2.2622222222222224, "grad_norm": 2.6656394004821777, "learning_rate": 0.00010960854092526691, "loss": 1.4929, "step": 5090 }, { "epoch": 2.2626666666666666, "grad_norm": 2.6025304794311523, "learning_rate": 0.00010959074733096087, "loss": 1.6213, "step": 5091 }, { "epoch": 2.2631111111111113, "grad_norm": 2.685457706451416, "learning_rate": 0.00010957295373665481, "loss": 1.5779, "step": 5092 }, { "epoch": 2.2635555555555555, "grad_norm": 2.978306531906128, "learning_rate": 0.00010955516014234877, "loss": 1.9549, "step": 5093 }, { "epoch": 2.2640000000000002, "grad_norm": 3.0156309604644775, "learning_rate": 0.00010953736654804272, "loss": 1.837, "step": 5094 }, { "epoch": 2.2644444444444445, "grad_norm": 2.647887706756592, "learning_rate": 0.00010951957295373667, "loss": 1.496, "step": 5095 }, { "epoch": 2.2648888888888887, "grad_norm": 3.1577885150909424, "learning_rate": 0.0001095017793594306, "loss": 1.7638, "step": 5096 }, { "epoch": 2.2653333333333334, "grad_norm": 3.2039244174957275, "learning_rate": 0.00010948398576512455, "loss": 1.8427, "step": 5097 }, { "epoch": 2.2657777777777777, "grad_norm": 3.6146154403686523, "learning_rate": 0.00010946619217081851, "loss": 1.961, "step": 5098 }, { "epoch": 2.2662222222222224, "grad_norm": 2.9038968086242676, "learning_rate": 0.00010944839857651245, "loss": 1.731, "step": 5099 }, { "epoch": 2.2666666666666666, "grad_norm": 1.7584317922592163, "learning_rate": 0.00010943060498220641, "loss": 0.5946, "step": 5100 }, { "epoch": 2.2671111111111113, "grad_norm": 1.3620789051055908, "learning_rate": 0.00010941281138790037, "loss": 1.1835, "step": 5101 }, { "epoch": 2.2675555555555555, "grad_norm": 1.787398338317871, "learning_rate": 0.00010939501779359431, "loss": 2.3021, "step": 5102 }, { "epoch": 2.268, "grad_norm": 2.0760881900787354, "learning_rate": 0.00010937722419928826, "loss": 2.3895, "step": 5103 }, { "epoch": 2.2684444444444445, "grad_norm": 1.6183782815933228, "learning_rate": 0.00010935943060498222, "loss": 1.7555, "step": 5104 }, { "epoch": 2.2688888888888887, "grad_norm": 2.2935304641723633, "learning_rate": 0.00010934163701067616, "loss": 2.1392, "step": 5105 }, { "epoch": 2.2693333333333334, "grad_norm": 2.3295974731445312, "learning_rate": 0.00010932384341637012, "loss": 1.9799, "step": 5106 }, { "epoch": 2.2697777777777777, "grad_norm": 2.144185781478882, "learning_rate": 0.00010930604982206408, "loss": 1.6646, "step": 5107 }, { "epoch": 2.2702222222222224, "grad_norm": 1.8681271076202393, "learning_rate": 0.000109288256227758, "loss": 1.7238, "step": 5108 }, { "epoch": 2.2706666666666666, "grad_norm": 1.9623098373413086, "learning_rate": 0.00010927046263345195, "loss": 1.5286, "step": 5109 }, { "epoch": 2.2711111111111113, "grad_norm": 2.0515475273132324, "learning_rate": 0.0001092526690391459, "loss": 1.853, "step": 5110 }, { "epoch": 2.2715555555555556, "grad_norm": 2.088438034057617, "learning_rate": 0.00010923487544483986, "loss": 1.907, "step": 5111 }, { "epoch": 2.2720000000000002, "grad_norm": 2.4442410469055176, "learning_rate": 0.0001092170818505338, "loss": 2.044, "step": 5112 }, { "epoch": 2.2724444444444445, "grad_norm": 2.190702199935913, "learning_rate": 0.00010919928825622776, "loss": 2.2709, "step": 5113 }, { "epoch": 2.2728888888888887, "grad_norm": 2.703242778778076, "learning_rate": 0.00010918149466192172, "loss": 2.2026, "step": 5114 }, { "epoch": 2.2733333333333334, "grad_norm": 2.1000590324401855, "learning_rate": 0.00010916370106761566, "loss": 1.552, "step": 5115 }, { "epoch": 2.2737777777777777, "grad_norm": 2.5693249702453613, "learning_rate": 0.00010914590747330962, "loss": 2.2501, "step": 5116 }, { "epoch": 2.2742222222222224, "grad_norm": 2.3385515213012695, "learning_rate": 0.00010912811387900357, "loss": 2.283, "step": 5117 }, { "epoch": 2.2746666666666666, "grad_norm": 1.8569934368133545, "learning_rate": 0.00010911032028469752, "loss": 1.5728, "step": 5118 }, { "epoch": 2.2751111111111113, "grad_norm": 2.1497058868408203, "learning_rate": 0.00010909252669039147, "loss": 1.6939, "step": 5119 }, { "epoch": 2.2755555555555556, "grad_norm": 2.399209499359131, "learning_rate": 0.00010907473309608543, "loss": 2.1534, "step": 5120 }, { "epoch": 2.276, "grad_norm": 2.810572624206543, "learning_rate": 0.00010905693950177936, "loss": 1.8042, "step": 5121 }, { "epoch": 2.2764444444444445, "grad_norm": 2.422611713409424, "learning_rate": 0.0001090391459074733, "loss": 1.7979, "step": 5122 }, { "epoch": 2.2768888888888887, "grad_norm": 2.5435221195220947, "learning_rate": 0.00010902135231316726, "loss": 1.8007, "step": 5123 }, { "epoch": 2.2773333333333334, "grad_norm": 2.7877004146575928, "learning_rate": 0.00010900355871886122, "loss": 1.9082, "step": 5124 }, { "epoch": 2.2777777777777777, "grad_norm": 2.4548027515411377, "learning_rate": 0.00010898576512455516, "loss": 1.4372, "step": 5125 }, { "epoch": 2.2782222222222224, "grad_norm": 2.1647870540618896, "learning_rate": 0.00010896797153024911, "loss": 1.5144, "step": 5126 }, { "epoch": 2.2786666666666666, "grad_norm": 2.2049877643585205, "learning_rate": 0.00010895017793594307, "loss": 1.278, "step": 5127 }, { "epoch": 2.279111111111111, "grad_norm": 2.6493771076202393, "learning_rate": 0.00010893238434163701, "loss": 2.1244, "step": 5128 }, { "epoch": 2.2795555555555556, "grad_norm": 2.0453760623931885, "learning_rate": 0.00010891459074733097, "loss": 0.8898, "step": 5129 }, { "epoch": 2.2800000000000002, "grad_norm": 2.6239845752716064, "learning_rate": 0.00010889679715302493, "loss": 1.7447, "step": 5130 }, { "epoch": 2.2804444444444445, "grad_norm": 1.8818942308425903, "learning_rate": 0.00010887900355871887, "loss": 0.8104, "step": 5131 }, { "epoch": 2.2808888888888887, "grad_norm": 2.51438045501709, "learning_rate": 0.00010886120996441283, "loss": 1.4221, "step": 5132 }, { "epoch": 2.2813333333333334, "grad_norm": 2.23756742477417, "learning_rate": 0.00010884341637010678, "loss": 1.3229, "step": 5133 }, { "epoch": 2.2817777777777777, "grad_norm": 2.578627109527588, "learning_rate": 0.00010882562277580071, "loss": 1.679, "step": 5134 }, { "epoch": 2.2822222222222224, "grad_norm": 2.4972848892211914, "learning_rate": 0.00010880782918149466, "loss": 1.0042, "step": 5135 }, { "epoch": 2.2826666666666666, "grad_norm": 2.4864046573638916, "learning_rate": 0.00010879003558718861, "loss": 1.6573, "step": 5136 }, { "epoch": 2.2831111111111113, "grad_norm": 1.8569329977035522, "learning_rate": 0.00010877224199288257, "loss": 0.6083, "step": 5137 }, { "epoch": 2.2835555555555556, "grad_norm": 2.6191885471343994, "learning_rate": 0.00010875444839857651, "loss": 1.7911, "step": 5138 }, { "epoch": 2.284, "grad_norm": 2.874465227127075, "learning_rate": 0.00010873665480427047, "loss": 1.9264, "step": 5139 }, { "epoch": 2.2844444444444445, "grad_norm": 2.813215494155884, "learning_rate": 0.00010871886120996441, "loss": 1.6019, "step": 5140 }, { "epoch": 2.2848888888888887, "grad_norm": 2.852952003479004, "learning_rate": 0.00010870106761565837, "loss": 2.0002, "step": 5141 }, { "epoch": 2.2853333333333334, "grad_norm": 3.3380825519561768, "learning_rate": 0.00010868327402135232, "loss": 1.6685, "step": 5142 }, { "epoch": 2.2857777777777777, "grad_norm": 2.840141534805298, "learning_rate": 0.00010866548042704627, "loss": 1.6248, "step": 5143 }, { "epoch": 2.2862222222222224, "grad_norm": 2.88301420211792, "learning_rate": 0.00010864768683274022, "loss": 1.3737, "step": 5144 }, { "epoch": 2.2866666666666666, "grad_norm": 3.1075048446655273, "learning_rate": 0.00010862989323843418, "loss": 1.678, "step": 5145 }, { "epoch": 2.287111111111111, "grad_norm": 3.333651542663574, "learning_rate": 0.00010861209964412812, "loss": 2.0464, "step": 5146 }, { "epoch": 2.2875555555555556, "grad_norm": 4.473095417022705, "learning_rate": 0.00010859430604982205, "loss": 1.6364, "step": 5147 }, { "epoch": 2.288, "grad_norm": 2.844916343688965, "learning_rate": 0.00010857651245551601, "loss": 1.8926, "step": 5148 }, { "epoch": 2.2884444444444445, "grad_norm": 5.156615257263184, "learning_rate": 0.00010855871886120997, "loss": 1.9084, "step": 5149 }, { "epoch": 2.2888888888888888, "grad_norm": 3.648468494415283, "learning_rate": 0.00010854092526690391, "loss": 0.7005, "step": 5150 }, { "epoch": 2.2893333333333334, "grad_norm": 2.095301628112793, "learning_rate": 0.00010852313167259786, "loss": 1.952, "step": 5151 }, { "epoch": 2.2897777777777777, "grad_norm": 1.9257947206497192, "learning_rate": 0.00010850533807829182, "loss": 1.4345, "step": 5152 }, { "epoch": 2.2902222222222224, "grad_norm": 2.2104406356811523, "learning_rate": 0.00010848754448398576, "loss": 1.9751, "step": 5153 }, { "epoch": 2.2906666666666666, "grad_norm": 1.96810781955719, "learning_rate": 0.00010846975088967972, "loss": 2.024, "step": 5154 }, { "epoch": 2.2911111111111113, "grad_norm": 2.201118230819702, "learning_rate": 0.00010845195729537368, "loss": 1.8283, "step": 5155 }, { "epoch": 2.2915555555555556, "grad_norm": 2.238771438598633, "learning_rate": 0.00010843416370106762, "loss": 1.6468, "step": 5156 }, { "epoch": 2.292, "grad_norm": 2.4652698040008545, "learning_rate": 0.00010841637010676158, "loss": 2.0724, "step": 5157 }, { "epoch": 2.2924444444444445, "grad_norm": 1.8881862163543701, "learning_rate": 0.00010839857651245553, "loss": 1.2482, "step": 5158 }, { "epoch": 2.2928888888888888, "grad_norm": 2.0697972774505615, "learning_rate": 0.00010838078291814948, "loss": 2.2032, "step": 5159 }, { "epoch": 2.2933333333333334, "grad_norm": 2.31854510307312, "learning_rate": 0.0001083629893238434, "loss": 1.9085, "step": 5160 }, { "epoch": 2.2937777777777777, "grad_norm": 2.013096332550049, "learning_rate": 0.00010834519572953736, "loss": 1.2551, "step": 5161 }, { "epoch": 2.2942222222222224, "grad_norm": 2.2634682655334473, "learning_rate": 0.00010832740213523132, "loss": 1.5858, "step": 5162 }, { "epoch": 2.2946666666666666, "grad_norm": 2.027539014816284, "learning_rate": 0.00010830960854092526, "loss": 1.3721, "step": 5163 }, { "epoch": 2.295111111111111, "grad_norm": 2.623805284500122, "learning_rate": 0.00010829181494661922, "loss": 2.0648, "step": 5164 }, { "epoch": 2.2955555555555556, "grad_norm": 1.4081056118011475, "learning_rate": 0.00010827402135231317, "loss": 0.8012, "step": 5165 }, { "epoch": 2.296, "grad_norm": 1.6935027837753296, "learning_rate": 0.00010825622775800712, "loss": 1.0633, "step": 5166 }, { "epoch": 2.2964444444444445, "grad_norm": 2.5808827877044678, "learning_rate": 0.00010823843416370107, "loss": 1.3108, "step": 5167 }, { "epoch": 2.2968888888888888, "grad_norm": 2.5715770721435547, "learning_rate": 0.00010822064056939503, "loss": 1.5686, "step": 5168 }, { "epoch": 2.2973333333333334, "grad_norm": 2.925513505935669, "learning_rate": 0.00010820284697508897, "loss": 1.9211, "step": 5169 }, { "epoch": 2.2977777777777777, "grad_norm": 2.3410723209381104, "learning_rate": 0.00010818505338078293, "loss": 1.7527, "step": 5170 }, { "epoch": 2.2982222222222224, "grad_norm": 2.2690529823303223, "learning_rate": 0.00010816725978647689, "loss": 1.5536, "step": 5171 }, { "epoch": 2.2986666666666666, "grad_norm": 2.3402512073516846, "learning_rate": 0.00010814946619217083, "loss": 1.6756, "step": 5172 }, { "epoch": 2.2991111111111113, "grad_norm": 2.567690849304199, "learning_rate": 0.00010813167259786476, "loss": 2.1322, "step": 5173 }, { "epoch": 2.2995555555555556, "grad_norm": 2.5583088397979736, "learning_rate": 0.00010811387900355872, "loss": 1.7518, "step": 5174 }, { "epoch": 2.3, "grad_norm": 2.2459707260131836, "learning_rate": 0.00010809608540925267, "loss": 1.4129, "step": 5175 }, { "epoch": 2.3004444444444445, "grad_norm": 3.016340970993042, "learning_rate": 0.00010807829181494661, "loss": 1.8386, "step": 5176 }, { "epoch": 2.3008888888888888, "grad_norm": 2.6692094802856445, "learning_rate": 0.00010806049822064057, "loss": 1.5559, "step": 5177 }, { "epoch": 2.3013333333333335, "grad_norm": 2.3022642135620117, "learning_rate": 0.00010804270462633453, "loss": 1.4982, "step": 5178 }, { "epoch": 2.3017777777777777, "grad_norm": 2.4145843982696533, "learning_rate": 0.00010802491103202847, "loss": 1.8678, "step": 5179 }, { "epoch": 2.3022222222222224, "grad_norm": 2.2984983921051025, "learning_rate": 0.00010800711743772243, "loss": 1.606, "step": 5180 }, { "epoch": 2.3026666666666666, "grad_norm": 2.6932058334350586, "learning_rate": 0.00010798932384341638, "loss": 1.5845, "step": 5181 }, { "epoch": 2.303111111111111, "grad_norm": 2.256645441055298, "learning_rate": 0.00010797153024911033, "loss": 1.7118, "step": 5182 }, { "epoch": 2.3035555555555556, "grad_norm": 2.5675642490386963, "learning_rate": 0.00010795373665480428, "loss": 1.3647, "step": 5183 }, { "epoch": 2.304, "grad_norm": 2.930628776550293, "learning_rate": 0.00010793594306049824, "loss": 1.4687, "step": 5184 }, { "epoch": 2.3044444444444445, "grad_norm": 3.3996012210845947, "learning_rate": 0.00010791814946619218, "loss": 1.5153, "step": 5185 }, { "epoch": 2.3048888888888888, "grad_norm": 4.883398532867432, "learning_rate": 0.00010790035587188611, "loss": 1.8317, "step": 5186 }, { "epoch": 2.3053333333333335, "grad_norm": 2.2228214740753174, "learning_rate": 0.00010788256227758007, "loss": 1.1171, "step": 5187 }, { "epoch": 2.3057777777777777, "grad_norm": 2.6813435554504395, "learning_rate": 0.00010786476868327402, "loss": 1.5585, "step": 5188 }, { "epoch": 2.3062222222222224, "grad_norm": 2.6681740283966064, "learning_rate": 0.00010784697508896797, "loss": 1.6806, "step": 5189 }, { "epoch": 2.3066666666666666, "grad_norm": 2.8957736492156982, "learning_rate": 0.00010782918149466192, "loss": 1.8597, "step": 5190 }, { "epoch": 2.3071111111111113, "grad_norm": 2.954521656036377, "learning_rate": 0.00010781138790035588, "loss": 1.6892, "step": 5191 }, { "epoch": 2.3075555555555556, "grad_norm": 3.0856049060821533, "learning_rate": 0.00010779359430604982, "loss": 1.5892, "step": 5192 }, { "epoch": 2.308, "grad_norm": 3.5431087017059326, "learning_rate": 0.00010777580071174378, "loss": 1.9527, "step": 5193 }, { "epoch": 2.3084444444444445, "grad_norm": 2.8147850036621094, "learning_rate": 0.00010775800711743774, "loss": 1.6474, "step": 5194 }, { "epoch": 2.3088888888888888, "grad_norm": 2.9207980632781982, "learning_rate": 0.00010774021352313168, "loss": 1.7115, "step": 5195 }, { "epoch": 2.3093333333333335, "grad_norm": 1.972179651260376, "learning_rate": 0.00010772241992882564, "loss": 0.7474, "step": 5196 }, { "epoch": 2.3097777777777777, "grad_norm": 3.6194729804992676, "learning_rate": 0.00010770462633451959, "loss": 1.577, "step": 5197 }, { "epoch": 2.3102222222222224, "grad_norm": 3.8124332427978516, "learning_rate": 0.00010768683274021354, "loss": 1.6077, "step": 5198 }, { "epoch": 2.3106666666666666, "grad_norm": 4.198216438293457, "learning_rate": 0.00010766903914590746, "loss": 2.2859, "step": 5199 }, { "epoch": 2.311111111111111, "grad_norm": 3.1311981678009033, "learning_rate": 0.00010765124555160142, "loss": 1.2557, "step": 5200 }, { "epoch": 2.3115555555555556, "grad_norm": 1.368379831314087, "learning_rate": 0.00010763345195729538, "loss": 1.233, "step": 5201 }, { "epoch": 2.312, "grad_norm": 2.020923376083374, "learning_rate": 0.00010761565836298932, "loss": 1.9648, "step": 5202 }, { "epoch": 2.3124444444444445, "grad_norm": 2.088132619857788, "learning_rate": 0.00010759786476868328, "loss": 2.2838, "step": 5203 }, { "epoch": 2.3128888888888888, "grad_norm": 2.1785659790039062, "learning_rate": 0.00010758007117437723, "loss": 2.1716, "step": 5204 }, { "epoch": 2.3133333333333335, "grad_norm": 2.086555004119873, "learning_rate": 0.00010756227758007118, "loss": 1.9242, "step": 5205 }, { "epoch": 2.3137777777777777, "grad_norm": 2.2909762859344482, "learning_rate": 0.00010754448398576513, "loss": 2.3151, "step": 5206 }, { "epoch": 2.3142222222222224, "grad_norm": 2.3706085681915283, "learning_rate": 0.00010752669039145909, "loss": 2.0996, "step": 5207 }, { "epoch": 2.3146666666666667, "grad_norm": 2.0024471282958984, "learning_rate": 0.00010750889679715303, "loss": 1.8437, "step": 5208 }, { "epoch": 2.3151111111111113, "grad_norm": 2.22165584564209, "learning_rate": 0.00010749110320284699, "loss": 1.6399, "step": 5209 }, { "epoch": 2.3155555555555556, "grad_norm": 2.4060609340667725, "learning_rate": 0.00010747330960854095, "loss": 1.4666, "step": 5210 }, { "epoch": 2.316, "grad_norm": 0.6937952041625977, "learning_rate": 0.00010745551601423489, "loss": 0.0337, "step": 5211 }, { "epoch": 2.3164444444444445, "grad_norm": 2.10048508644104, "learning_rate": 0.00010743772241992882, "loss": 2.2067, "step": 5212 }, { "epoch": 2.3168888888888888, "grad_norm": 2.3263375759124756, "learning_rate": 0.00010741992882562277, "loss": 2.0357, "step": 5213 }, { "epoch": 2.3173333333333335, "grad_norm": 1.9152144193649292, "learning_rate": 0.00010740213523131673, "loss": 1.9073, "step": 5214 }, { "epoch": 2.3177777777777777, "grad_norm": 2.3582661151885986, "learning_rate": 0.00010738434163701067, "loss": 1.9668, "step": 5215 }, { "epoch": 2.3182222222222224, "grad_norm": 2.1141693592071533, "learning_rate": 0.00010736654804270463, "loss": 1.6874, "step": 5216 }, { "epoch": 2.3186666666666667, "grad_norm": 2.0202839374542236, "learning_rate": 0.00010734875444839859, "loss": 1.7318, "step": 5217 }, { "epoch": 2.319111111111111, "grad_norm": 2.4021458625793457, "learning_rate": 0.00010733096085409253, "loss": 1.9969, "step": 5218 }, { "epoch": 2.3195555555555556, "grad_norm": 1.3853273391723633, "learning_rate": 0.00010731316725978649, "loss": 0.8318, "step": 5219 }, { "epoch": 2.32, "grad_norm": 1.6327898502349854, "learning_rate": 0.00010729537366548044, "loss": 1.3863, "step": 5220 }, { "epoch": 2.3204444444444445, "grad_norm": 2.009892463684082, "learning_rate": 0.00010727758007117439, "loss": 1.5846, "step": 5221 }, { "epoch": 2.320888888888889, "grad_norm": 2.0592620372772217, "learning_rate": 0.00010725978647686834, "loss": 1.6894, "step": 5222 }, { "epoch": 2.3213333333333335, "grad_norm": 2.1385364532470703, "learning_rate": 0.0001072419928825623, "loss": 1.6496, "step": 5223 }, { "epoch": 2.3217777777777777, "grad_norm": 2.383230209350586, "learning_rate": 0.00010722419928825623, "loss": 1.707, "step": 5224 }, { "epoch": 2.3222222222222224, "grad_norm": 2.0533084869384766, "learning_rate": 0.00010720640569395017, "loss": 1.4989, "step": 5225 }, { "epoch": 2.3226666666666667, "grad_norm": 2.2495386600494385, "learning_rate": 0.00010718861209964413, "loss": 1.7552, "step": 5226 }, { "epoch": 2.3231111111111113, "grad_norm": 2.7372584342956543, "learning_rate": 0.00010717081850533808, "loss": 1.5627, "step": 5227 }, { "epoch": 2.3235555555555556, "grad_norm": 2.1765596866607666, "learning_rate": 0.00010715302491103203, "loss": 1.2697, "step": 5228 }, { "epoch": 2.324, "grad_norm": 2.3925368785858154, "learning_rate": 0.00010713523131672598, "loss": 1.8882, "step": 5229 }, { "epoch": 2.3244444444444445, "grad_norm": 3.3124077320098877, "learning_rate": 0.00010711743772241993, "loss": 1.5998, "step": 5230 }, { "epoch": 2.324888888888889, "grad_norm": 2.3769872188568115, "learning_rate": 0.00010709964412811388, "loss": 1.8299, "step": 5231 }, { "epoch": 2.3253333333333335, "grad_norm": 3.1028130054473877, "learning_rate": 0.00010708185053380784, "loss": 2.4035, "step": 5232 }, { "epoch": 2.3257777777777777, "grad_norm": 2.543940544128418, "learning_rate": 0.00010706405693950178, "loss": 1.2689, "step": 5233 }, { "epoch": 2.3262222222222224, "grad_norm": 2.7707502841949463, "learning_rate": 0.00010704626334519574, "loss": 1.9459, "step": 5234 }, { "epoch": 2.3266666666666667, "grad_norm": 2.408712863922119, "learning_rate": 0.0001070284697508897, "loss": 1.7291, "step": 5235 }, { "epoch": 2.327111111111111, "grad_norm": 3.0360755920410156, "learning_rate": 0.00010701067615658364, "loss": 2.274, "step": 5236 }, { "epoch": 2.3275555555555556, "grad_norm": 2.5077056884765625, "learning_rate": 0.00010699288256227757, "loss": 1.5385, "step": 5237 }, { "epoch": 2.328, "grad_norm": 3.110048770904541, "learning_rate": 0.00010697508896797152, "loss": 1.7831, "step": 5238 }, { "epoch": 2.3284444444444445, "grad_norm": 2.6556756496429443, "learning_rate": 0.00010695729537366548, "loss": 1.9516, "step": 5239 }, { "epoch": 2.328888888888889, "grad_norm": 3.066831350326538, "learning_rate": 0.00010693950177935942, "loss": 1.9794, "step": 5240 }, { "epoch": 2.3293333333333335, "grad_norm": 2.9944040775299072, "learning_rate": 0.00010692170818505338, "loss": 1.2826, "step": 5241 }, { "epoch": 2.3297777777777777, "grad_norm": 2.9339687824249268, "learning_rate": 0.00010690391459074734, "loss": 1.6169, "step": 5242 }, { "epoch": 2.330222222222222, "grad_norm": 2.9092490673065186, "learning_rate": 0.00010688612099644128, "loss": 1.6369, "step": 5243 }, { "epoch": 2.3306666666666667, "grad_norm": 3.148411512374878, "learning_rate": 0.00010686832740213524, "loss": 1.7855, "step": 5244 }, { "epoch": 2.3311111111111114, "grad_norm": 2.889481544494629, "learning_rate": 0.00010685053380782919, "loss": 1.8247, "step": 5245 }, { "epoch": 2.3315555555555556, "grad_norm": 3.1188857555389404, "learning_rate": 0.00010683274021352314, "loss": 1.301, "step": 5246 }, { "epoch": 2.332, "grad_norm": 2.914860486984253, "learning_rate": 0.00010681494661921709, "loss": 1.5213, "step": 5247 }, { "epoch": 2.3324444444444445, "grad_norm": 3.193268060684204, "learning_rate": 0.00010679715302491105, "loss": 1.5691, "step": 5248 }, { "epoch": 2.332888888888889, "grad_norm": 2.0643980503082275, "learning_rate": 0.00010677935943060499, "loss": 0.7877, "step": 5249 }, { "epoch": 2.3333333333333335, "grad_norm": 3.4626474380493164, "learning_rate": 0.00010676156583629892, "loss": 1.5895, "step": 5250 }, { "epoch": 2.3337777777777777, "grad_norm": 1.721472144126892, "learning_rate": 0.00010674377224199288, "loss": 1.8599, "step": 5251 }, { "epoch": 2.3342222222222224, "grad_norm": 2.4358739852905273, "learning_rate": 0.00010672597864768683, "loss": 2.4073, "step": 5252 }, { "epoch": 2.3346666666666667, "grad_norm": 2.0674357414245605, "learning_rate": 0.00010670818505338078, "loss": 1.4055, "step": 5253 }, { "epoch": 2.335111111111111, "grad_norm": 1.9118478298187256, "learning_rate": 0.00010669039145907473, "loss": 1.3337, "step": 5254 }, { "epoch": 2.3355555555555556, "grad_norm": 2.8397645950317383, "learning_rate": 0.00010667259786476869, "loss": 2.0588, "step": 5255 }, { "epoch": 2.336, "grad_norm": 2.2740516662597656, "learning_rate": 0.00010665480427046263, "loss": 1.9389, "step": 5256 }, { "epoch": 2.3364444444444445, "grad_norm": 1.9148417711257935, "learning_rate": 0.00010663701067615659, "loss": 1.1185, "step": 5257 }, { "epoch": 2.336888888888889, "grad_norm": 1.7196369171142578, "learning_rate": 0.00010661921708185055, "loss": 0.9252, "step": 5258 }, { "epoch": 2.3373333333333335, "grad_norm": 2.5018043518066406, "learning_rate": 0.00010660142348754449, "loss": 2.1507, "step": 5259 }, { "epoch": 2.3377777777777777, "grad_norm": 2.2617380619049072, "learning_rate": 0.00010658362989323845, "loss": 1.6547, "step": 5260 }, { "epoch": 2.338222222222222, "grad_norm": 2.1879079341888428, "learning_rate": 0.0001065658362989324, "loss": 1.8825, "step": 5261 }, { "epoch": 2.3386666666666667, "grad_norm": 2.5784716606140137, "learning_rate": 0.00010654804270462634, "loss": 2.1602, "step": 5262 }, { "epoch": 2.339111111111111, "grad_norm": 2.2693116664886475, "learning_rate": 0.00010653024911032027, "loss": 1.818, "step": 5263 }, { "epoch": 2.3395555555555556, "grad_norm": 2.2588093280792236, "learning_rate": 0.00010651245551601423, "loss": 2.0916, "step": 5264 }, { "epoch": 2.34, "grad_norm": 2.266087055206299, "learning_rate": 0.00010649466192170819, "loss": 1.8388, "step": 5265 }, { "epoch": 2.3404444444444445, "grad_norm": 2.1709249019622803, "learning_rate": 0.00010647686832740213, "loss": 1.9512, "step": 5266 }, { "epoch": 2.340888888888889, "grad_norm": 1.934448003768921, "learning_rate": 0.00010645907473309609, "loss": 1.5891, "step": 5267 }, { "epoch": 2.3413333333333335, "grad_norm": 1.595613956451416, "learning_rate": 0.00010644128113879004, "loss": 0.9631, "step": 5268 }, { "epoch": 2.3417777777777777, "grad_norm": 2.1244924068450928, "learning_rate": 0.00010642348754448399, "loss": 1.49, "step": 5269 }, { "epoch": 2.3422222222222224, "grad_norm": 2.107415199279785, "learning_rate": 0.00010640569395017794, "loss": 1.2837, "step": 5270 }, { "epoch": 2.3426666666666667, "grad_norm": 2.0290896892547607, "learning_rate": 0.0001063879003558719, "loss": 1.426, "step": 5271 }, { "epoch": 2.343111111111111, "grad_norm": 1.9935804605484009, "learning_rate": 0.00010637010676156584, "loss": 1.6032, "step": 5272 }, { "epoch": 2.3435555555555556, "grad_norm": 2.401331663131714, "learning_rate": 0.0001063523131672598, "loss": 1.745, "step": 5273 }, { "epoch": 2.344, "grad_norm": 2.32720685005188, "learning_rate": 0.00010633451957295375, "loss": 2.1566, "step": 5274 }, { "epoch": 2.3444444444444446, "grad_norm": 2.5163567066192627, "learning_rate": 0.0001063167259786477, "loss": 1.8647, "step": 5275 }, { "epoch": 2.344888888888889, "grad_norm": 2.6666061878204346, "learning_rate": 0.00010629893238434163, "loss": 2.0087, "step": 5276 }, { "epoch": 2.3453333333333335, "grad_norm": 2.972123622894287, "learning_rate": 0.00010628113879003558, "loss": 2.0574, "step": 5277 }, { "epoch": 2.3457777777777777, "grad_norm": 2.4128878116607666, "learning_rate": 0.00010626334519572954, "loss": 1.4681, "step": 5278 }, { "epoch": 2.346222222222222, "grad_norm": 2.2778286933898926, "learning_rate": 0.00010624555160142348, "loss": 1.6174, "step": 5279 }, { "epoch": 2.3466666666666667, "grad_norm": 2.547231435775757, "learning_rate": 0.00010622775800711744, "loss": 1.6653, "step": 5280 }, { "epoch": 2.347111111111111, "grad_norm": 2.483854055404663, "learning_rate": 0.0001062099644128114, "loss": 1.5825, "step": 5281 }, { "epoch": 2.3475555555555556, "grad_norm": 2.559788465499878, "learning_rate": 0.00010619217081850534, "loss": 2.0953, "step": 5282 }, { "epoch": 2.348, "grad_norm": 3.0145204067230225, "learning_rate": 0.0001061743772241993, "loss": 1.9542, "step": 5283 }, { "epoch": 2.3484444444444446, "grad_norm": 2.558265209197998, "learning_rate": 0.00010615658362989325, "loss": 1.8425, "step": 5284 }, { "epoch": 2.348888888888889, "grad_norm": 2.9224250316619873, "learning_rate": 0.0001061387900355872, "loss": 1.947, "step": 5285 }, { "epoch": 2.3493333333333335, "grad_norm": 2.7229421138763428, "learning_rate": 0.00010612099644128115, "loss": 1.305, "step": 5286 }, { "epoch": 2.3497777777777777, "grad_norm": 2.2449328899383545, "learning_rate": 0.00010610320284697511, "loss": 1.4746, "step": 5287 }, { "epoch": 2.3502222222222224, "grad_norm": 2.956235885620117, "learning_rate": 0.00010608540925266905, "loss": 1.9641, "step": 5288 }, { "epoch": 2.3506666666666667, "grad_norm": 2.308121681213379, "learning_rate": 0.00010606761565836298, "loss": 1.4521, "step": 5289 }, { "epoch": 2.351111111111111, "grad_norm": 2.414666175842285, "learning_rate": 0.00010604982206405694, "loss": 1.0793, "step": 5290 }, { "epoch": 2.3515555555555556, "grad_norm": 3.1062090396881104, "learning_rate": 0.0001060320284697509, "loss": 1.5819, "step": 5291 }, { "epoch": 2.352, "grad_norm": 2.759904146194458, "learning_rate": 0.00010601423487544484, "loss": 2.2146, "step": 5292 }, { "epoch": 2.3524444444444446, "grad_norm": 3.40423321723938, "learning_rate": 0.00010599644128113879, "loss": 1.8593, "step": 5293 }, { "epoch": 2.352888888888889, "grad_norm": 3.157247543334961, "learning_rate": 0.00010597864768683275, "loss": 1.9475, "step": 5294 }, { "epoch": 2.3533333333333335, "grad_norm": 2.931468963623047, "learning_rate": 0.00010596085409252669, "loss": 1.9819, "step": 5295 }, { "epoch": 2.3537777777777777, "grad_norm": 2.865544557571411, "learning_rate": 0.00010594306049822065, "loss": 1.7654, "step": 5296 }, { "epoch": 2.354222222222222, "grad_norm": 3.2829089164733887, "learning_rate": 0.0001059252669039146, "loss": 1.7053, "step": 5297 }, { "epoch": 2.3546666666666667, "grad_norm": 2.915627956390381, "learning_rate": 0.00010590747330960855, "loss": 1.6627, "step": 5298 }, { "epoch": 2.355111111111111, "grad_norm": 2.891852617263794, "learning_rate": 0.0001058896797153025, "loss": 1.4539, "step": 5299 }, { "epoch": 2.3555555555555556, "grad_norm": 3.9856643676757812, "learning_rate": 0.00010587188612099646, "loss": 1.6694, "step": 5300 }, { "epoch": 2.356, "grad_norm": 1.8876063823699951, "learning_rate": 0.0001058540925266904, "loss": 1.9182, "step": 5301 }, { "epoch": 2.3564444444444446, "grad_norm": 1.3082079887390137, "learning_rate": 0.00010583629893238433, "loss": 0.945, "step": 5302 }, { "epoch": 2.356888888888889, "grad_norm": 1.5244293212890625, "learning_rate": 0.00010581850533807829, "loss": 0.8277, "step": 5303 }, { "epoch": 2.3573333333333335, "grad_norm": 2.210871458053589, "learning_rate": 0.00010580071174377225, "loss": 1.937, "step": 5304 }, { "epoch": 2.3577777777777778, "grad_norm": 2.4062864780426025, "learning_rate": 0.00010578291814946619, "loss": 1.9807, "step": 5305 }, { "epoch": 2.3582222222222224, "grad_norm": 2.0000367164611816, "learning_rate": 0.00010576512455516015, "loss": 1.4079, "step": 5306 }, { "epoch": 2.3586666666666667, "grad_norm": 1.9247461557388306, "learning_rate": 0.0001057473309608541, "loss": 1.7219, "step": 5307 }, { "epoch": 2.359111111111111, "grad_norm": 2.2574825286865234, "learning_rate": 0.00010572953736654805, "loss": 1.7616, "step": 5308 }, { "epoch": 2.3595555555555556, "grad_norm": 2.254293203353882, "learning_rate": 0.000105711743772242, "loss": 2.0128, "step": 5309 }, { "epoch": 2.36, "grad_norm": 2.4639298915863037, "learning_rate": 0.00010569395017793596, "loss": 1.9739, "step": 5310 }, { "epoch": 2.3604444444444446, "grad_norm": 2.103483200073242, "learning_rate": 0.0001056761565836299, "loss": 1.91, "step": 5311 }, { "epoch": 2.360888888888889, "grad_norm": 0.13134007155895233, "learning_rate": 0.00010565836298932386, "loss": 0.0197, "step": 5312 }, { "epoch": 2.3613333333333335, "grad_norm": 2.1380348205566406, "learning_rate": 0.00010564056939501781, "loss": 1.7965, "step": 5313 }, { "epoch": 2.3617777777777778, "grad_norm": 1.9932422637939453, "learning_rate": 0.00010562277580071176, "loss": 1.6092, "step": 5314 }, { "epoch": 2.362222222222222, "grad_norm": 2.254859209060669, "learning_rate": 0.00010560498220640569, "loss": 2.1432, "step": 5315 }, { "epoch": 2.3626666666666667, "grad_norm": 2.0134313106536865, "learning_rate": 0.00010558718861209964, "loss": 1.8426, "step": 5316 }, { "epoch": 2.363111111111111, "grad_norm": 2.157883644104004, "learning_rate": 0.0001055693950177936, "loss": 1.5952, "step": 5317 }, { "epoch": 2.3635555555555556, "grad_norm": 2.2276217937469482, "learning_rate": 0.00010555160142348754, "loss": 1.8253, "step": 5318 }, { "epoch": 2.364, "grad_norm": 2.406975269317627, "learning_rate": 0.0001055338078291815, "loss": 1.7489, "step": 5319 }, { "epoch": 2.3644444444444446, "grad_norm": 2.4040451049804688, "learning_rate": 0.00010551601423487544, "loss": 1.7381, "step": 5320 }, { "epoch": 2.364888888888889, "grad_norm": 2.3204426765441895, "learning_rate": 0.0001054982206405694, "loss": 2.0069, "step": 5321 }, { "epoch": 2.3653333333333335, "grad_norm": 2.3594791889190674, "learning_rate": 0.00010548042704626336, "loss": 1.6934, "step": 5322 }, { "epoch": 2.3657777777777778, "grad_norm": 2.970160961151123, "learning_rate": 0.0001054626334519573, "loss": 2.0672, "step": 5323 }, { "epoch": 2.3662222222222224, "grad_norm": 2.4029760360717773, "learning_rate": 0.00010544483985765125, "loss": 1.6066, "step": 5324 }, { "epoch": 2.3666666666666667, "grad_norm": 2.390634059906006, "learning_rate": 0.00010542704626334521, "loss": 1.8098, "step": 5325 }, { "epoch": 2.367111111111111, "grad_norm": 2.158510208129883, "learning_rate": 0.00010540925266903915, "loss": 1.4622, "step": 5326 }, { "epoch": 2.3675555555555556, "grad_norm": 2.2908811569213867, "learning_rate": 0.00010539145907473311, "loss": 1.4533, "step": 5327 }, { "epoch": 2.368, "grad_norm": 2.348654270172119, "learning_rate": 0.00010537366548042704, "loss": 1.7244, "step": 5328 }, { "epoch": 2.3684444444444446, "grad_norm": 2.7863059043884277, "learning_rate": 0.000105355871886121, "loss": 1.9031, "step": 5329 }, { "epoch": 2.368888888888889, "grad_norm": 1.919249415397644, "learning_rate": 0.00010533807829181494, "loss": 0.9351, "step": 5330 }, { "epoch": 2.3693333333333335, "grad_norm": 2.1779160499572754, "learning_rate": 0.0001053202846975089, "loss": 1.311, "step": 5331 }, { "epoch": 2.3697777777777778, "grad_norm": 2.9580140113830566, "learning_rate": 0.00010530249110320285, "loss": 1.8562, "step": 5332 }, { "epoch": 2.370222222222222, "grad_norm": 2.4959969520568848, "learning_rate": 0.0001052846975088968, "loss": 1.3581, "step": 5333 }, { "epoch": 2.3706666666666667, "grad_norm": 2.6963963508605957, "learning_rate": 0.00010526690391459075, "loss": 2.3081, "step": 5334 }, { "epoch": 2.371111111111111, "grad_norm": 2.9075794219970703, "learning_rate": 0.00010524911032028471, "loss": 2.2261, "step": 5335 }, { "epoch": 2.3715555555555556, "grad_norm": 2.346102714538574, "learning_rate": 0.00010523131672597865, "loss": 1.4546, "step": 5336 }, { "epoch": 2.372, "grad_norm": 2.860933542251587, "learning_rate": 0.00010521352313167261, "loss": 1.6149, "step": 5337 }, { "epoch": 2.3724444444444446, "grad_norm": 3.0181329250335693, "learning_rate": 0.00010519572953736656, "loss": 1.9698, "step": 5338 }, { "epoch": 2.372888888888889, "grad_norm": 3.008528709411621, "learning_rate": 0.00010517793594306051, "loss": 1.943, "step": 5339 }, { "epoch": 2.3733333333333335, "grad_norm": 2.5461809635162354, "learning_rate": 0.00010516014234875444, "loss": 1.473, "step": 5340 }, { "epoch": 2.3737777777777778, "grad_norm": 2.90517258644104, "learning_rate": 0.00010514234875444839, "loss": 2.0846, "step": 5341 }, { "epoch": 2.3742222222222225, "grad_norm": 2.84635329246521, "learning_rate": 0.00010512455516014235, "loss": 1.8785, "step": 5342 }, { "epoch": 2.3746666666666667, "grad_norm": 2.7728958129882812, "learning_rate": 0.00010510676156583629, "loss": 1.4901, "step": 5343 }, { "epoch": 2.375111111111111, "grad_norm": 2.4480459690093994, "learning_rate": 0.00010508896797153025, "loss": 1.4275, "step": 5344 }, { "epoch": 2.3755555555555556, "grad_norm": 3.176147937774658, "learning_rate": 0.0001050711743772242, "loss": 1.3497, "step": 5345 }, { "epoch": 2.376, "grad_norm": 2.7006847858428955, "learning_rate": 0.00010505338078291815, "loss": 1.2245, "step": 5346 }, { "epoch": 2.3764444444444446, "grad_norm": 3.114924192428589, "learning_rate": 0.0001050355871886121, "loss": 1.6906, "step": 5347 }, { "epoch": 2.376888888888889, "grad_norm": 3.2181596755981445, "learning_rate": 0.00010501779359430606, "loss": 1.4348, "step": 5348 }, { "epoch": 2.3773333333333335, "grad_norm": 2.9679977893829346, "learning_rate": 0.000105, "loss": 1.8438, "step": 5349 }, { "epoch": 2.3777777777777778, "grad_norm": 3.126516342163086, "learning_rate": 0.00010498220640569396, "loss": 1.0786, "step": 5350 }, { "epoch": 2.378222222222222, "grad_norm": 1.8322924375534058, "learning_rate": 0.00010496441281138792, "loss": 2.0924, "step": 5351 }, { "epoch": 2.3786666666666667, "grad_norm": 1.7690398693084717, "learning_rate": 0.00010494661921708186, "loss": 2.0291, "step": 5352 }, { "epoch": 2.379111111111111, "grad_norm": 0.14878343045711517, "learning_rate": 0.00010492882562277579, "loss": 0.0181, "step": 5353 }, { "epoch": 2.3795555555555556, "grad_norm": 2.2192952632904053, "learning_rate": 0.00010491103202846975, "loss": 1.5731, "step": 5354 }, { "epoch": 2.38, "grad_norm": 2.1451058387756348, "learning_rate": 0.0001048932384341637, "loss": 1.7856, "step": 5355 }, { "epoch": 2.3804444444444446, "grad_norm": 2.5920441150665283, "learning_rate": 0.00010487544483985765, "loss": 2.0455, "step": 5356 }, { "epoch": 2.380888888888889, "grad_norm": 2.153883218765259, "learning_rate": 0.0001048576512455516, "loss": 1.6193, "step": 5357 }, { "epoch": 2.3813333333333335, "grad_norm": 2.2498092651367188, "learning_rate": 0.00010483985765124556, "loss": 2.1478, "step": 5358 }, { "epoch": 2.3817777777777778, "grad_norm": 2.189810037612915, "learning_rate": 0.0001048220640569395, "loss": 1.8984, "step": 5359 }, { "epoch": 2.3822222222222225, "grad_norm": 2.1763858795166016, "learning_rate": 0.00010480427046263346, "loss": 1.6781, "step": 5360 }, { "epoch": 2.3826666666666667, "grad_norm": 2.4611992835998535, "learning_rate": 0.00010478647686832741, "loss": 1.9078, "step": 5361 }, { "epoch": 2.383111111111111, "grad_norm": 2.345245122909546, "learning_rate": 0.00010476868327402136, "loss": 1.9821, "step": 5362 }, { "epoch": 2.3835555555555556, "grad_norm": 2.14660906791687, "learning_rate": 0.00010475088967971531, "loss": 1.4488, "step": 5363 }, { "epoch": 2.384, "grad_norm": 2.06569504737854, "learning_rate": 0.00010473309608540927, "loss": 1.908, "step": 5364 }, { "epoch": 2.3844444444444446, "grad_norm": 2.5986487865448, "learning_rate": 0.00010471530249110321, "loss": 2.0425, "step": 5365 }, { "epoch": 2.384888888888889, "grad_norm": 2.309596061706543, "learning_rate": 0.00010469750889679714, "loss": 2.0211, "step": 5366 }, { "epoch": 2.3853333333333335, "grad_norm": 2.2031354904174805, "learning_rate": 0.0001046797153024911, "loss": 1.793, "step": 5367 }, { "epoch": 2.3857777777777778, "grad_norm": 2.5349233150482178, "learning_rate": 0.00010466192170818506, "loss": 2.126, "step": 5368 }, { "epoch": 2.386222222222222, "grad_norm": 1.8240305185317993, "learning_rate": 0.000104644128113879, "loss": 1.3644, "step": 5369 }, { "epoch": 2.3866666666666667, "grad_norm": 1.6334868669509888, "learning_rate": 0.00010462633451957296, "loss": 1.0386, "step": 5370 }, { "epoch": 2.387111111111111, "grad_norm": 2.6212308406829834, "learning_rate": 0.00010460854092526691, "loss": 2.2779, "step": 5371 }, { "epoch": 2.3875555555555557, "grad_norm": 2.4640159606933594, "learning_rate": 0.00010459074733096085, "loss": 2.1993, "step": 5372 }, { "epoch": 2.388, "grad_norm": 2.5300076007843018, "learning_rate": 0.00010457295373665481, "loss": 2.1011, "step": 5373 }, { "epoch": 2.3884444444444446, "grad_norm": 2.257607936859131, "learning_rate": 0.00010455516014234877, "loss": 1.6381, "step": 5374 }, { "epoch": 2.388888888888889, "grad_norm": 2.7849531173706055, "learning_rate": 0.00010453736654804271, "loss": 1.4969, "step": 5375 }, { "epoch": 2.389333333333333, "grad_norm": 2.2761592864990234, "learning_rate": 0.00010451957295373667, "loss": 1.4029, "step": 5376 }, { "epoch": 2.389777777777778, "grad_norm": 2.228327512741089, "learning_rate": 0.00010450177935943062, "loss": 1.8044, "step": 5377 }, { "epoch": 2.3902222222222225, "grad_norm": 1.8073506355285645, "learning_rate": 0.00010448398576512457, "loss": 0.9291, "step": 5378 }, { "epoch": 2.3906666666666667, "grad_norm": 2.5977213382720947, "learning_rate": 0.0001044661921708185, "loss": 1.585, "step": 5379 }, { "epoch": 2.391111111111111, "grad_norm": 2.726691246032715, "learning_rate": 0.00010444839857651245, "loss": 1.6747, "step": 5380 }, { "epoch": 2.3915555555555557, "grad_norm": 2.13911509513855, "learning_rate": 0.00010443060498220641, "loss": 1.531, "step": 5381 }, { "epoch": 2.392, "grad_norm": 2.832296133041382, "learning_rate": 0.00010441281138790035, "loss": 1.7403, "step": 5382 }, { "epoch": 2.3924444444444446, "grad_norm": 2.713308811187744, "learning_rate": 0.00010439501779359431, "loss": 2.0669, "step": 5383 }, { "epoch": 2.392888888888889, "grad_norm": 2.503474473953247, "learning_rate": 0.00010437722419928826, "loss": 1.8383, "step": 5384 }, { "epoch": 2.3933333333333335, "grad_norm": 2.878114700317383, "learning_rate": 0.00010435943060498221, "loss": 1.7423, "step": 5385 }, { "epoch": 2.393777777777778, "grad_norm": 3.174804449081421, "learning_rate": 0.00010434163701067616, "loss": 1.5972, "step": 5386 }, { "epoch": 2.394222222222222, "grad_norm": 2.129746675491333, "learning_rate": 0.00010432384341637012, "loss": 1.0781, "step": 5387 }, { "epoch": 2.3946666666666667, "grad_norm": 2.780766487121582, "learning_rate": 0.00010430604982206406, "loss": 1.649, "step": 5388 }, { "epoch": 2.395111111111111, "grad_norm": 2.8491053581237793, "learning_rate": 0.00010428825622775802, "loss": 1.6903, "step": 5389 }, { "epoch": 2.3955555555555557, "grad_norm": 2.460873603820801, "learning_rate": 0.00010427046263345198, "loss": 1.3277, "step": 5390 }, { "epoch": 2.396, "grad_norm": 2.9226863384246826, "learning_rate": 0.00010425266903914592, "loss": 1.5445, "step": 5391 }, { "epoch": 2.3964444444444446, "grad_norm": 3.39444899559021, "learning_rate": 0.00010423487544483985, "loss": 1.6876, "step": 5392 }, { "epoch": 2.396888888888889, "grad_norm": 3.0782647132873535, "learning_rate": 0.0001042170818505338, "loss": 1.6426, "step": 5393 }, { "epoch": 2.397333333333333, "grad_norm": 2.9284658432006836, "learning_rate": 0.00010419928825622776, "loss": 1.5015, "step": 5394 }, { "epoch": 2.397777777777778, "grad_norm": 3.772498369216919, "learning_rate": 0.0001041814946619217, "loss": 1.734, "step": 5395 }, { "epoch": 2.398222222222222, "grad_norm": 3.4858293533325195, "learning_rate": 0.00010416370106761566, "loss": 2.1559, "step": 5396 }, { "epoch": 2.3986666666666667, "grad_norm": 2.8593242168426514, "learning_rate": 0.00010414590747330962, "loss": 1.2558, "step": 5397 }, { "epoch": 2.399111111111111, "grad_norm": 3.39560866355896, "learning_rate": 0.00010412811387900356, "loss": 1.726, "step": 5398 }, { "epoch": 2.3995555555555557, "grad_norm": 2.1574625968933105, "learning_rate": 0.00010411032028469752, "loss": 0.8812, "step": 5399 }, { "epoch": 2.4, "grad_norm": 4.058824062347412, "learning_rate": 0.00010409252669039147, "loss": 1.1959, "step": 5400 }, { "epoch": 2.4004444444444446, "grad_norm": 1.906567931175232, "learning_rate": 0.00010407473309608542, "loss": 2.2807, "step": 5401 }, { "epoch": 2.400888888888889, "grad_norm": 1.9457437992095947, "learning_rate": 0.00010405693950177937, "loss": 1.7017, "step": 5402 }, { "epoch": 2.4013333333333335, "grad_norm": 2.0136795043945312, "learning_rate": 0.00010403914590747333, "loss": 2.0448, "step": 5403 }, { "epoch": 2.401777777777778, "grad_norm": 2.323181390762329, "learning_rate": 0.00010402135231316727, "loss": 1.8618, "step": 5404 }, { "epoch": 2.402222222222222, "grad_norm": 1.999695897102356, "learning_rate": 0.0001040035587188612, "loss": 1.8949, "step": 5405 }, { "epoch": 2.4026666666666667, "grad_norm": 2.2009153366088867, "learning_rate": 0.00010398576512455516, "loss": 1.7722, "step": 5406 }, { "epoch": 2.403111111111111, "grad_norm": 2.2078299522399902, "learning_rate": 0.00010396797153024912, "loss": 2.0193, "step": 5407 }, { "epoch": 2.4035555555555557, "grad_norm": 2.5770654678344727, "learning_rate": 0.00010395017793594306, "loss": 2.0849, "step": 5408 }, { "epoch": 2.404, "grad_norm": 2.2066149711608887, "learning_rate": 0.00010393238434163701, "loss": 1.5842, "step": 5409 }, { "epoch": 2.4044444444444446, "grad_norm": 2.5766196250915527, "learning_rate": 0.00010391459074733096, "loss": 2.3273, "step": 5410 }, { "epoch": 2.404888888888889, "grad_norm": 2.1553375720977783, "learning_rate": 0.00010389679715302491, "loss": 1.7654, "step": 5411 }, { "epoch": 2.405333333333333, "grad_norm": 2.179262638092041, "learning_rate": 0.00010387900355871887, "loss": 1.8465, "step": 5412 }, { "epoch": 2.405777777777778, "grad_norm": 2.3855044841766357, "learning_rate": 0.00010386120996441281, "loss": 1.5526, "step": 5413 }, { "epoch": 2.406222222222222, "grad_norm": 1.9066109657287598, "learning_rate": 0.00010384341637010677, "loss": 1.1865, "step": 5414 }, { "epoch": 2.4066666666666667, "grad_norm": 2.4043421745300293, "learning_rate": 0.00010382562277580073, "loss": 2.1115, "step": 5415 }, { "epoch": 2.407111111111111, "grad_norm": 2.303219795227051, "learning_rate": 0.00010380782918149467, "loss": 1.6968, "step": 5416 }, { "epoch": 2.4075555555555557, "grad_norm": 2.519510507583618, "learning_rate": 0.00010379003558718863, "loss": 1.4451, "step": 5417 }, { "epoch": 2.408, "grad_norm": 2.0006396770477295, "learning_rate": 0.00010377224199288256, "loss": 1.6501, "step": 5418 }, { "epoch": 2.4084444444444446, "grad_norm": 2.359020709991455, "learning_rate": 0.00010375444839857651, "loss": 2.1004, "step": 5419 }, { "epoch": 2.408888888888889, "grad_norm": 2.581718683242798, "learning_rate": 0.00010373665480427045, "loss": 1.5921, "step": 5420 }, { "epoch": 2.4093333333333335, "grad_norm": 2.2149415016174316, "learning_rate": 0.00010371886120996441, "loss": 1.4495, "step": 5421 }, { "epoch": 2.409777777777778, "grad_norm": 2.1162991523742676, "learning_rate": 0.00010370106761565837, "loss": 1.7689, "step": 5422 }, { "epoch": 2.410222222222222, "grad_norm": 2.3050718307495117, "learning_rate": 0.00010368327402135231, "loss": 1.7421, "step": 5423 }, { "epoch": 2.4106666666666667, "grad_norm": 2.368021011352539, "learning_rate": 0.00010366548042704627, "loss": 1.8718, "step": 5424 }, { "epoch": 2.411111111111111, "grad_norm": 2.574089765548706, "learning_rate": 0.00010364768683274022, "loss": 1.8538, "step": 5425 }, { "epoch": 2.4115555555555557, "grad_norm": 2.1765310764312744, "learning_rate": 0.00010362989323843417, "loss": 1.6152, "step": 5426 }, { "epoch": 2.412, "grad_norm": 2.298729419708252, "learning_rate": 0.00010361209964412812, "loss": 1.6085, "step": 5427 }, { "epoch": 2.4124444444444446, "grad_norm": 2.547469139099121, "learning_rate": 0.00010359430604982208, "loss": 1.7991, "step": 5428 }, { "epoch": 2.412888888888889, "grad_norm": 2.315479040145874, "learning_rate": 0.00010357651245551602, "loss": 1.5592, "step": 5429 }, { "epoch": 2.413333333333333, "grad_norm": 2.59236478805542, "learning_rate": 0.00010355871886120998, "loss": 1.4449, "step": 5430 }, { "epoch": 2.413777777777778, "grad_norm": 2.6196041107177734, "learning_rate": 0.00010354092526690391, "loss": 1.4048, "step": 5431 }, { "epoch": 2.414222222222222, "grad_norm": 2.8616275787353516, "learning_rate": 0.00010352313167259787, "loss": 2.1669, "step": 5432 }, { "epoch": 2.4146666666666667, "grad_norm": 2.7045931816101074, "learning_rate": 0.00010350533807829181, "loss": 1.8538, "step": 5433 }, { "epoch": 2.415111111111111, "grad_norm": 2.812784194946289, "learning_rate": 0.00010348754448398576, "loss": 1.7349, "step": 5434 }, { "epoch": 2.4155555555555557, "grad_norm": 2.665402889251709, "learning_rate": 0.00010346975088967972, "loss": 1.671, "step": 5435 }, { "epoch": 2.416, "grad_norm": 2.632174253463745, "learning_rate": 0.00010345195729537366, "loss": 1.297, "step": 5436 }, { "epoch": 2.4164444444444446, "grad_norm": 2.9285874366760254, "learning_rate": 0.00010343416370106762, "loss": 1.6447, "step": 5437 }, { "epoch": 2.416888888888889, "grad_norm": 2.873795747756958, "learning_rate": 0.00010341637010676158, "loss": 1.6297, "step": 5438 }, { "epoch": 2.4173333333333336, "grad_norm": 1.693901538848877, "learning_rate": 0.00010339857651245552, "loss": 0.571, "step": 5439 }, { "epoch": 2.417777777777778, "grad_norm": 2.4902453422546387, "learning_rate": 0.00010338078291814948, "loss": 1.7092, "step": 5440 }, { "epoch": 2.418222222222222, "grad_norm": 2.657254457473755, "learning_rate": 0.00010336298932384343, "loss": 1.2449, "step": 5441 }, { "epoch": 2.4186666666666667, "grad_norm": 2.572938919067383, "learning_rate": 0.00010334519572953738, "loss": 1.6842, "step": 5442 }, { "epoch": 2.419111111111111, "grad_norm": 3.0407369136810303, "learning_rate": 0.00010332740213523133, "loss": 1.7859, "step": 5443 }, { "epoch": 2.4195555555555557, "grad_norm": 2.9764583110809326, "learning_rate": 0.00010330960854092526, "loss": 1.6893, "step": 5444 }, { "epoch": 2.42, "grad_norm": 2.7912721633911133, "learning_rate": 0.00010329181494661922, "loss": 1.5315, "step": 5445 }, { "epoch": 2.4204444444444446, "grad_norm": 3.436552047729492, "learning_rate": 0.00010327402135231316, "loss": 1.6444, "step": 5446 }, { "epoch": 2.420888888888889, "grad_norm": 3.2824277877807617, "learning_rate": 0.00010325622775800712, "loss": 1.6908, "step": 5447 }, { "epoch": 2.421333333333333, "grad_norm": 3.5586631298065186, "learning_rate": 0.00010323843416370107, "loss": 1.9766, "step": 5448 }, { "epoch": 2.421777777777778, "grad_norm": 3.6402716636657715, "learning_rate": 0.00010322064056939502, "loss": 1.8899, "step": 5449 }, { "epoch": 2.422222222222222, "grad_norm": 3.806692600250244, "learning_rate": 0.00010320284697508897, "loss": 1.7844, "step": 5450 }, { "epoch": 2.4226666666666667, "grad_norm": 1.4669278860092163, "learning_rate": 0.00010318505338078293, "loss": 0.98, "step": 5451 }, { "epoch": 2.423111111111111, "grad_norm": 1.7053353786468506, "learning_rate": 0.00010316725978647687, "loss": 2.1493, "step": 5452 }, { "epoch": 2.4235555555555557, "grad_norm": 2.3028388023376465, "learning_rate": 0.00010314946619217083, "loss": 2.0852, "step": 5453 }, { "epoch": 2.424, "grad_norm": 1.9444522857666016, "learning_rate": 0.00010313167259786479, "loss": 1.6929, "step": 5454 }, { "epoch": 2.4244444444444446, "grad_norm": 2.3324930667877197, "learning_rate": 0.00010311387900355873, "loss": 1.9656, "step": 5455 }, { "epoch": 2.424888888888889, "grad_norm": 2.4078588485717773, "learning_rate": 0.00010309608540925266, "loss": 1.8105, "step": 5456 }, { "epoch": 2.4253333333333336, "grad_norm": 2.183835983276367, "learning_rate": 0.00010307829181494661, "loss": 1.921, "step": 5457 }, { "epoch": 2.425777777777778, "grad_norm": 2.3134961128234863, "learning_rate": 0.00010306049822064057, "loss": 1.8601, "step": 5458 }, { "epoch": 2.426222222222222, "grad_norm": 2.484114408493042, "learning_rate": 0.00010304270462633451, "loss": 1.701, "step": 5459 }, { "epoch": 2.4266666666666667, "grad_norm": 2.430379867553711, "learning_rate": 0.00010302491103202847, "loss": 1.9341, "step": 5460 }, { "epoch": 2.427111111111111, "grad_norm": 2.4869649410247803, "learning_rate": 0.00010300711743772243, "loss": 1.9392, "step": 5461 }, { "epoch": 2.4275555555555557, "grad_norm": 2.402344226837158, "learning_rate": 0.00010298932384341637, "loss": 1.8691, "step": 5462 }, { "epoch": 2.428, "grad_norm": 1.7501614093780518, "learning_rate": 0.00010297153024911033, "loss": 1.085, "step": 5463 }, { "epoch": 2.4284444444444446, "grad_norm": 2.223562002182007, "learning_rate": 0.00010295373665480428, "loss": 1.3557, "step": 5464 }, { "epoch": 2.428888888888889, "grad_norm": 2.1328225135803223, "learning_rate": 0.00010293594306049823, "loss": 1.7067, "step": 5465 }, { "epoch": 2.429333333333333, "grad_norm": 2.1772732734680176, "learning_rate": 0.00010291814946619218, "loss": 1.5118, "step": 5466 }, { "epoch": 2.429777777777778, "grad_norm": 2.320878505706787, "learning_rate": 0.00010290035587188614, "loss": 1.3153, "step": 5467 }, { "epoch": 2.430222222222222, "grad_norm": 2.253594398498535, "learning_rate": 0.00010288256227758008, "loss": 1.8138, "step": 5468 }, { "epoch": 2.4306666666666668, "grad_norm": 2.6910340785980225, "learning_rate": 0.00010286476868327401, "loss": 2.0876, "step": 5469 }, { "epoch": 2.431111111111111, "grad_norm": 2.565636396408081, "learning_rate": 0.00010284697508896797, "loss": 1.9388, "step": 5470 }, { "epoch": 2.4315555555555557, "grad_norm": 2.60259747505188, "learning_rate": 0.00010282918149466192, "loss": 1.5759, "step": 5471 }, { "epoch": 2.432, "grad_norm": 2.3194119930267334, "learning_rate": 0.00010281138790035587, "loss": 1.8334, "step": 5472 }, { "epoch": 2.4324444444444446, "grad_norm": 2.5380783081054688, "learning_rate": 0.00010279359430604982, "loss": 1.7694, "step": 5473 }, { "epoch": 2.432888888888889, "grad_norm": 2.5655555725097656, "learning_rate": 0.00010277580071174378, "loss": 2.0182, "step": 5474 }, { "epoch": 2.4333333333333336, "grad_norm": 2.371737241744995, "learning_rate": 0.00010275800711743772, "loss": 1.805, "step": 5475 }, { "epoch": 2.433777777777778, "grad_norm": 2.5527260303497314, "learning_rate": 0.00010274021352313168, "loss": 1.7959, "step": 5476 }, { "epoch": 2.434222222222222, "grad_norm": 2.364063262939453, "learning_rate": 0.00010272241992882564, "loss": 1.4738, "step": 5477 }, { "epoch": 2.4346666666666668, "grad_norm": 3.0714645385742188, "learning_rate": 0.00010270462633451958, "loss": 1.7449, "step": 5478 }, { "epoch": 2.435111111111111, "grad_norm": 2.63435959815979, "learning_rate": 0.00010268683274021354, "loss": 1.9282, "step": 5479 }, { "epoch": 2.4355555555555557, "grad_norm": 2.5400550365448, "learning_rate": 0.00010266903914590749, "loss": 1.199, "step": 5480 }, { "epoch": 2.436, "grad_norm": 2.6152279376983643, "learning_rate": 0.00010265124555160144, "loss": 1.7645, "step": 5481 }, { "epoch": 2.4364444444444446, "grad_norm": 3.2764761447906494, "learning_rate": 0.00010263345195729536, "loss": 1.8478, "step": 5482 }, { "epoch": 2.436888888888889, "grad_norm": 2.65189790725708, "learning_rate": 0.00010261565836298932, "loss": 1.969, "step": 5483 }, { "epoch": 2.437333333333333, "grad_norm": 2.8644859790802, "learning_rate": 0.00010259786476868328, "loss": 1.5599, "step": 5484 }, { "epoch": 2.437777777777778, "grad_norm": 2.7728471755981445, "learning_rate": 0.00010258007117437722, "loss": 1.4802, "step": 5485 }, { "epoch": 2.438222222222222, "grad_norm": 3.231962203979492, "learning_rate": 0.00010256227758007118, "loss": 1.8151, "step": 5486 }, { "epoch": 2.4386666666666668, "grad_norm": 2.6833078861236572, "learning_rate": 0.00010254448398576513, "loss": 1.4914, "step": 5487 }, { "epoch": 2.439111111111111, "grad_norm": 2.837379217147827, "learning_rate": 0.00010252669039145908, "loss": 2.1413, "step": 5488 }, { "epoch": 2.4395555555555557, "grad_norm": 2.6518139839172363, "learning_rate": 0.00010250889679715303, "loss": 1.5023, "step": 5489 }, { "epoch": 2.44, "grad_norm": 2.0156285762786865, "learning_rate": 0.00010249110320284699, "loss": 0.9281, "step": 5490 }, { "epoch": 2.4404444444444446, "grad_norm": 3.069772958755493, "learning_rate": 0.00010247330960854093, "loss": 2.1119, "step": 5491 }, { "epoch": 2.440888888888889, "grad_norm": 3.10864520072937, "learning_rate": 0.00010245551601423489, "loss": 1.5862, "step": 5492 }, { "epoch": 2.4413333333333336, "grad_norm": 2.859609365463257, "learning_rate": 0.00010243772241992885, "loss": 1.7583, "step": 5493 }, { "epoch": 2.441777777777778, "grad_norm": 2.6531405448913574, "learning_rate": 0.00010241992882562279, "loss": 1.4576, "step": 5494 }, { "epoch": 2.442222222222222, "grad_norm": 3.0856566429138184, "learning_rate": 0.00010240213523131672, "loss": 1.6381, "step": 5495 }, { "epoch": 2.4426666666666668, "grad_norm": 3.7810099124908447, "learning_rate": 0.00010238434163701067, "loss": 2.2101, "step": 5496 }, { "epoch": 2.443111111111111, "grad_norm": 3.085653066635132, "learning_rate": 0.00010236654804270463, "loss": 1.7354, "step": 5497 }, { "epoch": 2.4435555555555557, "grad_norm": 2.9411280155181885, "learning_rate": 0.00010234875444839857, "loss": 1.1896, "step": 5498 }, { "epoch": 2.444, "grad_norm": 2.1418721675872803, "learning_rate": 0.00010233096085409253, "loss": 1.0698, "step": 5499 }, { "epoch": 2.4444444444444446, "grad_norm": 2.725583076477051, "learning_rate": 0.00010231316725978647, "loss": 0.6198, "step": 5500 }, { "epoch": 2.444888888888889, "grad_norm": 1.92898690700531, "learning_rate": 0.00010229537366548043, "loss": 2.5948, "step": 5501 }, { "epoch": 2.445333333333333, "grad_norm": 1.2224948406219482, "learning_rate": 0.00010227758007117439, "loss": 1.0915, "step": 5502 }, { "epoch": 2.445777777777778, "grad_norm": 1.8230127096176147, "learning_rate": 0.00010225978647686833, "loss": 2.0617, "step": 5503 }, { "epoch": 2.446222222222222, "grad_norm": 1.861149787902832, "learning_rate": 0.00010224199288256229, "loss": 1.6798, "step": 5504 }, { "epoch": 2.4466666666666668, "grad_norm": 1.9252985715866089, "learning_rate": 0.00010222419928825624, "loss": 1.9089, "step": 5505 }, { "epoch": 2.447111111111111, "grad_norm": 2.1263620853424072, "learning_rate": 0.00010220640569395019, "loss": 2.0751, "step": 5506 }, { "epoch": 2.4475555555555557, "grad_norm": 2.2096424102783203, "learning_rate": 0.00010218861209964414, "loss": 2.0726, "step": 5507 }, { "epoch": 2.448, "grad_norm": 2.455075263977051, "learning_rate": 0.00010217081850533807, "loss": 2.2006, "step": 5508 }, { "epoch": 2.448444444444444, "grad_norm": 2.1597304344177246, "learning_rate": 0.00010215302491103203, "loss": 1.5901, "step": 5509 }, { "epoch": 2.448888888888889, "grad_norm": 2.4968745708465576, "learning_rate": 0.00010213523131672597, "loss": 1.8659, "step": 5510 }, { "epoch": 2.449333333333333, "grad_norm": 2.074950695037842, "learning_rate": 0.00010211743772241993, "loss": 2.1795, "step": 5511 }, { "epoch": 2.449777777777778, "grad_norm": 2.339791774749756, "learning_rate": 0.00010209964412811388, "loss": 1.5882, "step": 5512 }, { "epoch": 2.450222222222222, "grad_norm": 2.3237624168395996, "learning_rate": 0.00010208185053380783, "loss": 1.4691, "step": 5513 }, { "epoch": 2.4506666666666668, "grad_norm": 2.4327070713043213, "learning_rate": 0.00010206405693950178, "loss": 1.8784, "step": 5514 }, { "epoch": 2.451111111111111, "grad_norm": 2.7746636867523193, "learning_rate": 0.00010204626334519574, "loss": 1.7476, "step": 5515 }, { "epoch": 2.4515555555555557, "grad_norm": 1.4151685237884521, "learning_rate": 0.00010202846975088968, "loss": 0.7311, "step": 5516 }, { "epoch": 2.452, "grad_norm": 2.369919776916504, "learning_rate": 0.00010201067615658364, "loss": 1.8284, "step": 5517 }, { "epoch": 2.4524444444444446, "grad_norm": 2.3550519943237305, "learning_rate": 0.0001019928825622776, "loss": 1.4522, "step": 5518 }, { "epoch": 2.452888888888889, "grad_norm": 2.1800880432128906, "learning_rate": 0.00010197508896797154, "loss": 1.7927, "step": 5519 }, { "epoch": 2.453333333333333, "grad_norm": 1.9889439344406128, "learning_rate": 0.0001019572953736655, "loss": 1.4447, "step": 5520 }, { "epoch": 2.453777777777778, "grad_norm": 2.0936267375946045, "learning_rate": 0.00010193950177935942, "loss": 1.6783, "step": 5521 }, { "epoch": 2.454222222222222, "grad_norm": 2.577599287033081, "learning_rate": 0.00010192170818505338, "loss": 2.0707, "step": 5522 }, { "epoch": 2.4546666666666668, "grad_norm": 2.550760507583618, "learning_rate": 0.00010190391459074732, "loss": 1.8725, "step": 5523 }, { "epoch": 2.455111111111111, "grad_norm": 2.492544651031494, "learning_rate": 0.00010188612099644128, "loss": 1.8554, "step": 5524 }, { "epoch": 2.4555555555555557, "grad_norm": 2.88720703125, "learning_rate": 0.00010186832740213524, "loss": 1.9373, "step": 5525 }, { "epoch": 2.456, "grad_norm": 2.423215627670288, "learning_rate": 0.00010185053380782918, "loss": 1.8004, "step": 5526 }, { "epoch": 2.456444444444444, "grad_norm": 2.4208219051361084, "learning_rate": 0.00010183274021352314, "loss": 2.0648, "step": 5527 }, { "epoch": 2.456888888888889, "grad_norm": 2.6025261878967285, "learning_rate": 0.00010181494661921709, "loss": 1.7313, "step": 5528 }, { "epoch": 2.457333333333333, "grad_norm": 1.9392775297164917, "learning_rate": 0.00010179715302491104, "loss": 1.209, "step": 5529 }, { "epoch": 2.457777777777778, "grad_norm": 2.2096610069274902, "learning_rate": 0.00010177935943060499, "loss": 1.6588, "step": 5530 }, { "epoch": 2.458222222222222, "grad_norm": 2.5169079303741455, "learning_rate": 0.00010176156583629895, "loss": 1.6311, "step": 5531 }, { "epoch": 2.458666666666667, "grad_norm": 2.4798898696899414, "learning_rate": 0.00010174377224199289, "loss": 1.4915, "step": 5532 }, { "epoch": 2.459111111111111, "grad_norm": 2.8230345249176025, "learning_rate": 0.00010172597864768685, "loss": 1.7837, "step": 5533 }, { "epoch": 2.4595555555555557, "grad_norm": 2.252345561981201, "learning_rate": 0.00010170818505338078, "loss": 1.694, "step": 5534 }, { "epoch": 2.46, "grad_norm": 2.8912289142608643, "learning_rate": 0.00010169039145907473, "loss": 2.1645, "step": 5535 }, { "epoch": 2.4604444444444447, "grad_norm": 2.01261305809021, "learning_rate": 0.00010167259786476868, "loss": 0.7449, "step": 5536 }, { "epoch": 2.460888888888889, "grad_norm": 2.8200204372406006, "learning_rate": 0.00010165480427046263, "loss": 1.5755, "step": 5537 }, { "epoch": 2.461333333333333, "grad_norm": 2.4388539791107178, "learning_rate": 0.00010163701067615659, "loss": 1.7643, "step": 5538 }, { "epoch": 2.461777777777778, "grad_norm": 2.9990639686584473, "learning_rate": 0.00010161921708185053, "loss": 1.7626, "step": 5539 }, { "epoch": 2.462222222222222, "grad_norm": 2.477353811264038, "learning_rate": 0.00010160142348754449, "loss": 1.3518, "step": 5540 }, { "epoch": 2.462666666666667, "grad_norm": 2.8211402893066406, "learning_rate": 0.00010158362989323845, "loss": 1.5751, "step": 5541 }, { "epoch": 2.463111111111111, "grad_norm": 2.7907443046569824, "learning_rate": 0.00010156583629893239, "loss": 2.068, "step": 5542 }, { "epoch": 2.4635555555555557, "grad_norm": 2.866947650909424, "learning_rate": 0.00010154804270462635, "loss": 1.593, "step": 5543 }, { "epoch": 2.464, "grad_norm": 3.0840697288513184, "learning_rate": 0.0001015302491103203, "loss": 1.6982, "step": 5544 }, { "epoch": 2.464444444444444, "grad_norm": 2.5934946537017822, "learning_rate": 0.00010151245551601424, "loss": 1.6589, "step": 5545 }, { "epoch": 2.464888888888889, "grad_norm": 3.505406141281128, "learning_rate": 0.0001014946619217082, "loss": 1.4979, "step": 5546 }, { "epoch": 2.465333333333333, "grad_norm": 3.8022685050964355, "learning_rate": 0.00010147686832740213, "loss": 2.0308, "step": 5547 }, { "epoch": 2.465777777777778, "grad_norm": 3.6010377407073975, "learning_rate": 0.00010145907473309609, "loss": 1.9663, "step": 5548 }, { "epoch": 2.466222222222222, "grad_norm": 3.2680211067199707, "learning_rate": 0.00010144128113879003, "loss": 2.0005, "step": 5549 }, { "epoch": 2.466666666666667, "grad_norm": 4.470943927764893, "learning_rate": 0.00010142348754448399, "loss": 1.4878, "step": 5550 }, { "epoch": 2.467111111111111, "grad_norm": 1.9788882732391357, "learning_rate": 0.00010140569395017794, "loss": 1.9776, "step": 5551 }, { "epoch": 2.4675555555555557, "grad_norm": 1.8858304023742676, "learning_rate": 0.00010138790035587189, "loss": 2.3771, "step": 5552 }, { "epoch": 2.468, "grad_norm": 2.0019304752349854, "learning_rate": 0.00010137010676156584, "loss": 1.513, "step": 5553 }, { "epoch": 2.4684444444444447, "grad_norm": 2.7883894443511963, "learning_rate": 0.0001013523131672598, "loss": 2.1866, "step": 5554 }, { "epoch": 2.468888888888889, "grad_norm": 2.3653924465179443, "learning_rate": 0.00010133451957295374, "loss": 1.6028, "step": 5555 }, { "epoch": 2.469333333333333, "grad_norm": 2.258758544921875, "learning_rate": 0.0001013167259786477, "loss": 1.8427, "step": 5556 }, { "epoch": 2.469777777777778, "grad_norm": 2.4392614364624023, "learning_rate": 0.00010129893238434165, "loss": 1.4794, "step": 5557 }, { "epoch": 2.470222222222222, "grad_norm": 2.4021196365356445, "learning_rate": 0.0001012811387900356, "loss": 1.7046, "step": 5558 }, { "epoch": 2.470666666666667, "grad_norm": 2.3524880409240723, "learning_rate": 0.00010126334519572955, "loss": 1.5135, "step": 5559 }, { "epoch": 2.471111111111111, "grad_norm": 2.1956753730773926, "learning_rate": 0.00010124555160142348, "loss": 1.4995, "step": 5560 }, { "epoch": 2.4715555555555557, "grad_norm": 2.2904982566833496, "learning_rate": 0.00010122775800711744, "loss": 1.7763, "step": 5561 }, { "epoch": 2.472, "grad_norm": 2.2968485355377197, "learning_rate": 0.00010120996441281138, "loss": 1.882, "step": 5562 }, { "epoch": 2.4724444444444442, "grad_norm": 2.505476951599121, "learning_rate": 0.00010119217081850534, "loss": 1.928, "step": 5563 }, { "epoch": 2.472888888888889, "grad_norm": 2.5836219787597656, "learning_rate": 0.0001011743772241993, "loss": 2.0092, "step": 5564 }, { "epoch": 2.473333333333333, "grad_norm": 2.1713929176330566, "learning_rate": 0.00010115658362989324, "loss": 1.6092, "step": 5565 }, { "epoch": 2.473777777777778, "grad_norm": 2.1562206745147705, "learning_rate": 0.0001011387900355872, "loss": 1.1912, "step": 5566 }, { "epoch": 2.474222222222222, "grad_norm": 2.7238199710845947, "learning_rate": 0.00010112099644128115, "loss": 1.8131, "step": 5567 }, { "epoch": 2.474666666666667, "grad_norm": 2.7333614826202393, "learning_rate": 0.0001011032028469751, "loss": 2.0719, "step": 5568 }, { "epoch": 2.475111111111111, "grad_norm": 1.9092731475830078, "learning_rate": 0.00010108540925266905, "loss": 1.5397, "step": 5569 }, { "epoch": 2.4755555555555557, "grad_norm": 2.2644593715667725, "learning_rate": 0.00010106761565836301, "loss": 1.5289, "step": 5570 }, { "epoch": 2.476, "grad_norm": 2.569396734237671, "learning_rate": 0.00010104982206405695, "loss": 1.8126, "step": 5571 }, { "epoch": 2.4764444444444447, "grad_norm": 2.5156378746032715, "learning_rate": 0.00010103202846975088, "loss": 1.542, "step": 5572 }, { "epoch": 2.476888888888889, "grad_norm": 2.716794013977051, "learning_rate": 0.00010101423487544484, "loss": 1.9589, "step": 5573 }, { "epoch": 2.477333333333333, "grad_norm": 2.614689826965332, "learning_rate": 0.0001009964412811388, "loss": 2.0686, "step": 5574 }, { "epoch": 2.477777777777778, "grad_norm": 3.7736783027648926, "learning_rate": 0.00010097864768683274, "loss": 1.7161, "step": 5575 }, { "epoch": 2.478222222222222, "grad_norm": 2.7454824447631836, "learning_rate": 0.00010096085409252669, "loss": 2.2613, "step": 5576 }, { "epoch": 2.478666666666667, "grad_norm": 2.346919298171997, "learning_rate": 0.00010094306049822065, "loss": 2.0465, "step": 5577 }, { "epoch": 2.479111111111111, "grad_norm": 2.593102216720581, "learning_rate": 0.00010092526690391459, "loss": 1.9165, "step": 5578 }, { "epoch": 2.4795555555555557, "grad_norm": 2.560601234436035, "learning_rate": 0.00010090747330960855, "loss": 1.8571, "step": 5579 }, { "epoch": 2.48, "grad_norm": 2.8583500385284424, "learning_rate": 0.0001008896797153025, "loss": 1.9994, "step": 5580 }, { "epoch": 2.4804444444444442, "grad_norm": 2.5867176055908203, "learning_rate": 0.00010087188612099645, "loss": 1.6889, "step": 5581 }, { "epoch": 2.480888888888889, "grad_norm": 6.32763147354126, "learning_rate": 0.0001008540925266904, "loss": 1.3415, "step": 5582 }, { "epoch": 2.481333333333333, "grad_norm": 2.52264666557312, "learning_rate": 0.00010083629893238435, "loss": 1.8925, "step": 5583 }, { "epoch": 2.481777777777778, "grad_norm": 2.677546977996826, "learning_rate": 0.0001008185053380783, "loss": 2.1266, "step": 5584 }, { "epoch": 2.482222222222222, "grad_norm": 2.429607391357422, "learning_rate": 0.00010080071174377223, "loss": 1.6861, "step": 5585 }, { "epoch": 2.482666666666667, "grad_norm": 2.5945723056793213, "learning_rate": 0.00010078291814946619, "loss": 1.3189, "step": 5586 }, { "epoch": 2.483111111111111, "grad_norm": 2.5813865661621094, "learning_rate": 0.00010076512455516015, "loss": 1.6405, "step": 5587 }, { "epoch": 2.4835555555555557, "grad_norm": 2.870296001434326, "learning_rate": 0.00010074733096085409, "loss": 1.6053, "step": 5588 }, { "epoch": 2.484, "grad_norm": 3.1349499225616455, "learning_rate": 0.00010072953736654805, "loss": 1.7858, "step": 5589 }, { "epoch": 2.4844444444444447, "grad_norm": 2.9594576358795166, "learning_rate": 0.00010071174377224199, "loss": 1.4855, "step": 5590 }, { "epoch": 2.484888888888889, "grad_norm": 2.6506004333496094, "learning_rate": 0.00010069395017793595, "loss": 1.4352, "step": 5591 }, { "epoch": 2.485333333333333, "grad_norm": 2.739713191986084, "learning_rate": 0.0001006761565836299, "loss": 1.5386, "step": 5592 }, { "epoch": 2.485777777777778, "grad_norm": 2.450587034225464, "learning_rate": 0.00010065836298932384, "loss": 1.3184, "step": 5593 }, { "epoch": 2.486222222222222, "grad_norm": 3.1459028720855713, "learning_rate": 0.0001006405693950178, "loss": 1.6811, "step": 5594 }, { "epoch": 2.486666666666667, "grad_norm": 3.288677215576172, "learning_rate": 0.00010062277580071176, "loss": 1.7088, "step": 5595 }, { "epoch": 2.487111111111111, "grad_norm": 3.5362205505371094, "learning_rate": 0.0001006049822064057, "loss": 1.4808, "step": 5596 }, { "epoch": 2.4875555555555557, "grad_norm": 3.3700575828552246, "learning_rate": 0.00010058718861209966, "loss": 1.9626, "step": 5597 }, { "epoch": 2.488, "grad_norm": 5.008380889892578, "learning_rate": 0.00010056939501779359, "loss": 1.8212, "step": 5598 }, { "epoch": 2.4884444444444442, "grad_norm": 3.4378342628479004, "learning_rate": 0.00010055160142348754, "loss": 1.8416, "step": 5599 }, { "epoch": 2.488888888888889, "grad_norm": 3.6014068126678467, "learning_rate": 0.00010053380782918149, "loss": 1.8809, "step": 5600 }, { "epoch": 2.489333333333333, "grad_norm": 1.8102363348007202, "learning_rate": 0.00010051601423487544, "loss": 2.2235, "step": 5601 }, { "epoch": 2.489777777777778, "grad_norm": 1.5259066820144653, "learning_rate": 0.0001004982206405694, "loss": 1.4081, "step": 5602 }, { "epoch": 2.490222222222222, "grad_norm": 1.8424686193466187, "learning_rate": 0.00010048042704626334, "loss": 2.124, "step": 5603 }, { "epoch": 2.490666666666667, "grad_norm": 2.1134793758392334, "learning_rate": 0.0001004626334519573, "loss": 2.0344, "step": 5604 }, { "epoch": 2.491111111111111, "grad_norm": 1.8464834690093994, "learning_rate": 0.00010044483985765125, "loss": 1.4534, "step": 5605 }, { "epoch": 2.4915555555555557, "grad_norm": 2.2974958419799805, "learning_rate": 0.0001004270462633452, "loss": 1.936, "step": 5606 }, { "epoch": 2.492, "grad_norm": 2.2461228370666504, "learning_rate": 0.00010040925266903915, "loss": 1.9357, "step": 5607 }, { "epoch": 2.4924444444444447, "grad_norm": 2.2530407905578613, "learning_rate": 0.00010039145907473311, "loss": 2.0635, "step": 5608 }, { "epoch": 2.492888888888889, "grad_norm": 1.6925837993621826, "learning_rate": 0.00010037366548042705, "loss": 0.9727, "step": 5609 }, { "epoch": 2.493333333333333, "grad_norm": 1.5781002044677734, "learning_rate": 0.00010035587188612101, "loss": 1.0898, "step": 5610 }, { "epoch": 2.493777777777778, "grad_norm": 2.560060501098633, "learning_rate": 0.00010033807829181494, "loss": 1.6789, "step": 5611 }, { "epoch": 2.494222222222222, "grad_norm": 2.309338092803955, "learning_rate": 0.0001003202846975089, "loss": 1.7006, "step": 5612 }, { "epoch": 2.494666666666667, "grad_norm": 2.1079952716827393, "learning_rate": 0.00010030249110320284, "loss": 1.9404, "step": 5613 }, { "epoch": 2.495111111111111, "grad_norm": 2.587090253829956, "learning_rate": 0.0001002846975088968, "loss": 2.0718, "step": 5614 }, { "epoch": 2.4955555555555557, "grad_norm": 2.6044342517852783, "learning_rate": 0.00010026690391459075, "loss": 1.4223, "step": 5615 }, { "epoch": 2.496, "grad_norm": 2.266907215118408, "learning_rate": 0.0001002491103202847, "loss": 1.8536, "step": 5616 }, { "epoch": 2.4964444444444442, "grad_norm": 2.3977086544036865, "learning_rate": 0.00010023131672597865, "loss": 1.5307, "step": 5617 }, { "epoch": 2.496888888888889, "grad_norm": 2.1894779205322266, "learning_rate": 0.00010021352313167261, "loss": 1.6952, "step": 5618 }, { "epoch": 2.497333333333333, "grad_norm": 2.3074259757995605, "learning_rate": 0.00010019572953736655, "loss": 1.7691, "step": 5619 }, { "epoch": 2.497777777777778, "grad_norm": 2.2768285274505615, "learning_rate": 0.00010017793594306051, "loss": 1.696, "step": 5620 }, { "epoch": 2.498222222222222, "grad_norm": 2.4698994159698486, "learning_rate": 0.00010016014234875446, "loss": 1.4205, "step": 5621 }, { "epoch": 2.498666666666667, "grad_norm": 2.3182806968688965, "learning_rate": 0.00010014234875444841, "loss": 1.7386, "step": 5622 }, { "epoch": 2.499111111111111, "grad_norm": 2.212176561355591, "learning_rate": 0.00010012455516014236, "loss": 1.4822, "step": 5623 }, { "epoch": 2.4995555555555553, "grad_norm": 2.8632688522338867, "learning_rate": 0.00010010676156583629, "loss": 1.9683, "step": 5624 }, { "epoch": 2.5, "grad_norm": 2.298098087310791, "learning_rate": 0.00010008896797153025, "loss": 1.865, "step": 5625 }, { "epoch": 2.5004444444444447, "grad_norm": 1.8092749118804932, "learning_rate": 0.00010007117437722419, "loss": 0.9924, "step": 5626 }, { "epoch": 2.500888888888889, "grad_norm": 3.5151941776275635, "learning_rate": 0.00010005338078291815, "loss": 2.314, "step": 5627 }, { "epoch": 2.501333333333333, "grad_norm": 1.888319730758667, "learning_rate": 0.0001000355871886121, "loss": 0.9312, "step": 5628 }, { "epoch": 2.501777777777778, "grad_norm": 3.430162191390991, "learning_rate": 0.00010001779359430605, "loss": 1.7719, "step": 5629 }, { "epoch": 2.502222222222222, "grad_norm": 2.4954025745391846, "learning_rate": 0.0001, "loss": 1.4474, "step": 5630 }, { "epoch": 2.502666666666667, "grad_norm": 2.7440812587738037, "learning_rate": 9.998220640569396e-05, "loss": 2.1656, "step": 5631 }, { "epoch": 2.503111111111111, "grad_norm": 2.9336841106414795, "learning_rate": 9.99644128113879e-05, "loss": 1.767, "step": 5632 }, { "epoch": 2.5035555555555558, "grad_norm": 2.7770421504974365, "learning_rate": 9.994661921708186e-05, "loss": 1.8719, "step": 5633 }, { "epoch": 2.504, "grad_norm": 3.4708054065704346, "learning_rate": 9.99288256227758e-05, "loss": 2.0225, "step": 5634 }, { "epoch": 2.5044444444444443, "grad_norm": 3.865091323852539, "learning_rate": 9.991103202846975e-05, "loss": 1.9943, "step": 5635 }, { "epoch": 2.504888888888889, "grad_norm": 2.653648853302002, "learning_rate": 9.98932384341637e-05, "loss": 1.7085, "step": 5636 }, { "epoch": 2.505333333333333, "grad_norm": 3.2496232986450195, "learning_rate": 9.987544483985766e-05, "loss": 1.5987, "step": 5637 }, { "epoch": 2.505777777777778, "grad_norm": 2.717987537384033, "learning_rate": 9.98576512455516e-05, "loss": 1.5691, "step": 5638 }, { "epoch": 2.506222222222222, "grad_norm": 2.918710470199585, "learning_rate": 9.983985765124556e-05, "loss": 1.9691, "step": 5639 }, { "epoch": 2.506666666666667, "grad_norm": 3.100358247756958, "learning_rate": 9.98220640569395e-05, "loss": 1.907, "step": 5640 }, { "epoch": 2.507111111111111, "grad_norm": 2.9432833194732666, "learning_rate": 9.980427046263346e-05, "loss": 1.9512, "step": 5641 }, { "epoch": 2.5075555555555553, "grad_norm": 3.8566741943359375, "learning_rate": 9.97864768683274e-05, "loss": 2.0891, "step": 5642 }, { "epoch": 2.508, "grad_norm": 2.724581003189087, "learning_rate": 9.976868327402136e-05, "loss": 1.4336, "step": 5643 }, { "epoch": 2.5084444444444447, "grad_norm": 2.8649446964263916, "learning_rate": 9.975088967971531e-05, "loss": 1.8689, "step": 5644 }, { "epoch": 2.508888888888889, "grad_norm": 2.8804287910461426, "learning_rate": 9.973309608540926e-05, "loss": 1.902, "step": 5645 }, { "epoch": 2.509333333333333, "grad_norm": 2.8557331562042236, "learning_rate": 9.971530249110321e-05, "loss": 1.7273, "step": 5646 }, { "epoch": 2.509777777777778, "grad_norm": 3.825040340423584, "learning_rate": 9.969750889679716e-05, "loss": 2.0143, "step": 5647 }, { "epoch": 2.510222222222222, "grad_norm": 2.0822556018829346, "learning_rate": 9.96797153024911e-05, "loss": 0.7991, "step": 5648 }, { "epoch": 2.510666666666667, "grad_norm": 4.02184534072876, "learning_rate": 9.966192170818506e-05, "loss": 1.6318, "step": 5649 }, { "epoch": 2.511111111111111, "grad_norm": 3.868868589401245, "learning_rate": 9.964412811387901e-05, "loss": 1.7922, "step": 5650 }, { "epoch": 2.5115555555555558, "grad_norm": 1.9891340732574463, "learning_rate": 9.962633451957296e-05, "loss": 1.9876, "step": 5651 }, { "epoch": 2.512, "grad_norm": 2.9037976264953613, "learning_rate": 9.960854092526691e-05, "loss": 2.0403, "step": 5652 }, { "epoch": 2.5124444444444443, "grad_norm": 2.0764451026916504, "learning_rate": 9.959074733096086e-05, "loss": 2.0316, "step": 5653 }, { "epoch": 2.512888888888889, "grad_norm": 2.249202251434326, "learning_rate": 9.957295373665481e-05, "loss": 1.8978, "step": 5654 }, { "epoch": 2.513333333333333, "grad_norm": 2.4839189052581787, "learning_rate": 9.955516014234875e-05, "loss": 2.0246, "step": 5655 }, { "epoch": 2.513777777777778, "grad_norm": 2.4105618000030518, "learning_rate": 9.953736654804271e-05, "loss": 1.8856, "step": 5656 }, { "epoch": 2.514222222222222, "grad_norm": 1.5155545473098755, "learning_rate": 9.951957295373667e-05, "loss": 1.0195, "step": 5657 }, { "epoch": 2.514666666666667, "grad_norm": 2.674628973007202, "learning_rate": 9.950177935943061e-05, "loss": 2.1171, "step": 5658 }, { "epoch": 2.515111111111111, "grad_norm": 2.4236528873443604, "learning_rate": 9.948398576512457e-05, "loss": 1.9027, "step": 5659 }, { "epoch": 2.5155555555555553, "grad_norm": 2.4038853645324707, "learning_rate": 9.946619217081851e-05, "loss": 1.5235, "step": 5660 }, { "epoch": 2.516, "grad_norm": 2.637871026992798, "learning_rate": 9.944839857651245e-05, "loss": 2.012, "step": 5661 }, { "epoch": 2.5164444444444447, "grad_norm": 2.1370859146118164, "learning_rate": 9.943060498220641e-05, "loss": 1.6912, "step": 5662 }, { "epoch": 2.516888888888889, "grad_norm": 1.37175452709198, "learning_rate": 9.941281138790037e-05, "loss": 0.8939, "step": 5663 }, { "epoch": 2.517333333333333, "grad_norm": 2.60074520111084, "learning_rate": 9.939501779359431e-05, "loss": 2.1094, "step": 5664 }, { "epoch": 2.517777777777778, "grad_norm": 2.421234607696533, "learning_rate": 9.937722419928827e-05, "loss": 2.2255, "step": 5665 }, { "epoch": 2.518222222222222, "grad_norm": 1.9047367572784424, "learning_rate": 9.935943060498221e-05, "loss": 1.5189, "step": 5666 }, { "epoch": 2.518666666666667, "grad_norm": 2.2271924018859863, "learning_rate": 9.934163701067616e-05, "loss": 1.6326, "step": 5667 }, { "epoch": 2.519111111111111, "grad_norm": 2.0416018962860107, "learning_rate": 9.932384341637011e-05, "loss": 1.1365, "step": 5668 }, { "epoch": 2.5195555555555558, "grad_norm": 2.345722198486328, "learning_rate": 9.930604982206406e-05, "loss": 1.771, "step": 5669 }, { "epoch": 2.52, "grad_norm": 2.515684127807617, "learning_rate": 9.928825622775802e-05, "loss": 1.5751, "step": 5670 }, { "epoch": 2.5204444444444443, "grad_norm": 2.2543230056762695, "learning_rate": 9.927046263345196e-05, "loss": 1.505, "step": 5671 }, { "epoch": 2.520888888888889, "grad_norm": 2.895059585571289, "learning_rate": 9.92526690391459e-05, "loss": 2.2744, "step": 5672 }, { "epoch": 2.521333333333333, "grad_norm": 2.4043800830841064, "learning_rate": 9.923487544483986e-05, "loss": 1.5109, "step": 5673 }, { "epoch": 2.521777777777778, "grad_norm": 2.604374885559082, "learning_rate": 9.92170818505338e-05, "loss": 1.5665, "step": 5674 }, { "epoch": 2.522222222222222, "grad_norm": 2.587221145629883, "learning_rate": 9.919928825622776e-05, "loss": 1.7969, "step": 5675 }, { "epoch": 2.522666666666667, "grad_norm": 2.516834259033203, "learning_rate": 9.918149466192172e-05, "loss": 1.5522, "step": 5676 }, { "epoch": 2.523111111111111, "grad_norm": 2.5173187255859375, "learning_rate": 9.916370106761566e-05, "loss": 1.8983, "step": 5677 }, { "epoch": 2.5235555555555553, "grad_norm": 3.197408676147461, "learning_rate": 9.914590747330962e-05, "loss": 2.0039, "step": 5678 }, { "epoch": 2.524, "grad_norm": 2.5664446353912354, "learning_rate": 9.912811387900356e-05, "loss": 1.7915, "step": 5679 }, { "epoch": 2.5244444444444447, "grad_norm": 2.8082244396209717, "learning_rate": 9.91103202846975e-05, "loss": 2.2351, "step": 5680 }, { "epoch": 2.524888888888889, "grad_norm": 2.381884813308716, "learning_rate": 9.909252669039146e-05, "loss": 1.3298, "step": 5681 }, { "epoch": 2.525333333333333, "grad_norm": 2.747676134109497, "learning_rate": 9.907473309608542e-05, "loss": 1.758, "step": 5682 }, { "epoch": 2.525777777777778, "grad_norm": 1.99376380443573, "learning_rate": 9.905693950177936e-05, "loss": 0.815, "step": 5683 }, { "epoch": 2.526222222222222, "grad_norm": 2.863327980041504, "learning_rate": 9.903914590747332e-05, "loss": 1.9873, "step": 5684 }, { "epoch": 2.5266666666666664, "grad_norm": 2.060922861099243, "learning_rate": 9.902135231316726e-05, "loss": 1.206, "step": 5685 }, { "epoch": 2.527111111111111, "grad_norm": 2.845263719558716, "learning_rate": 9.900355871886122e-05, "loss": 1.7081, "step": 5686 }, { "epoch": 2.5275555555555558, "grad_norm": 2.831609010696411, "learning_rate": 9.898576512455516e-05, "loss": 1.9307, "step": 5687 }, { "epoch": 2.528, "grad_norm": 2.4219014644622803, "learning_rate": 9.896797153024912e-05, "loss": 1.2403, "step": 5688 }, { "epoch": 2.5284444444444443, "grad_norm": 3.4156317710876465, "learning_rate": 9.895017793594307e-05, "loss": 1.9255, "step": 5689 }, { "epoch": 2.528888888888889, "grad_norm": 3.083696126937866, "learning_rate": 9.893238434163702e-05, "loss": 1.5899, "step": 5690 }, { "epoch": 2.529333333333333, "grad_norm": 3.043922185897827, "learning_rate": 9.891459074733097e-05, "loss": 1.9652, "step": 5691 }, { "epoch": 2.529777777777778, "grad_norm": 2.8423397541046143, "learning_rate": 9.889679715302491e-05, "loss": 1.7007, "step": 5692 }, { "epoch": 2.530222222222222, "grad_norm": 3.143592596054077, "learning_rate": 9.887900355871886e-05, "loss": 1.613, "step": 5693 }, { "epoch": 2.530666666666667, "grad_norm": 3.120520830154419, "learning_rate": 9.886120996441281e-05, "loss": 1.5702, "step": 5694 }, { "epoch": 2.531111111111111, "grad_norm": 3.2891790866851807, "learning_rate": 9.884341637010677e-05, "loss": 2.051, "step": 5695 }, { "epoch": 2.5315555555555553, "grad_norm": 2.3981990814208984, "learning_rate": 9.882562277580071e-05, "loss": 1.1692, "step": 5696 }, { "epoch": 2.532, "grad_norm": 2.9288384914398193, "learning_rate": 9.880782918149467e-05, "loss": 1.5118, "step": 5697 }, { "epoch": 2.5324444444444447, "grad_norm": 2.981546640396118, "learning_rate": 9.879003558718861e-05, "loss": 1.6747, "step": 5698 }, { "epoch": 2.532888888888889, "grad_norm": 3.5307023525238037, "learning_rate": 9.877224199288257e-05, "loss": 1.7474, "step": 5699 }, { "epoch": 2.533333333333333, "grad_norm": 5.0346832275390625, "learning_rate": 9.875444839857651e-05, "loss": 1.8528, "step": 5700 }, { "epoch": 2.533777777777778, "grad_norm": 1.5125519037246704, "learning_rate": 9.873665480427047e-05, "loss": 1.1769, "step": 5701 }, { "epoch": 2.534222222222222, "grad_norm": 1.7165669202804565, "learning_rate": 9.871886120996443e-05, "loss": 1.086, "step": 5702 }, { "epoch": 2.5346666666666664, "grad_norm": 1.4426929950714111, "learning_rate": 9.870106761565837e-05, "loss": 1.1691, "step": 5703 }, { "epoch": 2.535111111111111, "grad_norm": 2.2641258239746094, "learning_rate": 9.868327402135232e-05, "loss": 2.1713, "step": 5704 }, { "epoch": 2.535555555555556, "grad_norm": 2.1632139682769775, "learning_rate": 9.866548042704627e-05, "loss": 2.2283, "step": 5705 }, { "epoch": 2.536, "grad_norm": 1.8564096689224243, "learning_rate": 9.864768683274021e-05, "loss": 1.0078, "step": 5706 }, { "epoch": 2.5364444444444443, "grad_norm": 2.37397837638855, "learning_rate": 9.862989323843417e-05, "loss": 2.1491, "step": 5707 }, { "epoch": 2.536888888888889, "grad_norm": 2.2556726932525635, "learning_rate": 9.861209964412812e-05, "loss": 2.0101, "step": 5708 }, { "epoch": 2.537333333333333, "grad_norm": 2.226167678833008, "learning_rate": 9.859430604982207e-05, "loss": 1.8431, "step": 5709 }, { "epoch": 2.537777777777778, "grad_norm": 2.111975908279419, "learning_rate": 9.857651245551602e-05, "loss": 1.9402, "step": 5710 }, { "epoch": 2.538222222222222, "grad_norm": 2.208085060119629, "learning_rate": 9.855871886120997e-05, "loss": 1.8334, "step": 5711 }, { "epoch": 2.538666666666667, "grad_norm": 2.404080867767334, "learning_rate": 9.854092526690392e-05, "loss": 2.0223, "step": 5712 }, { "epoch": 2.539111111111111, "grad_norm": 2.602574586868286, "learning_rate": 9.852313167259787e-05, "loss": 2.0228, "step": 5713 }, { "epoch": 2.5395555555555553, "grad_norm": 2.617043972015381, "learning_rate": 9.850533807829182e-05, "loss": 1.7762, "step": 5714 }, { "epoch": 2.54, "grad_norm": 2.011544704437256, "learning_rate": 9.848754448398578e-05, "loss": 1.5084, "step": 5715 }, { "epoch": 2.5404444444444443, "grad_norm": 2.117644786834717, "learning_rate": 9.846975088967972e-05, "loss": 1.6064, "step": 5716 }, { "epoch": 2.540888888888889, "grad_norm": 2.198139190673828, "learning_rate": 9.845195729537368e-05, "loss": 1.6679, "step": 5717 }, { "epoch": 2.541333333333333, "grad_norm": 1.5015946626663208, "learning_rate": 9.843416370106762e-05, "loss": 0.9008, "step": 5718 }, { "epoch": 2.541777777777778, "grad_norm": 2.3386454582214355, "learning_rate": 9.841637010676156e-05, "loss": 1.665, "step": 5719 }, { "epoch": 2.542222222222222, "grad_norm": 2.063887357711792, "learning_rate": 9.839857651245552e-05, "loss": 1.4986, "step": 5720 }, { "epoch": 2.5426666666666664, "grad_norm": 2.3388946056365967, "learning_rate": 9.838078291814948e-05, "loss": 2.0517, "step": 5721 }, { "epoch": 2.543111111111111, "grad_norm": 2.838296890258789, "learning_rate": 9.836298932384342e-05, "loss": 1.8289, "step": 5722 }, { "epoch": 2.543555555555556, "grad_norm": 2.528554677963257, "learning_rate": 9.834519572953738e-05, "loss": 2.0108, "step": 5723 }, { "epoch": 2.544, "grad_norm": 3.278167963027954, "learning_rate": 9.832740213523132e-05, "loss": 0.9733, "step": 5724 }, { "epoch": 2.5444444444444443, "grad_norm": 2.756471633911133, "learning_rate": 9.830960854092526e-05, "loss": 1.7284, "step": 5725 }, { "epoch": 2.544888888888889, "grad_norm": 2.6623575687408447, "learning_rate": 9.829181494661922e-05, "loss": 1.8741, "step": 5726 }, { "epoch": 2.5453333333333332, "grad_norm": 2.228694438934326, "learning_rate": 9.827402135231318e-05, "loss": 1.5795, "step": 5727 }, { "epoch": 2.545777777777778, "grad_norm": 1.9252150058746338, "learning_rate": 9.825622775800712e-05, "loss": 1.038, "step": 5728 }, { "epoch": 2.546222222222222, "grad_norm": 2.6613807678222656, "learning_rate": 9.823843416370107e-05, "loss": 1.9298, "step": 5729 }, { "epoch": 2.546666666666667, "grad_norm": 2.613647222518921, "learning_rate": 9.822064056939502e-05, "loss": 1.8275, "step": 5730 }, { "epoch": 2.547111111111111, "grad_norm": 2.9346303939819336, "learning_rate": 9.820284697508897e-05, "loss": 1.9397, "step": 5731 }, { "epoch": 2.5475555555555554, "grad_norm": 2.4454238414764404, "learning_rate": 9.818505338078292e-05, "loss": 1.6681, "step": 5732 }, { "epoch": 2.548, "grad_norm": 2.5040907859802246, "learning_rate": 9.816725978647687e-05, "loss": 1.2844, "step": 5733 }, { "epoch": 2.5484444444444443, "grad_norm": 2.631068706512451, "learning_rate": 9.814946619217083e-05, "loss": 1.482, "step": 5734 }, { "epoch": 2.548888888888889, "grad_norm": 2.5751149654388428, "learning_rate": 9.813167259786477e-05, "loss": 1.7342, "step": 5735 }, { "epoch": 2.5493333333333332, "grad_norm": 2.755535840988159, "learning_rate": 9.811387900355873e-05, "loss": 1.775, "step": 5736 }, { "epoch": 2.549777777777778, "grad_norm": 3.007052421569824, "learning_rate": 9.809608540925267e-05, "loss": 1.8263, "step": 5737 }, { "epoch": 2.550222222222222, "grad_norm": 2.7909440994262695, "learning_rate": 9.807829181494662e-05, "loss": 1.7401, "step": 5738 }, { "epoch": 2.5506666666666664, "grad_norm": 3.113389492034912, "learning_rate": 9.806049822064057e-05, "loss": 1.4822, "step": 5739 }, { "epoch": 2.551111111111111, "grad_norm": 3.607086420059204, "learning_rate": 9.804270462633453e-05, "loss": 2.3029, "step": 5740 }, { "epoch": 2.551555555555556, "grad_norm": 2.5248825550079346, "learning_rate": 9.802491103202847e-05, "loss": 1.6365, "step": 5741 }, { "epoch": 2.552, "grad_norm": 3.1107113361358643, "learning_rate": 9.800711743772243e-05, "loss": 1.6821, "step": 5742 }, { "epoch": 2.5524444444444443, "grad_norm": 3.095695734024048, "learning_rate": 9.798932384341637e-05, "loss": 1.9611, "step": 5743 }, { "epoch": 2.552888888888889, "grad_norm": 3.119293212890625, "learning_rate": 9.797153024911033e-05, "loss": 1.8032, "step": 5744 }, { "epoch": 2.5533333333333332, "grad_norm": 3.1017251014709473, "learning_rate": 9.795373665480427e-05, "loss": 1.3616, "step": 5745 }, { "epoch": 2.553777777777778, "grad_norm": 3.225269317626953, "learning_rate": 9.793594306049823e-05, "loss": 1.6475, "step": 5746 }, { "epoch": 2.554222222222222, "grad_norm": 3.819324254989624, "learning_rate": 9.791814946619218e-05, "loss": 1.8154, "step": 5747 }, { "epoch": 2.554666666666667, "grad_norm": 1.9647773504257202, "learning_rate": 9.790035587188613e-05, "loss": 0.7431, "step": 5748 }, { "epoch": 2.555111111111111, "grad_norm": 3.4745988845825195, "learning_rate": 9.788256227758008e-05, "loss": 1.7861, "step": 5749 }, { "epoch": 2.5555555555555554, "grad_norm": 4.805171489715576, "learning_rate": 9.786476868327403e-05, "loss": 1.0196, "step": 5750 }, { "epoch": 2.556, "grad_norm": 1.983173131942749, "learning_rate": 9.784697508896797e-05, "loss": 2.2983, "step": 5751 }, { "epoch": 2.5564444444444443, "grad_norm": 1.9651044607162476, "learning_rate": 9.782918149466193e-05, "loss": 1.9459, "step": 5752 }, { "epoch": 2.556888888888889, "grad_norm": 2.155700445175171, "learning_rate": 9.781138790035588e-05, "loss": 2.1677, "step": 5753 }, { "epoch": 2.5573333333333332, "grad_norm": 0.1379898637533188, "learning_rate": 9.779359430604982e-05, "loss": 0.0167, "step": 5754 }, { "epoch": 2.557777777777778, "grad_norm": 2.319441080093384, "learning_rate": 9.777580071174378e-05, "loss": 2.1024, "step": 5755 }, { "epoch": 2.558222222222222, "grad_norm": 2.3348388671875, "learning_rate": 9.775800711743772e-05, "loss": 2.4375, "step": 5756 }, { "epoch": 2.5586666666666664, "grad_norm": 2.42160964012146, "learning_rate": 9.774021352313168e-05, "loss": 2.0312, "step": 5757 }, { "epoch": 2.559111111111111, "grad_norm": 2.4215610027313232, "learning_rate": 9.772241992882562e-05, "loss": 2.1268, "step": 5758 }, { "epoch": 2.559555555555556, "grad_norm": 1.9341044425964355, "learning_rate": 9.770462633451958e-05, "loss": 1.6431, "step": 5759 }, { "epoch": 2.56, "grad_norm": 2.569185972213745, "learning_rate": 9.768683274021354e-05, "loss": 1.9818, "step": 5760 }, { "epoch": 2.5604444444444443, "grad_norm": 2.380908727645874, "learning_rate": 9.766903914590748e-05, "loss": 1.6281, "step": 5761 }, { "epoch": 2.560888888888889, "grad_norm": 2.445352554321289, "learning_rate": 9.765124555160144e-05, "loss": 1.7746, "step": 5762 }, { "epoch": 2.5613333333333332, "grad_norm": 1.9633126258850098, "learning_rate": 9.763345195729538e-05, "loss": 1.2796, "step": 5763 }, { "epoch": 2.561777777777778, "grad_norm": 2.4921209812164307, "learning_rate": 9.761565836298932e-05, "loss": 2.1531, "step": 5764 }, { "epoch": 2.562222222222222, "grad_norm": 1.6959223747253418, "learning_rate": 9.759786476868328e-05, "loss": 0.8272, "step": 5765 }, { "epoch": 2.562666666666667, "grad_norm": 2.389455556869507, "learning_rate": 9.758007117437723e-05, "loss": 1.7277, "step": 5766 }, { "epoch": 2.563111111111111, "grad_norm": 1.7974770069122314, "learning_rate": 9.756227758007118e-05, "loss": 1.1353, "step": 5767 }, { "epoch": 2.5635555555555554, "grad_norm": 2.2047617435455322, "learning_rate": 9.754448398576513e-05, "loss": 1.7808, "step": 5768 }, { "epoch": 2.564, "grad_norm": 3.0488312244415283, "learning_rate": 9.752669039145908e-05, "loss": 2.2577, "step": 5769 }, { "epoch": 2.5644444444444443, "grad_norm": 2.719773054122925, "learning_rate": 9.750889679715302e-05, "loss": 2.1927, "step": 5770 }, { "epoch": 2.564888888888889, "grad_norm": 2.654473304748535, "learning_rate": 9.749110320284698e-05, "loss": 1.8773, "step": 5771 }, { "epoch": 2.5653333333333332, "grad_norm": 2.6189746856689453, "learning_rate": 9.747330960854093e-05, "loss": 1.5834, "step": 5772 }, { "epoch": 2.565777777777778, "grad_norm": 2.9932594299316406, "learning_rate": 9.745551601423488e-05, "loss": 2.3835, "step": 5773 }, { "epoch": 2.566222222222222, "grad_norm": 2.437887668609619, "learning_rate": 9.743772241992883e-05, "loss": 1.7243, "step": 5774 }, { "epoch": 2.5666666666666664, "grad_norm": 2.175389051437378, "learning_rate": 9.741992882562279e-05, "loss": 1.4495, "step": 5775 }, { "epoch": 2.567111111111111, "grad_norm": 2.9918575286865234, "learning_rate": 9.740213523131673e-05, "loss": 2.0336, "step": 5776 }, { "epoch": 2.567555555555556, "grad_norm": 2.525195360183716, "learning_rate": 9.738434163701067e-05, "loss": 1.6138, "step": 5777 }, { "epoch": 2.568, "grad_norm": 2.7021284103393555, "learning_rate": 9.736654804270463e-05, "loss": 1.7927, "step": 5778 }, { "epoch": 2.5684444444444443, "grad_norm": 2.420149087905884, "learning_rate": 9.734875444839859e-05, "loss": 1.7339, "step": 5779 }, { "epoch": 2.568888888888889, "grad_norm": 2.3807191848754883, "learning_rate": 9.733096085409253e-05, "loss": 0.6999, "step": 5780 }, { "epoch": 2.5693333333333332, "grad_norm": 2.4973058700561523, "learning_rate": 9.731316725978649e-05, "loss": 1.6894, "step": 5781 }, { "epoch": 2.569777777777778, "grad_norm": 2.848585844039917, "learning_rate": 9.729537366548043e-05, "loss": 1.9012, "step": 5782 }, { "epoch": 2.570222222222222, "grad_norm": 2.616931676864624, "learning_rate": 9.727758007117437e-05, "loss": 1.5713, "step": 5783 }, { "epoch": 2.570666666666667, "grad_norm": 2.4025752544403076, "learning_rate": 9.725978647686833e-05, "loss": 1.6347, "step": 5784 }, { "epoch": 2.571111111111111, "grad_norm": 2.759446620941162, "learning_rate": 9.724199288256229e-05, "loss": 1.5939, "step": 5785 }, { "epoch": 2.5715555555555554, "grad_norm": 2.421267032623291, "learning_rate": 9.722419928825623e-05, "loss": 1.3653, "step": 5786 }, { "epoch": 2.572, "grad_norm": 3.161170482635498, "learning_rate": 9.720640569395019e-05, "loss": 1.5433, "step": 5787 }, { "epoch": 2.5724444444444443, "grad_norm": 3.038508653640747, "learning_rate": 9.718861209964413e-05, "loss": 1.4943, "step": 5788 }, { "epoch": 2.572888888888889, "grad_norm": 2.3798742294311523, "learning_rate": 9.717081850533809e-05, "loss": 1.1891, "step": 5789 }, { "epoch": 2.5733333333333333, "grad_norm": 2.9611308574676514, "learning_rate": 9.715302491103203e-05, "loss": 1.6984, "step": 5790 }, { "epoch": 2.573777777777778, "grad_norm": 3.015956401824951, "learning_rate": 9.713523131672598e-05, "loss": 1.6971, "step": 5791 }, { "epoch": 2.574222222222222, "grad_norm": 2.819864273071289, "learning_rate": 9.711743772241994e-05, "loss": 1.5978, "step": 5792 }, { "epoch": 2.5746666666666664, "grad_norm": 3.813222885131836, "learning_rate": 9.709964412811388e-05, "loss": 1.9956, "step": 5793 }, { "epoch": 2.575111111111111, "grad_norm": 2.9637928009033203, "learning_rate": 9.708185053380784e-05, "loss": 1.7197, "step": 5794 }, { "epoch": 2.575555555555556, "grad_norm": 3.4697177410125732, "learning_rate": 9.706405693950178e-05, "loss": 1.3374, "step": 5795 }, { "epoch": 2.576, "grad_norm": 3.505645751953125, "learning_rate": 9.704626334519573e-05, "loss": 1.9452, "step": 5796 }, { "epoch": 2.5764444444444443, "grad_norm": 3.6183602809906006, "learning_rate": 9.702846975088968e-05, "loss": 1.6057, "step": 5797 }, { "epoch": 2.576888888888889, "grad_norm": 3.725675106048584, "learning_rate": 9.701067615658364e-05, "loss": 1.6473, "step": 5798 }, { "epoch": 2.5773333333333333, "grad_norm": 2.1364715099334717, "learning_rate": 9.699288256227758e-05, "loss": 0.8211, "step": 5799 }, { "epoch": 2.5777777777777775, "grad_norm": 4.8051934242248535, "learning_rate": 9.697508896797154e-05, "loss": 1.2673, "step": 5800 }, { "epoch": 2.578222222222222, "grad_norm": 1.737740397453308, "learning_rate": 9.695729537366548e-05, "loss": 2.0515, "step": 5801 }, { "epoch": 2.578666666666667, "grad_norm": 1.431738018989563, "learning_rate": 9.693950177935944e-05, "loss": 1.0424, "step": 5802 }, { "epoch": 2.579111111111111, "grad_norm": 1.8884087800979614, "learning_rate": 9.692170818505338e-05, "loss": 2.2214, "step": 5803 }, { "epoch": 2.5795555555555554, "grad_norm": 2.2000203132629395, "learning_rate": 9.690391459074734e-05, "loss": 2.3403, "step": 5804 }, { "epoch": 2.58, "grad_norm": 1.952431559562683, "learning_rate": 9.68861209964413e-05, "loss": 2.2942, "step": 5805 }, { "epoch": 2.5804444444444443, "grad_norm": 2.162529706954956, "learning_rate": 9.686832740213524e-05, "loss": 2.1921, "step": 5806 }, { "epoch": 2.580888888888889, "grad_norm": 2.1220896244049072, "learning_rate": 9.68505338078292e-05, "loss": 1.914, "step": 5807 }, { "epoch": 2.5813333333333333, "grad_norm": 1.9332010746002197, "learning_rate": 9.683274021352314e-05, "loss": 1.4451, "step": 5808 }, { "epoch": 2.581777777777778, "grad_norm": 2.8427624702453613, "learning_rate": 9.681494661921708e-05, "loss": 2.0637, "step": 5809 }, { "epoch": 2.582222222222222, "grad_norm": 1.7098627090454102, "learning_rate": 9.679715302491104e-05, "loss": 1.2616, "step": 5810 }, { "epoch": 2.5826666666666664, "grad_norm": 2.2705390453338623, "learning_rate": 9.677935943060499e-05, "loss": 1.9668, "step": 5811 }, { "epoch": 2.583111111111111, "grad_norm": 2.204101800918579, "learning_rate": 9.676156583629894e-05, "loss": 1.8447, "step": 5812 }, { "epoch": 2.583555555555556, "grad_norm": 2.294118642807007, "learning_rate": 9.674377224199289e-05, "loss": 1.7681, "step": 5813 }, { "epoch": 2.584, "grad_norm": 2.175595760345459, "learning_rate": 9.672597864768683e-05, "loss": 1.9735, "step": 5814 }, { "epoch": 2.5844444444444443, "grad_norm": 2.1814606189727783, "learning_rate": 9.670818505338078e-05, "loss": 1.709, "step": 5815 }, { "epoch": 2.584888888888889, "grad_norm": 2.43196702003479, "learning_rate": 9.669039145907473e-05, "loss": 2.0574, "step": 5816 }, { "epoch": 2.5853333333333333, "grad_norm": 2.466120958328247, "learning_rate": 9.667259786476869e-05, "loss": 2.2044, "step": 5817 }, { "epoch": 2.5857777777777775, "grad_norm": 1.9092581272125244, "learning_rate": 9.665480427046263e-05, "loss": 0.5758, "step": 5818 }, { "epoch": 2.586222222222222, "grad_norm": 2.337799310684204, "learning_rate": 9.663701067615659e-05, "loss": 2.1761, "step": 5819 }, { "epoch": 2.586666666666667, "grad_norm": 2.0676565170288086, "learning_rate": 9.661921708185055e-05, "loss": 1.7785, "step": 5820 }, { "epoch": 2.587111111111111, "grad_norm": 2.228407382965088, "learning_rate": 9.660142348754449e-05, "loss": 1.3814, "step": 5821 }, { "epoch": 2.5875555555555554, "grad_norm": 2.3954479694366455, "learning_rate": 9.658362989323843e-05, "loss": 1.3558, "step": 5822 }, { "epoch": 2.588, "grad_norm": 2.1830084323883057, "learning_rate": 9.656583629893239e-05, "loss": 1.8961, "step": 5823 }, { "epoch": 2.5884444444444443, "grad_norm": 2.57344913482666, "learning_rate": 9.654804270462635e-05, "loss": 1.8005, "step": 5824 }, { "epoch": 2.588888888888889, "grad_norm": 2.3932812213897705, "learning_rate": 9.653024911032029e-05, "loss": 1.824, "step": 5825 }, { "epoch": 2.5893333333333333, "grad_norm": 2.067878007888794, "learning_rate": 9.651245551601425e-05, "loss": 1.7157, "step": 5826 }, { "epoch": 2.589777777777778, "grad_norm": 2.737030267715454, "learning_rate": 9.649466192170819e-05, "loss": 1.3283, "step": 5827 }, { "epoch": 2.590222222222222, "grad_norm": 2.7902398109436035, "learning_rate": 9.647686832740213e-05, "loss": 1.8529, "step": 5828 }, { "epoch": 2.5906666666666665, "grad_norm": 2.5697999000549316, "learning_rate": 9.645907473309609e-05, "loss": 1.7706, "step": 5829 }, { "epoch": 2.591111111111111, "grad_norm": 2.486321449279785, "learning_rate": 9.644128113879004e-05, "loss": 1.4964, "step": 5830 }, { "epoch": 2.5915555555555554, "grad_norm": 2.440593719482422, "learning_rate": 9.642348754448399e-05, "loss": 1.732, "step": 5831 }, { "epoch": 2.592, "grad_norm": 2.2928342819213867, "learning_rate": 9.640569395017794e-05, "loss": 1.0141, "step": 5832 }, { "epoch": 2.5924444444444443, "grad_norm": 2.5680768489837646, "learning_rate": 9.63879003558719e-05, "loss": 1.3784, "step": 5833 }, { "epoch": 2.592888888888889, "grad_norm": 2.402845621109009, "learning_rate": 9.637010676156584e-05, "loss": 1.7085, "step": 5834 }, { "epoch": 2.5933333333333333, "grad_norm": 1.819101333618164, "learning_rate": 9.635231316725979e-05, "loss": 0.8828, "step": 5835 }, { "epoch": 2.5937777777777775, "grad_norm": 2.9509596824645996, "learning_rate": 9.633451957295374e-05, "loss": 1.4475, "step": 5836 }, { "epoch": 2.594222222222222, "grad_norm": 2.3825151920318604, "learning_rate": 9.63167259786477e-05, "loss": 1.2385, "step": 5837 }, { "epoch": 2.594666666666667, "grad_norm": 3.0059218406677246, "learning_rate": 9.629893238434164e-05, "loss": 1.6829, "step": 5838 }, { "epoch": 2.595111111111111, "grad_norm": 3.277036190032959, "learning_rate": 9.62811387900356e-05, "loss": 1.6309, "step": 5839 }, { "epoch": 2.5955555555555554, "grad_norm": 2.9145870208740234, "learning_rate": 9.626334519572954e-05, "loss": 2.0603, "step": 5840 }, { "epoch": 2.596, "grad_norm": 2.803736686706543, "learning_rate": 9.624555160142348e-05, "loss": 1.6505, "step": 5841 }, { "epoch": 2.5964444444444443, "grad_norm": 3.3242056369781494, "learning_rate": 9.622775800711744e-05, "loss": 1.9847, "step": 5842 }, { "epoch": 2.596888888888889, "grad_norm": 3.6770615577697754, "learning_rate": 9.62099644128114e-05, "loss": 1.4115, "step": 5843 }, { "epoch": 2.5973333333333333, "grad_norm": 2.989746332168579, "learning_rate": 9.619217081850534e-05, "loss": 1.7387, "step": 5844 }, { "epoch": 2.597777777777778, "grad_norm": 3.3722546100616455, "learning_rate": 9.61743772241993e-05, "loss": 1.6478, "step": 5845 }, { "epoch": 2.598222222222222, "grad_norm": 3.2518842220306396, "learning_rate": 9.615658362989324e-05, "loss": 1.1471, "step": 5846 }, { "epoch": 2.5986666666666665, "grad_norm": 3.7409679889678955, "learning_rate": 9.61387900355872e-05, "loss": 1.9251, "step": 5847 }, { "epoch": 2.599111111111111, "grad_norm": 3.091366767883301, "learning_rate": 9.612099644128114e-05, "loss": 1.5747, "step": 5848 }, { "epoch": 2.5995555555555554, "grad_norm": 3.8686630725860596, "learning_rate": 9.61032028469751e-05, "loss": 1.0119, "step": 5849 }, { "epoch": 2.6, "grad_norm": 2.7347426414489746, "learning_rate": 9.608540925266905e-05, "loss": 0.4375, "step": 5850 }, { "epoch": 2.6004444444444443, "grad_norm": 1.894957184791565, "learning_rate": 9.6067615658363e-05, "loss": 2.3036, "step": 5851 }, { "epoch": 2.600888888888889, "grad_norm": 1.9998407363891602, "learning_rate": 9.604982206405695e-05, "loss": 2.2295, "step": 5852 }, { "epoch": 2.6013333333333333, "grad_norm": 2.1077568531036377, "learning_rate": 9.60320284697509e-05, "loss": 1.085, "step": 5853 }, { "epoch": 2.6017777777777775, "grad_norm": 2.0301358699798584, "learning_rate": 9.601423487544484e-05, "loss": 1.1873, "step": 5854 }, { "epoch": 2.602222222222222, "grad_norm": 2.020034074783325, "learning_rate": 9.59964412811388e-05, "loss": 1.7447, "step": 5855 }, { "epoch": 2.602666666666667, "grad_norm": 2.428344964981079, "learning_rate": 9.597864768683275e-05, "loss": 2.155, "step": 5856 }, { "epoch": 2.603111111111111, "grad_norm": 2.2994885444641113, "learning_rate": 9.596085409252669e-05, "loss": 2.0446, "step": 5857 }, { "epoch": 2.6035555555555554, "grad_norm": 2.4764015674591064, "learning_rate": 9.594306049822065e-05, "loss": 1.8314, "step": 5858 }, { "epoch": 2.604, "grad_norm": 2.4663918018341064, "learning_rate": 9.592526690391459e-05, "loss": 1.6086, "step": 5859 }, { "epoch": 2.6044444444444443, "grad_norm": 2.3457717895507812, "learning_rate": 9.590747330960854e-05, "loss": 1.8214, "step": 5860 }, { "epoch": 2.604888888888889, "grad_norm": 1.8627485036849976, "learning_rate": 9.588967971530249e-05, "loss": 1.225, "step": 5861 }, { "epoch": 2.6053333333333333, "grad_norm": 2.610481023788452, "learning_rate": 9.587188612099645e-05, "loss": 1.8318, "step": 5862 }, { "epoch": 2.605777777777778, "grad_norm": 2.533219337463379, "learning_rate": 9.585409252669039e-05, "loss": 1.8934, "step": 5863 }, { "epoch": 2.606222222222222, "grad_norm": 2.506080389022827, "learning_rate": 9.583629893238435e-05, "loss": 2.0397, "step": 5864 }, { "epoch": 2.6066666666666665, "grad_norm": 2.50927734375, "learning_rate": 9.58185053380783e-05, "loss": 1.8956, "step": 5865 }, { "epoch": 2.607111111111111, "grad_norm": 2.1902568340301514, "learning_rate": 9.580071174377225e-05, "loss": 1.7044, "step": 5866 }, { "epoch": 2.6075555555555554, "grad_norm": 2.695941925048828, "learning_rate": 9.578291814946619e-05, "loss": 1.936, "step": 5867 }, { "epoch": 2.608, "grad_norm": 2.4527158737182617, "learning_rate": 9.576512455516015e-05, "loss": 1.4728, "step": 5868 }, { "epoch": 2.6084444444444443, "grad_norm": 2.4402294158935547, "learning_rate": 9.57473309608541e-05, "loss": 1.6587, "step": 5869 }, { "epoch": 2.608888888888889, "grad_norm": 2.2047953605651855, "learning_rate": 9.572953736654805e-05, "loss": 1.5533, "step": 5870 }, { "epoch": 2.6093333333333333, "grad_norm": 2.1342928409576416, "learning_rate": 9.5711743772242e-05, "loss": 1.6553, "step": 5871 }, { "epoch": 2.6097777777777775, "grad_norm": 2.7137703895568848, "learning_rate": 9.569395017793595e-05, "loss": 2.0063, "step": 5872 }, { "epoch": 2.610222222222222, "grad_norm": 2.3683669567108154, "learning_rate": 9.567615658362989e-05, "loss": 1.4502, "step": 5873 }, { "epoch": 2.610666666666667, "grad_norm": 2.7055866718292236, "learning_rate": 9.565836298932385e-05, "loss": 2.0081, "step": 5874 }, { "epoch": 2.611111111111111, "grad_norm": 2.435533046722412, "learning_rate": 9.56405693950178e-05, "loss": 1.4356, "step": 5875 }, { "epoch": 2.6115555555555554, "grad_norm": 2.407223701477051, "learning_rate": 9.562277580071174e-05, "loss": 1.4788, "step": 5876 }, { "epoch": 2.612, "grad_norm": 2.53997540473938, "learning_rate": 9.56049822064057e-05, "loss": 1.6948, "step": 5877 }, { "epoch": 2.6124444444444443, "grad_norm": 2.5272786617279053, "learning_rate": 9.558718861209966e-05, "loss": 2.0246, "step": 5878 }, { "epoch": 2.612888888888889, "grad_norm": 2.6360082626342773, "learning_rate": 9.55693950177936e-05, "loss": 1.6889, "step": 5879 }, { "epoch": 2.6133333333333333, "grad_norm": 2.879133462905884, "learning_rate": 9.555160142348754e-05, "loss": 1.7153, "step": 5880 }, { "epoch": 2.613777777777778, "grad_norm": 1.858364224433899, "learning_rate": 9.55338078291815e-05, "loss": 0.8528, "step": 5881 }, { "epoch": 2.6142222222222222, "grad_norm": 2.2637181282043457, "learning_rate": 9.551601423487546e-05, "loss": 1.3998, "step": 5882 }, { "epoch": 2.6146666666666665, "grad_norm": 2.4725327491760254, "learning_rate": 9.54982206405694e-05, "loss": 1.8587, "step": 5883 }, { "epoch": 2.615111111111111, "grad_norm": 2.9442496299743652, "learning_rate": 9.548042704626336e-05, "loss": 2.0215, "step": 5884 }, { "epoch": 2.6155555555555554, "grad_norm": 2.771085739135742, "learning_rate": 9.54626334519573e-05, "loss": 1.7668, "step": 5885 }, { "epoch": 2.616, "grad_norm": 3.2240281105041504, "learning_rate": 9.544483985765124e-05, "loss": 1.9279, "step": 5886 }, { "epoch": 2.6164444444444444, "grad_norm": 2.726102590560913, "learning_rate": 9.54270462633452e-05, "loss": 1.5733, "step": 5887 }, { "epoch": 2.616888888888889, "grad_norm": 2.668149709701538, "learning_rate": 9.540925266903915e-05, "loss": 1.562, "step": 5888 }, { "epoch": 2.6173333333333333, "grad_norm": 2.835635185241699, "learning_rate": 9.53914590747331e-05, "loss": 1.7907, "step": 5889 }, { "epoch": 2.6177777777777775, "grad_norm": 3.4420056343078613, "learning_rate": 9.537366548042705e-05, "loss": 1.7993, "step": 5890 }, { "epoch": 2.6182222222222222, "grad_norm": 3.626722574234009, "learning_rate": 9.535587188612101e-05, "loss": 2.1257, "step": 5891 }, { "epoch": 2.618666666666667, "grad_norm": 2.6178202629089355, "learning_rate": 9.533807829181495e-05, "loss": 1.3251, "step": 5892 }, { "epoch": 2.619111111111111, "grad_norm": 2.7256970405578613, "learning_rate": 9.53202846975089e-05, "loss": 1.5753, "step": 5893 }, { "epoch": 2.6195555555555554, "grad_norm": 3.222346782684326, "learning_rate": 9.530249110320285e-05, "loss": 1.928, "step": 5894 }, { "epoch": 2.62, "grad_norm": 3.2846624851226807, "learning_rate": 9.528469750889681e-05, "loss": 1.8586, "step": 5895 }, { "epoch": 2.6204444444444444, "grad_norm": 3.4086952209472656, "learning_rate": 9.526690391459075e-05, "loss": 1.5629, "step": 5896 }, { "epoch": 2.620888888888889, "grad_norm": 3.3256282806396484, "learning_rate": 9.524911032028471e-05, "loss": 1.6927, "step": 5897 }, { "epoch": 2.6213333333333333, "grad_norm": 3.0743277072906494, "learning_rate": 9.523131672597865e-05, "loss": 1.4407, "step": 5898 }, { "epoch": 2.621777777777778, "grad_norm": 5.507002353668213, "learning_rate": 9.52135231316726e-05, "loss": 1.2145, "step": 5899 }, { "epoch": 2.6222222222222222, "grad_norm": 4.734825134277344, "learning_rate": 9.519572953736655e-05, "loss": 1.7574, "step": 5900 }, { "epoch": 2.6226666666666665, "grad_norm": 1.4950133562088013, "learning_rate": 9.517793594306051e-05, "loss": 1.0372, "step": 5901 }, { "epoch": 2.623111111111111, "grad_norm": 2.024033784866333, "learning_rate": 9.516014234875445e-05, "loss": 1.9618, "step": 5902 }, { "epoch": 2.6235555555555554, "grad_norm": 2.259506940841675, "learning_rate": 9.514234875444841e-05, "loss": 1.7976, "step": 5903 }, { "epoch": 2.624, "grad_norm": 2.1952743530273438, "learning_rate": 9.512455516014235e-05, "loss": 2.5925, "step": 5904 }, { "epoch": 2.6244444444444444, "grad_norm": 2.353161096572876, "learning_rate": 9.51067615658363e-05, "loss": 1.9936, "step": 5905 }, { "epoch": 2.624888888888889, "grad_norm": 1.892688274383545, "learning_rate": 9.508896797153025e-05, "loss": 1.7493, "step": 5906 }, { "epoch": 2.6253333333333333, "grad_norm": 2.5873072147369385, "learning_rate": 9.50711743772242e-05, "loss": 1.8485, "step": 5907 }, { "epoch": 2.6257777777777775, "grad_norm": 1.6135746240615845, "learning_rate": 9.505338078291815e-05, "loss": 0.9499, "step": 5908 }, { "epoch": 2.6262222222222222, "grad_norm": 2.211524486541748, "learning_rate": 9.50355871886121e-05, "loss": 1.6318, "step": 5909 }, { "epoch": 2.626666666666667, "grad_norm": 2.18351411819458, "learning_rate": 9.501779359430606e-05, "loss": 1.6961, "step": 5910 }, { "epoch": 2.627111111111111, "grad_norm": 2.3403427600860596, "learning_rate": 9.5e-05, "loss": 1.2993, "step": 5911 }, { "epoch": 2.6275555555555554, "grad_norm": 2.16619610786438, "learning_rate": 9.498220640569395e-05, "loss": 1.5, "step": 5912 }, { "epoch": 2.628, "grad_norm": 2.4509599208831787, "learning_rate": 9.49644128113879e-05, "loss": 2.1853, "step": 5913 }, { "epoch": 2.6284444444444444, "grad_norm": 2.2944180965423584, "learning_rate": 9.494661921708186e-05, "loss": 1.4259, "step": 5914 }, { "epoch": 2.628888888888889, "grad_norm": 2.5902411937713623, "learning_rate": 9.49288256227758e-05, "loss": 1.5032, "step": 5915 }, { "epoch": 2.6293333333333333, "grad_norm": 2.6031863689422607, "learning_rate": 9.491103202846976e-05, "loss": 1.9135, "step": 5916 }, { "epoch": 2.629777777777778, "grad_norm": 2.3265883922576904, "learning_rate": 9.48932384341637e-05, "loss": 1.2576, "step": 5917 }, { "epoch": 2.6302222222222222, "grad_norm": 2.358459949493408, "learning_rate": 9.487544483985765e-05, "loss": 1.8601, "step": 5918 }, { "epoch": 2.6306666666666665, "grad_norm": 2.6963040828704834, "learning_rate": 9.48576512455516e-05, "loss": 2.0838, "step": 5919 }, { "epoch": 2.631111111111111, "grad_norm": 2.430453300476074, "learning_rate": 9.483985765124556e-05, "loss": 1.6023, "step": 5920 }, { "epoch": 2.6315555555555554, "grad_norm": 2.707322120666504, "learning_rate": 9.48220640569395e-05, "loss": 1.5959, "step": 5921 }, { "epoch": 2.632, "grad_norm": 2.6505823135375977, "learning_rate": 9.480427046263346e-05, "loss": 1.9604, "step": 5922 }, { "epoch": 2.6324444444444444, "grad_norm": 2.96512770652771, "learning_rate": 9.478647686832742e-05, "loss": 1.7178, "step": 5923 }, { "epoch": 2.632888888888889, "grad_norm": 2.6181640625, "learning_rate": 9.476868327402136e-05, "loss": 2.2189, "step": 5924 }, { "epoch": 2.6333333333333333, "grad_norm": 2.878074884414673, "learning_rate": 9.47508896797153e-05, "loss": 1.4863, "step": 5925 }, { "epoch": 2.6337777777777776, "grad_norm": 2.7391178607940674, "learning_rate": 9.473309608540926e-05, "loss": 1.5675, "step": 5926 }, { "epoch": 2.6342222222222222, "grad_norm": 2.5607688426971436, "learning_rate": 9.471530249110321e-05, "loss": 1.6423, "step": 5927 }, { "epoch": 2.634666666666667, "grad_norm": 2.78110408782959, "learning_rate": 9.469750889679716e-05, "loss": 1.7438, "step": 5928 }, { "epoch": 2.635111111111111, "grad_norm": 2.7865052223205566, "learning_rate": 9.467971530249111e-05, "loss": 1.8183, "step": 5929 }, { "epoch": 2.6355555555555554, "grad_norm": 2.619302749633789, "learning_rate": 9.466192170818506e-05, "loss": 1.4989, "step": 5930 }, { "epoch": 2.636, "grad_norm": 2.7835965156555176, "learning_rate": 9.4644128113879e-05, "loss": 1.3163, "step": 5931 }, { "epoch": 2.6364444444444444, "grad_norm": 2.692561149597168, "learning_rate": 9.462633451957296e-05, "loss": 1.4886, "step": 5932 }, { "epoch": 2.6368888888888886, "grad_norm": 2.836426019668579, "learning_rate": 9.460854092526691e-05, "loss": 1.9111, "step": 5933 }, { "epoch": 2.6373333333333333, "grad_norm": 2.8386855125427246, "learning_rate": 9.459074733096086e-05, "loss": 1.8409, "step": 5934 }, { "epoch": 2.637777777777778, "grad_norm": 2.6482274532318115, "learning_rate": 9.457295373665481e-05, "loss": 1.6835, "step": 5935 }, { "epoch": 2.6382222222222222, "grad_norm": 2.3308165073394775, "learning_rate": 9.455516014234877e-05, "loss": 1.2158, "step": 5936 }, { "epoch": 2.6386666666666665, "grad_norm": 2.602177619934082, "learning_rate": 9.453736654804271e-05, "loss": 1.8307, "step": 5937 }, { "epoch": 2.639111111111111, "grad_norm": 3.2326931953430176, "learning_rate": 9.451957295373665e-05, "loss": 2.0871, "step": 5938 }, { "epoch": 2.6395555555555554, "grad_norm": 2.9063456058502197, "learning_rate": 9.450177935943061e-05, "loss": 1.7372, "step": 5939 }, { "epoch": 2.64, "grad_norm": 2.6239306926727295, "learning_rate": 9.448398576512457e-05, "loss": 1.5927, "step": 5940 }, { "epoch": 2.6404444444444444, "grad_norm": 3.01645827293396, "learning_rate": 9.446619217081851e-05, "loss": 1.5774, "step": 5941 }, { "epoch": 2.640888888888889, "grad_norm": 3.1588776111602783, "learning_rate": 9.444839857651247e-05, "loss": 1.6249, "step": 5942 }, { "epoch": 2.6413333333333333, "grad_norm": 3.4573142528533936, "learning_rate": 9.443060498220641e-05, "loss": 1.8496, "step": 5943 }, { "epoch": 2.6417777777777776, "grad_norm": 2.9585986137390137, "learning_rate": 9.441281138790035e-05, "loss": 2.2489, "step": 5944 }, { "epoch": 2.6422222222222222, "grad_norm": 3.1667559146881104, "learning_rate": 9.439501779359431e-05, "loss": 1.7358, "step": 5945 }, { "epoch": 2.642666666666667, "grad_norm": 3.757124900817871, "learning_rate": 9.437722419928827e-05, "loss": 2.1035, "step": 5946 }, { "epoch": 2.643111111111111, "grad_norm": 3.6398861408233643, "learning_rate": 9.435943060498221e-05, "loss": 2.1867, "step": 5947 }, { "epoch": 2.6435555555555554, "grad_norm": 3.7026424407958984, "learning_rate": 9.434163701067617e-05, "loss": 1.5336, "step": 5948 }, { "epoch": 2.644, "grad_norm": 3.198451519012451, "learning_rate": 9.432384341637012e-05, "loss": 0.8215, "step": 5949 }, { "epoch": 2.6444444444444444, "grad_norm": 3.469071865081787, "learning_rate": 9.430604982206405e-05, "loss": 0.8383, "step": 5950 }, { "epoch": 2.6448888888888886, "grad_norm": 1.5619441270828247, "learning_rate": 9.428825622775801e-05, "loss": 2.0473, "step": 5951 }, { "epoch": 2.6453333333333333, "grad_norm": 1.3054723739624023, "learning_rate": 9.427046263345196e-05, "loss": 1.0846, "step": 5952 }, { "epoch": 2.645777777777778, "grad_norm": 1.9514864683151245, "learning_rate": 9.425266903914591e-05, "loss": 1.7394, "step": 5953 }, { "epoch": 2.6462222222222223, "grad_norm": 2.2087137699127197, "learning_rate": 9.423487544483986e-05, "loss": 1.6852, "step": 5954 }, { "epoch": 2.6466666666666665, "grad_norm": 2.1699979305267334, "learning_rate": 9.421708185053382e-05, "loss": 1.6228, "step": 5955 }, { "epoch": 2.647111111111111, "grad_norm": 2.410844564437866, "learning_rate": 9.419928825622776e-05, "loss": 2.018, "step": 5956 }, { "epoch": 2.6475555555555554, "grad_norm": 2.351001024246216, "learning_rate": 9.41814946619217e-05, "loss": 1.5701, "step": 5957 }, { "epoch": 2.648, "grad_norm": 2.365635633468628, "learning_rate": 9.416370106761566e-05, "loss": 1.8983, "step": 5958 }, { "epoch": 2.6484444444444444, "grad_norm": 1.956677794456482, "learning_rate": 9.414590747330962e-05, "loss": 1.0431, "step": 5959 }, { "epoch": 2.648888888888889, "grad_norm": 2.353614330291748, "learning_rate": 9.412811387900356e-05, "loss": 1.9778, "step": 5960 }, { "epoch": 2.6493333333333333, "grad_norm": 2.23882794380188, "learning_rate": 9.411032028469752e-05, "loss": 1.4755, "step": 5961 }, { "epoch": 2.6497777777777776, "grad_norm": 2.3019869327545166, "learning_rate": 9.409252669039146e-05, "loss": 1.1742, "step": 5962 }, { "epoch": 2.6502222222222223, "grad_norm": 2.3776917457580566, "learning_rate": 9.40747330960854e-05, "loss": 1.6219, "step": 5963 }, { "epoch": 2.6506666666666665, "grad_norm": 2.3942043781280518, "learning_rate": 9.405693950177936e-05, "loss": 1.7453, "step": 5964 }, { "epoch": 2.651111111111111, "grad_norm": 2.4723472595214844, "learning_rate": 9.403914590747332e-05, "loss": 2.0211, "step": 5965 }, { "epoch": 2.6515555555555554, "grad_norm": 2.4571118354797363, "learning_rate": 9.402135231316726e-05, "loss": 1.6631, "step": 5966 }, { "epoch": 2.652, "grad_norm": 2.0202927589416504, "learning_rate": 9.400355871886122e-05, "loss": 1.291, "step": 5967 }, { "epoch": 2.6524444444444444, "grad_norm": 2.624063014984131, "learning_rate": 9.398576512455517e-05, "loss": 1.9158, "step": 5968 }, { "epoch": 2.6528888888888886, "grad_norm": 2.2154154777526855, "learning_rate": 9.396797153024912e-05, "loss": 1.46, "step": 5969 }, { "epoch": 2.6533333333333333, "grad_norm": 2.480447769165039, "learning_rate": 9.395017793594306e-05, "loss": 2.152, "step": 5970 }, { "epoch": 2.653777777777778, "grad_norm": 2.2933542728424072, "learning_rate": 9.393238434163702e-05, "loss": 1.5246, "step": 5971 }, { "epoch": 2.6542222222222223, "grad_norm": 2.259690523147583, "learning_rate": 9.391459074733097e-05, "loss": 1.8234, "step": 5972 }, { "epoch": 2.6546666666666665, "grad_norm": 2.219409942626953, "learning_rate": 9.389679715302492e-05, "loss": 1.8107, "step": 5973 }, { "epoch": 2.655111111111111, "grad_norm": 2.7897937297821045, "learning_rate": 9.387900355871887e-05, "loss": 1.7732, "step": 5974 }, { "epoch": 2.6555555555555554, "grad_norm": 2.3130133152008057, "learning_rate": 9.386120996441281e-05, "loss": 1.5716, "step": 5975 }, { "epoch": 2.656, "grad_norm": 2.908433437347412, "learning_rate": 9.384341637010676e-05, "loss": 1.9881, "step": 5976 }, { "epoch": 2.6564444444444444, "grad_norm": 2.9439632892608643, "learning_rate": 9.382562277580071e-05, "loss": 1.7524, "step": 5977 }, { "epoch": 2.656888888888889, "grad_norm": 2.63321852684021, "learning_rate": 9.380782918149467e-05, "loss": 1.9221, "step": 5978 }, { "epoch": 2.6573333333333333, "grad_norm": 2.665898323059082, "learning_rate": 9.379003558718861e-05, "loss": 1.9193, "step": 5979 }, { "epoch": 2.6577777777777776, "grad_norm": 2.8060519695281982, "learning_rate": 9.377224199288257e-05, "loss": 1.7567, "step": 5980 }, { "epoch": 2.6582222222222223, "grad_norm": 2.898313045501709, "learning_rate": 9.375444839857653e-05, "loss": 1.9132, "step": 5981 }, { "epoch": 2.6586666666666665, "grad_norm": 3.1064798831939697, "learning_rate": 9.373665480427047e-05, "loss": 1.5257, "step": 5982 }, { "epoch": 2.659111111111111, "grad_norm": 3.1904420852661133, "learning_rate": 9.371886120996441e-05, "loss": 2.2091, "step": 5983 }, { "epoch": 2.6595555555555555, "grad_norm": 2.535499095916748, "learning_rate": 9.370106761565837e-05, "loss": 1.5115, "step": 5984 }, { "epoch": 2.66, "grad_norm": 2.571568727493286, "learning_rate": 9.368327402135233e-05, "loss": 1.7336, "step": 5985 }, { "epoch": 2.6604444444444444, "grad_norm": 2.7381021976470947, "learning_rate": 9.366548042704627e-05, "loss": 1.7122, "step": 5986 }, { "epoch": 2.6608888888888886, "grad_norm": 2.603264331817627, "learning_rate": 9.364768683274022e-05, "loss": 1.6372, "step": 5987 }, { "epoch": 2.6613333333333333, "grad_norm": 2.825221300125122, "learning_rate": 9.362989323843417e-05, "loss": 1.8266, "step": 5988 }, { "epoch": 2.661777777777778, "grad_norm": 3.076425313949585, "learning_rate": 9.361209964412811e-05, "loss": 1.8103, "step": 5989 }, { "epoch": 2.6622222222222223, "grad_norm": 3.15580677986145, "learning_rate": 9.359430604982207e-05, "loss": 1.6719, "step": 5990 }, { "epoch": 2.6626666666666665, "grad_norm": 3.181922674179077, "learning_rate": 9.357651245551602e-05, "loss": 1.8815, "step": 5991 }, { "epoch": 2.663111111111111, "grad_norm": 2.974426746368408, "learning_rate": 9.355871886120997e-05, "loss": 1.6728, "step": 5992 }, { "epoch": 2.6635555555555555, "grad_norm": 2.7631449699401855, "learning_rate": 9.354092526690392e-05, "loss": 1.5703, "step": 5993 }, { "epoch": 2.664, "grad_norm": 3.3828341960906982, "learning_rate": 9.352313167259788e-05, "loss": 1.6526, "step": 5994 }, { "epoch": 2.6644444444444444, "grad_norm": 3.690214157104492, "learning_rate": 9.350533807829181e-05, "loss": 2.2105, "step": 5995 }, { "epoch": 2.664888888888889, "grad_norm": 4.263453960418701, "learning_rate": 9.348754448398577e-05, "loss": 1.8527, "step": 5996 }, { "epoch": 2.6653333333333333, "grad_norm": 2.7068159580230713, "learning_rate": 9.346975088967972e-05, "loss": 0.7449, "step": 5997 }, { "epoch": 2.6657777777777776, "grad_norm": 2.991706609725952, "learning_rate": 9.345195729537366e-05, "loss": 1.0355, "step": 5998 }, { "epoch": 2.6662222222222223, "grad_norm": 4.530425071716309, "learning_rate": 9.343416370106762e-05, "loss": 1.912, "step": 5999 }, { "epoch": 2.6666666666666665, "grad_norm": 4.280130386352539, "learning_rate": 9.341637010676158e-05, "loss": 1.333, "step": 6000 }, { "epoch": 2.667111111111111, "grad_norm": 1.9667905569076538, "learning_rate": 9.339857651245552e-05, "loss": 2.5096, "step": 6001 }, { "epoch": 2.6675555555555555, "grad_norm": 1.9078987836837769, "learning_rate": 9.338078291814946e-05, "loss": 2.1031, "step": 6002 }, { "epoch": 2.668, "grad_norm": 2.1769826412200928, "learning_rate": 9.336298932384342e-05, "loss": 2.0237, "step": 6003 }, { "epoch": 2.6684444444444444, "grad_norm": 0.29694122076034546, "learning_rate": 9.334519572953738e-05, "loss": 0.0224, "step": 6004 }, { "epoch": 2.6688888888888886, "grad_norm": 1.7639079093933105, "learning_rate": 9.332740213523132e-05, "loss": 0.9714, "step": 6005 }, { "epoch": 2.6693333333333333, "grad_norm": 2.3218326568603516, "learning_rate": 9.330960854092528e-05, "loss": 2.0642, "step": 6006 }, { "epoch": 2.669777777777778, "grad_norm": 2.195533514022827, "learning_rate": 9.329181494661923e-05, "loss": 1.1775, "step": 6007 }, { "epoch": 2.6702222222222223, "grad_norm": 1.75465989112854, "learning_rate": 9.327402135231316e-05, "loss": 0.9689, "step": 6008 }, { "epoch": 2.6706666666666665, "grad_norm": 2.281090259552002, "learning_rate": 9.325622775800712e-05, "loss": 1.977, "step": 6009 }, { "epoch": 2.671111111111111, "grad_norm": 1.779270052909851, "learning_rate": 9.323843416370108e-05, "loss": 0.882, "step": 6010 }, { "epoch": 2.6715555555555555, "grad_norm": 2.8689680099487305, "learning_rate": 9.322064056939502e-05, "loss": 2.0544, "step": 6011 }, { "epoch": 2.672, "grad_norm": 2.2885069847106934, "learning_rate": 9.320284697508897e-05, "loss": 1.6721, "step": 6012 }, { "epoch": 2.6724444444444444, "grad_norm": 2.3824915885925293, "learning_rate": 9.318505338078293e-05, "loss": 1.7404, "step": 6013 }, { "epoch": 2.672888888888889, "grad_norm": 2.4072515964508057, "learning_rate": 9.316725978647687e-05, "loss": 1.7259, "step": 6014 }, { "epoch": 2.6733333333333333, "grad_norm": 2.6987950801849365, "learning_rate": 9.314946619217082e-05, "loss": 1.9788, "step": 6015 }, { "epoch": 2.6737777777777776, "grad_norm": 2.521949052810669, "learning_rate": 9.313167259786477e-05, "loss": 1.4704, "step": 6016 }, { "epoch": 2.6742222222222223, "grad_norm": 3.2929980754852295, "learning_rate": 9.311387900355873e-05, "loss": 2.1755, "step": 6017 }, { "epoch": 2.6746666666666665, "grad_norm": 1.4418671131134033, "learning_rate": 9.309608540925267e-05, "loss": 0.817, "step": 6018 }, { "epoch": 2.675111111111111, "grad_norm": 1.5588750839233398, "learning_rate": 9.307829181494663e-05, "loss": 1.0768, "step": 6019 }, { "epoch": 2.6755555555555555, "grad_norm": 2.279388189315796, "learning_rate": 9.306049822064057e-05, "loss": 1.2211, "step": 6020 }, { "epoch": 2.676, "grad_norm": 2.7521657943725586, "learning_rate": 9.304270462633452e-05, "loss": 1.7047, "step": 6021 }, { "epoch": 2.6764444444444444, "grad_norm": 2.4141414165496826, "learning_rate": 9.302491103202847e-05, "loss": 1.6488, "step": 6022 }, { "epoch": 2.6768888888888887, "grad_norm": 2.966360330581665, "learning_rate": 9.300711743772243e-05, "loss": 1.8208, "step": 6023 }, { "epoch": 2.6773333333333333, "grad_norm": 2.5755410194396973, "learning_rate": 9.298932384341637e-05, "loss": 2.2004, "step": 6024 }, { "epoch": 2.677777777777778, "grad_norm": 2.3477094173431396, "learning_rate": 9.297153024911033e-05, "loss": 1.9106, "step": 6025 }, { "epoch": 2.6782222222222223, "grad_norm": 3.050549030303955, "learning_rate": 9.295373665480428e-05, "loss": 2.0783, "step": 6026 }, { "epoch": 2.6786666666666665, "grad_norm": 2.4758310317993164, "learning_rate": 9.293594306049823e-05, "loss": 1.7121, "step": 6027 }, { "epoch": 2.679111111111111, "grad_norm": 2.427130699157715, "learning_rate": 9.291814946619217e-05, "loss": 1.9215, "step": 6028 }, { "epoch": 2.6795555555555555, "grad_norm": 2.3932747840881348, "learning_rate": 9.290035587188613e-05, "loss": 1.7158, "step": 6029 }, { "epoch": 2.68, "grad_norm": 2.1773719787597656, "learning_rate": 9.288256227758008e-05, "loss": 1.6804, "step": 6030 }, { "epoch": 2.6804444444444444, "grad_norm": 2.6314051151275635, "learning_rate": 9.286476868327403e-05, "loss": 1.5102, "step": 6031 }, { "epoch": 2.680888888888889, "grad_norm": 2.660897970199585, "learning_rate": 9.284697508896798e-05, "loss": 1.9421, "step": 6032 }, { "epoch": 2.6813333333333333, "grad_norm": 3.031437873840332, "learning_rate": 9.282918149466193e-05, "loss": 1.9283, "step": 6033 }, { "epoch": 2.6817777777777776, "grad_norm": 2.821213722229004, "learning_rate": 9.281138790035587e-05, "loss": 1.7759, "step": 6034 }, { "epoch": 2.6822222222222223, "grad_norm": 2.7068347930908203, "learning_rate": 9.279359430604982e-05, "loss": 1.7589, "step": 6035 }, { "epoch": 2.6826666666666665, "grad_norm": 2.7685983180999756, "learning_rate": 9.277580071174378e-05, "loss": 2.0632, "step": 6036 }, { "epoch": 2.6831111111111112, "grad_norm": 2.5510952472686768, "learning_rate": 9.275800711743772e-05, "loss": 1.6365, "step": 6037 }, { "epoch": 2.6835555555555555, "grad_norm": 2.7311320304870605, "learning_rate": 9.274021352313168e-05, "loss": 1.5591, "step": 6038 }, { "epoch": 2.684, "grad_norm": 3.251199722290039, "learning_rate": 9.272241992882564e-05, "loss": 2.0724, "step": 6039 }, { "epoch": 2.6844444444444444, "grad_norm": 2.5799219608306885, "learning_rate": 9.270462633451957e-05, "loss": 1.643, "step": 6040 }, { "epoch": 2.6848888888888887, "grad_norm": 3.4391684532165527, "learning_rate": 9.268683274021352e-05, "loss": 1.8439, "step": 6041 }, { "epoch": 2.6853333333333333, "grad_norm": 3.380760908126831, "learning_rate": 9.266903914590748e-05, "loss": 2.3897, "step": 6042 }, { "epoch": 2.685777777777778, "grad_norm": 2.8219289779663086, "learning_rate": 9.265124555160142e-05, "loss": 1.5193, "step": 6043 }, { "epoch": 2.6862222222222223, "grad_norm": 2.5288350582122803, "learning_rate": 9.263345195729538e-05, "loss": 1.3944, "step": 6044 }, { "epoch": 2.6866666666666665, "grad_norm": 3.494917154312134, "learning_rate": 9.261565836298934e-05, "loss": 1.5498, "step": 6045 }, { "epoch": 2.6871111111111112, "grad_norm": 3.129040241241455, "learning_rate": 9.259786476868328e-05, "loss": 1.6695, "step": 6046 }, { "epoch": 2.6875555555555555, "grad_norm": 3.235013484954834, "learning_rate": 9.258007117437722e-05, "loss": 1.9297, "step": 6047 }, { "epoch": 2.6879999999999997, "grad_norm": 3.288318634033203, "learning_rate": 9.256227758007118e-05, "loss": 1.7766, "step": 6048 }, { "epoch": 2.6884444444444444, "grad_norm": 3.6393327713012695, "learning_rate": 9.254448398576513e-05, "loss": 2.0743, "step": 6049 }, { "epoch": 2.688888888888889, "grad_norm": 3.2492709159851074, "learning_rate": 9.252669039145908e-05, "loss": 0.7984, "step": 6050 }, { "epoch": 2.6893333333333334, "grad_norm": 1.417654037475586, "learning_rate": 9.250889679715303e-05, "loss": 1.1905, "step": 6051 }, { "epoch": 2.6897777777777776, "grad_norm": 2.2296252250671387, "learning_rate": 9.249110320284699e-05, "loss": 1.9266, "step": 6052 }, { "epoch": 2.6902222222222223, "grad_norm": 2.10345721244812, "learning_rate": 9.247330960854092e-05, "loss": 2.199, "step": 6053 }, { "epoch": 2.6906666666666665, "grad_norm": 2.360987901687622, "learning_rate": 9.245551601423488e-05, "loss": 2.2995, "step": 6054 }, { "epoch": 2.6911111111111112, "grad_norm": 2.832897186279297, "learning_rate": 9.243772241992883e-05, "loss": 1.8118, "step": 6055 }, { "epoch": 2.6915555555555555, "grad_norm": 2.3768863677978516, "learning_rate": 9.241992882562278e-05, "loss": 2.1399, "step": 6056 }, { "epoch": 2.692, "grad_norm": 2.3208935260772705, "learning_rate": 9.240213523131673e-05, "loss": 2.2196, "step": 6057 }, { "epoch": 2.6924444444444444, "grad_norm": 2.1701810359954834, "learning_rate": 9.238434163701069e-05, "loss": 2.0644, "step": 6058 }, { "epoch": 2.6928888888888887, "grad_norm": 2.7813336849212646, "learning_rate": 9.236654804270463e-05, "loss": 2.1576, "step": 6059 }, { "epoch": 2.6933333333333334, "grad_norm": 2.3689005374908447, "learning_rate": 9.234875444839857e-05, "loss": 2.148, "step": 6060 }, { "epoch": 2.693777777777778, "grad_norm": 2.3087990283966064, "learning_rate": 9.233096085409253e-05, "loss": 2.139, "step": 6061 }, { "epoch": 2.6942222222222223, "grad_norm": 2.087191581726074, "learning_rate": 9.231316725978649e-05, "loss": 1.7637, "step": 6062 }, { "epoch": 2.6946666666666665, "grad_norm": 2.2079708576202393, "learning_rate": 9.229537366548043e-05, "loss": 2.0657, "step": 6063 }, { "epoch": 2.6951111111111112, "grad_norm": 2.13736629486084, "learning_rate": 9.227758007117439e-05, "loss": 2.0342, "step": 6064 }, { "epoch": 2.6955555555555555, "grad_norm": 2.382661819458008, "learning_rate": 9.225978647686834e-05, "loss": 1.5989, "step": 6065 }, { "epoch": 2.6959999999999997, "grad_norm": 2.3359639644622803, "learning_rate": 9.224199288256227e-05, "loss": 2.0982, "step": 6066 }, { "epoch": 2.6964444444444444, "grad_norm": 2.5619516372680664, "learning_rate": 9.222419928825623e-05, "loss": 1.9643, "step": 6067 }, { "epoch": 2.696888888888889, "grad_norm": 2.3712518215179443, "learning_rate": 9.220640569395019e-05, "loss": 1.9981, "step": 6068 }, { "epoch": 2.6973333333333334, "grad_norm": 2.2327232360839844, "learning_rate": 9.218861209964413e-05, "loss": 1.081, "step": 6069 }, { "epoch": 2.6977777777777776, "grad_norm": 2.4085304737091064, "learning_rate": 9.217081850533809e-05, "loss": 1.8645, "step": 6070 }, { "epoch": 2.6982222222222223, "grad_norm": 2.5223472118377686, "learning_rate": 9.215302491103204e-05, "loss": 1.9147, "step": 6071 }, { "epoch": 2.6986666666666665, "grad_norm": 2.6552350521087646, "learning_rate": 9.213523131672598e-05, "loss": 2.2025, "step": 6072 }, { "epoch": 2.6991111111111112, "grad_norm": 2.412116289138794, "learning_rate": 9.211743772241993e-05, "loss": 1.7127, "step": 6073 }, { "epoch": 2.6995555555555555, "grad_norm": 2.261444091796875, "learning_rate": 9.209964412811388e-05, "loss": 1.4406, "step": 6074 }, { "epoch": 2.7, "grad_norm": 2.534367561340332, "learning_rate": 9.208185053380784e-05, "loss": 1.7541, "step": 6075 }, { "epoch": 2.7004444444444444, "grad_norm": 2.575896978378296, "learning_rate": 9.206405693950178e-05, "loss": 1.9618, "step": 6076 }, { "epoch": 2.7008888888888887, "grad_norm": 2.7344939708709717, "learning_rate": 9.204626334519574e-05, "loss": 1.9426, "step": 6077 }, { "epoch": 2.7013333333333334, "grad_norm": 2.7714240550994873, "learning_rate": 9.202846975088968e-05, "loss": 1.8616, "step": 6078 }, { "epoch": 2.7017777777777776, "grad_norm": 2.9522223472595215, "learning_rate": 9.201067615658363e-05, "loss": 2.3087, "step": 6079 }, { "epoch": 2.7022222222222223, "grad_norm": 2.702958345413208, "learning_rate": 9.199288256227758e-05, "loss": 2.236, "step": 6080 }, { "epoch": 2.7026666666666666, "grad_norm": 1.8355095386505127, "learning_rate": 9.197508896797154e-05, "loss": 0.6777, "step": 6081 }, { "epoch": 2.7031111111111112, "grad_norm": 2.5094728469848633, "learning_rate": 9.195729537366548e-05, "loss": 1.8198, "step": 6082 }, { "epoch": 2.7035555555555555, "grad_norm": 2.6528875827789307, "learning_rate": 9.193950177935944e-05, "loss": 1.7715, "step": 6083 }, { "epoch": 2.7039999999999997, "grad_norm": 2.9194884300231934, "learning_rate": 9.19217081850534e-05, "loss": 1.7147, "step": 6084 }, { "epoch": 2.7044444444444444, "grad_norm": 2.9441230297088623, "learning_rate": 9.190391459074732e-05, "loss": 1.854, "step": 6085 }, { "epoch": 2.704888888888889, "grad_norm": 2.6675310134887695, "learning_rate": 9.188612099644128e-05, "loss": 2.0047, "step": 6086 }, { "epoch": 2.7053333333333334, "grad_norm": 2.501265048980713, "learning_rate": 9.186832740213524e-05, "loss": 1.5176, "step": 6087 }, { "epoch": 2.7057777777777776, "grad_norm": 2.112760066986084, "learning_rate": 9.185053380782918e-05, "loss": 1.0436, "step": 6088 }, { "epoch": 2.7062222222222223, "grad_norm": 2.7593295574188232, "learning_rate": 9.183274021352314e-05, "loss": 1.6187, "step": 6089 }, { "epoch": 2.7066666666666666, "grad_norm": 2.7610411643981934, "learning_rate": 9.18149466192171e-05, "loss": 1.7194, "step": 6090 }, { "epoch": 2.7071111111111112, "grad_norm": 2.309563398361206, "learning_rate": 9.179715302491104e-05, "loss": 1.2155, "step": 6091 }, { "epoch": 2.7075555555555555, "grad_norm": 3.307159900665283, "learning_rate": 9.177935943060498e-05, "loss": 1.9251, "step": 6092 }, { "epoch": 2.708, "grad_norm": 2.4298038482666016, "learning_rate": 9.176156583629894e-05, "loss": 1.2146, "step": 6093 }, { "epoch": 2.7084444444444444, "grad_norm": 2.78938627243042, "learning_rate": 9.174377224199289e-05, "loss": 1.7818, "step": 6094 }, { "epoch": 2.7088888888888887, "grad_norm": 2.8825080394744873, "learning_rate": 9.172597864768684e-05, "loss": 1.6426, "step": 6095 }, { "epoch": 2.7093333333333334, "grad_norm": 3.2043745517730713, "learning_rate": 9.170818505338079e-05, "loss": 1.5032, "step": 6096 }, { "epoch": 2.7097777777777776, "grad_norm": 3.1680638790130615, "learning_rate": 9.169039145907475e-05, "loss": 1.3996, "step": 6097 }, { "epoch": 2.7102222222222223, "grad_norm": 3.6439359188079834, "learning_rate": 9.167259786476868e-05, "loss": 2.1672, "step": 6098 }, { "epoch": 2.7106666666666666, "grad_norm": 4.421741485595703, "learning_rate": 9.165480427046263e-05, "loss": 2.4516, "step": 6099 }, { "epoch": 2.7111111111111112, "grad_norm": 3.2291154861450195, "learning_rate": 9.163701067615659e-05, "loss": 1.6698, "step": 6100 }, { "epoch": 2.7115555555555555, "grad_norm": 1.8090177774429321, "learning_rate": 9.161921708185053e-05, "loss": 2.1445, "step": 6101 }, { "epoch": 2.7119999999999997, "grad_norm": 1.2429306507110596, "learning_rate": 9.160142348754449e-05, "loss": 1.2912, "step": 6102 }, { "epoch": 2.7124444444444444, "grad_norm": 2.093599319458008, "learning_rate": 9.158362989323845e-05, "loss": 2.4831, "step": 6103 }, { "epoch": 2.712888888888889, "grad_norm": 1.8647911548614502, "learning_rate": 9.156583629893239e-05, "loss": 0.9684, "step": 6104 }, { "epoch": 2.7133333333333334, "grad_norm": 2.1396780014038086, "learning_rate": 9.154804270462633e-05, "loss": 2.1317, "step": 6105 }, { "epoch": 2.7137777777777776, "grad_norm": 2.1516218185424805, "learning_rate": 9.153024911032029e-05, "loss": 1.7869, "step": 6106 }, { "epoch": 2.7142222222222223, "grad_norm": 2.357754945755005, "learning_rate": 9.151245551601425e-05, "loss": 2.3442, "step": 6107 }, { "epoch": 2.7146666666666666, "grad_norm": 2.1616320610046387, "learning_rate": 9.149466192170819e-05, "loss": 2.2035, "step": 6108 }, { "epoch": 2.7151111111111113, "grad_norm": 2.2545979022979736, "learning_rate": 9.147686832740214e-05, "loss": 2.3456, "step": 6109 }, { "epoch": 2.7155555555555555, "grad_norm": 1.8658015727996826, "learning_rate": 9.14590747330961e-05, "loss": 1.5012, "step": 6110 }, { "epoch": 2.716, "grad_norm": 2.5703420639038086, "learning_rate": 9.144128113879003e-05, "loss": 1.8163, "step": 6111 }, { "epoch": 2.7164444444444444, "grad_norm": 2.278449296951294, "learning_rate": 9.142348754448399e-05, "loss": 1.7411, "step": 6112 }, { "epoch": 2.7168888888888887, "grad_norm": 2.5653324127197266, "learning_rate": 9.140569395017794e-05, "loss": 1.8087, "step": 6113 }, { "epoch": 2.7173333333333334, "grad_norm": 2.3541901111602783, "learning_rate": 9.138790035587189e-05, "loss": 2.0437, "step": 6114 }, { "epoch": 2.7177777777777776, "grad_norm": 2.4183175563812256, "learning_rate": 9.137010676156584e-05, "loss": 1.6396, "step": 6115 }, { "epoch": 2.7182222222222223, "grad_norm": 2.6298437118530273, "learning_rate": 9.13523131672598e-05, "loss": 2.3171, "step": 6116 }, { "epoch": 2.7186666666666666, "grad_norm": 2.7562639713287354, "learning_rate": 9.133451957295374e-05, "loss": 2.0566, "step": 6117 }, { "epoch": 2.7191111111111113, "grad_norm": 2.4366018772125244, "learning_rate": 9.131672597864769e-05, "loss": 1.9631, "step": 6118 }, { "epoch": 2.7195555555555555, "grad_norm": 2.1214277744293213, "learning_rate": 9.129893238434164e-05, "loss": 1.3662, "step": 6119 }, { "epoch": 2.7199999999999998, "grad_norm": 2.162672996520996, "learning_rate": 9.12811387900356e-05, "loss": 1.3789, "step": 6120 }, { "epoch": 2.7204444444444444, "grad_norm": 2.6844053268432617, "learning_rate": 9.126334519572954e-05, "loss": 1.8149, "step": 6121 }, { "epoch": 2.720888888888889, "grad_norm": 2.491919994354248, "learning_rate": 9.12455516014235e-05, "loss": 1.8213, "step": 6122 }, { "epoch": 2.7213333333333334, "grad_norm": 2.4091458320617676, "learning_rate": 9.122775800711744e-05, "loss": 1.8815, "step": 6123 }, { "epoch": 2.7217777777777776, "grad_norm": 2.2828023433685303, "learning_rate": 9.120996441281138e-05, "loss": 1.4552, "step": 6124 }, { "epoch": 2.7222222222222223, "grad_norm": 2.2405014038085938, "learning_rate": 9.119217081850534e-05, "loss": 1.3707, "step": 6125 }, { "epoch": 2.7226666666666666, "grad_norm": 2.6642508506774902, "learning_rate": 9.11743772241993e-05, "loss": 1.7702, "step": 6126 }, { "epoch": 2.7231111111111113, "grad_norm": 2.5781748294830322, "learning_rate": 9.115658362989324e-05, "loss": 1.7319, "step": 6127 }, { "epoch": 2.7235555555555555, "grad_norm": 2.8042945861816406, "learning_rate": 9.11387900355872e-05, "loss": 1.8082, "step": 6128 }, { "epoch": 2.724, "grad_norm": 1.7119746208190918, "learning_rate": 9.112099644128115e-05, "loss": 0.7126, "step": 6129 }, { "epoch": 2.7244444444444444, "grad_norm": 2.4615836143493652, "learning_rate": 9.110320284697508e-05, "loss": 1.6634, "step": 6130 }, { "epoch": 2.7248888888888887, "grad_norm": 2.9589643478393555, "learning_rate": 9.108540925266904e-05, "loss": 1.829, "step": 6131 }, { "epoch": 2.7253333333333334, "grad_norm": 2.783677577972412, "learning_rate": 9.1067615658363e-05, "loss": 1.98, "step": 6132 }, { "epoch": 2.7257777777777776, "grad_norm": 3.0595808029174805, "learning_rate": 9.104982206405694e-05, "loss": 1.7831, "step": 6133 }, { "epoch": 2.7262222222222223, "grad_norm": 2.033073902130127, "learning_rate": 9.10320284697509e-05, "loss": 0.923, "step": 6134 }, { "epoch": 2.7266666666666666, "grad_norm": 3.1585965156555176, "learning_rate": 9.101423487544485e-05, "loss": 1.735, "step": 6135 }, { "epoch": 2.7271111111111113, "grad_norm": 2.5209286212921143, "learning_rate": 9.09964412811388e-05, "loss": 1.5229, "step": 6136 }, { "epoch": 2.7275555555555555, "grad_norm": 2.977914810180664, "learning_rate": 9.097864768683274e-05, "loss": 2.0039, "step": 6137 }, { "epoch": 2.7279999999999998, "grad_norm": 2.497366189956665, "learning_rate": 9.09608540925267e-05, "loss": 1.4511, "step": 6138 }, { "epoch": 2.7284444444444444, "grad_norm": 2.936041831970215, "learning_rate": 9.094306049822065e-05, "loss": 1.9052, "step": 6139 }, { "epoch": 2.728888888888889, "grad_norm": 3.006460428237915, "learning_rate": 9.092526690391459e-05, "loss": 1.758, "step": 6140 }, { "epoch": 2.7293333333333334, "grad_norm": 3.10183048248291, "learning_rate": 9.090747330960855e-05, "loss": 1.6804, "step": 6141 }, { "epoch": 2.7297777777777776, "grad_norm": 3.5119240283966064, "learning_rate": 9.08896797153025e-05, "loss": 1.763, "step": 6142 }, { "epoch": 2.7302222222222223, "grad_norm": 3.4593982696533203, "learning_rate": 9.087188612099644e-05, "loss": 2.3246, "step": 6143 }, { "epoch": 2.7306666666666666, "grad_norm": 2.949693202972412, "learning_rate": 9.085409252669039e-05, "loss": 1.4612, "step": 6144 }, { "epoch": 2.7311111111111113, "grad_norm": 3.316683769226074, "learning_rate": 9.083629893238435e-05, "loss": 2.0814, "step": 6145 }, { "epoch": 2.7315555555555555, "grad_norm": 2.9927797317504883, "learning_rate": 9.081850533807829e-05, "loss": 1.5698, "step": 6146 }, { "epoch": 2.732, "grad_norm": 2.82143497467041, "learning_rate": 9.080071174377225e-05, "loss": 1.6885, "step": 6147 }, { "epoch": 2.7324444444444445, "grad_norm": 3.127837657928467, "learning_rate": 9.07829181494662e-05, "loss": 1.7723, "step": 6148 }, { "epoch": 2.7328888888888887, "grad_norm": 3.9703283309936523, "learning_rate": 9.076512455516015e-05, "loss": 2.0333, "step": 6149 }, { "epoch": 2.7333333333333334, "grad_norm": 2.2306530475616455, "learning_rate": 9.074733096085409e-05, "loss": 0.5828, "step": 6150 }, { "epoch": 2.7337777777777776, "grad_norm": 2.2018139362335205, "learning_rate": 9.072953736654805e-05, "loss": 2.6287, "step": 6151 }, { "epoch": 2.7342222222222223, "grad_norm": 1.7515716552734375, "learning_rate": 9.0711743772242e-05, "loss": 1.8294, "step": 6152 }, { "epoch": 2.7346666666666666, "grad_norm": 1.612664818763733, "learning_rate": 9.069395017793595e-05, "loss": 1.3168, "step": 6153 }, { "epoch": 2.7351111111111113, "grad_norm": 2.2013888359069824, "learning_rate": 9.06761565836299e-05, "loss": 2.3534, "step": 6154 }, { "epoch": 2.7355555555555555, "grad_norm": 2.403697967529297, "learning_rate": 9.065836298932386e-05, "loss": 2.0122, "step": 6155 }, { "epoch": 2.7359999999999998, "grad_norm": 2.392401695251465, "learning_rate": 9.064056939501779e-05, "loss": 2.1935, "step": 6156 }, { "epoch": 2.7364444444444445, "grad_norm": 2.547696828842163, "learning_rate": 9.062277580071175e-05, "loss": 2.1032, "step": 6157 }, { "epoch": 2.736888888888889, "grad_norm": 2.0454320907592773, "learning_rate": 9.06049822064057e-05, "loss": 1.6723, "step": 6158 }, { "epoch": 2.7373333333333334, "grad_norm": 2.0025997161865234, "learning_rate": 9.058718861209964e-05, "loss": 1.5211, "step": 6159 }, { "epoch": 2.7377777777777776, "grad_norm": 2.7177071571350098, "learning_rate": 9.05693950177936e-05, "loss": 2.2681, "step": 6160 }, { "epoch": 2.7382222222222223, "grad_norm": 2.377166986465454, "learning_rate": 9.055160142348756e-05, "loss": 1.6563, "step": 6161 }, { "epoch": 2.7386666666666666, "grad_norm": 2.444869041442871, "learning_rate": 9.05338078291815e-05, "loss": 1.7362, "step": 6162 }, { "epoch": 2.7391111111111113, "grad_norm": 2.2301559448242188, "learning_rate": 9.051601423487544e-05, "loss": 2.0245, "step": 6163 }, { "epoch": 2.7395555555555555, "grad_norm": 2.3340377807617188, "learning_rate": 9.04982206405694e-05, "loss": 1.6158, "step": 6164 }, { "epoch": 2.74, "grad_norm": 2.1984641551971436, "learning_rate": 9.048042704626336e-05, "loss": 1.6976, "step": 6165 }, { "epoch": 2.7404444444444445, "grad_norm": 2.684303045272827, "learning_rate": 9.04626334519573e-05, "loss": 1.9596, "step": 6166 }, { "epoch": 2.7408888888888887, "grad_norm": 2.037320852279663, "learning_rate": 9.044483985765126e-05, "loss": 1.5683, "step": 6167 }, { "epoch": 2.7413333333333334, "grad_norm": 1.9725054502487183, "learning_rate": 9.04270462633452e-05, "loss": 1.5862, "step": 6168 }, { "epoch": 2.7417777777777776, "grad_norm": 2.6273114681243896, "learning_rate": 9.040925266903914e-05, "loss": 2.0218, "step": 6169 }, { "epoch": 2.7422222222222223, "grad_norm": 2.4630777835845947, "learning_rate": 9.03914590747331e-05, "loss": 1.7888, "step": 6170 }, { "epoch": 2.7426666666666666, "grad_norm": 2.1881697177886963, "learning_rate": 9.037366548042705e-05, "loss": 1.811, "step": 6171 }, { "epoch": 2.7431111111111113, "grad_norm": 2.804276943206787, "learning_rate": 9.0355871886121e-05, "loss": 2.1978, "step": 6172 }, { "epoch": 2.7435555555555555, "grad_norm": 2.1326301097869873, "learning_rate": 9.033807829181495e-05, "loss": 1.7793, "step": 6173 }, { "epoch": 2.7439999999999998, "grad_norm": 2.142714023590088, "learning_rate": 9.032028469750891e-05, "loss": 1.824, "step": 6174 }, { "epoch": 2.7444444444444445, "grad_norm": 2.4953362941741943, "learning_rate": 9.030249110320284e-05, "loss": 1.922, "step": 6175 }, { "epoch": 2.744888888888889, "grad_norm": 2.264841318130493, "learning_rate": 9.02846975088968e-05, "loss": 1.7474, "step": 6176 }, { "epoch": 2.7453333333333334, "grad_norm": 2.464749813079834, "learning_rate": 9.026690391459075e-05, "loss": 1.7516, "step": 6177 }, { "epoch": 2.7457777777777777, "grad_norm": 2.3408308029174805, "learning_rate": 9.02491103202847e-05, "loss": 1.6946, "step": 6178 }, { "epoch": 2.7462222222222223, "grad_norm": 2.4677340984344482, "learning_rate": 9.023131672597865e-05, "loss": 1.6011, "step": 6179 }, { "epoch": 2.7466666666666666, "grad_norm": 2.2355053424835205, "learning_rate": 9.021352313167261e-05, "loss": 1.5883, "step": 6180 }, { "epoch": 2.747111111111111, "grad_norm": 0.23143476247787476, "learning_rate": 9.019572953736655e-05, "loss": 0.0327, "step": 6181 }, { "epoch": 2.7475555555555555, "grad_norm": 2.9940497875213623, "learning_rate": 9.01779359430605e-05, "loss": 1.9261, "step": 6182 }, { "epoch": 2.748, "grad_norm": 2.4658970832824707, "learning_rate": 9.016014234875445e-05, "loss": 1.8213, "step": 6183 }, { "epoch": 2.7484444444444445, "grad_norm": 3.212437152862549, "learning_rate": 9.014234875444841e-05, "loss": 1.511, "step": 6184 }, { "epoch": 2.7488888888888887, "grad_norm": 2.720136880874634, "learning_rate": 9.012455516014235e-05, "loss": 1.5774, "step": 6185 }, { "epoch": 2.7493333333333334, "grad_norm": 2.8537580966949463, "learning_rate": 9.010676156583631e-05, "loss": 1.8448, "step": 6186 }, { "epoch": 2.7497777777777777, "grad_norm": 2.702603578567505, "learning_rate": 9.008896797153026e-05, "loss": 1.3248, "step": 6187 }, { "epoch": 2.7502222222222223, "grad_norm": 2.6414220333099365, "learning_rate": 9.00711743772242e-05, "loss": 1.7348, "step": 6188 }, { "epoch": 2.7506666666666666, "grad_norm": 2.9192988872528076, "learning_rate": 9.005338078291815e-05, "loss": 1.4816, "step": 6189 }, { "epoch": 2.7511111111111113, "grad_norm": 2.888636589050293, "learning_rate": 9.00355871886121e-05, "loss": 1.5585, "step": 6190 }, { "epoch": 2.7515555555555555, "grad_norm": 3.021221399307251, "learning_rate": 9.001779359430605e-05, "loss": 1.8431, "step": 6191 }, { "epoch": 2.752, "grad_norm": 2.7811830043792725, "learning_rate": 9e-05, "loss": 1.7267, "step": 6192 }, { "epoch": 2.7524444444444445, "grad_norm": 3.7655017375946045, "learning_rate": 8.998220640569396e-05, "loss": 1.9514, "step": 6193 }, { "epoch": 2.752888888888889, "grad_norm": 2.969604015350342, "learning_rate": 8.99644128113879e-05, "loss": 1.8387, "step": 6194 }, { "epoch": 2.7533333333333334, "grad_norm": 3.2840607166290283, "learning_rate": 8.994661921708185e-05, "loss": 1.4239, "step": 6195 }, { "epoch": 2.7537777777777777, "grad_norm": 2.914557695388794, "learning_rate": 8.99288256227758e-05, "loss": 1.7825, "step": 6196 }, { "epoch": 2.7542222222222223, "grad_norm": 3.4805359840393066, "learning_rate": 8.991103202846976e-05, "loss": 1.8883, "step": 6197 }, { "epoch": 2.7546666666666666, "grad_norm": 3.3305397033691406, "learning_rate": 8.98932384341637e-05, "loss": 1.3508, "step": 6198 }, { "epoch": 2.755111111111111, "grad_norm": 3.2841057777404785, "learning_rate": 8.987544483985766e-05, "loss": 1.4563, "step": 6199 }, { "epoch": 2.7555555555555555, "grad_norm": 2.9176838397979736, "learning_rate": 8.985765124555162e-05, "loss": 1.0079, "step": 6200 }, { "epoch": 2.7560000000000002, "grad_norm": 1.1757615804672241, "learning_rate": 8.983985765124555e-05, "loss": 1.0498, "step": 6201 }, { "epoch": 2.7564444444444445, "grad_norm": 1.7046676874160767, "learning_rate": 8.98220640569395e-05, "loss": 1.9004, "step": 6202 }, { "epoch": 2.7568888888888887, "grad_norm": 1.9083878993988037, "learning_rate": 8.980427046263346e-05, "loss": 2.1245, "step": 6203 }, { "epoch": 2.7573333333333334, "grad_norm": 2.188979148864746, "learning_rate": 8.97864768683274e-05, "loss": 2.0422, "step": 6204 }, { "epoch": 2.7577777777777777, "grad_norm": 2.305520534515381, "learning_rate": 8.976868327402136e-05, "loss": 1.8755, "step": 6205 }, { "epoch": 2.7582222222222224, "grad_norm": 2.1911003589630127, "learning_rate": 8.975088967971532e-05, "loss": 1.9326, "step": 6206 }, { "epoch": 2.7586666666666666, "grad_norm": 1.9062891006469727, "learning_rate": 8.973309608540926e-05, "loss": 1.7775, "step": 6207 }, { "epoch": 2.7591111111111113, "grad_norm": 2.4695067405700684, "learning_rate": 8.97153024911032e-05, "loss": 2.0425, "step": 6208 }, { "epoch": 2.7595555555555555, "grad_norm": 2.0994985103607178, "learning_rate": 8.969750889679716e-05, "loss": 1.6857, "step": 6209 }, { "epoch": 2.76, "grad_norm": 2.3211324214935303, "learning_rate": 8.96797153024911e-05, "loss": 2.2616, "step": 6210 }, { "epoch": 2.7604444444444445, "grad_norm": 2.4278671741485596, "learning_rate": 8.966192170818506e-05, "loss": 1.8835, "step": 6211 }, { "epoch": 2.7608888888888887, "grad_norm": 2.776087999343872, "learning_rate": 8.964412811387901e-05, "loss": 1.4898, "step": 6212 }, { "epoch": 2.7613333333333334, "grad_norm": 2.4669578075408936, "learning_rate": 8.962633451957296e-05, "loss": 1.993, "step": 6213 }, { "epoch": 2.7617777777777777, "grad_norm": 2.3764331340789795, "learning_rate": 8.96085409252669e-05, "loss": 1.6464, "step": 6214 }, { "epoch": 2.7622222222222224, "grad_norm": 2.225770950317383, "learning_rate": 8.959074733096086e-05, "loss": 1.9112, "step": 6215 }, { "epoch": 2.7626666666666666, "grad_norm": 2.581855535507202, "learning_rate": 8.957295373665481e-05, "loss": 1.7821, "step": 6216 }, { "epoch": 2.763111111111111, "grad_norm": 2.1973445415496826, "learning_rate": 8.955516014234876e-05, "loss": 1.4587, "step": 6217 }, { "epoch": 2.7635555555555555, "grad_norm": 2.4556639194488525, "learning_rate": 8.953736654804271e-05, "loss": 2.1099, "step": 6218 }, { "epoch": 2.7640000000000002, "grad_norm": 1.6619460582733154, "learning_rate": 8.951957295373667e-05, "loss": 0.7801, "step": 6219 }, { "epoch": 2.7644444444444445, "grad_norm": 2.5685040950775146, "learning_rate": 8.95017793594306e-05, "loss": 1.572, "step": 6220 }, { "epoch": 2.7648888888888887, "grad_norm": 2.160173177719116, "learning_rate": 8.948398576512455e-05, "loss": 1.5836, "step": 6221 }, { "epoch": 2.7653333333333334, "grad_norm": 2.594742774963379, "learning_rate": 8.946619217081851e-05, "loss": 1.307, "step": 6222 }, { "epoch": 2.7657777777777777, "grad_norm": 2.604459285736084, "learning_rate": 8.944839857651245e-05, "loss": 1.8215, "step": 6223 }, { "epoch": 2.7662222222222224, "grad_norm": 2.575556516647339, "learning_rate": 8.943060498220641e-05, "loss": 1.7588, "step": 6224 }, { "epoch": 2.7666666666666666, "grad_norm": 2.51969051361084, "learning_rate": 8.941281138790037e-05, "loss": 1.8821, "step": 6225 }, { "epoch": 2.7671111111111113, "grad_norm": 2.5474319458007812, "learning_rate": 8.939501779359431e-05, "loss": 2.1172, "step": 6226 }, { "epoch": 2.7675555555555555, "grad_norm": 2.6223058700561523, "learning_rate": 8.937722419928825e-05, "loss": 1.7894, "step": 6227 }, { "epoch": 2.768, "grad_norm": 2.7866673469543457, "learning_rate": 8.935943060498221e-05, "loss": 1.7887, "step": 6228 }, { "epoch": 2.7684444444444445, "grad_norm": 0.3962222933769226, "learning_rate": 8.934163701067617e-05, "loss": 0.0359, "step": 6229 }, { "epoch": 2.7688888888888887, "grad_norm": 2.9397222995758057, "learning_rate": 8.932384341637011e-05, "loss": 1.8361, "step": 6230 }, { "epoch": 2.7693333333333334, "grad_norm": 2.984018087387085, "learning_rate": 8.930604982206407e-05, "loss": 1.7147, "step": 6231 }, { "epoch": 2.7697777777777777, "grad_norm": 3.337759017944336, "learning_rate": 8.928825622775802e-05, "loss": 2.0495, "step": 6232 }, { "epoch": 2.7702222222222224, "grad_norm": 2.786367654800415, "learning_rate": 8.927046263345195e-05, "loss": 1.7017, "step": 6233 }, { "epoch": 2.7706666666666666, "grad_norm": 3.031346321105957, "learning_rate": 8.925266903914591e-05, "loss": 1.8949, "step": 6234 }, { "epoch": 2.771111111111111, "grad_norm": 2.8684723377227783, "learning_rate": 8.923487544483986e-05, "loss": 1.7939, "step": 6235 }, { "epoch": 2.7715555555555556, "grad_norm": 2.4864673614501953, "learning_rate": 8.921708185053381e-05, "loss": 1.6988, "step": 6236 }, { "epoch": 2.7720000000000002, "grad_norm": 2.5959644317626953, "learning_rate": 8.919928825622776e-05, "loss": 1.4515, "step": 6237 }, { "epoch": 2.7724444444444445, "grad_norm": 2.3537802696228027, "learning_rate": 8.918149466192172e-05, "loss": 1.2161, "step": 6238 }, { "epoch": 2.7728888888888887, "grad_norm": 2.6737425327301025, "learning_rate": 8.916370106761566e-05, "loss": 1.6259, "step": 6239 }, { "epoch": 2.7733333333333334, "grad_norm": 2.907355785369873, "learning_rate": 8.91459074733096e-05, "loss": 1.846, "step": 6240 }, { "epoch": 2.7737777777777777, "grad_norm": 3.1080589294433594, "learning_rate": 8.912811387900356e-05, "loss": 1.718, "step": 6241 }, { "epoch": 2.7742222222222224, "grad_norm": 3.7171342372894287, "learning_rate": 8.911032028469752e-05, "loss": 1.7999, "step": 6242 }, { "epoch": 2.7746666666666666, "grad_norm": 2.974966049194336, "learning_rate": 8.909252669039146e-05, "loss": 1.5202, "step": 6243 }, { "epoch": 2.7751111111111113, "grad_norm": 2.403938055038452, "learning_rate": 8.907473309608542e-05, "loss": 1.3251, "step": 6244 }, { "epoch": 2.7755555555555556, "grad_norm": 3.240013599395752, "learning_rate": 8.905693950177937e-05, "loss": 1.1542, "step": 6245 }, { "epoch": 2.776, "grad_norm": 2.923494815826416, "learning_rate": 8.90391459074733e-05, "loss": 1.5198, "step": 6246 }, { "epoch": 2.7764444444444445, "grad_norm": 3.5100812911987305, "learning_rate": 8.902135231316726e-05, "loss": 1.4171, "step": 6247 }, { "epoch": 2.7768888888888887, "grad_norm": 2.885017156600952, "learning_rate": 8.900355871886122e-05, "loss": 1.4852, "step": 6248 }, { "epoch": 2.7773333333333334, "grad_norm": 3.7107977867126465, "learning_rate": 8.898576512455516e-05, "loss": 1.3071, "step": 6249 }, { "epoch": 2.7777777777777777, "grad_norm": 3.349717378616333, "learning_rate": 8.896797153024912e-05, "loss": 1.5356, "step": 6250 }, { "epoch": 2.7782222222222224, "grad_norm": 2.166078805923462, "learning_rate": 8.895017793594307e-05, "loss": 1.9864, "step": 6251 }, { "epoch": 2.7786666666666666, "grad_norm": 2.238597869873047, "learning_rate": 8.893238434163702e-05, "loss": 1.9695, "step": 6252 }, { "epoch": 2.779111111111111, "grad_norm": 1.4127635955810547, "learning_rate": 8.891459074733096e-05, "loss": 1.1453, "step": 6253 }, { "epoch": 2.7795555555555556, "grad_norm": 2.074450969696045, "learning_rate": 8.889679715302492e-05, "loss": 1.5515, "step": 6254 }, { "epoch": 2.7800000000000002, "grad_norm": 2.3815135955810547, "learning_rate": 8.887900355871886e-05, "loss": 1.9383, "step": 6255 }, { "epoch": 2.7804444444444445, "grad_norm": 2.3668477535247803, "learning_rate": 8.886120996441281e-05, "loss": 1.685, "step": 6256 }, { "epoch": 2.7808888888888887, "grad_norm": 2.168121576309204, "learning_rate": 8.884341637010677e-05, "loss": 1.783, "step": 6257 }, { "epoch": 2.7813333333333334, "grad_norm": 1.5242462158203125, "learning_rate": 8.882562277580071e-05, "loss": 1.0202, "step": 6258 }, { "epoch": 2.7817777777777777, "grad_norm": 2.2029366493225098, "learning_rate": 8.880782918149466e-05, "loss": 1.4853, "step": 6259 }, { "epoch": 2.7822222222222224, "grad_norm": 2.45417857170105, "learning_rate": 8.879003558718861e-05, "loss": 1.9181, "step": 6260 }, { "epoch": 2.7826666666666666, "grad_norm": 2.351133346557617, "learning_rate": 8.877224199288257e-05, "loss": 1.9803, "step": 6261 }, { "epoch": 2.7831111111111113, "grad_norm": 2.16433048248291, "learning_rate": 8.875444839857651e-05, "loss": 1.7061, "step": 6262 }, { "epoch": 2.7835555555555556, "grad_norm": 2.3350791931152344, "learning_rate": 8.873665480427047e-05, "loss": 1.7437, "step": 6263 }, { "epoch": 2.784, "grad_norm": 2.5073392391204834, "learning_rate": 8.871886120996443e-05, "loss": 1.6962, "step": 6264 }, { "epoch": 2.7844444444444445, "grad_norm": 2.518251657485962, "learning_rate": 8.870106761565836e-05, "loss": 1.6035, "step": 6265 }, { "epoch": 2.7848888888888887, "grad_norm": 2.514759063720703, "learning_rate": 8.868327402135231e-05, "loss": 1.9147, "step": 6266 }, { "epoch": 2.7853333333333334, "grad_norm": 2.542900562286377, "learning_rate": 8.866548042704627e-05, "loss": 1.8066, "step": 6267 }, { "epoch": 2.7857777777777777, "grad_norm": 2.330296754837036, "learning_rate": 8.864768683274021e-05, "loss": 1.7774, "step": 6268 }, { "epoch": 2.7862222222222224, "grad_norm": 2.444782018661499, "learning_rate": 8.862989323843417e-05, "loss": 2.0453, "step": 6269 }, { "epoch": 2.7866666666666666, "grad_norm": 2.7158892154693604, "learning_rate": 8.861209964412812e-05, "loss": 2.2444, "step": 6270 }, { "epoch": 2.787111111111111, "grad_norm": 2.286931037902832, "learning_rate": 8.859430604982207e-05, "loss": 1.1165, "step": 6271 }, { "epoch": 2.7875555555555556, "grad_norm": 2.6693062782287598, "learning_rate": 8.857651245551601e-05, "loss": 1.8122, "step": 6272 }, { "epoch": 2.7880000000000003, "grad_norm": 2.527357816696167, "learning_rate": 8.855871886120997e-05, "loss": 1.9374, "step": 6273 }, { "epoch": 2.7884444444444445, "grad_norm": 2.357048273086548, "learning_rate": 8.854092526690392e-05, "loss": 1.9746, "step": 6274 }, { "epoch": 2.7888888888888888, "grad_norm": 2.5850024223327637, "learning_rate": 8.852313167259787e-05, "loss": 1.3167, "step": 6275 }, { "epoch": 2.7893333333333334, "grad_norm": 2.3246774673461914, "learning_rate": 8.850533807829182e-05, "loss": 1.5308, "step": 6276 }, { "epoch": 2.7897777777777777, "grad_norm": 2.6649742126464844, "learning_rate": 8.848754448398578e-05, "loss": 1.9587, "step": 6277 }, { "epoch": 2.7902222222222224, "grad_norm": 2.920469284057617, "learning_rate": 8.846975088967971e-05, "loss": 2.0161, "step": 6278 }, { "epoch": 2.7906666666666666, "grad_norm": 2.4961137771606445, "learning_rate": 8.845195729537367e-05, "loss": 1.7213, "step": 6279 }, { "epoch": 2.7911111111111113, "grad_norm": 2.586489677429199, "learning_rate": 8.843416370106762e-05, "loss": 1.6672, "step": 6280 }, { "epoch": 2.7915555555555556, "grad_norm": 2.7179059982299805, "learning_rate": 8.841637010676156e-05, "loss": 1.6846, "step": 6281 }, { "epoch": 2.792, "grad_norm": 2.3709304332733154, "learning_rate": 8.839857651245552e-05, "loss": 1.6412, "step": 6282 }, { "epoch": 2.7924444444444445, "grad_norm": 2.750460624694824, "learning_rate": 8.838078291814948e-05, "loss": 1.928, "step": 6283 }, { "epoch": 2.7928888888888888, "grad_norm": 2.755523681640625, "learning_rate": 8.836298932384342e-05, "loss": 1.3791, "step": 6284 }, { "epoch": 2.7933333333333334, "grad_norm": 6.7336554527282715, "learning_rate": 8.834519572953736e-05, "loss": 1.7842, "step": 6285 }, { "epoch": 2.7937777777777777, "grad_norm": 1.775144100189209, "learning_rate": 8.832740213523132e-05, "loss": 0.8977, "step": 6286 }, { "epoch": 2.7942222222222224, "grad_norm": 1.6157419681549072, "learning_rate": 8.830960854092528e-05, "loss": 0.6727, "step": 6287 }, { "epoch": 2.7946666666666666, "grad_norm": 3.1187615394592285, "learning_rate": 8.829181494661922e-05, "loss": 2.2613, "step": 6288 }, { "epoch": 2.795111111111111, "grad_norm": 2.994696617126465, "learning_rate": 8.827402135231318e-05, "loss": 1.5852, "step": 6289 }, { "epoch": 2.7955555555555556, "grad_norm": 2.502854585647583, "learning_rate": 8.825622775800713e-05, "loss": 1.5286, "step": 6290 }, { "epoch": 2.7960000000000003, "grad_norm": 2.853635549545288, "learning_rate": 8.823843416370106e-05, "loss": 1.7994, "step": 6291 }, { "epoch": 2.7964444444444445, "grad_norm": 4.293439865112305, "learning_rate": 8.822064056939502e-05, "loss": 2.0785, "step": 6292 }, { "epoch": 2.7968888888888888, "grad_norm": 3.213901996612549, "learning_rate": 8.820284697508897e-05, "loss": 1.9483, "step": 6293 }, { "epoch": 2.7973333333333334, "grad_norm": 3.6944658756256104, "learning_rate": 8.818505338078292e-05, "loss": 2.1967, "step": 6294 }, { "epoch": 2.7977777777777777, "grad_norm": 2.9698970317840576, "learning_rate": 8.816725978647687e-05, "loss": 1.5886, "step": 6295 }, { "epoch": 2.7982222222222224, "grad_norm": 3.9858527183532715, "learning_rate": 8.814946619217083e-05, "loss": 1.9551, "step": 6296 }, { "epoch": 2.7986666666666666, "grad_norm": 4.04840612411499, "learning_rate": 8.813167259786477e-05, "loss": 2.0034, "step": 6297 }, { "epoch": 2.7991111111111113, "grad_norm": 3.7091634273529053, "learning_rate": 8.811387900355872e-05, "loss": 2.1336, "step": 6298 }, { "epoch": 2.7995555555555556, "grad_norm": 3.552135944366455, "learning_rate": 8.809608540925267e-05, "loss": 1.8169, "step": 6299 }, { "epoch": 2.8, "grad_norm": 2.049909830093384, "learning_rate": 8.807829181494662e-05, "loss": 0.6878, "step": 6300 }, { "epoch": 2.8004444444444445, "grad_norm": 3.8162107467651367, "learning_rate": 8.806049822064057e-05, "loss": 2.3236, "step": 6301 }, { "epoch": 2.8008888888888888, "grad_norm": 1.9239376783370972, "learning_rate": 8.804270462633453e-05, "loss": 1.5104, "step": 6302 }, { "epoch": 2.8013333333333335, "grad_norm": 1.867793083190918, "learning_rate": 8.802491103202847e-05, "loss": 1.936, "step": 6303 }, { "epoch": 2.8017777777777777, "grad_norm": 2.062577486038208, "learning_rate": 8.800711743772242e-05, "loss": 1.8487, "step": 6304 }, { "epoch": 2.8022222222222224, "grad_norm": 2.304241895675659, "learning_rate": 8.798932384341637e-05, "loss": 1.9238, "step": 6305 }, { "epoch": 2.8026666666666666, "grad_norm": 2.0936267375946045, "learning_rate": 8.797153024911033e-05, "loss": 1.6966, "step": 6306 }, { "epoch": 2.803111111111111, "grad_norm": 2.325967788696289, "learning_rate": 8.795373665480427e-05, "loss": 2.0191, "step": 6307 }, { "epoch": 2.8035555555555556, "grad_norm": 2.175997495651245, "learning_rate": 8.793594306049823e-05, "loss": 1.5685, "step": 6308 }, { "epoch": 2.8040000000000003, "grad_norm": 1.519380807876587, "learning_rate": 8.791814946619218e-05, "loss": 0.9142, "step": 6309 }, { "epoch": 2.8044444444444445, "grad_norm": 2.5699596405029297, "learning_rate": 8.790035587188611e-05, "loss": 2.3034, "step": 6310 }, { "epoch": 2.8048888888888888, "grad_norm": 2.172449827194214, "learning_rate": 8.788256227758007e-05, "loss": 1.7681, "step": 6311 }, { "epoch": 2.8053333333333335, "grad_norm": 2.67712140083313, "learning_rate": 8.786476868327403e-05, "loss": 2.1584, "step": 6312 }, { "epoch": 2.8057777777777777, "grad_norm": 2.426130771636963, "learning_rate": 8.784697508896797e-05, "loss": 1.7722, "step": 6313 }, { "epoch": 2.806222222222222, "grad_norm": 2.3680663108825684, "learning_rate": 8.782918149466193e-05, "loss": 1.886, "step": 6314 }, { "epoch": 2.8066666666666666, "grad_norm": 2.257155179977417, "learning_rate": 8.781138790035588e-05, "loss": 1.7808, "step": 6315 }, { "epoch": 2.8071111111111113, "grad_norm": 2.636523485183716, "learning_rate": 8.779359430604983e-05, "loss": 1.7881, "step": 6316 }, { "epoch": 2.8075555555555556, "grad_norm": 2.501422882080078, "learning_rate": 8.777580071174377e-05, "loss": 1.814, "step": 6317 }, { "epoch": 2.808, "grad_norm": 2.323500394821167, "learning_rate": 8.775800711743772e-05, "loss": 1.7802, "step": 6318 }, { "epoch": 2.8084444444444445, "grad_norm": 1.7834057807922363, "learning_rate": 8.774021352313168e-05, "loss": 0.901, "step": 6319 }, { "epoch": 2.8088888888888888, "grad_norm": 1.7829649448394775, "learning_rate": 8.772241992882562e-05, "loss": 0.9174, "step": 6320 }, { "epoch": 2.8093333333333335, "grad_norm": 2.242635488510132, "learning_rate": 8.770462633451958e-05, "loss": 1.7682, "step": 6321 }, { "epoch": 2.8097777777777777, "grad_norm": 2.7776167392730713, "learning_rate": 8.768683274021354e-05, "loss": 2.1089, "step": 6322 }, { "epoch": 2.8102222222222224, "grad_norm": 2.151515483856201, "learning_rate": 8.766903914590747e-05, "loss": 1.4301, "step": 6323 }, { "epoch": 2.8106666666666666, "grad_norm": 2.3415439128875732, "learning_rate": 8.765124555160142e-05, "loss": 1.8939, "step": 6324 }, { "epoch": 2.811111111111111, "grad_norm": 2.3634235858917236, "learning_rate": 8.763345195729538e-05, "loss": 1.9442, "step": 6325 }, { "epoch": 2.8115555555555556, "grad_norm": 2.3634817600250244, "learning_rate": 8.761565836298932e-05, "loss": 1.3034, "step": 6326 }, { "epoch": 2.8120000000000003, "grad_norm": 2.2050936222076416, "learning_rate": 8.759786476868328e-05, "loss": 1.4394, "step": 6327 }, { "epoch": 2.8124444444444445, "grad_norm": 2.491987466812134, "learning_rate": 8.758007117437724e-05, "loss": 1.4429, "step": 6328 }, { "epoch": 2.8128888888888888, "grad_norm": 2.7539877891540527, "learning_rate": 8.756227758007118e-05, "loss": 1.8103, "step": 6329 }, { "epoch": 2.8133333333333335, "grad_norm": 1.8611799478530884, "learning_rate": 8.754448398576512e-05, "loss": 0.8885, "step": 6330 }, { "epoch": 2.8137777777777777, "grad_norm": 2.7642264366149902, "learning_rate": 8.752669039145908e-05, "loss": 1.6687, "step": 6331 }, { "epoch": 2.814222222222222, "grad_norm": 2.5059311389923096, "learning_rate": 8.750889679715303e-05, "loss": 1.5068, "step": 6332 }, { "epoch": 2.8146666666666667, "grad_norm": 2.761054515838623, "learning_rate": 8.749110320284698e-05, "loss": 1.7512, "step": 6333 }, { "epoch": 2.8151111111111113, "grad_norm": 2.5050809383392334, "learning_rate": 8.747330960854093e-05, "loss": 1.4633, "step": 6334 }, { "epoch": 2.8155555555555556, "grad_norm": 3.096000909805298, "learning_rate": 8.745551601423489e-05, "loss": 1.7558, "step": 6335 }, { "epoch": 2.816, "grad_norm": 3.1274073123931885, "learning_rate": 8.743772241992882e-05, "loss": 1.614, "step": 6336 }, { "epoch": 2.8164444444444445, "grad_norm": 3.165975332260132, "learning_rate": 8.741992882562278e-05, "loss": 1.5489, "step": 6337 }, { "epoch": 2.8168888888888888, "grad_norm": 3.016789674758911, "learning_rate": 8.740213523131673e-05, "loss": 1.8568, "step": 6338 }, { "epoch": 2.8173333333333335, "grad_norm": 2.219804525375366, "learning_rate": 8.738434163701068e-05, "loss": 1.0983, "step": 6339 }, { "epoch": 2.8177777777777777, "grad_norm": 3.128307819366455, "learning_rate": 8.736654804270463e-05, "loss": 1.4682, "step": 6340 }, { "epoch": 2.8182222222222224, "grad_norm": 3.149846076965332, "learning_rate": 8.734875444839859e-05, "loss": 1.8692, "step": 6341 }, { "epoch": 2.8186666666666667, "grad_norm": 2.897365093231201, "learning_rate": 8.733096085409253e-05, "loss": 1.5963, "step": 6342 }, { "epoch": 2.819111111111111, "grad_norm": 3.354923963546753, "learning_rate": 8.731316725978647e-05, "loss": 1.8655, "step": 6343 }, { "epoch": 2.8195555555555556, "grad_norm": 3.33359432220459, "learning_rate": 8.729537366548043e-05, "loss": 1.8005, "step": 6344 }, { "epoch": 2.82, "grad_norm": 2.5689620971679688, "learning_rate": 8.727758007117437e-05, "loss": 1.5202, "step": 6345 }, { "epoch": 2.8204444444444445, "grad_norm": 4.082332134246826, "learning_rate": 8.725978647686833e-05, "loss": 2.2171, "step": 6346 }, { "epoch": 2.820888888888889, "grad_norm": 3.0966172218322754, "learning_rate": 8.724199288256229e-05, "loss": 1.9997, "step": 6347 }, { "epoch": 2.8213333333333335, "grad_norm": 3.0002849102020264, "learning_rate": 8.722419928825623e-05, "loss": 1.488, "step": 6348 }, { "epoch": 2.8217777777777777, "grad_norm": 4.207763671875, "learning_rate": 8.720640569395017e-05, "loss": 1.2479, "step": 6349 }, { "epoch": 2.822222222222222, "grad_norm": 2.4575085639953613, "learning_rate": 8.718861209964413e-05, "loss": 0.5755, "step": 6350 }, { "epoch": 2.8226666666666667, "grad_norm": 1.834578275680542, "learning_rate": 8.717081850533809e-05, "loss": 2.0906, "step": 6351 }, { "epoch": 2.8231111111111113, "grad_norm": 1.9886993169784546, "learning_rate": 8.715302491103203e-05, "loss": 1.6937, "step": 6352 }, { "epoch": 2.8235555555555556, "grad_norm": 1.45979905128479, "learning_rate": 8.713523131672599e-05, "loss": 1.3237, "step": 6353 }, { "epoch": 2.824, "grad_norm": 2.220140218734741, "learning_rate": 8.711743772241994e-05, "loss": 2.1938, "step": 6354 }, { "epoch": 2.8244444444444445, "grad_norm": 2.36525821685791, "learning_rate": 8.709964412811388e-05, "loss": 1.7316, "step": 6355 }, { "epoch": 2.824888888888889, "grad_norm": 2.672027587890625, "learning_rate": 8.708185053380783e-05, "loss": 2.1021, "step": 6356 }, { "epoch": 2.8253333333333335, "grad_norm": 2.566497325897217, "learning_rate": 8.706405693950178e-05, "loss": 2.0366, "step": 6357 }, { "epoch": 2.8257777777777777, "grad_norm": 2.3866355419158936, "learning_rate": 8.704626334519573e-05, "loss": 1.8904, "step": 6358 }, { "epoch": 2.8262222222222224, "grad_norm": 2.158238172531128, "learning_rate": 8.702846975088968e-05, "loss": 1.9666, "step": 6359 }, { "epoch": 2.8266666666666667, "grad_norm": 2.4062252044677734, "learning_rate": 8.701067615658364e-05, "loss": 1.7557, "step": 6360 }, { "epoch": 2.827111111111111, "grad_norm": 2.215730905532837, "learning_rate": 8.699288256227758e-05, "loss": 1.0731, "step": 6361 }, { "epoch": 2.8275555555555556, "grad_norm": 2.172459125518799, "learning_rate": 8.697508896797153e-05, "loss": 1.7543, "step": 6362 }, { "epoch": 2.828, "grad_norm": 2.4174609184265137, "learning_rate": 8.695729537366548e-05, "loss": 1.5526, "step": 6363 }, { "epoch": 2.8284444444444445, "grad_norm": 1.6709802150726318, "learning_rate": 8.693950177935944e-05, "loss": 0.8726, "step": 6364 }, { "epoch": 2.828888888888889, "grad_norm": 2.830564022064209, "learning_rate": 8.692170818505338e-05, "loss": 1.9783, "step": 6365 }, { "epoch": 2.8293333333333335, "grad_norm": 2.5049219131469727, "learning_rate": 8.690391459074734e-05, "loss": 1.7796, "step": 6366 }, { "epoch": 2.8297777777777777, "grad_norm": 2.455636739730835, "learning_rate": 8.68861209964413e-05, "loss": 1.807, "step": 6367 }, { "epoch": 2.830222222222222, "grad_norm": 2.934166431427002, "learning_rate": 8.686832740213522e-05, "loss": 1.4921, "step": 6368 }, { "epoch": 2.8306666666666667, "grad_norm": 3.1317198276519775, "learning_rate": 8.685053380782918e-05, "loss": 1.7484, "step": 6369 }, { "epoch": 2.8311111111111114, "grad_norm": 3.014157295227051, "learning_rate": 8.683274021352314e-05, "loss": 2.2238, "step": 6370 }, { "epoch": 2.8315555555555556, "grad_norm": 2.56988787651062, "learning_rate": 8.681494661921708e-05, "loss": 1.6546, "step": 6371 }, { "epoch": 2.832, "grad_norm": 2.6320784091949463, "learning_rate": 8.679715302491104e-05, "loss": 2.1411, "step": 6372 }, { "epoch": 2.8324444444444445, "grad_norm": 2.682128667831421, "learning_rate": 8.6779359430605e-05, "loss": 2.1698, "step": 6373 }, { "epoch": 2.832888888888889, "grad_norm": 2.754392623901367, "learning_rate": 8.676156583629894e-05, "loss": 1.8696, "step": 6374 }, { "epoch": 2.8333333333333335, "grad_norm": 2.4099462032318115, "learning_rate": 8.674377224199288e-05, "loss": 1.8406, "step": 6375 }, { "epoch": 2.8337777777777777, "grad_norm": 3.0007071495056152, "learning_rate": 8.672597864768684e-05, "loss": 1.7611, "step": 6376 }, { "epoch": 2.8342222222222224, "grad_norm": 3.2685530185699463, "learning_rate": 8.670818505338079e-05, "loss": 1.8421, "step": 6377 }, { "epoch": 2.8346666666666667, "grad_norm": 2.7052693367004395, "learning_rate": 8.669039145907474e-05, "loss": 1.6633, "step": 6378 }, { "epoch": 2.835111111111111, "grad_norm": 2.628025531768799, "learning_rate": 8.667259786476869e-05, "loss": 1.7274, "step": 6379 }, { "epoch": 2.8355555555555556, "grad_norm": 1.5739624500274658, "learning_rate": 8.665480427046265e-05, "loss": 0.7554, "step": 6380 }, { "epoch": 2.836, "grad_norm": 1.9430078268051147, "learning_rate": 8.663701067615658e-05, "loss": 0.8761, "step": 6381 }, { "epoch": 2.8364444444444445, "grad_norm": 2.2438039779663086, "learning_rate": 8.661921708185053e-05, "loss": 1.1933, "step": 6382 }, { "epoch": 2.836888888888889, "grad_norm": 2.6071414947509766, "learning_rate": 8.660142348754449e-05, "loss": 1.7854, "step": 6383 }, { "epoch": 2.8373333333333335, "grad_norm": 1.6424260139465332, "learning_rate": 8.658362989323843e-05, "loss": 0.7366, "step": 6384 }, { "epoch": 2.8377777777777777, "grad_norm": 2.981468439102173, "learning_rate": 8.656583629893239e-05, "loss": 1.7676, "step": 6385 }, { "epoch": 2.838222222222222, "grad_norm": 2.440037965774536, "learning_rate": 8.654804270462635e-05, "loss": 1.5232, "step": 6386 }, { "epoch": 2.8386666666666667, "grad_norm": 2.5505833625793457, "learning_rate": 8.653024911032029e-05, "loss": 1.4548, "step": 6387 }, { "epoch": 2.8391111111111114, "grad_norm": 2.7578847408294678, "learning_rate": 8.651245551601423e-05, "loss": 1.5028, "step": 6388 }, { "epoch": 2.8395555555555556, "grad_norm": 2.750519275665283, "learning_rate": 8.649466192170819e-05, "loss": 1.7589, "step": 6389 }, { "epoch": 2.84, "grad_norm": 2.7024030685424805, "learning_rate": 8.647686832740213e-05, "loss": 1.6456, "step": 6390 }, { "epoch": 2.8404444444444445, "grad_norm": 2.8694217205047607, "learning_rate": 8.645907473309609e-05, "loss": 1.0521, "step": 6391 }, { "epoch": 2.840888888888889, "grad_norm": 2.930608034133911, "learning_rate": 8.644128113879004e-05, "loss": 1.6561, "step": 6392 }, { "epoch": 2.8413333333333335, "grad_norm": 2.9115495681762695, "learning_rate": 8.642348754448399e-05, "loss": 1.8796, "step": 6393 }, { "epoch": 2.8417777777777777, "grad_norm": 3.2313528060913086, "learning_rate": 8.640569395017793e-05, "loss": 1.8615, "step": 6394 }, { "epoch": 2.8422222222222224, "grad_norm": 2.823005437850952, "learning_rate": 8.638790035587189e-05, "loss": 1.7883, "step": 6395 }, { "epoch": 2.8426666666666667, "grad_norm": 3.2536568641662598, "learning_rate": 8.637010676156584e-05, "loss": 1.734, "step": 6396 }, { "epoch": 2.843111111111111, "grad_norm": 3.4909675121307373, "learning_rate": 8.635231316725979e-05, "loss": 1.5882, "step": 6397 }, { "epoch": 2.8435555555555556, "grad_norm": 3.702479124069214, "learning_rate": 8.633451957295374e-05, "loss": 1.7106, "step": 6398 }, { "epoch": 2.844, "grad_norm": 3.9514353275299072, "learning_rate": 8.63167259786477e-05, "loss": 2.0991, "step": 6399 }, { "epoch": 2.8444444444444446, "grad_norm": 3.16878080368042, "learning_rate": 8.629893238434164e-05, "loss": 1.7365, "step": 6400 }, { "epoch": 2.844888888888889, "grad_norm": 1.7129290103912354, "learning_rate": 8.628113879003559e-05, "loss": 2.2009, "step": 6401 }, { "epoch": 2.8453333333333335, "grad_norm": 1.914031744003296, "learning_rate": 8.626334519572954e-05, "loss": 2.0489, "step": 6402 }, { "epoch": 2.8457777777777777, "grad_norm": 2.1656267642974854, "learning_rate": 8.624555160142348e-05, "loss": 2.4192, "step": 6403 }, { "epoch": 2.846222222222222, "grad_norm": 2.315261125564575, "learning_rate": 8.622775800711744e-05, "loss": 1.8947, "step": 6404 }, { "epoch": 2.8466666666666667, "grad_norm": 2.6250321865081787, "learning_rate": 8.62099644128114e-05, "loss": 2.3402, "step": 6405 }, { "epoch": 2.8471111111111114, "grad_norm": 2.3993537425994873, "learning_rate": 8.619217081850534e-05, "loss": 1.9262, "step": 6406 }, { "epoch": 2.8475555555555556, "grad_norm": 2.0765573978424072, "learning_rate": 8.617437722419928e-05, "loss": 2.1101, "step": 6407 }, { "epoch": 2.848, "grad_norm": 2.398111343383789, "learning_rate": 8.615658362989324e-05, "loss": 1.8188, "step": 6408 }, { "epoch": 2.8484444444444446, "grad_norm": 2.267061471939087, "learning_rate": 8.61387900355872e-05, "loss": 2.3754, "step": 6409 }, { "epoch": 2.848888888888889, "grad_norm": 2.5441832542419434, "learning_rate": 8.612099644128114e-05, "loss": 1.6375, "step": 6410 }, { "epoch": 2.8493333333333335, "grad_norm": 2.2044379711151123, "learning_rate": 8.61032028469751e-05, "loss": 1.8951, "step": 6411 }, { "epoch": 2.8497777777777777, "grad_norm": 2.2039835453033447, "learning_rate": 8.608540925266905e-05, "loss": 1.8844, "step": 6412 }, { "epoch": 2.8502222222222224, "grad_norm": 2.50587797164917, "learning_rate": 8.6067615658363e-05, "loss": 1.9177, "step": 6413 }, { "epoch": 2.8506666666666667, "grad_norm": 2.2638602256774902, "learning_rate": 8.604982206405694e-05, "loss": 1.5041, "step": 6414 }, { "epoch": 2.851111111111111, "grad_norm": 2.9379706382751465, "learning_rate": 8.60320284697509e-05, "loss": 1.5956, "step": 6415 }, { "epoch": 2.8515555555555556, "grad_norm": 2.415709972381592, "learning_rate": 8.601423487544484e-05, "loss": 1.5924, "step": 6416 }, { "epoch": 2.852, "grad_norm": 2.4809000492095947, "learning_rate": 8.59964412811388e-05, "loss": 1.9965, "step": 6417 }, { "epoch": 2.8524444444444446, "grad_norm": 2.643976926803589, "learning_rate": 8.597864768683275e-05, "loss": 2.1711, "step": 6418 }, { "epoch": 2.852888888888889, "grad_norm": 2.432825803756714, "learning_rate": 8.59608540925267e-05, "loss": 1.4952, "step": 6419 }, { "epoch": 2.8533333333333335, "grad_norm": 2.853224277496338, "learning_rate": 8.594306049822064e-05, "loss": 1.5422, "step": 6420 }, { "epoch": 2.8537777777777777, "grad_norm": 2.624593496322632, "learning_rate": 8.59252669039146e-05, "loss": 1.5306, "step": 6421 }, { "epoch": 2.854222222222222, "grad_norm": 2.337578296661377, "learning_rate": 8.590747330960855e-05, "loss": 1.6708, "step": 6422 }, { "epoch": 2.8546666666666667, "grad_norm": 2.62070369720459, "learning_rate": 8.588967971530249e-05, "loss": 1.799, "step": 6423 }, { "epoch": 2.8551111111111114, "grad_norm": 2.670642852783203, "learning_rate": 8.587188612099645e-05, "loss": 1.7654, "step": 6424 }, { "epoch": 2.8555555555555556, "grad_norm": 2.330082416534424, "learning_rate": 8.58540925266904e-05, "loss": 1.4492, "step": 6425 }, { "epoch": 2.856, "grad_norm": 2.2589471340179443, "learning_rate": 8.583629893238434e-05, "loss": 1.5373, "step": 6426 }, { "epoch": 2.8564444444444446, "grad_norm": 2.6059587001800537, "learning_rate": 8.581850533807829e-05, "loss": 1.8362, "step": 6427 }, { "epoch": 2.856888888888889, "grad_norm": 2.6167008876800537, "learning_rate": 8.580071174377225e-05, "loss": 1.7661, "step": 6428 }, { "epoch": 2.857333333333333, "grad_norm": 2.616682291030884, "learning_rate": 8.578291814946619e-05, "loss": 1.8365, "step": 6429 }, { "epoch": 2.8577777777777778, "grad_norm": 0.22476230561733246, "learning_rate": 8.576512455516015e-05, "loss": 0.0266, "step": 6430 }, { "epoch": 2.8582222222222224, "grad_norm": 1.9581819772720337, "learning_rate": 8.57473309608541e-05, "loss": 1.0284, "step": 6431 }, { "epoch": 2.8586666666666667, "grad_norm": 2.870471477508545, "learning_rate": 8.572953736654805e-05, "loss": 1.8517, "step": 6432 }, { "epoch": 2.859111111111111, "grad_norm": 2.1929726600646973, "learning_rate": 8.571174377224199e-05, "loss": 1.1532, "step": 6433 }, { "epoch": 2.8595555555555556, "grad_norm": 2.5478527545928955, "learning_rate": 8.569395017793595e-05, "loss": 1.6376, "step": 6434 }, { "epoch": 2.86, "grad_norm": 3.627499580383301, "learning_rate": 8.567615658362989e-05, "loss": 1.6505, "step": 6435 }, { "epoch": 2.8604444444444446, "grad_norm": 2.877537727355957, "learning_rate": 8.565836298932385e-05, "loss": 2.0833, "step": 6436 }, { "epoch": 2.860888888888889, "grad_norm": 2.8161423206329346, "learning_rate": 8.56405693950178e-05, "loss": 1.7967, "step": 6437 }, { "epoch": 2.8613333333333335, "grad_norm": 3.0182766914367676, "learning_rate": 8.562277580071175e-05, "loss": 1.8528, "step": 6438 }, { "epoch": 2.8617777777777778, "grad_norm": 2.9146299362182617, "learning_rate": 8.560498220640569e-05, "loss": 1.3118, "step": 6439 }, { "epoch": 2.862222222222222, "grad_norm": 4.448184967041016, "learning_rate": 8.558718861209964e-05, "loss": 1.9648, "step": 6440 }, { "epoch": 2.8626666666666667, "grad_norm": 3.721683979034424, "learning_rate": 8.55693950177936e-05, "loss": 2.0082, "step": 6441 }, { "epoch": 2.8631111111111114, "grad_norm": 2.6923537254333496, "learning_rate": 8.555160142348754e-05, "loss": 1.5117, "step": 6442 }, { "epoch": 2.8635555555555556, "grad_norm": 3.0055980682373047, "learning_rate": 8.55338078291815e-05, "loss": 1.688, "step": 6443 }, { "epoch": 2.864, "grad_norm": 2.95833158493042, "learning_rate": 8.551601423487546e-05, "loss": 1.8496, "step": 6444 }, { "epoch": 2.8644444444444446, "grad_norm": 3.0208351612091064, "learning_rate": 8.54982206405694e-05, "loss": 1.6707, "step": 6445 }, { "epoch": 2.864888888888889, "grad_norm": 3.0486018657684326, "learning_rate": 8.548042704626334e-05, "loss": 1.6162, "step": 6446 }, { "epoch": 2.865333333333333, "grad_norm": 4.173706531524658, "learning_rate": 8.54626334519573e-05, "loss": 2.0012, "step": 6447 }, { "epoch": 2.8657777777777778, "grad_norm": 3.727224349975586, "learning_rate": 8.544483985765124e-05, "loss": 2.3882, "step": 6448 }, { "epoch": 2.8662222222222224, "grad_norm": 3.276524782180786, "learning_rate": 8.54270462633452e-05, "loss": 1.3258, "step": 6449 }, { "epoch": 2.8666666666666667, "grad_norm": 3.5537030696868896, "learning_rate": 8.540925266903916e-05, "loss": 0.7531, "step": 6450 }, { "epoch": 2.867111111111111, "grad_norm": 2.074697732925415, "learning_rate": 8.53914590747331e-05, "loss": 2.3748, "step": 6451 }, { "epoch": 2.8675555555555556, "grad_norm": 1.607412576675415, "learning_rate": 8.537366548042704e-05, "loss": 1.1406, "step": 6452 }, { "epoch": 2.868, "grad_norm": 1.7138793468475342, "learning_rate": 8.5355871886121e-05, "loss": 1.1559, "step": 6453 }, { "epoch": 2.8684444444444446, "grad_norm": 2.1833138465881348, "learning_rate": 8.533807829181495e-05, "loss": 1.4804, "step": 6454 }, { "epoch": 2.868888888888889, "grad_norm": 2.5808310508728027, "learning_rate": 8.53202846975089e-05, "loss": 2.0459, "step": 6455 }, { "epoch": 2.8693333333333335, "grad_norm": 2.10170316696167, "learning_rate": 8.530249110320285e-05, "loss": 1.5515, "step": 6456 }, { "epoch": 2.8697777777777778, "grad_norm": 2.5783615112304688, "learning_rate": 8.528469750889681e-05, "loss": 1.7325, "step": 6457 }, { "epoch": 2.870222222222222, "grad_norm": 1.9656144380569458, "learning_rate": 8.526690391459075e-05, "loss": 1.6021, "step": 6458 }, { "epoch": 2.8706666666666667, "grad_norm": 2.899205446243286, "learning_rate": 8.52491103202847e-05, "loss": 1.5858, "step": 6459 }, { "epoch": 2.871111111111111, "grad_norm": 2.2554736137390137, "learning_rate": 8.523131672597865e-05, "loss": 1.8979, "step": 6460 }, { "epoch": 2.8715555555555556, "grad_norm": 2.1474039554595947, "learning_rate": 8.52135231316726e-05, "loss": 1.6935, "step": 6461 }, { "epoch": 2.872, "grad_norm": 2.697214365005493, "learning_rate": 8.519572953736655e-05, "loss": 1.746, "step": 6462 }, { "epoch": 2.8724444444444446, "grad_norm": 2.4106836318969727, "learning_rate": 8.517793594306051e-05, "loss": 1.7322, "step": 6463 }, { "epoch": 2.872888888888889, "grad_norm": 2.5296225547790527, "learning_rate": 8.516014234875445e-05, "loss": 1.6172, "step": 6464 }, { "epoch": 2.873333333333333, "grad_norm": 2.625232696533203, "learning_rate": 8.51423487544484e-05, "loss": 1.9533, "step": 6465 }, { "epoch": 2.8737777777777778, "grad_norm": 2.728658676147461, "learning_rate": 8.512455516014235e-05, "loss": 1.8563, "step": 6466 }, { "epoch": 2.8742222222222225, "grad_norm": 2.2425432205200195, "learning_rate": 8.510676156583631e-05, "loss": 1.1704, "step": 6467 }, { "epoch": 2.8746666666666667, "grad_norm": 1.9965929985046387, "learning_rate": 8.508896797153025e-05, "loss": 1.2467, "step": 6468 }, { "epoch": 2.875111111111111, "grad_norm": 2.5652081966400146, "learning_rate": 8.507117437722421e-05, "loss": 1.9512, "step": 6469 }, { "epoch": 2.8755555555555556, "grad_norm": 2.329965591430664, "learning_rate": 8.505338078291816e-05, "loss": 1.81, "step": 6470 }, { "epoch": 2.876, "grad_norm": 3.135545015335083, "learning_rate": 8.50355871886121e-05, "loss": 1.9133, "step": 6471 }, { "epoch": 2.8764444444444446, "grad_norm": 2.618896722793579, "learning_rate": 8.501779359430605e-05, "loss": 2.2505, "step": 6472 }, { "epoch": 2.876888888888889, "grad_norm": 3.150949001312256, "learning_rate": 8.5e-05, "loss": 2.0441, "step": 6473 }, { "epoch": 2.8773333333333335, "grad_norm": 2.217580556869507, "learning_rate": 8.498220640569395e-05, "loss": 1.4439, "step": 6474 }, { "epoch": 2.8777777777777778, "grad_norm": 2.7368547916412354, "learning_rate": 8.49644128113879e-05, "loss": 2.0908, "step": 6475 }, { "epoch": 2.878222222222222, "grad_norm": 2.5103628635406494, "learning_rate": 8.494661921708186e-05, "loss": 1.9196, "step": 6476 }, { "epoch": 2.8786666666666667, "grad_norm": 2.70851469039917, "learning_rate": 8.49288256227758e-05, "loss": 1.9561, "step": 6477 }, { "epoch": 2.879111111111111, "grad_norm": 2.451458692550659, "learning_rate": 8.491103202846975e-05, "loss": 1.6853, "step": 6478 }, { "epoch": 2.8795555555555556, "grad_norm": 3.1393275260925293, "learning_rate": 8.48932384341637e-05, "loss": 1.797, "step": 6479 }, { "epoch": 2.88, "grad_norm": 2.9535908699035645, "learning_rate": 8.487544483985765e-05, "loss": 2.0018, "step": 6480 }, { "epoch": 2.8804444444444446, "grad_norm": 2.7212975025177, "learning_rate": 8.48576512455516e-05, "loss": 1.7088, "step": 6481 }, { "epoch": 2.880888888888889, "grad_norm": 2.8284823894500732, "learning_rate": 8.483985765124556e-05, "loss": 1.9613, "step": 6482 }, { "epoch": 2.881333333333333, "grad_norm": 3.322567939758301, "learning_rate": 8.48220640569395e-05, "loss": 1.8317, "step": 6483 }, { "epoch": 2.8817777777777778, "grad_norm": 2.870798349380493, "learning_rate": 8.480427046263345e-05, "loss": 1.6794, "step": 6484 }, { "epoch": 2.8822222222222225, "grad_norm": 2.9603869915008545, "learning_rate": 8.47864768683274e-05, "loss": 1.9088, "step": 6485 }, { "epoch": 2.8826666666666667, "grad_norm": 3.2760636806488037, "learning_rate": 8.476868327402136e-05, "loss": 1.3813, "step": 6486 }, { "epoch": 2.883111111111111, "grad_norm": 2.696504831314087, "learning_rate": 8.47508896797153e-05, "loss": 1.692, "step": 6487 }, { "epoch": 2.8835555555555556, "grad_norm": 3.375922679901123, "learning_rate": 8.473309608540926e-05, "loss": 1.5721, "step": 6488 }, { "epoch": 2.884, "grad_norm": 2.901848316192627, "learning_rate": 8.471530249110322e-05, "loss": 1.589, "step": 6489 }, { "epoch": 2.8844444444444446, "grad_norm": 2.677187919616699, "learning_rate": 8.469750889679716e-05, "loss": 2.0427, "step": 6490 }, { "epoch": 2.884888888888889, "grad_norm": 3.2531144618988037, "learning_rate": 8.46797153024911e-05, "loss": 1.8249, "step": 6491 }, { "epoch": 2.8853333333333335, "grad_norm": 3.3353047370910645, "learning_rate": 8.466192170818506e-05, "loss": 1.3361, "step": 6492 }, { "epoch": 2.8857777777777778, "grad_norm": 2.4956469535827637, "learning_rate": 8.4644128113879e-05, "loss": 1.3096, "step": 6493 }, { "epoch": 2.886222222222222, "grad_norm": 3.014362096786499, "learning_rate": 8.462633451957296e-05, "loss": 1.5515, "step": 6494 }, { "epoch": 2.8866666666666667, "grad_norm": 2.6748619079589844, "learning_rate": 8.460854092526691e-05, "loss": 1.5532, "step": 6495 }, { "epoch": 2.887111111111111, "grad_norm": 3.3072657585144043, "learning_rate": 8.459074733096086e-05, "loss": 1.744, "step": 6496 }, { "epoch": 2.8875555555555557, "grad_norm": 3.1664316654205322, "learning_rate": 8.45729537366548e-05, "loss": 1.5507, "step": 6497 }, { "epoch": 2.888, "grad_norm": 3.5255908966064453, "learning_rate": 8.455516014234876e-05, "loss": 2.1625, "step": 6498 }, { "epoch": 2.8884444444444446, "grad_norm": 0.49165022373199463, "learning_rate": 8.453736654804271e-05, "loss": 0.0612, "step": 6499 }, { "epoch": 2.888888888888889, "grad_norm": 4.066702365875244, "learning_rate": 8.451957295373666e-05, "loss": 1.5139, "step": 6500 }, { "epoch": 2.889333333333333, "grad_norm": 1.581194519996643, "learning_rate": 8.450177935943061e-05, "loss": 1.1063, "step": 6501 }, { "epoch": 2.889777777777778, "grad_norm": 1.7735909223556519, "learning_rate": 8.448398576512457e-05, "loss": 1.9774, "step": 6502 }, { "epoch": 2.8902222222222225, "grad_norm": 1.8580666780471802, "learning_rate": 8.446619217081851e-05, "loss": 2.2711, "step": 6503 }, { "epoch": 2.8906666666666667, "grad_norm": 2.2903764247894287, "learning_rate": 8.444839857651245e-05, "loss": 2.2313, "step": 6504 }, { "epoch": 2.891111111111111, "grad_norm": 2.7461414337158203, "learning_rate": 8.443060498220641e-05, "loss": 2.6372, "step": 6505 }, { "epoch": 2.8915555555555557, "grad_norm": 2.5035669803619385, "learning_rate": 8.441281138790035e-05, "loss": 1.8447, "step": 6506 }, { "epoch": 2.892, "grad_norm": 2.2642126083374023, "learning_rate": 8.439501779359431e-05, "loss": 2.3071, "step": 6507 }, { "epoch": 2.8924444444444446, "grad_norm": 2.4640629291534424, "learning_rate": 8.437722419928827e-05, "loss": 2.0594, "step": 6508 }, { "epoch": 2.892888888888889, "grad_norm": 2.5078279972076416, "learning_rate": 8.435943060498221e-05, "loss": 2.0226, "step": 6509 }, { "epoch": 2.8933333333333335, "grad_norm": 2.1550204753875732, "learning_rate": 8.434163701067615e-05, "loss": 1.803, "step": 6510 }, { "epoch": 2.893777777777778, "grad_norm": 2.478727340698242, "learning_rate": 8.432384341637011e-05, "loss": 1.9638, "step": 6511 }, { "epoch": 2.894222222222222, "grad_norm": 2.432596206665039, "learning_rate": 8.430604982206407e-05, "loss": 1.8639, "step": 6512 }, { "epoch": 2.8946666666666667, "grad_norm": 2.1648998260498047, "learning_rate": 8.428825622775801e-05, "loss": 1.8943, "step": 6513 }, { "epoch": 2.895111111111111, "grad_norm": 2.2655625343322754, "learning_rate": 8.427046263345197e-05, "loss": 1.4152, "step": 6514 }, { "epoch": 2.8955555555555557, "grad_norm": 2.6261086463928223, "learning_rate": 8.425266903914592e-05, "loss": 1.9381, "step": 6515 }, { "epoch": 2.896, "grad_norm": 2.0679445266723633, "learning_rate": 8.423487544483986e-05, "loss": 1.5273, "step": 6516 }, { "epoch": 2.8964444444444446, "grad_norm": 2.227121591567993, "learning_rate": 8.421708185053381e-05, "loss": 1.5115, "step": 6517 }, { "epoch": 2.896888888888889, "grad_norm": 2.8133530616760254, "learning_rate": 8.419928825622776e-05, "loss": 2.1005, "step": 6518 }, { "epoch": 2.897333333333333, "grad_norm": 2.5659241676330566, "learning_rate": 8.418149466192171e-05, "loss": 1.6425, "step": 6519 }, { "epoch": 2.897777777777778, "grad_norm": 3.140711784362793, "learning_rate": 8.416370106761566e-05, "loss": 1.7234, "step": 6520 }, { "epoch": 2.8982222222222225, "grad_norm": 2.281583786010742, "learning_rate": 8.414590747330962e-05, "loss": 1.4164, "step": 6521 }, { "epoch": 2.8986666666666667, "grad_norm": 2.2396039962768555, "learning_rate": 8.412811387900356e-05, "loss": 1.6061, "step": 6522 }, { "epoch": 2.899111111111111, "grad_norm": 2.5794193744659424, "learning_rate": 8.41103202846975e-05, "loss": 1.8131, "step": 6523 }, { "epoch": 2.8995555555555557, "grad_norm": 2.583009958267212, "learning_rate": 8.409252669039146e-05, "loss": 1.6304, "step": 6524 }, { "epoch": 2.9, "grad_norm": 2.1925716400146484, "learning_rate": 8.40747330960854e-05, "loss": 1.4056, "step": 6525 }, { "epoch": 2.9004444444444446, "grad_norm": 2.4621171951293945, "learning_rate": 8.405693950177936e-05, "loss": 1.5926, "step": 6526 }, { "epoch": 2.900888888888889, "grad_norm": 2.397620439529419, "learning_rate": 8.403914590747332e-05, "loss": 1.5116, "step": 6527 }, { "epoch": 2.9013333333333335, "grad_norm": 2.841352939605713, "learning_rate": 8.402135231316726e-05, "loss": 1.5999, "step": 6528 }, { "epoch": 2.901777777777778, "grad_norm": 3.072946548461914, "learning_rate": 8.400355871886122e-05, "loss": 2.002, "step": 6529 }, { "epoch": 2.902222222222222, "grad_norm": 2.5697319507598877, "learning_rate": 8.398576512455516e-05, "loss": 1.8486, "step": 6530 }, { "epoch": 2.9026666666666667, "grad_norm": 3.085151195526123, "learning_rate": 8.396797153024912e-05, "loss": 1.6757, "step": 6531 }, { "epoch": 2.903111111111111, "grad_norm": 2.6995904445648193, "learning_rate": 8.395017793594306e-05, "loss": 1.4552, "step": 6532 }, { "epoch": 2.9035555555555557, "grad_norm": 3.3730642795562744, "learning_rate": 8.393238434163702e-05, "loss": 1.8132, "step": 6533 }, { "epoch": 2.904, "grad_norm": 2.572873115539551, "learning_rate": 8.391459074733097e-05, "loss": 1.8587, "step": 6534 }, { "epoch": 2.9044444444444446, "grad_norm": 3.1449742317199707, "learning_rate": 8.389679715302492e-05, "loss": 1.3689, "step": 6535 }, { "epoch": 2.904888888888889, "grad_norm": 2.7788970470428467, "learning_rate": 8.387900355871886e-05, "loss": 1.4992, "step": 6536 }, { "epoch": 2.905333333333333, "grad_norm": 2.823796510696411, "learning_rate": 8.386120996441282e-05, "loss": 1.9288, "step": 6537 }, { "epoch": 2.905777777777778, "grad_norm": 3.0071866512298584, "learning_rate": 8.384341637010676e-05, "loss": 2.0833, "step": 6538 }, { "epoch": 2.9062222222222225, "grad_norm": 2.019472360610962, "learning_rate": 8.382562277580071e-05, "loss": 0.9878, "step": 6539 }, { "epoch": 2.9066666666666667, "grad_norm": 2.4632411003112793, "learning_rate": 8.380782918149467e-05, "loss": 1.6176, "step": 6540 }, { "epoch": 2.907111111111111, "grad_norm": 3.062143564224243, "learning_rate": 8.379003558718861e-05, "loss": 2.1748, "step": 6541 }, { "epoch": 2.9075555555555557, "grad_norm": 4.060853958129883, "learning_rate": 8.377224199288256e-05, "loss": 1.9903, "step": 6542 }, { "epoch": 2.908, "grad_norm": 3.1725289821624756, "learning_rate": 8.375444839857651e-05, "loss": 1.8678, "step": 6543 }, { "epoch": 2.9084444444444446, "grad_norm": 3.047879934310913, "learning_rate": 8.373665480427047e-05, "loss": 1.6334, "step": 6544 }, { "epoch": 2.908888888888889, "grad_norm": 3.0653295516967773, "learning_rate": 8.371886120996441e-05, "loss": 1.8163, "step": 6545 }, { "epoch": 2.9093333333333335, "grad_norm": 3.4780588150024414, "learning_rate": 8.370106761565837e-05, "loss": 1.6996, "step": 6546 }, { "epoch": 2.909777777777778, "grad_norm": 3.106790781021118, "learning_rate": 8.368327402135233e-05, "loss": 1.5692, "step": 6547 }, { "epoch": 2.910222222222222, "grad_norm": 3.5817697048187256, "learning_rate": 8.366548042704627e-05, "loss": 2.3844, "step": 6548 }, { "epoch": 2.9106666666666667, "grad_norm": 3.6129720211029053, "learning_rate": 8.364768683274021e-05, "loss": 1.8793, "step": 6549 }, { "epoch": 2.911111111111111, "grad_norm": 3.0888235569000244, "learning_rate": 8.362989323843417e-05, "loss": 0.7461, "step": 6550 }, { "epoch": 2.9115555555555557, "grad_norm": 1.7484018802642822, "learning_rate": 8.361209964412811e-05, "loss": 1.9615, "step": 6551 }, { "epoch": 2.912, "grad_norm": 2.0905039310455322, "learning_rate": 8.359430604982207e-05, "loss": 2.2685, "step": 6552 }, { "epoch": 2.9124444444444446, "grad_norm": 2.227344512939453, "learning_rate": 8.357651245551602e-05, "loss": 1.8803, "step": 6553 }, { "epoch": 2.912888888888889, "grad_norm": 2.469000816345215, "learning_rate": 8.355871886120997e-05, "loss": 2.3178, "step": 6554 }, { "epoch": 2.913333333333333, "grad_norm": 2.261646270751953, "learning_rate": 8.354092526690391e-05, "loss": 2.118, "step": 6555 }, { "epoch": 2.913777777777778, "grad_norm": 2.2827672958374023, "learning_rate": 8.352313167259787e-05, "loss": 1.5105, "step": 6556 }, { "epoch": 2.9142222222222225, "grad_norm": 2.4409070014953613, "learning_rate": 8.350533807829182e-05, "loss": 2.1229, "step": 6557 }, { "epoch": 2.9146666666666667, "grad_norm": 1.7336750030517578, "learning_rate": 8.348754448398577e-05, "loss": 0.9735, "step": 6558 }, { "epoch": 2.915111111111111, "grad_norm": 2.462920904159546, "learning_rate": 8.346975088967972e-05, "loss": 1.6474, "step": 6559 }, { "epoch": 2.9155555555555557, "grad_norm": 2.5200788974761963, "learning_rate": 8.345195729537368e-05, "loss": 1.8454, "step": 6560 }, { "epoch": 2.916, "grad_norm": 2.2653720378875732, "learning_rate": 8.343416370106762e-05, "loss": 1.7404, "step": 6561 }, { "epoch": 2.916444444444444, "grad_norm": 2.323324680328369, "learning_rate": 8.341637010676157e-05, "loss": 1.7726, "step": 6562 }, { "epoch": 2.916888888888889, "grad_norm": 2.575195074081421, "learning_rate": 8.339857651245552e-05, "loss": 2.0535, "step": 6563 }, { "epoch": 2.9173333333333336, "grad_norm": 2.4426960945129395, "learning_rate": 8.338078291814946e-05, "loss": 1.8601, "step": 6564 }, { "epoch": 2.917777777777778, "grad_norm": 2.3300108909606934, "learning_rate": 8.336298932384342e-05, "loss": 1.7842, "step": 6565 }, { "epoch": 2.918222222222222, "grad_norm": 2.5958733558654785, "learning_rate": 8.334519572953738e-05, "loss": 1.9596, "step": 6566 }, { "epoch": 2.9186666666666667, "grad_norm": 2.2606828212738037, "learning_rate": 8.332740213523132e-05, "loss": 1.2543, "step": 6567 }, { "epoch": 2.919111111111111, "grad_norm": 2.573596954345703, "learning_rate": 8.330960854092526e-05, "loss": 1.6507, "step": 6568 }, { "epoch": 2.9195555555555557, "grad_norm": 1.9967552423477173, "learning_rate": 8.329181494661922e-05, "loss": 1.3919, "step": 6569 }, { "epoch": 2.92, "grad_norm": 2.615650177001953, "learning_rate": 8.327402135231316e-05, "loss": 1.8423, "step": 6570 }, { "epoch": 2.9204444444444446, "grad_norm": 2.7236530780792236, "learning_rate": 8.325622775800712e-05, "loss": 2.3231, "step": 6571 }, { "epoch": 2.920888888888889, "grad_norm": 2.5713248252868652, "learning_rate": 8.323843416370108e-05, "loss": 1.869, "step": 6572 }, { "epoch": 2.921333333333333, "grad_norm": 2.9785003662109375, "learning_rate": 8.322064056939502e-05, "loss": 2.2974, "step": 6573 }, { "epoch": 2.921777777777778, "grad_norm": 2.4110934734344482, "learning_rate": 8.320284697508898e-05, "loss": 1.2225, "step": 6574 }, { "epoch": 2.9222222222222225, "grad_norm": 2.372670888900757, "learning_rate": 8.318505338078292e-05, "loss": 1.2217, "step": 6575 }, { "epoch": 2.9226666666666667, "grad_norm": 2.6958694458007812, "learning_rate": 8.316725978647687e-05, "loss": 1.913, "step": 6576 }, { "epoch": 2.923111111111111, "grad_norm": 2.6148412227630615, "learning_rate": 8.314946619217082e-05, "loss": 1.4505, "step": 6577 }, { "epoch": 2.9235555555555557, "grad_norm": 2.7632627487182617, "learning_rate": 8.313167259786477e-05, "loss": 2.0089, "step": 6578 }, { "epoch": 2.924, "grad_norm": 2.448899984359741, "learning_rate": 8.311387900355873e-05, "loss": 1.7222, "step": 6579 }, { "epoch": 2.924444444444444, "grad_norm": 1.7573479413986206, "learning_rate": 8.309608540925267e-05, "loss": 0.8537, "step": 6580 }, { "epoch": 2.924888888888889, "grad_norm": 2.8462624549865723, "learning_rate": 8.307829181494662e-05, "loss": 2.0423, "step": 6581 }, { "epoch": 2.9253333333333336, "grad_norm": 1.8303264379501343, "learning_rate": 8.306049822064057e-05, "loss": 0.9714, "step": 6582 }, { "epoch": 2.925777777777778, "grad_norm": 2.0662248134613037, "learning_rate": 8.304270462633452e-05, "loss": 0.9561, "step": 6583 }, { "epoch": 2.926222222222222, "grad_norm": 2.5108532905578613, "learning_rate": 8.302491103202847e-05, "loss": 1.4271, "step": 6584 }, { "epoch": 2.9266666666666667, "grad_norm": 2.5698022842407227, "learning_rate": 8.300711743772243e-05, "loss": 1.4, "step": 6585 }, { "epoch": 2.927111111111111, "grad_norm": 2.503103494644165, "learning_rate": 8.298932384341637e-05, "loss": 1.2983, "step": 6586 }, { "epoch": 2.9275555555555557, "grad_norm": 2.902477264404297, "learning_rate": 8.297153024911033e-05, "loss": 1.7343, "step": 6587 }, { "epoch": 2.928, "grad_norm": 2.991421937942505, "learning_rate": 8.295373665480427e-05, "loss": 1.6752, "step": 6588 }, { "epoch": 2.9284444444444446, "grad_norm": 2.4898111820220947, "learning_rate": 8.293594306049823e-05, "loss": 1.7385, "step": 6589 }, { "epoch": 2.928888888888889, "grad_norm": 3.5942859649658203, "learning_rate": 8.291814946619217e-05, "loss": 2.0857, "step": 6590 }, { "epoch": 2.929333333333333, "grad_norm": 1.9334756135940552, "learning_rate": 8.290035587188613e-05, "loss": 0.77, "step": 6591 }, { "epoch": 2.929777777777778, "grad_norm": 3.102285385131836, "learning_rate": 8.288256227758008e-05, "loss": 1.8683, "step": 6592 }, { "epoch": 2.930222222222222, "grad_norm": 2.8912463188171387, "learning_rate": 8.286476868327403e-05, "loss": 1.7117, "step": 6593 }, { "epoch": 2.9306666666666668, "grad_norm": 3.238525152206421, "learning_rate": 8.284697508896797e-05, "loss": 2.01, "step": 6594 }, { "epoch": 2.931111111111111, "grad_norm": 2.9973912239074707, "learning_rate": 8.282918149466193e-05, "loss": 1.7764, "step": 6595 }, { "epoch": 2.9315555555555557, "grad_norm": 3.1456546783447266, "learning_rate": 8.281138790035587e-05, "loss": 2.0722, "step": 6596 }, { "epoch": 2.932, "grad_norm": 2.999476671218872, "learning_rate": 8.279359430604983e-05, "loss": 1.7548, "step": 6597 }, { "epoch": 2.932444444444444, "grad_norm": 3.8145954608917236, "learning_rate": 8.277580071174378e-05, "loss": 1.6423, "step": 6598 }, { "epoch": 2.932888888888889, "grad_norm": 5.341122150421143, "learning_rate": 8.275800711743773e-05, "loss": 1.1883, "step": 6599 }, { "epoch": 2.9333333333333336, "grad_norm": 3.1695616245269775, "learning_rate": 8.274021352313167e-05, "loss": 0.7776, "step": 6600 }, { "epoch": 2.933777777777778, "grad_norm": 2.0281214714050293, "learning_rate": 8.272241992882562e-05, "loss": 1.7799, "step": 6601 }, { "epoch": 2.934222222222222, "grad_norm": 2.2131547927856445, "learning_rate": 8.270462633451958e-05, "loss": 1.9286, "step": 6602 }, { "epoch": 2.9346666666666668, "grad_norm": 2.3528523445129395, "learning_rate": 8.268683274021352e-05, "loss": 1.5521, "step": 6603 }, { "epoch": 2.935111111111111, "grad_norm": 2.04451584815979, "learning_rate": 8.266903914590748e-05, "loss": 1.7594, "step": 6604 }, { "epoch": 2.9355555555555557, "grad_norm": 0.19427204132080078, "learning_rate": 8.265124555160144e-05, "loss": 0.0238, "step": 6605 }, { "epoch": 2.936, "grad_norm": 2.289008855819702, "learning_rate": 8.263345195729538e-05, "loss": 1.6694, "step": 6606 }, { "epoch": 2.9364444444444446, "grad_norm": 3.0516107082366943, "learning_rate": 8.261565836298932e-05, "loss": 1.9453, "step": 6607 }, { "epoch": 2.936888888888889, "grad_norm": 2.20947265625, "learning_rate": 8.259786476868328e-05, "loss": 1.7946, "step": 6608 }, { "epoch": 2.937333333333333, "grad_norm": 2.571842670440674, "learning_rate": 8.258007117437722e-05, "loss": 1.6298, "step": 6609 }, { "epoch": 2.937777777777778, "grad_norm": 2.839123249053955, "learning_rate": 8.256227758007118e-05, "loss": 1.9534, "step": 6610 }, { "epoch": 2.938222222222222, "grad_norm": 2.8716540336608887, "learning_rate": 8.254448398576514e-05, "loss": 1.8971, "step": 6611 }, { "epoch": 2.9386666666666668, "grad_norm": 2.287142276763916, "learning_rate": 8.252669039145908e-05, "loss": 1.73, "step": 6612 }, { "epoch": 2.939111111111111, "grad_norm": 2.8752803802490234, "learning_rate": 8.250889679715302e-05, "loss": 1.7428, "step": 6613 }, { "epoch": 2.9395555555555557, "grad_norm": 2.5958893299102783, "learning_rate": 8.249110320284698e-05, "loss": 1.9177, "step": 6614 }, { "epoch": 2.94, "grad_norm": 2.100194215774536, "learning_rate": 8.247330960854092e-05, "loss": 1.4296, "step": 6615 }, { "epoch": 2.940444444444444, "grad_norm": 2.6018195152282715, "learning_rate": 8.245551601423488e-05, "loss": 1.7576, "step": 6616 }, { "epoch": 2.940888888888889, "grad_norm": 2.3454983234405518, "learning_rate": 8.243772241992883e-05, "loss": 1.5927, "step": 6617 }, { "epoch": 2.9413333333333336, "grad_norm": 2.5776448249816895, "learning_rate": 8.241992882562278e-05, "loss": 1.7602, "step": 6618 }, { "epoch": 2.941777777777778, "grad_norm": 2.55011248588562, "learning_rate": 8.240213523131673e-05, "loss": 1.4901, "step": 6619 }, { "epoch": 2.942222222222222, "grad_norm": 2.298241376876831, "learning_rate": 8.238434163701068e-05, "loss": 1.5664, "step": 6620 }, { "epoch": 2.9426666666666668, "grad_norm": 2.403535842895508, "learning_rate": 8.236654804270463e-05, "loss": 1.7368, "step": 6621 }, { "epoch": 2.943111111111111, "grad_norm": 2.3302366733551025, "learning_rate": 8.234875444839858e-05, "loss": 1.5256, "step": 6622 }, { "epoch": 2.9435555555555557, "grad_norm": 2.4017152786254883, "learning_rate": 8.233096085409253e-05, "loss": 1.9532, "step": 6623 }, { "epoch": 2.944, "grad_norm": 1.9871573448181152, "learning_rate": 8.231316725978649e-05, "loss": 1.2034, "step": 6624 }, { "epoch": 2.9444444444444446, "grad_norm": 2.877697229385376, "learning_rate": 8.229537366548043e-05, "loss": 1.7799, "step": 6625 }, { "epoch": 2.944888888888889, "grad_norm": 2.7882204055786133, "learning_rate": 8.227758007117437e-05, "loss": 1.8585, "step": 6626 }, { "epoch": 2.945333333333333, "grad_norm": 3.066232442855835, "learning_rate": 8.225978647686833e-05, "loss": 1.9722, "step": 6627 }, { "epoch": 2.945777777777778, "grad_norm": 2.9001948833465576, "learning_rate": 8.224199288256227e-05, "loss": 2.0303, "step": 6628 }, { "epoch": 2.946222222222222, "grad_norm": 3.1734731197357178, "learning_rate": 8.222419928825623e-05, "loss": 1.8917, "step": 6629 }, { "epoch": 2.9466666666666668, "grad_norm": 1.936790108680725, "learning_rate": 8.220640569395019e-05, "loss": 0.9073, "step": 6630 }, { "epoch": 2.947111111111111, "grad_norm": 2.223129987716675, "learning_rate": 8.218861209964413e-05, "loss": 0.9589, "step": 6631 }, { "epoch": 2.9475555555555557, "grad_norm": 1.784337043762207, "learning_rate": 8.217081850533809e-05, "loss": 0.7341, "step": 6632 }, { "epoch": 2.948, "grad_norm": 2.703338146209717, "learning_rate": 8.215302491103203e-05, "loss": 1.5117, "step": 6633 }, { "epoch": 2.948444444444444, "grad_norm": 2.924023389816284, "learning_rate": 8.213523131672599e-05, "loss": 1.8323, "step": 6634 }, { "epoch": 2.948888888888889, "grad_norm": 3.084257125854492, "learning_rate": 8.211743772241993e-05, "loss": 1.7585, "step": 6635 }, { "epoch": 2.9493333333333336, "grad_norm": 2.6331820487976074, "learning_rate": 8.209964412811389e-05, "loss": 1.5509, "step": 6636 }, { "epoch": 2.949777777777778, "grad_norm": 2.6916754245758057, "learning_rate": 8.208185053380784e-05, "loss": 1.6519, "step": 6637 }, { "epoch": 2.950222222222222, "grad_norm": 3.28885555267334, "learning_rate": 8.206405693950178e-05, "loss": 1.9601, "step": 6638 }, { "epoch": 2.9506666666666668, "grad_norm": 3.135986328125, "learning_rate": 8.204626334519573e-05, "loss": 1.9278, "step": 6639 }, { "epoch": 2.951111111111111, "grad_norm": 2.845036745071411, "learning_rate": 8.202846975088968e-05, "loss": 1.611, "step": 6640 }, { "epoch": 2.9515555555555557, "grad_norm": 2.581521987915039, "learning_rate": 8.201067615658363e-05, "loss": 1.4946, "step": 6641 }, { "epoch": 2.952, "grad_norm": 2.706934928894043, "learning_rate": 8.199288256227758e-05, "loss": 1.6828, "step": 6642 }, { "epoch": 2.9524444444444446, "grad_norm": 3.34147572517395, "learning_rate": 8.197508896797154e-05, "loss": 2.12, "step": 6643 }, { "epoch": 2.952888888888889, "grad_norm": 3.1140594482421875, "learning_rate": 8.195729537366548e-05, "loss": 2.0541, "step": 6644 }, { "epoch": 2.953333333333333, "grad_norm": 3.713144540786743, "learning_rate": 8.193950177935944e-05, "loss": 2.1426, "step": 6645 }, { "epoch": 2.953777777777778, "grad_norm": 3.481236219406128, "learning_rate": 8.192170818505338e-05, "loss": 1.4833, "step": 6646 }, { "epoch": 2.954222222222222, "grad_norm": 3.1434216499328613, "learning_rate": 8.190391459074734e-05, "loss": 1.8463, "step": 6647 }, { "epoch": 2.9546666666666668, "grad_norm": 3.119110107421875, "learning_rate": 8.188612099644128e-05, "loss": 1.4728, "step": 6648 }, { "epoch": 2.955111111111111, "grad_norm": 0.43881502747535706, "learning_rate": 8.186832740213524e-05, "loss": 0.0622, "step": 6649 }, { "epoch": 2.9555555555555557, "grad_norm": 3.7765581607818604, "learning_rate": 8.18505338078292e-05, "loss": 1.5129, "step": 6650 }, { "epoch": 2.956, "grad_norm": 2.1686058044433594, "learning_rate": 8.183274021352314e-05, "loss": 2.1518, "step": 6651 }, { "epoch": 2.956444444444444, "grad_norm": 2.1492674350738525, "learning_rate": 8.181494661921708e-05, "loss": 1.6543, "step": 6652 }, { "epoch": 2.956888888888889, "grad_norm": 2.0771119594573975, "learning_rate": 8.179715302491104e-05, "loss": 2.1437, "step": 6653 }, { "epoch": 2.9573333333333336, "grad_norm": 2.154829978942871, "learning_rate": 8.177935943060498e-05, "loss": 2.0731, "step": 6654 }, { "epoch": 2.957777777777778, "grad_norm": 2.0446035861968994, "learning_rate": 8.176156583629894e-05, "loss": 2.1788, "step": 6655 }, { "epoch": 2.958222222222222, "grad_norm": 2.4353065490722656, "learning_rate": 8.174377224199289e-05, "loss": 1.9073, "step": 6656 }, { "epoch": 2.958666666666667, "grad_norm": 2.2374634742736816, "learning_rate": 8.172597864768684e-05, "loss": 1.9468, "step": 6657 }, { "epoch": 2.959111111111111, "grad_norm": 1.9991698265075684, "learning_rate": 8.170818505338078e-05, "loss": 2.0529, "step": 6658 }, { "epoch": 2.9595555555555557, "grad_norm": 2.3634281158447266, "learning_rate": 8.169039145907474e-05, "loss": 2.2038, "step": 6659 }, { "epoch": 2.96, "grad_norm": 2.3970882892608643, "learning_rate": 8.167259786476868e-05, "loss": 2.1983, "step": 6660 }, { "epoch": 2.9604444444444447, "grad_norm": 2.5298354625701904, "learning_rate": 8.165480427046264e-05, "loss": 2.1612, "step": 6661 }, { "epoch": 2.960888888888889, "grad_norm": 2.4006104469299316, "learning_rate": 8.163701067615659e-05, "loss": 2.0669, "step": 6662 }, { "epoch": 2.961333333333333, "grad_norm": 2.5116260051727295, "learning_rate": 8.161921708185053e-05, "loss": 1.9285, "step": 6663 }, { "epoch": 2.961777777777778, "grad_norm": 2.3565287590026855, "learning_rate": 8.160142348754449e-05, "loss": 1.9953, "step": 6664 }, { "epoch": 2.962222222222222, "grad_norm": 2.253330945968628, "learning_rate": 8.158362989323843e-05, "loss": 1.5129, "step": 6665 }, { "epoch": 2.962666666666667, "grad_norm": 2.49210524559021, "learning_rate": 8.156583629893239e-05, "loss": 1.9875, "step": 6666 }, { "epoch": 2.963111111111111, "grad_norm": 2.353780508041382, "learning_rate": 8.154804270462633e-05, "loss": 1.471, "step": 6667 }, { "epoch": 2.9635555555555557, "grad_norm": 2.679733991622925, "learning_rate": 8.153024911032029e-05, "loss": 1.8062, "step": 6668 }, { "epoch": 2.964, "grad_norm": 2.1088638305664062, "learning_rate": 8.151245551601425e-05, "loss": 1.6653, "step": 6669 }, { "epoch": 2.964444444444444, "grad_norm": 2.271273612976074, "learning_rate": 8.149466192170819e-05, "loss": 1.8267, "step": 6670 }, { "epoch": 2.964888888888889, "grad_norm": 2.468289613723755, "learning_rate": 8.147686832740213e-05, "loss": 1.5659, "step": 6671 }, { "epoch": 2.9653333333333336, "grad_norm": 2.2520124912261963, "learning_rate": 8.145907473309609e-05, "loss": 1.6409, "step": 6672 }, { "epoch": 2.965777777777778, "grad_norm": 2.9224207401275635, "learning_rate": 8.144128113879003e-05, "loss": 1.7884, "step": 6673 }, { "epoch": 2.966222222222222, "grad_norm": 2.306745767593384, "learning_rate": 8.142348754448399e-05, "loss": 1.8612, "step": 6674 }, { "epoch": 2.966666666666667, "grad_norm": 2.707267999649048, "learning_rate": 8.140569395017794e-05, "loss": 1.6955, "step": 6675 }, { "epoch": 2.967111111111111, "grad_norm": 2.6562561988830566, "learning_rate": 8.138790035587189e-05, "loss": 1.8765, "step": 6676 }, { "epoch": 2.9675555555555553, "grad_norm": 2.344191789627075, "learning_rate": 8.137010676156584e-05, "loss": 1.3519, "step": 6677 }, { "epoch": 2.968, "grad_norm": 2.481653928756714, "learning_rate": 8.135231316725979e-05, "loss": 1.8528, "step": 6678 }, { "epoch": 2.9684444444444447, "grad_norm": 2.491582155227661, "learning_rate": 8.133451957295374e-05, "loss": 1.8404, "step": 6679 }, { "epoch": 2.968888888888889, "grad_norm": 2.41845440864563, "learning_rate": 8.131672597864769e-05, "loss": 1.413, "step": 6680 }, { "epoch": 2.969333333333333, "grad_norm": 2.9708704948425293, "learning_rate": 8.129893238434164e-05, "loss": 2.0273, "step": 6681 }, { "epoch": 2.969777777777778, "grad_norm": 3.0216286182403564, "learning_rate": 8.12811387900356e-05, "loss": 2.0485, "step": 6682 }, { "epoch": 2.970222222222222, "grad_norm": 2.9172136783599854, "learning_rate": 8.126334519572954e-05, "loss": 1.5057, "step": 6683 }, { "epoch": 2.970666666666667, "grad_norm": 3.159728765487671, "learning_rate": 8.124555160142349e-05, "loss": 1.9017, "step": 6684 }, { "epoch": 2.971111111111111, "grad_norm": 2.4506280422210693, "learning_rate": 8.122775800711744e-05, "loss": 1.4581, "step": 6685 }, { "epoch": 2.9715555555555557, "grad_norm": 2.9856271743774414, "learning_rate": 8.120996441281138e-05, "loss": 1.591, "step": 6686 }, { "epoch": 2.972, "grad_norm": 3.172899007797241, "learning_rate": 8.119217081850534e-05, "loss": 1.7744, "step": 6687 }, { "epoch": 2.9724444444444442, "grad_norm": 2.1155688762664795, "learning_rate": 8.11743772241993e-05, "loss": 1.1155, "step": 6688 }, { "epoch": 2.972888888888889, "grad_norm": 2.4878904819488525, "learning_rate": 8.115658362989324e-05, "loss": 1.6012, "step": 6689 }, { "epoch": 2.9733333333333336, "grad_norm": 2.819701671600342, "learning_rate": 8.11387900355872e-05, "loss": 1.6376, "step": 6690 }, { "epoch": 2.973777777777778, "grad_norm": 3.1864094734191895, "learning_rate": 8.112099644128114e-05, "loss": 1.846, "step": 6691 }, { "epoch": 2.974222222222222, "grad_norm": 3.0991835594177246, "learning_rate": 8.11032028469751e-05, "loss": 2.155, "step": 6692 }, { "epoch": 2.974666666666667, "grad_norm": 2.6021008491516113, "learning_rate": 8.108540925266904e-05, "loss": 1.366, "step": 6693 }, { "epoch": 2.975111111111111, "grad_norm": 3.0120420455932617, "learning_rate": 8.1067615658363e-05, "loss": 1.7095, "step": 6694 }, { "epoch": 2.9755555555555553, "grad_norm": 3.54189395904541, "learning_rate": 8.104982206405695e-05, "loss": 1.5697, "step": 6695 }, { "epoch": 2.976, "grad_norm": 2.7960903644561768, "learning_rate": 8.10320284697509e-05, "loss": 1.5675, "step": 6696 }, { "epoch": 2.9764444444444447, "grad_norm": 3.8344061374664307, "learning_rate": 8.101423487544484e-05, "loss": 1.8995, "step": 6697 }, { "epoch": 2.976888888888889, "grad_norm": 3.2357256412506104, "learning_rate": 8.09964412811388e-05, "loss": 1.6873, "step": 6698 }, { "epoch": 2.977333333333333, "grad_norm": 4.216027736663818, "learning_rate": 8.097864768683274e-05, "loss": 1.9569, "step": 6699 }, { "epoch": 2.977777777777778, "grad_norm": 4.249828338623047, "learning_rate": 8.09608540925267e-05, "loss": 1.9055, "step": 6700 }, { "epoch": 2.978222222222222, "grad_norm": 2.1459178924560547, "learning_rate": 8.094306049822065e-05, "loss": 2.1627, "step": 6701 }, { "epoch": 2.978666666666667, "grad_norm": 1.4327117204666138, "learning_rate": 8.09252669039146e-05, "loss": 1.133, "step": 6702 }, { "epoch": 2.979111111111111, "grad_norm": 1.9758896827697754, "learning_rate": 8.090747330960855e-05, "loss": 2.7843, "step": 6703 }, { "epoch": 2.9795555555555557, "grad_norm": 2.312091112136841, "learning_rate": 8.08896797153025e-05, "loss": 2.1245, "step": 6704 }, { "epoch": 2.98, "grad_norm": 1.7319886684417725, "learning_rate": 8.087188612099644e-05, "loss": 0.4393, "step": 6705 }, { "epoch": 2.9804444444444442, "grad_norm": 2.626613140106201, "learning_rate": 8.085409252669039e-05, "loss": 2.6569, "step": 6706 }, { "epoch": 2.980888888888889, "grad_norm": 2.209878444671631, "learning_rate": 8.083629893238435e-05, "loss": 2.1272, "step": 6707 }, { "epoch": 2.981333333333333, "grad_norm": 2.3179056644439697, "learning_rate": 8.081850533807829e-05, "loss": 1.7923, "step": 6708 }, { "epoch": 2.981777777777778, "grad_norm": 2.408010959625244, "learning_rate": 8.080071174377225e-05, "loss": 1.7156, "step": 6709 }, { "epoch": 2.982222222222222, "grad_norm": 3.3281619548797607, "learning_rate": 8.078291814946619e-05, "loss": 1.9335, "step": 6710 }, { "epoch": 2.982666666666667, "grad_norm": 2.58201003074646, "learning_rate": 8.076512455516015e-05, "loss": 1.6612, "step": 6711 }, { "epoch": 2.983111111111111, "grad_norm": 2.454719066619873, "learning_rate": 8.074733096085409e-05, "loss": 2.0895, "step": 6712 }, { "epoch": 2.9835555555555553, "grad_norm": 2.9061319828033447, "learning_rate": 8.072953736654805e-05, "loss": 2.1351, "step": 6713 }, { "epoch": 2.984, "grad_norm": 2.6297855377197266, "learning_rate": 8.0711743772242e-05, "loss": 1.849, "step": 6714 }, { "epoch": 2.9844444444444447, "grad_norm": 2.431936264038086, "learning_rate": 8.069395017793595e-05, "loss": 1.809, "step": 6715 }, { "epoch": 2.984888888888889, "grad_norm": 1.5601791143417358, "learning_rate": 8.067615658362989e-05, "loss": 0.7057, "step": 6716 }, { "epoch": 2.985333333333333, "grad_norm": 2.3893916606903076, "learning_rate": 8.065836298932385e-05, "loss": 1.859, "step": 6717 }, { "epoch": 2.985777777777778, "grad_norm": 2.8812737464904785, "learning_rate": 8.064056939501779e-05, "loss": 1.4955, "step": 6718 }, { "epoch": 2.986222222222222, "grad_norm": 2.6631298065185547, "learning_rate": 8.062277580071175e-05, "loss": 1.8091, "step": 6719 }, { "epoch": 2.986666666666667, "grad_norm": 2.2545714378356934, "learning_rate": 8.06049822064057e-05, "loss": 1.7845, "step": 6720 }, { "epoch": 2.987111111111111, "grad_norm": 2.667125940322876, "learning_rate": 8.058718861209965e-05, "loss": 1.9743, "step": 6721 }, { "epoch": 2.9875555555555557, "grad_norm": 2.8963820934295654, "learning_rate": 8.05693950177936e-05, "loss": 2.2487, "step": 6722 }, { "epoch": 2.988, "grad_norm": 2.3376080989837646, "learning_rate": 8.055160142348754e-05, "loss": 1.7657, "step": 6723 }, { "epoch": 2.9884444444444442, "grad_norm": 2.4739911556243896, "learning_rate": 8.05338078291815e-05, "loss": 1.4618, "step": 6724 }, { "epoch": 2.988888888888889, "grad_norm": 2.836094856262207, "learning_rate": 8.051601423487544e-05, "loss": 1.8076, "step": 6725 }, { "epoch": 2.989333333333333, "grad_norm": 2.611704111099243, "learning_rate": 8.04982206405694e-05, "loss": 1.8282, "step": 6726 }, { "epoch": 2.989777777777778, "grad_norm": 2.016409158706665, "learning_rate": 8.048042704626336e-05, "loss": 1.579, "step": 6727 }, { "epoch": 2.990222222222222, "grad_norm": 3.172990083694458, "learning_rate": 8.04626334519573e-05, "loss": 1.9213, "step": 6728 }, { "epoch": 2.990666666666667, "grad_norm": 2.6530048847198486, "learning_rate": 8.044483985765124e-05, "loss": 1.3573, "step": 6729 }, { "epoch": 2.991111111111111, "grad_norm": 2.725369453430176, "learning_rate": 8.04270462633452e-05, "loss": 1.7038, "step": 6730 }, { "epoch": 2.9915555555555553, "grad_norm": 2.7324156761169434, "learning_rate": 8.040925266903914e-05, "loss": 1.9726, "step": 6731 }, { "epoch": 2.992, "grad_norm": 2.893840789794922, "learning_rate": 8.03914590747331e-05, "loss": 1.5817, "step": 6732 }, { "epoch": 2.9924444444444447, "grad_norm": 3.1419413089752197, "learning_rate": 8.037366548042706e-05, "loss": 1.5595, "step": 6733 }, { "epoch": 2.992888888888889, "grad_norm": 2.4272377490997314, "learning_rate": 8.0355871886121e-05, "loss": 1.338, "step": 6734 }, { "epoch": 2.993333333333333, "grad_norm": 2.944011688232422, "learning_rate": 8.033807829181496e-05, "loss": 1.7456, "step": 6735 }, { "epoch": 2.993777777777778, "grad_norm": 2.8351387977600098, "learning_rate": 8.03202846975089e-05, "loss": 1.6632, "step": 6736 }, { "epoch": 2.994222222222222, "grad_norm": 3.1518852710723877, "learning_rate": 8.030249110320285e-05, "loss": 0.9622, "step": 6737 }, { "epoch": 2.994666666666667, "grad_norm": 2.9322409629821777, "learning_rate": 8.02846975088968e-05, "loss": 1.1134, "step": 6738 }, { "epoch": 2.995111111111111, "grad_norm": 2.9083549976348877, "learning_rate": 8.026690391459075e-05, "loss": 1.7185, "step": 6739 }, { "epoch": 2.9955555555555557, "grad_norm": 2.974486827850342, "learning_rate": 8.024911032028471e-05, "loss": 1.9998, "step": 6740 }, { "epoch": 2.996, "grad_norm": 2.9648959636688232, "learning_rate": 8.023131672597865e-05, "loss": 1.7564, "step": 6741 }, { "epoch": 2.9964444444444442, "grad_norm": 3.1087772846221924, "learning_rate": 8.02135231316726e-05, "loss": 1.4747, "step": 6742 }, { "epoch": 2.996888888888889, "grad_norm": 3.156559944152832, "learning_rate": 8.019572953736655e-05, "loss": 1.6082, "step": 6743 }, { "epoch": 2.997333333333333, "grad_norm": 2.8267040252685547, "learning_rate": 8.01779359430605e-05, "loss": 1.7786, "step": 6744 }, { "epoch": 2.997777777777778, "grad_norm": 3.1161437034606934, "learning_rate": 8.016014234875445e-05, "loss": 1.8029, "step": 6745 }, { "epoch": 2.998222222222222, "grad_norm": 2.9387221336364746, "learning_rate": 8.014234875444841e-05, "loss": 1.4219, "step": 6746 }, { "epoch": 2.998666666666667, "grad_norm": 3.949208974838257, "learning_rate": 8.012455516014235e-05, "loss": 2.0522, "step": 6747 }, { "epoch": 2.999111111111111, "grad_norm": 3.7779977321624756, "learning_rate": 8.010676156583631e-05, "loss": 1.7612, "step": 6748 }, { "epoch": 2.9995555555555553, "grad_norm": 3.2595958709716797, "learning_rate": 8.008896797153025e-05, "loss": 1.5605, "step": 6749 }, { "epoch": 3.0, "grad_norm": 3.8709285259246826, "learning_rate": 8.00711743772242e-05, "loss": 1.2562, "step": 6750 }, { "epoch": 3.0, "eval_loss": 2.560321807861328, "eval_runtime": 47.1336, "eval_samples_per_second": 10.608, "eval_steps_per_second": 10.608, "step": 6750 }, { "epoch": 3.0004444444444442, "grad_norm": 1.5255669355392456, "learning_rate": 8.005338078291815e-05, "loss": 1.4736, "step": 6751 }, { "epoch": 3.000888888888889, "grad_norm": 1.8357324600219727, "learning_rate": 8.003558718861211e-05, "loss": 1.8316, "step": 6752 }, { "epoch": 3.001333333333333, "grad_norm": 2.2091808319091797, "learning_rate": 8.001779359430605e-05, "loss": 1.994, "step": 6753 }, { "epoch": 3.001777777777778, "grad_norm": 2.113086462020874, "learning_rate": 8e-05, "loss": 2.0161, "step": 6754 }, { "epoch": 3.002222222222222, "grad_norm": 2.0146021842956543, "learning_rate": 7.998220640569395e-05, "loss": 1.3508, "step": 6755 }, { "epoch": 3.002666666666667, "grad_norm": 2.153592348098755, "learning_rate": 7.99644128113879e-05, "loss": 1.2604, "step": 6756 }, { "epoch": 3.003111111111111, "grad_norm": 1.5062990188598633, "learning_rate": 7.994661921708185e-05, "loss": 1.0908, "step": 6757 }, { "epoch": 3.0035555555555558, "grad_norm": 2.1184024810791016, "learning_rate": 7.99288256227758e-05, "loss": 1.2938, "step": 6758 }, { "epoch": 3.004, "grad_norm": 2.0012667179107666, "learning_rate": 7.991103202846976e-05, "loss": 1.3528, "step": 6759 }, { "epoch": 3.0044444444444443, "grad_norm": 2.275223731994629, "learning_rate": 7.98932384341637e-05, "loss": 1.2119, "step": 6760 }, { "epoch": 3.004888888888889, "grad_norm": 2.465026617050171, "learning_rate": 7.987544483985766e-05, "loss": 1.4336, "step": 6761 }, { "epoch": 3.005333333333333, "grad_norm": 2.5878005027770996, "learning_rate": 7.98576512455516e-05, "loss": 1.4849, "step": 6762 }, { "epoch": 3.005777777777778, "grad_norm": 2.4724271297454834, "learning_rate": 7.983985765124555e-05, "loss": 1.033, "step": 6763 }, { "epoch": 3.006222222222222, "grad_norm": 2.9463562965393066, "learning_rate": 7.98220640569395e-05, "loss": 1.4481, "step": 6764 }, { "epoch": 3.006666666666667, "grad_norm": 3.1810200214385986, "learning_rate": 7.980427046263346e-05, "loss": 0.9846, "step": 6765 }, { "epoch": 3.007111111111111, "grad_norm": 2.737637758255005, "learning_rate": 7.97864768683274e-05, "loss": 1.0059, "step": 6766 }, { "epoch": 3.0075555555555558, "grad_norm": 2.819664239883423, "learning_rate": 7.976868327402136e-05, "loss": 1.331, "step": 6767 }, { "epoch": 3.008, "grad_norm": 2.880838632583618, "learning_rate": 7.97508896797153e-05, "loss": 1.6169, "step": 6768 }, { "epoch": 3.0084444444444443, "grad_norm": 2.998161792755127, "learning_rate": 7.973309608540926e-05, "loss": 1.3927, "step": 6769 }, { "epoch": 3.008888888888889, "grad_norm": 2.9646785259246826, "learning_rate": 7.97153024911032e-05, "loss": 1.1182, "step": 6770 }, { "epoch": 3.009333333333333, "grad_norm": 3.8950388431549072, "learning_rate": 7.969750889679716e-05, "loss": 1.0995, "step": 6771 }, { "epoch": 3.009777777777778, "grad_norm": 3.958792209625244, "learning_rate": 7.967971530249112e-05, "loss": 1.3026, "step": 6772 }, { "epoch": 3.010222222222222, "grad_norm": 3.3509862422943115, "learning_rate": 7.966192170818506e-05, "loss": 1.511, "step": 6773 }, { "epoch": 3.010666666666667, "grad_norm": 3.338815689086914, "learning_rate": 7.9644128113879e-05, "loss": 1.1785, "step": 6774 }, { "epoch": 3.011111111111111, "grad_norm": 4.509990692138672, "learning_rate": 7.962633451957296e-05, "loss": 1.4605, "step": 6775 }, { "epoch": 3.0115555555555558, "grad_norm": 3.72403883934021, "learning_rate": 7.96085409252669e-05, "loss": 1.3251, "step": 6776 }, { "epoch": 3.012, "grad_norm": 3.0874853134155273, "learning_rate": 7.959074733096086e-05, "loss": 1.424, "step": 6777 }, { "epoch": 3.0124444444444443, "grad_norm": 2.400561809539795, "learning_rate": 7.957295373665481e-05, "loss": 0.6489, "step": 6778 }, { "epoch": 3.012888888888889, "grad_norm": 3.6095080375671387, "learning_rate": 7.955516014234876e-05, "loss": 0.9895, "step": 6779 }, { "epoch": 3.013333333333333, "grad_norm": 4.059244632720947, "learning_rate": 7.953736654804271e-05, "loss": 1.0755, "step": 6780 }, { "epoch": 3.013777777777778, "grad_norm": 3.3951244354248047, "learning_rate": 7.951957295373666e-05, "loss": 1.4593, "step": 6781 }, { "epoch": 3.014222222222222, "grad_norm": 3.264514207839966, "learning_rate": 7.950177935943061e-05, "loss": 1.1751, "step": 6782 }, { "epoch": 3.014666666666667, "grad_norm": 3.5838840007781982, "learning_rate": 7.948398576512456e-05, "loss": 1.2873, "step": 6783 }, { "epoch": 3.015111111111111, "grad_norm": 3.029461622238159, "learning_rate": 7.946619217081851e-05, "loss": 0.5727, "step": 6784 }, { "epoch": 3.0155555555555558, "grad_norm": 2.3278002738952637, "learning_rate": 7.944839857651247e-05, "loss": 0.5224, "step": 6785 }, { "epoch": 3.016, "grad_norm": 3.3948307037353516, "learning_rate": 7.943060498220641e-05, "loss": 1.5969, "step": 6786 }, { "epoch": 3.0164444444444443, "grad_norm": 3.4269049167633057, "learning_rate": 7.941281138790035e-05, "loss": 1.3439, "step": 6787 }, { "epoch": 3.016888888888889, "grad_norm": 4.021193504333496, "learning_rate": 7.939501779359431e-05, "loss": 1.429, "step": 6788 }, { "epoch": 3.017333333333333, "grad_norm": 3.9195425510406494, "learning_rate": 7.937722419928825e-05, "loss": 1.3733, "step": 6789 }, { "epoch": 3.017777777777778, "grad_norm": 3.872466564178467, "learning_rate": 7.935943060498221e-05, "loss": 1.1731, "step": 6790 }, { "epoch": 3.018222222222222, "grad_norm": 4.006219387054443, "learning_rate": 7.934163701067617e-05, "loss": 1.5755, "step": 6791 }, { "epoch": 3.018666666666667, "grad_norm": 4.188656330108643, "learning_rate": 7.932384341637011e-05, "loss": 1.4132, "step": 6792 }, { "epoch": 3.019111111111111, "grad_norm": 3.21732234954834, "learning_rate": 7.930604982206407e-05, "loss": 0.9631, "step": 6793 }, { "epoch": 3.0195555555555558, "grad_norm": 2.5914382934570312, "learning_rate": 7.928825622775801e-05, "loss": 0.9383, "step": 6794 }, { "epoch": 3.02, "grad_norm": 3.1577656269073486, "learning_rate": 7.927046263345195e-05, "loss": 1.1981, "step": 6795 }, { "epoch": 3.0204444444444443, "grad_norm": 3.7935643196105957, "learning_rate": 7.925266903914591e-05, "loss": 1.2181, "step": 6796 }, { "epoch": 3.020888888888889, "grad_norm": 4.230493068695068, "learning_rate": 7.923487544483986e-05, "loss": 1.467, "step": 6797 }, { "epoch": 3.021333333333333, "grad_norm": 4.8975982666015625, "learning_rate": 7.921708185053381e-05, "loss": 1.0094, "step": 6798 }, { "epoch": 3.021777777777778, "grad_norm": 3.1656551361083984, "learning_rate": 7.919928825622776e-05, "loss": 0.5155, "step": 6799 }, { "epoch": 3.022222222222222, "grad_norm": 4.159629821777344, "learning_rate": 7.918149466192171e-05, "loss": 0.1859, "step": 6800 }, { "epoch": 3.022666666666667, "grad_norm": 3.048837900161743, "learning_rate": 7.916370106761566e-05, "loss": 0.8377, "step": 6801 }, { "epoch": 3.023111111111111, "grad_norm": 2.2959177494049072, "learning_rate": 7.91459074733096e-05, "loss": 1.6241, "step": 6802 }, { "epoch": 3.0235555555555558, "grad_norm": 1.9068602323532104, "learning_rate": 7.912811387900356e-05, "loss": 0.0712, "step": 6803 }, { "epoch": 3.024, "grad_norm": 1.6793782711029053, "learning_rate": 7.911032028469752e-05, "loss": 0.5563, "step": 6804 }, { "epoch": 3.0244444444444443, "grad_norm": 2.5858163833618164, "learning_rate": 7.909252669039146e-05, "loss": 1.3781, "step": 6805 }, { "epoch": 3.024888888888889, "grad_norm": 2.612051010131836, "learning_rate": 7.907473309608542e-05, "loss": 1.5614, "step": 6806 }, { "epoch": 3.025333333333333, "grad_norm": 2.8191487789154053, "learning_rate": 7.905693950177936e-05, "loss": 1.3573, "step": 6807 }, { "epoch": 3.025777777777778, "grad_norm": 2.706998109817505, "learning_rate": 7.90391459074733e-05, "loss": 1.6064, "step": 6808 }, { "epoch": 3.026222222222222, "grad_norm": 2.847705841064453, "learning_rate": 7.902135231316726e-05, "loss": 1.0591, "step": 6809 }, { "epoch": 3.026666666666667, "grad_norm": 2.685861825942993, "learning_rate": 7.900355871886122e-05, "loss": 1.5385, "step": 6810 }, { "epoch": 3.027111111111111, "grad_norm": 3.0858583450317383, "learning_rate": 7.898576512455516e-05, "loss": 1.4013, "step": 6811 }, { "epoch": 3.0275555555555558, "grad_norm": 2.825329065322876, "learning_rate": 7.896797153024912e-05, "loss": 1.4406, "step": 6812 }, { "epoch": 3.028, "grad_norm": 3.5686569213867188, "learning_rate": 7.895017793594306e-05, "loss": 1.3895, "step": 6813 }, { "epoch": 3.0284444444444443, "grad_norm": 3.122117280960083, "learning_rate": 7.893238434163702e-05, "loss": 1.3254, "step": 6814 }, { "epoch": 3.028888888888889, "grad_norm": 2.970949649810791, "learning_rate": 7.891459074733096e-05, "loss": 1.4918, "step": 6815 }, { "epoch": 3.029333333333333, "grad_norm": 2.8904595375061035, "learning_rate": 7.889679715302492e-05, "loss": 1.2927, "step": 6816 }, { "epoch": 3.029777777777778, "grad_norm": 3.0297486782073975, "learning_rate": 7.887900355871887e-05, "loss": 1.4865, "step": 6817 }, { "epoch": 3.030222222222222, "grad_norm": 3.1951310634613037, "learning_rate": 7.886120996441282e-05, "loss": 1.1547, "step": 6818 }, { "epoch": 3.030666666666667, "grad_norm": 1.9806190729141235, "learning_rate": 7.884341637010677e-05, "loss": 0.5659, "step": 6819 }, { "epoch": 3.031111111111111, "grad_norm": 2.957610845565796, "learning_rate": 7.882562277580072e-05, "loss": 1.3614, "step": 6820 }, { "epoch": 3.0315555555555553, "grad_norm": 3.6964633464813232, "learning_rate": 7.880782918149466e-05, "loss": 1.7276, "step": 6821 }, { "epoch": 3.032, "grad_norm": 3.108067512512207, "learning_rate": 7.879003558718861e-05, "loss": 1.4918, "step": 6822 }, { "epoch": 3.0324444444444443, "grad_norm": 2.6663601398468018, "learning_rate": 7.877224199288257e-05, "loss": 1.233, "step": 6823 }, { "epoch": 3.032888888888889, "grad_norm": 3.010274648666382, "learning_rate": 7.875444839857651e-05, "loss": 1.3845, "step": 6824 }, { "epoch": 3.033333333333333, "grad_norm": 3.353402614593506, "learning_rate": 7.873665480427047e-05, "loss": 1.3772, "step": 6825 }, { "epoch": 3.033777777777778, "grad_norm": 2.9852728843688965, "learning_rate": 7.871886120996441e-05, "loss": 1.5947, "step": 6826 }, { "epoch": 3.034222222222222, "grad_norm": 2.881147861480713, "learning_rate": 7.870106761565837e-05, "loss": 1.3783, "step": 6827 }, { "epoch": 3.034666666666667, "grad_norm": 3.402742385864258, "learning_rate": 7.868327402135231e-05, "loss": 1.1179, "step": 6828 }, { "epoch": 3.035111111111111, "grad_norm": 2.7964463233947754, "learning_rate": 7.866548042704627e-05, "loss": 0.6328, "step": 6829 }, { "epoch": 3.0355555555555553, "grad_norm": 3.21201229095459, "learning_rate": 7.864768683274023e-05, "loss": 1.4121, "step": 6830 }, { "epoch": 3.036, "grad_norm": 2.9257187843322754, "learning_rate": 7.862989323843417e-05, "loss": 0.7511, "step": 6831 }, { "epoch": 3.0364444444444443, "grad_norm": 3.565912961959839, "learning_rate": 7.861209964412811e-05, "loss": 1.5339, "step": 6832 }, { "epoch": 3.036888888888889, "grad_norm": 2.8367793560028076, "learning_rate": 7.859430604982207e-05, "loss": 1.2353, "step": 6833 }, { "epoch": 3.037333333333333, "grad_norm": 3.7711472511291504, "learning_rate": 7.857651245551601e-05, "loss": 1.4139, "step": 6834 }, { "epoch": 3.037777777777778, "grad_norm": 2.9545300006866455, "learning_rate": 7.855871886120997e-05, "loss": 1.3022, "step": 6835 }, { "epoch": 3.038222222222222, "grad_norm": 3.401104688644409, "learning_rate": 7.854092526690392e-05, "loss": 1.3024, "step": 6836 }, { "epoch": 3.038666666666667, "grad_norm": 3.2514753341674805, "learning_rate": 7.852313167259787e-05, "loss": 0.9559, "step": 6837 }, { "epoch": 3.039111111111111, "grad_norm": 4.7731547355651855, "learning_rate": 7.850533807829182e-05, "loss": 1.3061, "step": 6838 }, { "epoch": 3.0395555555555553, "grad_norm": 3.2436811923980713, "learning_rate": 7.848754448398577e-05, "loss": 1.0786, "step": 6839 }, { "epoch": 3.04, "grad_norm": 3.0958991050720215, "learning_rate": 7.846975088967971e-05, "loss": 1.2044, "step": 6840 }, { "epoch": 3.0404444444444443, "grad_norm": 3.618880271911621, "learning_rate": 7.845195729537367e-05, "loss": 1.0556, "step": 6841 }, { "epoch": 3.040888888888889, "grad_norm": 4.0584001541137695, "learning_rate": 7.843416370106762e-05, "loss": 1.1659, "step": 6842 }, { "epoch": 3.041333333333333, "grad_norm": 4.1572418212890625, "learning_rate": 7.841637010676157e-05, "loss": 1.0593, "step": 6843 }, { "epoch": 3.041777777777778, "grad_norm": 3.093219757080078, "learning_rate": 7.839857651245552e-05, "loss": 1.1444, "step": 6844 }, { "epoch": 3.042222222222222, "grad_norm": 4.556105613708496, "learning_rate": 7.838078291814947e-05, "loss": 1.3215, "step": 6845 }, { "epoch": 3.042666666666667, "grad_norm": 3.609612226486206, "learning_rate": 7.836298932384342e-05, "loss": 1.3856, "step": 6846 }, { "epoch": 3.043111111111111, "grad_norm": 4.959066867828369, "learning_rate": 7.834519572953736e-05, "loss": 1.1098, "step": 6847 }, { "epoch": 3.0435555555555553, "grad_norm": 3.45310640335083, "learning_rate": 7.832740213523132e-05, "loss": 0.7384, "step": 6848 }, { "epoch": 3.044, "grad_norm": 5.438348770141602, "learning_rate": 7.830960854092528e-05, "loss": 1.2785, "step": 6849 }, { "epoch": 3.0444444444444443, "grad_norm": 6.3002119064331055, "learning_rate": 7.829181494661922e-05, "loss": 0.9269, "step": 6850 }, { "epoch": 3.044888888888889, "grad_norm": 2.523265838623047, "learning_rate": 7.827402135231318e-05, "loss": 1.9499, "step": 6851 }, { "epoch": 3.0453333333333332, "grad_norm": 2.234962224960327, "learning_rate": 7.825622775800712e-05, "loss": 1.646, "step": 6852 }, { "epoch": 3.045777777777778, "grad_norm": 2.712780475616455, "learning_rate": 7.823843416370106e-05, "loss": 1.793, "step": 6853 }, { "epoch": 3.046222222222222, "grad_norm": 2.686671018600464, "learning_rate": 7.822064056939502e-05, "loss": 1.5653, "step": 6854 }, { "epoch": 3.046666666666667, "grad_norm": 2.907186269760132, "learning_rate": 7.820284697508898e-05, "loss": 1.5827, "step": 6855 }, { "epoch": 3.047111111111111, "grad_norm": 2.4521493911743164, "learning_rate": 7.818505338078292e-05, "loss": 1.5815, "step": 6856 }, { "epoch": 3.0475555555555554, "grad_norm": 2.803241491317749, "learning_rate": 7.816725978647688e-05, "loss": 1.5903, "step": 6857 }, { "epoch": 3.048, "grad_norm": 2.856114625930786, "learning_rate": 7.814946619217082e-05, "loss": 1.3927, "step": 6858 }, { "epoch": 3.0484444444444443, "grad_norm": 3.376676082611084, "learning_rate": 7.813167259786477e-05, "loss": 1.561, "step": 6859 }, { "epoch": 3.048888888888889, "grad_norm": 3.10386061668396, "learning_rate": 7.811387900355872e-05, "loss": 1.473, "step": 6860 }, { "epoch": 3.0493333333333332, "grad_norm": 3.0056633949279785, "learning_rate": 7.809608540925267e-05, "loss": 1.3637, "step": 6861 }, { "epoch": 3.049777777777778, "grad_norm": 3.1798784732818604, "learning_rate": 7.807829181494663e-05, "loss": 1.7298, "step": 6862 }, { "epoch": 3.050222222222222, "grad_norm": 3.0518245697021484, "learning_rate": 7.806049822064057e-05, "loss": 1.2918, "step": 6863 }, { "epoch": 3.050666666666667, "grad_norm": 2.996016502380371, "learning_rate": 7.804270462633453e-05, "loss": 1.5182, "step": 6864 }, { "epoch": 3.051111111111111, "grad_norm": 3.0778775215148926, "learning_rate": 7.802491103202847e-05, "loss": 1.7565, "step": 6865 }, { "epoch": 3.0515555555555554, "grad_norm": 2.45369553565979, "learning_rate": 7.800711743772242e-05, "loss": 0.9105, "step": 6866 }, { "epoch": 3.052, "grad_norm": 3.1584999561309814, "learning_rate": 7.798932384341637e-05, "loss": 1.5315, "step": 6867 }, { "epoch": 3.0524444444444443, "grad_norm": 2.7491531372070312, "learning_rate": 7.797153024911033e-05, "loss": 1.2398, "step": 6868 }, { "epoch": 3.052888888888889, "grad_norm": 3.181427478790283, "learning_rate": 7.795373665480427e-05, "loss": 1.4826, "step": 6869 }, { "epoch": 3.0533333333333332, "grad_norm": 2.462205648422241, "learning_rate": 7.793594306049823e-05, "loss": 0.7215, "step": 6870 }, { "epoch": 3.053777777777778, "grad_norm": 2.8350601196289062, "learning_rate": 7.791814946619217e-05, "loss": 1.4129, "step": 6871 }, { "epoch": 3.054222222222222, "grad_norm": 3.099984645843506, "learning_rate": 7.790035587188613e-05, "loss": 1.1478, "step": 6872 }, { "epoch": 3.054666666666667, "grad_norm": 2.893434524536133, "learning_rate": 7.788256227758007e-05, "loss": 1.4698, "step": 6873 }, { "epoch": 3.055111111111111, "grad_norm": 3.185155153274536, "learning_rate": 7.786476868327403e-05, "loss": 1.6558, "step": 6874 }, { "epoch": 3.0555555555555554, "grad_norm": 2.3959739208221436, "learning_rate": 7.784697508896798e-05, "loss": 1.266, "step": 6875 }, { "epoch": 3.056, "grad_norm": 3.058983564376831, "learning_rate": 7.782918149466193e-05, "loss": 1.3393, "step": 6876 }, { "epoch": 3.0564444444444443, "grad_norm": 3.080317974090576, "learning_rate": 7.781138790035588e-05, "loss": 1.5509, "step": 6877 }, { "epoch": 3.056888888888889, "grad_norm": 3.090117931365967, "learning_rate": 7.779359430604983e-05, "loss": 1.2201, "step": 6878 }, { "epoch": 3.0573333333333332, "grad_norm": 2.1614432334899902, "learning_rate": 7.777580071174377e-05, "loss": 0.6346, "step": 6879 }, { "epoch": 3.057777777777778, "grad_norm": 2.7697033882141113, "learning_rate": 7.775800711743773e-05, "loss": 1.1247, "step": 6880 }, { "epoch": 3.058222222222222, "grad_norm": 3.1787707805633545, "learning_rate": 7.774021352313168e-05, "loss": 1.0795, "step": 6881 }, { "epoch": 3.058666666666667, "grad_norm": 3.6657590866088867, "learning_rate": 7.772241992882563e-05, "loss": 1.6009, "step": 6882 }, { "epoch": 3.059111111111111, "grad_norm": 3.2929611206054688, "learning_rate": 7.770462633451958e-05, "loss": 1.4993, "step": 6883 }, { "epoch": 3.0595555555555554, "grad_norm": 3.753507137298584, "learning_rate": 7.768683274021352e-05, "loss": 1.6565, "step": 6884 }, { "epoch": 3.06, "grad_norm": 2.848525047302246, "learning_rate": 7.766903914590747e-05, "loss": 0.9654, "step": 6885 }, { "epoch": 3.0604444444444443, "grad_norm": 3.355612277984619, "learning_rate": 7.765124555160142e-05, "loss": 1.0126, "step": 6886 }, { "epoch": 3.060888888888889, "grad_norm": 3.229212522506714, "learning_rate": 7.763345195729538e-05, "loss": 0.7328, "step": 6887 }, { "epoch": 3.0613333333333332, "grad_norm": 4.031661033630371, "learning_rate": 7.761565836298932e-05, "loss": 1.3381, "step": 6888 }, { "epoch": 3.061777777777778, "grad_norm": 3.2130744457244873, "learning_rate": 7.759786476868328e-05, "loss": 1.1694, "step": 6889 }, { "epoch": 3.062222222222222, "grad_norm": 4.325170040130615, "learning_rate": 7.758007117437722e-05, "loss": 0.9918, "step": 6890 }, { "epoch": 3.062666666666667, "grad_norm": 3.6601080894470215, "learning_rate": 7.756227758007118e-05, "loss": 1.6125, "step": 6891 }, { "epoch": 3.063111111111111, "grad_norm": 2.9753174781799316, "learning_rate": 7.754448398576512e-05, "loss": 0.8867, "step": 6892 }, { "epoch": 3.0635555555555554, "grad_norm": 4.441684246063232, "learning_rate": 7.752669039145908e-05, "loss": 1.6672, "step": 6893 }, { "epoch": 3.064, "grad_norm": 5.101708889007568, "learning_rate": 7.750889679715304e-05, "loss": 1.3812, "step": 6894 }, { "epoch": 3.0644444444444443, "grad_norm": 4.84489631652832, "learning_rate": 7.749110320284698e-05, "loss": 1.4592, "step": 6895 }, { "epoch": 3.064888888888889, "grad_norm": 4.635469913482666, "learning_rate": 7.747330960854093e-05, "loss": 1.2783, "step": 6896 }, { "epoch": 3.0653333333333332, "grad_norm": 4.737509727478027, "learning_rate": 7.745551601423488e-05, "loss": 1.1961, "step": 6897 }, { "epoch": 3.065777777777778, "grad_norm": 5.450290203094482, "learning_rate": 7.743772241992882e-05, "loss": 0.9652, "step": 6898 }, { "epoch": 3.066222222222222, "grad_norm": 4.097311973571777, "learning_rate": 7.741992882562278e-05, "loss": 0.9152, "step": 6899 }, { "epoch": 3.066666666666667, "grad_norm": 4.8908257484436035, "learning_rate": 7.740213523131673e-05, "loss": 0.9301, "step": 6900 }, { "epoch": 3.067111111111111, "grad_norm": 1.8603249788284302, "learning_rate": 7.738434163701068e-05, "loss": 0.848, "step": 6901 }, { "epoch": 3.0675555555555554, "grad_norm": 2.392153263092041, "learning_rate": 7.736654804270463e-05, "loss": 1.9415, "step": 6902 }, { "epoch": 3.068, "grad_norm": 2.698664426803589, "learning_rate": 7.734875444839858e-05, "loss": 1.7879, "step": 6903 }, { "epoch": 3.0684444444444443, "grad_norm": 2.744234561920166, "learning_rate": 7.733096085409253e-05, "loss": 1.7097, "step": 6904 }, { "epoch": 3.068888888888889, "grad_norm": 3.156621217727661, "learning_rate": 7.731316725978648e-05, "loss": 2.1794, "step": 6905 }, { "epoch": 3.0693333333333332, "grad_norm": 2.774522066116333, "learning_rate": 7.729537366548043e-05, "loss": 1.6727, "step": 6906 }, { "epoch": 3.069777777777778, "grad_norm": 3.170665740966797, "learning_rate": 7.727758007117439e-05, "loss": 0.7644, "step": 6907 }, { "epoch": 3.070222222222222, "grad_norm": 2.518483877182007, "learning_rate": 7.725978647686833e-05, "loss": 1.337, "step": 6908 }, { "epoch": 3.070666666666667, "grad_norm": 2.7955567836761475, "learning_rate": 7.724199288256229e-05, "loss": 1.1453, "step": 6909 }, { "epoch": 3.071111111111111, "grad_norm": 3.288102865219116, "learning_rate": 7.722419928825623e-05, "loss": 1.2716, "step": 6910 }, { "epoch": 3.0715555555555554, "grad_norm": 3.349630355834961, "learning_rate": 7.720640569395017e-05, "loss": 1.716, "step": 6911 }, { "epoch": 3.072, "grad_norm": 2.8400745391845703, "learning_rate": 7.718861209964413e-05, "loss": 1.2617, "step": 6912 }, { "epoch": 3.0724444444444443, "grad_norm": 2.843729257583618, "learning_rate": 7.717081850533809e-05, "loss": 1.4572, "step": 6913 }, { "epoch": 3.072888888888889, "grad_norm": 3.0647592544555664, "learning_rate": 7.715302491103203e-05, "loss": 1.5921, "step": 6914 }, { "epoch": 3.0733333333333333, "grad_norm": 3.1365675926208496, "learning_rate": 7.713523131672599e-05, "loss": 1.5678, "step": 6915 }, { "epoch": 3.073777777777778, "grad_norm": 2.7604968547821045, "learning_rate": 7.711743772241993e-05, "loss": 1.3843, "step": 6916 }, { "epoch": 3.074222222222222, "grad_norm": 2.654712438583374, "learning_rate": 7.709964412811389e-05, "loss": 1.0891, "step": 6917 }, { "epoch": 3.074666666666667, "grad_norm": 3.091094732284546, "learning_rate": 7.708185053380783e-05, "loss": 1.4574, "step": 6918 }, { "epoch": 3.075111111111111, "grad_norm": 2.8697140216827393, "learning_rate": 7.706405693950179e-05, "loss": 1.2856, "step": 6919 }, { "epoch": 3.0755555555555554, "grad_norm": 4.098965644836426, "learning_rate": 7.704626334519574e-05, "loss": 1.2447, "step": 6920 }, { "epoch": 3.076, "grad_norm": 3.0717883110046387, "learning_rate": 7.702846975088968e-05, "loss": 1.2924, "step": 6921 }, { "epoch": 3.0764444444444443, "grad_norm": 3.0707571506500244, "learning_rate": 7.701067615658364e-05, "loss": 1.3306, "step": 6922 }, { "epoch": 3.076888888888889, "grad_norm": 3.0397446155548096, "learning_rate": 7.699288256227758e-05, "loss": 1.3013, "step": 6923 }, { "epoch": 3.0773333333333333, "grad_norm": 3.693847894668579, "learning_rate": 7.697508896797153e-05, "loss": 1.4978, "step": 6924 }, { "epoch": 3.077777777777778, "grad_norm": 3.112276077270508, "learning_rate": 7.695729537366548e-05, "loss": 1.4023, "step": 6925 }, { "epoch": 3.078222222222222, "grad_norm": 3.29516863822937, "learning_rate": 7.693950177935944e-05, "loss": 1.2205, "step": 6926 }, { "epoch": 3.078666666666667, "grad_norm": 2.247548818588257, "learning_rate": 7.692170818505338e-05, "loss": 0.6385, "step": 6927 }, { "epoch": 3.079111111111111, "grad_norm": 2.3756346702575684, "learning_rate": 7.690391459074734e-05, "loss": 0.7201, "step": 6928 }, { "epoch": 3.0795555555555554, "grad_norm": 3.00430965423584, "learning_rate": 7.688612099644128e-05, "loss": 1.0356, "step": 6929 }, { "epoch": 3.08, "grad_norm": 2.532029390335083, "learning_rate": 7.686832740213523e-05, "loss": 0.5134, "step": 6930 }, { "epoch": 3.0804444444444443, "grad_norm": 0.30651673674583435, "learning_rate": 7.685053380782918e-05, "loss": 0.0335, "step": 6931 }, { "epoch": 3.080888888888889, "grad_norm": 2.980158567428589, "learning_rate": 7.683274021352314e-05, "loss": 1.0148, "step": 6932 }, { "epoch": 3.0813333333333333, "grad_norm": 3.9379446506500244, "learning_rate": 7.681494661921708e-05, "loss": 1.354, "step": 6933 }, { "epoch": 3.081777777777778, "grad_norm": 3.313826560974121, "learning_rate": 7.679715302491104e-05, "loss": 1.5082, "step": 6934 }, { "epoch": 3.082222222222222, "grad_norm": 2.866286039352417, "learning_rate": 7.6779359430605e-05, "loss": 0.7554, "step": 6935 }, { "epoch": 3.0826666666666664, "grad_norm": 3.264652967453003, "learning_rate": 7.676156583629894e-05, "loss": 1.0634, "step": 6936 }, { "epoch": 3.083111111111111, "grad_norm": 3.0582849979400635, "learning_rate": 7.674377224199288e-05, "loss": 0.9423, "step": 6937 }, { "epoch": 3.0835555555555554, "grad_norm": 4.915375709533691, "learning_rate": 7.672597864768684e-05, "loss": 1.1836, "step": 6938 }, { "epoch": 3.084, "grad_norm": 3.4640719890594482, "learning_rate": 7.670818505338079e-05, "loss": 1.2714, "step": 6939 }, { "epoch": 3.0844444444444443, "grad_norm": 3.8669791221618652, "learning_rate": 7.669039145907474e-05, "loss": 1.0252, "step": 6940 }, { "epoch": 3.084888888888889, "grad_norm": 4.150646686553955, "learning_rate": 7.667259786476869e-05, "loss": 1.432, "step": 6941 }, { "epoch": 3.0853333333333333, "grad_norm": 3.675527334213257, "learning_rate": 7.665480427046264e-05, "loss": 1.2161, "step": 6942 }, { "epoch": 3.085777777777778, "grad_norm": 4.122688293457031, "learning_rate": 7.663701067615658e-05, "loss": 1.2763, "step": 6943 }, { "epoch": 3.086222222222222, "grad_norm": 3.758718967437744, "learning_rate": 7.661921708185053e-05, "loss": 1.2185, "step": 6944 }, { "epoch": 3.086666666666667, "grad_norm": 4.356784343719482, "learning_rate": 7.660142348754449e-05, "loss": 1.4009, "step": 6945 }, { "epoch": 3.087111111111111, "grad_norm": 4.016665458679199, "learning_rate": 7.658362989323843e-05, "loss": 0.9262, "step": 6946 }, { "epoch": 3.0875555555555554, "grad_norm": 6.008724689483643, "learning_rate": 7.656583629893239e-05, "loss": 1.37, "step": 6947 }, { "epoch": 3.088, "grad_norm": 5.219587326049805, "learning_rate": 7.654804270462633e-05, "loss": 1.1014, "step": 6948 }, { "epoch": 3.0884444444444443, "grad_norm": 3.348543643951416, "learning_rate": 7.653024911032029e-05, "loss": 0.3617, "step": 6949 }, { "epoch": 3.088888888888889, "grad_norm": 5.084573745727539, "learning_rate": 7.651245551601423e-05, "loss": 0.6737, "step": 6950 }, { "epoch": 3.0893333333333333, "grad_norm": 2.369049072265625, "learning_rate": 7.649466192170819e-05, "loss": 1.7119, "step": 6951 }, { "epoch": 3.089777777777778, "grad_norm": 2.740426540374756, "learning_rate": 7.647686832740215e-05, "loss": 1.6545, "step": 6952 }, { "epoch": 3.090222222222222, "grad_norm": 2.695539712905884, "learning_rate": 7.645907473309609e-05, "loss": 1.7265, "step": 6953 }, { "epoch": 3.0906666666666665, "grad_norm": 2.5847222805023193, "learning_rate": 7.644128113879005e-05, "loss": 1.1412, "step": 6954 }, { "epoch": 3.091111111111111, "grad_norm": 2.5503435134887695, "learning_rate": 7.642348754448399e-05, "loss": 1.3825, "step": 6955 }, { "epoch": 3.0915555555555554, "grad_norm": 3.02182674407959, "learning_rate": 7.640569395017793e-05, "loss": 1.3872, "step": 6956 }, { "epoch": 3.092, "grad_norm": 2.3122494220733643, "learning_rate": 7.638790035587189e-05, "loss": 0.9554, "step": 6957 }, { "epoch": 3.0924444444444443, "grad_norm": 2.8011865615844727, "learning_rate": 7.637010676156584e-05, "loss": 1.2296, "step": 6958 }, { "epoch": 3.092888888888889, "grad_norm": 3.4765398502349854, "learning_rate": 7.635231316725979e-05, "loss": 2.2308, "step": 6959 }, { "epoch": 3.0933333333333333, "grad_norm": 2.9357285499572754, "learning_rate": 7.633451957295374e-05, "loss": 1.394, "step": 6960 }, { "epoch": 3.093777777777778, "grad_norm": 3.6758627891540527, "learning_rate": 7.631672597864769e-05, "loss": 1.7402, "step": 6961 }, { "epoch": 3.094222222222222, "grad_norm": 3.1817829608917236, "learning_rate": 7.629893238434164e-05, "loss": 1.432, "step": 6962 }, { "epoch": 3.0946666666666665, "grad_norm": 3.500157594680786, "learning_rate": 7.628113879003559e-05, "loss": 1.7906, "step": 6963 }, { "epoch": 3.095111111111111, "grad_norm": 3.7627017498016357, "learning_rate": 7.626334519572954e-05, "loss": 1.7225, "step": 6964 }, { "epoch": 3.0955555555555554, "grad_norm": 2.646817445755005, "learning_rate": 7.62455516014235e-05, "loss": 1.3691, "step": 6965 }, { "epoch": 3.096, "grad_norm": 2.6752378940582275, "learning_rate": 7.622775800711744e-05, "loss": 1.4794, "step": 6966 }, { "epoch": 3.0964444444444443, "grad_norm": 2.979374885559082, "learning_rate": 7.62099644128114e-05, "loss": 1.0324, "step": 6967 }, { "epoch": 3.096888888888889, "grad_norm": 2.8177530765533447, "learning_rate": 7.619217081850534e-05, "loss": 1.5979, "step": 6968 }, { "epoch": 3.0973333333333333, "grad_norm": 2.7737972736358643, "learning_rate": 7.617437722419928e-05, "loss": 1.138, "step": 6969 }, { "epoch": 3.097777777777778, "grad_norm": 3.1239850521087646, "learning_rate": 7.615658362989324e-05, "loss": 1.4938, "step": 6970 }, { "epoch": 3.098222222222222, "grad_norm": 3.2741284370422363, "learning_rate": 7.61387900355872e-05, "loss": 1.0675, "step": 6971 }, { "epoch": 3.0986666666666665, "grad_norm": 2.790844678878784, "learning_rate": 7.612099644128114e-05, "loss": 1.2557, "step": 6972 }, { "epoch": 3.099111111111111, "grad_norm": 2.8968873023986816, "learning_rate": 7.61032028469751e-05, "loss": 1.259, "step": 6973 }, { "epoch": 3.0995555555555554, "grad_norm": 3.275770664215088, "learning_rate": 7.608540925266904e-05, "loss": 1.2327, "step": 6974 }, { "epoch": 3.1, "grad_norm": 3.203305721282959, "learning_rate": 7.606761565836298e-05, "loss": 0.9974, "step": 6975 }, { "epoch": 3.1004444444444443, "grad_norm": 3.2165777683258057, "learning_rate": 7.604982206405694e-05, "loss": 1.6345, "step": 6976 }, { "epoch": 3.100888888888889, "grad_norm": 3.0916032791137695, "learning_rate": 7.60320284697509e-05, "loss": 1.2841, "step": 6977 }, { "epoch": 3.1013333333333333, "grad_norm": 3.5910959243774414, "learning_rate": 7.601423487544484e-05, "loss": 0.96, "step": 6978 }, { "epoch": 3.101777777777778, "grad_norm": 3.80975341796875, "learning_rate": 7.59964412811388e-05, "loss": 1.5965, "step": 6979 }, { "epoch": 3.102222222222222, "grad_norm": 2.8560242652893066, "learning_rate": 7.597864768683275e-05, "loss": 0.8661, "step": 6980 }, { "epoch": 3.1026666666666665, "grad_norm": 4.064279079437256, "learning_rate": 7.59608540925267e-05, "loss": 1.3318, "step": 6981 }, { "epoch": 3.103111111111111, "grad_norm": 3.475475311279297, "learning_rate": 7.594306049822064e-05, "loss": 0.9713, "step": 6982 }, { "epoch": 3.1035555555555554, "grad_norm": 3.4135968685150146, "learning_rate": 7.59252669039146e-05, "loss": 1.1763, "step": 6983 }, { "epoch": 3.104, "grad_norm": 2.9875054359436035, "learning_rate": 7.590747330960855e-05, "loss": 1.3583, "step": 6984 }, { "epoch": 3.1044444444444443, "grad_norm": 3.5660154819488525, "learning_rate": 7.58896797153025e-05, "loss": 1.2142, "step": 6985 }, { "epoch": 3.104888888888889, "grad_norm": 3.5056378841400146, "learning_rate": 7.587188612099645e-05, "loss": 1.3169, "step": 6986 }, { "epoch": 3.1053333333333333, "grad_norm": 2.358790636062622, "learning_rate": 7.58540925266904e-05, "loss": 0.488, "step": 6987 }, { "epoch": 3.105777777777778, "grad_norm": 3.0565037727355957, "learning_rate": 7.583629893238434e-05, "loss": 1.1014, "step": 6988 }, { "epoch": 3.106222222222222, "grad_norm": 2.9908316135406494, "learning_rate": 7.581850533807829e-05, "loss": 1.0469, "step": 6989 }, { "epoch": 3.1066666666666665, "grad_norm": 3.8466875553131104, "learning_rate": 7.580071174377225e-05, "loss": 0.9868, "step": 6990 }, { "epoch": 3.107111111111111, "grad_norm": 4.6302032470703125, "learning_rate": 7.578291814946619e-05, "loss": 1.6431, "step": 6991 }, { "epoch": 3.1075555555555554, "grad_norm": 3.230506420135498, "learning_rate": 7.576512455516015e-05, "loss": 0.8402, "step": 6992 }, { "epoch": 3.108, "grad_norm": 3.8830463886260986, "learning_rate": 7.57473309608541e-05, "loss": 1.1773, "step": 6993 }, { "epoch": 3.1084444444444443, "grad_norm": 4.235727310180664, "learning_rate": 7.572953736654805e-05, "loss": 1.2941, "step": 6994 }, { "epoch": 3.108888888888889, "grad_norm": 4.56505012512207, "learning_rate": 7.571174377224199e-05, "loss": 1.1328, "step": 6995 }, { "epoch": 3.1093333333333333, "grad_norm": 3.8152174949645996, "learning_rate": 7.569395017793595e-05, "loss": 0.8993, "step": 6996 }, { "epoch": 3.109777777777778, "grad_norm": 5.836917877197266, "learning_rate": 7.56761565836299e-05, "loss": 1.6419, "step": 6997 }, { "epoch": 3.110222222222222, "grad_norm": 3.6817240715026855, "learning_rate": 7.565836298932385e-05, "loss": 0.995, "step": 6998 }, { "epoch": 3.1106666666666665, "grad_norm": 4.596709251403809, "learning_rate": 7.56405693950178e-05, "loss": 0.7939, "step": 6999 }, { "epoch": 3.111111111111111, "grad_norm": 3.930978775024414, "learning_rate": 7.562277580071175e-05, "loss": 0.7911, "step": 7000 }, { "epoch": 3.1115555555555554, "grad_norm": 2.6198768615722656, "learning_rate": 7.560498220640569e-05, "loss": 1.8534, "step": 7001 }, { "epoch": 3.112, "grad_norm": 2.364030599594116, "learning_rate": 7.558718861209965e-05, "loss": 2.0188, "step": 7002 }, { "epoch": 3.1124444444444443, "grad_norm": 2.803751230239868, "learning_rate": 7.55693950177936e-05, "loss": 1.405, "step": 7003 }, { "epoch": 3.112888888888889, "grad_norm": 2.8711655139923096, "learning_rate": 7.555160142348755e-05, "loss": 1.3464, "step": 7004 }, { "epoch": 3.1133333333333333, "grad_norm": 2.2702465057373047, "learning_rate": 7.55338078291815e-05, "loss": 1.1326, "step": 7005 }, { "epoch": 3.113777777777778, "grad_norm": 3.279500961303711, "learning_rate": 7.551601423487544e-05, "loss": 1.7044, "step": 7006 }, { "epoch": 3.1142222222222222, "grad_norm": 3.2445764541625977, "learning_rate": 7.54982206405694e-05, "loss": 1.5965, "step": 7007 }, { "epoch": 3.1146666666666665, "grad_norm": 3.1852023601531982, "learning_rate": 7.548042704626334e-05, "loss": 1.7852, "step": 7008 }, { "epoch": 3.115111111111111, "grad_norm": 2.400352716445923, "learning_rate": 7.54626334519573e-05, "loss": 0.6979, "step": 7009 }, { "epoch": 3.1155555555555554, "grad_norm": 3.9273557662963867, "learning_rate": 7.544483985765126e-05, "loss": 1.7671, "step": 7010 }, { "epoch": 3.116, "grad_norm": 3.297231674194336, "learning_rate": 7.54270462633452e-05, "loss": 1.6285, "step": 7011 }, { "epoch": 3.1164444444444444, "grad_norm": 2.7438652515411377, "learning_rate": 7.540925266903916e-05, "loss": 1.2797, "step": 7012 }, { "epoch": 3.116888888888889, "grad_norm": 2.7498419284820557, "learning_rate": 7.53914590747331e-05, "loss": 1.4858, "step": 7013 }, { "epoch": 3.1173333333333333, "grad_norm": 2.6676368713378906, "learning_rate": 7.537366548042704e-05, "loss": 1.3337, "step": 7014 }, { "epoch": 3.117777777777778, "grad_norm": 2.901026487350464, "learning_rate": 7.5355871886121e-05, "loss": 1.0484, "step": 7015 }, { "epoch": 3.1182222222222222, "grad_norm": 2.0546693801879883, "learning_rate": 7.533807829181496e-05, "loss": 0.693, "step": 7016 }, { "epoch": 3.1186666666666665, "grad_norm": 2.8888492584228516, "learning_rate": 7.53202846975089e-05, "loss": 1.1957, "step": 7017 }, { "epoch": 3.119111111111111, "grad_norm": 3.6129257678985596, "learning_rate": 7.530249110320285e-05, "loss": 1.4137, "step": 7018 }, { "epoch": 3.1195555555555554, "grad_norm": 2.8861212730407715, "learning_rate": 7.52846975088968e-05, "loss": 1.3716, "step": 7019 }, { "epoch": 3.12, "grad_norm": 2.638096809387207, "learning_rate": 7.526690391459074e-05, "loss": 1.1059, "step": 7020 }, { "epoch": 3.1204444444444444, "grad_norm": 3.651658773422241, "learning_rate": 7.52491103202847e-05, "loss": 1.4312, "step": 7021 }, { "epoch": 3.120888888888889, "grad_norm": 3.251763343811035, "learning_rate": 7.523131672597865e-05, "loss": 1.2436, "step": 7022 }, { "epoch": 3.1213333333333333, "grad_norm": 3.0787482261657715, "learning_rate": 7.52135231316726e-05, "loss": 1.6314, "step": 7023 }, { "epoch": 3.121777777777778, "grad_norm": 3.3005053997039795, "learning_rate": 7.519572953736655e-05, "loss": 1.3528, "step": 7024 }, { "epoch": 3.1222222222222222, "grad_norm": 3.0210680961608887, "learning_rate": 7.517793594306051e-05, "loss": 1.028, "step": 7025 }, { "epoch": 3.1226666666666665, "grad_norm": 3.53727126121521, "learning_rate": 7.516014234875445e-05, "loss": 1.2015, "step": 7026 }, { "epoch": 3.123111111111111, "grad_norm": 3.3059957027435303, "learning_rate": 7.51423487544484e-05, "loss": 1.1474, "step": 7027 }, { "epoch": 3.1235555555555554, "grad_norm": 3.240772008895874, "learning_rate": 7.512455516014235e-05, "loss": 1.1528, "step": 7028 }, { "epoch": 3.124, "grad_norm": 2.7922959327697754, "learning_rate": 7.510676156583631e-05, "loss": 1.1426, "step": 7029 }, { "epoch": 3.1244444444444444, "grad_norm": 3.379770278930664, "learning_rate": 7.508896797153025e-05, "loss": 1.3263, "step": 7030 }, { "epoch": 3.124888888888889, "grad_norm": 2.729382038116455, "learning_rate": 7.507117437722421e-05, "loss": 1.1248, "step": 7031 }, { "epoch": 3.1253333333333333, "grad_norm": 3.164997100830078, "learning_rate": 7.505338078291815e-05, "loss": 1.3626, "step": 7032 }, { "epoch": 3.1257777777777775, "grad_norm": 2.980705499649048, "learning_rate": 7.50355871886121e-05, "loss": 1.0401, "step": 7033 }, { "epoch": 3.1262222222222222, "grad_norm": 3.0273003578186035, "learning_rate": 7.501779359430605e-05, "loss": 1.0748, "step": 7034 }, { "epoch": 3.1266666666666665, "grad_norm": 3.4305291175842285, "learning_rate": 7.500000000000001e-05, "loss": 1.38, "step": 7035 }, { "epoch": 3.127111111111111, "grad_norm": 3.338956594467163, "learning_rate": 7.498220640569395e-05, "loss": 0.9866, "step": 7036 }, { "epoch": 3.1275555555555554, "grad_norm": 3.5987491607666016, "learning_rate": 7.49644128113879e-05, "loss": 1.1284, "step": 7037 }, { "epoch": 3.128, "grad_norm": 3.742375135421753, "learning_rate": 7.494661921708186e-05, "loss": 1.4609, "step": 7038 }, { "epoch": 3.1284444444444444, "grad_norm": 3.7690203189849854, "learning_rate": 7.49288256227758e-05, "loss": 1.5164, "step": 7039 }, { "epoch": 3.128888888888889, "grad_norm": 4.437491416931152, "learning_rate": 7.491103202846975e-05, "loss": 1.1423, "step": 7040 }, { "epoch": 3.1293333333333333, "grad_norm": 3.674255609512329, "learning_rate": 7.48932384341637e-05, "loss": 1.3641, "step": 7041 }, { "epoch": 3.129777777777778, "grad_norm": 3.8117153644561768, "learning_rate": 7.487544483985766e-05, "loss": 1.0165, "step": 7042 }, { "epoch": 3.1302222222222222, "grad_norm": 3.8670129776000977, "learning_rate": 7.48576512455516e-05, "loss": 1.0165, "step": 7043 }, { "epoch": 3.1306666666666665, "grad_norm": 3.9163060188293457, "learning_rate": 7.483985765124556e-05, "loss": 0.9119, "step": 7044 }, { "epoch": 3.131111111111111, "grad_norm": 4.793590545654297, "learning_rate": 7.48220640569395e-05, "loss": 1.3223, "step": 7045 }, { "epoch": 3.1315555555555554, "grad_norm": 5.119454383850098, "learning_rate": 7.480427046263345e-05, "loss": 1.1945, "step": 7046 }, { "epoch": 3.132, "grad_norm": 4.798033714294434, "learning_rate": 7.47864768683274e-05, "loss": 1.407, "step": 7047 }, { "epoch": 3.1324444444444444, "grad_norm": 2.8798067569732666, "learning_rate": 7.476868327402136e-05, "loss": 0.682, "step": 7048 }, { "epoch": 3.132888888888889, "grad_norm": 2.7945594787597656, "learning_rate": 7.47508896797153e-05, "loss": 0.5815, "step": 7049 }, { "epoch": 3.1333333333333333, "grad_norm": 4.523054122924805, "learning_rate": 7.473309608540926e-05, "loss": 0.7363, "step": 7050 }, { "epoch": 3.1337777777777776, "grad_norm": 2.011254072189331, "learning_rate": 7.471530249110322e-05, "loss": 0.9446, "step": 7051 }, { "epoch": 3.1342222222222222, "grad_norm": 1.6838473081588745, "learning_rate": 7.469750889679716e-05, "loss": 0.86, "step": 7052 }, { "epoch": 3.1346666666666665, "grad_norm": 2.2713677883148193, "learning_rate": 7.46797153024911e-05, "loss": 1.7424, "step": 7053 }, { "epoch": 3.135111111111111, "grad_norm": 2.4536499977111816, "learning_rate": 7.466192170818506e-05, "loss": 1.5837, "step": 7054 }, { "epoch": 3.1355555555555554, "grad_norm": 1.8696529865264893, "learning_rate": 7.464412811387901e-05, "loss": 0.6377, "step": 7055 }, { "epoch": 3.136, "grad_norm": 3.031843900680542, "learning_rate": 7.462633451957296e-05, "loss": 1.8678, "step": 7056 }, { "epoch": 3.1364444444444444, "grad_norm": 2.593820333480835, "learning_rate": 7.460854092526691e-05, "loss": 1.4144, "step": 7057 }, { "epoch": 3.136888888888889, "grad_norm": 2.432314872741699, "learning_rate": 7.459074733096086e-05, "loss": 1.5448, "step": 7058 }, { "epoch": 3.1373333333333333, "grad_norm": 2.6551852226257324, "learning_rate": 7.45729537366548e-05, "loss": 1.3579, "step": 7059 }, { "epoch": 3.137777777777778, "grad_norm": 3.1164588928222656, "learning_rate": 7.455516014234876e-05, "loss": 1.927, "step": 7060 }, { "epoch": 3.1382222222222222, "grad_norm": 2.72015643119812, "learning_rate": 7.453736654804271e-05, "loss": 1.3657, "step": 7061 }, { "epoch": 3.1386666666666665, "grad_norm": 3.068243980407715, "learning_rate": 7.451957295373666e-05, "loss": 1.4239, "step": 7062 }, { "epoch": 3.139111111111111, "grad_norm": 2.426748514175415, "learning_rate": 7.450177935943061e-05, "loss": 1.1847, "step": 7063 }, { "epoch": 3.1395555555555554, "grad_norm": 2.8654985427856445, "learning_rate": 7.448398576512457e-05, "loss": 1.5308, "step": 7064 }, { "epoch": 3.14, "grad_norm": 1.9712382555007935, "learning_rate": 7.44661921708185e-05, "loss": 0.7233, "step": 7065 }, { "epoch": 3.1404444444444444, "grad_norm": 1.8847380876541138, "learning_rate": 7.444839857651246e-05, "loss": 0.7311, "step": 7066 }, { "epoch": 3.140888888888889, "grad_norm": 2.814565658569336, "learning_rate": 7.443060498220641e-05, "loss": 1.1762, "step": 7067 }, { "epoch": 3.1413333333333333, "grad_norm": 2.49889874458313, "learning_rate": 7.441281138790035e-05, "loss": 1.2439, "step": 7068 }, { "epoch": 3.1417777777777776, "grad_norm": 3.1838126182556152, "learning_rate": 7.439501779359431e-05, "loss": 1.3337, "step": 7069 }, { "epoch": 3.1422222222222222, "grad_norm": 2.6192221641540527, "learning_rate": 7.437722419928827e-05, "loss": 1.065, "step": 7070 }, { "epoch": 3.1426666666666665, "grad_norm": 2.363414764404297, "learning_rate": 7.435943060498221e-05, "loss": 1.0256, "step": 7071 }, { "epoch": 3.143111111111111, "grad_norm": 1.8646711111068726, "learning_rate": 7.434163701067615e-05, "loss": 0.6336, "step": 7072 }, { "epoch": 3.1435555555555554, "grad_norm": 2.434340238571167, "learning_rate": 7.432384341637011e-05, "loss": 1.0433, "step": 7073 }, { "epoch": 3.144, "grad_norm": 3.6076550483703613, "learning_rate": 7.430604982206407e-05, "loss": 1.1586, "step": 7074 }, { "epoch": 3.1444444444444444, "grad_norm": 3.699741840362549, "learning_rate": 7.428825622775801e-05, "loss": 1.5315, "step": 7075 }, { "epoch": 3.144888888888889, "grad_norm": 3.15507435798645, "learning_rate": 7.427046263345197e-05, "loss": 1.3789, "step": 7076 }, { "epoch": 3.1453333333333333, "grad_norm": 2.9796347618103027, "learning_rate": 7.425266903914591e-05, "loss": 1.083, "step": 7077 }, { "epoch": 3.145777777777778, "grad_norm": 2.9139842987060547, "learning_rate": 7.423487544483985e-05, "loss": 1.3829, "step": 7078 }, { "epoch": 3.1462222222222223, "grad_norm": 3.6654200553894043, "learning_rate": 7.421708185053381e-05, "loss": 1.2362, "step": 7079 }, { "epoch": 3.1466666666666665, "grad_norm": 3.540692090988159, "learning_rate": 7.419928825622776e-05, "loss": 1.4958, "step": 7080 }, { "epoch": 3.147111111111111, "grad_norm": 3.061079740524292, "learning_rate": 7.418149466192171e-05, "loss": 1.317, "step": 7081 }, { "epoch": 3.1475555555555554, "grad_norm": 3.567028522491455, "learning_rate": 7.416370106761566e-05, "loss": 1.3618, "step": 7082 }, { "epoch": 3.148, "grad_norm": 3.034331798553467, "learning_rate": 7.414590747330962e-05, "loss": 0.9736, "step": 7083 }, { "epoch": 3.1484444444444444, "grad_norm": 0.4216180741786957, "learning_rate": 7.412811387900356e-05, "loss": 0.0315, "step": 7084 }, { "epoch": 3.148888888888889, "grad_norm": 2.6628007888793945, "learning_rate": 7.41103202846975e-05, "loss": 0.8623, "step": 7085 }, { "epoch": 3.1493333333333333, "grad_norm": 3.0110785961151123, "learning_rate": 7.409252669039146e-05, "loss": 1.0985, "step": 7086 }, { "epoch": 3.1497777777777776, "grad_norm": 3.624995231628418, "learning_rate": 7.407473309608542e-05, "loss": 1.1393, "step": 7087 }, { "epoch": 3.1502222222222223, "grad_norm": 4.227591514587402, "learning_rate": 7.405693950177936e-05, "loss": 1.0868, "step": 7088 }, { "epoch": 3.1506666666666665, "grad_norm": 3.4621124267578125, "learning_rate": 7.403914590747332e-05, "loss": 1.427, "step": 7089 }, { "epoch": 3.151111111111111, "grad_norm": 3.3510584831237793, "learning_rate": 7.402135231316726e-05, "loss": 1.4212, "step": 7090 }, { "epoch": 3.1515555555555554, "grad_norm": 3.500433921813965, "learning_rate": 7.40035587188612e-05, "loss": 1.1449, "step": 7091 }, { "epoch": 3.152, "grad_norm": 2.601447582244873, "learning_rate": 7.398576512455516e-05, "loss": 0.8084, "step": 7092 }, { "epoch": 3.1524444444444444, "grad_norm": 3.0802958011627197, "learning_rate": 7.396797153024912e-05, "loss": 1.1565, "step": 7093 }, { "epoch": 3.152888888888889, "grad_norm": 3.7664129734039307, "learning_rate": 7.395017793594306e-05, "loss": 1.3433, "step": 7094 }, { "epoch": 3.1533333333333333, "grad_norm": 3.885650157928467, "learning_rate": 7.393238434163702e-05, "loss": 1.2266, "step": 7095 }, { "epoch": 3.153777777777778, "grad_norm": 5.502748489379883, "learning_rate": 7.391459074733097e-05, "loss": 1.8502, "step": 7096 }, { "epoch": 3.1542222222222223, "grad_norm": 3.670057535171509, "learning_rate": 7.389679715302492e-05, "loss": 1.4684, "step": 7097 }, { "epoch": 3.1546666666666665, "grad_norm": 4.47036075592041, "learning_rate": 7.387900355871886e-05, "loss": 1.183, "step": 7098 }, { "epoch": 3.155111111111111, "grad_norm": 0.5041842460632324, "learning_rate": 7.386120996441282e-05, "loss": 0.0461, "step": 7099 }, { "epoch": 3.1555555555555554, "grad_norm": 3.492530107498169, "learning_rate": 7.384341637010677e-05, "loss": 0.8281, "step": 7100 }, { "epoch": 3.156, "grad_norm": 2.3296926021575928, "learning_rate": 7.382562277580072e-05, "loss": 1.9542, "step": 7101 }, { "epoch": 3.1564444444444444, "grad_norm": 2.73773455619812, "learning_rate": 7.380782918149467e-05, "loss": 1.7659, "step": 7102 }, { "epoch": 3.156888888888889, "grad_norm": 2.8689823150634766, "learning_rate": 7.379003558718862e-05, "loss": 1.7431, "step": 7103 }, { "epoch": 3.1573333333333333, "grad_norm": 2.7786591053009033, "learning_rate": 7.377224199288256e-05, "loss": 1.6562, "step": 7104 }, { "epoch": 3.1577777777777776, "grad_norm": 3.037637710571289, "learning_rate": 7.375444839857651e-05, "loss": 1.746, "step": 7105 }, { "epoch": 3.1582222222222223, "grad_norm": 3.2228243350982666, "learning_rate": 7.373665480427047e-05, "loss": 1.8074, "step": 7106 }, { "epoch": 3.1586666666666665, "grad_norm": 2.8026838302612305, "learning_rate": 7.371886120996441e-05, "loss": 1.786, "step": 7107 }, { "epoch": 3.159111111111111, "grad_norm": 3.4075815677642822, "learning_rate": 7.370106761565837e-05, "loss": 1.6417, "step": 7108 }, { "epoch": 3.1595555555555555, "grad_norm": 2.9315574169158936, "learning_rate": 7.368327402135233e-05, "loss": 1.3821, "step": 7109 }, { "epoch": 3.16, "grad_norm": 3.2183048725128174, "learning_rate": 7.366548042704626e-05, "loss": 1.3795, "step": 7110 }, { "epoch": 3.1604444444444444, "grad_norm": 2.97540020942688, "learning_rate": 7.364768683274021e-05, "loss": 1.6864, "step": 7111 }, { "epoch": 3.160888888888889, "grad_norm": 3.3829736709594727, "learning_rate": 7.362989323843417e-05, "loss": 1.5294, "step": 7112 }, { "epoch": 3.1613333333333333, "grad_norm": 3.1063239574432373, "learning_rate": 7.361209964412811e-05, "loss": 1.451, "step": 7113 }, { "epoch": 3.1617777777777776, "grad_norm": 3.2091407775878906, "learning_rate": 7.359430604982207e-05, "loss": 1.1878, "step": 7114 }, { "epoch": 3.1622222222222223, "grad_norm": 3.398361921310425, "learning_rate": 7.357651245551603e-05, "loss": 1.5108, "step": 7115 }, { "epoch": 3.1626666666666665, "grad_norm": 3.1675875186920166, "learning_rate": 7.355871886120997e-05, "loss": 1.294, "step": 7116 }, { "epoch": 3.163111111111111, "grad_norm": 2.810655117034912, "learning_rate": 7.354092526690391e-05, "loss": 1.2984, "step": 7117 }, { "epoch": 3.1635555555555555, "grad_norm": 3.2484841346740723, "learning_rate": 7.352313167259787e-05, "loss": 1.1205, "step": 7118 }, { "epoch": 3.164, "grad_norm": 3.3352835178375244, "learning_rate": 7.350533807829182e-05, "loss": 1.5339, "step": 7119 }, { "epoch": 3.1644444444444444, "grad_norm": 3.3871214389801025, "learning_rate": 7.348754448398577e-05, "loss": 1.3406, "step": 7120 }, { "epoch": 3.164888888888889, "grad_norm": 3.104323148727417, "learning_rate": 7.346975088967972e-05, "loss": 1.636, "step": 7121 }, { "epoch": 3.1653333333333333, "grad_norm": 2.902137041091919, "learning_rate": 7.345195729537368e-05, "loss": 1.4295, "step": 7122 }, { "epoch": 3.1657777777777776, "grad_norm": 2.7861342430114746, "learning_rate": 7.343416370106761e-05, "loss": 0.9665, "step": 7123 }, { "epoch": 3.1662222222222223, "grad_norm": 2.9004876613616943, "learning_rate": 7.341637010676157e-05, "loss": 1.2588, "step": 7124 }, { "epoch": 3.1666666666666665, "grad_norm": 2.671241521835327, "learning_rate": 7.339857651245552e-05, "loss": 1.2189, "step": 7125 }, { "epoch": 3.167111111111111, "grad_norm": 3.0016653537750244, "learning_rate": 7.338078291814947e-05, "loss": 0.9757, "step": 7126 }, { "epoch": 3.1675555555555555, "grad_norm": 3.0266857147216797, "learning_rate": 7.336298932384342e-05, "loss": 1.2218, "step": 7127 }, { "epoch": 3.168, "grad_norm": 2.868429183959961, "learning_rate": 7.334519572953738e-05, "loss": 0.8355, "step": 7128 }, { "epoch": 3.1684444444444444, "grad_norm": 3.6476175785064697, "learning_rate": 7.332740213523132e-05, "loss": 0.864, "step": 7129 }, { "epoch": 3.168888888888889, "grad_norm": 3.347186803817749, "learning_rate": 7.330960854092526e-05, "loss": 1.1723, "step": 7130 }, { "epoch": 3.1693333333333333, "grad_norm": 3.4044735431671143, "learning_rate": 7.329181494661922e-05, "loss": 1.0311, "step": 7131 }, { "epoch": 3.1697777777777776, "grad_norm": 2.955648422241211, "learning_rate": 7.327402135231318e-05, "loss": 1.1082, "step": 7132 }, { "epoch": 3.1702222222222223, "grad_norm": 3.7140543460845947, "learning_rate": 7.325622775800712e-05, "loss": 0.9562, "step": 7133 }, { "epoch": 3.1706666666666665, "grad_norm": 2.378147840499878, "learning_rate": 7.323843416370108e-05, "loss": 0.697, "step": 7134 }, { "epoch": 3.171111111111111, "grad_norm": 3.5638556480407715, "learning_rate": 7.322064056939502e-05, "loss": 1.2156, "step": 7135 }, { "epoch": 3.1715555555555555, "grad_norm": 3.8616840839385986, "learning_rate": 7.320284697508896e-05, "loss": 0.9394, "step": 7136 }, { "epoch": 3.172, "grad_norm": 3.537442684173584, "learning_rate": 7.318505338078292e-05, "loss": 1.3647, "step": 7137 }, { "epoch": 3.1724444444444444, "grad_norm": 3.341238260269165, "learning_rate": 7.316725978647688e-05, "loss": 1.0796, "step": 7138 }, { "epoch": 3.172888888888889, "grad_norm": 3.2897703647613525, "learning_rate": 7.314946619217082e-05, "loss": 0.9163, "step": 7139 }, { "epoch": 3.1733333333333333, "grad_norm": 3.147260904312134, "learning_rate": 7.313167259786478e-05, "loss": 0.9676, "step": 7140 }, { "epoch": 3.1737777777777776, "grad_norm": 4.187511444091797, "learning_rate": 7.311387900355873e-05, "loss": 1.3926, "step": 7141 }, { "epoch": 3.1742222222222223, "grad_norm": 4.7306809425354, "learning_rate": 7.309608540925267e-05, "loss": 1.3427, "step": 7142 }, { "epoch": 3.1746666666666665, "grad_norm": 3.0592243671417236, "learning_rate": 7.307829181494662e-05, "loss": 0.7532, "step": 7143 }, { "epoch": 3.175111111111111, "grad_norm": 4.022464275360107, "learning_rate": 7.306049822064057e-05, "loss": 1.3008, "step": 7144 }, { "epoch": 3.1755555555555555, "grad_norm": 4.482253551483154, "learning_rate": 7.304270462633453e-05, "loss": 1.1332, "step": 7145 }, { "epoch": 3.176, "grad_norm": 3.72058367729187, "learning_rate": 7.302491103202847e-05, "loss": 0.7426, "step": 7146 }, { "epoch": 3.1764444444444444, "grad_norm": 4.921838760375977, "learning_rate": 7.300711743772243e-05, "loss": 1.0258, "step": 7147 }, { "epoch": 3.176888888888889, "grad_norm": 4.758439540863037, "learning_rate": 7.298932384341637e-05, "loss": 1.3546, "step": 7148 }, { "epoch": 3.1773333333333333, "grad_norm": 3.1380600929260254, "learning_rate": 7.297153024911032e-05, "loss": 1.0218, "step": 7149 }, { "epoch": 3.1777777777777776, "grad_norm": 7.19141149520874, "learning_rate": 7.295373665480427e-05, "loss": 1.2103, "step": 7150 }, { "epoch": 3.1782222222222223, "grad_norm": 1.9877086877822876, "learning_rate": 7.293594306049823e-05, "loss": 1.8951, "step": 7151 }, { "epoch": 3.1786666666666665, "grad_norm": 2.509326934814453, "learning_rate": 7.291814946619217e-05, "loss": 1.9882, "step": 7152 }, { "epoch": 3.179111111111111, "grad_norm": 2.56880521774292, "learning_rate": 7.290035587188613e-05, "loss": 1.8781, "step": 7153 }, { "epoch": 3.1795555555555555, "grad_norm": 2.692734718322754, "learning_rate": 7.288256227758008e-05, "loss": 1.9204, "step": 7154 }, { "epoch": 3.18, "grad_norm": 2.635511875152588, "learning_rate": 7.286476868327401e-05, "loss": 1.1913, "step": 7155 }, { "epoch": 3.1804444444444444, "grad_norm": 2.8718338012695312, "learning_rate": 7.284697508896797e-05, "loss": 1.432, "step": 7156 }, { "epoch": 3.180888888888889, "grad_norm": 3.3866138458251953, "learning_rate": 7.282918149466193e-05, "loss": 1.7453, "step": 7157 }, { "epoch": 3.1813333333333333, "grad_norm": 3.44844913482666, "learning_rate": 7.281138790035587e-05, "loss": 1.3587, "step": 7158 }, { "epoch": 3.1817777777777776, "grad_norm": 2.8075125217437744, "learning_rate": 7.279359430604983e-05, "loss": 1.5954, "step": 7159 }, { "epoch": 3.1822222222222223, "grad_norm": 2.6318461894989014, "learning_rate": 7.277580071174378e-05, "loss": 1.2991, "step": 7160 }, { "epoch": 3.1826666666666665, "grad_norm": 2.9282736778259277, "learning_rate": 7.275800711743773e-05, "loss": 1.5801, "step": 7161 }, { "epoch": 3.1831111111111112, "grad_norm": 2.870149850845337, "learning_rate": 7.274021352313167e-05, "loss": 0.8409, "step": 7162 }, { "epoch": 3.1835555555555555, "grad_norm": 2.717829942703247, "learning_rate": 7.272241992882563e-05, "loss": 0.994, "step": 7163 }, { "epoch": 3.184, "grad_norm": 2.9092955589294434, "learning_rate": 7.270462633451958e-05, "loss": 1.6491, "step": 7164 }, { "epoch": 3.1844444444444444, "grad_norm": 2.3932883739471436, "learning_rate": 7.268683274021352e-05, "loss": 0.8707, "step": 7165 }, { "epoch": 3.1848888888888887, "grad_norm": 3.078213691711426, "learning_rate": 7.266903914590748e-05, "loss": 1.0287, "step": 7166 }, { "epoch": 3.1853333333333333, "grad_norm": 3.1520655155181885, "learning_rate": 7.265124555160144e-05, "loss": 1.1343, "step": 7167 }, { "epoch": 3.1857777777777776, "grad_norm": 3.593202829360962, "learning_rate": 7.263345195729537e-05, "loss": 1.276, "step": 7168 }, { "epoch": 3.1862222222222223, "grad_norm": 2.6163017749786377, "learning_rate": 7.261565836298932e-05, "loss": 1.226, "step": 7169 }, { "epoch": 3.1866666666666665, "grad_norm": 2.836480140686035, "learning_rate": 7.259786476868328e-05, "loss": 1.2427, "step": 7170 }, { "epoch": 3.1871111111111112, "grad_norm": 3.4099297523498535, "learning_rate": 7.258007117437722e-05, "loss": 1.4251, "step": 7171 }, { "epoch": 3.1875555555555555, "grad_norm": 2.73268461227417, "learning_rate": 7.256227758007118e-05, "loss": 1.0855, "step": 7172 }, { "epoch": 3.188, "grad_norm": 2.9737985134124756, "learning_rate": 7.254448398576514e-05, "loss": 1.451, "step": 7173 }, { "epoch": 3.1884444444444444, "grad_norm": 3.1273388862609863, "learning_rate": 7.252669039145908e-05, "loss": 1.6291, "step": 7174 }, { "epoch": 3.188888888888889, "grad_norm": 3.3678598403930664, "learning_rate": 7.250889679715302e-05, "loss": 1.6578, "step": 7175 }, { "epoch": 3.1893333333333334, "grad_norm": 3.497072458267212, "learning_rate": 7.249110320284698e-05, "loss": 1.4176, "step": 7176 }, { "epoch": 3.1897777777777776, "grad_norm": 3.735827922821045, "learning_rate": 7.247330960854094e-05, "loss": 1.5746, "step": 7177 }, { "epoch": 3.1902222222222223, "grad_norm": 2.853653907775879, "learning_rate": 7.245551601423488e-05, "loss": 1.1089, "step": 7178 }, { "epoch": 3.1906666666666665, "grad_norm": 3.3315930366516113, "learning_rate": 7.243772241992883e-05, "loss": 1.1311, "step": 7179 }, { "epoch": 3.1911111111111112, "grad_norm": 2.9672279357910156, "learning_rate": 7.241992882562279e-05, "loss": 1.3171, "step": 7180 }, { "epoch": 3.1915555555555555, "grad_norm": 3.4035391807556152, "learning_rate": 7.240213523131672e-05, "loss": 0.9667, "step": 7181 }, { "epoch": 3.192, "grad_norm": 3.72255539894104, "learning_rate": 7.238434163701068e-05, "loss": 1.36, "step": 7182 }, { "epoch": 3.1924444444444444, "grad_norm": 3.1913297176361084, "learning_rate": 7.236654804270463e-05, "loss": 1.0758, "step": 7183 }, { "epoch": 3.1928888888888887, "grad_norm": 2.7058324813842773, "learning_rate": 7.234875444839858e-05, "loss": 0.6656, "step": 7184 }, { "epoch": 3.1933333333333334, "grad_norm": 3.929481029510498, "learning_rate": 7.233096085409253e-05, "loss": 1.6081, "step": 7185 }, { "epoch": 3.1937777777777776, "grad_norm": 3.4291040897369385, "learning_rate": 7.231316725978649e-05, "loss": 1.363, "step": 7186 }, { "epoch": 3.1942222222222223, "grad_norm": 3.8611419200897217, "learning_rate": 7.229537366548043e-05, "loss": 1.6286, "step": 7187 }, { "epoch": 3.1946666666666665, "grad_norm": 2.4565515518188477, "learning_rate": 7.227758007117438e-05, "loss": 0.4925, "step": 7188 }, { "epoch": 3.1951111111111112, "grad_norm": 2.0368804931640625, "learning_rate": 7.225978647686833e-05, "loss": 0.5928, "step": 7189 }, { "epoch": 3.1955555555555555, "grad_norm": 3.8311848640441895, "learning_rate": 7.224199288256229e-05, "loss": 1.3479, "step": 7190 }, { "epoch": 3.196, "grad_norm": 4.187136173248291, "learning_rate": 7.222419928825623e-05, "loss": 0.9687, "step": 7191 }, { "epoch": 3.1964444444444444, "grad_norm": 3.8171749114990234, "learning_rate": 7.220640569395019e-05, "loss": 1.265, "step": 7192 }, { "epoch": 3.196888888888889, "grad_norm": 3.7234129905700684, "learning_rate": 7.218861209964413e-05, "loss": 1.1282, "step": 7193 }, { "epoch": 3.1973333333333334, "grad_norm": 4.388239860534668, "learning_rate": 7.217081850533807e-05, "loss": 1.2207, "step": 7194 }, { "epoch": 3.1977777777777776, "grad_norm": 3.327991247177124, "learning_rate": 7.215302491103203e-05, "loss": 1.1965, "step": 7195 }, { "epoch": 3.1982222222222223, "grad_norm": 4.21957540512085, "learning_rate": 7.213523131672599e-05, "loss": 1.4889, "step": 7196 }, { "epoch": 3.1986666666666665, "grad_norm": 4.019267559051514, "learning_rate": 7.211743772241993e-05, "loss": 1.3072, "step": 7197 }, { "epoch": 3.1991111111111112, "grad_norm": 3.848752975463867, "learning_rate": 7.209964412811389e-05, "loss": 1.0561, "step": 7198 }, { "epoch": 3.1995555555555555, "grad_norm": 3.2341089248657227, "learning_rate": 7.208185053380784e-05, "loss": 0.545, "step": 7199 }, { "epoch": 3.2, "grad_norm": 3.097386598587036, "learning_rate": 7.206405693950177e-05, "loss": 0.2964, "step": 7200 }, { "epoch": 3.2004444444444444, "grad_norm": 2.824582099914551, "learning_rate": 7.204626334519573e-05, "loss": 1.753, "step": 7201 }, { "epoch": 3.2008888888888887, "grad_norm": 2.1013436317443848, "learning_rate": 7.202846975088968e-05, "loss": 0.606, "step": 7202 }, { "epoch": 3.2013333333333334, "grad_norm": 2.8657641410827637, "learning_rate": 7.201067615658363e-05, "loss": 1.8523, "step": 7203 }, { "epoch": 3.2017777777777776, "grad_norm": 2.5109121799468994, "learning_rate": 7.199288256227758e-05, "loss": 1.1317, "step": 7204 }, { "epoch": 3.2022222222222223, "grad_norm": 3.1287527084350586, "learning_rate": 7.197508896797154e-05, "loss": 1.8141, "step": 7205 }, { "epoch": 3.2026666666666666, "grad_norm": 3.054079294204712, "learning_rate": 7.195729537366548e-05, "loss": 1.8931, "step": 7206 }, { "epoch": 3.2031111111111112, "grad_norm": 3.2487199306488037, "learning_rate": 7.193950177935943e-05, "loss": 1.4648, "step": 7207 }, { "epoch": 3.2035555555555555, "grad_norm": 2.855591058731079, "learning_rate": 7.192170818505338e-05, "loss": 1.2971, "step": 7208 }, { "epoch": 3.204, "grad_norm": 2.5365679264068604, "learning_rate": 7.190391459074734e-05, "loss": 1.2072, "step": 7209 }, { "epoch": 3.2044444444444444, "grad_norm": 3.0599007606506348, "learning_rate": 7.188612099644128e-05, "loss": 1.6128, "step": 7210 }, { "epoch": 3.204888888888889, "grad_norm": 3.139268159866333, "learning_rate": 7.186832740213524e-05, "loss": 1.2958, "step": 7211 }, { "epoch": 3.2053333333333334, "grad_norm": 3.227274179458618, "learning_rate": 7.18505338078292e-05, "loss": 1.4077, "step": 7212 }, { "epoch": 3.2057777777777776, "grad_norm": 3.3886468410491943, "learning_rate": 7.183274021352313e-05, "loss": 1.5784, "step": 7213 }, { "epoch": 3.2062222222222223, "grad_norm": 3.387749195098877, "learning_rate": 7.181494661921708e-05, "loss": 1.4535, "step": 7214 }, { "epoch": 3.2066666666666666, "grad_norm": 2.8726046085357666, "learning_rate": 7.179715302491104e-05, "loss": 1.1744, "step": 7215 }, { "epoch": 3.2071111111111112, "grad_norm": 2.2621724605560303, "learning_rate": 7.177935943060498e-05, "loss": 0.6728, "step": 7216 }, { "epoch": 3.2075555555555555, "grad_norm": 3.217412233352661, "learning_rate": 7.176156583629894e-05, "loss": 1.6073, "step": 7217 }, { "epoch": 3.208, "grad_norm": 2.7585723400115967, "learning_rate": 7.17437722419929e-05, "loss": 0.9378, "step": 7218 }, { "epoch": 3.2084444444444444, "grad_norm": 3.4076976776123047, "learning_rate": 7.172597864768684e-05, "loss": 1.3989, "step": 7219 }, { "epoch": 3.2088888888888887, "grad_norm": 2.9216151237487793, "learning_rate": 7.170818505338078e-05, "loss": 1.206, "step": 7220 }, { "epoch": 3.2093333333333334, "grad_norm": 3.7456247806549072, "learning_rate": 7.169039145907474e-05, "loss": 1.8443, "step": 7221 }, { "epoch": 3.2097777777777776, "grad_norm": 2.7865076065063477, "learning_rate": 7.167259786476869e-05, "loss": 0.9977, "step": 7222 }, { "epoch": 3.2102222222222223, "grad_norm": 3.0865001678466797, "learning_rate": 7.165480427046264e-05, "loss": 1.3247, "step": 7223 }, { "epoch": 3.2106666666666666, "grad_norm": 3.52168607711792, "learning_rate": 7.163701067615659e-05, "loss": 1.0324, "step": 7224 }, { "epoch": 3.2111111111111112, "grad_norm": 4.151998519897461, "learning_rate": 7.161921708185055e-05, "loss": 1.5058, "step": 7225 }, { "epoch": 3.2115555555555555, "grad_norm": 3.7610089778900146, "learning_rate": 7.160142348754448e-05, "loss": 1.8216, "step": 7226 }, { "epoch": 3.212, "grad_norm": 2.9817423820495605, "learning_rate": 7.158362989323843e-05, "loss": 1.3767, "step": 7227 }, { "epoch": 3.2124444444444444, "grad_norm": 2.8566582202911377, "learning_rate": 7.156583629893239e-05, "loss": 0.9962, "step": 7228 }, { "epoch": 3.2128888888888887, "grad_norm": 2.9715373516082764, "learning_rate": 7.154804270462633e-05, "loss": 1.4952, "step": 7229 }, { "epoch": 3.2133333333333334, "grad_norm": 3.730404853820801, "learning_rate": 7.153024911032029e-05, "loss": 1.5204, "step": 7230 }, { "epoch": 3.2137777777777776, "grad_norm": 3.011878490447998, "learning_rate": 7.151245551601425e-05, "loss": 1.1504, "step": 7231 }, { "epoch": 3.2142222222222223, "grad_norm": 3.1973698139190674, "learning_rate": 7.149466192170819e-05, "loss": 0.9828, "step": 7232 }, { "epoch": 3.2146666666666666, "grad_norm": 3.3064098358154297, "learning_rate": 7.147686832740213e-05, "loss": 1.5142, "step": 7233 }, { "epoch": 3.2151111111111113, "grad_norm": 2.9979686737060547, "learning_rate": 7.145907473309609e-05, "loss": 1.1844, "step": 7234 }, { "epoch": 3.2155555555555555, "grad_norm": 3.717355489730835, "learning_rate": 7.144128113879005e-05, "loss": 1.7117, "step": 7235 }, { "epoch": 3.216, "grad_norm": 4.831557273864746, "learning_rate": 7.142348754448399e-05, "loss": 1.553, "step": 7236 }, { "epoch": 3.2164444444444444, "grad_norm": 2.6884803771972656, "learning_rate": 7.140569395017795e-05, "loss": 0.7039, "step": 7237 }, { "epoch": 3.2168888888888887, "grad_norm": 3.3653738498687744, "learning_rate": 7.138790035587189e-05, "loss": 1.144, "step": 7238 }, { "epoch": 3.2173333333333334, "grad_norm": 3.712017774581909, "learning_rate": 7.137010676156583e-05, "loss": 1.2988, "step": 7239 }, { "epoch": 3.2177777777777776, "grad_norm": 4.0276360511779785, "learning_rate": 7.135231316725979e-05, "loss": 1.0672, "step": 7240 }, { "epoch": 3.2182222222222223, "grad_norm": 3.8651421070098877, "learning_rate": 7.133451957295374e-05, "loss": 1.5012, "step": 7241 }, { "epoch": 3.2186666666666666, "grad_norm": 3.2855637073516846, "learning_rate": 7.131672597864769e-05, "loss": 0.9562, "step": 7242 }, { "epoch": 3.2191111111111113, "grad_norm": 3.601529598236084, "learning_rate": 7.129893238434164e-05, "loss": 1.2631, "step": 7243 }, { "epoch": 3.2195555555555555, "grad_norm": 3.80122709274292, "learning_rate": 7.12811387900356e-05, "loss": 1.2177, "step": 7244 }, { "epoch": 3.22, "grad_norm": 4.019015789031982, "learning_rate": 7.126334519572953e-05, "loss": 1.2818, "step": 7245 }, { "epoch": 3.2204444444444444, "grad_norm": 3.7049124240875244, "learning_rate": 7.124555160142349e-05, "loss": 0.9796, "step": 7246 }, { "epoch": 3.2208888888888887, "grad_norm": 4.283298969268799, "learning_rate": 7.122775800711744e-05, "loss": 1.1318, "step": 7247 }, { "epoch": 3.2213333333333334, "grad_norm": 5.531714916229248, "learning_rate": 7.120996441281139e-05, "loss": 1.4307, "step": 7248 }, { "epoch": 3.2217777777777776, "grad_norm": 3.2561047077178955, "learning_rate": 7.119217081850534e-05, "loss": 1.09, "step": 7249 }, { "epoch": 3.2222222222222223, "grad_norm": 4.089896202087402, "learning_rate": 7.11743772241993e-05, "loss": 0.5095, "step": 7250 }, { "epoch": 3.2226666666666666, "grad_norm": 2.2933907508850098, "learning_rate": 7.115658362989324e-05, "loss": 2.2543, "step": 7251 }, { "epoch": 3.2231111111111113, "grad_norm": 2.559863567352295, "learning_rate": 7.113879003558718e-05, "loss": 1.9739, "step": 7252 }, { "epoch": 3.2235555555555555, "grad_norm": 1.9117332696914673, "learning_rate": 7.112099644128114e-05, "loss": 0.9344, "step": 7253 }, { "epoch": 3.224, "grad_norm": 2.9356801509857178, "learning_rate": 7.11032028469751e-05, "loss": 1.5221, "step": 7254 }, { "epoch": 3.2244444444444444, "grad_norm": 2.627389430999756, "learning_rate": 7.108540925266904e-05, "loss": 1.4828, "step": 7255 }, { "epoch": 3.2248888888888887, "grad_norm": 3.1114463806152344, "learning_rate": 7.1067615658363e-05, "loss": 1.4934, "step": 7256 }, { "epoch": 3.2253333333333334, "grad_norm": 2.5716917514801025, "learning_rate": 7.104982206405695e-05, "loss": 1.0995, "step": 7257 }, { "epoch": 3.2257777777777776, "grad_norm": 2.8480474948883057, "learning_rate": 7.103202846975088e-05, "loss": 1.7053, "step": 7258 }, { "epoch": 3.2262222222222223, "grad_norm": 2.725034475326538, "learning_rate": 7.101423487544484e-05, "loss": 1.1406, "step": 7259 }, { "epoch": 3.2266666666666666, "grad_norm": 2.4350764751434326, "learning_rate": 7.09964412811388e-05, "loss": 1.2509, "step": 7260 }, { "epoch": 3.2271111111111113, "grad_norm": 3.65389347076416, "learning_rate": 7.097864768683274e-05, "loss": 2.3713, "step": 7261 }, { "epoch": 3.2275555555555555, "grad_norm": 3.542847156524658, "learning_rate": 7.09608540925267e-05, "loss": 1.5322, "step": 7262 }, { "epoch": 3.228, "grad_norm": 2.748013496398926, "learning_rate": 7.094306049822065e-05, "loss": 1.3331, "step": 7263 }, { "epoch": 3.2284444444444444, "grad_norm": 3.1858341693878174, "learning_rate": 7.09252669039146e-05, "loss": 1.7167, "step": 7264 }, { "epoch": 3.2288888888888887, "grad_norm": 2.994159698486328, "learning_rate": 7.090747330960854e-05, "loss": 1.0373, "step": 7265 }, { "epoch": 3.2293333333333334, "grad_norm": 2.560145616531372, "learning_rate": 7.08896797153025e-05, "loss": 0.9071, "step": 7266 }, { "epoch": 3.2297777777777776, "grad_norm": 3.167236566543579, "learning_rate": 7.087188612099645e-05, "loss": 1.317, "step": 7267 }, { "epoch": 3.2302222222222223, "grad_norm": 2.4366395473480225, "learning_rate": 7.08540925266904e-05, "loss": 0.8397, "step": 7268 }, { "epoch": 3.2306666666666666, "grad_norm": 3.0970094203948975, "learning_rate": 7.083629893238435e-05, "loss": 1.2824, "step": 7269 }, { "epoch": 3.2311111111111113, "grad_norm": 3.605058431625366, "learning_rate": 7.08185053380783e-05, "loss": 1.4352, "step": 7270 }, { "epoch": 3.2315555555555555, "grad_norm": 3.437645673751831, "learning_rate": 7.080071174377224e-05, "loss": 1.4561, "step": 7271 }, { "epoch": 3.232, "grad_norm": 2.9223389625549316, "learning_rate": 7.078291814946619e-05, "loss": 1.146, "step": 7272 }, { "epoch": 3.2324444444444445, "grad_norm": 4.0127129554748535, "learning_rate": 7.076512455516015e-05, "loss": 1.5512, "step": 7273 }, { "epoch": 3.2328888888888887, "grad_norm": 3.0993196964263916, "learning_rate": 7.074733096085409e-05, "loss": 1.178, "step": 7274 }, { "epoch": 3.2333333333333334, "grad_norm": 3.2388551235198975, "learning_rate": 7.072953736654805e-05, "loss": 1.1, "step": 7275 }, { "epoch": 3.2337777777777776, "grad_norm": 3.486222505569458, "learning_rate": 7.0711743772242e-05, "loss": 1.5717, "step": 7276 }, { "epoch": 3.2342222222222223, "grad_norm": 3.0996615886688232, "learning_rate": 7.069395017793595e-05, "loss": 1.1912, "step": 7277 }, { "epoch": 3.2346666666666666, "grad_norm": 3.7221884727478027, "learning_rate": 7.067615658362989e-05, "loss": 1.3382, "step": 7278 }, { "epoch": 3.2351111111111113, "grad_norm": 2.7222108840942383, "learning_rate": 7.065836298932385e-05, "loss": 1.0546, "step": 7279 }, { "epoch": 3.2355555555555555, "grad_norm": 3.1601808071136475, "learning_rate": 7.06405693950178e-05, "loss": 1.0409, "step": 7280 }, { "epoch": 3.2359999999999998, "grad_norm": 3.1560659408569336, "learning_rate": 7.062277580071175e-05, "loss": 1.3843, "step": 7281 }, { "epoch": 3.2364444444444445, "grad_norm": 2.998133897781372, "learning_rate": 7.06049822064057e-05, "loss": 1.227, "step": 7282 }, { "epoch": 3.2368888888888887, "grad_norm": 2.981759548187256, "learning_rate": 7.058718861209965e-05, "loss": 1.0669, "step": 7283 }, { "epoch": 3.2373333333333334, "grad_norm": 4.712173938751221, "learning_rate": 7.056939501779359e-05, "loss": 1.5214, "step": 7284 }, { "epoch": 3.2377777777777776, "grad_norm": 3.807121515274048, "learning_rate": 7.055160142348755e-05, "loss": 1.1862, "step": 7285 }, { "epoch": 3.2382222222222223, "grad_norm": 3.189521312713623, "learning_rate": 7.05338078291815e-05, "loss": 1.0769, "step": 7286 }, { "epoch": 3.2386666666666666, "grad_norm": 3.0515081882476807, "learning_rate": 7.051601423487545e-05, "loss": 0.9761, "step": 7287 }, { "epoch": 3.2391111111111113, "grad_norm": 3.679802894592285, "learning_rate": 7.04982206405694e-05, "loss": 1.1306, "step": 7288 }, { "epoch": 3.2395555555555555, "grad_norm": 3.8492064476013184, "learning_rate": 7.048042704626336e-05, "loss": 1.0187, "step": 7289 }, { "epoch": 3.24, "grad_norm": 4.128783226013184, "learning_rate": 7.046263345195729e-05, "loss": 1.1335, "step": 7290 }, { "epoch": 3.2404444444444445, "grad_norm": 3.4628500938415527, "learning_rate": 7.044483985765124e-05, "loss": 1.4144, "step": 7291 }, { "epoch": 3.2408888888888887, "grad_norm": 3.7098937034606934, "learning_rate": 7.04270462633452e-05, "loss": 1.0089, "step": 7292 }, { "epoch": 3.2413333333333334, "grad_norm": 3.4453952312469482, "learning_rate": 7.040925266903914e-05, "loss": 1.1287, "step": 7293 }, { "epoch": 3.2417777777777776, "grad_norm": 4.36504602432251, "learning_rate": 7.03914590747331e-05, "loss": 1.435, "step": 7294 }, { "epoch": 3.2422222222222223, "grad_norm": 4.6257476806640625, "learning_rate": 7.037366548042706e-05, "loss": 1.1206, "step": 7295 }, { "epoch": 3.2426666666666666, "grad_norm": 5.652115821838379, "learning_rate": 7.0355871886121e-05, "loss": 1.1406, "step": 7296 }, { "epoch": 3.2431111111111113, "grad_norm": 3.9133806228637695, "learning_rate": 7.033807829181494e-05, "loss": 0.8279, "step": 7297 }, { "epoch": 3.2435555555555555, "grad_norm": 5.016626834869385, "learning_rate": 7.03202846975089e-05, "loss": 1.7871, "step": 7298 }, { "epoch": 3.2439999999999998, "grad_norm": 2.0724196434020996, "learning_rate": 7.030249110320286e-05, "loss": 0.4856, "step": 7299 }, { "epoch": 3.2444444444444445, "grad_norm": 5.151663303375244, "learning_rate": 7.02846975088968e-05, "loss": 1.195, "step": 7300 }, { "epoch": 3.2448888888888887, "grad_norm": 1.3501733541488647, "learning_rate": 7.026690391459075e-05, "loss": 0.0231, "step": 7301 }, { "epoch": 3.2453333333333334, "grad_norm": 2.257030963897705, "learning_rate": 7.024911032028471e-05, "loss": 1.4925, "step": 7302 }, { "epoch": 3.2457777777777777, "grad_norm": 2.3157708644866943, "learning_rate": 7.023131672597864e-05, "loss": 1.3873, "step": 7303 }, { "epoch": 3.2462222222222223, "grad_norm": 2.4795522689819336, "learning_rate": 7.02135231316726e-05, "loss": 1.7858, "step": 7304 }, { "epoch": 3.2466666666666666, "grad_norm": 3.000107765197754, "learning_rate": 7.019572953736655e-05, "loss": 2.1163, "step": 7305 }, { "epoch": 3.2471111111111113, "grad_norm": 3.0592353343963623, "learning_rate": 7.01779359430605e-05, "loss": 1.615, "step": 7306 }, { "epoch": 3.2475555555555555, "grad_norm": 3.055565357208252, "learning_rate": 7.016014234875445e-05, "loss": 1.4699, "step": 7307 }, { "epoch": 3.248, "grad_norm": 2.8485829830169678, "learning_rate": 7.014234875444841e-05, "loss": 1.61, "step": 7308 }, { "epoch": 3.2484444444444445, "grad_norm": 0.6298081278800964, "learning_rate": 7.012455516014235e-05, "loss": 0.0241, "step": 7309 }, { "epoch": 3.2488888888888887, "grad_norm": 3.0662970542907715, "learning_rate": 7.01067615658363e-05, "loss": 1.7241, "step": 7310 }, { "epoch": 3.2493333333333334, "grad_norm": 4.107135772705078, "learning_rate": 7.008896797153025e-05, "loss": 2.0917, "step": 7311 }, { "epoch": 3.2497777777777777, "grad_norm": 3.6882572174072266, "learning_rate": 7.007117437722421e-05, "loss": 1.5678, "step": 7312 }, { "epoch": 3.2502222222222223, "grad_norm": 2.740238904953003, "learning_rate": 7.005338078291815e-05, "loss": 1.3232, "step": 7313 }, { "epoch": 3.2506666666666666, "grad_norm": 3.1968138217926025, "learning_rate": 7.003558718861211e-05, "loss": 1.3232, "step": 7314 }, { "epoch": 3.2511111111111113, "grad_norm": 3.4555680751800537, "learning_rate": 7.001779359430606e-05, "loss": 1.6254, "step": 7315 }, { "epoch": 3.2515555555555555, "grad_norm": 3.412783145904541, "learning_rate": 7e-05, "loss": 1.671, "step": 7316 }, { "epoch": 3.252, "grad_norm": 3.239433765411377, "learning_rate": 6.998220640569395e-05, "loss": 1.4749, "step": 7317 }, { "epoch": 3.2524444444444445, "grad_norm": 2.9741363525390625, "learning_rate": 6.996441281138791e-05, "loss": 0.8289, "step": 7318 }, { "epoch": 3.2528888888888887, "grad_norm": 3.045731782913208, "learning_rate": 6.994661921708185e-05, "loss": 1.6571, "step": 7319 }, { "epoch": 3.2533333333333334, "grad_norm": 3.0729551315307617, "learning_rate": 6.99288256227758e-05, "loss": 1.4405, "step": 7320 }, { "epoch": 3.2537777777777777, "grad_norm": 2.7762842178344727, "learning_rate": 6.991103202846976e-05, "loss": 1.1644, "step": 7321 }, { "epoch": 3.2542222222222223, "grad_norm": 2.8989222049713135, "learning_rate": 6.98932384341637e-05, "loss": 1.2091, "step": 7322 }, { "epoch": 3.2546666666666666, "grad_norm": 3.4842469692230225, "learning_rate": 6.987544483985765e-05, "loss": 1.489, "step": 7323 }, { "epoch": 3.2551111111111113, "grad_norm": 3.5377140045166016, "learning_rate": 6.98576512455516e-05, "loss": 0.9816, "step": 7324 }, { "epoch": 3.2555555555555555, "grad_norm": 3.346153974533081, "learning_rate": 6.983985765124555e-05, "loss": 1.6473, "step": 7325 }, { "epoch": 3.2560000000000002, "grad_norm": 3.840273380279541, "learning_rate": 6.98220640569395e-05, "loss": 1.509, "step": 7326 }, { "epoch": 3.2564444444444445, "grad_norm": 3.4088332653045654, "learning_rate": 6.980427046263346e-05, "loss": 1.1997, "step": 7327 }, { "epoch": 3.2568888888888887, "grad_norm": 2.9059813022613525, "learning_rate": 6.97864768683274e-05, "loss": 1.133, "step": 7328 }, { "epoch": 3.2573333333333334, "grad_norm": 3.1663708686828613, "learning_rate": 6.976868327402135e-05, "loss": 1.2817, "step": 7329 }, { "epoch": 3.2577777777777777, "grad_norm": 3.088623523712158, "learning_rate": 6.97508896797153e-05, "loss": 1.142, "step": 7330 }, { "epoch": 3.2582222222222224, "grad_norm": 2.9888076782226562, "learning_rate": 6.973309608540926e-05, "loss": 0.8191, "step": 7331 }, { "epoch": 3.2586666666666666, "grad_norm": 3.0160036087036133, "learning_rate": 6.97153024911032e-05, "loss": 0.8145, "step": 7332 }, { "epoch": 3.2591111111111113, "grad_norm": 4.1365885734558105, "learning_rate": 6.969750889679716e-05, "loss": 1.2396, "step": 7333 }, { "epoch": 3.2595555555555555, "grad_norm": 4.00795316696167, "learning_rate": 6.967971530249112e-05, "loss": 1.7597, "step": 7334 }, { "epoch": 3.26, "grad_norm": 3.9610869884490967, "learning_rate": 6.966192170818505e-05, "loss": 1.4981, "step": 7335 }, { "epoch": 3.2604444444444445, "grad_norm": 2.927777051925659, "learning_rate": 6.9644128113879e-05, "loss": 0.9722, "step": 7336 }, { "epoch": 3.2608888888888887, "grad_norm": 3.2167749404907227, "learning_rate": 6.962633451957296e-05, "loss": 1.1545, "step": 7337 }, { "epoch": 3.2613333333333334, "grad_norm": 3.5486435890197754, "learning_rate": 6.96085409252669e-05, "loss": 1.2762, "step": 7338 }, { "epoch": 3.2617777777777777, "grad_norm": 2.870244026184082, "learning_rate": 6.959074733096086e-05, "loss": 0.8667, "step": 7339 }, { "epoch": 3.2622222222222224, "grad_norm": 3.2309038639068604, "learning_rate": 6.957295373665481e-05, "loss": 1.0299, "step": 7340 }, { "epoch": 3.2626666666666666, "grad_norm": 3.7159862518310547, "learning_rate": 6.955516014234876e-05, "loss": 1.2253, "step": 7341 }, { "epoch": 3.2631111111111113, "grad_norm": 4.002220630645752, "learning_rate": 6.95373665480427e-05, "loss": 1.0, "step": 7342 }, { "epoch": 3.2635555555555555, "grad_norm": 3.580116033554077, "learning_rate": 6.951957295373666e-05, "loss": 1.1418, "step": 7343 }, { "epoch": 3.2640000000000002, "grad_norm": 3.863196849822998, "learning_rate": 6.950177935943061e-05, "loss": 1.2002, "step": 7344 }, { "epoch": 3.2644444444444445, "grad_norm": 3.9584481716156006, "learning_rate": 6.948398576512456e-05, "loss": 1.1874, "step": 7345 }, { "epoch": 3.2648888888888887, "grad_norm": 4.6558403968811035, "learning_rate": 6.946619217081851e-05, "loss": 1.1067, "step": 7346 }, { "epoch": 3.2653333333333334, "grad_norm": 5.399629592895508, "learning_rate": 6.944839857651247e-05, "loss": 1.3862, "step": 7347 }, { "epoch": 3.2657777777777777, "grad_norm": 4.66273307800293, "learning_rate": 6.94306049822064e-05, "loss": 1.7355, "step": 7348 }, { "epoch": 3.2662222222222224, "grad_norm": 4.567107200622559, "learning_rate": 6.941281138790035e-05, "loss": 1.4238, "step": 7349 }, { "epoch": 3.2666666666666666, "grad_norm": 3.7617597579956055, "learning_rate": 6.939501779359431e-05, "loss": 0.5293, "step": 7350 }, { "epoch": 3.2671111111111113, "grad_norm": 2.0919718742370605, "learning_rate": 6.937722419928825e-05, "loss": 0.9802, "step": 7351 }, { "epoch": 3.2675555555555555, "grad_norm": 2.003411293029785, "learning_rate": 6.935943060498221e-05, "loss": 0.7958, "step": 7352 }, { "epoch": 3.268, "grad_norm": 2.802001714706421, "learning_rate": 6.934163701067617e-05, "loss": 1.5791, "step": 7353 }, { "epoch": 3.2684444444444445, "grad_norm": 2.7242465019226074, "learning_rate": 6.932384341637011e-05, "loss": 1.2121, "step": 7354 }, { "epoch": 3.2688888888888887, "grad_norm": 3.243849039077759, "learning_rate": 6.930604982206405e-05, "loss": 1.4581, "step": 7355 }, { "epoch": 3.2693333333333334, "grad_norm": 2.92507004737854, "learning_rate": 6.928825622775801e-05, "loss": 0.9443, "step": 7356 }, { "epoch": 3.2697777777777777, "grad_norm": 3.716792345046997, "learning_rate": 6.927046263345197e-05, "loss": 1.8721, "step": 7357 }, { "epoch": 3.2702222222222224, "grad_norm": 3.27911639213562, "learning_rate": 6.925266903914591e-05, "loss": 1.8725, "step": 7358 }, { "epoch": 3.2706666666666666, "grad_norm": 2.785362720489502, "learning_rate": 6.923487544483987e-05, "loss": 1.0283, "step": 7359 }, { "epoch": 3.2711111111111113, "grad_norm": 3.401243209838867, "learning_rate": 6.921708185053382e-05, "loss": 1.4464, "step": 7360 }, { "epoch": 3.2715555555555556, "grad_norm": 3.113215923309326, "learning_rate": 6.919928825622775e-05, "loss": 1.6867, "step": 7361 }, { "epoch": 3.2720000000000002, "grad_norm": 3.3423285484313965, "learning_rate": 6.918149466192171e-05, "loss": 1.2452, "step": 7362 }, { "epoch": 3.2724444444444445, "grad_norm": 2.914412021636963, "learning_rate": 6.916370106761566e-05, "loss": 1.0074, "step": 7363 }, { "epoch": 3.2728888888888887, "grad_norm": 2.851064920425415, "learning_rate": 6.914590747330961e-05, "loss": 1.2674, "step": 7364 }, { "epoch": 3.2733333333333334, "grad_norm": 3.0711374282836914, "learning_rate": 6.912811387900356e-05, "loss": 1.3246, "step": 7365 }, { "epoch": 3.2737777777777777, "grad_norm": 3.009274959564209, "learning_rate": 6.911032028469752e-05, "loss": 1.3831, "step": 7366 }, { "epoch": 3.2742222222222224, "grad_norm": 3.4454495906829834, "learning_rate": 6.909252669039146e-05, "loss": 1.5029, "step": 7367 }, { "epoch": 3.2746666666666666, "grad_norm": 3.192911386489868, "learning_rate": 6.90747330960854e-05, "loss": 1.3722, "step": 7368 }, { "epoch": 3.2751111111111113, "grad_norm": 3.413494110107422, "learning_rate": 6.905693950177936e-05, "loss": 1.3843, "step": 7369 }, { "epoch": 3.2755555555555556, "grad_norm": 2.5494213104248047, "learning_rate": 6.90391459074733e-05, "loss": 0.9536, "step": 7370 }, { "epoch": 3.276, "grad_norm": 3.192929267883301, "learning_rate": 6.902135231316726e-05, "loss": 1.687, "step": 7371 }, { "epoch": 3.2764444444444445, "grad_norm": 3.70212459564209, "learning_rate": 6.900355871886122e-05, "loss": 1.5884, "step": 7372 }, { "epoch": 3.2768888888888887, "grad_norm": 3.3133480548858643, "learning_rate": 6.898576512455516e-05, "loss": 1.5349, "step": 7373 }, { "epoch": 3.2773333333333334, "grad_norm": 3.146245241165161, "learning_rate": 6.89679715302491e-05, "loss": 1.259, "step": 7374 }, { "epoch": 3.2777777777777777, "grad_norm": 3.329066753387451, "learning_rate": 6.895017793594306e-05, "loss": 1.4778, "step": 7375 }, { "epoch": 3.2782222222222224, "grad_norm": 3.4649317264556885, "learning_rate": 6.893238434163702e-05, "loss": 1.4338, "step": 7376 }, { "epoch": 3.2786666666666666, "grad_norm": 3.8039920330047607, "learning_rate": 6.891459074733096e-05, "loss": 1.6371, "step": 7377 }, { "epoch": 3.279111111111111, "grad_norm": 3.138587474822998, "learning_rate": 6.889679715302492e-05, "loss": 1.2055, "step": 7378 }, { "epoch": 3.2795555555555556, "grad_norm": 3.8496856689453125, "learning_rate": 6.887900355871887e-05, "loss": 1.1513, "step": 7379 }, { "epoch": 3.2800000000000002, "grad_norm": 1.128354787826538, "learning_rate": 6.88612099644128e-05, "loss": 0.039, "step": 7380 }, { "epoch": 3.2804444444444445, "grad_norm": 3.4761149883270264, "learning_rate": 6.884341637010676e-05, "loss": 1.6024, "step": 7381 }, { "epoch": 3.2808888888888887, "grad_norm": 3.339864492416382, "learning_rate": 6.882562277580072e-05, "loss": 1.3504, "step": 7382 }, { "epoch": 3.2813333333333334, "grad_norm": 2.745142936706543, "learning_rate": 6.880782918149466e-05, "loss": 0.8555, "step": 7383 }, { "epoch": 3.2817777777777777, "grad_norm": 4.043299198150635, "learning_rate": 6.879003558718862e-05, "loss": 1.5035, "step": 7384 }, { "epoch": 3.2822222222222224, "grad_norm": 3.773738384246826, "learning_rate": 6.877224199288257e-05, "loss": 1.1627, "step": 7385 }, { "epoch": 3.2826666666666666, "grad_norm": 3.9867827892303467, "learning_rate": 6.875444839857652e-05, "loss": 1.5756, "step": 7386 }, { "epoch": 3.2831111111111113, "grad_norm": 3.7746942043304443, "learning_rate": 6.873665480427046e-05, "loss": 1.4665, "step": 7387 }, { "epoch": 3.2835555555555556, "grad_norm": 3.536393642425537, "learning_rate": 6.871886120996441e-05, "loss": 1.0364, "step": 7388 }, { "epoch": 3.284, "grad_norm": 3.190711498260498, "learning_rate": 6.870106761565837e-05, "loss": 0.9868, "step": 7389 }, { "epoch": 3.2844444444444445, "grad_norm": 3.708233594894409, "learning_rate": 6.868327402135231e-05, "loss": 1.3567, "step": 7390 }, { "epoch": 3.2848888888888887, "grad_norm": 3.1745657920837402, "learning_rate": 6.866548042704627e-05, "loss": 1.048, "step": 7391 }, { "epoch": 3.2853333333333334, "grad_norm": 3.826011896133423, "learning_rate": 6.864768683274023e-05, "loss": 1.1933, "step": 7392 }, { "epoch": 3.2857777777777777, "grad_norm": 3.8659939765930176, "learning_rate": 6.862989323843416e-05, "loss": 1.0345, "step": 7393 }, { "epoch": 3.2862222222222224, "grad_norm": 4.651473522186279, "learning_rate": 6.861209964412811e-05, "loss": 1.0049, "step": 7394 }, { "epoch": 3.2866666666666666, "grad_norm": 4.106167316436768, "learning_rate": 6.859430604982207e-05, "loss": 0.9622, "step": 7395 }, { "epoch": 3.287111111111111, "grad_norm": 4.003452301025391, "learning_rate": 6.857651245551601e-05, "loss": 1.0904, "step": 7396 }, { "epoch": 3.2875555555555556, "grad_norm": 0.4317832887172699, "learning_rate": 6.855871886120997e-05, "loss": 0.0452, "step": 7397 }, { "epoch": 3.288, "grad_norm": 4.068361759185791, "learning_rate": 6.854092526690393e-05, "loss": 1.0543, "step": 7398 }, { "epoch": 3.2884444444444445, "grad_norm": 3.885014772415161, "learning_rate": 6.852313167259787e-05, "loss": 0.5935, "step": 7399 }, { "epoch": 3.2888888888888888, "grad_norm": 5.79363489151001, "learning_rate": 6.850533807829181e-05, "loss": 0.9945, "step": 7400 }, { "epoch": 3.2893333333333334, "grad_norm": 2.343862295150757, "learning_rate": 6.848754448398577e-05, "loss": 1.873, "step": 7401 }, { "epoch": 3.2897777777777777, "grad_norm": 2.8761746883392334, "learning_rate": 6.846975088967972e-05, "loss": 2.2376, "step": 7402 }, { "epoch": 3.2902222222222224, "grad_norm": 2.3083155155181885, "learning_rate": 6.845195729537367e-05, "loss": 0.9427, "step": 7403 }, { "epoch": 3.2906666666666666, "grad_norm": 2.915044069290161, "learning_rate": 6.843416370106762e-05, "loss": 1.3453, "step": 7404 }, { "epoch": 3.2911111111111113, "grad_norm": 3.3559882640838623, "learning_rate": 6.841637010676158e-05, "loss": 2.2773, "step": 7405 }, { "epoch": 3.2915555555555556, "grad_norm": 2.8695106506347656, "learning_rate": 6.839857651245551e-05, "loss": 1.099, "step": 7406 }, { "epoch": 3.292, "grad_norm": 3.35927414894104, "learning_rate": 6.838078291814947e-05, "loss": 1.7923, "step": 7407 }, { "epoch": 3.2924444444444445, "grad_norm": 3.227517604827881, "learning_rate": 6.836298932384342e-05, "loss": 1.651, "step": 7408 }, { "epoch": 3.2928888888888888, "grad_norm": 3.0707974433898926, "learning_rate": 6.834519572953737e-05, "loss": 1.5187, "step": 7409 }, { "epoch": 3.2933333333333334, "grad_norm": 2.92110538482666, "learning_rate": 6.832740213523132e-05, "loss": 1.5467, "step": 7410 }, { "epoch": 3.2937777777777777, "grad_norm": 3.0438144207000732, "learning_rate": 6.830960854092528e-05, "loss": 1.6164, "step": 7411 }, { "epoch": 3.2942222222222224, "grad_norm": 3.135681629180908, "learning_rate": 6.829181494661922e-05, "loss": 1.3172, "step": 7412 }, { "epoch": 3.2946666666666666, "grad_norm": 3.0080184936523438, "learning_rate": 6.827402135231316e-05, "loss": 1.5289, "step": 7413 }, { "epoch": 3.295111111111111, "grad_norm": 2.6062517166137695, "learning_rate": 6.825622775800712e-05, "loss": 1.47, "step": 7414 }, { "epoch": 3.2955555555555556, "grad_norm": 3.1638424396514893, "learning_rate": 6.823843416370106e-05, "loss": 1.5606, "step": 7415 }, { "epoch": 3.296, "grad_norm": 3.3744962215423584, "learning_rate": 6.822064056939502e-05, "loss": 1.6891, "step": 7416 }, { "epoch": 3.2964444444444445, "grad_norm": 3.1398794651031494, "learning_rate": 6.820284697508898e-05, "loss": 1.4498, "step": 7417 }, { "epoch": 3.2968888888888888, "grad_norm": 2.805931806564331, "learning_rate": 6.818505338078292e-05, "loss": 1.0785, "step": 7418 }, { "epoch": 3.2973333333333334, "grad_norm": 2.9776318073272705, "learning_rate": 6.816725978647686e-05, "loss": 1.5049, "step": 7419 }, { "epoch": 3.2977777777777777, "grad_norm": 2.0422849655151367, "learning_rate": 6.814946619217082e-05, "loss": 0.6512, "step": 7420 }, { "epoch": 3.2982222222222224, "grad_norm": 3.6055731773376465, "learning_rate": 6.813167259786478e-05, "loss": 1.4551, "step": 7421 }, { "epoch": 3.2986666666666666, "grad_norm": 3.5920650959014893, "learning_rate": 6.811387900355872e-05, "loss": 1.2965, "step": 7422 }, { "epoch": 3.2991111111111113, "grad_norm": 2.8194003105163574, "learning_rate": 6.809608540925268e-05, "loss": 1.102, "step": 7423 }, { "epoch": 3.2995555555555556, "grad_norm": 3.0865092277526855, "learning_rate": 6.807829181494663e-05, "loss": 1.0238, "step": 7424 }, { "epoch": 3.3, "grad_norm": 3.17631459236145, "learning_rate": 6.806049822064056e-05, "loss": 1.3226, "step": 7425 }, { "epoch": 3.3004444444444445, "grad_norm": 3.16072678565979, "learning_rate": 6.804270462633452e-05, "loss": 1.5295, "step": 7426 }, { "epoch": 3.3008888888888888, "grad_norm": 2.93965482711792, "learning_rate": 6.802491103202847e-05, "loss": 0.8391, "step": 7427 }, { "epoch": 3.3013333333333335, "grad_norm": 3.356229543685913, "learning_rate": 6.800711743772242e-05, "loss": 1.6525, "step": 7428 }, { "epoch": 3.3017777777777777, "grad_norm": 3.251253128051758, "learning_rate": 6.798932384341637e-05, "loss": 1.2885, "step": 7429 }, { "epoch": 3.3022222222222224, "grad_norm": 3.1769371032714844, "learning_rate": 6.797153024911033e-05, "loss": 1.0414, "step": 7430 }, { "epoch": 3.3026666666666666, "grad_norm": 3.4345128536224365, "learning_rate": 6.795373665480427e-05, "loss": 1.0469, "step": 7431 }, { "epoch": 3.303111111111111, "grad_norm": 3.2680447101593018, "learning_rate": 6.793594306049822e-05, "loss": 1.3976, "step": 7432 }, { "epoch": 3.3035555555555556, "grad_norm": 3.0672709941864014, "learning_rate": 6.791814946619217e-05, "loss": 1.2144, "step": 7433 }, { "epoch": 3.304, "grad_norm": 3.2540392875671387, "learning_rate": 6.790035587188613e-05, "loss": 1.1354, "step": 7434 }, { "epoch": 3.3044444444444445, "grad_norm": 3.3583109378814697, "learning_rate": 6.788256227758007e-05, "loss": 1.2996, "step": 7435 }, { "epoch": 3.3048888888888888, "grad_norm": 4.012465476989746, "learning_rate": 6.786476868327403e-05, "loss": 1.5203, "step": 7436 }, { "epoch": 3.3053333333333335, "grad_norm": 3.8637561798095703, "learning_rate": 6.784697508896798e-05, "loss": 1.3783, "step": 7437 }, { "epoch": 3.3057777777777777, "grad_norm": 3.8466806411743164, "learning_rate": 6.782918149466191e-05, "loss": 1.3447, "step": 7438 }, { "epoch": 3.3062222222222224, "grad_norm": 3.448899269104004, "learning_rate": 6.781138790035587e-05, "loss": 1.1285, "step": 7439 }, { "epoch": 3.3066666666666666, "grad_norm": 3.3083012104034424, "learning_rate": 6.779359430604983e-05, "loss": 0.8823, "step": 7440 }, { "epoch": 3.3071111111111113, "grad_norm": 4.381979465484619, "learning_rate": 6.777580071174377e-05, "loss": 1.1432, "step": 7441 }, { "epoch": 3.3075555555555556, "grad_norm": 4.388772964477539, "learning_rate": 6.775800711743773e-05, "loss": 1.2684, "step": 7442 }, { "epoch": 3.308, "grad_norm": 3.7288894653320312, "learning_rate": 6.774021352313168e-05, "loss": 1.035, "step": 7443 }, { "epoch": 3.3084444444444445, "grad_norm": 4.472944736480713, "learning_rate": 6.772241992882563e-05, "loss": 1.217, "step": 7444 }, { "epoch": 3.3088888888888888, "grad_norm": 4.284884929656982, "learning_rate": 6.770462633451957e-05, "loss": 1.237, "step": 7445 }, { "epoch": 3.3093333333333335, "grad_norm": 6.807945251464844, "learning_rate": 6.768683274021353e-05, "loss": 1.2017, "step": 7446 }, { "epoch": 3.3097777777777777, "grad_norm": 4.781992435455322, "learning_rate": 6.766903914590748e-05, "loss": 1.32, "step": 7447 }, { "epoch": 3.3102222222222224, "grad_norm": 3.9543511867523193, "learning_rate": 6.765124555160142e-05, "loss": 1.0203, "step": 7448 }, { "epoch": 3.3106666666666666, "grad_norm": 4.435871124267578, "learning_rate": 6.763345195729538e-05, "loss": 1.1069, "step": 7449 }, { "epoch": 3.311111111111111, "grad_norm": 4.147932529449463, "learning_rate": 6.761565836298934e-05, "loss": 0.6211, "step": 7450 }, { "epoch": 3.3115555555555556, "grad_norm": 2.8902626037597656, "learning_rate": 6.759786476868327e-05, "loss": 1.9548, "step": 7451 }, { "epoch": 3.312, "grad_norm": 1.825205683708191, "learning_rate": 6.758007117437722e-05, "loss": 0.6829, "step": 7452 }, { "epoch": 3.3124444444444445, "grad_norm": 3.156799077987671, "learning_rate": 6.756227758007118e-05, "loss": 1.5275, "step": 7453 }, { "epoch": 3.3128888888888888, "grad_norm": 3.1396076679229736, "learning_rate": 6.754448398576512e-05, "loss": 1.4951, "step": 7454 }, { "epoch": 3.3133333333333335, "grad_norm": 3.152575731277466, "learning_rate": 6.752669039145908e-05, "loss": 0.9246, "step": 7455 }, { "epoch": 3.3137777777777777, "grad_norm": 3.4386932849884033, "learning_rate": 6.750889679715304e-05, "loss": 1.5932, "step": 7456 }, { "epoch": 3.3142222222222224, "grad_norm": 3.287623882293701, "learning_rate": 6.749110320284698e-05, "loss": 1.3995, "step": 7457 }, { "epoch": 3.3146666666666667, "grad_norm": 3.267625570297241, "learning_rate": 6.747330960854092e-05, "loss": 1.4844, "step": 7458 }, { "epoch": 3.3151111111111113, "grad_norm": 2.988413095474243, "learning_rate": 6.745551601423488e-05, "loss": 1.1216, "step": 7459 }, { "epoch": 3.3155555555555556, "grad_norm": 2.892350196838379, "learning_rate": 6.743772241992882e-05, "loss": 1.415, "step": 7460 }, { "epoch": 3.316, "grad_norm": 3.1864426136016846, "learning_rate": 6.741992882562278e-05, "loss": 1.4968, "step": 7461 }, { "epoch": 3.3164444444444445, "grad_norm": 2.9494309425354004, "learning_rate": 6.740213523131673e-05, "loss": 1.1238, "step": 7462 }, { "epoch": 3.3168888888888888, "grad_norm": 2.9198689460754395, "learning_rate": 6.738434163701068e-05, "loss": 1.5899, "step": 7463 }, { "epoch": 3.3173333333333335, "grad_norm": 3.371006965637207, "learning_rate": 6.736654804270462e-05, "loss": 1.564, "step": 7464 }, { "epoch": 3.3177777777777777, "grad_norm": 2.8129475116729736, "learning_rate": 6.734875444839858e-05, "loss": 1.0162, "step": 7465 }, { "epoch": 3.3182222222222224, "grad_norm": 3.1402010917663574, "learning_rate": 6.733096085409253e-05, "loss": 1.3211, "step": 7466 }, { "epoch": 3.3186666666666667, "grad_norm": 2.9242656230926514, "learning_rate": 6.731316725978648e-05, "loss": 1.2605, "step": 7467 }, { "epoch": 3.319111111111111, "grad_norm": 3.019500255584717, "learning_rate": 6.729537366548043e-05, "loss": 1.3583, "step": 7468 }, { "epoch": 3.3195555555555556, "grad_norm": 3.6432340145111084, "learning_rate": 6.727758007117439e-05, "loss": 1.4444, "step": 7469 }, { "epoch": 3.32, "grad_norm": 3.2907845973968506, "learning_rate": 6.725978647686833e-05, "loss": 1.1577, "step": 7470 }, { "epoch": 3.3204444444444445, "grad_norm": 2.75240421295166, "learning_rate": 6.724199288256228e-05, "loss": 1.5465, "step": 7471 }, { "epoch": 3.320888888888889, "grad_norm": 3.2636733055114746, "learning_rate": 6.722419928825623e-05, "loss": 1.212, "step": 7472 }, { "epoch": 3.3213333333333335, "grad_norm": 3.171222448348999, "learning_rate": 6.720640569395017e-05, "loss": 1.3743, "step": 7473 }, { "epoch": 3.3217777777777777, "grad_norm": 2.8894879817962646, "learning_rate": 6.718861209964413e-05, "loss": 1.5747, "step": 7474 }, { "epoch": 3.3222222222222224, "grad_norm": 3.2905118465423584, "learning_rate": 6.717081850533809e-05, "loss": 1.6193, "step": 7475 }, { "epoch": 3.3226666666666667, "grad_norm": 3.048412799835205, "learning_rate": 6.715302491103203e-05, "loss": 1.1528, "step": 7476 }, { "epoch": 3.3231111111111113, "grad_norm": 3.6255054473876953, "learning_rate": 6.713523131672597e-05, "loss": 1.3157, "step": 7477 }, { "epoch": 3.3235555555555556, "grad_norm": 3.5292255878448486, "learning_rate": 6.711743772241993e-05, "loss": 1.2811, "step": 7478 }, { "epoch": 3.324, "grad_norm": 2.83103346824646, "learning_rate": 6.709964412811389e-05, "loss": 0.9067, "step": 7479 }, { "epoch": 3.3244444444444445, "grad_norm": 3.3375396728515625, "learning_rate": 6.708185053380783e-05, "loss": 1.307, "step": 7480 }, { "epoch": 3.324888888888889, "grad_norm": 2.8192222118377686, "learning_rate": 6.706405693950179e-05, "loss": 1.0012, "step": 7481 }, { "epoch": 3.3253333333333335, "grad_norm": 3.162932872772217, "learning_rate": 6.704626334519574e-05, "loss": 1.2155, "step": 7482 }, { "epoch": 3.3257777777777777, "grad_norm": 4.290798187255859, "learning_rate": 6.702846975088967e-05, "loss": 1.3119, "step": 7483 }, { "epoch": 3.3262222222222224, "grad_norm": 3.4641189575195312, "learning_rate": 6.701067615658363e-05, "loss": 1.0565, "step": 7484 }, { "epoch": 3.3266666666666667, "grad_norm": 4.372963905334473, "learning_rate": 6.699288256227758e-05, "loss": 1.5057, "step": 7485 }, { "epoch": 3.327111111111111, "grad_norm": 3.908487319946289, "learning_rate": 6.697508896797153e-05, "loss": 1.2567, "step": 7486 }, { "epoch": 3.3275555555555556, "grad_norm": 3.5218594074249268, "learning_rate": 6.695729537366548e-05, "loss": 1.039, "step": 7487 }, { "epoch": 3.328, "grad_norm": 4.635097980499268, "learning_rate": 6.693950177935944e-05, "loss": 1.1867, "step": 7488 }, { "epoch": 3.3284444444444445, "grad_norm": 3.447298526763916, "learning_rate": 6.692170818505338e-05, "loss": 1.1312, "step": 7489 }, { "epoch": 3.328888888888889, "grad_norm": 3.3791463375091553, "learning_rate": 6.690391459074733e-05, "loss": 0.8666, "step": 7490 }, { "epoch": 3.3293333333333335, "grad_norm": 2.8901917934417725, "learning_rate": 6.688612099644128e-05, "loss": 1.1193, "step": 7491 }, { "epoch": 3.3297777777777777, "grad_norm": 3.7830843925476074, "learning_rate": 6.686832740213524e-05, "loss": 1.152, "step": 7492 }, { "epoch": 3.330222222222222, "grad_norm": 3.2505948543548584, "learning_rate": 6.685053380782918e-05, "loss": 0.9152, "step": 7493 }, { "epoch": 3.3306666666666667, "grad_norm": 4.339881896972656, "learning_rate": 6.683274021352314e-05, "loss": 1.3848, "step": 7494 }, { "epoch": 3.3311111111111114, "grad_norm": 4.330918312072754, "learning_rate": 6.68149466192171e-05, "loss": 1.5097, "step": 7495 }, { "epoch": 3.3315555555555556, "grad_norm": 3.9478585720062256, "learning_rate": 6.679715302491103e-05, "loss": 1.2268, "step": 7496 }, { "epoch": 3.332, "grad_norm": 4.484266757965088, "learning_rate": 6.677935943060498e-05, "loss": 1.1452, "step": 7497 }, { "epoch": 3.3324444444444445, "grad_norm": 3.884369373321533, "learning_rate": 6.676156583629894e-05, "loss": 1.3011, "step": 7498 }, { "epoch": 3.332888888888889, "grad_norm": 5.073292255401611, "learning_rate": 6.674377224199288e-05, "loss": 1.4814, "step": 7499 }, { "epoch": 3.3333333333333335, "grad_norm": 4.949814796447754, "learning_rate": 6.672597864768684e-05, "loss": 0.5286, "step": 7500 }, { "epoch": 3.3337777777777777, "grad_norm": 2.3786208629608154, "learning_rate": 6.67081850533808e-05, "loss": 1.8626, "step": 7501 }, { "epoch": 3.3342222222222224, "grad_norm": 2.6522817611694336, "learning_rate": 6.669039145907474e-05, "loss": 2.0315, "step": 7502 }, { "epoch": 3.3346666666666667, "grad_norm": 2.5052194595336914, "learning_rate": 6.667259786476868e-05, "loss": 1.5958, "step": 7503 }, { "epoch": 3.335111111111111, "grad_norm": 2.0900769233703613, "learning_rate": 6.665480427046264e-05, "loss": 1.0634, "step": 7504 }, { "epoch": 3.3355555555555556, "grad_norm": 3.4959940910339355, "learning_rate": 6.663701067615658e-05, "loss": 1.9428, "step": 7505 }, { "epoch": 3.336, "grad_norm": 2.677666187286377, "learning_rate": 6.661921708185054e-05, "loss": 1.0751, "step": 7506 }, { "epoch": 3.3364444444444445, "grad_norm": 3.592830181121826, "learning_rate": 6.660142348754449e-05, "loss": 2.0699, "step": 7507 }, { "epoch": 3.336888888888889, "grad_norm": 2.6081550121307373, "learning_rate": 6.658362989323844e-05, "loss": 1.142, "step": 7508 }, { "epoch": 3.3373333333333335, "grad_norm": 1.7961082458496094, "learning_rate": 6.656583629893238e-05, "loss": 0.5108, "step": 7509 }, { "epoch": 3.3377777777777777, "grad_norm": 3.024773359298706, "learning_rate": 6.654804270462633e-05, "loss": 1.2942, "step": 7510 }, { "epoch": 3.338222222222222, "grad_norm": 3.3666229248046875, "learning_rate": 6.653024911032029e-05, "loss": 1.7867, "step": 7511 }, { "epoch": 3.3386666666666667, "grad_norm": 3.1930816173553467, "learning_rate": 6.651245551601423e-05, "loss": 1.3751, "step": 7512 }, { "epoch": 3.339111111111111, "grad_norm": 2.655470609664917, "learning_rate": 6.649466192170819e-05, "loss": 1.6016, "step": 7513 }, { "epoch": 3.3395555555555556, "grad_norm": 3.1630845069885254, "learning_rate": 6.647686832740215e-05, "loss": 1.3575, "step": 7514 }, { "epoch": 3.34, "grad_norm": 3.315333843231201, "learning_rate": 6.645907473309609e-05, "loss": 1.2997, "step": 7515 }, { "epoch": 3.3404444444444445, "grad_norm": 2.40342378616333, "learning_rate": 6.644128113879003e-05, "loss": 0.9741, "step": 7516 }, { "epoch": 3.340888888888889, "grad_norm": 0.24996036291122437, "learning_rate": 6.642348754448399e-05, "loss": 0.0318, "step": 7517 }, { "epoch": 3.3413333333333335, "grad_norm": 2.008341073989868, "learning_rate": 6.640569395017793e-05, "loss": 0.5644, "step": 7518 }, { "epoch": 3.3417777777777777, "grad_norm": 3.1700870990753174, "learning_rate": 6.638790035587189e-05, "loss": 1.164, "step": 7519 }, { "epoch": 3.3422222222222224, "grad_norm": 3.5381534099578857, "learning_rate": 6.637010676156585e-05, "loss": 1.6558, "step": 7520 }, { "epoch": 3.3426666666666667, "grad_norm": 3.9801483154296875, "learning_rate": 6.635231316725979e-05, "loss": 1.6166, "step": 7521 }, { "epoch": 3.343111111111111, "grad_norm": 3.240372896194458, "learning_rate": 6.633451957295373e-05, "loss": 1.3277, "step": 7522 }, { "epoch": 3.3435555555555556, "grad_norm": 3.508293390274048, "learning_rate": 6.631672597864769e-05, "loss": 1.3398, "step": 7523 }, { "epoch": 3.344, "grad_norm": 3.4153871536254883, "learning_rate": 6.629893238434164e-05, "loss": 1.1003, "step": 7524 }, { "epoch": 3.3444444444444446, "grad_norm": 3.2748870849609375, "learning_rate": 6.628113879003559e-05, "loss": 1.3643, "step": 7525 }, { "epoch": 3.344888888888889, "grad_norm": 3.4981982707977295, "learning_rate": 6.626334519572954e-05, "loss": 1.3478, "step": 7526 }, { "epoch": 3.3453333333333335, "grad_norm": 3.316903591156006, "learning_rate": 6.62455516014235e-05, "loss": 1.2191, "step": 7527 }, { "epoch": 3.3457777777777777, "grad_norm": 3.2292816638946533, "learning_rate": 6.622775800711744e-05, "loss": 1.4199, "step": 7528 }, { "epoch": 3.346222222222222, "grad_norm": 3.875464677810669, "learning_rate": 6.620996441281139e-05, "loss": 1.4645, "step": 7529 }, { "epoch": 3.3466666666666667, "grad_norm": 3.4380528926849365, "learning_rate": 6.619217081850534e-05, "loss": 1.2479, "step": 7530 }, { "epoch": 3.347111111111111, "grad_norm": 3.452080726623535, "learning_rate": 6.617437722419929e-05, "loss": 1.1636, "step": 7531 }, { "epoch": 3.3475555555555556, "grad_norm": 3.524420738220215, "learning_rate": 6.615658362989324e-05, "loss": 1.2439, "step": 7532 }, { "epoch": 3.348, "grad_norm": 3.042452096939087, "learning_rate": 6.61387900355872e-05, "loss": 1.1566, "step": 7533 }, { "epoch": 3.3484444444444446, "grad_norm": 3.027488946914673, "learning_rate": 6.612099644128114e-05, "loss": 1.1191, "step": 7534 }, { "epoch": 3.348888888888889, "grad_norm": 3.0509836673736572, "learning_rate": 6.610320284697508e-05, "loss": 1.0986, "step": 7535 }, { "epoch": 3.3493333333333335, "grad_norm": 4.5463080406188965, "learning_rate": 6.608540925266904e-05, "loss": 1.5118, "step": 7536 }, { "epoch": 3.3497777777777777, "grad_norm": 2.0029096603393555, "learning_rate": 6.6067615658363e-05, "loss": 0.5587, "step": 7537 }, { "epoch": 3.3502222222222224, "grad_norm": 4.032919406890869, "learning_rate": 6.604982206405694e-05, "loss": 1.5668, "step": 7538 }, { "epoch": 3.3506666666666667, "grad_norm": 3.9945425987243652, "learning_rate": 6.60320284697509e-05, "loss": 1.3243, "step": 7539 }, { "epoch": 3.351111111111111, "grad_norm": 3.5870978832244873, "learning_rate": 6.601423487544485e-05, "loss": 1.1531, "step": 7540 }, { "epoch": 3.3515555555555556, "grad_norm": 3.745569944381714, "learning_rate": 6.599644128113878e-05, "loss": 1.2786, "step": 7541 }, { "epoch": 3.352, "grad_norm": 3.2785604000091553, "learning_rate": 6.597864768683274e-05, "loss": 1.0538, "step": 7542 }, { "epoch": 3.3524444444444446, "grad_norm": 4.0194621086120605, "learning_rate": 6.59608540925267e-05, "loss": 1.2927, "step": 7543 }, { "epoch": 3.352888888888889, "grad_norm": 4.010869026184082, "learning_rate": 6.594306049822064e-05, "loss": 1.2408, "step": 7544 }, { "epoch": 3.3533333333333335, "grad_norm": 4.548504829406738, "learning_rate": 6.59252669039146e-05, "loss": 0.8367, "step": 7545 }, { "epoch": 3.3537777777777777, "grad_norm": 3.896096706390381, "learning_rate": 6.590747330960855e-05, "loss": 0.8111, "step": 7546 }, { "epoch": 3.354222222222222, "grad_norm": 4.7822957038879395, "learning_rate": 6.58896797153025e-05, "loss": 1.4794, "step": 7547 }, { "epoch": 3.3546666666666667, "grad_norm": 4.28626823425293, "learning_rate": 6.587188612099644e-05, "loss": 1.0438, "step": 7548 }, { "epoch": 3.355111111111111, "grad_norm": 4.136358737945557, "learning_rate": 6.58540925266904e-05, "loss": 0.9042, "step": 7549 }, { "epoch": 3.3555555555555556, "grad_norm": 3.873260021209717, "learning_rate": 6.583629893238434e-05, "loss": 0.8692, "step": 7550 }, { "epoch": 3.356, "grad_norm": 2.481637716293335, "learning_rate": 6.58185053380783e-05, "loss": 2.0427, "step": 7551 }, { "epoch": 3.3564444444444446, "grad_norm": 2.9505603313446045, "learning_rate": 6.580071174377225e-05, "loss": 1.5954, "step": 7552 }, { "epoch": 3.356888888888889, "grad_norm": 2.6145901679992676, "learning_rate": 6.578291814946619e-05, "loss": 1.3394, "step": 7553 }, { "epoch": 3.3573333333333335, "grad_norm": 2.786313533782959, "learning_rate": 6.576512455516014e-05, "loss": 1.376, "step": 7554 }, { "epoch": 3.3577777777777778, "grad_norm": 3.3209388256073, "learning_rate": 6.574733096085409e-05, "loss": 1.4978, "step": 7555 }, { "epoch": 3.3582222222222224, "grad_norm": 3.2253782749176025, "learning_rate": 6.572953736654805e-05, "loss": 1.345, "step": 7556 }, { "epoch": 3.3586666666666667, "grad_norm": 2.5213868618011475, "learning_rate": 6.571174377224199e-05, "loss": 1.0464, "step": 7557 }, { "epoch": 3.359111111111111, "grad_norm": 3.1049644947052, "learning_rate": 6.569395017793595e-05, "loss": 1.2791, "step": 7558 }, { "epoch": 3.3595555555555556, "grad_norm": 3.2726714611053467, "learning_rate": 6.56761565836299e-05, "loss": 1.1688, "step": 7559 }, { "epoch": 3.36, "grad_norm": 3.0971052646636963, "learning_rate": 6.565836298932385e-05, "loss": 1.1853, "step": 7560 }, { "epoch": 3.3604444444444446, "grad_norm": 3.0031280517578125, "learning_rate": 6.564056939501779e-05, "loss": 1.5328, "step": 7561 }, { "epoch": 3.360888888888889, "grad_norm": 4.010336399078369, "learning_rate": 6.562277580071175e-05, "loss": 1.3606, "step": 7562 }, { "epoch": 3.3613333333333335, "grad_norm": 3.0197978019714355, "learning_rate": 6.560498220640569e-05, "loss": 1.5253, "step": 7563 }, { "epoch": 3.3617777777777778, "grad_norm": 3.6670336723327637, "learning_rate": 6.558718861209965e-05, "loss": 2.0248, "step": 7564 }, { "epoch": 3.362222222222222, "grad_norm": 3.820261001586914, "learning_rate": 6.55693950177936e-05, "loss": 1.6484, "step": 7565 }, { "epoch": 3.3626666666666667, "grad_norm": 3.1288888454437256, "learning_rate": 6.555160142348755e-05, "loss": 1.1557, "step": 7566 }, { "epoch": 3.363111111111111, "grad_norm": 3.1067354679107666, "learning_rate": 6.553380782918149e-05, "loss": 0.9204, "step": 7567 }, { "epoch": 3.3635555555555556, "grad_norm": 3.1598074436187744, "learning_rate": 6.551601423487545e-05, "loss": 1.193, "step": 7568 }, { "epoch": 3.364, "grad_norm": 2.919339656829834, "learning_rate": 6.54982206405694e-05, "loss": 1.3601, "step": 7569 }, { "epoch": 3.3644444444444446, "grad_norm": 3.384775161743164, "learning_rate": 6.548042704626335e-05, "loss": 1.3367, "step": 7570 }, { "epoch": 3.364888888888889, "grad_norm": 3.7233974933624268, "learning_rate": 6.54626334519573e-05, "loss": 1.748, "step": 7571 }, { "epoch": 3.3653333333333335, "grad_norm": 3.6494290828704834, "learning_rate": 6.544483985765126e-05, "loss": 1.646, "step": 7572 }, { "epoch": 3.3657777777777778, "grad_norm": 3.263012409210205, "learning_rate": 6.54270462633452e-05, "loss": 0.9094, "step": 7573 }, { "epoch": 3.3662222222222224, "grad_norm": 3.7953481674194336, "learning_rate": 6.540925266903914e-05, "loss": 1.4884, "step": 7574 }, { "epoch": 3.3666666666666667, "grad_norm": 2.8919782638549805, "learning_rate": 6.53914590747331e-05, "loss": 1.121, "step": 7575 }, { "epoch": 3.367111111111111, "grad_norm": 2.639195680618286, "learning_rate": 6.537366548042704e-05, "loss": 0.5362, "step": 7576 }, { "epoch": 3.3675555555555556, "grad_norm": 3.4763965606689453, "learning_rate": 6.5355871886121e-05, "loss": 1.3417, "step": 7577 }, { "epoch": 3.368, "grad_norm": 2.803255558013916, "learning_rate": 6.533807829181496e-05, "loss": 1.0939, "step": 7578 }, { "epoch": 3.3684444444444446, "grad_norm": 3.6555652618408203, "learning_rate": 6.53202846975089e-05, "loss": 1.3298, "step": 7579 }, { "epoch": 3.368888888888889, "grad_norm": 3.3960537910461426, "learning_rate": 6.530249110320284e-05, "loss": 1.1507, "step": 7580 }, { "epoch": 3.3693333333333335, "grad_norm": 2.8318889141082764, "learning_rate": 6.52846975088968e-05, "loss": 0.9001, "step": 7581 }, { "epoch": 3.3697777777777778, "grad_norm": 3.948547601699829, "learning_rate": 6.526690391459076e-05, "loss": 0.7694, "step": 7582 }, { "epoch": 3.370222222222222, "grad_norm": 3.6763882637023926, "learning_rate": 6.52491103202847e-05, "loss": 0.9653, "step": 7583 }, { "epoch": 3.3706666666666667, "grad_norm": 4.624429225921631, "learning_rate": 6.523131672597865e-05, "loss": 1.2327, "step": 7584 }, { "epoch": 3.371111111111111, "grad_norm": 3.7554173469543457, "learning_rate": 6.521352313167261e-05, "loss": 1.0042, "step": 7585 }, { "epoch": 3.3715555555555556, "grad_norm": 2.8825485706329346, "learning_rate": 6.519572953736655e-05, "loss": 0.8331, "step": 7586 }, { "epoch": 3.372, "grad_norm": 4.5201849937438965, "learning_rate": 6.51779359430605e-05, "loss": 1.8612, "step": 7587 }, { "epoch": 3.3724444444444446, "grad_norm": 3.1740362644195557, "learning_rate": 6.516014234875445e-05, "loss": 1.1159, "step": 7588 }, { "epoch": 3.372888888888889, "grad_norm": 3.3097000122070312, "learning_rate": 6.51423487544484e-05, "loss": 1.3252, "step": 7589 }, { "epoch": 3.3733333333333335, "grad_norm": 3.417339563369751, "learning_rate": 6.512455516014235e-05, "loss": 0.9649, "step": 7590 }, { "epoch": 3.3737777777777778, "grad_norm": 4.091862201690674, "learning_rate": 6.510676156583631e-05, "loss": 1.5291, "step": 7591 }, { "epoch": 3.3742222222222225, "grad_norm": 5.507560729980469, "learning_rate": 6.508896797153025e-05, "loss": 0.6388, "step": 7592 }, { "epoch": 3.3746666666666667, "grad_norm": 5.815156936645508, "learning_rate": 6.50711743772242e-05, "loss": 1.1697, "step": 7593 }, { "epoch": 3.375111111111111, "grad_norm": 3.993685245513916, "learning_rate": 6.505338078291815e-05, "loss": 1.3822, "step": 7594 }, { "epoch": 3.3755555555555556, "grad_norm": 3.4094078540802, "learning_rate": 6.50355871886121e-05, "loss": 0.8347, "step": 7595 }, { "epoch": 3.376, "grad_norm": 3.818934202194214, "learning_rate": 6.501779359430605e-05, "loss": 1.0501, "step": 7596 }, { "epoch": 3.3764444444444446, "grad_norm": 5.371906280517578, "learning_rate": 6.500000000000001e-05, "loss": 1.3774, "step": 7597 }, { "epoch": 3.376888888888889, "grad_norm": 4.580707550048828, "learning_rate": 6.498220640569395e-05, "loss": 1.2469, "step": 7598 }, { "epoch": 3.3773333333333335, "grad_norm": 5.118098258972168, "learning_rate": 6.49644128113879e-05, "loss": 1.7323, "step": 7599 }, { "epoch": 3.3777777777777778, "grad_norm": 4.494351387023926, "learning_rate": 6.494661921708185e-05, "loss": 1.426, "step": 7600 }, { "epoch": 3.378222222222222, "grad_norm": 1.999345302581787, "learning_rate": 6.49288256227758e-05, "loss": 1.0275, "step": 7601 }, { "epoch": 3.3786666666666667, "grad_norm": 2.187124729156494, "learning_rate": 6.491103202846975e-05, "loss": 0.8653, "step": 7602 }, { "epoch": 3.379111111111111, "grad_norm": 3.221345901489258, "learning_rate": 6.48932384341637e-05, "loss": 1.444, "step": 7603 }, { "epoch": 3.3795555555555556, "grad_norm": 2.575716495513916, "learning_rate": 6.487544483985766e-05, "loss": 1.2003, "step": 7604 }, { "epoch": 3.38, "grad_norm": 3.7870845794677734, "learning_rate": 6.48576512455516e-05, "loss": 1.7092, "step": 7605 }, { "epoch": 3.3804444444444446, "grad_norm": 2.4028003215789795, "learning_rate": 6.483985765124555e-05, "loss": 0.777, "step": 7606 }, { "epoch": 3.380888888888889, "grad_norm": 2.2200560569763184, "learning_rate": 6.48220640569395e-05, "loss": 0.8364, "step": 7607 }, { "epoch": 3.3813333333333335, "grad_norm": 3.2974655628204346, "learning_rate": 6.480427046263345e-05, "loss": 1.5625, "step": 7608 }, { "epoch": 3.3817777777777778, "grad_norm": 3.614440441131592, "learning_rate": 6.47864768683274e-05, "loss": 1.6034, "step": 7609 }, { "epoch": 3.3822222222222225, "grad_norm": 2.842564582824707, "learning_rate": 6.476868327402136e-05, "loss": 1.3101, "step": 7610 }, { "epoch": 3.3826666666666667, "grad_norm": 3.6041154861450195, "learning_rate": 6.47508896797153e-05, "loss": 1.7356, "step": 7611 }, { "epoch": 3.383111111111111, "grad_norm": 2.605125665664673, "learning_rate": 6.473309608540925e-05, "loss": 0.7838, "step": 7612 }, { "epoch": 3.3835555555555556, "grad_norm": 3.500422477722168, "learning_rate": 6.47153024911032e-05, "loss": 1.4996, "step": 7613 }, { "epoch": 3.384, "grad_norm": 2.843087673187256, "learning_rate": 6.469750889679716e-05, "loss": 1.1637, "step": 7614 }, { "epoch": 3.3844444444444446, "grad_norm": 2.8948328495025635, "learning_rate": 6.46797153024911e-05, "loss": 1.3661, "step": 7615 }, { "epoch": 3.384888888888889, "grad_norm": 3.8482465744018555, "learning_rate": 6.466192170818506e-05, "loss": 1.4079, "step": 7616 }, { "epoch": 3.3853333333333335, "grad_norm": 3.166999101638794, "learning_rate": 6.464412811387902e-05, "loss": 1.2834, "step": 7617 }, { "epoch": 3.3857777777777778, "grad_norm": 2.966557264328003, "learning_rate": 6.462633451957296e-05, "loss": 1.0525, "step": 7618 }, { "epoch": 3.386222222222222, "grad_norm": 3.0287766456604004, "learning_rate": 6.46085409252669e-05, "loss": 1.0802, "step": 7619 }, { "epoch": 3.3866666666666667, "grad_norm": 3.048327684402466, "learning_rate": 6.459074733096086e-05, "loss": 1.3006, "step": 7620 }, { "epoch": 3.387111111111111, "grad_norm": 2.9743454456329346, "learning_rate": 6.45729537366548e-05, "loss": 1.1943, "step": 7621 }, { "epoch": 3.3875555555555557, "grad_norm": 3.471954345703125, "learning_rate": 6.455516014234876e-05, "loss": 1.1794, "step": 7622 }, { "epoch": 3.388, "grad_norm": 3.0449366569519043, "learning_rate": 6.453736654804271e-05, "loss": 1.1742, "step": 7623 }, { "epoch": 3.3884444444444446, "grad_norm": 3.310814142227173, "learning_rate": 6.451957295373666e-05, "loss": 1.1978, "step": 7624 }, { "epoch": 3.388888888888889, "grad_norm": 3.090825319290161, "learning_rate": 6.45017793594306e-05, "loss": 1.2543, "step": 7625 }, { "epoch": 3.389333333333333, "grad_norm": 3.573758363723755, "learning_rate": 6.448398576512456e-05, "loss": 1.0293, "step": 7626 }, { "epoch": 3.389777777777778, "grad_norm": 3.0098912715911865, "learning_rate": 6.446619217081851e-05, "loss": 1.0174, "step": 7627 }, { "epoch": 3.3902222222222225, "grad_norm": 3.734511375427246, "learning_rate": 6.444839857651246e-05, "loss": 1.4176, "step": 7628 }, { "epoch": 3.3906666666666667, "grad_norm": 3.394307851791382, "learning_rate": 6.443060498220641e-05, "loss": 1.4112, "step": 7629 }, { "epoch": 3.391111111111111, "grad_norm": 3.520249605178833, "learning_rate": 6.441281138790037e-05, "loss": 1.3007, "step": 7630 }, { "epoch": 3.3915555555555557, "grad_norm": 3.260222911834717, "learning_rate": 6.439501779359431e-05, "loss": 1.1919, "step": 7631 }, { "epoch": 3.392, "grad_norm": 3.4656856060028076, "learning_rate": 6.437722419928825e-05, "loss": 1.1526, "step": 7632 }, { "epoch": 3.3924444444444446, "grad_norm": 3.670583963394165, "learning_rate": 6.435943060498221e-05, "loss": 1.222, "step": 7633 }, { "epoch": 3.392888888888889, "grad_norm": 3.5100245475769043, "learning_rate": 6.434163701067615e-05, "loss": 1.4112, "step": 7634 }, { "epoch": 3.3933333333333335, "grad_norm": 3.7049787044525146, "learning_rate": 6.432384341637011e-05, "loss": 1.3744, "step": 7635 }, { "epoch": 3.393777777777778, "grad_norm": 3.5180797576904297, "learning_rate": 6.430604982206407e-05, "loss": 1.0369, "step": 7636 }, { "epoch": 3.394222222222222, "grad_norm": 3.888057231903076, "learning_rate": 6.428825622775801e-05, "loss": 1.3029, "step": 7637 }, { "epoch": 3.3946666666666667, "grad_norm": 2.7446529865264893, "learning_rate": 6.427046263345195e-05, "loss": 1.0311, "step": 7638 }, { "epoch": 3.395111111111111, "grad_norm": 4.01584005355835, "learning_rate": 6.425266903914591e-05, "loss": 1.0342, "step": 7639 }, { "epoch": 3.3955555555555557, "grad_norm": 4.280939102172852, "learning_rate": 6.423487544483985e-05, "loss": 1.4491, "step": 7640 }, { "epoch": 3.396, "grad_norm": 4.160184383392334, "learning_rate": 6.421708185053381e-05, "loss": 1.8072, "step": 7641 }, { "epoch": 3.3964444444444446, "grad_norm": 3.48480486869812, "learning_rate": 6.419928825622777e-05, "loss": 1.0331, "step": 7642 }, { "epoch": 3.396888888888889, "grad_norm": 3.321204423904419, "learning_rate": 6.418149466192171e-05, "loss": 0.9976, "step": 7643 }, { "epoch": 3.397333333333333, "grad_norm": 3.8095834255218506, "learning_rate": 6.416370106761567e-05, "loss": 1.3911, "step": 7644 }, { "epoch": 3.397777777777778, "grad_norm": 4.368819236755371, "learning_rate": 6.414590747330961e-05, "loss": 1.1837, "step": 7645 }, { "epoch": 3.398222222222222, "grad_norm": 4.469260215759277, "learning_rate": 6.412811387900356e-05, "loss": 1.0485, "step": 7646 }, { "epoch": 3.3986666666666667, "grad_norm": 4.630403518676758, "learning_rate": 6.411032028469751e-05, "loss": 1.3338, "step": 7647 }, { "epoch": 3.399111111111111, "grad_norm": 4.27545166015625, "learning_rate": 6.409252669039146e-05, "loss": 1.071, "step": 7648 }, { "epoch": 3.3995555555555557, "grad_norm": 5.530716896057129, "learning_rate": 6.407473309608542e-05, "loss": 1.4461, "step": 7649 }, { "epoch": 3.4, "grad_norm": 5.1273193359375, "learning_rate": 6.405693950177936e-05, "loss": 0.9624, "step": 7650 }, { "epoch": 3.4004444444444446, "grad_norm": 2.5767862796783447, "learning_rate": 6.40391459074733e-05, "loss": 1.6648, "step": 7651 }, { "epoch": 3.400888888888889, "grad_norm": 2.5170185565948486, "learning_rate": 6.402135231316726e-05, "loss": 1.7532, "step": 7652 }, { "epoch": 3.4013333333333335, "grad_norm": 2.523386240005493, "learning_rate": 6.40035587188612e-05, "loss": 1.7704, "step": 7653 }, { "epoch": 3.401777777777778, "grad_norm": 2.007690668106079, "learning_rate": 6.398576512455516e-05, "loss": 0.8182, "step": 7654 }, { "epoch": 3.402222222222222, "grad_norm": 2.8901143074035645, "learning_rate": 6.396797153024912e-05, "loss": 1.8572, "step": 7655 }, { "epoch": 3.4026666666666667, "grad_norm": 3.179241895675659, "learning_rate": 6.395017793594306e-05, "loss": 1.5891, "step": 7656 }, { "epoch": 3.403111111111111, "grad_norm": 3.0536370277404785, "learning_rate": 6.3932384341637e-05, "loss": 1.4515, "step": 7657 }, { "epoch": 3.4035555555555557, "grad_norm": 2.9210469722747803, "learning_rate": 6.391459074733096e-05, "loss": 1.8337, "step": 7658 }, { "epoch": 3.404, "grad_norm": 3.83801531791687, "learning_rate": 6.389679715302492e-05, "loss": 1.8063, "step": 7659 }, { "epoch": 3.4044444444444446, "grad_norm": 2.360499382019043, "learning_rate": 6.387900355871886e-05, "loss": 0.762, "step": 7660 }, { "epoch": 3.404888888888889, "grad_norm": 3.5909371376037598, "learning_rate": 6.386120996441282e-05, "loss": 1.4551, "step": 7661 }, { "epoch": 3.405333333333333, "grad_norm": 3.15690541267395, "learning_rate": 6.384341637010677e-05, "loss": 1.387, "step": 7662 }, { "epoch": 3.405777777777778, "grad_norm": 3.711642026901245, "learning_rate": 6.382562277580072e-05, "loss": 1.6299, "step": 7663 }, { "epoch": 3.406222222222222, "grad_norm": 3.540889263153076, "learning_rate": 6.380782918149466e-05, "loss": 1.5186, "step": 7664 }, { "epoch": 3.4066666666666667, "grad_norm": 3.2987284660339355, "learning_rate": 6.379003558718862e-05, "loss": 1.0771, "step": 7665 }, { "epoch": 3.407111111111111, "grad_norm": 3.3886878490448, "learning_rate": 6.377224199288256e-05, "loss": 1.8004, "step": 7666 }, { "epoch": 3.4075555555555557, "grad_norm": 2.921910524368286, "learning_rate": 6.375444839857652e-05, "loss": 1.3376, "step": 7667 }, { "epoch": 3.408, "grad_norm": 0.2479649782180786, "learning_rate": 6.373665480427047e-05, "loss": 0.0287, "step": 7668 }, { "epoch": 3.4084444444444446, "grad_norm": 2.7667505741119385, "learning_rate": 6.371886120996441e-05, "loss": 1.1423, "step": 7669 }, { "epoch": 3.408888888888889, "grad_norm": 2.919297933578491, "learning_rate": 6.370106761565836e-05, "loss": 1.5937, "step": 7670 }, { "epoch": 3.4093333333333335, "grad_norm": 3.060208559036255, "learning_rate": 6.368327402135231e-05, "loss": 1.3546, "step": 7671 }, { "epoch": 3.409777777777778, "grad_norm": 3.8419125080108643, "learning_rate": 6.366548042704627e-05, "loss": 1.6202, "step": 7672 }, { "epoch": 3.410222222222222, "grad_norm": 3.074181318283081, "learning_rate": 6.364768683274021e-05, "loss": 1.1265, "step": 7673 }, { "epoch": 3.4106666666666667, "grad_norm": 3.2371325492858887, "learning_rate": 6.362989323843417e-05, "loss": 0.9814, "step": 7674 }, { "epoch": 3.411111111111111, "grad_norm": 3.5532615184783936, "learning_rate": 6.361209964412813e-05, "loss": 1.5752, "step": 7675 }, { "epoch": 3.4115555555555557, "grad_norm": 2.725200653076172, "learning_rate": 6.359430604982207e-05, "loss": 0.9369, "step": 7676 }, { "epoch": 3.412, "grad_norm": 2.41279673576355, "learning_rate": 6.357651245551601e-05, "loss": 1.2278, "step": 7677 }, { "epoch": 3.4124444444444446, "grad_norm": 3.5397229194641113, "learning_rate": 6.355871886120997e-05, "loss": 1.1223, "step": 7678 }, { "epoch": 3.412888888888889, "grad_norm": 3.2247607707977295, "learning_rate": 6.354092526690391e-05, "loss": 1.6677, "step": 7679 }, { "epoch": 3.413333333333333, "grad_norm": 3.0843541622161865, "learning_rate": 6.352313167259787e-05, "loss": 1.3358, "step": 7680 }, { "epoch": 3.413777777777778, "grad_norm": 3.328740358352661, "learning_rate": 6.350533807829183e-05, "loss": 1.5565, "step": 7681 }, { "epoch": 3.414222222222222, "grad_norm": 2.958048105239868, "learning_rate": 6.348754448398577e-05, "loss": 1.2256, "step": 7682 }, { "epoch": 3.4146666666666667, "grad_norm": 0.26679933071136475, "learning_rate": 6.346975088967971e-05, "loss": 0.0342, "step": 7683 }, { "epoch": 3.415111111111111, "grad_norm": 4.010861396789551, "learning_rate": 6.345195729537367e-05, "loss": 1.9807, "step": 7684 }, { "epoch": 3.4155555555555557, "grad_norm": 3.461297035217285, "learning_rate": 6.343416370106761e-05, "loss": 1.4831, "step": 7685 }, { "epoch": 3.416, "grad_norm": 3.9611904621124268, "learning_rate": 6.341637010676157e-05, "loss": 0.9961, "step": 7686 }, { "epoch": 3.4164444444444446, "grad_norm": 3.7136943340301514, "learning_rate": 6.339857651245552e-05, "loss": 1.3796, "step": 7687 }, { "epoch": 3.416888888888889, "grad_norm": 3.2812464237213135, "learning_rate": 6.338078291814947e-05, "loss": 1.3365, "step": 7688 }, { "epoch": 3.4173333333333336, "grad_norm": 3.156381130218506, "learning_rate": 6.336298932384342e-05, "loss": 1.0016, "step": 7689 }, { "epoch": 3.417777777777778, "grad_norm": 1.95352041721344, "learning_rate": 6.334519572953737e-05, "loss": 0.5333, "step": 7690 }, { "epoch": 3.418222222222222, "grad_norm": 3.408022880554199, "learning_rate": 6.332740213523132e-05, "loss": 1.2858, "step": 7691 }, { "epoch": 3.4186666666666667, "grad_norm": 4.81541633605957, "learning_rate": 6.330960854092527e-05, "loss": 1.3646, "step": 7692 }, { "epoch": 3.419111111111111, "grad_norm": 3.8997044563293457, "learning_rate": 6.329181494661922e-05, "loss": 1.4012, "step": 7693 }, { "epoch": 3.4195555555555557, "grad_norm": 3.6768176555633545, "learning_rate": 6.327402135231318e-05, "loss": 1.3647, "step": 7694 }, { "epoch": 3.42, "grad_norm": 3.490180015563965, "learning_rate": 6.325622775800712e-05, "loss": 0.9646, "step": 7695 }, { "epoch": 3.4204444444444446, "grad_norm": 3.8013601303100586, "learning_rate": 6.323843416370106e-05, "loss": 1.5347, "step": 7696 }, { "epoch": 3.420888888888889, "grad_norm": 4.499445915222168, "learning_rate": 6.322064056939502e-05, "loss": 1.0899, "step": 7697 }, { "epoch": 3.421333333333333, "grad_norm": 3.4003989696502686, "learning_rate": 6.320284697508896e-05, "loss": 0.9259, "step": 7698 }, { "epoch": 3.421777777777778, "grad_norm": 4.400297164916992, "learning_rate": 6.318505338078292e-05, "loss": 1.6985, "step": 7699 }, { "epoch": 3.422222222222222, "grad_norm": 4.983489513397217, "learning_rate": 6.316725978647688e-05, "loss": 1.1409, "step": 7700 }, { "epoch": 3.4226666666666667, "grad_norm": 2.452336311340332, "learning_rate": 6.314946619217082e-05, "loss": 1.7351, "step": 7701 }, { "epoch": 3.423111111111111, "grad_norm": 2.2893435955047607, "learning_rate": 6.313167259786478e-05, "loss": 2.1632, "step": 7702 }, { "epoch": 3.4235555555555557, "grad_norm": 2.175917625427246, "learning_rate": 6.311387900355872e-05, "loss": 1.0005, "step": 7703 }, { "epoch": 3.424, "grad_norm": 2.591916084289551, "learning_rate": 6.309608540925268e-05, "loss": 1.5803, "step": 7704 }, { "epoch": 3.4244444444444446, "grad_norm": 2.7328264713287354, "learning_rate": 6.307829181494662e-05, "loss": 1.6264, "step": 7705 }, { "epoch": 3.424888888888889, "grad_norm": 2.771141767501831, "learning_rate": 6.306049822064057e-05, "loss": 1.3346, "step": 7706 }, { "epoch": 3.4253333333333336, "grad_norm": 3.2554972171783447, "learning_rate": 6.304270462633453e-05, "loss": 1.3667, "step": 7707 }, { "epoch": 3.425777777777778, "grad_norm": 4.142671585083008, "learning_rate": 6.302491103202847e-05, "loss": 1.5744, "step": 7708 }, { "epoch": 3.426222222222222, "grad_norm": 3.247718095779419, "learning_rate": 6.300711743772242e-05, "loss": 1.2485, "step": 7709 }, { "epoch": 3.4266666666666667, "grad_norm": 3.285029649734497, "learning_rate": 6.298932384341637e-05, "loss": 1.3671, "step": 7710 }, { "epoch": 3.427111111111111, "grad_norm": 2.8541810512542725, "learning_rate": 6.297153024911032e-05, "loss": 1.0453, "step": 7711 }, { "epoch": 3.4275555555555557, "grad_norm": 3.167170286178589, "learning_rate": 6.295373665480427e-05, "loss": 1.1664, "step": 7712 }, { "epoch": 3.428, "grad_norm": 3.222472667694092, "learning_rate": 6.293594306049823e-05, "loss": 1.0299, "step": 7713 }, { "epoch": 3.4284444444444446, "grad_norm": 3.7111828327178955, "learning_rate": 6.291814946619217e-05, "loss": 1.3693, "step": 7714 }, { "epoch": 3.428888888888889, "grad_norm": 3.3316597938537598, "learning_rate": 6.290035587188612e-05, "loss": 1.7048, "step": 7715 }, { "epoch": 3.429333333333333, "grad_norm": 2.9972000122070312, "learning_rate": 6.288256227758007e-05, "loss": 1.1209, "step": 7716 }, { "epoch": 3.429777777777778, "grad_norm": 3.5281057357788086, "learning_rate": 6.286476868327403e-05, "loss": 1.6927, "step": 7717 }, { "epoch": 3.430222222222222, "grad_norm": 3.553957939147949, "learning_rate": 6.284697508896797e-05, "loss": 1.5864, "step": 7718 }, { "epoch": 3.4306666666666668, "grad_norm": 4.420113563537598, "learning_rate": 6.282918149466193e-05, "loss": 1.5708, "step": 7719 }, { "epoch": 3.431111111111111, "grad_norm": 4.024765968322754, "learning_rate": 6.281138790035588e-05, "loss": 1.3665, "step": 7720 }, { "epoch": 3.4315555555555557, "grad_norm": 3.158511161804199, "learning_rate": 6.279359430604983e-05, "loss": 1.1667, "step": 7721 }, { "epoch": 3.432, "grad_norm": 3.0785789489746094, "learning_rate": 6.277580071174377e-05, "loss": 1.1982, "step": 7722 }, { "epoch": 3.4324444444444446, "grad_norm": 2.7140047550201416, "learning_rate": 6.275800711743773e-05, "loss": 1.1619, "step": 7723 }, { "epoch": 3.432888888888889, "grad_norm": 3.3508801460266113, "learning_rate": 6.274021352313167e-05, "loss": 1.3535, "step": 7724 }, { "epoch": 3.4333333333333336, "grad_norm": 2.8375468254089355, "learning_rate": 6.272241992882563e-05, "loss": 1.1456, "step": 7725 }, { "epoch": 3.433777777777778, "grad_norm": 2.967170000076294, "learning_rate": 6.270462633451958e-05, "loss": 1.1639, "step": 7726 }, { "epoch": 3.434222222222222, "grad_norm": 3.734205484390259, "learning_rate": 6.268683274021353e-05, "loss": 1.4107, "step": 7727 }, { "epoch": 3.4346666666666668, "grad_norm": 3.417689085006714, "learning_rate": 6.266903914590747e-05, "loss": 1.2238, "step": 7728 }, { "epoch": 3.435111111111111, "grad_norm": 3.4484238624572754, "learning_rate": 6.265124555160143e-05, "loss": 1.1604, "step": 7729 }, { "epoch": 3.4355555555555557, "grad_norm": 1.4519230127334595, "learning_rate": 6.263345195729537e-05, "loss": 0.2928, "step": 7730 }, { "epoch": 3.436, "grad_norm": 3.699568748474121, "learning_rate": 6.261565836298932e-05, "loss": 1.5317, "step": 7731 }, { "epoch": 3.4364444444444446, "grad_norm": 3.111086845397949, "learning_rate": 6.259786476868328e-05, "loss": 1.3173, "step": 7732 }, { "epoch": 3.436888888888889, "grad_norm": 2.9475185871124268, "learning_rate": 6.258007117437722e-05, "loss": 0.8653, "step": 7733 }, { "epoch": 3.437333333333333, "grad_norm": 3.5937764644622803, "learning_rate": 6.256227758007118e-05, "loss": 1.4628, "step": 7734 }, { "epoch": 3.437777777777778, "grad_norm": 3.6315152645111084, "learning_rate": 6.254448398576512e-05, "loss": 1.6076, "step": 7735 }, { "epoch": 3.438222222222222, "grad_norm": 3.4667177200317383, "learning_rate": 6.252669039145908e-05, "loss": 1.4787, "step": 7736 }, { "epoch": 3.4386666666666668, "grad_norm": 3.2693827152252197, "learning_rate": 6.250889679715302e-05, "loss": 1.1539, "step": 7737 }, { "epoch": 3.439111111111111, "grad_norm": 3.5391995906829834, "learning_rate": 6.249110320284698e-05, "loss": 1.1326, "step": 7738 }, { "epoch": 3.4395555555555557, "grad_norm": 4.569096088409424, "learning_rate": 6.247330960854094e-05, "loss": 1.5917, "step": 7739 }, { "epoch": 3.44, "grad_norm": 3.872166633605957, "learning_rate": 6.245551601423488e-05, "loss": 1.3355, "step": 7740 }, { "epoch": 3.4404444444444446, "grad_norm": 3.8824992179870605, "learning_rate": 6.243772241992882e-05, "loss": 1.5446, "step": 7741 }, { "epoch": 3.440888888888889, "grad_norm": 3.592456579208374, "learning_rate": 6.241992882562278e-05, "loss": 0.9791, "step": 7742 }, { "epoch": 3.4413333333333336, "grad_norm": 3.4050142765045166, "learning_rate": 6.240213523131672e-05, "loss": 0.8251, "step": 7743 }, { "epoch": 3.441777777777778, "grad_norm": 3.6496431827545166, "learning_rate": 6.238434163701068e-05, "loss": 1.2421, "step": 7744 }, { "epoch": 3.442222222222222, "grad_norm": 4.084912300109863, "learning_rate": 6.236654804270463e-05, "loss": 1.1315, "step": 7745 }, { "epoch": 3.4426666666666668, "grad_norm": 4.4305033683776855, "learning_rate": 6.234875444839858e-05, "loss": 0.8433, "step": 7746 }, { "epoch": 3.443111111111111, "grad_norm": 4.542752742767334, "learning_rate": 6.233096085409253e-05, "loss": 1.3728, "step": 7747 }, { "epoch": 3.4435555555555557, "grad_norm": 4.393490791320801, "learning_rate": 6.231316725978648e-05, "loss": 1.4556, "step": 7748 }, { "epoch": 3.444, "grad_norm": 4.165998458862305, "learning_rate": 6.229537366548043e-05, "loss": 0.8499, "step": 7749 }, { "epoch": 3.4444444444444446, "grad_norm": 3.2756288051605225, "learning_rate": 6.227758007117438e-05, "loss": 0.59, "step": 7750 }, { "epoch": 3.444888888888889, "grad_norm": 1.687354564666748, "learning_rate": 6.225978647686833e-05, "loss": 0.7765, "step": 7751 }, { "epoch": 3.445333333333333, "grad_norm": 2.0984833240509033, "learning_rate": 6.224199288256229e-05, "loss": 1.1706, "step": 7752 }, { "epoch": 3.445777777777778, "grad_norm": 1.5732635259628296, "learning_rate": 6.222419928825623e-05, "loss": 0.4636, "step": 7753 }, { "epoch": 3.446222222222222, "grad_norm": 2.6701159477233887, "learning_rate": 6.220640569395018e-05, "loss": 1.3432, "step": 7754 }, { "epoch": 3.4466666666666668, "grad_norm": 2.630073308944702, "learning_rate": 6.218861209964413e-05, "loss": 1.229, "step": 7755 }, { "epoch": 3.447111111111111, "grad_norm": 3.405998945236206, "learning_rate": 6.217081850533807e-05, "loss": 1.6136, "step": 7756 }, { "epoch": 3.4475555555555557, "grad_norm": 3.1982181072235107, "learning_rate": 6.215302491103203e-05, "loss": 1.4239, "step": 7757 }, { "epoch": 3.448, "grad_norm": 3.283080816268921, "learning_rate": 6.213523131672599e-05, "loss": 1.5826, "step": 7758 }, { "epoch": 3.448444444444444, "grad_norm": 3.861956834793091, "learning_rate": 6.211743772241993e-05, "loss": 1.9406, "step": 7759 }, { "epoch": 3.448888888888889, "grad_norm": 3.5665135383605957, "learning_rate": 6.209964412811389e-05, "loss": 1.5776, "step": 7760 }, { "epoch": 3.449333333333333, "grad_norm": 3.1784329414367676, "learning_rate": 6.208185053380783e-05, "loss": 1.1607, "step": 7761 }, { "epoch": 3.449777777777778, "grad_norm": 3.31244158744812, "learning_rate": 6.206405693950179e-05, "loss": 1.2746, "step": 7762 }, { "epoch": 3.450222222222222, "grad_norm": 3.8500120639801025, "learning_rate": 6.204626334519573e-05, "loss": 1.4645, "step": 7763 }, { "epoch": 3.4506666666666668, "grad_norm": 3.650425434112549, "learning_rate": 6.202846975088969e-05, "loss": 1.5679, "step": 7764 }, { "epoch": 3.451111111111111, "grad_norm": 3.314197063446045, "learning_rate": 6.201067615658364e-05, "loss": 1.236, "step": 7765 }, { "epoch": 3.4515555555555557, "grad_norm": 3.924046277999878, "learning_rate": 6.199288256227759e-05, "loss": 1.4479, "step": 7766 }, { "epoch": 3.452, "grad_norm": 4.836942672729492, "learning_rate": 6.197508896797153e-05, "loss": 1.4685, "step": 7767 }, { "epoch": 3.4524444444444446, "grad_norm": 3.9449543952941895, "learning_rate": 6.195729537366548e-05, "loss": 1.5383, "step": 7768 }, { "epoch": 3.452888888888889, "grad_norm": 3.259953737258911, "learning_rate": 6.193950177935943e-05, "loss": 1.3374, "step": 7769 }, { "epoch": 3.453333333333333, "grad_norm": 3.4236366748809814, "learning_rate": 6.192170818505338e-05, "loss": 1.4552, "step": 7770 }, { "epoch": 3.453777777777778, "grad_norm": 3.391941547393799, "learning_rate": 6.190391459074734e-05, "loss": 1.3679, "step": 7771 }, { "epoch": 3.454222222222222, "grad_norm": 2.923438787460327, "learning_rate": 6.188612099644128e-05, "loss": 1.0185, "step": 7772 }, { "epoch": 3.4546666666666668, "grad_norm": 3.636385202407837, "learning_rate": 6.186832740213523e-05, "loss": 1.2137, "step": 7773 }, { "epoch": 3.455111111111111, "grad_norm": 3.6334786415100098, "learning_rate": 6.185053380782918e-05, "loss": 1.2733, "step": 7774 }, { "epoch": 3.4555555555555557, "grad_norm": 3.618476152420044, "learning_rate": 6.183274021352313e-05, "loss": 1.4823, "step": 7775 }, { "epoch": 3.456, "grad_norm": 3.8086905479431152, "learning_rate": 6.181494661921708e-05, "loss": 1.2488, "step": 7776 }, { "epoch": 3.456444444444444, "grad_norm": 4.449453353881836, "learning_rate": 6.179715302491104e-05, "loss": 1.2205, "step": 7777 }, { "epoch": 3.456888888888889, "grad_norm": 2.2830700874328613, "learning_rate": 6.177935943060498e-05, "loss": 0.576, "step": 7778 }, { "epoch": 3.457333333333333, "grad_norm": 3.2258355617523193, "learning_rate": 6.176156583629894e-05, "loss": 1.0031, "step": 7779 }, { "epoch": 3.457777777777778, "grad_norm": 3.3974528312683105, "learning_rate": 6.174377224199288e-05, "loss": 1.2962, "step": 7780 }, { "epoch": 3.458222222222222, "grad_norm": 3.8626692295074463, "learning_rate": 6.172597864768684e-05, "loss": 1.663, "step": 7781 }, { "epoch": 3.458666666666667, "grad_norm": 2.1425042152404785, "learning_rate": 6.170818505338078e-05, "loss": 0.5912, "step": 7782 }, { "epoch": 3.459111111111111, "grad_norm": 3.64699649810791, "learning_rate": 6.169039145907474e-05, "loss": 1.5683, "step": 7783 }, { "epoch": 3.4595555555555557, "grad_norm": 3.0158698558807373, "learning_rate": 6.16725978647687e-05, "loss": 1.1897, "step": 7784 }, { "epoch": 3.46, "grad_norm": 2.8843777179718018, "learning_rate": 6.165480427046264e-05, "loss": 0.9849, "step": 7785 }, { "epoch": 3.4604444444444447, "grad_norm": 4.387118339538574, "learning_rate": 6.163701067615658e-05, "loss": 1.2511, "step": 7786 }, { "epoch": 3.460888888888889, "grad_norm": 3.0603246688842773, "learning_rate": 6.161921708185054e-05, "loss": 1.0083, "step": 7787 }, { "epoch": 3.461333333333333, "grad_norm": 3.8462274074554443, "learning_rate": 6.160142348754448e-05, "loss": 1.4729, "step": 7788 }, { "epoch": 3.461777777777778, "grad_norm": 3.917243719100952, "learning_rate": 6.158362989323844e-05, "loss": 1.104, "step": 7789 }, { "epoch": 3.462222222222222, "grad_norm": 4.193759441375732, "learning_rate": 6.156583629893239e-05, "loss": 1.4846, "step": 7790 }, { "epoch": 3.462666666666667, "grad_norm": 3.3156356811523438, "learning_rate": 6.154804270462634e-05, "loss": 0.9882, "step": 7791 }, { "epoch": 3.463111111111111, "grad_norm": 4.5232062339782715, "learning_rate": 6.153024911032029e-05, "loss": 1.1761, "step": 7792 }, { "epoch": 3.4635555555555557, "grad_norm": 3.9281444549560547, "learning_rate": 6.151245551601423e-05, "loss": 1.3836, "step": 7793 }, { "epoch": 3.464, "grad_norm": 5.008172035217285, "learning_rate": 6.149466192170819e-05, "loss": 1.4874, "step": 7794 }, { "epoch": 3.464444444444444, "grad_norm": 5.387003421783447, "learning_rate": 6.147686832740213e-05, "loss": 1.0767, "step": 7795 }, { "epoch": 3.464888888888889, "grad_norm": 5.167206764221191, "learning_rate": 6.145907473309609e-05, "loss": 1.3537, "step": 7796 }, { "epoch": 3.465333333333333, "grad_norm": 3.742565870285034, "learning_rate": 6.144128113879005e-05, "loss": 0.7384, "step": 7797 }, { "epoch": 3.465777777777778, "grad_norm": 3.991476058959961, "learning_rate": 6.142348754448399e-05, "loss": 1.1434, "step": 7798 }, { "epoch": 3.466222222222222, "grad_norm": 2.9769797325134277, "learning_rate": 6.140569395017793e-05, "loss": 0.496, "step": 7799 }, { "epoch": 3.466666666666667, "grad_norm": 3.5614140033721924, "learning_rate": 6.138790035587189e-05, "loss": 0.4358, "step": 7800 }, { "epoch": 3.467111111111111, "grad_norm": 2.5742027759552, "learning_rate": 6.137010676156583e-05, "loss": 2.0281, "step": 7801 }, { "epoch": 3.4675555555555557, "grad_norm": 2.53623366355896, "learning_rate": 6.135231316725979e-05, "loss": 1.6929, "step": 7802 }, { "epoch": 3.468, "grad_norm": 2.944135904312134, "learning_rate": 6.133451957295375e-05, "loss": 1.7063, "step": 7803 }, { "epoch": 3.4684444444444447, "grad_norm": 2.7259559631347656, "learning_rate": 6.131672597864769e-05, "loss": 1.2452, "step": 7804 }, { "epoch": 3.468888888888889, "grad_norm": 3.2671380043029785, "learning_rate": 6.129893238434164e-05, "loss": 1.7628, "step": 7805 }, { "epoch": 3.469333333333333, "grad_norm": 2.8245341777801514, "learning_rate": 6.128113879003559e-05, "loss": 1.6624, "step": 7806 }, { "epoch": 3.469777777777778, "grad_norm": 2.8411941528320312, "learning_rate": 6.126334519572954e-05, "loss": 1.4877, "step": 7807 }, { "epoch": 3.470222222222222, "grad_norm": 3.039693593978882, "learning_rate": 6.124555160142349e-05, "loss": 1.8058, "step": 7808 }, { "epoch": 3.470666666666667, "grad_norm": 3.047823667526245, "learning_rate": 6.122775800711744e-05, "loss": 1.0683, "step": 7809 }, { "epoch": 3.471111111111111, "grad_norm": 3.6758055686950684, "learning_rate": 6.12099644128114e-05, "loss": 1.3514, "step": 7810 }, { "epoch": 3.4715555555555557, "grad_norm": 2.9851505756378174, "learning_rate": 6.119217081850534e-05, "loss": 1.1167, "step": 7811 }, { "epoch": 3.472, "grad_norm": 2.0843656063079834, "learning_rate": 6.117437722419929e-05, "loss": 0.533, "step": 7812 }, { "epoch": 3.4724444444444442, "grad_norm": 3.570765733718872, "learning_rate": 6.115658362989324e-05, "loss": 1.5857, "step": 7813 }, { "epoch": 3.472888888888889, "grad_norm": 2.951054573059082, "learning_rate": 6.113879003558719e-05, "loss": 0.9994, "step": 7814 }, { "epoch": 3.473333333333333, "grad_norm": 3.410421133041382, "learning_rate": 6.112099644128114e-05, "loss": 1.492, "step": 7815 }, { "epoch": 3.473777777777778, "grad_norm": 3.0904409885406494, "learning_rate": 6.11032028469751e-05, "loss": 1.6493, "step": 7816 }, { "epoch": 3.474222222222222, "grad_norm": 3.2813360691070557, "learning_rate": 6.108540925266904e-05, "loss": 1.2129, "step": 7817 }, { "epoch": 3.474666666666667, "grad_norm": 3.62602162361145, "learning_rate": 6.1067615658363e-05, "loss": 1.2179, "step": 7818 }, { "epoch": 3.475111111111111, "grad_norm": 2.9935662746429443, "learning_rate": 6.104982206405694e-05, "loss": 1.2209, "step": 7819 }, { "epoch": 3.4755555555555557, "grad_norm": 3.7781035900115967, "learning_rate": 6.103202846975089e-05, "loss": 1.8109, "step": 7820 }, { "epoch": 3.476, "grad_norm": 3.0393505096435547, "learning_rate": 6.101423487544484e-05, "loss": 1.3712, "step": 7821 }, { "epoch": 3.4764444444444447, "grad_norm": 3.436675786972046, "learning_rate": 6.09964412811388e-05, "loss": 1.1229, "step": 7822 }, { "epoch": 3.476888888888889, "grad_norm": 3.587766170501709, "learning_rate": 6.0978647686832747e-05, "loss": 1.4697, "step": 7823 }, { "epoch": 3.477333333333333, "grad_norm": 3.541997194290161, "learning_rate": 6.0960854092526696e-05, "loss": 0.978, "step": 7824 }, { "epoch": 3.477777777777778, "grad_norm": 4.022648334503174, "learning_rate": 6.094306049822064e-05, "loss": 1.1844, "step": 7825 }, { "epoch": 3.478222222222222, "grad_norm": 3.1931285858154297, "learning_rate": 6.092526690391459e-05, "loss": 1.3828, "step": 7826 }, { "epoch": 3.478666666666667, "grad_norm": 2.8355612754821777, "learning_rate": 6.0907473309608545e-05, "loss": 0.8536, "step": 7827 }, { "epoch": 3.479111111111111, "grad_norm": 3.4258205890655518, "learning_rate": 6.0889679715302495e-05, "loss": 1.2377, "step": 7828 }, { "epoch": 3.4795555555555557, "grad_norm": 3.4359145164489746, "learning_rate": 6.0871886120996445e-05, "loss": 1.2237, "step": 7829 }, { "epoch": 3.48, "grad_norm": 3.928776502609253, "learning_rate": 6.08540925266904e-05, "loss": 1.4612, "step": 7830 }, { "epoch": 3.4804444444444442, "grad_norm": 3.3484604358673096, "learning_rate": 6.083629893238434e-05, "loss": 1.3294, "step": 7831 }, { "epoch": 3.480888888888889, "grad_norm": 4.023479461669922, "learning_rate": 6.0818505338078294e-05, "loss": 1.1446, "step": 7832 }, { "epoch": 3.481333333333333, "grad_norm": 4.194918632507324, "learning_rate": 6.0800711743772244e-05, "loss": 1.5604, "step": 7833 }, { "epoch": 3.481777777777778, "grad_norm": 3.498831033706665, "learning_rate": 6.0782918149466193e-05, "loss": 1.2972, "step": 7834 }, { "epoch": 3.482222222222222, "grad_norm": 3.6610026359558105, "learning_rate": 6.076512455516015e-05, "loss": 1.0893, "step": 7835 }, { "epoch": 3.482666666666667, "grad_norm": 3.50296950340271, "learning_rate": 6.07473309608541e-05, "loss": 1.252, "step": 7836 }, { "epoch": 3.483111111111111, "grad_norm": 3.684080123901367, "learning_rate": 6.072953736654805e-05, "loss": 1.3696, "step": 7837 }, { "epoch": 3.4835555555555557, "grad_norm": 3.986010789871216, "learning_rate": 6.071174377224199e-05, "loss": 1.3597, "step": 7838 }, { "epoch": 3.484, "grad_norm": 3.7796928882598877, "learning_rate": 6.069395017793594e-05, "loss": 1.2141, "step": 7839 }, { "epoch": 3.4844444444444447, "grad_norm": 3.9027299880981445, "learning_rate": 6.067615658362989e-05, "loss": 1.2926, "step": 7840 }, { "epoch": 3.484888888888889, "grad_norm": 4.090729236602783, "learning_rate": 6.065836298932385e-05, "loss": 1.3292, "step": 7841 }, { "epoch": 3.485333333333333, "grad_norm": 3.7317214012145996, "learning_rate": 6.06405693950178e-05, "loss": 1.2942, "step": 7842 }, { "epoch": 3.485777777777778, "grad_norm": 5.53169584274292, "learning_rate": 6.062277580071175e-05, "loss": 1.1903, "step": 7843 }, { "epoch": 3.486222222222222, "grad_norm": 4.516229629516602, "learning_rate": 6.060498220640569e-05, "loss": 1.246, "step": 7844 }, { "epoch": 3.486666666666667, "grad_norm": 4.402373313903809, "learning_rate": 6.058718861209964e-05, "loss": 1.4393, "step": 7845 }, { "epoch": 3.487111111111111, "grad_norm": 5.0335540771484375, "learning_rate": 6.05693950177936e-05, "loss": 0.9533, "step": 7846 }, { "epoch": 3.4875555555555557, "grad_norm": 4.421757698059082, "learning_rate": 6.0551601423487547e-05, "loss": 1.1198, "step": 7847 }, { "epoch": 3.488, "grad_norm": 3.6150312423706055, "learning_rate": 6.0533807829181496e-05, "loss": 1.3501, "step": 7848 }, { "epoch": 3.4884444444444442, "grad_norm": 3.798158645629883, "learning_rate": 6.051601423487545e-05, "loss": 0.9775, "step": 7849 }, { "epoch": 3.488888888888889, "grad_norm": 4.989626407623291, "learning_rate": 6.04982206405694e-05, "loss": 0.7092, "step": 7850 }, { "epoch": 3.489333333333333, "grad_norm": 2.5157763957977295, "learning_rate": 6.0480427046263345e-05, "loss": 1.8447, "step": 7851 }, { "epoch": 3.489777777777778, "grad_norm": 1.8457545042037964, "learning_rate": 6.0462633451957295e-05, "loss": 1.0517, "step": 7852 }, { "epoch": 3.490222222222222, "grad_norm": 2.611934185028076, "learning_rate": 6.0444839857651245e-05, "loss": 1.78, "step": 7853 }, { "epoch": 3.490666666666667, "grad_norm": 0.1728007197380066, "learning_rate": 6.04270462633452e-05, "loss": 0.0174, "step": 7854 }, { "epoch": 3.491111111111111, "grad_norm": 2.6706738471984863, "learning_rate": 6.040925266903915e-05, "loss": 1.605, "step": 7855 }, { "epoch": 3.4915555555555557, "grad_norm": 2.728020429611206, "learning_rate": 6.03914590747331e-05, "loss": 1.5846, "step": 7856 }, { "epoch": 3.492, "grad_norm": 3.039168357849121, "learning_rate": 6.0373665480427044e-05, "loss": 1.7239, "step": 7857 }, { "epoch": 3.4924444444444447, "grad_norm": 2.816784381866455, "learning_rate": 6.0355871886120994e-05, "loss": 1.4047, "step": 7858 }, { "epoch": 3.492888888888889, "grad_norm": 2.839526653289795, "learning_rate": 6.033807829181495e-05, "loss": 1.4536, "step": 7859 }, { "epoch": 3.493333333333333, "grad_norm": 3.2044901847839355, "learning_rate": 6.03202846975089e-05, "loss": 1.322, "step": 7860 }, { "epoch": 3.493777777777778, "grad_norm": 3.872880458831787, "learning_rate": 6.030249110320285e-05, "loss": 1.7948, "step": 7861 }, { "epoch": 3.494222222222222, "grad_norm": 2.9369702339172363, "learning_rate": 6.0284697508896806e-05, "loss": 1.1952, "step": 7862 }, { "epoch": 3.494666666666667, "grad_norm": 2.457634687423706, "learning_rate": 6.0266903914590756e-05, "loss": 1.1545, "step": 7863 }, { "epoch": 3.495111111111111, "grad_norm": 3.318244457244873, "learning_rate": 6.02491103202847e-05, "loss": 1.6481, "step": 7864 }, { "epoch": 3.4955555555555557, "grad_norm": 3.1333770751953125, "learning_rate": 6.023131672597865e-05, "loss": 1.4185, "step": 7865 }, { "epoch": 3.496, "grad_norm": 3.627277135848999, "learning_rate": 6.02135231316726e-05, "loss": 1.4683, "step": 7866 }, { "epoch": 3.4964444444444442, "grad_norm": 2.975083827972412, "learning_rate": 6.0195729537366555e-05, "loss": 1.5541, "step": 7867 }, { "epoch": 3.496888888888889, "grad_norm": 2.3885438442230225, "learning_rate": 6.0177935943060504e-05, "loss": 1.0314, "step": 7868 }, { "epoch": 3.497333333333333, "grad_norm": 3.9095265865325928, "learning_rate": 6.0160142348754454e-05, "loss": 1.6982, "step": 7869 }, { "epoch": 3.497777777777778, "grad_norm": 5.884151935577393, "learning_rate": 6.01423487544484e-05, "loss": 1.5449, "step": 7870 }, { "epoch": 3.498222222222222, "grad_norm": 3.240220785140991, "learning_rate": 6.012455516014235e-05, "loss": 1.2482, "step": 7871 }, { "epoch": 3.498666666666667, "grad_norm": 2.9373934268951416, "learning_rate": 6.01067615658363e-05, "loss": 1.3222, "step": 7872 }, { "epoch": 3.499111111111111, "grad_norm": 3.125972270965576, "learning_rate": 6.008896797153025e-05, "loss": 1.4957, "step": 7873 }, { "epoch": 3.4995555555555553, "grad_norm": 3.1987497806549072, "learning_rate": 6.00711743772242e-05, "loss": 1.5033, "step": 7874 }, { "epoch": 3.5, "grad_norm": 2.531674861907959, "learning_rate": 6.005338078291816e-05, "loss": 1.2043, "step": 7875 }, { "epoch": 3.5004444444444447, "grad_norm": 3.2194294929504395, "learning_rate": 6.003558718861211e-05, "loss": 1.4242, "step": 7876 }, { "epoch": 3.500888888888889, "grad_norm": 3.215784788131714, "learning_rate": 6.001779359430605e-05, "loss": 1.5275, "step": 7877 }, { "epoch": 3.501333333333333, "grad_norm": 2.9531593322753906, "learning_rate": 6e-05, "loss": 1.3093, "step": 7878 }, { "epoch": 3.501777777777778, "grad_norm": 3.470592737197876, "learning_rate": 5.998220640569395e-05, "loss": 1.5293, "step": 7879 }, { "epoch": 3.502222222222222, "grad_norm": 3.2422022819519043, "learning_rate": 5.996441281138791e-05, "loss": 1.4052, "step": 7880 }, { "epoch": 3.502666666666667, "grad_norm": 3.3211963176727295, "learning_rate": 5.994661921708186e-05, "loss": 1.304, "step": 7881 }, { "epoch": 3.503111111111111, "grad_norm": 3.2284364700317383, "learning_rate": 5.992882562277581e-05, "loss": 1.1494, "step": 7882 }, { "epoch": 3.5035555555555558, "grad_norm": 3.206890821456909, "learning_rate": 5.991103202846975e-05, "loss": 1.296, "step": 7883 }, { "epoch": 3.504, "grad_norm": 2.572016716003418, "learning_rate": 5.98932384341637e-05, "loss": 0.7784, "step": 7884 }, { "epoch": 3.5044444444444443, "grad_norm": 3.5026330947875977, "learning_rate": 5.987544483985765e-05, "loss": 1.3138, "step": 7885 }, { "epoch": 3.504888888888889, "grad_norm": 3.5628318786621094, "learning_rate": 5.9857651245551606e-05, "loss": 1.1652, "step": 7886 }, { "epoch": 3.505333333333333, "grad_norm": 3.418849468231201, "learning_rate": 5.9839857651245556e-05, "loss": 1.4964, "step": 7887 }, { "epoch": 3.505777777777778, "grad_norm": 4.8035454750061035, "learning_rate": 5.9822064056939506e-05, "loss": 1.8758, "step": 7888 }, { "epoch": 3.506222222222222, "grad_norm": 3.9143970012664795, "learning_rate": 5.980427046263345e-05, "loss": 1.4264, "step": 7889 }, { "epoch": 3.506666666666667, "grad_norm": 3.6827564239501953, "learning_rate": 5.97864768683274e-05, "loss": 1.5077, "step": 7890 }, { "epoch": 3.507111111111111, "grad_norm": 4.183266639709473, "learning_rate": 5.9768683274021355e-05, "loss": 1.2283, "step": 7891 }, { "epoch": 3.5075555555555553, "grad_norm": 3.9071977138519287, "learning_rate": 5.9750889679715304e-05, "loss": 1.2634, "step": 7892 }, { "epoch": 3.508, "grad_norm": 3.631640672683716, "learning_rate": 5.9733096085409254e-05, "loss": 1.5404, "step": 7893 }, { "epoch": 3.5084444444444447, "grad_norm": 3.552340269088745, "learning_rate": 5.971530249110321e-05, "loss": 1.2201, "step": 7894 }, { "epoch": 3.508888888888889, "grad_norm": 4.531553268432617, "learning_rate": 5.969750889679716e-05, "loss": 1.4324, "step": 7895 }, { "epoch": 3.509333333333333, "grad_norm": 3.7588775157928467, "learning_rate": 5.96797153024911e-05, "loss": 1.2748, "step": 7896 }, { "epoch": 3.509777777777778, "grad_norm": 4.838202476501465, "learning_rate": 5.966192170818505e-05, "loss": 1.1187, "step": 7897 }, { "epoch": 3.510222222222222, "grad_norm": 4.4196367263793945, "learning_rate": 5.9644128113879e-05, "loss": 1.0293, "step": 7898 }, { "epoch": 3.510666666666667, "grad_norm": 3.9595701694488525, "learning_rate": 5.962633451957296e-05, "loss": 1.232, "step": 7899 }, { "epoch": 3.511111111111111, "grad_norm": 5.525210380554199, "learning_rate": 5.960854092526691e-05, "loss": 0.8993, "step": 7900 }, { "epoch": 3.5115555555555558, "grad_norm": 2.474489212036133, "learning_rate": 5.959074733096086e-05, "loss": 1.8906, "step": 7901 }, { "epoch": 3.512, "grad_norm": 1.5867375135421753, "learning_rate": 5.95729537366548e-05, "loss": 0.3189, "step": 7902 }, { "epoch": 3.5124444444444443, "grad_norm": 2.7589077949523926, "learning_rate": 5.955516014234875e-05, "loss": 1.5188, "step": 7903 }, { "epoch": 3.512888888888889, "grad_norm": 3.1739702224731445, "learning_rate": 5.953736654804271e-05, "loss": 1.4973, "step": 7904 }, { "epoch": 3.513333333333333, "grad_norm": 3.220743179321289, "learning_rate": 5.951957295373666e-05, "loss": 1.7824, "step": 7905 }, { "epoch": 3.513777777777778, "grad_norm": 3.1341934204101562, "learning_rate": 5.950177935943061e-05, "loss": 1.53, "step": 7906 }, { "epoch": 3.514222222222222, "grad_norm": 2.954864501953125, "learning_rate": 5.9483985765124564e-05, "loss": 1.0464, "step": 7907 }, { "epoch": 3.514666666666667, "grad_norm": 2.6394574642181396, "learning_rate": 5.9466192170818513e-05, "loss": 1.4441, "step": 7908 }, { "epoch": 3.515111111111111, "grad_norm": 3.118455648422241, "learning_rate": 5.9448398576512456e-05, "loss": 1.438, "step": 7909 }, { "epoch": 3.5155555555555553, "grad_norm": 2.812499523162842, "learning_rate": 5.9430604982206406e-05, "loss": 1.32, "step": 7910 }, { "epoch": 3.516, "grad_norm": 3.96679425239563, "learning_rate": 5.9412811387900356e-05, "loss": 1.5107, "step": 7911 }, { "epoch": 3.5164444444444447, "grad_norm": 3.8176608085632324, "learning_rate": 5.939501779359431e-05, "loss": 1.7569, "step": 7912 }, { "epoch": 3.516888888888889, "grad_norm": 3.084055185317993, "learning_rate": 5.937722419928826e-05, "loss": 1.4477, "step": 7913 }, { "epoch": 3.517333333333333, "grad_norm": 3.462454319000244, "learning_rate": 5.935943060498221e-05, "loss": 1.5585, "step": 7914 }, { "epoch": 3.517777777777778, "grad_norm": 2.181887149810791, "learning_rate": 5.9341637010676155e-05, "loss": 0.452, "step": 7915 }, { "epoch": 3.518222222222222, "grad_norm": 3.213517189025879, "learning_rate": 5.9323843416370104e-05, "loss": 1.5827, "step": 7916 }, { "epoch": 3.518666666666667, "grad_norm": 3.1706907749176025, "learning_rate": 5.930604982206406e-05, "loss": 1.1167, "step": 7917 }, { "epoch": 3.519111111111111, "grad_norm": 3.325066566467285, "learning_rate": 5.928825622775801e-05, "loss": 1.2163, "step": 7918 }, { "epoch": 3.5195555555555558, "grad_norm": 3.789510488510132, "learning_rate": 5.927046263345196e-05, "loss": 1.5674, "step": 7919 }, { "epoch": 3.52, "grad_norm": 3.43265962600708, "learning_rate": 5.925266903914592e-05, "loss": 1.3059, "step": 7920 }, { "epoch": 3.5204444444444443, "grad_norm": 3.1737518310546875, "learning_rate": 5.923487544483987e-05, "loss": 1.1704, "step": 7921 }, { "epoch": 3.520888888888889, "grad_norm": 3.2082409858703613, "learning_rate": 5.921708185053381e-05, "loss": 1.5402, "step": 7922 }, { "epoch": 3.521333333333333, "grad_norm": 3.5814785957336426, "learning_rate": 5.919928825622776e-05, "loss": 1.4676, "step": 7923 }, { "epoch": 3.521777777777778, "grad_norm": 3.379570722579956, "learning_rate": 5.918149466192171e-05, "loss": 1.1777, "step": 7924 }, { "epoch": 3.522222222222222, "grad_norm": 3.4962823390960693, "learning_rate": 5.9163701067615666e-05, "loss": 1.6759, "step": 7925 }, { "epoch": 3.522666666666667, "grad_norm": 3.593132495880127, "learning_rate": 5.9145907473309615e-05, "loss": 1.5066, "step": 7926 }, { "epoch": 3.523111111111111, "grad_norm": 3.335500478744507, "learning_rate": 5.9128113879003565e-05, "loss": 0.9421, "step": 7927 }, { "epoch": 3.5235555555555553, "grad_norm": 3.0314064025878906, "learning_rate": 5.911032028469751e-05, "loss": 1.0284, "step": 7928 }, { "epoch": 3.524, "grad_norm": 3.651460886001587, "learning_rate": 5.909252669039146e-05, "loss": 1.3157, "step": 7929 }, { "epoch": 3.5244444444444447, "grad_norm": 3.4120965003967285, "learning_rate": 5.907473309608541e-05, "loss": 1.4894, "step": 7930 }, { "epoch": 3.524888888888889, "grad_norm": 4.027737140655518, "learning_rate": 5.9056939501779364e-05, "loss": 1.6357, "step": 7931 }, { "epoch": 3.525333333333333, "grad_norm": 2.242091178894043, "learning_rate": 5.9039145907473314e-05, "loss": 0.5504, "step": 7932 }, { "epoch": 3.525777777777778, "grad_norm": 3.149824380874634, "learning_rate": 5.902135231316726e-05, "loss": 0.8101, "step": 7933 }, { "epoch": 3.526222222222222, "grad_norm": 2.0692434310913086, "learning_rate": 5.900355871886122e-05, "loss": 0.4684, "step": 7934 }, { "epoch": 3.5266666666666664, "grad_norm": 3.153052806854248, "learning_rate": 5.8985765124555156e-05, "loss": 1.1421, "step": 7935 }, { "epoch": 3.527111111111111, "grad_norm": 3.0566887855529785, "learning_rate": 5.896797153024911e-05, "loss": 1.0727, "step": 7936 }, { "epoch": 3.5275555555555558, "grad_norm": 3.488640546798706, "learning_rate": 5.895017793594306e-05, "loss": 1.2473, "step": 7937 }, { "epoch": 3.528, "grad_norm": 3.4758243560791016, "learning_rate": 5.893238434163701e-05, "loss": 0.9908, "step": 7938 }, { "epoch": 3.5284444444444443, "grad_norm": 5.093916416168213, "learning_rate": 5.891459074733097e-05, "loss": 1.1238, "step": 7939 }, { "epoch": 3.528888888888889, "grad_norm": 3.6023635864257812, "learning_rate": 5.889679715302492e-05, "loss": 0.8337, "step": 7940 }, { "epoch": 3.529333333333333, "grad_norm": 3.5062379837036133, "learning_rate": 5.887900355871886e-05, "loss": 1.244, "step": 7941 }, { "epoch": 3.529777777777778, "grad_norm": 3.3245818614959717, "learning_rate": 5.886120996441281e-05, "loss": 1.0635, "step": 7942 }, { "epoch": 3.530222222222222, "grad_norm": 3.718571662902832, "learning_rate": 5.884341637010676e-05, "loss": 1.2694, "step": 7943 }, { "epoch": 3.530666666666667, "grad_norm": 4.398813247680664, "learning_rate": 5.882562277580072e-05, "loss": 1.2429, "step": 7944 }, { "epoch": 3.531111111111111, "grad_norm": 3.6693432331085205, "learning_rate": 5.880782918149467e-05, "loss": 1.0828, "step": 7945 }, { "epoch": 3.5315555555555553, "grad_norm": 4.250223159790039, "learning_rate": 5.8790035587188616e-05, "loss": 1.6145, "step": 7946 }, { "epoch": 3.532, "grad_norm": 2.441047191619873, "learning_rate": 5.877224199288256e-05, "loss": 0.279, "step": 7947 }, { "epoch": 3.5324444444444447, "grad_norm": 4.091504096984863, "learning_rate": 5.875444839857651e-05, "loss": 0.6715, "step": 7948 }, { "epoch": 3.532888888888889, "grad_norm": 5.657823085784912, "learning_rate": 5.8736654804270466e-05, "loss": 1.2939, "step": 7949 }, { "epoch": 3.533333333333333, "grad_norm": 4.15081787109375, "learning_rate": 5.8718861209964415e-05, "loss": 0.4457, "step": 7950 }, { "epoch": 3.533777777777778, "grad_norm": 2.6434662342071533, "learning_rate": 5.8701067615658365e-05, "loss": 2.3563, "step": 7951 }, { "epoch": 3.534222222222222, "grad_norm": 2.4858129024505615, "learning_rate": 5.868327402135232e-05, "loss": 1.8796, "step": 7952 }, { "epoch": 3.5346666666666664, "grad_norm": 3.0322587490081787, "learning_rate": 5.866548042704627e-05, "loss": 1.9928, "step": 7953 }, { "epoch": 3.535111111111111, "grad_norm": 2.7141330242156982, "learning_rate": 5.8647686832740214e-05, "loss": 1.6356, "step": 7954 }, { "epoch": 3.535555555555556, "grad_norm": 3.070211172103882, "learning_rate": 5.8629893238434164e-05, "loss": 1.3506, "step": 7955 }, { "epoch": 3.536, "grad_norm": 3.548081159591675, "learning_rate": 5.8612099644128114e-05, "loss": 1.8977, "step": 7956 }, { "epoch": 3.5364444444444443, "grad_norm": 3.189389228820801, "learning_rate": 5.859430604982207e-05, "loss": 1.5622, "step": 7957 }, { "epoch": 3.536888888888889, "grad_norm": 3.3881306648254395, "learning_rate": 5.857651245551602e-05, "loss": 1.7807, "step": 7958 }, { "epoch": 3.537333333333333, "grad_norm": 2.211007595062256, "learning_rate": 5.855871886120997e-05, "loss": 0.688, "step": 7959 }, { "epoch": 3.537777777777778, "grad_norm": 3.352668523788452, "learning_rate": 5.854092526690391e-05, "loss": 1.8142, "step": 7960 }, { "epoch": 3.538222222222222, "grad_norm": 3.3707339763641357, "learning_rate": 5.852313167259786e-05, "loss": 1.3783, "step": 7961 }, { "epoch": 3.538666666666667, "grad_norm": 3.5272951126098633, "learning_rate": 5.850533807829182e-05, "loss": 1.5301, "step": 7962 }, { "epoch": 3.539111111111111, "grad_norm": 3.348961353302002, "learning_rate": 5.848754448398577e-05, "loss": 1.2704, "step": 7963 }, { "epoch": 3.5395555555555553, "grad_norm": 3.5585920810699463, "learning_rate": 5.846975088967972e-05, "loss": 1.3461, "step": 7964 }, { "epoch": 3.54, "grad_norm": 3.355560064315796, "learning_rate": 5.8451957295373675e-05, "loss": 1.4307, "step": 7965 }, { "epoch": 3.5404444444444443, "grad_norm": 3.4020283222198486, "learning_rate": 5.8434163701067624e-05, "loss": 1.4312, "step": 7966 }, { "epoch": 3.540888888888889, "grad_norm": 3.8027524948120117, "learning_rate": 5.841637010676157e-05, "loss": 1.6558, "step": 7967 }, { "epoch": 3.541333333333333, "grad_norm": 3.7693047523498535, "learning_rate": 5.839857651245552e-05, "loss": 1.2358, "step": 7968 }, { "epoch": 3.541777777777778, "grad_norm": 0.21360933780670166, "learning_rate": 5.838078291814947e-05, "loss": 0.0254, "step": 7969 }, { "epoch": 3.542222222222222, "grad_norm": 2.69741153717041, "learning_rate": 5.836298932384342e-05, "loss": 1.2167, "step": 7970 }, { "epoch": 3.5426666666666664, "grad_norm": 2.8727669715881348, "learning_rate": 5.834519572953737e-05, "loss": 1.0769, "step": 7971 }, { "epoch": 3.543111111111111, "grad_norm": 2.91815447807312, "learning_rate": 5.832740213523132e-05, "loss": 1.0958, "step": 7972 }, { "epoch": 3.543555555555556, "grad_norm": 3.0180909633636475, "learning_rate": 5.8309608540925266e-05, "loss": 1.0416, "step": 7973 }, { "epoch": 3.544, "grad_norm": 3.282245635986328, "learning_rate": 5.8291814946619215e-05, "loss": 1.1943, "step": 7974 }, { "epoch": 3.5444444444444443, "grad_norm": 3.464338779449463, "learning_rate": 5.8274021352313165e-05, "loss": 1.088, "step": 7975 }, { "epoch": 3.544888888888889, "grad_norm": 3.1577398777008057, "learning_rate": 5.825622775800712e-05, "loss": 1.3338, "step": 7976 }, { "epoch": 3.5453333333333332, "grad_norm": 3.2722463607788086, "learning_rate": 5.823843416370107e-05, "loss": 1.2914, "step": 7977 }, { "epoch": 3.545777777777778, "grad_norm": 3.671210527420044, "learning_rate": 5.822064056939502e-05, "loss": 1.3975, "step": 7978 }, { "epoch": 3.546222222222222, "grad_norm": 3.583202600479126, "learning_rate": 5.820284697508898e-05, "loss": 1.2501, "step": 7979 }, { "epoch": 3.546666666666667, "grad_norm": 2.0527262687683105, "learning_rate": 5.8185053380782914e-05, "loss": 0.5888, "step": 7980 }, { "epoch": 3.547111111111111, "grad_norm": 3.583556890487671, "learning_rate": 5.816725978647687e-05, "loss": 1.0753, "step": 7981 }, { "epoch": 3.5475555555555554, "grad_norm": 3.821152687072754, "learning_rate": 5.814946619217082e-05, "loss": 1.311, "step": 7982 }, { "epoch": 3.548, "grad_norm": 3.369014263153076, "learning_rate": 5.813167259786477e-05, "loss": 1.2148, "step": 7983 }, { "epoch": 3.5484444444444443, "grad_norm": 3.3493340015411377, "learning_rate": 5.8113879003558726e-05, "loss": 1.1789, "step": 7984 }, { "epoch": 3.548888888888889, "grad_norm": 2.961345672607422, "learning_rate": 5.8096085409252676e-05, "loss": 1.0513, "step": 7985 }, { "epoch": 3.5493333333333332, "grad_norm": 3.708880662918091, "learning_rate": 5.807829181494662e-05, "loss": 1.3949, "step": 7986 }, { "epoch": 3.549777777777778, "grad_norm": 3.2943549156188965, "learning_rate": 5.806049822064057e-05, "loss": 1.1287, "step": 7987 }, { "epoch": 3.550222222222222, "grad_norm": 3.0104446411132812, "learning_rate": 5.804270462633452e-05, "loss": 1.121, "step": 7988 }, { "epoch": 3.5506666666666664, "grad_norm": 3.7163071632385254, "learning_rate": 5.8024911032028475e-05, "loss": 1.0807, "step": 7989 }, { "epoch": 3.551111111111111, "grad_norm": 3.9415080547332764, "learning_rate": 5.8007117437722425e-05, "loss": 1.3248, "step": 7990 }, { "epoch": 3.551555555555556, "grad_norm": 4.1947245597839355, "learning_rate": 5.7989323843416374e-05, "loss": 1.5827, "step": 7991 }, { "epoch": 3.552, "grad_norm": 4.378093719482422, "learning_rate": 5.797153024911033e-05, "loss": 1.2251, "step": 7992 }, { "epoch": 3.5524444444444443, "grad_norm": 3.898223638534546, "learning_rate": 5.795373665480427e-05, "loss": 1.4066, "step": 7993 }, { "epoch": 3.552888888888889, "grad_norm": 4.775320529937744, "learning_rate": 5.7935943060498223e-05, "loss": 1.2938, "step": 7994 }, { "epoch": 3.5533333333333332, "grad_norm": 4.527971267700195, "learning_rate": 5.791814946619217e-05, "loss": 1.3705, "step": 7995 }, { "epoch": 3.553777777777778, "grad_norm": 4.258374214172363, "learning_rate": 5.790035587188612e-05, "loss": 1.2344, "step": 7996 }, { "epoch": 3.554222222222222, "grad_norm": 3.4799654483795166, "learning_rate": 5.788256227758008e-05, "loss": 1.1858, "step": 7997 }, { "epoch": 3.554666666666667, "grad_norm": 5.774474620819092, "learning_rate": 5.786476868327403e-05, "loss": 1.6127, "step": 7998 }, { "epoch": 3.555111111111111, "grad_norm": 4.93737268447876, "learning_rate": 5.784697508896797e-05, "loss": 1.2544, "step": 7999 }, { "epoch": 3.5555555555555554, "grad_norm": 3.601134777069092, "learning_rate": 5.782918149466192e-05, "loss": 0.5138, "step": 8000 }, { "epoch": 3.556, "grad_norm": 2.7162842750549316, "learning_rate": 5.781138790035587e-05, "loss": 2.1305, "step": 8001 }, { "epoch": 3.5564444444444443, "grad_norm": 2.303110361099243, "learning_rate": 5.779359430604983e-05, "loss": 1.6483, "step": 8002 }, { "epoch": 3.556888888888889, "grad_norm": 2.755901575088501, "learning_rate": 5.777580071174378e-05, "loss": 1.8632, "step": 8003 }, { "epoch": 3.5573333333333332, "grad_norm": 1.7043906450271606, "learning_rate": 5.775800711743773e-05, "loss": 0.9941, "step": 8004 }, { "epoch": 3.557777777777778, "grad_norm": 1.6068474054336548, "learning_rate": 5.774021352313167e-05, "loss": 0.6672, "step": 8005 }, { "epoch": 3.558222222222222, "grad_norm": 2.530790328979492, "learning_rate": 5.772241992882562e-05, "loss": 1.2947, "step": 8006 }, { "epoch": 3.5586666666666664, "grad_norm": 3.187232732772827, "learning_rate": 5.7704626334519577e-05, "loss": 1.9547, "step": 8007 }, { "epoch": 3.559111111111111, "grad_norm": 2.78572416305542, "learning_rate": 5.7686832740213526e-05, "loss": 1.5949, "step": 8008 }, { "epoch": 3.559555555555556, "grad_norm": 3.9255530834198, "learning_rate": 5.7669039145907476e-05, "loss": 1.6875, "step": 8009 }, { "epoch": 3.56, "grad_norm": 2.8568902015686035, "learning_rate": 5.765124555160143e-05, "loss": 1.7145, "step": 8010 }, { "epoch": 3.5604444444444443, "grad_norm": 2.9281044006347656, "learning_rate": 5.763345195729538e-05, "loss": 1.518, "step": 8011 }, { "epoch": 3.560888888888889, "grad_norm": 2.7695422172546387, "learning_rate": 5.7615658362989325e-05, "loss": 1.665, "step": 8012 }, { "epoch": 3.5613333333333332, "grad_norm": 3.807720899581909, "learning_rate": 5.7597864768683275e-05, "loss": 1.395, "step": 8013 }, { "epoch": 3.561777777777778, "grad_norm": 3.4705801010131836, "learning_rate": 5.7580071174377225e-05, "loss": 1.4524, "step": 8014 }, { "epoch": 3.562222222222222, "grad_norm": 2.88387131690979, "learning_rate": 5.756227758007118e-05, "loss": 1.1103, "step": 8015 }, { "epoch": 3.562666666666667, "grad_norm": 2.9393246173858643, "learning_rate": 5.754448398576513e-05, "loss": 1.0856, "step": 8016 }, { "epoch": 3.563111111111111, "grad_norm": 3.0343334674835205, "learning_rate": 5.752669039145908e-05, "loss": 1.3709, "step": 8017 }, { "epoch": 3.5635555555555554, "grad_norm": 2.985978126525879, "learning_rate": 5.7508896797153023e-05, "loss": 1.36, "step": 8018 }, { "epoch": 3.564, "grad_norm": 3.5497658252716064, "learning_rate": 5.749110320284697e-05, "loss": 1.4632, "step": 8019 }, { "epoch": 3.5644444444444443, "grad_norm": 2.939556837081909, "learning_rate": 5.747330960854092e-05, "loss": 1.417, "step": 8020 }, { "epoch": 3.564888888888889, "grad_norm": 2.9384403228759766, "learning_rate": 5.745551601423488e-05, "loss": 0.9369, "step": 8021 }, { "epoch": 3.5653333333333332, "grad_norm": 3.302537441253662, "learning_rate": 5.743772241992883e-05, "loss": 1.592, "step": 8022 }, { "epoch": 3.565777777777778, "grad_norm": 3.0296859741210938, "learning_rate": 5.741992882562278e-05, "loss": 1.3002, "step": 8023 }, { "epoch": 3.566222222222222, "grad_norm": 2.4445323944091797, "learning_rate": 5.7402135231316735e-05, "loss": 1.0328, "step": 8024 }, { "epoch": 3.5666666666666664, "grad_norm": 2.847886562347412, "learning_rate": 5.738434163701067e-05, "loss": 1.151, "step": 8025 }, { "epoch": 3.567111111111111, "grad_norm": 3.6882436275482178, "learning_rate": 5.736654804270463e-05, "loss": 1.717, "step": 8026 }, { "epoch": 3.567555555555556, "grad_norm": 2.9185895919799805, "learning_rate": 5.734875444839858e-05, "loss": 1.4205, "step": 8027 }, { "epoch": 3.568, "grad_norm": 3.1053783893585205, "learning_rate": 5.733096085409253e-05, "loss": 1.2341, "step": 8028 }, { "epoch": 3.5684444444444443, "grad_norm": 3.3123791217803955, "learning_rate": 5.7313167259786484e-05, "loss": 1.1749, "step": 8029 }, { "epoch": 3.568888888888889, "grad_norm": 3.7100796699523926, "learning_rate": 5.7295373665480434e-05, "loss": 1.2908, "step": 8030 }, { "epoch": 3.5693333333333332, "grad_norm": 2.9911487102508545, "learning_rate": 5.727758007117438e-05, "loss": 0.8336, "step": 8031 }, { "epoch": 3.569777777777778, "grad_norm": 4.015035629272461, "learning_rate": 5.7259786476868326e-05, "loss": 1.8666, "step": 8032 }, { "epoch": 3.570222222222222, "grad_norm": 3.5821290016174316, "learning_rate": 5.7241992882562276e-05, "loss": 1.1474, "step": 8033 }, { "epoch": 3.570666666666667, "grad_norm": 3.538398265838623, "learning_rate": 5.722419928825623e-05, "loss": 0.8516, "step": 8034 }, { "epoch": 3.571111111111111, "grad_norm": 3.570612907409668, "learning_rate": 5.720640569395018e-05, "loss": 1.4082, "step": 8035 }, { "epoch": 3.5715555555555554, "grad_norm": 2.5072994232177734, "learning_rate": 5.718861209964413e-05, "loss": 0.6298, "step": 8036 }, { "epoch": 3.572, "grad_norm": 3.3981175422668457, "learning_rate": 5.717081850533809e-05, "loss": 1.1501, "step": 8037 }, { "epoch": 3.5724444444444443, "grad_norm": 4.341430187225342, "learning_rate": 5.7153024911032025e-05, "loss": 1.4794, "step": 8038 }, { "epoch": 3.572888888888889, "grad_norm": 4.046430587768555, "learning_rate": 5.713523131672598e-05, "loss": 1.3761, "step": 8039 }, { "epoch": 3.5733333333333333, "grad_norm": 3.6594674587249756, "learning_rate": 5.711743772241993e-05, "loss": 1.5613, "step": 8040 }, { "epoch": 3.573777777777778, "grad_norm": 4.007656574249268, "learning_rate": 5.709964412811388e-05, "loss": 1.3619, "step": 8041 }, { "epoch": 3.574222222222222, "grad_norm": 4.279335021972656, "learning_rate": 5.708185053380784e-05, "loss": 1.6566, "step": 8042 }, { "epoch": 3.5746666666666664, "grad_norm": 3.4057838916778564, "learning_rate": 5.706405693950179e-05, "loss": 1.2244, "step": 8043 }, { "epoch": 3.575111111111111, "grad_norm": 3.479970932006836, "learning_rate": 5.704626334519573e-05, "loss": 1.4622, "step": 8044 }, { "epoch": 3.575555555555556, "grad_norm": 3.1348230838775635, "learning_rate": 5.702846975088968e-05, "loss": 1.2252, "step": 8045 }, { "epoch": 3.576, "grad_norm": 3.5639426708221436, "learning_rate": 5.701067615658363e-05, "loss": 1.0362, "step": 8046 }, { "epoch": 3.5764444444444443, "grad_norm": 3.8750410079956055, "learning_rate": 5.6992882562277586e-05, "loss": 0.9873, "step": 8047 }, { "epoch": 3.576888888888889, "grad_norm": 4.268735885620117, "learning_rate": 5.6975088967971535e-05, "loss": 1.7215, "step": 8048 }, { "epoch": 3.5773333333333333, "grad_norm": 3.788827657699585, "learning_rate": 5.6957295373665485e-05, "loss": 1.0994, "step": 8049 }, { "epoch": 3.5777777777777775, "grad_norm": 3.688361644744873, "learning_rate": 5.693950177935944e-05, "loss": 0.2493, "step": 8050 }, { "epoch": 3.578222222222222, "grad_norm": 2.1738009452819824, "learning_rate": 5.692170818505338e-05, "loss": 1.7641, "step": 8051 }, { "epoch": 3.578666666666667, "grad_norm": 2.439173698425293, "learning_rate": 5.6903914590747334e-05, "loss": 2.0032, "step": 8052 }, { "epoch": 3.579111111111111, "grad_norm": 2.087841749191284, "learning_rate": 5.6886120996441284e-05, "loss": 1.0669, "step": 8053 }, { "epoch": 3.5795555555555554, "grad_norm": 2.6237778663635254, "learning_rate": 5.6868327402135234e-05, "loss": 1.5711, "step": 8054 }, { "epoch": 3.58, "grad_norm": 2.9692187309265137, "learning_rate": 5.685053380782919e-05, "loss": 1.553, "step": 8055 }, { "epoch": 3.5804444444444443, "grad_norm": 2.481410264968872, "learning_rate": 5.683274021352314e-05, "loss": 0.8767, "step": 8056 }, { "epoch": 3.580888888888889, "grad_norm": 3.197645425796509, "learning_rate": 5.681494661921708e-05, "loss": 1.7859, "step": 8057 }, { "epoch": 3.5813333333333333, "grad_norm": 3.208965301513672, "learning_rate": 5.679715302491103e-05, "loss": 1.5878, "step": 8058 }, { "epoch": 3.581777777777778, "grad_norm": 3.1050381660461426, "learning_rate": 5.677935943060498e-05, "loss": 1.4308, "step": 8059 }, { "epoch": 3.582222222222222, "grad_norm": 3.836418867111206, "learning_rate": 5.676156583629894e-05, "loss": 1.6935, "step": 8060 }, { "epoch": 3.5826666666666664, "grad_norm": 3.386842727661133, "learning_rate": 5.674377224199289e-05, "loss": 1.5557, "step": 8061 }, { "epoch": 3.583111111111111, "grad_norm": 2.919567108154297, "learning_rate": 5.672597864768684e-05, "loss": 1.1336, "step": 8062 }, { "epoch": 3.583555555555556, "grad_norm": 3.271778106689453, "learning_rate": 5.670818505338078e-05, "loss": 1.3239, "step": 8063 }, { "epoch": 3.584, "grad_norm": 4.067761421203613, "learning_rate": 5.669039145907473e-05, "loss": 1.423, "step": 8064 }, { "epoch": 3.5844444444444443, "grad_norm": 3.229156970977783, "learning_rate": 5.667259786476868e-05, "loss": 1.3032, "step": 8065 }, { "epoch": 3.584888888888889, "grad_norm": 3.7134037017822266, "learning_rate": 5.665480427046264e-05, "loss": 1.2842, "step": 8066 }, { "epoch": 3.5853333333333333, "grad_norm": 3.65313982963562, "learning_rate": 5.663701067615659e-05, "loss": 1.4233, "step": 8067 }, { "epoch": 3.5857777777777775, "grad_norm": 3.398033857345581, "learning_rate": 5.661921708185054e-05, "loss": 1.3633, "step": 8068 }, { "epoch": 3.586222222222222, "grad_norm": 3.1917221546173096, "learning_rate": 5.660142348754449e-05, "loss": 1.0808, "step": 8069 }, { "epoch": 3.586666666666667, "grad_norm": 3.6049952507019043, "learning_rate": 5.658362989323843e-05, "loss": 1.7095, "step": 8070 }, { "epoch": 3.587111111111111, "grad_norm": 3.4335100650787354, "learning_rate": 5.6565836298932386e-05, "loss": 1.3466, "step": 8071 }, { "epoch": 3.5875555555555554, "grad_norm": 3.463630437850952, "learning_rate": 5.6548042704626336e-05, "loss": 1.2838, "step": 8072 }, { "epoch": 3.588, "grad_norm": 3.933293104171753, "learning_rate": 5.6530249110320285e-05, "loss": 1.4277, "step": 8073 }, { "epoch": 3.5884444444444443, "grad_norm": 3.340559482574463, "learning_rate": 5.651245551601424e-05, "loss": 1.2219, "step": 8074 }, { "epoch": 3.588888888888889, "grad_norm": 4.030251979827881, "learning_rate": 5.649466192170819e-05, "loss": 1.4176, "step": 8075 }, { "epoch": 3.5893333333333333, "grad_norm": 3.1213886737823486, "learning_rate": 5.6476868327402134e-05, "loss": 1.2696, "step": 8076 }, { "epoch": 3.589777777777778, "grad_norm": 2.9783213138580322, "learning_rate": 5.6459074733096084e-05, "loss": 1.2995, "step": 8077 }, { "epoch": 3.590222222222222, "grad_norm": 2.2861247062683105, "learning_rate": 5.6441281138790034e-05, "loss": 0.642, "step": 8078 }, { "epoch": 3.5906666666666665, "grad_norm": 3.1233749389648438, "learning_rate": 5.642348754448399e-05, "loss": 0.9259, "step": 8079 }, { "epoch": 3.591111111111111, "grad_norm": 3.5224452018737793, "learning_rate": 5.640569395017794e-05, "loss": 1.4473, "step": 8080 }, { "epoch": 3.5915555555555554, "grad_norm": 3.7169413566589355, "learning_rate": 5.638790035587189e-05, "loss": 1.1774, "step": 8081 }, { "epoch": 3.592, "grad_norm": 3.676466226577759, "learning_rate": 5.6370106761565846e-05, "loss": 1.4331, "step": 8082 }, { "epoch": 3.5924444444444443, "grad_norm": 3.7899556159973145, "learning_rate": 5.635231316725978e-05, "loss": 1.6973, "step": 8083 }, { "epoch": 3.592888888888889, "grad_norm": 4.713149070739746, "learning_rate": 5.633451957295374e-05, "loss": 1.4462, "step": 8084 }, { "epoch": 3.5933333333333333, "grad_norm": 3.4206809997558594, "learning_rate": 5.631672597864769e-05, "loss": 1.2807, "step": 8085 }, { "epoch": 3.5937777777777775, "grad_norm": 3.8811004161834717, "learning_rate": 5.629893238434164e-05, "loss": 1.2656, "step": 8086 }, { "epoch": 3.594222222222222, "grad_norm": 3.997771978378296, "learning_rate": 5.6281138790035595e-05, "loss": 1.5387, "step": 8087 }, { "epoch": 3.594666666666667, "grad_norm": 3.7929420471191406, "learning_rate": 5.6263345195729545e-05, "loss": 1.2682, "step": 8088 }, { "epoch": 3.595111111111111, "grad_norm": 3.0101442337036133, "learning_rate": 5.624555160142349e-05, "loss": 0.7153, "step": 8089 }, { "epoch": 3.5955555555555554, "grad_norm": 3.637249708175659, "learning_rate": 5.622775800711744e-05, "loss": 1.3984, "step": 8090 }, { "epoch": 3.596, "grad_norm": 4.056656360626221, "learning_rate": 5.620996441281139e-05, "loss": 1.3663, "step": 8091 }, { "epoch": 3.5964444444444443, "grad_norm": 3.1671206951141357, "learning_rate": 5.6192170818505344e-05, "loss": 1.1746, "step": 8092 }, { "epoch": 3.596888888888889, "grad_norm": 4.801858901977539, "learning_rate": 5.617437722419929e-05, "loss": 1.7342, "step": 8093 }, { "epoch": 3.5973333333333333, "grad_norm": 3.729170322418213, "learning_rate": 5.615658362989324e-05, "loss": 0.959, "step": 8094 }, { "epoch": 3.597777777777778, "grad_norm": 3.9978530406951904, "learning_rate": 5.61387900355872e-05, "loss": 1.2262, "step": 8095 }, { "epoch": 3.598222222222222, "grad_norm": 3.4558112621307373, "learning_rate": 5.6120996441281136e-05, "loss": 1.0928, "step": 8096 }, { "epoch": 3.5986666666666665, "grad_norm": 3.660371780395508, "learning_rate": 5.610320284697509e-05, "loss": 1.1768, "step": 8097 }, { "epoch": 3.599111111111111, "grad_norm": 5.200618743896484, "learning_rate": 5.608540925266904e-05, "loss": 1.0061, "step": 8098 }, { "epoch": 3.5995555555555554, "grad_norm": 4.098397731781006, "learning_rate": 5.606761565836299e-05, "loss": 1.1386, "step": 8099 }, { "epoch": 3.6, "grad_norm": 2.963202714920044, "learning_rate": 5.604982206405695e-05, "loss": 0.4808, "step": 8100 }, { "epoch": 3.6004444444444443, "grad_norm": 1.6309106349945068, "learning_rate": 5.60320284697509e-05, "loss": 0.8084, "step": 8101 }, { "epoch": 3.600888888888889, "grad_norm": 3.205033779144287, "learning_rate": 5.601423487544484e-05, "loss": 1.2626, "step": 8102 }, { "epoch": 3.6013333333333333, "grad_norm": 3.2114498615264893, "learning_rate": 5.599644128113879e-05, "loss": 1.5981, "step": 8103 }, { "epoch": 3.6017777777777775, "grad_norm": 3.0003700256347656, "learning_rate": 5.597864768683274e-05, "loss": 1.3144, "step": 8104 }, { "epoch": 3.602222222222222, "grad_norm": 3.0486159324645996, "learning_rate": 5.59608540925267e-05, "loss": 1.3489, "step": 8105 }, { "epoch": 3.602666666666667, "grad_norm": 2.8934648036956787, "learning_rate": 5.5943060498220646e-05, "loss": 1.316, "step": 8106 }, { "epoch": 3.603111111111111, "grad_norm": 3.1762866973876953, "learning_rate": 5.5925266903914596e-05, "loss": 1.259, "step": 8107 }, { "epoch": 3.6035555555555554, "grad_norm": 2.9977829456329346, "learning_rate": 5.5907473309608546e-05, "loss": 1.0371, "step": 8108 }, { "epoch": 3.604, "grad_norm": 3.3716068267822266, "learning_rate": 5.588967971530249e-05, "loss": 1.4329, "step": 8109 }, { "epoch": 3.6044444444444443, "grad_norm": 3.4693422317504883, "learning_rate": 5.587188612099644e-05, "loss": 1.3105, "step": 8110 }, { "epoch": 3.604888888888889, "grad_norm": 3.0525338649749756, "learning_rate": 5.5854092526690395e-05, "loss": 1.3395, "step": 8111 }, { "epoch": 3.6053333333333333, "grad_norm": 3.2203667163848877, "learning_rate": 5.5836298932384345e-05, "loss": 1.4343, "step": 8112 }, { "epoch": 3.605777777777778, "grad_norm": 3.3371267318725586, "learning_rate": 5.5818505338078294e-05, "loss": 1.3877, "step": 8113 }, { "epoch": 3.606222222222222, "grad_norm": 4.093338489532471, "learning_rate": 5.580071174377225e-05, "loss": 1.558, "step": 8114 }, { "epoch": 3.6066666666666665, "grad_norm": 3.266066312789917, "learning_rate": 5.578291814946619e-05, "loss": 0.9591, "step": 8115 }, { "epoch": 3.607111111111111, "grad_norm": 3.600327730178833, "learning_rate": 5.5765124555160144e-05, "loss": 1.4223, "step": 8116 }, { "epoch": 3.6075555555555554, "grad_norm": 3.469787359237671, "learning_rate": 5.574733096085409e-05, "loss": 1.1383, "step": 8117 }, { "epoch": 3.608, "grad_norm": 3.263157844543457, "learning_rate": 5.572953736654804e-05, "loss": 1.5695, "step": 8118 }, { "epoch": 3.6084444444444443, "grad_norm": 2.616588830947876, "learning_rate": 5.5711743772242e-05, "loss": 0.9983, "step": 8119 }, { "epoch": 3.608888888888889, "grad_norm": 3.373994827270508, "learning_rate": 5.569395017793595e-05, "loss": 1.3782, "step": 8120 }, { "epoch": 3.6093333333333333, "grad_norm": 3.624722957611084, "learning_rate": 5.567615658362989e-05, "loss": 1.3283, "step": 8121 }, { "epoch": 3.6097777777777775, "grad_norm": 3.547624349594116, "learning_rate": 5.565836298932384e-05, "loss": 0.9186, "step": 8122 }, { "epoch": 3.610222222222222, "grad_norm": 3.4890434741973877, "learning_rate": 5.564056939501779e-05, "loss": 1.514, "step": 8123 }, { "epoch": 3.610666666666667, "grad_norm": 3.2042863368988037, "learning_rate": 5.562277580071175e-05, "loss": 1.2029, "step": 8124 }, { "epoch": 3.611111111111111, "grad_norm": 3.498734712600708, "learning_rate": 5.56049822064057e-05, "loss": 1.2113, "step": 8125 }, { "epoch": 3.6115555555555554, "grad_norm": 3.9023139476776123, "learning_rate": 5.558718861209965e-05, "loss": 1.1575, "step": 8126 }, { "epoch": 3.612, "grad_norm": 3.4858694076538086, "learning_rate": 5.5569395017793604e-05, "loss": 1.3176, "step": 8127 }, { "epoch": 3.6124444444444443, "grad_norm": 3.3926501274108887, "learning_rate": 5.555160142348754e-05, "loss": 1.52, "step": 8128 }, { "epoch": 3.612888888888889, "grad_norm": 3.7569804191589355, "learning_rate": 5.55338078291815e-05, "loss": 1.2774, "step": 8129 }, { "epoch": 3.6133333333333333, "grad_norm": 3.865650177001953, "learning_rate": 5.5516014234875446e-05, "loss": 1.4652, "step": 8130 }, { "epoch": 3.613777777777778, "grad_norm": 3.499619483947754, "learning_rate": 5.5498220640569396e-05, "loss": 1.3073, "step": 8131 }, { "epoch": 3.6142222222222222, "grad_norm": 3.366881847381592, "learning_rate": 5.548042704626335e-05, "loss": 1.1339, "step": 8132 }, { "epoch": 3.6146666666666665, "grad_norm": 3.8024709224700928, "learning_rate": 5.54626334519573e-05, "loss": 1.4588, "step": 8133 }, { "epoch": 3.615111111111111, "grad_norm": 4.1196160316467285, "learning_rate": 5.5444839857651245e-05, "loss": 1.2886, "step": 8134 }, { "epoch": 3.6155555555555554, "grad_norm": 3.782890796661377, "learning_rate": 5.5427046263345195e-05, "loss": 1.1498, "step": 8135 }, { "epoch": 3.616, "grad_norm": 3.4527840614318848, "learning_rate": 5.5409252669039145e-05, "loss": 1.4733, "step": 8136 }, { "epoch": 3.6164444444444444, "grad_norm": 3.2366058826446533, "learning_rate": 5.53914590747331e-05, "loss": 1.0925, "step": 8137 }, { "epoch": 3.616888888888889, "grad_norm": 2.8542094230651855, "learning_rate": 5.537366548042705e-05, "loss": 0.7504, "step": 8138 }, { "epoch": 3.6173333333333333, "grad_norm": 3.7491934299468994, "learning_rate": 5.5355871886121e-05, "loss": 1.4432, "step": 8139 }, { "epoch": 3.6177777777777775, "grad_norm": 3.4385619163513184, "learning_rate": 5.533807829181496e-05, "loss": 0.8893, "step": 8140 }, { "epoch": 3.6182222222222222, "grad_norm": 4.420047283172607, "learning_rate": 5.5320284697508893e-05, "loss": 1.8875, "step": 8141 }, { "epoch": 3.618666666666667, "grad_norm": 4.307175636291504, "learning_rate": 5.530249110320285e-05, "loss": 1.1576, "step": 8142 }, { "epoch": 3.619111111111111, "grad_norm": 3.8173797130584717, "learning_rate": 5.52846975088968e-05, "loss": 1.5327, "step": 8143 }, { "epoch": 3.6195555555555554, "grad_norm": 4.208653926849365, "learning_rate": 5.526690391459075e-05, "loss": 0.8899, "step": 8144 }, { "epoch": 3.62, "grad_norm": 4.356147766113281, "learning_rate": 5.5249110320284706e-05, "loss": 1.4298, "step": 8145 }, { "epoch": 3.6204444444444444, "grad_norm": 3.923570394515991, "learning_rate": 5.5231316725978656e-05, "loss": 1.0489, "step": 8146 }, { "epoch": 3.620888888888889, "grad_norm": 5.018566608428955, "learning_rate": 5.52135231316726e-05, "loss": 1.1978, "step": 8147 }, { "epoch": 3.6213333333333333, "grad_norm": 3.8071272373199463, "learning_rate": 5.519572953736655e-05, "loss": 0.5866, "step": 8148 }, { "epoch": 3.621777777777778, "grad_norm": 5.471420764923096, "learning_rate": 5.51779359430605e-05, "loss": 1.0767, "step": 8149 }, { "epoch": 3.6222222222222222, "grad_norm": 4.0738606452941895, "learning_rate": 5.5160142348754454e-05, "loss": 0.5901, "step": 8150 }, { "epoch": 3.6226666666666665, "grad_norm": 2.3456101417541504, "learning_rate": 5.5142348754448404e-05, "loss": 1.7003, "step": 8151 }, { "epoch": 3.623111111111111, "grad_norm": 2.777620315551758, "learning_rate": 5.5124555160142354e-05, "loss": 1.9537, "step": 8152 }, { "epoch": 3.6235555555555554, "grad_norm": 2.4514338970184326, "learning_rate": 5.5106761565836304e-05, "loss": 1.5011, "step": 8153 }, { "epoch": 3.624, "grad_norm": 2.7258999347686768, "learning_rate": 5.5088967971530247e-05, "loss": 1.7209, "step": 8154 }, { "epoch": 3.6244444444444444, "grad_norm": 2.463881492614746, "learning_rate": 5.5071174377224196e-05, "loss": 1.3477, "step": 8155 }, { "epoch": 3.624888888888889, "grad_norm": 2.6786627769470215, "learning_rate": 5.505338078291815e-05, "loss": 1.4005, "step": 8156 }, { "epoch": 3.6253333333333333, "grad_norm": 3.139360189437866, "learning_rate": 5.50355871886121e-05, "loss": 1.9946, "step": 8157 }, { "epoch": 3.6257777777777775, "grad_norm": 2.5752453804016113, "learning_rate": 5.501779359430605e-05, "loss": 1.3608, "step": 8158 }, { "epoch": 3.6262222222222222, "grad_norm": 3.0296263694763184, "learning_rate": 5.500000000000001e-05, "loss": 1.5828, "step": 8159 }, { "epoch": 3.626666666666667, "grad_norm": 3.2608470916748047, "learning_rate": 5.4982206405693945e-05, "loss": 1.9236, "step": 8160 }, { "epoch": 3.627111111111111, "grad_norm": 2.6645302772521973, "learning_rate": 5.49644128113879e-05, "loss": 1.4359, "step": 8161 }, { "epoch": 3.6275555555555554, "grad_norm": 3.320736885070801, "learning_rate": 5.494661921708185e-05, "loss": 1.5185, "step": 8162 }, { "epoch": 3.628, "grad_norm": 2.807404041290283, "learning_rate": 5.49288256227758e-05, "loss": 1.4373, "step": 8163 }, { "epoch": 3.6284444444444444, "grad_norm": 2.9995763301849365, "learning_rate": 5.491103202846976e-05, "loss": 1.4947, "step": 8164 }, { "epoch": 3.628888888888889, "grad_norm": 2.971586227416992, "learning_rate": 5.489323843416371e-05, "loss": 1.1496, "step": 8165 }, { "epoch": 3.6293333333333333, "grad_norm": 3.100977659225464, "learning_rate": 5.487544483985766e-05, "loss": 1.1148, "step": 8166 }, { "epoch": 3.629777777777778, "grad_norm": 3.1417391300201416, "learning_rate": 5.48576512455516e-05, "loss": 1.6336, "step": 8167 }, { "epoch": 3.6302222222222222, "grad_norm": 3.098752975463867, "learning_rate": 5.483985765124555e-05, "loss": 1.2825, "step": 8168 }, { "epoch": 3.6306666666666665, "grad_norm": 3.1631276607513428, "learning_rate": 5.4822064056939506e-05, "loss": 1.8255, "step": 8169 }, { "epoch": 3.631111111111111, "grad_norm": 3.218193769454956, "learning_rate": 5.4804270462633456e-05, "loss": 1.6269, "step": 8170 }, { "epoch": 3.6315555555555554, "grad_norm": 3.31066632270813, "learning_rate": 5.4786476868327405e-05, "loss": 1.3839, "step": 8171 }, { "epoch": 3.632, "grad_norm": 4.438895225524902, "learning_rate": 5.476868327402136e-05, "loss": 1.9307, "step": 8172 }, { "epoch": 3.6324444444444444, "grad_norm": 3.1939444541931152, "learning_rate": 5.47508896797153e-05, "loss": 1.3621, "step": 8173 }, { "epoch": 3.632888888888889, "grad_norm": 2.8579087257385254, "learning_rate": 5.4733096085409255e-05, "loss": 0.8286, "step": 8174 }, { "epoch": 3.6333333333333333, "grad_norm": 3.163404941558838, "learning_rate": 5.4715302491103204e-05, "loss": 1.3634, "step": 8175 }, { "epoch": 3.6337777777777776, "grad_norm": 3.0559070110321045, "learning_rate": 5.4697508896797154e-05, "loss": 1.2169, "step": 8176 }, { "epoch": 3.6342222222222222, "grad_norm": 3.1324009895324707, "learning_rate": 5.467971530249111e-05, "loss": 1.1629, "step": 8177 }, { "epoch": 3.634666666666667, "grad_norm": 2.9012513160705566, "learning_rate": 5.466192170818506e-05, "loss": 0.8936, "step": 8178 }, { "epoch": 3.635111111111111, "grad_norm": 3.157163381576538, "learning_rate": 5.4644128113879e-05, "loss": 1.2233, "step": 8179 }, { "epoch": 3.6355555555555554, "grad_norm": 3.3618364334106445, "learning_rate": 5.462633451957295e-05, "loss": 1.3142, "step": 8180 }, { "epoch": 3.636, "grad_norm": 3.3610551357269287, "learning_rate": 5.46085409252669e-05, "loss": 0.9224, "step": 8181 }, { "epoch": 3.6364444444444444, "grad_norm": 3.8400661945343018, "learning_rate": 5.459074733096086e-05, "loss": 1.2544, "step": 8182 }, { "epoch": 3.6368888888888886, "grad_norm": 3.3344969749450684, "learning_rate": 5.457295373665481e-05, "loss": 1.2641, "step": 8183 }, { "epoch": 3.6373333333333333, "grad_norm": 3.121795415878296, "learning_rate": 5.455516014234876e-05, "loss": 1.1398, "step": 8184 }, { "epoch": 3.637777777777778, "grad_norm": 4.248197555541992, "learning_rate": 5.4537366548042715e-05, "loss": 1.0294, "step": 8185 }, { "epoch": 3.6382222222222222, "grad_norm": 3.8400375843048096, "learning_rate": 5.451957295373665e-05, "loss": 1.2639, "step": 8186 }, { "epoch": 3.6386666666666665, "grad_norm": 3.7360832691192627, "learning_rate": 5.450177935943061e-05, "loss": 0.9932, "step": 8187 }, { "epoch": 3.639111111111111, "grad_norm": 3.7453458309173584, "learning_rate": 5.448398576512456e-05, "loss": 1.2704, "step": 8188 }, { "epoch": 3.6395555555555554, "grad_norm": 3.8645150661468506, "learning_rate": 5.446619217081851e-05, "loss": 0.682, "step": 8189 }, { "epoch": 3.64, "grad_norm": 3.455167055130005, "learning_rate": 5.4448398576512464e-05, "loss": 1.3446, "step": 8190 }, { "epoch": 3.6404444444444444, "grad_norm": 3.7151405811309814, "learning_rate": 5.443060498220641e-05, "loss": 1.0171, "step": 8191 }, { "epoch": 3.640888888888889, "grad_norm": 4.202325820922852, "learning_rate": 5.4412811387900356e-05, "loss": 1.26, "step": 8192 }, { "epoch": 3.6413333333333333, "grad_norm": 4.061198711395264, "learning_rate": 5.4395017793594306e-05, "loss": 0.9919, "step": 8193 }, { "epoch": 3.6417777777777776, "grad_norm": 4.6272501945495605, "learning_rate": 5.4377224199288256e-05, "loss": 1.2987, "step": 8194 }, { "epoch": 3.6422222222222222, "grad_norm": 4.520051956176758, "learning_rate": 5.4359430604982205e-05, "loss": 1.0514, "step": 8195 }, { "epoch": 3.642666666666667, "grad_norm": 3.6569225788116455, "learning_rate": 5.434163701067616e-05, "loss": 1.2133, "step": 8196 }, { "epoch": 3.643111111111111, "grad_norm": 4.312341213226318, "learning_rate": 5.432384341637011e-05, "loss": 1.1944, "step": 8197 }, { "epoch": 3.6435555555555554, "grad_norm": 4.46543025970459, "learning_rate": 5.430604982206406e-05, "loss": 1.3214, "step": 8198 }, { "epoch": 3.644, "grad_norm": 5.549198627471924, "learning_rate": 5.4288256227758004e-05, "loss": 0.8821, "step": 8199 }, { "epoch": 3.6444444444444444, "grad_norm": 4.581550121307373, "learning_rate": 5.4270462633451954e-05, "loss": 1.044, "step": 8200 }, { "epoch": 3.6448888888888886, "grad_norm": 2.513296127319336, "learning_rate": 5.425266903914591e-05, "loss": 0.9842, "step": 8201 }, { "epoch": 3.6453333333333333, "grad_norm": 2.9860239028930664, "learning_rate": 5.423487544483986e-05, "loss": 2.2302, "step": 8202 }, { "epoch": 3.645777777777778, "grad_norm": 2.648104429244995, "learning_rate": 5.421708185053381e-05, "loss": 1.7132, "step": 8203 }, { "epoch": 3.6462222222222223, "grad_norm": 2.892163038253784, "learning_rate": 5.4199288256227767e-05, "loss": 1.3049, "step": 8204 }, { "epoch": 3.6466666666666665, "grad_norm": 3.270510196685791, "learning_rate": 5.41814946619217e-05, "loss": 1.8626, "step": 8205 }, { "epoch": 3.647111111111111, "grad_norm": 3.1879770755767822, "learning_rate": 5.416370106761566e-05, "loss": 1.513, "step": 8206 }, { "epoch": 3.6475555555555554, "grad_norm": 2.4823553562164307, "learning_rate": 5.414590747330961e-05, "loss": 0.8951, "step": 8207 }, { "epoch": 3.648, "grad_norm": 2.580904722213745, "learning_rate": 5.412811387900356e-05, "loss": 1.0414, "step": 8208 }, { "epoch": 3.6484444444444444, "grad_norm": 3.6623318195343018, "learning_rate": 5.4110320284697515e-05, "loss": 1.6687, "step": 8209 }, { "epoch": 3.648888888888889, "grad_norm": 3.58868408203125, "learning_rate": 5.4092526690391465e-05, "loss": 1.4268, "step": 8210 }, { "epoch": 3.6493333333333333, "grad_norm": 3.2081754207611084, "learning_rate": 5.4074733096085415e-05, "loss": 1.6204, "step": 8211 }, { "epoch": 3.6497777777777776, "grad_norm": 3.3098785877227783, "learning_rate": 5.405693950177936e-05, "loss": 1.1347, "step": 8212 }, { "epoch": 3.6502222222222223, "grad_norm": 3.5921061038970947, "learning_rate": 5.403914590747331e-05, "loss": 1.5662, "step": 8213 }, { "epoch": 3.6506666666666665, "grad_norm": 3.2691173553466797, "learning_rate": 5.4021352313167264e-05, "loss": 1.2848, "step": 8214 }, { "epoch": 3.651111111111111, "grad_norm": 3.106048583984375, "learning_rate": 5.4003558718861213e-05, "loss": 0.8942, "step": 8215 }, { "epoch": 3.6515555555555554, "grad_norm": 6.022765636444092, "learning_rate": 5.398576512455516e-05, "loss": 1.1837, "step": 8216 }, { "epoch": 3.652, "grad_norm": 3.3533272743225098, "learning_rate": 5.396797153024912e-05, "loss": 1.3667, "step": 8217 }, { "epoch": 3.6524444444444444, "grad_norm": 3.1057679653167725, "learning_rate": 5.3950177935943056e-05, "loss": 1.3119, "step": 8218 }, { "epoch": 3.6528888888888886, "grad_norm": 2.8598716259002686, "learning_rate": 5.393238434163701e-05, "loss": 1.0251, "step": 8219 }, { "epoch": 3.6533333333333333, "grad_norm": 3.3327109813690186, "learning_rate": 5.391459074733096e-05, "loss": 1.1523, "step": 8220 }, { "epoch": 3.653777777777778, "grad_norm": 2.8015899658203125, "learning_rate": 5.389679715302491e-05, "loss": 1.087, "step": 8221 }, { "epoch": 3.6542222222222223, "grad_norm": 3.4534077644348145, "learning_rate": 5.387900355871887e-05, "loss": 1.1916, "step": 8222 }, { "epoch": 3.6546666666666665, "grad_norm": 2.8814008235931396, "learning_rate": 5.386120996441282e-05, "loss": 1.1617, "step": 8223 }, { "epoch": 3.655111111111111, "grad_norm": 3.4058141708374023, "learning_rate": 5.384341637010677e-05, "loss": 1.2716, "step": 8224 }, { "epoch": 3.6555555555555554, "grad_norm": 3.901968479156494, "learning_rate": 5.382562277580071e-05, "loss": 1.5236, "step": 8225 }, { "epoch": 3.656, "grad_norm": 2.169243097305298, "learning_rate": 5.380782918149466e-05, "loss": 0.5998, "step": 8226 }, { "epoch": 3.6564444444444444, "grad_norm": 3.659686803817749, "learning_rate": 5.379003558718862e-05, "loss": 1.4257, "step": 8227 }, { "epoch": 3.656888888888889, "grad_norm": 3.8471503257751465, "learning_rate": 5.3772241992882567e-05, "loss": 1.3645, "step": 8228 }, { "epoch": 3.6573333333333333, "grad_norm": 2.410559892654419, "learning_rate": 5.3754448398576516e-05, "loss": 0.6455, "step": 8229 }, { "epoch": 3.6577777777777776, "grad_norm": 3.5442912578582764, "learning_rate": 5.373665480427047e-05, "loss": 1.3689, "step": 8230 }, { "epoch": 3.6582222222222223, "grad_norm": 3.4644775390625, "learning_rate": 5.371886120996441e-05, "loss": 1.197, "step": 8231 }, { "epoch": 3.6586666666666665, "grad_norm": 3.7939751148223877, "learning_rate": 5.3701067615658365e-05, "loss": 1.1387, "step": 8232 }, { "epoch": 3.659111111111111, "grad_norm": 3.8991305828094482, "learning_rate": 5.3683274021352315e-05, "loss": 1.1945, "step": 8233 }, { "epoch": 3.6595555555555555, "grad_norm": 2.8764808177948, "learning_rate": 5.3665480427046265e-05, "loss": 0.8908, "step": 8234 }, { "epoch": 3.66, "grad_norm": 3.422370433807373, "learning_rate": 5.364768683274022e-05, "loss": 1.0134, "step": 8235 }, { "epoch": 3.6604444444444444, "grad_norm": 4.0114874839782715, "learning_rate": 5.362989323843417e-05, "loss": 1.0199, "step": 8236 }, { "epoch": 3.6608888888888886, "grad_norm": 4.0204362869262695, "learning_rate": 5.3612099644128114e-05, "loss": 1.447, "step": 8237 }, { "epoch": 3.6613333333333333, "grad_norm": 3.3613667488098145, "learning_rate": 5.3594306049822064e-05, "loss": 0.8356, "step": 8238 }, { "epoch": 3.661777777777778, "grad_norm": 5.9143571853637695, "learning_rate": 5.3576512455516014e-05, "loss": 1.2582, "step": 8239 }, { "epoch": 3.6622222222222223, "grad_norm": 3.864518404006958, "learning_rate": 5.355871886120996e-05, "loss": 1.2546, "step": 8240 }, { "epoch": 3.6626666666666665, "grad_norm": 3.446545362472534, "learning_rate": 5.354092526690392e-05, "loss": 0.9876, "step": 8241 }, { "epoch": 3.663111111111111, "grad_norm": 4.813685417175293, "learning_rate": 5.352313167259787e-05, "loss": 1.1929, "step": 8242 }, { "epoch": 3.6635555555555555, "grad_norm": 4.656696319580078, "learning_rate": 5.350533807829182e-05, "loss": 1.0747, "step": 8243 }, { "epoch": 3.664, "grad_norm": 4.323572158813477, "learning_rate": 5.348754448398576e-05, "loss": 1.2275, "step": 8244 }, { "epoch": 3.6644444444444444, "grad_norm": 3.6625609397888184, "learning_rate": 5.346975088967971e-05, "loss": 1.0316, "step": 8245 }, { "epoch": 3.664888888888889, "grad_norm": 4.840065002441406, "learning_rate": 5.345195729537367e-05, "loss": 1.3223, "step": 8246 }, { "epoch": 3.6653333333333333, "grad_norm": 4.664702415466309, "learning_rate": 5.343416370106762e-05, "loss": 1.4522, "step": 8247 }, { "epoch": 3.6657777777777776, "grad_norm": 4.857150077819824, "learning_rate": 5.341637010676157e-05, "loss": 1.0819, "step": 8248 }, { "epoch": 3.6662222222222223, "grad_norm": 10.743447303771973, "learning_rate": 5.3398576512455524e-05, "loss": 1.3214, "step": 8249 }, { "epoch": 3.6666666666666665, "grad_norm": 4.244997024536133, "learning_rate": 5.338078291814946e-05, "loss": 0.3447, "step": 8250 }, { "epoch": 3.667111111111111, "grad_norm": 2.1878910064697266, "learning_rate": 5.336298932384342e-05, "loss": 1.7063, "step": 8251 }, { "epoch": 3.6675555555555555, "grad_norm": 2.090615749359131, "learning_rate": 5.334519572953737e-05, "loss": 0.9182, "step": 8252 }, { "epoch": 3.668, "grad_norm": 2.9112021923065186, "learning_rate": 5.3327402135231316e-05, "loss": 1.575, "step": 8253 }, { "epoch": 3.6684444444444444, "grad_norm": 3.180088996887207, "learning_rate": 5.330960854092527e-05, "loss": 1.6393, "step": 8254 }, { "epoch": 3.6688888888888886, "grad_norm": 3.059612512588501, "learning_rate": 5.329181494661922e-05, "loss": 1.631, "step": 8255 }, { "epoch": 3.6693333333333333, "grad_norm": 3.003873825073242, "learning_rate": 5.327402135231317e-05, "loss": 1.4932, "step": 8256 }, { "epoch": 3.669777777777778, "grad_norm": 3.1199257373809814, "learning_rate": 5.3256227758007115e-05, "loss": 1.5315, "step": 8257 }, { "epoch": 3.6702222222222223, "grad_norm": 2.0866105556488037, "learning_rate": 5.3238434163701065e-05, "loss": 0.8337, "step": 8258 }, { "epoch": 3.6706666666666665, "grad_norm": 3.190763473510742, "learning_rate": 5.322064056939502e-05, "loss": 1.4458, "step": 8259 }, { "epoch": 3.671111111111111, "grad_norm": 3.141622304916382, "learning_rate": 5.320284697508897e-05, "loss": 1.4373, "step": 8260 }, { "epoch": 3.6715555555555555, "grad_norm": 3.208282232284546, "learning_rate": 5.318505338078292e-05, "loss": 1.2228, "step": 8261 }, { "epoch": 3.672, "grad_norm": 3.1854054927825928, "learning_rate": 5.316725978647688e-05, "loss": 1.1494, "step": 8262 }, { "epoch": 3.6724444444444444, "grad_norm": 3.755186080932617, "learning_rate": 5.3149466192170814e-05, "loss": 1.5219, "step": 8263 }, { "epoch": 3.672888888888889, "grad_norm": 3.6373298168182373, "learning_rate": 5.313167259786477e-05, "loss": 1.2791, "step": 8264 }, { "epoch": 3.6733333333333333, "grad_norm": 3.869194984436035, "learning_rate": 5.311387900355872e-05, "loss": 1.854, "step": 8265 }, { "epoch": 3.6737777777777776, "grad_norm": 3.1836750507354736, "learning_rate": 5.309608540925267e-05, "loss": 1.3653, "step": 8266 }, { "epoch": 3.6742222222222223, "grad_norm": 3.2657806873321533, "learning_rate": 5.3078291814946626e-05, "loss": 1.5467, "step": 8267 }, { "epoch": 3.6746666666666665, "grad_norm": 1.893934726715088, "learning_rate": 5.3060498220640576e-05, "loss": 0.7488, "step": 8268 }, { "epoch": 3.675111111111111, "grad_norm": 3.510976552963257, "learning_rate": 5.3042704626334526e-05, "loss": 1.148, "step": 8269 }, { "epoch": 3.6755555555555555, "grad_norm": 2.769212007522583, "learning_rate": 5.302491103202847e-05, "loss": 0.9423, "step": 8270 }, { "epoch": 3.676, "grad_norm": 3.1665525436401367, "learning_rate": 5.300711743772242e-05, "loss": 1.9088, "step": 8271 }, { "epoch": 3.6764444444444444, "grad_norm": 3.130796432495117, "learning_rate": 5.2989323843416375e-05, "loss": 1.2455, "step": 8272 }, { "epoch": 3.6768888888888887, "grad_norm": 3.2460105419158936, "learning_rate": 5.2971530249110324e-05, "loss": 1.2274, "step": 8273 }, { "epoch": 3.6773333333333333, "grad_norm": 3.6489906311035156, "learning_rate": 5.2953736654804274e-05, "loss": 1.3301, "step": 8274 }, { "epoch": 3.677777777777778, "grad_norm": 3.5762670040130615, "learning_rate": 5.293594306049823e-05, "loss": 1.9313, "step": 8275 }, { "epoch": 3.6782222222222223, "grad_norm": 3.458677053451538, "learning_rate": 5.291814946619217e-05, "loss": 1.0342, "step": 8276 }, { "epoch": 3.6786666666666665, "grad_norm": 3.369166612625122, "learning_rate": 5.290035587188612e-05, "loss": 1.7108, "step": 8277 }, { "epoch": 3.679111111111111, "grad_norm": 2.5164918899536133, "learning_rate": 5.288256227758007e-05, "loss": 0.6384, "step": 8278 }, { "epoch": 3.6795555555555555, "grad_norm": 4.323822975158691, "learning_rate": 5.286476868327402e-05, "loss": 1.6491, "step": 8279 }, { "epoch": 3.68, "grad_norm": 3.677583932876587, "learning_rate": 5.284697508896798e-05, "loss": 0.9492, "step": 8280 }, { "epoch": 3.6804444444444444, "grad_norm": 4.226683616638184, "learning_rate": 5.282918149466193e-05, "loss": 1.3614, "step": 8281 }, { "epoch": 3.680888888888889, "grad_norm": 3.287234306335449, "learning_rate": 5.281138790035588e-05, "loss": 1.3625, "step": 8282 }, { "epoch": 3.6813333333333333, "grad_norm": 3.5409836769104004, "learning_rate": 5.279359430604982e-05, "loss": 1.4074, "step": 8283 }, { "epoch": 3.6817777777777776, "grad_norm": 3.3012094497680664, "learning_rate": 5.277580071174377e-05, "loss": 1.1894, "step": 8284 }, { "epoch": 3.6822222222222223, "grad_norm": 3.455919027328491, "learning_rate": 5.275800711743772e-05, "loss": 1.3859, "step": 8285 }, { "epoch": 3.6826666666666665, "grad_norm": 3.6948232650756836, "learning_rate": 5.274021352313168e-05, "loss": 1.1195, "step": 8286 }, { "epoch": 3.6831111111111112, "grad_norm": 3.614387273788452, "learning_rate": 5.272241992882563e-05, "loss": 1.3584, "step": 8287 }, { "epoch": 3.6835555555555555, "grad_norm": 3.2227156162261963, "learning_rate": 5.270462633451958e-05, "loss": 0.3771, "step": 8288 }, { "epoch": 3.684, "grad_norm": 2.9594337940216064, "learning_rate": 5.268683274021352e-05, "loss": 1.0337, "step": 8289 }, { "epoch": 3.6844444444444444, "grad_norm": 3.800976276397705, "learning_rate": 5.266903914590747e-05, "loss": 0.7237, "step": 8290 }, { "epoch": 3.6848888888888887, "grad_norm": 4.719809055328369, "learning_rate": 5.2651245551601426e-05, "loss": 1.2393, "step": 8291 }, { "epoch": 3.6853333333333333, "grad_norm": 4.721984386444092, "learning_rate": 5.2633451957295376e-05, "loss": 1.2577, "step": 8292 }, { "epoch": 3.685777777777778, "grad_norm": 3.1835551261901855, "learning_rate": 5.2615658362989326e-05, "loss": 0.9904, "step": 8293 }, { "epoch": 3.6862222222222223, "grad_norm": 4.421992301940918, "learning_rate": 5.259786476868328e-05, "loss": 1.4825, "step": 8294 }, { "epoch": 3.6866666666666665, "grad_norm": 4.140617370605469, "learning_rate": 5.258007117437722e-05, "loss": 1.1782, "step": 8295 }, { "epoch": 3.6871111111111112, "grad_norm": 3.6903188228607178, "learning_rate": 5.2562277580071175e-05, "loss": 1.5041, "step": 8296 }, { "epoch": 3.6875555555555555, "grad_norm": 3.7069318294525146, "learning_rate": 5.2544483985765124e-05, "loss": 1.1797, "step": 8297 }, { "epoch": 3.6879999999999997, "grad_norm": 4.413390636444092, "learning_rate": 5.2526690391459074e-05, "loss": 0.9219, "step": 8298 }, { "epoch": 3.6884444444444444, "grad_norm": 5.5441508293151855, "learning_rate": 5.250889679715303e-05, "loss": 1.0509, "step": 8299 }, { "epoch": 3.688888888888889, "grad_norm": 0.8163884282112122, "learning_rate": 5.249110320284698e-05, "loss": 0.0719, "step": 8300 }, { "epoch": 3.6893333333333334, "grad_norm": 1.9359767436981201, "learning_rate": 5.247330960854093e-05, "loss": 0.8754, "step": 8301 }, { "epoch": 3.6897777777777776, "grad_norm": 2.754122495651245, "learning_rate": 5.245551601423487e-05, "loss": 1.6614, "step": 8302 }, { "epoch": 3.6902222222222223, "grad_norm": 2.6773335933685303, "learning_rate": 5.243772241992882e-05, "loss": 1.7607, "step": 8303 }, { "epoch": 3.6906666666666665, "grad_norm": 2.679157257080078, "learning_rate": 5.241992882562278e-05, "loss": 1.3, "step": 8304 }, { "epoch": 3.6911111111111112, "grad_norm": 2.7156100273132324, "learning_rate": 5.240213523131673e-05, "loss": 1.5878, "step": 8305 }, { "epoch": 3.6915555555555555, "grad_norm": 3.0638999938964844, "learning_rate": 5.238434163701068e-05, "loss": 2.0461, "step": 8306 }, { "epoch": 3.692, "grad_norm": 2.997682571411133, "learning_rate": 5.2366548042704635e-05, "loss": 1.6546, "step": 8307 }, { "epoch": 3.6924444444444444, "grad_norm": 3.044121265411377, "learning_rate": 5.234875444839857e-05, "loss": 1.6228, "step": 8308 }, { "epoch": 3.6928888888888887, "grad_norm": 2.8701446056365967, "learning_rate": 5.233096085409253e-05, "loss": 1.7426, "step": 8309 }, { "epoch": 3.6933333333333334, "grad_norm": 3.174617052078247, "learning_rate": 5.231316725978648e-05, "loss": 1.5458, "step": 8310 }, { "epoch": 3.693777777777778, "grad_norm": 3.201929807662964, "learning_rate": 5.229537366548043e-05, "loss": 1.6514, "step": 8311 }, { "epoch": 3.6942222222222223, "grad_norm": 3.799694538116455, "learning_rate": 5.2277580071174384e-05, "loss": 1.6626, "step": 8312 }, { "epoch": 3.6946666666666665, "grad_norm": 3.5478615760803223, "learning_rate": 5.2259786476868334e-05, "loss": 1.3328, "step": 8313 }, { "epoch": 3.6951111111111112, "grad_norm": 3.048062562942505, "learning_rate": 5.224199288256228e-05, "loss": 1.4886, "step": 8314 }, { "epoch": 3.6955555555555555, "grad_norm": 3.5230629444122314, "learning_rate": 5.2224199288256226e-05, "loss": 1.2701, "step": 8315 }, { "epoch": 3.6959999999999997, "grad_norm": 3.156364917755127, "learning_rate": 5.2206405693950176e-05, "loss": 1.1824, "step": 8316 }, { "epoch": 3.6964444444444444, "grad_norm": 3.9449214935302734, "learning_rate": 5.218861209964413e-05, "loss": 1.2558, "step": 8317 }, { "epoch": 3.696888888888889, "grad_norm": 3.572063684463501, "learning_rate": 5.217081850533808e-05, "loss": 1.4913, "step": 8318 }, { "epoch": 3.6973333333333334, "grad_norm": 3.8648767471313477, "learning_rate": 5.215302491103203e-05, "loss": 1.6746, "step": 8319 }, { "epoch": 3.6977777777777776, "grad_norm": 3.326974391937256, "learning_rate": 5.213523131672599e-05, "loss": 1.213, "step": 8320 }, { "epoch": 3.6982222222222223, "grad_norm": 3.392534017562866, "learning_rate": 5.2117437722419925e-05, "loss": 1.272, "step": 8321 }, { "epoch": 3.6986666666666665, "grad_norm": 3.3309099674224854, "learning_rate": 5.209964412811388e-05, "loss": 1.559, "step": 8322 }, { "epoch": 3.6991111111111112, "grad_norm": 3.49074387550354, "learning_rate": 5.208185053380783e-05, "loss": 1.2227, "step": 8323 }, { "epoch": 3.6995555555555555, "grad_norm": 3.63338041305542, "learning_rate": 5.206405693950178e-05, "loss": 1.2891, "step": 8324 }, { "epoch": 3.7, "grad_norm": 3.5115768909454346, "learning_rate": 5.204626334519574e-05, "loss": 1.3273, "step": 8325 }, { "epoch": 3.7004444444444444, "grad_norm": 3.8019955158233643, "learning_rate": 5.202846975088969e-05, "loss": 1.0889, "step": 8326 }, { "epoch": 3.7008888888888887, "grad_norm": 3.4950296878814697, "learning_rate": 5.2010676156583636e-05, "loss": 1.379, "step": 8327 }, { "epoch": 3.7013333333333334, "grad_norm": 3.5437874794006348, "learning_rate": 5.199288256227758e-05, "loss": 1.4179, "step": 8328 }, { "epoch": 3.7017777777777776, "grad_norm": 3.734877109527588, "learning_rate": 5.197508896797153e-05, "loss": 1.0075, "step": 8329 }, { "epoch": 3.7022222222222223, "grad_norm": 2.877346992492676, "learning_rate": 5.195729537366548e-05, "loss": 1.1345, "step": 8330 }, { "epoch": 3.7026666666666666, "grad_norm": 3.68837571144104, "learning_rate": 5.1939501779359435e-05, "loss": 1.176, "step": 8331 }, { "epoch": 3.7031111111111112, "grad_norm": 3.2419826984405518, "learning_rate": 5.1921708185053385e-05, "loss": 0.9544, "step": 8332 }, { "epoch": 3.7035555555555555, "grad_norm": 3.9483325481414795, "learning_rate": 5.1903914590747335e-05, "loss": 1.0054, "step": 8333 }, { "epoch": 3.7039999999999997, "grad_norm": 3.8043711185455322, "learning_rate": 5.188612099644128e-05, "loss": 1.1455, "step": 8334 }, { "epoch": 3.7044444444444444, "grad_norm": 3.818793535232544, "learning_rate": 5.186832740213523e-05, "loss": 1.0459, "step": 8335 }, { "epoch": 3.704888888888889, "grad_norm": 3.5451431274414062, "learning_rate": 5.1850533807829184e-05, "loss": 1.2172, "step": 8336 }, { "epoch": 3.7053333333333334, "grad_norm": 3.9264607429504395, "learning_rate": 5.1832740213523134e-05, "loss": 1.5576, "step": 8337 }, { "epoch": 3.7057777777777776, "grad_norm": 3.3462579250335693, "learning_rate": 5.1814946619217083e-05, "loss": 1.1177, "step": 8338 }, { "epoch": 3.7062222222222223, "grad_norm": 3.5663697719573975, "learning_rate": 5.179715302491104e-05, "loss": 1.4554, "step": 8339 }, { "epoch": 3.7066666666666666, "grad_norm": 3.879847288131714, "learning_rate": 5.177935943060499e-05, "loss": 1.3961, "step": 8340 }, { "epoch": 3.7071111111111112, "grad_norm": 4.285848617553711, "learning_rate": 5.176156583629893e-05, "loss": 1.485, "step": 8341 }, { "epoch": 3.7075555555555555, "grad_norm": 4.327765464782715, "learning_rate": 5.174377224199288e-05, "loss": 1.439, "step": 8342 }, { "epoch": 3.708, "grad_norm": 3.3615176677703857, "learning_rate": 5.172597864768683e-05, "loss": 1.0735, "step": 8343 }, { "epoch": 3.7084444444444444, "grad_norm": 5.214451789855957, "learning_rate": 5.170818505338079e-05, "loss": 1.2868, "step": 8344 }, { "epoch": 3.7088888888888887, "grad_norm": 3.7910678386688232, "learning_rate": 5.169039145907474e-05, "loss": 1.2784, "step": 8345 }, { "epoch": 3.7093333333333334, "grad_norm": 3.738447666168213, "learning_rate": 5.167259786476869e-05, "loss": 1.1879, "step": 8346 }, { "epoch": 3.7097777777777776, "grad_norm": 4.8904643058776855, "learning_rate": 5.165480427046263e-05, "loss": 1.3139, "step": 8347 }, { "epoch": 3.7102222222222223, "grad_norm": 5.133594036102295, "learning_rate": 5.163701067615658e-05, "loss": 1.5525, "step": 8348 }, { "epoch": 3.7106666666666666, "grad_norm": 7.878839492797852, "learning_rate": 5.161921708185054e-05, "loss": 1.2252, "step": 8349 }, { "epoch": 3.7111111111111112, "grad_norm": 3.2023541927337646, "learning_rate": 5.160142348754449e-05, "loss": 1.0264, "step": 8350 }, { "epoch": 3.7115555555555555, "grad_norm": 2.548800230026245, "learning_rate": 5.1583629893238437e-05, "loss": 1.7163, "step": 8351 }, { "epoch": 3.7119999999999997, "grad_norm": 2.4168503284454346, "learning_rate": 5.156583629893239e-05, "loss": 1.3757, "step": 8352 }, { "epoch": 3.7124444444444444, "grad_norm": 2.8642630577087402, "learning_rate": 5.154804270462633e-05, "loss": 1.593, "step": 8353 }, { "epoch": 3.712888888888889, "grad_norm": 3.071199893951416, "learning_rate": 5.1530249110320286e-05, "loss": 1.929, "step": 8354 }, { "epoch": 3.7133333333333334, "grad_norm": 2.835141897201538, "learning_rate": 5.1512455516014235e-05, "loss": 1.5476, "step": 8355 }, { "epoch": 3.7137777777777776, "grad_norm": 3.1856870651245117, "learning_rate": 5.1494661921708185e-05, "loss": 1.4111, "step": 8356 }, { "epoch": 3.7142222222222223, "grad_norm": 3.279014825820923, "learning_rate": 5.147686832740214e-05, "loss": 1.6864, "step": 8357 }, { "epoch": 3.7146666666666666, "grad_norm": 2.251955270767212, "learning_rate": 5.145907473309609e-05, "loss": 0.8535, "step": 8358 }, { "epoch": 3.7151111111111113, "grad_norm": 3.4706077575683594, "learning_rate": 5.144128113879004e-05, "loss": 1.4023, "step": 8359 }, { "epoch": 3.7155555555555555, "grad_norm": 3.1662933826446533, "learning_rate": 5.1423487544483984e-05, "loss": 1.1449, "step": 8360 }, { "epoch": 3.716, "grad_norm": 3.3553576469421387, "learning_rate": 5.1405693950177934e-05, "loss": 1.1429, "step": 8361 }, { "epoch": 3.7164444444444444, "grad_norm": 4.185884952545166, "learning_rate": 5.138790035587189e-05, "loss": 1.6231, "step": 8362 }, { "epoch": 3.7168888888888887, "grad_norm": 3.4784839153289795, "learning_rate": 5.137010676156584e-05, "loss": 1.7953, "step": 8363 }, { "epoch": 3.7173333333333334, "grad_norm": 3.0565781593322754, "learning_rate": 5.135231316725979e-05, "loss": 1.5965, "step": 8364 }, { "epoch": 3.7177777777777776, "grad_norm": 3.630974054336548, "learning_rate": 5.1334519572953746e-05, "loss": 1.0787, "step": 8365 }, { "epoch": 3.7182222222222223, "grad_norm": 3.7152397632598877, "learning_rate": 5.131672597864768e-05, "loss": 1.783, "step": 8366 }, { "epoch": 3.7186666666666666, "grad_norm": 2.838580846786499, "learning_rate": 5.129893238434164e-05, "loss": 0.9311, "step": 8367 }, { "epoch": 3.7191111111111113, "grad_norm": 3.2233829498291016, "learning_rate": 5.128113879003559e-05, "loss": 1.3966, "step": 8368 }, { "epoch": 3.7195555555555555, "grad_norm": 3.2140471935272217, "learning_rate": 5.126334519572954e-05, "loss": 1.4011, "step": 8369 }, { "epoch": 3.7199999999999998, "grad_norm": 3.5495688915252686, "learning_rate": 5.1245551601423495e-05, "loss": 1.5965, "step": 8370 }, { "epoch": 3.7204444444444444, "grad_norm": 4.187902450561523, "learning_rate": 5.1227758007117445e-05, "loss": 1.4036, "step": 8371 }, { "epoch": 3.720888888888889, "grad_norm": 3.8422319889068604, "learning_rate": 5.1209964412811394e-05, "loss": 1.5535, "step": 8372 }, { "epoch": 3.7213333333333334, "grad_norm": 3.2021896839141846, "learning_rate": 5.119217081850534e-05, "loss": 1.4836, "step": 8373 }, { "epoch": 3.7217777777777776, "grad_norm": 3.2228951454162598, "learning_rate": 5.117437722419929e-05, "loss": 1.2058, "step": 8374 }, { "epoch": 3.7222222222222223, "grad_norm": 3.7099387645721436, "learning_rate": 5.115658362989324e-05, "loss": 1.2875, "step": 8375 }, { "epoch": 3.7226666666666666, "grad_norm": 2.979400157928467, "learning_rate": 5.113879003558719e-05, "loss": 1.1885, "step": 8376 }, { "epoch": 3.7231111111111113, "grad_norm": 3.7015509605407715, "learning_rate": 5.112099644128114e-05, "loss": 1.467, "step": 8377 }, { "epoch": 3.7235555555555555, "grad_norm": 4.023457050323486, "learning_rate": 5.110320284697509e-05, "loss": 1.5333, "step": 8378 }, { "epoch": 3.724, "grad_norm": 2.9902260303497314, "learning_rate": 5.1085409252669036e-05, "loss": 1.2314, "step": 8379 }, { "epoch": 3.7244444444444444, "grad_norm": 4.326292991638184, "learning_rate": 5.1067615658362985e-05, "loss": 1.9829, "step": 8380 }, { "epoch": 3.7248888888888887, "grad_norm": 3.4808270931243896, "learning_rate": 5.104982206405694e-05, "loss": 1.5185, "step": 8381 }, { "epoch": 3.7253333333333334, "grad_norm": 3.8214287757873535, "learning_rate": 5.103202846975089e-05, "loss": 1.6433, "step": 8382 }, { "epoch": 3.7257777777777776, "grad_norm": 3.6692192554473877, "learning_rate": 5.101423487544484e-05, "loss": 1.4233, "step": 8383 }, { "epoch": 3.7262222222222223, "grad_norm": 2.361046552658081, "learning_rate": 5.09964412811388e-05, "loss": 0.7065, "step": 8384 }, { "epoch": 3.7266666666666666, "grad_norm": 3.3511815071105957, "learning_rate": 5.097864768683275e-05, "loss": 1.0584, "step": 8385 }, { "epoch": 3.7271111111111113, "grad_norm": 3.851069211959839, "learning_rate": 5.096085409252669e-05, "loss": 1.5869, "step": 8386 }, { "epoch": 3.7275555555555555, "grad_norm": 3.2693023681640625, "learning_rate": 5.094306049822064e-05, "loss": 1.0885, "step": 8387 }, { "epoch": 3.7279999999999998, "grad_norm": 3.3839292526245117, "learning_rate": 5.092526690391459e-05, "loss": 1.2459, "step": 8388 }, { "epoch": 3.7284444444444444, "grad_norm": 4.055683135986328, "learning_rate": 5.0907473309608546e-05, "loss": 1.1619, "step": 8389 }, { "epoch": 3.728888888888889, "grad_norm": 4.578985691070557, "learning_rate": 5.0889679715302496e-05, "loss": 1.3901, "step": 8390 }, { "epoch": 3.7293333333333334, "grad_norm": 3.6490328311920166, "learning_rate": 5.0871886120996446e-05, "loss": 1.2829, "step": 8391 }, { "epoch": 3.7297777777777776, "grad_norm": 4.452314853668213, "learning_rate": 5.085409252669039e-05, "loss": 1.9056, "step": 8392 }, { "epoch": 3.7302222222222223, "grad_norm": 2.537201166152954, "learning_rate": 5.083629893238434e-05, "loss": 0.9342, "step": 8393 }, { "epoch": 3.7306666666666666, "grad_norm": 3.178445339202881, "learning_rate": 5.0818505338078295e-05, "loss": 1.0333, "step": 8394 }, { "epoch": 3.7311111111111113, "grad_norm": 3.890375852584839, "learning_rate": 5.0800711743772245e-05, "loss": 1.2319, "step": 8395 }, { "epoch": 3.7315555555555555, "grad_norm": 3.9328713417053223, "learning_rate": 5.0782918149466194e-05, "loss": 0.9856, "step": 8396 }, { "epoch": 3.732, "grad_norm": 4.24088191986084, "learning_rate": 5.076512455516015e-05, "loss": 1.2177, "step": 8397 }, { "epoch": 3.7324444444444445, "grad_norm": 3.290562629699707, "learning_rate": 5.07473309608541e-05, "loss": 0.7268, "step": 8398 }, { "epoch": 3.7328888888888887, "grad_norm": 3.8136353492736816, "learning_rate": 5.0729537366548043e-05, "loss": 0.3112, "step": 8399 }, { "epoch": 3.7333333333333334, "grad_norm": 4.7547454833984375, "learning_rate": 5.071174377224199e-05, "loss": 0.8184, "step": 8400 }, { "epoch": 3.7337777777777776, "grad_norm": 2.6292288303375244, "learning_rate": 5.069395017793594e-05, "loss": 1.9555, "step": 8401 }, { "epoch": 3.7342222222222223, "grad_norm": 1.9523777961730957, "learning_rate": 5.06761565836299e-05, "loss": 0.8654, "step": 8402 }, { "epoch": 3.7346666666666666, "grad_norm": 2.6126413345336914, "learning_rate": 5.065836298932385e-05, "loss": 1.7074, "step": 8403 }, { "epoch": 3.7351111111111113, "grad_norm": 2.737042188644409, "learning_rate": 5.06405693950178e-05, "loss": 1.7787, "step": 8404 }, { "epoch": 3.7355555555555555, "grad_norm": 3.067366600036621, "learning_rate": 5.062277580071174e-05, "loss": 1.4087, "step": 8405 }, { "epoch": 3.7359999999999998, "grad_norm": 3.5647099018096924, "learning_rate": 5.060498220640569e-05, "loss": 1.8064, "step": 8406 }, { "epoch": 3.7364444444444445, "grad_norm": 3.5738532543182373, "learning_rate": 5.058718861209965e-05, "loss": 2.0468, "step": 8407 }, { "epoch": 3.736888888888889, "grad_norm": 3.4049813747406006, "learning_rate": 5.05693950177936e-05, "loss": 1.33, "step": 8408 }, { "epoch": 3.7373333333333334, "grad_norm": 3.1586861610412598, "learning_rate": 5.055160142348755e-05, "loss": 1.7597, "step": 8409 }, { "epoch": 3.7377777777777776, "grad_norm": 2.9543251991271973, "learning_rate": 5.0533807829181504e-05, "loss": 1.3858, "step": 8410 }, { "epoch": 3.7382222222222223, "grad_norm": 3.1417078971862793, "learning_rate": 5.051601423487544e-05, "loss": 1.8369, "step": 8411 }, { "epoch": 3.7386666666666666, "grad_norm": 3.0961074829101562, "learning_rate": 5.04982206405694e-05, "loss": 1.4527, "step": 8412 }, { "epoch": 3.7391111111111113, "grad_norm": 3.145967483520508, "learning_rate": 5.0480427046263346e-05, "loss": 1.2984, "step": 8413 }, { "epoch": 3.7395555555555555, "grad_norm": 2.759549856185913, "learning_rate": 5.0462633451957296e-05, "loss": 0.936, "step": 8414 }, { "epoch": 3.74, "grad_norm": 2.0562026500701904, "learning_rate": 5.044483985765125e-05, "loss": 0.7822, "step": 8415 }, { "epoch": 3.7404444444444445, "grad_norm": 3.5271592140197754, "learning_rate": 5.04270462633452e-05, "loss": 1.9599, "step": 8416 }, { "epoch": 3.7408888888888887, "grad_norm": 4.152881145477295, "learning_rate": 5.040925266903915e-05, "loss": 1.5676, "step": 8417 }, { "epoch": 3.7413333333333334, "grad_norm": 3.7143032550811768, "learning_rate": 5.0391459074733095e-05, "loss": 1.4384, "step": 8418 }, { "epoch": 3.7417777777777776, "grad_norm": 3.6692559719085693, "learning_rate": 5.0373665480427045e-05, "loss": 1.0879, "step": 8419 }, { "epoch": 3.7422222222222223, "grad_norm": 2.8461532592773438, "learning_rate": 5.0355871886120994e-05, "loss": 1.2018, "step": 8420 }, { "epoch": 3.7426666666666666, "grad_norm": 3.5240817070007324, "learning_rate": 5.033807829181495e-05, "loss": 1.7161, "step": 8421 }, { "epoch": 3.7431111111111113, "grad_norm": 3.98225998878479, "learning_rate": 5.03202846975089e-05, "loss": 1.5325, "step": 8422 }, { "epoch": 3.7435555555555555, "grad_norm": 3.3189501762390137, "learning_rate": 5.030249110320285e-05, "loss": 1.6055, "step": 8423 }, { "epoch": 3.7439999999999998, "grad_norm": 3.6065752506256104, "learning_rate": 5.028469750889679e-05, "loss": 1.6128, "step": 8424 }, { "epoch": 3.7444444444444445, "grad_norm": 3.7471060752868652, "learning_rate": 5.026690391459074e-05, "loss": 1.1217, "step": 8425 }, { "epoch": 3.744888888888889, "grad_norm": 3.301255702972412, "learning_rate": 5.02491103202847e-05, "loss": 1.1631, "step": 8426 }, { "epoch": 3.7453333333333334, "grad_norm": 3.3452343940734863, "learning_rate": 5.023131672597865e-05, "loss": 1.2224, "step": 8427 }, { "epoch": 3.7457777777777777, "grad_norm": 3.3727502822875977, "learning_rate": 5.02135231316726e-05, "loss": 1.1533, "step": 8428 }, { "epoch": 3.7462222222222223, "grad_norm": 3.6482954025268555, "learning_rate": 5.0195729537366555e-05, "loss": 1.4724, "step": 8429 }, { "epoch": 3.7466666666666666, "grad_norm": 3.6111557483673096, "learning_rate": 5.0177935943060505e-05, "loss": 1.2438, "step": 8430 }, { "epoch": 3.747111111111111, "grad_norm": 0.2538396120071411, "learning_rate": 5.016014234875445e-05, "loss": 0.0322, "step": 8431 }, { "epoch": 3.7475555555555555, "grad_norm": 3.558535575866699, "learning_rate": 5.01423487544484e-05, "loss": 1.6036, "step": 8432 }, { "epoch": 3.748, "grad_norm": 3.5707340240478516, "learning_rate": 5.012455516014235e-05, "loss": 1.0665, "step": 8433 }, { "epoch": 3.7484444444444445, "grad_norm": 3.5678842067718506, "learning_rate": 5.0106761565836304e-05, "loss": 0.9684, "step": 8434 }, { "epoch": 3.7488888888888887, "grad_norm": 3.770473003387451, "learning_rate": 5.0088967971530254e-05, "loss": 1.1987, "step": 8435 }, { "epoch": 3.7493333333333334, "grad_norm": 3.830598831176758, "learning_rate": 5.0071174377224204e-05, "loss": 1.2741, "step": 8436 }, { "epoch": 3.7497777777777777, "grad_norm": 3.5863218307495117, "learning_rate": 5.0053380782918146e-05, "loss": 1.7064, "step": 8437 }, { "epoch": 3.7502222222222223, "grad_norm": 4.064740180969238, "learning_rate": 5.0035587188612096e-05, "loss": 1.5107, "step": 8438 }, { "epoch": 3.7506666666666666, "grad_norm": 3.272308588027954, "learning_rate": 5.001779359430605e-05, "loss": 1.0924, "step": 8439 }, { "epoch": 3.7511111111111113, "grad_norm": 3.53114914894104, "learning_rate": 5e-05, "loss": 1.1262, "step": 8440 }, { "epoch": 3.7515555555555555, "grad_norm": 3.6443865299224854, "learning_rate": 4.998220640569395e-05, "loss": 1.2448, "step": 8441 }, { "epoch": 3.752, "grad_norm": 4.559260845184326, "learning_rate": 4.99644128113879e-05, "loss": 1.0831, "step": 8442 }, { "epoch": 3.7524444444444445, "grad_norm": 4.263270378112793, "learning_rate": 4.994661921708185e-05, "loss": 1.2406, "step": 8443 }, { "epoch": 3.752888888888889, "grad_norm": 4.247504234313965, "learning_rate": 4.99288256227758e-05, "loss": 1.2629, "step": 8444 }, { "epoch": 3.7533333333333334, "grad_norm": 3.752406358718872, "learning_rate": 4.991103202846975e-05, "loss": 1.1891, "step": 8445 }, { "epoch": 3.7537777777777777, "grad_norm": 3.750194787979126, "learning_rate": 4.98932384341637e-05, "loss": 1.3531, "step": 8446 }, { "epoch": 3.7542222222222223, "grad_norm": 5.040890216827393, "learning_rate": 4.987544483985766e-05, "loss": 1.2088, "step": 8447 }, { "epoch": 3.7546666666666666, "grad_norm": 4.1202006340026855, "learning_rate": 4.985765124555161e-05, "loss": 1.2779, "step": 8448 }, { "epoch": 3.755111111111111, "grad_norm": 4.093197345733643, "learning_rate": 4.983985765124555e-05, "loss": 0.832, "step": 8449 }, { "epoch": 3.7555555555555555, "grad_norm": 4.263852119445801, "learning_rate": 4.9822064056939506e-05, "loss": 0.4735, "step": 8450 }, { "epoch": 3.7560000000000002, "grad_norm": 1.3852638006210327, "learning_rate": 4.9804270462633456e-05, "loss": 0.6947, "step": 8451 }, { "epoch": 3.7564444444444445, "grad_norm": 2.5553395748138428, "learning_rate": 4.9786476868327406e-05, "loss": 1.6021, "step": 8452 }, { "epoch": 3.7568888888888887, "grad_norm": 2.2025933265686035, "learning_rate": 4.9768683274021356e-05, "loss": 1.1253, "step": 8453 }, { "epoch": 3.7573333333333334, "grad_norm": 2.739720582962036, "learning_rate": 4.9750889679715305e-05, "loss": 1.8841, "step": 8454 }, { "epoch": 3.7577777777777777, "grad_norm": 3.2897424697875977, "learning_rate": 4.9733096085409255e-05, "loss": 1.8541, "step": 8455 }, { "epoch": 3.7582222222222224, "grad_norm": 3.067814350128174, "learning_rate": 4.9715302491103205e-05, "loss": 1.4044, "step": 8456 }, { "epoch": 3.7586666666666666, "grad_norm": 2.3474533557891846, "learning_rate": 4.9697508896797154e-05, "loss": 1.1692, "step": 8457 }, { "epoch": 3.7591111111111113, "grad_norm": 3.4697251319885254, "learning_rate": 4.9679715302491104e-05, "loss": 1.6912, "step": 8458 }, { "epoch": 3.7595555555555555, "grad_norm": 3.053978443145752, "learning_rate": 4.9661921708185054e-05, "loss": 1.4072, "step": 8459 }, { "epoch": 3.76, "grad_norm": 3.327169179916382, "learning_rate": 4.964412811387901e-05, "loss": 1.6578, "step": 8460 }, { "epoch": 3.7604444444444445, "grad_norm": 3.0925164222717285, "learning_rate": 4.962633451957295e-05, "loss": 1.6706, "step": 8461 }, { "epoch": 3.7608888888888887, "grad_norm": 2.676941394805908, "learning_rate": 4.96085409252669e-05, "loss": 1.1866, "step": 8462 }, { "epoch": 3.7613333333333334, "grad_norm": 3.976247787475586, "learning_rate": 4.959074733096086e-05, "loss": 1.4686, "step": 8463 }, { "epoch": 3.7617777777777777, "grad_norm": 3.166390895843506, "learning_rate": 4.957295373665481e-05, "loss": 1.1084, "step": 8464 }, { "epoch": 3.7622222222222224, "grad_norm": 3.5380375385284424, "learning_rate": 4.955516014234875e-05, "loss": 1.5746, "step": 8465 }, { "epoch": 3.7626666666666666, "grad_norm": 2.127175807952881, "learning_rate": 4.953736654804271e-05, "loss": 0.4779, "step": 8466 }, { "epoch": 3.763111111111111, "grad_norm": 3.756263494491577, "learning_rate": 4.951957295373666e-05, "loss": 1.2304, "step": 8467 }, { "epoch": 3.7635555555555555, "grad_norm": 3.4877758026123047, "learning_rate": 4.950177935943061e-05, "loss": 1.7547, "step": 8468 }, { "epoch": 3.7640000000000002, "grad_norm": 3.6990151405334473, "learning_rate": 4.948398576512456e-05, "loss": 1.4276, "step": 8469 }, { "epoch": 3.7644444444444445, "grad_norm": 3.083824634552002, "learning_rate": 4.946619217081851e-05, "loss": 1.3949, "step": 8470 }, { "epoch": 3.7648888888888887, "grad_norm": 3.808980703353882, "learning_rate": 4.944839857651246e-05, "loss": 1.247, "step": 8471 }, { "epoch": 3.7653333333333334, "grad_norm": 3.5250051021575928, "learning_rate": 4.943060498220641e-05, "loss": 1.6016, "step": 8472 }, { "epoch": 3.7657777777777777, "grad_norm": 3.2846179008483887, "learning_rate": 4.941281138790036e-05, "loss": 1.1513, "step": 8473 }, { "epoch": 3.7662222222222224, "grad_norm": 3.1756033897399902, "learning_rate": 4.9395017793594306e-05, "loss": 1.2079, "step": 8474 }, { "epoch": 3.7666666666666666, "grad_norm": 3.1101205348968506, "learning_rate": 4.9377224199288256e-05, "loss": 1.1411, "step": 8475 }, { "epoch": 3.7671111111111113, "grad_norm": 3.436415433883667, "learning_rate": 4.935943060498221e-05, "loss": 1.2356, "step": 8476 }, { "epoch": 3.7675555555555555, "grad_norm": 3.413377285003662, "learning_rate": 4.934163701067616e-05, "loss": 1.6142, "step": 8477 }, { "epoch": 3.768, "grad_norm": 3.4654288291931152, "learning_rate": 4.9323843416370105e-05, "loss": 1.4022, "step": 8478 }, { "epoch": 3.7684444444444445, "grad_norm": 1.9338456392288208, "learning_rate": 4.930604982206406e-05, "loss": 0.5082, "step": 8479 }, { "epoch": 3.7688888888888887, "grad_norm": 3.8473315238952637, "learning_rate": 4.928825622775801e-05, "loss": 1.4336, "step": 8480 }, { "epoch": 3.7693333333333334, "grad_norm": 2.3992502689361572, "learning_rate": 4.927046263345196e-05, "loss": 0.7091, "step": 8481 }, { "epoch": 3.7697777777777777, "grad_norm": 3.7076449394226074, "learning_rate": 4.925266903914591e-05, "loss": 1.5299, "step": 8482 }, { "epoch": 3.7702222222222224, "grad_norm": 3.1709039211273193, "learning_rate": 4.923487544483986e-05, "loss": 1.0703, "step": 8483 }, { "epoch": 3.7706666666666666, "grad_norm": 3.58585262298584, "learning_rate": 4.921708185053381e-05, "loss": 1.3857, "step": 8484 }, { "epoch": 3.771111111111111, "grad_norm": 3.486786365509033, "learning_rate": 4.919928825622776e-05, "loss": 1.5706, "step": 8485 }, { "epoch": 3.7715555555555556, "grad_norm": 3.2322800159454346, "learning_rate": 4.918149466192171e-05, "loss": 0.9788, "step": 8486 }, { "epoch": 3.7720000000000002, "grad_norm": 3.4824700355529785, "learning_rate": 4.916370106761566e-05, "loss": 1.3818, "step": 8487 }, { "epoch": 3.7724444444444445, "grad_norm": 3.4638094902038574, "learning_rate": 4.914590747330961e-05, "loss": 1.3499, "step": 8488 }, { "epoch": 3.7728888888888887, "grad_norm": 3.505972146987915, "learning_rate": 4.912811387900356e-05, "loss": 1.0351, "step": 8489 }, { "epoch": 3.7733333333333334, "grad_norm": 3.177283525466919, "learning_rate": 4.911032028469751e-05, "loss": 1.4239, "step": 8490 }, { "epoch": 3.7737777777777777, "grad_norm": 3.8296144008636475, "learning_rate": 4.909252669039146e-05, "loss": 1.449, "step": 8491 }, { "epoch": 3.7742222222222224, "grad_norm": 3.507258415222168, "learning_rate": 4.9074733096085415e-05, "loss": 1.1, "step": 8492 }, { "epoch": 3.7746666666666666, "grad_norm": 3.143571615219116, "learning_rate": 4.9056939501779365e-05, "loss": 0.668, "step": 8493 }, { "epoch": 3.7751111111111113, "grad_norm": 4.362837314605713, "learning_rate": 4.903914590747331e-05, "loss": 1.2331, "step": 8494 }, { "epoch": 3.7755555555555556, "grad_norm": 4.254435062408447, "learning_rate": 4.9021352313167264e-05, "loss": 1.0221, "step": 8495 }, { "epoch": 3.776, "grad_norm": 3.589878797531128, "learning_rate": 4.9003558718861214e-05, "loss": 0.9752, "step": 8496 }, { "epoch": 3.7764444444444445, "grad_norm": 4.932192325592041, "learning_rate": 4.8985765124555164e-05, "loss": 1.1548, "step": 8497 }, { "epoch": 3.7768888888888887, "grad_norm": 4.631229400634766, "learning_rate": 4.896797153024911e-05, "loss": 1.2203, "step": 8498 }, { "epoch": 3.7773333333333334, "grad_norm": 5.158580780029297, "learning_rate": 4.895017793594306e-05, "loss": 1.5583, "step": 8499 }, { "epoch": 3.7777777777777777, "grad_norm": 4.34719181060791, "learning_rate": 4.893238434163701e-05, "loss": 0.8711, "step": 8500 }, { "epoch": 3.7782222222222224, "grad_norm": 1.7901962995529175, "learning_rate": 4.891459074733096e-05, "loss": 0.969, "step": 8501 }, { "epoch": 3.7786666666666666, "grad_norm": 2.875005006790161, "learning_rate": 4.889679715302491e-05, "loss": 1.8006, "step": 8502 }, { "epoch": 3.779111111111111, "grad_norm": 0.1947576105594635, "learning_rate": 4.887900355871886e-05, "loss": 0.0163, "step": 8503 }, { "epoch": 3.7795555555555556, "grad_norm": 2.6576716899871826, "learning_rate": 4.886120996441281e-05, "loss": 1.585, "step": 8504 }, { "epoch": 3.7800000000000002, "grad_norm": 2.818967342376709, "learning_rate": 4.884341637010677e-05, "loss": 1.658, "step": 8505 }, { "epoch": 3.7804444444444445, "grad_norm": 3.126274585723877, "learning_rate": 4.882562277580072e-05, "loss": 1.2724, "step": 8506 }, { "epoch": 3.7808888888888887, "grad_norm": 3.2297914028167725, "learning_rate": 4.880782918149466e-05, "loss": 1.7334, "step": 8507 }, { "epoch": 3.7813333333333334, "grad_norm": 3.0050487518310547, "learning_rate": 4.879003558718862e-05, "loss": 1.734, "step": 8508 }, { "epoch": 3.7817777777777777, "grad_norm": 3.3488566875457764, "learning_rate": 4.877224199288257e-05, "loss": 1.7469, "step": 8509 }, { "epoch": 3.7822222222222224, "grad_norm": 2.3252031803131104, "learning_rate": 4.875444839857651e-05, "loss": 0.6167, "step": 8510 }, { "epoch": 3.7826666666666666, "grad_norm": 3.154351234436035, "learning_rate": 4.8736654804270466e-05, "loss": 1.5694, "step": 8511 }, { "epoch": 3.7831111111111113, "grad_norm": 3.574852705001831, "learning_rate": 4.8718861209964416e-05, "loss": 1.4769, "step": 8512 }, { "epoch": 3.7835555555555556, "grad_norm": 2.932189464569092, "learning_rate": 4.8701067615658366e-05, "loss": 1.2641, "step": 8513 }, { "epoch": 3.784, "grad_norm": 3.2192270755767822, "learning_rate": 4.8683274021352316e-05, "loss": 1.5916, "step": 8514 }, { "epoch": 3.7844444444444445, "grad_norm": 3.2948851585388184, "learning_rate": 4.8665480427046265e-05, "loss": 1.4189, "step": 8515 }, { "epoch": 3.7848888888888887, "grad_norm": 3.3815078735351562, "learning_rate": 4.8647686832740215e-05, "loss": 1.5318, "step": 8516 }, { "epoch": 3.7853333333333334, "grad_norm": 3.34417724609375, "learning_rate": 4.8629893238434165e-05, "loss": 1.5307, "step": 8517 }, { "epoch": 3.7857777777777777, "grad_norm": 3.700117826461792, "learning_rate": 4.8612099644128115e-05, "loss": 0.8523, "step": 8518 }, { "epoch": 3.7862222222222224, "grad_norm": 3.057548999786377, "learning_rate": 4.8594306049822064e-05, "loss": 1.1474, "step": 8519 }, { "epoch": 3.7866666666666666, "grad_norm": 3.9406673908233643, "learning_rate": 4.8576512455516014e-05, "loss": 1.5789, "step": 8520 }, { "epoch": 3.787111111111111, "grad_norm": 2.2049951553344727, "learning_rate": 4.855871886120997e-05, "loss": 0.9952, "step": 8521 }, { "epoch": 3.7875555555555556, "grad_norm": 3.1826930046081543, "learning_rate": 4.854092526690392e-05, "loss": 1.1047, "step": 8522 }, { "epoch": 3.7880000000000003, "grad_norm": 5.1929192543029785, "learning_rate": 4.852313167259786e-05, "loss": 1.3763, "step": 8523 }, { "epoch": 3.7884444444444445, "grad_norm": 3.3125691413879395, "learning_rate": 4.850533807829182e-05, "loss": 1.5083, "step": 8524 }, { "epoch": 3.7888888888888888, "grad_norm": 2.958933115005493, "learning_rate": 4.848754448398577e-05, "loss": 0.9042, "step": 8525 }, { "epoch": 3.7893333333333334, "grad_norm": 3.5505728721618652, "learning_rate": 4.846975088967972e-05, "loss": 1.4218, "step": 8526 }, { "epoch": 3.7897777777777777, "grad_norm": 3.6742985248565674, "learning_rate": 4.845195729537367e-05, "loss": 1.1994, "step": 8527 }, { "epoch": 3.7902222222222224, "grad_norm": 3.1948156356811523, "learning_rate": 4.843416370106762e-05, "loss": 1.439, "step": 8528 }, { "epoch": 3.7906666666666666, "grad_norm": 3.452651262283325, "learning_rate": 4.841637010676157e-05, "loss": 0.9603, "step": 8529 }, { "epoch": 3.7911111111111113, "grad_norm": 3.6245038509368896, "learning_rate": 4.839857651245552e-05, "loss": 1.3753, "step": 8530 }, { "epoch": 3.7915555555555556, "grad_norm": 3.7631113529205322, "learning_rate": 4.838078291814947e-05, "loss": 1.3403, "step": 8531 }, { "epoch": 3.792, "grad_norm": 3.130178213119507, "learning_rate": 4.836298932384342e-05, "loss": 1.4048, "step": 8532 }, { "epoch": 3.7924444444444445, "grad_norm": 4.133440017700195, "learning_rate": 4.834519572953737e-05, "loss": 1.1999, "step": 8533 }, { "epoch": 3.7928888888888888, "grad_norm": 3.3631365299224854, "learning_rate": 4.832740213523132e-05, "loss": 1.1746, "step": 8534 }, { "epoch": 3.7933333333333334, "grad_norm": 1.9450854063034058, "learning_rate": 4.830960854092527e-05, "loss": 0.3845, "step": 8535 }, { "epoch": 3.7937777777777777, "grad_norm": 3.2177698612213135, "learning_rate": 4.8291814946619216e-05, "loss": 0.9918, "step": 8536 }, { "epoch": 3.7942222222222224, "grad_norm": 3.5220441818237305, "learning_rate": 4.827402135231317e-05, "loss": 1.4969, "step": 8537 }, { "epoch": 3.7946666666666666, "grad_norm": 4.0864996910095215, "learning_rate": 4.825622775800712e-05, "loss": 1.7111, "step": 8538 }, { "epoch": 3.795111111111111, "grad_norm": 3.846219062805176, "learning_rate": 4.8238434163701065e-05, "loss": 1.1624, "step": 8539 }, { "epoch": 3.7955555555555556, "grad_norm": 3.6705775260925293, "learning_rate": 4.822064056939502e-05, "loss": 1.132, "step": 8540 }, { "epoch": 3.7960000000000003, "grad_norm": 3.94295334815979, "learning_rate": 4.820284697508897e-05, "loss": 1.6029, "step": 8541 }, { "epoch": 3.7964444444444445, "grad_norm": 4.677298069000244, "learning_rate": 4.818505338078292e-05, "loss": 1.2734, "step": 8542 }, { "epoch": 3.7968888888888888, "grad_norm": 3.895962715148926, "learning_rate": 4.816725978647687e-05, "loss": 0.8572, "step": 8543 }, { "epoch": 3.7973333333333334, "grad_norm": 3.5614423751831055, "learning_rate": 4.814946619217082e-05, "loss": 1.3358, "step": 8544 }, { "epoch": 3.7977777777777777, "grad_norm": 4.679959774017334, "learning_rate": 4.813167259786477e-05, "loss": 1.301, "step": 8545 }, { "epoch": 3.7982222222222224, "grad_norm": 3.1577718257904053, "learning_rate": 4.811387900355872e-05, "loss": 0.7731, "step": 8546 }, { "epoch": 3.7986666666666666, "grad_norm": 3.898137331008911, "learning_rate": 4.809608540925267e-05, "loss": 1.1303, "step": 8547 }, { "epoch": 3.7991111111111113, "grad_norm": 4.067081928253174, "learning_rate": 4.807829181494662e-05, "loss": 1.231, "step": 8548 }, { "epoch": 3.7995555555555556, "grad_norm": 3.7059855461120605, "learning_rate": 4.806049822064057e-05, "loss": 1.2194, "step": 8549 }, { "epoch": 3.8, "grad_norm": 4.398168563842773, "learning_rate": 4.8042704626334526e-05, "loss": 1.011, "step": 8550 }, { "epoch": 3.8004444444444445, "grad_norm": 2.875924587249756, "learning_rate": 4.8024911032028476e-05, "loss": 1.9846, "step": 8551 }, { "epoch": 3.8008888888888888, "grad_norm": 2.9650471210479736, "learning_rate": 4.800711743772242e-05, "loss": 2.1224, "step": 8552 }, { "epoch": 3.8013333333333335, "grad_norm": 2.5322835445404053, "learning_rate": 4.7989323843416375e-05, "loss": 1.3393, "step": 8553 }, { "epoch": 3.8017777777777777, "grad_norm": 3.003859281539917, "learning_rate": 4.7971530249110325e-05, "loss": 1.2544, "step": 8554 }, { "epoch": 3.8022222222222224, "grad_norm": 3.0419626235961914, "learning_rate": 4.795373665480427e-05, "loss": 1.7736, "step": 8555 }, { "epoch": 3.8026666666666666, "grad_norm": 3.1113429069519043, "learning_rate": 4.7935943060498224e-05, "loss": 1.4109, "step": 8556 }, { "epoch": 3.803111111111111, "grad_norm": 3.09995698928833, "learning_rate": 4.7918149466192174e-05, "loss": 1.862, "step": 8557 }, { "epoch": 3.8035555555555556, "grad_norm": 2.92777156829834, "learning_rate": 4.7900355871886124e-05, "loss": 1.3818, "step": 8558 }, { "epoch": 3.8040000000000003, "grad_norm": 3.054277181625366, "learning_rate": 4.7882562277580073e-05, "loss": 1.6397, "step": 8559 }, { "epoch": 3.8044444444444445, "grad_norm": 3.6034891605377197, "learning_rate": 4.786476868327402e-05, "loss": 1.8047, "step": 8560 }, { "epoch": 3.8048888888888888, "grad_norm": 3.8269503116607666, "learning_rate": 4.784697508896797e-05, "loss": 1.6447, "step": 8561 }, { "epoch": 3.8053333333333335, "grad_norm": 3.4392542839050293, "learning_rate": 4.782918149466192e-05, "loss": 1.5413, "step": 8562 }, { "epoch": 3.8057777777777777, "grad_norm": 3.8955986499786377, "learning_rate": 4.781138790035587e-05, "loss": 1.7783, "step": 8563 }, { "epoch": 3.806222222222222, "grad_norm": 3.504478693008423, "learning_rate": 4.779359430604983e-05, "loss": 1.0836, "step": 8564 }, { "epoch": 3.8066666666666666, "grad_norm": 2.9569408893585205, "learning_rate": 4.777580071174377e-05, "loss": 1.1422, "step": 8565 }, { "epoch": 3.8071111111111113, "grad_norm": 3.3336431980133057, "learning_rate": 4.775800711743773e-05, "loss": 1.6741, "step": 8566 }, { "epoch": 3.8075555555555556, "grad_norm": 3.0466670989990234, "learning_rate": 4.774021352313168e-05, "loss": 1.207, "step": 8567 }, { "epoch": 3.808, "grad_norm": 3.735114574432373, "learning_rate": 4.772241992882562e-05, "loss": 1.0772, "step": 8568 }, { "epoch": 3.8084444444444445, "grad_norm": 3.699998617172241, "learning_rate": 4.770462633451958e-05, "loss": 1.3742, "step": 8569 }, { "epoch": 3.8088888888888888, "grad_norm": 3.7992329597473145, "learning_rate": 4.768683274021353e-05, "loss": 1.3175, "step": 8570 }, { "epoch": 3.8093333333333335, "grad_norm": 2.9106605052948, "learning_rate": 4.766903914590748e-05, "loss": 1.1899, "step": 8571 }, { "epoch": 3.8097777777777777, "grad_norm": 3.2745654582977295, "learning_rate": 4.7651245551601427e-05, "loss": 1.0309, "step": 8572 }, { "epoch": 3.8102222222222224, "grad_norm": 4.083169937133789, "learning_rate": 4.7633451957295376e-05, "loss": 1.5096, "step": 8573 }, { "epoch": 3.8106666666666666, "grad_norm": 3.3155415058135986, "learning_rate": 4.7615658362989326e-05, "loss": 1.3212, "step": 8574 }, { "epoch": 3.811111111111111, "grad_norm": 3.864699363708496, "learning_rate": 4.7597864768683276e-05, "loss": 1.3274, "step": 8575 }, { "epoch": 3.8115555555555556, "grad_norm": 3.174386739730835, "learning_rate": 4.7580071174377225e-05, "loss": 1.1286, "step": 8576 }, { "epoch": 3.8120000000000003, "grad_norm": 2.8707237243652344, "learning_rate": 4.7562277580071175e-05, "loss": 0.837, "step": 8577 }, { "epoch": 3.8124444444444445, "grad_norm": 3.41292405128479, "learning_rate": 4.7544483985765125e-05, "loss": 1.3842, "step": 8578 }, { "epoch": 3.8128888888888888, "grad_norm": 3.6212575435638428, "learning_rate": 4.7526690391459075e-05, "loss": 1.4985, "step": 8579 }, { "epoch": 3.8133333333333335, "grad_norm": 3.4787771701812744, "learning_rate": 4.750889679715303e-05, "loss": 1.2398, "step": 8580 }, { "epoch": 3.8137777777777777, "grad_norm": 3.5631048679351807, "learning_rate": 4.7491103202846974e-05, "loss": 1.2581, "step": 8581 }, { "epoch": 3.814222222222222, "grad_norm": 3.253061056137085, "learning_rate": 4.747330960854093e-05, "loss": 0.8728, "step": 8582 }, { "epoch": 3.8146666666666667, "grad_norm": 3.8624134063720703, "learning_rate": 4.745551601423488e-05, "loss": 1.3563, "step": 8583 }, { "epoch": 3.8151111111111113, "grad_norm": 3.724658966064453, "learning_rate": 4.743772241992882e-05, "loss": 1.2391, "step": 8584 }, { "epoch": 3.8155555555555556, "grad_norm": 4.286252975463867, "learning_rate": 4.741992882562278e-05, "loss": 1.6731, "step": 8585 }, { "epoch": 3.816, "grad_norm": 4.29559326171875, "learning_rate": 4.740213523131673e-05, "loss": 1.3075, "step": 8586 }, { "epoch": 3.8164444444444445, "grad_norm": 4.753545761108398, "learning_rate": 4.738434163701068e-05, "loss": 1.5687, "step": 8587 }, { "epoch": 3.8168888888888888, "grad_norm": 2.7094476222991943, "learning_rate": 4.736654804270463e-05, "loss": 0.731, "step": 8588 }, { "epoch": 3.8173333333333335, "grad_norm": 3.8449037075042725, "learning_rate": 4.734875444839858e-05, "loss": 1.4633, "step": 8589 }, { "epoch": 3.8177777777777777, "grad_norm": 3.7161953449249268, "learning_rate": 4.733096085409253e-05, "loss": 0.9841, "step": 8590 }, { "epoch": 3.8182222222222224, "grad_norm": 4.3916449546813965, "learning_rate": 4.731316725978648e-05, "loss": 1.2136, "step": 8591 }, { "epoch": 3.8186666666666667, "grad_norm": 2.985351800918579, "learning_rate": 4.729537366548043e-05, "loss": 0.7911, "step": 8592 }, { "epoch": 3.819111111111111, "grad_norm": 3.5371320247650146, "learning_rate": 4.7277580071174384e-05, "loss": 1.0655, "step": 8593 }, { "epoch": 3.8195555555555556, "grad_norm": 9.645051956176758, "learning_rate": 4.725978647686833e-05, "loss": 1.1596, "step": 8594 }, { "epoch": 3.82, "grad_norm": 4.049137115478516, "learning_rate": 4.7241992882562284e-05, "loss": 1.3832, "step": 8595 }, { "epoch": 3.8204444444444445, "grad_norm": 3.475752592086792, "learning_rate": 4.7224199288256233e-05, "loss": 1.171, "step": 8596 }, { "epoch": 3.820888888888889, "grad_norm": 4.638784408569336, "learning_rate": 4.7206405693950176e-05, "loss": 1.3144, "step": 8597 }, { "epoch": 3.8213333333333335, "grad_norm": 4.170307636260986, "learning_rate": 4.718861209964413e-05, "loss": 1.1174, "step": 8598 }, { "epoch": 3.8217777777777777, "grad_norm": 4.556606292724609, "learning_rate": 4.717081850533808e-05, "loss": 1.1181, "step": 8599 }, { "epoch": 3.822222222222222, "grad_norm": 4.029767990112305, "learning_rate": 4.7153024911032026e-05, "loss": 0.8181, "step": 8600 }, { "epoch": 3.8226666666666667, "grad_norm": 2.57649827003479, "learning_rate": 4.713523131672598e-05, "loss": 2.2343, "step": 8601 }, { "epoch": 3.8231111111111113, "grad_norm": 3.349898338317871, "learning_rate": 4.711743772241993e-05, "loss": 1.74, "step": 8602 }, { "epoch": 3.8235555555555556, "grad_norm": 3.0327224731445312, "learning_rate": 4.709964412811388e-05, "loss": 1.4284, "step": 8603 }, { "epoch": 3.824, "grad_norm": 3.228236436843872, "learning_rate": 4.708185053380783e-05, "loss": 1.6167, "step": 8604 }, { "epoch": 3.8244444444444445, "grad_norm": 3.0474486351013184, "learning_rate": 4.706405693950178e-05, "loss": 1.2462, "step": 8605 }, { "epoch": 3.824888888888889, "grad_norm": 3.509099006652832, "learning_rate": 4.704626334519573e-05, "loss": 1.7278, "step": 8606 }, { "epoch": 3.8253333333333335, "grad_norm": 2.920429229736328, "learning_rate": 4.702846975088968e-05, "loss": 1.5507, "step": 8607 }, { "epoch": 3.8257777777777777, "grad_norm": 1.919208288192749, "learning_rate": 4.701067615658363e-05, "loss": 0.6798, "step": 8608 }, { "epoch": 3.8262222222222224, "grad_norm": 3.108910322189331, "learning_rate": 4.699288256227759e-05, "loss": 0.9496, "step": 8609 }, { "epoch": 3.8266666666666667, "grad_norm": 3.4903645515441895, "learning_rate": 4.697508896797153e-05, "loss": 1.2665, "step": 8610 }, { "epoch": 3.827111111111111, "grad_norm": 3.292814254760742, "learning_rate": 4.6957295373665486e-05, "loss": 1.7923, "step": 8611 }, { "epoch": 3.8275555555555556, "grad_norm": 2.9696640968322754, "learning_rate": 4.6939501779359436e-05, "loss": 1.1515, "step": 8612 }, { "epoch": 3.828, "grad_norm": 2.911243438720703, "learning_rate": 4.692170818505338e-05, "loss": 1.2357, "step": 8613 }, { "epoch": 3.8284444444444445, "grad_norm": 3.220693588256836, "learning_rate": 4.6903914590747335e-05, "loss": 1.6873, "step": 8614 }, { "epoch": 3.828888888888889, "grad_norm": 3.2728185653686523, "learning_rate": 4.6886120996441285e-05, "loss": 0.9055, "step": 8615 }, { "epoch": 3.8293333333333335, "grad_norm": 3.27384090423584, "learning_rate": 4.6868327402135235e-05, "loss": 1.1947, "step": 8616 }, { "epoch": 3.8297777777777777, "grad_norm": 3.1152114868164062, "learning_rate": 4.6850533807829184e-05, "loss": 1.2603, "step": 8617 }, { "epoch": 3.830222222222222, "grad_norm": 2.450045108795166, "learning_rate": 4.6832740213523134e-05, "loss": 0.7963, "step": 8618 }, { "epoch": 3.8306666666666667, "grad_norm": 3.4387762546539307, "learning_rate": 4.6814946619217084e-05, "loss": 1.3703, "step": 8619 }, { "epoch": 3.8311111111111114, "grad_norm": 2.993537187576294, "learning_rate": 4.6797153024911034e-05, "loss": 1.0733, "step": 8620 }, { "epoch": 3.8315555555555556, "grad_norm": 3.041719436645508, "learning_rate": 4.677935943060498e-05, "loss": 1.1477, "step": 8621 }, { "epoch": 3.832, "grad_norm": 3.9076027870178223, "learning_rate": 4.676156583629894e-05, "loss": 1.3451, "step": 8622 }, { "epoch": 3.8324444444444445, "grad_norm": 3.3174734115600586, "learning_rate": 4.674377224199288e-05, "loss": 0.8637, "step": 8623 }, { "epoch": 3.832888888888889, "grad_norm": 3.7948451042175293, "learning_rate": 4.672597864768683e-05, "loss": 1.4609, "step": 8624 }, { "epoch": 3.8333333333333335, "grad_norm": 3.765836477279663, "learning_rate": 4.670818505338079e-05, "loss": 1.1835, "step": 8625 }, { "epoch": 3.8337777777777777, "grad_norm": 3.9058310985565186, "learning_rate": 4.669039145907473e-05, "loss": 1.5075, "step": 8626 }, { "epoch": 3.8342222222222224, "grad_norm": 3.738953113555908, "learning_rate": 4.667259786476869e-05, "loss": 1.2718, "step": 8627 }, { "epoch": 3.8346666666666667, "grad_norm": 2.5701253414154053, "learning_rate": 4.665480427046264e-05, "loss": 0.6246, "step": 8628 }, { "epoch": 3.835111111111111, "grad_norm": 4.04362678527832, "learning_rate": 4.663701067615658e-05, "loss": 1.4515, "step": 8629 }, { "epoch": 3.8355555555555556, "grad_norm": 1.7729761600494385, "learning_rate": 4.661921708185054e-05, "loss": 0.4256, "step": 8630 }, { "epoch": 3.836, "grad_norm": 3.309948444366455, "learning_rate": 4.660142348754449e-05, "loss": 0.6741, "step": 8631 }, { "epoch": 3.8364444444444445, "grad_norm": 3.3855819702148438, "learning_rate": 4.658362989323844e-05, "loss": 1.4831, "step": 8632 }, { "epoch": 3.836888888888889, "grad_norm": 3.421257734298706, "learning_rate": 4.656583629893239e-05, "loss": 1.3999, "step": 8633 }, { "epoch": 3.8373333333333335, "grad_norm": 3.1086058616638184, "learning_rate": 4.6548042704626336e-05, "loss": 0.8159, "step": 8634 }, { "epoch": 3.8377777777777777, "grad_norm": 3.4166300296783447, "learning_rate": 4.6530249110320286e-05, "loss": 1.3853, "step": 8635 }, { "epoch": 3.838222222222222, "grad_norm": 4.6446051597595215, "learning_rate": 4.6512455516014236e-05, "loss": 1.4196, "step": 8636 }, { "epoch": 3.8386666666666667, "grad_norm": 3.4511923789978027, "learning_rate": 4.6494661921708186e-05, "loss": 1.4235, "step": 8637 }, { "epoch": 3.8391111111111114, "grad_norm": 4.0160980224609375, "learning_rate": 4.647686832740214e-05, "loss": 1.1172, "step": 8638 }, { "epoch": 3.8395555555555556, "grad_norm": 3.6638247966766357, "learning_rate": 4.6459074733096085e-05, "loss": 0.8709, "step": 8639 }, { "epoch": 3.84, "grad_norm": 3.9915387630462646, "learning_rate": 4.644128113879004e-05, "loss": 1.3121, "step": 8640 }, { "epoch": 3.8404444444444445, "grad_norm": 3.331205368041992, "learning_rate": 4.642348754448399e-05, "loss": 0.9015, "step": 8641 }, { "epoch": 3.840888888888889, "grad_norm": 3.3656539916992188, "learning_rate": 4.6405693950177934e-05, "loss": 1.0394, "step": 8642 }, { "epoch": 3.8413333333333335, "grad_norm": 3.8786230087280273, "learning_rate": 4.638790035587189e-05, "loss": 1.1873, "step": 8643 }, { "epoch": 3.8417777777777777, "grad_norm": 4.622972011566162, "learning_rate": 4.637010676156584e-05, "loss": 1.1365, "step": 8644 }, { "epoch": 3.8422222222222224, "grad_norm": 3.3173351287841797, "learning_rate": 4.635231316725978e-05, "loss": 1.0542, "step": 8645 }, { "epoch": 3.8426666666666667, "grad_norm": 4.794419765472412, "learning_rate": 4.633451957295374e-05, "loss": 1.4618, "step": 8646 }, { "epoch": 3.843111111111111, "grad_norm": 4.264388084411621, "learning_rate": 4.631672597864769e-05, "loss": 0.9792, "step": 8647 }, { "epoch": 3.8435555555555556, "grad_norm": 5.3533501625061035, "learning_rate": 4.629893238434164e-05, "loss": 1.031, "step": 8648 }, { "epoch": 3.844, "grad_norm": 3.8206562995910645, "learning_rate": 4.628113879003559e-05, "loss": 0.9972, "step": 8649 }, { "epoch": 3.8444444444444446, "grad_norm": 3.9211204051971436, "learning_rate": 4.626334519572954e-05, "loss": 0.4228, "step": 8650 }, { "epoch": 3.844888888888889, "grad_norm": 2.347119092941284, "learning_rate": 4.6245551601423495e-05, "loss": 1.5638, "step": 8651 }, { "epoch": 3.8453333333333335, "grad_norm": 2.3549845218658447, "learning_rate": 4.622775800711744e-05, "loss": 1.7397, "step": 8652 }, { "epoch": 3.8457777777777777, "grad_norm": 2.7358062267303467, "learning_rate": 4.620996441281139e-05, "loss": 1.5028, "step": 8653 }, { "epoch": 3.846222222222222, "grad_norm": 3.0936498641967773, "learning_rate": 4.6192170818505344e-05, "loss": 1.7485, "step": 8654 }, { "epoch": 3.8466666666666667, "grad_norm": 1.712854266166687, "learning_rate": 4.617437722419929e-05, "loss": 0.8592, "step": 8655 }, { "epoch": 3.8471111111111114, "grad_norm": 3.0746419429779053, "learning_rate": 4.6156583629893244e-05, "loss": 1.7988, "step": 8656 }, { "epoch": 3.8475555555555556, "grad_norm": 2.9364185333251953, "learning_rate": 4.6138790035587194e-05, "loss": 1.5508, "step": 8657 }, { "epoch": 3.848, "grad_norm": 3.4229776859283447, "learning_rate": 4.6120996441281137e-05, "loss": 1.9598, "step": 8658 }, { "epoch": 3.8484444444444446, "grad_norm": 3.2707581520080566, "learning_rate": 4.610320284697509e-05, "loss": 2.055, "step": 8659 }, { "epoch": 3.848888888888889, "grad_norm": 3.055335521697998, "learning_rate": 4.608540925266904e-05, "loss": 1.3324, "step": 8660 }, { "epoch": 3.8493333333333335, "grad_norm": 3.291973114013672, "learning_rate": 4.606761565836299e-05, "loss": 2.014, "step": 8661 }, { "epoch": 3.8497777777777777, "grad_norm": 2.739182233810425, "learning_rate": 4.604982206405694e-05, "loss": 0.6788, "step": 8662 }, { "epoch": 3.8502222222222224, "grad_norm": 2.2576968669891357, "learning_rate": 4.603202846975089e-05, "loss": 0.7666, "step": 8663 }, { "epoch": 3.8506666666666667, "grad_norm": 3.3199477195739746, "learning_rate": 4.601423487544484e-05, "loss": 1.4767, "step": 8664 }, { "epoch": 3.851111111111111, "grad_norm": 3.386324405670166, "learning_rate": 4.599644128113879e-05, "loss": 1.273, "step": 8665 }, { "epoch": 3.8515555555555556, "grad_norm": 3.1465635299682617, "learning_rate": 4.597864768683274e-05, "loss": 1.5099, "step": 8666 }, { "epoch": 3.852, "grad_norm": 3.1457812786102295, "learning_rate": 4.59608540925267e-05, "loss": 1.2209, "step": 8667 }, { "epoch": 3.8524444444444446, "grad_norm": 3.257748603820801, "learning_rate": 4.594306049822064e-05, "loss": 1.5008, "step": 8668 }, { "epoch": 3.852888888888889, "grad_norm": 3.149172306060791, "learning_rate": 4.592526690391459e-05, "loss": 1.2685, "step": 8669 }, { "epoch": 3.8533333333333335, "grad_norm": 2.9013049602508545, "learning_rate": 4.590747330960855e-05, "loss": 1.0847, "step": 8670 }, { "epoch": 3.8537777777777777, "grad_norm": 2.962352752685547, "learning_rate": 4.588967971530249e-05, "loss": 0.8992, "step": 8671 }, { "epoch": 3.854222222222222, "grad_norm": 3.3367416858673096, "learning_rate": 4.5871886120996446e-05, "loss": 1.6202, "step": 8672 }, { "epoch": 3.8546666666666667, "grad_norm": 3.3775758743286133, "learning_rate": 4.5854092526690396e-05, "loss": 1.727, "step": 8673 }, { "epoch": 3.8551111111111114, "grad_norm": 1.9687055349349976, "learning_rate": 4.583629893238434e-05, "loss": 0.7593, "step": 8674 }, { "epoch": 3.8555555555555556, "grad_norm": 3.5084187984466553, "learning_rate": 4.5818505338078295e-05, "loss": 1.3926, "step": 8675 }, { "epoch": 3.856, "grad_norm": 3.405186176300049, "learning_rate": 4.5800711743772245e-05, "loss": 1.5413, "step": 8676 }, { "epoch": 3.8564444444444446, "grad_norm": 3.67044734954834, "learning_rate": 4.5782918149466195e-05, "loss": 1.5604, "step": 8677 }, { "epoch": 3.856888888888889, "grad_norm": 3.343731641769409, "learning_rate": 4.5765124555160144e-05, "loss": 1.4601, "step": 8678 }, { "epoch": 3.857333333333333, "grad_norm": 3.770385265350342, "learning_rate": 4.5747330960854094e-05, "loss": 1.3078, "step": 8679 }, { "epoch": 3.8577777777777778, "grad_norm": 3.8169827461242676, "learning_rate": 4.572953736654805e-05, "loss": 1.362, "step": 8680 }, { "epoch": 3.8582222222222224, "grad_norm": 3.3232243061065674, "learning_rate": 4.5711743772241994e-05, "loss": 1.4338, "step": 8681 }, { "epoch": 3.8586666666666667, "grad_norm": 4.413866996765137, "learning_rate": 4.569395017793594e-05, "loss": 1.5802, "step": 8682 }, { "epoch": 3.859111111111111, "grad_norm": 3.514523983001709, "learning_rate": 4.56761565836299e-05, "loss": 1.5358, "step": 8683 }, { "epoch": 3.8595555555555556, "grad_norm": 3.5461461544036865, "learning_rate": 4.565836298932384e-05, "loss": 1.3908, "step": 8684 }, { "epoch": 3.86, "grad_norm": 3.1903622150421143, "learning_rate": 4.56405693950178e-05, "loss": 1.0759, "step": 8685 }, { "epoch": 3.8604444444444446, "grad_norm": 3.541867971420288, "learning_rate": 4.562277580071175e-05, "loss": 1.4128, "step": 8686 }, { "epoch": 3.860888888888889, "grad_norm": 3.281956434249878, "learning_rate": 4.560498220640569e-05, "loss": 1.2347, "step": 8687 }, { "epoch": 3.8613333333333335, "grad_norm": 3.273998498916626, "learning_rate": 4.558718861209965e-05, "loss": 0.9553, "step": 8688 }, { "epoch": 3.8617777777777778, "grad_norm": 3.5180327892303467, "learning_rate": 4.55693950177936e-05, "loss": 1.2825, "step": 8689 }, { "epoch": 3.862222222222222, "grad_norm": 3.3110413551330566, "learning_rate": 4.555160142348754e-05, "loss": 1.1399, "step": 8690 }, { "epoch": 3.8626666666666667, "grad_norm": 3.2750625610351562, "learning_rate": 4.55338078291815e-05, "loss": 1.181, "step": 8691 }, { "epoch": 3.8631111111111114, "grad_norm": 3.3777711391448975, "learning_rate": 4.551601423487545e-05, "loss": 0.9839, "step": 8692 }, { "epoch": 3.8635555555555556, "grad_norm": 4.099118232727051, "learning_rate": 4.54982206405694e-05, "loss": 1.184, "step": 8693 }, { "epoch": 3.864, "grad_norm": 3.8149666786193848, "learning_rate": 4.548042704626335e-05, "loss": 1.387, "step": 8694 }, { "epoch": 3.8644444444444446, "grad_norm": 3.8572885990142822, "learning_rate": 4.5462633451957297e-05, "loss": 1.1251, "step": 8695 }, { "epoch": 3.864888888888889, "grad_norm": 3.872878313064575, "learning_rate": 4.544483985765125e-05, "loss": 1.2452, "step": 8696 }, { "epoch": 3.865333333333333, "grad_norm": 4.542336940765381, "learning_rate": 4.5427046263345196e-05, "loss": 1.1995, "step": 8697 }, { "epoch": 3.8657777777777778, "grad_norm": 4.309566974639893, "learning_rate": 4.5409252669039146e-05, "loss": 1.4818, "step": 8698 }, { "epoch": 3.8662222222222224, "grad_norm": 4.477492332458496, "learning_rate": 4.53914590747331e-05, "loss": 1.4093, "step": 8699 }, { "epoch": 3.8666666666666667, "grad_norm": 3.625142812728882, "learning_rate": 4.5373665480427045e-05, "loss": 0.8632, "step": 8700 }, { "epoch": 3.867111111111111, "grad_norm": 1.706745982170105, "learning_rate": 4.5355871886121e-05, "loss": 1.0516, "step": 8701 }, { "epoch": 3.8675555555555556, "grad_norm": 2.5270049571990967, "learning_rate": 4.533807829181495e-05, "loss": 1.6525, "step": 8702 }, { "epoch": 3.868, "grad_norm": 1.587844729423523, "learning_rate": 4.5320284697508894e-05, "loss": 0.7329, "step": 8703 }, { "epoch": 3.8684444444444446, "grad_norm": 3.1046266555786133, "learning_rate": 4.530249110320285e-05, "loss": 1.6955, "step": 8704 }, { "epoch": 3.868888888888889, "grad_norm": 3.3074681758880615, "learning_rate": 4.52846975088968e-05, "loss": 1.6316, "step": 8705 }, { "epoch": 3.8693333333333335, "grad_norm": 3.674323320388794, "learning_rate": 4.526690391459075e-05, "loss": 2.3296, "step": 8706 }, { "epoch": 3.8697777777777778, "grad_norm": 3.3690025806427, "learning_rate": 4.52491103202847e-05, "loss": 1.4658, "step": 8707 }, { "epoch": 3.870222222222222, "grad_norm": 3.0607759952545166, "learning_rate": 4.523131672597865e-05, "loss": 1.8122, "step": 8708 }, { "epoch": 3.8706666666666667, "grad_norm": 3.5003387928009033, "learning_rate": 4.52135231316726e-05, "loss": 1.5672, "step": 8709 }, { "epoch": 3.871111111111111, "grad_norm": 3.6459617614746094, "learning_rate": 4.519572953736655e-05, "loss": 1.6405, "step": 8710 }, { "epoch": 3.8715555555555556, "grad_norm": 3.704782247543335, "learning_rate": 4.51779359430605e-05, "loss": 1.7152, "step": 8711 }, { "epoch": 3.872, "grad_norm": 3.561216115951538, "learning_rate": 4.5160142348754455e-05, "loss": 1.2059, "step": 8712 }, { "epoch": 3.8724444444444446, "grad_norm": 3.5210413932800293, "learning_rate": 4.51423487544484e-05, "loss": 1.4903, "step": 8713 }, { "epoch": 3.872888888888889, "grad_norm": 3.023265838623047, "learning_rate": 4.512455516014235e-05, "loss": 1.3212, "step": 8714 }, { "epoch": 3.873333333333333, "grad_norm": 3.663731098175049, "learning_rate": 4.5106761565836305e-05, "loss": 1.3391, "step": 8715 }, { "epoch": 3.8737777777777778, "grad_norm": 3.64522385597229, "learning_rate": 4.508896797153025e-05, "loss": 0.937, "step": 8716 }, { "epoch": 3.8742222222222225, "grad_norm": 3.8014652729034424, "learning_rate": 4.5071174377224204e-05, "loss": 1.5124, "step": 8717 }, { "epoch": 3.8746666666666667, "grad_norm": 2.626232624053955, "learning_rate": 4.5053380782918154e-05, "loss": 0.9118, "step": 8718 }, { "epoch": 3.875111111111111, "grad_norm": 3.8472952842712402, "learning_rate": 4.50355871886121e-05, "loss": 1.2868, "step": 8719 }, { "epoch": 3.8755555555555556, "grad_norm": 3.5172226428985596, "learning_rate": 4.501779359430605e-05, "loss": 1.4457, "step": 8720 }, { "epoch": 3.876, "grad_norm": 2.7889018058776855, "learning_rate": 4.5e-05, "loss": 1.2991, "step": 8721 }, { "epoch": 3.8764444444444446, "grad_norm": 3.4194562435150146, "learning_rate": 4.498220640569395e-05, "loss": 1.5408, "step": 8722 }, { "epoch": 3.876888888888889, "grad_norm": 3.4279887676239014, "learning_rate": 4.49644128113879e-05, "loss": 1.0715, "step": 8723 }, { "epoch": 3.8773333333333335, "grad_norm": 3.7102408409118652, "learning_rate": 4.494661921708185e-05, "loss": 1.19, "step": 8724 }, { "epoch": 3.8777777777777778, "grad_norm": 4.325709342956543, "learning_rate": 4.492882562277581e-05, "loss": 0.7462, "step": 8725 }, { "epoch": 3.878222222222222, "grad_norm": 3.521773099899292, "learning_rate": 4.491103202846975e-05, "loss": 0.9747, "step": 8726 }, { "epoch": 3.8786666666666667, "grad_norm": 3.7296512126922607, "learning_rate": 4.48932384341637e-05, "loss": 1.3466, "step": 8727 }, { "epoch": 3.879111111111111, "grad_norm": 2.833895444869995, "learning_rate": 4.487544483985766e-05, "loss": 1.0873, "step": 8728 }, { "epoch": 3.8795555555555556, "grad_norm": 3.184312105178833, "learning_rate": 4.48576512455516e-05, "loss": 1.184, "step": 8729 }, { "epoch": 3.88, "grad_norm": 2.265151023864746, "learning_rate": 4.483985765124555e-05, "loss": 0.5063, "step": 8730 }, { "epoch": 3.8804444444444446, "grad_norm": 3.3480918407440186, "learning_rate": 4.482206405693951e-05, "loss": 1.0596, "step": 8731 }, { "epoch": 3.880888888888889, "grad_norm": 4.041034698486328, "learning_rate": 4.480427046263345e-05, "loss": 1.4421, "step": 8732 }, { "epoch": 3.881333333333333, "grad_norm": 2.776762008666992, "learning_rate": 4.4786476868327406e-05, "loss": 0.9752, "step": 8733 }, { "epoch": 3.8817777777777778, "grad_norm": 3.5926032066345215, "learning_rate": 4.4768683274021356e-05, "loss": 1.1643, "step": 8734 }, { "epoch": 3.8822222222222225, "grad_norm": 3.4924111366271973, "learning_rate": 4.47508896797153e-05, "loss": 1.1958, "step": 8735 }, { "epoch": 3.8826666666666667, "grad_norm": 3.821939468383789, "learning_rate": 4.4733096085409255e-05, "loss": 1.3903, "step": 8736 }, { "epoch": 3.883111111111111, "grad_norm": 2.8820078372955322, "learning_rate": 4.4715302491103205e-05, "loss": 0.5837, "step": 8737 }, { "epoch": 3.8835555555555556, "grad_norm": 4.466034889221191, "learning_rate": 4.4697508896797155e-05, "loss": 1.4463, "step": 8738 }, { "epoch": 3.884, "grad_norm": 3.6252734661102295, "learning_rate": 4.4679715302491105e-05, "loss": 1.1598, "step": 8739 }, { "epoch": 3.8844444444444446, "grad_norm": 3.406493902206421, "learning_rate": 4.4661921708185054e-05, "loss": 0.9051, "step": 8740 }, { "epoch": 3.884888888888889, "grad_norm": 3.9739794731140137, "learning_rate": 4.464412811387901e-05, "loss": 1.7437, "step": 8741 }, { "epoch": 3.8853333333333335, "grad_norm": 4.158414840698242, "learning_rate": 4.4626334519572954e-05, "loss": 1.689, "step": 8742 }, { "epoch": 3.8857777777777778, "grad_norm": 4.537271022796631, "learning_rate": 4.4608540925266903e-05, "loss": 1.1898, "step": 8743 }, { "epoch": 3.886222222222222, "grad_norm": 2.962108612060547, "learning_rate": 4.459074733096086e-05, "loss": 1.0265, "step": 8744 }, { "epoch": 3.8866666666666667, "grad_norm": 3.3965911865234375, "learning_rate": 4.45729537366548e-05, "loss": 1.1665, "step": 8745 }, { "epoch": 3.887111111111111, "grad_norm": 4.494457244873047, "learning_rate": 4.455516014234876e-05, "loss": 1.4225, "step": 8746 }, { "epoch": 3.8875555555555557, "grad_norm": 4.060354709625244, "learning_rate": 4.453736654804271e-05, "loss": 1.1413, "step": 8747 }, { "epoch": 3.888, "grad_norm": 4.178462982177734, "learning_rate": 4.451957295373665e-05, "loss": 1.4544, "step": 8748 }, { "epoch": 3.8884444444444446, "grad_norm": 3.7196671962738037, "learning_rate": 4.450177935943061e-05, "loss": 1.0486, "step": 8749 }, { "epoch": 3.888888888888889, "grad_norm": 5.249838352203369, "learning_rate": 4.448398576512456e-05, "loss": 1.2359, "step": 8750 }, { "epoch": 3.889333333333333, "grad_norm": 2.497725248336792, "learning_rate": 4.446619217081851e-05, "loss": 1.4867, "step": 8751 }, { "epoch": 3.889777777777778, "grad_norm": 2.9607138633728027, "learning_rate": 4.444839857651246e-05, "loss": 1.9622, "step": 8752 }, { "epoch": 3.8902222222222225, "grad_norm": 2.8709583282470703, "learning_rate": 4.443060498220641e-05, "loss": 1.7152, "step": 8753 }, { "epoch": 3.8906666666666667, "grad_norm": 2.264155387878418, "learning_rate": 4.441281138790036e-05, "loss": 1.1801, "step": 8754 }, { "epoch": 3.891111111111111, "grad_norm": 2.909691333770752, "learning_rate": 4.439501779359431e-05, "loss": 1.4905, "step": 8755 }, { "epoch": 3.8915555555555557, "grad_norm": 2.7087597846984863, "learning_rate": 4.437722419928826e-05, "loss": 1.0053, "step": 8756 }, { "epoch": 3.892, "grad_norm": 3.0195884704589844, "learning_rate": 4.435943060498221e-05, "loss": 1.4873, "step": 8757 }, { "epoch": 3.8924444444444446, "grad_norm": 1.9923820495605469, "learning_rate": 4.4341637010676156e-05, "loss": 0.7845, "step": 8758 }, { "epoch": 3.892888888888889, "grad_norm": 3.6413817405700684, "learning_rate": 4.4323843416370106e-05, "loss": 1.5665, "step": 8759 }, { "epoch": 3.8933333333333335, "grad_norm": 3.3323585987091064, "learning_rate": 4.430604982206406e-05, "loss": 1.3828, "step": 8760 }, { "epoch": 3.893777777777778, "grad_norm": 3.166923999786377, "learning_rate": 4.4288256227758005e-05, "loss": 1.5736, "step": 8761 }, { "epoch": 3.894222222222222, "grad_norm": 3.236964464187622, "learning_rate": 4.427046263345196e-05, "loss": 1.5058, "step": 8762 }, { "epoch": 3.8946666666666667, "grad_norm": 3.7442686557769775, "learning_rate": 4.425266903914591e-05, "loss": 1.1915, "step": 8763 }, { "epoch": 3.895111111111111, "grad_norm": 3.836761236190796, "learning_rate": 4.4234875444839854e-05, "loss": 1.8305, "step": 8764 }, { "epoch": 3.8955555555555557, "grad_norm": 3.648864984512329, "learning_rate": 4.421708185053381e-05, "loss": 1.4003, "step": 8765 }, { "epoch": 3.896, "grad_norm": 3.3367178440093994, "learning_rate": 4.419928825622776e-05, "loss": 1.5771, "step": 8766 }, { "epoch": 3.8964444444444446, "grad_norm": 4.111900806427002, "learning_rate": 4.418149466192171e-05, "loss": 1.5626, "step": 8767 }, { "epoch": 3.896888888888889, "grad_norm": 3.5727264881134033, "learning_rate": 4.416370106761566e-05, "loss": 1.5124, "step": 8768 }, { "epoch": 3.897333333333333, "grad_norm": 3.073716640472412, "learning_rate": 4.414590747330961e-05, "loss": 1.3135, "step": 8769 }, { "epoch": 3.897777777777778, "grad_norm": 3.225069046020508, "learning_rate": 4.4128113879003566e-05, "loss": 1.4713, "step": 8770 }, { "epoch": 3.8982222222222225, "grad_norm": 3.486926555633545, "learning_rate": 4.411032028469751e-05, "loss": 1.2325, "step": 8771 }, { "epoch": 3.8986666666666667, "grad_norm": 3.7756948471069336, "learning_rate": 4.409252669039146e-05, "loss": 1.4494, "step": 8772 }, { "epoch": 3.899111111111111, "grad_norm": 3.8202269077301025, "learning_rate": 4.4074733096085415e-05, "loss": 1.3107, "step": 8773 }, { "epoch": 3.8995555555555557, "grad_norm": 3.3835747241973877, "learning_rate": 4.405693950177936e-05, "loss": 1.5086, "step": 8774 }, { "epoch": 3.9, "grad_norm": 3.0347068309783936, "learning_rate": 4.403914590747331e-05, "loss": 1.3952, "step": 8775 }, { "epoch": 3.9004444444444446, "grad_norm": 3.6462948322296143, "learning_rate": 4.4021352313167265e-05, "loss": 1.0433, "step": 8776 }, { "epoch": 3.900888888888889, "grad_norm": 4.498116970062256, "learning_rate": 4.400355871886121e-05, "loss": 1.5242, "step": 8777 }, { "epoch": 3.9013333333333335, "grad_norm": 3.007981061935425, "learning_rate": 4.3985765124555164e-05, "loss": 1.2649, "step": 8778 }, { "epoch": 3.901777777777778, "grad_norm": 4.1256208419799805, "learning_rate": 4.3967971530249114e-05, "loss": 0.9533, "step": 8779 }, { "epoch": 3.902222222222222, "grad_norm": 3.121368169784546, "learning_rate": 4.395017793594306e-05, "loss": 1.1291, "step": 8780 }, { "epoch": 3.9026666666666667, "grad_norm": 2.9976303577423096, "learning_rate": 4.393238434163701e-05, "loss": 1.0546, "step": 8781 }, { "epoch": 3.903111111111111, "grad_norm": 3.8373613357543945, "learning_rate": 4.391459074733096e-05, "loss": 1.2271, "step": 8782 }, { "epoch": 3.9035555555555557, "grad_norm": 3.4057092666625977, "learning_rate": 4.389679715302491e-05, "loss": 1.0357, "step": 8783 }, { "epoch": 3.904, "grad_norm": 3.2696046829223633, "learning_rate": 4.387900355871886e-05, "loss": 1.0851, "step": 8784 }, { "epoch": 3.9044444444444446, "grad_norm": 3.9882287979125977, "learning_rate": 4.386120996441281e-05, "loss": 0.895, "step": 8785 }, { "epoch": 3.904888888888889, "grad_norm": 2.950498104095459, "learning_rate": 4.384341637010677e-05, "loss": 1.1437, "step": 8786 }, { "epoch": 3.905333333333333, "grad_norm": 3.5203466415405273, "learning_rate": 4.382562277580071e-05, "loss": 1.3823, "step": 8787 }, { "epoch": 3.905777777777778, "grad_norm": 2.380671262741089, "learning_rate": 4.380782918149466e-05, "loss": 0.7028, "step": 8788 }, { "epoch": 3.9062222222222225, "grad_norm": 3.788663625717163, "learning_rate": 4.379003558718862e-05, "loss": 1.3234, "step": 8789 }, { "epoch": 3.9066666666666667, "grad_norm": 2.707402467727661, "learning_rate": 4.377224199288256e-05, "loss": 0.5875, "step": 8790 }, { "epoch": 3.907111111111111, "grad_norm": 3.8801896572113037, "learning_rate": 4.375444839857652e-05, "loss": 1.3117, "step": 8791 }, { "epoch": 3.9075555555555557, "grad_norm": 3.4378976821899414, "learning_rate": 4.373665480427047e-05, "loss": 1.6461, "step": 8792 }, { "epoch": 3.908, "grad_norm": 4.15399169921875, "learning_rate": 4.371886120996441e-05, "loss": 1.313, "step": 8793 }, { "epoch": 3.9084444444444446, "grad_norm": 5.039565086364746, "learning_rate": 4.3701067615658366e-05, "loss": 1.3966, "step": 8794 }, { "epoch": 3.908888888888889, "grad_norm": 4.0032854080200195, "learning_rate": 4.3683274021352316e-05, "loss": 1.4609, "step": 8795 }, { "epoch": 3.9093333333333335, "grad_norm": 3.7062151432037354, "learning_rate": 4.3665480427046266e-05, "loss": 1.0876, "step": 8796 }, { "epoch": 3.909777777777778, "grad_norm": 4.596202373504639, "learning_rate": 4.3647686832740216e-05, "loss": 1.5062, "step": 8797 }, { "epoch": 3.910222222222222, "grad_norm": 4.020972728729248, "learning_rate": 4.3629893238434165e-05, "loss": 1.3268, "step": 8798 }, { "epoch": 3.9106666666666667, "grad_norm": 4.46404504776001, "learning_rate": 4.3612099644128115e-05, "loss": 0.916, "step": 8799 }, { "epoch": 3.911111111111111, "grad_norm": 3.5452182292938232, "learning_rate": 4.3594306049822065e-05, "loss": 0.8356, "step": 8800 }, { "epoch": 3.9115555555555557, "grad_norm": 2.5292141437530518, "learning_rate": 4.3576512455516014e-05, "loss": 2.0216, "step": 8801 }, { "epoch": 3.912, "grad_norm": 2.2973062992095947, "learning_rate": 4.355871886120997e-05, "loss": 0.8748, "step": 8802 }, { "epoch": 3.9124444444444446, "grad_norm": 2.9960880279541016, "learning_rate": 4.3540925266903914e-05, "loss": 1.6939, "step": 8803 }, { "epoch": 3.912888888888889, "grad_norm": 2.8162448406219482, "learning_rate": 4.3523131672597864e-05, "loss": 1.1804, "step": 8804 }, { "epoch": 3.913333333333333, "grad_norm": 3.2202110290527344, "learning_rate": 4.350533807829182e-05, "loss": 1.2968, "step": 8805 }, { "epoch": 3.913777777777778, "grad_norm": 3.30061936378479, "learning_rate": 4.348754448398576e-05, "loss": 1.4436, "step": 8806 }, { "epoch": 3.9142222222222225, "grad_norm": 3.3244714736938477, "learning_rate": 4.346975088967972e-05, "loss": 1.6709, "step": 8807 }, { "epoch": 3.9146666666666667, "grad_norm": 3.8059160709381104, "learning_rate": 4.345195729537367e-05, "loss": 1.5652, "step": 8808 }, { "epoch": 3.915111111111111, "grad_norm": 4.244156360626221, "learning_rate": 4.343416370106761e-05, "loss": 1.6886, "step": 8809 }, { "epoch": 3.9155555555555557, "grad_norm": 3.246370553970337, "learning_rate": 4.341637010676157e-05, "loss": 1.2196, "step": 8810 }, { "epoch": 3.916, "grad_norm": 3.3758699893951416, "learning_rate": 4.339857651245552e-05, "loss": 1.3343, "step": 8811 }, { "epoch": 3.916444444444444, "grad_norm": 3.9745421409606934, "learning_rate": 4.338078291814947e-05, "loss": 1.5243, "step": 8812 }, { "epoch": 3.916888888888889, "grad_norm": 2.7803597450256348, "learning_rate": 4.336298932384342e-05, "loss": 0.7635, "step": 8813 }, { "epoch": 3.9173333333333336, "grad_norm": 2.751984119415283, "learning_rate": 4.334519572953737e-05, "loss": 1.0825, "step": 8814 }, { "epoch": 3.917777777777778, "grad_norm": 3.5463430881500244, "learning_rate": 4.3327402135231324e-05, "loss": 0.7509, "step": 8815 }, { "epoch": 3.918222222222222, "grad_norm": 2.5409305095672607, "learning_rate": 4.330960854092527e-05, "loss": 0.7807, "step": 8816 }, { "epoch": 3.9186666666666667, "grad_norm": 3.2667789459228516, "learning_rate": 4.329181494661922e-05, "loss": 1.3171, "step": 8817 }, { "epoch": 3.919111111111111, "grad_norm": 2.9117629528045654, "learning_rate": 4.327402135231317e-05, "loss": 1.1148, "step": 8818 }, { "epoch": 3.9195555555555557, "grad_norm": 3.8055434226989746, "learning_rate": 4.3256227758007116e-05, "loss": 1.6964, "step": 8819 }, { "epoch": 3.92, "grad_norm": 3.8291947841644287, "learning_rate": 4.3238434163701066e-05, "loss": 1.5032, "step": 8820 }, { "epoch": 3.9204444444444446, "grad_norm": 3.757000207901001, "learning_rate": 4.322064056939502e-05, "loss": 1.4395, "step": 8821 }, { "epoch": 3.920888888888889, "grad_norm": 3.474553108215332, "learning_rate": 4.3202846975088965e-05, "loss": 1.2536, "step": 8822 }, { "epoch": 3.921333333333333, "grad_norm": 3.879840612411499, "learning_rate": 4.318505338078292e-05, "loss": 1.3524, "step": 8823 }, { "epoch": 3.921777777777778, "grad_norm": 3.432889699935913, "learning_rate": 4.316725978647687e-05, "loss": 1.5825, "step": 8824 }, { "epoch": 3.9222222222222225, "grad_norm": 3.799661159515381, "learning_rate": 4.314946619217082e-05, "loss": 1.4398, "step": 8825 }, { "epoch": 3.9226666666666667, "grad_norm": 3.7226650714874268, "learning_rate": 4.313167259786477e-05, "loss": 1.1822, "step": 8826 }, { "epoch": 3.923111111111111, "grad_norm": 3.7417545318603516, "learning_rate": 4.311387900355872e-05, "loss": 1.4244, "step": 8827 }, { "epoch": 3.9235555555555557, "grad_norm": 3.5519144535064697, "learning_rate": 4.309608540925267e-05, "loss": 1.4199, "step": 8828 }, { "epoch": 3.924, "grad_norm": 3.4887189865112305, "learning_rate": 4.307829181494662e-05, "loss": 1.4381, "step": 8829 }, { "epoch": 3.924444444444444, "grad_norm": 3.223085880279541, "learning_rate": 4.306049822064057e-05, "loss": 1.0015, "step": 8830 }, { "epoch": 3.924888888888889, "grad_norm": 3.3802876472473145, "learning_rate": 4.3042704626334526e-05, "loss": 1.0103, "step": 8831 }, { "epoch": 3.9253333333333336, "grad_norm": 3.1447699069976807, "learning_rate": 4.302491103202847e-05, "loss": 1.2326, "step": 8832 }, { "epoch": 3.925777777777778, "grad_norm": 3.206096887588501, "learning_rate": 4.300711743772242e-05, "loss": 0.9133, "step": 8833 }, { "epoch": 3.926222222222222, "grad_norm": 3.67698335647583, "learning_rate": 4.2989323843416376e-05, "loss": 1.1838, "step": 8834 }, { "epoch": 3.9266666666666667, "grad_norm": 3.849926233291626, "learning_rate": 4.297153024911032e-05, "loss": 1.1519, "step": 8835 }, { "epoch": 3.927111111111111, "grad_norm": 3.2638113498687744, "learning_rate": 4.2953736654804275e-05, "loss": 1.0861, "step": 8836 }, { "epoch": 3.9275555555555557, "grad_norm": 2.141021728515625, "learning_rate": 4.2935943060498225e-05, "loss": 0.3943, "step": 8837 }, { "epoch": 3.928, "grad_norm": 2.9865190982818604, "learning_rate": 4.291814946619217e-05, "loss": 0.9882, "step": 8838 }, { "epoch": 3.9284444444444446, "grad_norm": 3.7992167472839355, "learning_rate": 4.2900355871886124e-05, "loss": 1.3604, "step": 8839 }, { "epoch": 3.928888888888889, "grad_norm": 5.145820617675781, "learning_rate": 4.2882562277580074e-05, "loss": 1.3215, "step": 8840 }, { "epoch": 3.929333333333333, "grad_norm": 3.7761855125427246, "learning_rate": 4.2864768683274024e-05, "loss": 1.3284, "step": 8841 }, { "epoch": 3.929777777777778, "grad_norm": 4.058622360229492, "learning_rate": 4.284697508896797e-05, "loss": 1.3971, "step": 8842 }, { "epoch": 3.930222222222222, "grad_norm": 4.228296279907227, "learning_rate": 4.282918149466192e-05, "loss": 1.4811, "step": 8843 }, { "epoch": 3.9306666666666668, "grad_norm": 4.4582085609436035, "learning_rate": 4.281138790035587e-05, "loss": 1.2761, "step": 8844 }, { "epoch": 3.931111111111111, "grad_norm": 3.634289026260376, "learning_rate": 4.279359430604982e-05, "loss": 0.9966, "step": 8845 }, { "epoch": 3.9315555555555557, "grad_norm": 5.011377334594727, "learning_rate": 4.277580071174377e-05, "loss": 1.1816, "step": 8846 }, { "epoch": 3.932, "grad_norm": 4.198838233947754, "learning_rate": 4.275800711743773e-05, "loss": 1.4124, "step": 8847 }, { "epoch": 3.932444444444444, "grad_norm": 4.471510887145996, "learning_rate": 4.274021352313167e-05, "loss": 1.0095, "step": 8848 }, { "epoch": 3.932888888888889, "grad_norm": 3.8801698684692383, "learning_rate": 4.272241992882562e-05, "loss": 0.4851, "step": 8849 }, { "epoch": 3.9333333333333336, "grad_norm": 3.203237533569336, "learning_rate": 4.270462633451958e-05, "loss": 0.411, "step": 8850 }, { "epoch": 3.933777777777778, "grad_norm": 2.3682878017425537, "learning_rate": 4.268683274021352e-05, "loss": 2.0329, "step": 8851 }, { "epoch": 3.934222222222222, "grad_norm": 2.3607728481292725, "learning_rate": 4.266903914590748e-05, "loss": 1.0491, "step": 8852 }, { "epoch": 3.9346666666666668, "grad_norm": 3.4332642555236816, "learning_rate": 4.265124555160143e-05, "loss": 1.532, "step": 8853 }, { "epoch": 3.935111111111111, "grad_norm": 3.4008309841156006, "learning_rate": 4.263345195729538e-05, "loss": 1.6618, "step": 8854 }, { "epoch": 3.9355555555555557, "grad_norm": 3.161529779434204, "learning_rate": 4.2615658362989326e-05, "loss": 1.4769, "step": 8855 }, { "epoch": 3.936, "grad_norm": 3.2563178539276123, "learning_rate": 4.2597864768683276e-05, "loss": 1.8546, "step": 8856 }, { "epoch": 3.9364444444444446, "grad_norm": 3.0360910892486572, "learning_rate": 4.2580071174377226e-05, "loss": 1.0507, "step": 8857 }, { "epoch": 3.936888888888889, "grad_norm": 3.976771116256714, "learning_rate": 4.2562277580071176e-05, "loss": 1.6975, "step": 8858 }, { "epoch": 3.937333333333333, "grad_norm": 3.122298002243042, "learning_rate": 4.2544483985765125e-05, "loss": 1.4874, "step": 8859 }, { "epoch": 3.937777777777778, "grad_norm": 3.1202025413513184, "learning_rate": 4.252669039145908e-05, "loss": 1.4433, "step": 8860 }, { "epoch": 3.938222222222222, "grad_norm": 4.454637050628662, "learning_rate": 4.2508896797153025e-05, "loss": 1.6434, "step": 8861 }, { "epoch": 3.9386666666666668, "grad_norm": 3.968618392944336, "learning_rate": 4.2491103202846975e-05, "loss": 1.365, "step": 8862 }, { "epoch": 3.939111111111111, "grad_norm": 3.396158456802368, "learning_rate": 4.247330960854093e-05, "loss": 1.464, "step": 8863 }, { "epoch": 3.9395555555555557, "grad_norm": 2.9130702018737793, "learning_rate": 4.2455516014234874e-05, "loss": 0.9835, "step": 8864 }, { "epoch": 3.94, "grad_norm": 3.9278974533081055, "learning_rate": 4.2437722419928824e-05, "loss": 1.585, "step": 8865 }, { "epoch": 3.940444444444444, "grad_norm": 4.043193817138672, "learning_rate": 4.241992882562278e-05, "loss": 1.4772, "step": 8866 }, { "epoch": 3.940888888888889, "grad_norm": 3.436844825744629, "learning_rate": 4.240213523131672e-05, "loss": 1.0931, "step": 8867 }, { "epoch": 3.9413333333333336, "grad_norm": 3.061730146408081, "learning_rate": 4.238434163701068e-05, "loss": 1.3839, "step": 8868 }, { "epoch": 3.941777777777778, "grad_norm": 3.5630033016204834, "learning_rate": 4.236654804270463e-05, "loss": 1.4972, "step": 8869 }, { "epoch": 3.942222222222222, "grad_norm": 5.172977924346924, "learning_rate": 4.234875444839858e-05, "loss": 1.0713, "step": 8870 }, { "epoch": 3.9426666666666668, "grad_norm": 3.7209513187408447, "learning_rate": 4.233096085409253e-05, "loss": 1.5315, "step": 8871 }, { "epoch": 3.943111111111111, "grad_norm": 3.7008655071258545, "learning_rate": 4.231316725978648e-05, "loss": 1.2798, "step": 8872 }, { "epoch": 3.9435555555555557, "grad_norm": 3.4888625144958496, "learning_rate": 4.229537366548043e-05, "loss": 1.139, "step": 8873 }, { "epoch": 3.944, "grad_norm": 3.5707790851593018, "learning_rate": 4.227758007117438e-05, "loss": 1.2187, "step": 8874 }, { "epoch": 3.9444444444444446, "grad_norm": 3.2470076084136963, "learning_rate": 4.225978647686833e-05, "loss": 1.5406, "step": 8875 }, { "epoch": 3.944888888888889, "grad_norm": 3.3586134910583496, "learning_rate": 4.2241992882562284e-05, "loss": 1.3913, "step": 8876 }, { "epoch": 3.945333333333333, "grad_norm": 4.539661407470703, "learning_rate": 4.222419928825623e-05, "loss": 1.8586, "step": 8877 }, { "epoch": 3.945777777777778, "grad_norm": 3.6496472358703613, "learning_rate": 4.220640569395018e-05, "loss": 1.5811, "step": 8878 }, { "epoch": 3.946222222222222, "grad_norm": 2.9896080493927, "learning_rate": 4.218861209964413e-05, "loss": 1.1142, "step": 8879 }, { "epoch": 3.9466666666666668, "grad_norm": 4.495405673980713, "learning_rate": 4.2170818505338076e-05, "loss": 1.316, "step": 8880 }, { "epoch": 3.947111111111111, "grad_norm": 2.01225209236145, "learning_rate": 4.215302491103203e-05, "loss": 0.4757, "step": 8881 }, { "epoch": 3.9475555555555557, "grad_norm": 2.771531343460083, "learning_rate": 4.213523131672598e-05, "loss": 0.4966, "step": 8882 }, { "epoch": 3.948, "grad_norm": 3.772594928741455, "learning_rate": 4.211743772241993e-05, "loss": 1.0786, "step": 8883 }, { "epoch": 3.948444444444444, "grad_norm": 3.722163438796997, "learning_rate": 4.209964412811388e-05, "loss": 1.3533, "step": 8884 }, { "epoch": 3.948888888888889, "grad_norm": 4.219447135925293, "learning_rate": 4.208185053380783e-05, "loss": 1.7263, "step": 8885 }, { "epoch": 3.9493333333333336, "grad_norm": 3.999436855316162, "learning_rate": 4.206405693950178e-05, "loss": 1.3957, "step": 8886 }, { "epoch": 3.949777777777778, "grad_norm": 3.6108932495117188, "learning_rate": 4.204626334519573e-05, "loss": 1.2958, "step": 8887 }, { "epoch": 3.950222222222222, "grad_norm": 3.92063570022583, "learning_rate": 4.202846975088968e-05, "loss": 1.4854, "step": 8888 }, { "epoch": 3.9506666666666668, "grad_norm": 3.5336453914642334, "learning_rate": 4.201067615658363e-05, "loss": 1.0695, "step": 8889 }, { "epoch": 3.951111111111111, "grad_norm": 4.354097366333008, "learning_rate": 4.199288256227758e-05, "loss": 1.4669, "step": 8890 }, { "epoch": 3.9515555555555557, "grad_norm": 4.140068054199219, "learning_rate": 4.197508896797153e-05, "loss": 1.292, "step": 8891 }, { "epoch": 3.952, "grad_norm": 4.120720863342285, "learning_rate": 4.1957295373665486e-05, "loss": 1.2629, "step": 8892 }, { "epoch": 3.9524444444444446, "grad_norm": 4.0067901611328125, "learning_rate": 4.193950177935943e-05, "loss": 1.318, "step": 8893 }, { "epoch": 3.952888888888889, "grad_norm": 3.640927314758301, "learning_rate": 4.192170818505338e-05, "loss": 0.8191, "step": 8894 }, { "epoch": 3.953333333333333, "grad_norm": 3.907336473464966, "learning_rate": 4.1903914590747336e-05, "loss": 1.0409, "step": 8895 }, { "epoch": 3.953777777777778, "grad_norm": 4.239986419677734, "learning_rate": 4.188612099644128e-05, "loss": 1.4091, "step": 8896 }, { "epoch": 3.954222222222222, "grad_norm": 4.236495494842529, "learning_rate": 4.1868327402135235e-05, "loss": 1.1437, "step": 8897 }, { "epoch": 3.9546666666666668, "grad_norm": 3.6864097118377686, "learning_rate": 4.1850533807829185e-05, "loss": 1.0221, "step": 8898 }, { "epoch": 3.955111111111111, "grad_norm": 5.639880180358887, "learning_rate": 4.1832740213523135e-05, "loss": 1.8105, "step": 8899 }, { "epoch": 3.9555555555555557, "grad_norm": 3.3932342529296875, "learning_rate": 4.1814946619217084e-05, "loss": 0.5343, "step": 8900 }, { "epoch": 3.956, "grad_norm": 2.1324033737182617, "learning_rate": 4.1797153024911034e-05, "loss": 2.0662, "step": 8901 }, { "epoch": 3.956444444444444, "grad_norm": 2.3241565227508545, "learning_rate": 4.1779359430604984e-05, "loss": 1.8588, "step": 8902 }, { "epoch": 3.956888888888889, "grad_norm": 2.4009382724761963, "learning_rate": 4.1761565836298933e-05, "loss": 2.0112, "step": 8903 }, { "epoch": 3.9573333333333336, "grad_norm": 2.540668487548828, "learning_rate": 4.174377224199288e-05, "loss": 1.4803, "step": 8904 }, { "epoch": 3.957777777777778, "grad_norm": 2.241854190826416, "learning_rate": 4.172597864768684e-05, "loss": 1.6004, "step": 8905 }, { "epoch": 3.958222222222222, "grad_norm": 3.1838226318359375, "learning_rate": 4.170818505338078e-05, "loss": 1.6044, "step": 8906 }, { "epoch": 3.958666666666667, "grad_norm": 2.7227938175201416, "learning_rate": 4.169039145907473e-05, "loss": 1.3226, "step": 8907 }, { "epoch": 3.959111111111111, "grad_norm": 2.933244466781616, "learning_rate": 4.167259786476869e-05, "loss": 1.3356, "step": 8908 }, { "epoch": 3.9595555555555557, "grad_norm": 2.794614315032959, "learning_rate": 4.165480427046263e-05, "loss": 1.3394, "step": 8909 }, { "epoch": 3.96, "grad_norm": 3.1257002353668213, "learning_rate": 4.163701067615658e-05, "loss": 1.6075, "step": 8910 }, { "epoch": 3.9604444444444447, "grad_norm": 3.743485927581787, "learning_rate": 4.161921708185054e-05, "loss": 1.9586, "step": 8911 }, { "epoch": 3.960888888888889, "grad_norm": 2.8176426887512207, "learning_rate": 4.160142348754449e-05, "loss": 1.7985, "step": 8912 }, { "epoch": 3.961333333333333, "grad_norm": 3.1608290672302246, "learning_rate": 4.158362989323844e-05, "loss": 1.729, "step": 8913 }, { "epoch": 3.961777777777778, "grad_norm": 3.7492446899414062, "learning_rate": 4.156583629893239e-05, "loss": 1.606, "step": 8914 }, { "epoch": 3.962222222222222, "grad_norm": 3.0357353687286377, "learning_rate": 4.154804270462634e-05, "loss": 1.2235, "step": 8915 }, { "epoch": 3.962666666666667, "grad_norm": 3.4608840942382812, "learning_rate": 4.1530249110320287e-05, "loss": 1.6648, "step": 8916 }, { "epoch": 3.963111111111111, "grad_norm": 3.740095615386963, "learning_rate": 4.1512455516014236e-05, "loss": 1.4451, "step": 8917 }, { "epoch": 3.9635555555555557, "grad_norm": 3.2377631664276123, "learning_rate": 4.1494661921708186e-05, "loss": 1.3791, "step": 8918 }, { "epoch": 3.964, "grad_norm": 2.5673611164093018, "learning_rate": 4.1476868327402136e-05, "loss": 0.6982, "step": 8919 }, { "epoch": 3.964444444444444, "grad_norm": 3.1579864025115967, "learning_rate": 4.1459074733096085e-05, "loss": 1.1644, "step": 8920 }, { "epoch": 3.964888888888889, "grad_norm": 3.7532455921173096, "learning_rate": 4.144128113879004e-05, "loss": 1.3643, "step": 8921 }, { "epoch": 3.9653333333333336, "grad_norm": 3.0965757369995117, "learning_rate": 4.1423487544483985e-05, "loss": 1.5047, "step": 8922 }, { "epoch": 3.965777777777778, "grad_norm": 3.5118284225463867, "learning_rate": 4.1405693950177935e-05, "loss": 1.7549, "step": 8923 }, { "epoch": 3.966222222222222, "grad_norm": 3.520890712738037, "learning_rate": 4.138790035587189e-05, "loss": 1.4312, "step": 8924 }, { "epoch": 3.966666666666667, "grad_norm": 3.2968950271606445, "learning_rate": 4.1370106761565834e-05, "loss": 1.1051, "step": 8925 }, { "epoch": 3.967111111111111, "grad_norm": 3.584757089614868, "learning_rate": 4.135231316725979e-05, "loss": 1.4383, "step": 8926 }, { "epoch": 3.9675555555555553, "grad_norm": 2.9149937629699707, "learning_rate": 4.133451957295374e-05, "loss": 0.7798, "step": 8927 }, { "epoch": 3.968, "grad_norm": 4.206515312194824, "learning_rate": 4.131672597864769e-05, "loss": 1.2928, "step": 8928 }, { "epoch": 3.9684444444444447, "grad_norm": 3.031240224838257, "learning_rate": 4.129893238434164e-05, "loss": 1.0482, "step": 8929 }, { "epoch": 3.968888888888889, "grad_norm": 3.6411449909210205, "learning_rate": 4.128113879003559e-05, "loss": 1.0632, "step": 8930 }, { "epoch": 3.969333333333333, "grad_norm": 3.8553061485290527, "learning_rate": 4.126334519572954e-05, "loss": 1.5348, "step": 8931 }, { "epoch": 3.969777777777778, "grad_norm": 3.2213852405548096, "learning_rate": 4.124555160142349e-05, "loss": 1.4563, "step": 8932 }, { "epoch": 3.970222222222222, "grad_norm": 3.013375759124756, "learning_rate": 4.122775800711744e-05, "loss": 1.1047, "step": 8933 }, { "epoch": 3.970666666666667, "grad_norm": 3.3184897899627686, "learning_rate": 4.120996441281139e-05, "loss": 1.2822, "step": 8934 }, { "epoch": 3.971111111111111, "grad_norm": 3.349609375, "learning_rate": 4.119217081850534e-05, "loss": 1.4031, "step": 8935 }, { "epoch": 3.9715555555555557, "grad_norm": 3.315196990966797, "learning_rate": 4.117437722419929e-05, "loss": 1.3917, "step": 8936 }, { "epoch": 3.972, "grad_norm": 3.5079121589660645, "learning_rate": 4.1156583629893244e-05, "loss": 1.3935, "step": 8937 }, { "epoch": 3.9724444444444442, "grad_norm": 3.8228821754455566, "learning_rate": 4.113879003558719e-05, "loss": 1.2339, "step": 8938 }, { "epoch": 3.972888888888889, "grad_norm": 3.724152088165283, "learning_rate": 4.112099644128114e-05, "loss": 1.03, "step": 8939 }, { "epoch": 3.9733333333333336, "grad_norm": 3.229318141937256, "learning_rate": 4.1103202846975093e-05, "loss": 0.7204, "step": 8940 }, { "epoch": 3.973777777777778, "grad_norm": 4.946290493011475, "learning_rate": 4.108540925266904e-05, "loss": 1.647, "step": 8941 }, { "epoch": 3.974222222222222, "grad_norm": 4.947329521179199, "learning_rate": 4.106761565836299e-05, "loss": 1.4321, "step": 8942 }, { "epoch": 3.974666666666667, "grad_norm": 3.631192684173584, "learning_rate": 4.104982206405694e-05, "loss": 1.2327, "step": 8943 }, { "epoch": 3.975111111111111, "grad_norm": 3.9020490646362305, "learning_rate": 4.103202846975089e-05, "loss": 1.3794, "step": 8944 }, { "epoch": 3.9755555555555553, "grad_norm": 4.423495769500732, "learning_rate": 4.101423487544484e-05, "loss": 1.5862, "step": 8945 }, { "epoch": 3.976, "grad_norm": 4.301177024841309, "learning_rate": 4.099644128113879e-05, "loss": 0.9748, "step": 8946 }, { "epoch": 3.9764444444444447, "grad_norm": 3.9020419120788574, "learning_rate": 4.097864768683274e-05, "loss": 0.873, "step": 8947 }, { "epoch": 3.976888888888889, "grad_norm": 4.565361022949219, "learning_rate": 4.096085409252669e-05, "loss": 0.9833, "step": 8948 }, { "epoch": 3.977333333333333, "grad_norm": 3.912226676940918, "learning_rate": 4.094306049822064e-05, "loss": 1.3495, "step": 8949 }, { "epoch": 3.977777777777778, "grad_norm": 5.293661594390869, "learning_rate": 4.09252669039146e-05, "loss": 1.3508, "step": 8950 }, { "epoch": 3.978222222222222, "grad_norm": 0.17634160816669464, "learning_rate": 4.090747330960854e-05, "loss": 0.0163, "step": 8951 }, { "epoch": 3.978666666666667, "grad_norm": 2.6012349128723145, "learning_rate": 4.088967971530249e-05, "loss": 1.7285, "step": 8952 }, { "epoch": 3.979111111111111, "grad_norm": 2.6007161140441895, "learning_rate": 4.0871886120996447e-05, "loss": 0.9842, "step": 8953 }, { "epoch": 3.9795555555555557, "grad_norm": 2.962765693664551, "learning_rate": 4.085409252669039e-05, "loss": 1.6334, "step": 8954 }, { "epoch": 3.98, "grad_norm": 3.1964375972747803, "learning_rate": 4.083629893238434e-05, "loss": 1.7721, "step": 8955 }, { "epoch": 3.9804444444444442, "grad_norm": 3.102666139602661, "learning_rate": 4.0818505338078296e-05, "loss": 1.5754, "step": 8956 }, { "epoch": 3.980888888888889, "grad_norm": 3.193918466567993, "learning_rate": 4.0800711743772245e-05, "loss": 2.0491, "step": 8957 }, { "epoch": 3.981333333333333, "grad_norm": 2.9397614002227783, "learning_rate": 4.0782918149466195e-05, "loss": 1.044, "step": 8958 }, { "epoch": 3.981777777777778, "grad_norm": 3.4079766273498535, "learning_rate": 4.0765124555160145e-05, "loss": 1.0677, "step": 8959 }, { "epoch": 3.982222222222222, "grad_norm": 3.466888666152954, "learning_rate": 4.0747330960854095e-05, "loss": 1.5662, "step": 8960 }, { "epoch": 3.982666666666667, "grad_norm": 2.9022140502929688, "learning_rate": 4.0729537366548044e-05, "loss": 1.1119, "step": 8961 }, { "epoch": 3.983111111111111, "grad_norm": 3.6132586002349854, "learning_rate": 4.0711743772241994e-05, "loss": 1.4747, "step": 8962 }, { "epoch": 3.9835555555555553, "grad_norm": 3.143056631088257, "learning_rate": 4.0693950177935944e-05, "loss": 1.6553, "step": 8963 }, { "epoch": 3.984, "grad_norm": 3.3530349731445312, "learning_rate": 4.0676156583629894e-05, "loss": 1.1596, "step": 8964 }, { "epoch": 3.9844444444444447, "grad_norm": 3.0167648792266846, "learning_rate": 4.065836298932384e-05, "loss": 1.0196, "step": 8965 }, { "epoch": 3.984888888888889, "grad_norm": 2.661754846572876, "learning_rate": 4.06405693950178e-05, "loss": 0.8042, "step": 8966 }, { "epoch": 3.985333333333333, "grad_norm": 4.018869400024414, "learning_rate": 4.062277580071174e-05, "loss": 1.6149, "step": 8967 }, { "epoch": 3.985777777777778, "grad_norm": 3.3034558296203613, "learning_rate": 4.060498220640569e-05, "loss": 1.0005, "step": 8968 }, { "epoch": 3.986222222222222, "grad_norm": 3.277484178543091, "learning_rate": 4.058718861209965e-05, "loss": 1.4676, "step": 8969 }, { "epoch": 3.986666666666667, "grad_norm": 2.8795299530029297, "learning_rate": 4.05693950177936e-05, "loss": 1.0455, "step": 8970 }, { "epoch": 3.987111111111111, "grad_norm": 3.6743695735931396, "learning_rate": 4.055160142348755e-05, "loss": 1.3309, "step": 8971 }, { "epoch": 3.9875555555555557, "grad_norm": 2.998657464981079, "learning_rate": 4.05338078291815e-05, "loss": 1.3431, "step": 8972 }, { "epoch": 3.988, "grad_norm": 3.6148931980133057, "learning_rate": 4.051601423487545e-05, "loss": 1.167, "step": 8973 }, { "epoch": 3.9884444444444442, "grad_norm": 3.467222213745117, "learning_rate": 4.04982206405694e-05, "loss": 1.2744, "step": 8974 }, { "epoch": 3.988888888888889, "grad_norm": 3.517871618270874, "learning_rate": 4.048042704626335e-05, "loss": 1.4644, "step": 8975 }, { "epoch": 3.989333333333333, "grad_norm": 3.028439998626709, "learning_rate": 4.04626334519573e-05, "loss": 0.8676, "step": 8976 }, { "epoch": 3.989777777777778, "grad_norm": 3.8672285079956055, "learning_rate": 4.044483985765125e-05, "loss": 1.3043, "step": 8977 }, { "epoch": 3.990222222222222, "grad_norm": 3.13926362991333, "learning_rate": 4.0427046263345196e-05, "loss": 1.0397, "step": 8978 }, { "epoch": 3.990666666666667, "grad_norm": 3.7574684619903564, "learning_rate": 4.0409252669039146e-05, "loss": 1.2111, "step": 8979 }, { "epoch": 3.991111111111111, "grad_norm": 3.468540906906128, "learning_rate": 4.0391459074733096e-05, "loss": 1.4037, "step": 8980 }, { "epoch": 3.9915555555555553, "grad_norm": 2.5756216049194336, "learning_rate": 4.0373665480427046e-05, "loss": 0.6104, "step": 8981 }, { "epoch": 3.992, "grad_norm": 4.789612293243408, "learning_rate": 4.0355871886121e-05, "loss": 1.3617, "step": 8982 }, { "epoch": 3.9924444444444447, "grad_norm": 4.018329620361328, "learning_rate": 4.0338078291814945e-05, "loss": 1.3874, "step": 8983 }, { "epoch": 3.992888888888889, "grad_norm": 3.488252639770508, "learning_rate": 4.0320284697508895e-05, "loss": 1.2444, "step": 8984 }, { "epoch": 3.993333333333333, "grad_norm": 3.8759312629699707, "learning_rate": 4.030249110320285e-05, "loss": 1.1931, "step": 8985 }, { "epoch": 3.993777777777778, "grad_norm": 2.9700896739959717, "learning_rate": 4.02846975088968e-05, "loss": 0.9424, "step": 8986 }, { "epoch": 3.994222222222222, "grad_norm": 3.435788154602051, "learning_rate": 4.026690391459075e-05, "loss": 1.1437, "step": 8987 }, { "epoch": 3.994666666666667, "grad_norm": 3.1736974716186523, "learning_rate": 4.02491103202847e-05, "loss": 1.014, "step": 8988 }, { "epoch": 3.995111111111111, "grad_norm": 3.615291118621826, "learning_rate": 4.023131672597865e-05, "loss": 1.4715, "step": 8989 }, { "epoch": 3.9955555555555557, "grad_norm": 3.378284215927124, "learning_rate": 4.02135231316726e-05, "loss": 1.2936, "step": 8990 }, { "epoch": 3.996, "grad_norm": 3.3950233459472656, "learning_rate": 4.019572953736655e-05, "loss": 1.1708, "step": 8991 }, { "epoch": 3.9964444444444442, "grad_norm": 3.4889776706695557, "learning_rate": 4.01779359430605e-05, "loss": 1.192, "step": 8992 }, { "epoch": 3.996888888888889, "grad_norm": 3.3020076751708984, "learning_rate": 4.016014234875445e-05, "loss": 0.991, "step": 8993 }, { "epoch": 3.997333333333333, "grad_norm": 4.46552038192749, "learning_rate": 4.01423487544484e-05, "loss": 1.4608, "step": 8994 }, { "epoch": 3.997777777777778, "grad_norm": 4.276278495788574, "learning_rate": 4.0124555160142355e-05, "loss": 1.2873, "step": 8995 }, { "epoch": 3.998222222222222, "grad_norm": 3.8281071186065674, "learning_rate": 4.01067615658363e-05, "loss": 1.461, "step": 8996 }, { "epoch": 3.998666666666667, "grad_norm": 5.66308069229126, "learning_rate": 4.008896797153025e-05, "loss": 1.2658, "step": 8997 }, { "epoch": 3.999111111111111, "grad_norm": 4.838554382324219, "learning_rate": 4.0071174377224204e-05, "loss": 1.175, "step": 8998 }, { "epoch": 3.9995555555555553, "grad_norm": 4.6421990394592285, "learning_rate": 4.0053380782918154e-05, "loss": 1.2439, "step": 8999 }, { "epoch": 4.0, "grad_norm": 3.736137628555298, "learning_rate": 4.00355871886121e-05, "loss": 0.6085, "step": 9000 }, { "epoch": 4.0, "eval_loss": 2.81941556930542, "eval_runtime": 47.453, "eval_samples_per_second": 10.537, "eval_steps_per_second": 10.537, "step": 9000 }, { "epoch": 4.000444444444445, "grad_norm": 2.5390360355377197, "learning_rate": 4.0017793594306054e-05, "loss": 1.3083, "step": 9001 }, { "epoch": 4.0008888888888885, "grad_norm": 2.7254271507263184, "learning_rate": 4e-05, "loss": 0.5198, "step": 9002 }, { "epoch": 4.001333333333333, "grad_norm": 2.502013683319092, "learning_rate": 3.998220640569395e-05, "loss": 1.3684, "step": 9003 }, { "epoch": 4.001777777777778, "grad_norm": 2.1942808628082275, "learning_rate": 3.99644128113879e-05, "loss": 0.9158, "step": 9004 }, { "epoch": 4.002222222222223, "grad_norm": 2.4862608909606934, "learning_rate": 3.994661921708185e-05, "loss": 1.1732, "step": 9005 }, { "epoch": 4.002666666666666, "grad_norm": 2.521996259689331, "learning_rate": 3.99288256227758e-05, "loss": 0.9865, "step": 9006 }, { "epoch": 4.003111111111111, "grad_norm": 2.7241384983062744, "learning_rate": 3.991103202846975e-05, "loss": 0.9629, "step": 9007 }, { "epoch": 4.003555555555556, "grad_norm": 2.860523223876953, "learning_rate": 3.98932384341637e-05, "loss": 0.9462, "step": 9008 }, { "epoch": 4.004, "grad_norm": 2.424485921859741, "learning_rate": 3.987544483985765e-05, "loss": 0.762, "step": 9009 }, { "epoch": 4.004444444444444, "grad_norm": 3.2008659839630127, "learning_rate": 3.98576512455516e-05, "loss": 1.2424, "step": 9010 }, { "epoch": 4.004888888888889, "grad_norm": 3.4407267570495605, "learning_rate": 3.983985765124556e-05, "loss": 0.9112, "step": 9011 }, { "epoch": 4.005333333333334, "grad_norm": 2.80182147026062, "learning_rate": 3.98220640569395e-05, "loss": 1.006, "step": 9012 }, { "epoch": 4.005777777777777, "grad_norm": 3.254213571548462, "learning_rate": 3.980427046263345e-05, "loss": 1.0918, "step": 9013 }, { "epoch": 4.006222222222222, "grad_norm": 3.264662027359009, "learning_rate": 3.978647686832741e-05, "loss": 1.248, "step": 9014 }, { "epoch": 4.006666666666667, "grad_norm": 2.7406811714172363, "learning_rate": 3.9768683274021356e-05, "loss": 0.7658, "step": 9015 }, { "epoch": 4.0071111111111115, "grad_norm": 3.4597320556640625, "learning_rate": 3.9750889679715306e-05, "loss": 0.9957, "step": 9016 }, { "epoch": 4.007555555555555, "grad_norm": 3.379284620285034, "learning_rate": 3.9733096085409256e-05, "loss": 1.1522, "step": 9017 }, { "epoch": 4.008, "grad_norm": 2.9539339542388916, "learning_rate": 3.9715302491103206e-05, "loss": 0.8746, "step": 9018 }, { "epoch": 4.008444444444445, "grad_norm": 3.017645835876465, "learning_rate": 3.9697508896797155e-05, "loss": 0.7478, "step": 9019 }, { "epoch": 4.0088888888888885, "grad_norm": 3.781395196914673, "learning_rate": 3.9679715302491105e-05, "loss": 1.0097, "step": 9020 }, { "epoch": 4.009333333333333, "grad_norm": 3.5405569076538086, "learning_rate": 3.9661921708185055e-05, "loss": 0.7791, "step": 9021 }, { "epoch": 4.009777777777778, "grad_norm": 4.20705509185791, "learning_rate": 3.9644128113879004e-05, "loss": 0.98, "step": 9022 }, { "epoch": 4.010222222222223, "grad_norm": 3.9584195613861084, "learning_rate": 3.9626334519572954e-05, "loss": 0.6952, "step": 9023 }, { "epoch": 4.010666666666666, "grad_norm": 4.0206522941589355, "learning_rate": 3.9608540925266904e-05, "loss": 1.0842, "step": 9024 }, { "epoch": 4.011111111111111, "grad_norm": 4.080977439880371, "learning_rate": 3.9590747330960854e-05, "loss": 0.7917, "step": 9025 }, { "epoch": 4.011555555555556, "grad_norm": 4.305196285247803, "learning_rate": 3.95729537366548e-05, "loss": 1.0147, "step": 9026 }, { "epoch": 4.012, "grad_norm": 3.7779810428619385, "learning_rate": 3.955516014234876e-05, "loss": 0.764, "step": 9027 }, { "epoch": 4.012444444444444, "grad_norm": 3.0636918544769287, "learning_rate": 3.953736654804271e-05, "loss": 0.5695, "step": 9028 }, { "epoch": 4.012888888888889, "grad_norm": 3.5221712589263916, "learning_rate": 3.951957295373665e-05, "loss": 0.7406, "step": 9029 }, { "epoch": 4.013333333333334, "grad_norm": 4.276998996734619, "learning_rate": 3.950177935943061e-05, "loss": 0.8573, "step": 9030 }, { "epoch": 4.0137777777777774, "grad_norm": 5.169942855834961, "learning_rate": 3.948398576512456e-05, "loss": 1.1206, "step": 9031 }, { "epoch": 4.014222222222222, "grad_norm": 3.651606798171997, "learning_rate": 3.946619217081851e-05, "loss": 0.6189, "step": 9032 }, { "epoch": 4.014666666666667, "grad_norm": 4.276918411254883, "learning_rate": 3.944839857651246e-05, "loss": 1.2343, "step": 9033 }, { "epoch": 4.0151111111111115, "grad_norm": 3.116567850112915, "learning_rate": 3.943060498220641e-05, "loss": 0.5037, "step": 9034 }, { "epoch": 4.015555555555555, "grad_norm": 4.222073078155518, "learning_rate": 3.941281138790036e-05, "loss": 0.9321, "step": 9035 }, { "epoch": 4.016, "grad_norm": 3.9893438816070557, "learning_rate": 3.939501779359431e-05, "loss": 1.0099, "step": 9036 }, { "epoch": 4.016444444444445, "grad_norm": 4.333362102508545, "learning_rate": 3.937722419928826e-05, "loss": 1.0413, "step": 9037 }, { "epoch": 4.0168888888888885, "grad_norm": 3.694157361984253, "learning_rate": 3.935943060498221e-05, "loss": 0.6196, "step": 9038 }, { "epoch": 4.017333333333333, "grad_norm": 4.171072006225586, "learning_rate": 3.9341637010676157e-05, "loss": 0.6015, "step": 9039 }, { "epoch": 4.017777777777778, "grad_norm": 4.184826374053955, "learning_rate": 3.932384341637011e-05, "loss": 0.9599, "step": 9040 }, { "epoch": 4.018222222222223, "grad_norm": 3.0356812477111816, "learning_rate": 3.9306049822064056e-05, "loss": 0.5598, "step": 9041 }, { "epoch": 4.018666666666666, "grad_norm": 5.620950222015381, "learning_rate": 3.9288256227758006e-05, "loss": 0.7523, "step": 9042 }, { "epoch": 4.019111111111111, "grad_norm": 3.5355186462402344, "learning_rate": 3.927046263345196e-05, "loss": 0.5461, "step": 9043 }, { "epoch": 4.019555555555556, "grad_norm": 4.568739414215088, "learning_rate": 3.925266903914591e-05, "loss": 0.8424, "step": 9044 }, { "epoch": 4.02, "grad_norm": 3.9112837314605713, "learning_rate": 3.9234875444839855e-05, "loss": 0.6756, "step": 9045 }, { "epoch": 4.020444444444444, "grad_norm": 4.544727325439453, "learning_rate": 3.921708185053381e-05, "loss": 0.7065, "step": 9046 }, { "epoch": 4.020888888888889, "grad_norm": 4.089946746826172, "learning_rate": 3.919928825622776e-05, "loss": 1.122, "step": 9047 }, { "epoch": 4.021333333333334, "grad_norm": 5.61737060546875, "learning_rate": 3.918149466192171e-05, "loss": 0.4246, "step": 9048 }, { "epoch": 4.0217777777777775, "grad_norm": 4.636961460113525, "learning_rate": 3.916370106761566e-05, "loss": 0.4196, "step": 9049 }, { "epoch": 4.022222222222222, "grad_norm": 7.075558662414551, "learning_rate": 3.914590747330961e-05, "loss": 0.5413, "step": 9050 }, { "epoch": 4.022666666666667, "grad_norm": 2.6508331298828125, "learning_rate": 3.912811387900356e-05, "loss": 1.1433, "step": 9051 }, { "epoch": 4.0231111111111115, "grad_norm": 2.6102499961853027, "learning_rate": 3.911032028469751e-05, "loss": 1.4944, "step": 9052 }, { "epoch": 4.023555555555555, "grad_norm": 2.353135108947754, "learning_rate": 3.909252669039146e-05, "loss": 0.6206, "step": 9053 }, { "epoch": 4.024, "grad_norm": 2.1263370513916016, "learning_rate": 3.907473309608541e-05, "loss": 0.9243, "step": 9054 }, { "epoch": 4.024444444444445, "grad_norm": 3.224517583847046, "learning_rate": 3.905693950177936e-05, "loss": 1.5233, "step": 9055 }, { "epoch": 4.0248888888888885, "grad_norm": 3.488454580307007, "learning_rate": 3.9039145907473315e-05, "loss": 1.5544, "step": 9056 }, { "epoch": 4.025333333333333, "grad_norm": 3.6243624687194824, "learning_rate": 3.9021352313167265e-05, "loss": 1.1879, "step": 9057 }, { "epoch": 4.025777777777778, "grad_norm": 3.8051021099090576, "learning_rate": 3.900355871886121e-05, "loss": 1.3218, "step": 9058 }, { "epoch": 4.026222222222223, "grad_norm": 3.4981696605682373, "learning_rate": 3.8985765124555164e-05, "loss": 1.0695, "step": 9059 }, { "epoch": 4.026666666666666, "grad_norm": 3.527284860610962, "learning_rate": 3.8967971530249114e-05, "loss": 1.0332, "step": 9060 }, { "epoch": 4.027111111111111, "grad_norm": 3.228292942047119, "learning_rate": 3.8950177935943064e-05, "loss": 1.0043, "step": 9061 }, { "epoch": 4.027555555555556, "grad_norm": 3.101989984512329, "learning_rate": 3.8932384341637014e-05, "loss": 0.7491, "step": 9062 }, { "epoch": 4.028, "grad_norm": 3.7942066192626953, "learning_rate": 3.891459074733096e-05, "loss": 1.1937, "step": 9063 }, { "epoch": 4.028444444444444, "grad_norm": 4.187127590179443, "learning_rate": 3.889679715302491e-05, "loss": 1.0301, "step": 9064 }, { "epoch": 4.028888888888889, "grad_norm": 3.1858954429626465, "learning_rate": 3.887900355871886e-05, "loss": 0.8335, "step": 9065 }, { "epoch": 4.029333333333334, "grad_norm": 3.0691826343536377, "learning_rate": 3.886120996441281e-05, "loss": 0.8928, "step": 9066 }, { "epoch": 4.0297777777777775, "grad_norm": 2.3211822509765625, "learning_rate": 3.884341637010676e-05, "loss": 0.3684, "step": 9067 }, { "epoch": 4.030222222222222, "grad_norm": 3.79524302482605, "learning_rate": 3.882562277580071e-05, "loss": 1.2104, "step": 9068 }, { "epoch": 4.030666666666667, "grad_norm": 4.332894325256348, "learning_rate": 3.880782918149466e-05, "loss": 1.0843, "step": 9069 }, { "epoch": 4.0311111111111115, "grad_norm": 4.316042423248291, "learning_rate": 3.879003558718861e-05, "loss": 0.5951, "step": 9070 }, { "epoch": 4.031555555555555, "grad_norm": 3.6957225799560547, "learning_rate": 3.877224199288256e-05, "loss": 1.0805, "step": 9071 }, { "epoch": 4.032, "grad_norm": 4.04442834854126, "learning_rate": 3.875444839857652e-05, "loss": 1.1406, "step": 9072 }, { "epoch": 4.032444444444445, "grad_norm": 3.6291229724884033, "learning_rate": 3.873665480427047e-05, "loss": 1.2136, "step": 9073 }, { "epoch": 4.0328888888888885, "grad_norm": 3.7658095359802246, "learning_rate": 3.871886120996441e-05, "loss": 0.9391, "step": 9074 }, { "epoch": 4.033333333333333, "grad_norm": 4.630577564239502, "learning_rate": 3.870106761565837e-05, "loss": 0.9134, "step": 9075 }, { "epoch": 4.033777777777778, "grad_norm": 4.5560407638549805, "learning_rate": 3.8683274021352317e-05, "loss": 1.164, "step": 9076 }, { "epoch": 4.034222222222223, "grad_norm": 3.639586925506592, "learning_rate": 3.8665480427046266e-05, "loss": 0.7082, "step": 9077 }, { "epoch": 4.034666666666666, "grad_norm": 3.4507131576538086, "learning_rate": 3.8647686832740216e-05, "loss": 1.0112, "step": 9078 }, { "epoch": 4.035111111111111, "grad_norm": 3.1812744140625, "learning_rate": 3.8629893238434166e-05, "loss": 0.5256, "step": 9079 }, { "epoch": 4.035555555555556, "grad_norm": 3.704305410385132, "learning_rate": 3.8612099644128115e-05, "loss": 0.7616, "step": 9080 }, { "epoch": 4.036, "grad_norm": 2.5341804027557373, "learning_rate": 3.8594306049822065e-05, "loss": 0.4988, "step": 9081 }, { "epoch": 4.036444444444444, "grad_norm": 3.8586108684539795, "learning_rate": 3.8576512455516015e-05, "loss": 0.7027, "step": 9082 }, { "epoch": 4.036888888888889, "grad_norm": 3.8743813037872314, "learning_rate": 3.8558718861209965e-05, "loss": 1.023, "step": 9083 }, { "epoch": 4.037333333333334, "grad_norm": 3.394871711730957, "learning_rate": 3.8540925266903914e-05, "loss": 1.0268, "step": 9084 }, { "epoch": 4.0377777777777775, "grad_norm": 4.589004993438721, "learning_rate": 3.852313167259787e-05, "loss": 1.0115, "step": 9085 }, { "epoch": 4.038222222222222, "grad_norm": 3.1262948513031006, "learning_rate": 3.850533807829182e-05, "loss": 0.576, "step": 9086 }, { "epoch": 4.038666666666667, "grad_norm": 4.138561725616455, "learning_rate": 3.8487544483985763e-05, "loss": 0.9221, "step": 9087 }, { "epoch": 4.0391111111111115, "grad_norm": 3.105081796646118, "learning_rate": 3.846975088967972e-05, "loss": 0.4531, "step": 9088 }, { "epoch": 4.039555555555555, "grad_norm": 4.928419589996338, "learning_rate": 3.845195729537367e-05, "loss": 0.906, "step": 9089 }, { "epoch": 4.04, "grad_norm": 4.220268249511719, "learning_rate": 3.843416370106761e-05, "loss": 0.8529, "step": 9090 }, { "epoch": 4.040444444444445, "grad_norm": 5.022706031799316, "learning_rate": 3.841637010676157e-05, "loss": 0.9518, "step": 9091 }, { "epoch": 4.0408888888888885, "grad_norm": 3.5744247436523438, "learning_rate": 3.839857651245552e-05, "loss": 0.8885, "step": 9092 }, { "epoch": 4.041333333333333, "grad_norm": 3.773207902908325, "learning_rate": 3.838078291814947e-05, "loss": 0.7402, "step": 9093 }, { "epoch": 4.041777777777778, "grad_norm": 3.8730592727661133, "learning_rate": 3.836298932384342e-05, "loss": 0.8627, "step": 9094 }, { "epoch": 4.042222222222223, "grad_norm": 4.191495895385742, "learning_rate": 3.834519572953737e-05, "loss": 0.6384, "step": 9095 }, { "epoch": 4.042666666666666, "grad_norm": 4.794586658477783, "learning_rate": 3.832740213523132e-05, "loss": 0.9203, "step": 9096 }, { "epoch": 4.043111111111111, "grad_norm": 4.144529819488525, "learning_rate": 3.830960854092527e-05, "loss": 0.7881, "step": 9097 }, { "epoch": 4.043555555555556, "grad_norm": 8.298238754272461, "learning_rate": 3.829181494661922e-05, "loss": 1.375, "step": 9098 }, { "epoch": 4.044, "grad_norm": 3.5953052043914795, "learning_rate": 3.827402135231317e-05, "loss": 0.3941, "step": 9099 }, { "epoch": 4.044444444444444, "grad_norm": 10.296087265014648, "learning_rate": 3.825622775800712e-05, "loss": 0.2957, "step": 9100 }, { "epoch": 4.044888888888889, "grad_norm": 3.059195041656494, "learning_rate": 3.823843416370107e-05, "loss": 1.8857, "step": 9101 }, { "epoch": 4.045333333333334, "grad_norm": 2.8062288761138916, "learning_rate": 3.822064056939502e-05, "loss": 1.7158, "step": 9102 }, { "epoch": 4.0457777777777775, "grad_norm": 1.8304457664489746, "learning_rate": 3.8202846975088966e-05, "loss": 0.4758, "step": 9103 }, { "epoch": 4.046222222222222, "grad_norm": 2.9134559631347656, "learning_rate": 3.818505338078292e-05, "loss": 1.2599, "step": 9104 }, { "epoch": 4.046666666666667, "grad_norm": 2.96635365486145, "learning_rate": 3.816725978647687e-05, "loss": 1.1921, "step": 9105 }, { "epoch": 4.0471111111111115, "grad_norm": 2.9679348468780518, "learning_rate": 3.814946619217082e-05, "loss": 1.0036, "step": 9106 }, { "epoch": 4.047555555555555, "grad_norm": 3.5697379112243652, "learning_rate": 3.813167259786477e-05, "loss": 1.3876, "step": 9107 }, { "epoch": 4.048, "grad_norm": 2.98809814453125, "learning_rate": 3.811387900355872e-05, "loss": 0.7762, "step": 9108 }, { "epoch": 4.048444444444445, "grad_norm": 3.473585367202759, "learning_rate": 3.809608540925267e-05, "loss": 1.2346, "step": 9109 }, { "epoch": 4.0488888888888885, "grad_norm": 3.487264394760132, "learning_rate": 3.807829181494662e-05, "loss": 1.0737, "step": 9110 }, { "epoch": 4.049333333333333, "grad_norm": 3.552971839904785, "learning_rate": 3.806049822064057e-05, "loss": 1.1126, "step": 9111 }, { "epoch": 4.049777777777778, "grad_norm": 4.056872844696045, "learning_rate": 3.804270462633452e-05, "loss": 1.2205, "step": 9112 }, { "epoch": 4.050222222222223, "grad_norm": 3.7317447662353516, "learning_rate": 3.802491103202847e-05, "loss": 1.0883, "step": 9113 }, { "epoch": 4.050666666666666, "grad_norm": 3.8739066123962402, "learning_rate": 3.800711743772242e-05, "loss": 0.8847, "step": 9114 }, { "epoch": 4.051111111111111, "grad_norm": 3.574202060699463, "learning_rate": 3.7989323843416376e-05, "loss": 0.7981, "step": 9115 }, { "epoch": 4.051555555555556, "grad_norm": 3.6287660598754883, "learning_rate": 3.797153024911032e-05, "loss": 1.0176, "step": 9116 }, { "epoch": 4.052, "grad_norm": 3.372129201889038, "learning_rate": 3.7953736654804275e-05, "loss": 0.8463, "step": 9117 }, { "epoch": 4.052444444444444, "grad_norm": 0.2473270297050476, "learning_rate": 3.7935943060498225e-05, "loss": 0.0263, "step": 9118 }, { "epoch": 4.052888888888889, "grad_norm": 2.3387582302093506, "learning_rate": 3.791814946619217e-05, "loss": 0.4731, "step": 9119 }, { "epoch": 4.053333333333334, "grad_norm": 3.4713640213012695, "learning_rate": 3.7900355871886125e-05, "loss": 1.0173, "step": 9120 }, { "epoch": 4.0537777777777775, "grad_norm": 4.791953086853027, "learning_rate": 3.7882562277580074e-05, "loss": 1.3301, "step": 9121 }, { "epoch": 4.054222222222222, "grad_norm": 3.886340618133545, "learning_rate": 3.7864768683274024e-05, "loss": 0.759, "step": 9122 }, { "epoch": 4.054666666666667, "grad_norm": 3.3495802879333496, "learning_rate": 3.7846975088967974e-05, "loss": 0.8179, "step": 9123 }, { "epoch": 4.0551111111111116, "grad_norm": 3.790015935897827, "learning_rate": 3.7829181494661923e-05, "loss": 1.1811, "step": 9124 }, { "epoch": 4.055555555555555, "grad_norm": 4.013554096221924, "learning_rate": 3.781138790035587e-05, "loss": 0.7972, "step": 9125 }, { "epoch": 4.056, "grad_norm": 4.106941223144531, "learning_rate": 3.779359430604982e-05, "loss": 1.1104, "step": 9126 }, { "epoch": 4.056444444444445, "grad_norm": 4.0450334548950195, "learning_rate": 3.777580071174377e-05, "loss": 1.0491, "step": 9127 }, { "epoch": 4.0568888888888885, "grad_norm": 4.481957912445068, "learning_rate": 3.775800711743772e-05, "loss": 1.1531, "step": 9128 }, { "epoch": 4.057333333333333, "grad_norm": 3.5693199634552, "learning_rate": 3.774021352313167e-05, "loss": 0.8621, "step": 9129 }, { "epoch": 4.057777777777778, "grad_norm": 2.899524688720703, "learning_rate": 3.772241992882563e-05, "loss": 0.5678, "step": 9130 }, { "epoch": 4.058222222222223, "grad_norm": 4.416593551635742, "learning_rate": 3.770462633451958e-05, "loss": 1.0094, "step": 9131 }, { "epoch": 4.058666666666666, "grad_norm": 4.252343654632568, "learning_rate": 3.768683274021352e-05, "loss": 0.7924, "step": 9132 }, { "epoch": 4.059111111111111, "grad_norm": 4.362743377685547, "learning_rate": 3.766903914590748e-05, "loss": 0.5001, "step": 9133 }, { "epoch": 4.059555555555556, "grad_norm": 4.057370662689209, "learning_rate": 3.765124555160143e-05, "loss": 0.8535, "step": 9134 }, { "epoch": 4.06, "grad_norm": 4.266189098358154, "learning_rate": 3.763345195729537e-05, "loss": 0.7122, "step": 9135 }, { "epoch": 4.060444444444444, "grad_norm": 3.547168016433716, "learning_rate": 3.761565836298933e-05, "loss": 0.6885, "step": 9136 }, { "epoch": 4.060888888888889, "grad_norm": 3.209439992904663, "learning_rate": 3.759786476868328e-05, "loss": 0.7069, "step": 9137 }, { "epoch": 4.061333333333334, "grad_norm": 3.9220142364501953, "learning_rate": 3.7580071174377226e-05, "loss": 0.8905, "step": 9138 }, { "epoch": 4.0617777777777775, "grad_norm": 3.8512017726898193, "learning_rate": 3.7562277580071176e-05, "loss": 0.9789, "step": 9139 }, { "epoch": 4.062222222222222, "grad_norm": 4.788553714752197, "learning_rate": 3.7544483985765126e-05, "loss": 1.0029, "step": 9140 }, { "epoch": 4.062666666666667, "grad_norm": 5.606046676635742, "learning_rate": 3.7526690391459076e-05, "loss": 0.781, "step": 9141 }, { "epoch": 4.063111111111111, "grad_norm": 4.540804386138916, "learning_rate": 3.7508896797153025e-05, "loss": 1.0023, "step": 9142 }, { "epoch": 4.063555555555555, "grad_norm": 4.766351699829102, "learning_rate": 3.7491103202846975e-05, "loss": 1.0042, "step": 9143 }, { "epoch": 4.064, "grad_norm": 4.029290199279785, "learning_rate": 3.747330960854093e-05, "loss": 0.7966, "step": 9144 }, { "epoch": 4.064444444444445, "grad_norm": 6.49293327331543, "learning_rate": 3.7455516014234874e-05, "loss": 0.6414, "step": 9145 }, { "epoch": 4.0648888888888886, "grad_norm": 4.462486267089844, "learning_rate": 3.743772241992883e-05, "loss": 0.9538, "step": 9146 }, { "epoch": 4.065333333333333, "grad_norm": 3.4527547359466553, "learning_rate": 3.741992882562278e-05, "loss": 0.4922, "step": 9147 }, { "epoch": 4.065777777777778, "grad_norm": 3.8521132469177246, "learning_rate": 3.7402135231316724e-05, "loss": 0.7803, "step": 9148 }, { "epoch": 4.066222222222223, "grad_norm": 3.7171268463134766, "learning_rate": 3.738434163701068e-05, "loss": 0.6302, "step": 9149 }, { "epoch": 4.066666666666666, "grad_norm": 2.2403533458709717, "learning_rate": 3.736654804270463e-05, "loss": 0.2649, "step": 9150 }, { "epoch": 4.067111111111111, "grad_norm": 2.9199023246765137, "learning_rate": 3.734875444839858e-05, "loss": 1.3195, "step": 9151 }, { "epoch": 4.067555555555556, "grad_norm": 3.4877567291259766, "learning_rate": 3.733096085409253e-05, "loss": 1.3656, "step": 9152 }, { "epoch": 4.068, "grad_norm": 2.9192590713500977, "learning_rate": 3.731316725978648e-05, "loss": 1.1056, "step": 9153 }, { "epoch": 4.068444444444444, "grad_norm": 4.004071235656738, "learning_rate": 3.729537366548043e-05, "loss": 1.1344, "step": 9154 }, { "epoch": 4.068888888888889, "grad_norm": 3.422654628753662, "learning_rate": 3.727758007117438e-05, "loss": 0.9968, "step": 9155 }, { "epoch": 4.069333333333334, "grad_norm": 3.3041553497314453, "learning_rate": 3.725978647686833e-05, "loss": 1.4082, "step": 9156 }, { "epoch": 4.0697777777777775, "grad_norm": 3.1231582164764404, "learning_rate": 3.7241992882562285e-05, "loss": 1.2489, "step": 9157 }, { "epoch": 4.070222222222222, "grad_norm": 2.934847593307495, "learning_rate": 3.722419928825623e-05, "loss": 1.0434, "step": 9158 }, { "epoch": 4.070666666666667, "grad_norm": 4.84507942199707, "learning_rate": 3.720640569395018e-05, "loss": 1.56, "step": 9159 }, { "epoch": 4.071111111111111, "grad_norm": 4.400614261627197, "learning_rate": 3.7188612099644134e-05, "loss": 1.1331, "step": 9160 }, { "epoch": 4.071555555555555, "grad_norm": 4.824821949005127, "learning_rate": 3.717081850533808e-05, "loss": 0.8231, "step": 9161 }, { "epoch": 4.072, "grad_norm": 3.7322332859039307, "learning_rate": 3.715302491103203e-05, "loss": 1.0261, "step": 9162 }, { "epoch": 4.072444444444445, "grad_norm": 3.8609957695007324, "learning_rate": 3.713523131672598e-05, "loss": 0.9616, "step": 9163 }, { "epoch": 4.072888888888889, "grad_norm": 5.574636936187744, "learning_rate": 3.7117437722419926e-05, "loss": 1.2415, "step": 9164 }, { "epoch": 4.073333333333333, "grad_norm": 4.391664028167725, "learning_rate": 3.709964412811388e-05, "loss": 1.3483, "step": 9165 }, { "epoch": 4.073777777777778, "grad_norm": 3.788674831390381, "learning_rate": 3.708185053380783e-05, "loss": 1.0001, "step": 9166 }, { "epoch": 4.074222222222223, "grad_norm": 4.3154072761535645, "learning_rate": 3.706405693950178e-05, "loss": 1.3682, "step": 9167 }, { "epoch": 4.074666666666666, "grad_norm": 3.440797805786133, "learning_rate": 3.704626334519573e-05, "loss": 0.8641, "step": 9168 }, { "epoch": 4.075111111111111, "grad_norm": 4.154467582702637, "learning_rate": 3.702846975088968e-05, "loss": 0.9788, "step": 9169 }, { "epoch": 4.075555555555556, "grad_norm": 3.506791591644287, "learning_rate": 3.701067615658363e-05, "loss": 0.9756, "step": 9170 }, { "epoch": 4.076, "grad_norm": 4.369688987731934, "learning_rate": 3.699288256227758e-05, "loss": 1.4468, "step": 9171 }, { "epoch": 4.076444444444444, "grad_norm": 4.314336776733398, "learning_rate": 3.697508896797153e-05, "loss": 0.9896, "step": 9172 }, { "epoch": 4.076888888888889, "grad_norm": 3.7983624935150146, "learning_rate": 3.695729537366549e-05, "loss": 1.0006, "step": 9173 }, { "epoch": 4.077333333333334, "grad_norm": 3.259927749633789, "learning_rate": 3.693950177935943e-05, "loss": 0.7188, "step": 9174 }, { "epoch": 4.0777777777777775, "grad_norm": 3.296593189239502, "learning_rate": 3.6921708185053386e-05, "loss": 0.5057, "step": 9175 }, { "epoch": 4.078222222222222, "grad_norm": 3.704531669616699, "learning_rate": 3.6903914590747336e-05, "loss": 0.7359, "step": 9176 }, { "epoch": 4.078666666666667, "grad_norm": 3.4545626640319824, "learning_rate": 3.688612099644128e-05, "loss": 0.784, "step": 9177 }, { "epoch": 4.079111111111111, "grad_norm": 3.940635919570923, "learning_rate": 3.6868327402135236e-05, "loss": 0.9062, "step": 9178 }, { "epoch": 4.079555555555555, "grad_norm": 3.7405149936676025, "learning_rate": 3.6850533807829185e-05, "loss": 1.1758, "step": 9179 }, { "epoch": 4.08, "grad_norm": 3.7189319133758545, "learning_rate": 3.683274021352313e-05, "loss": 1.0273, "step": 9180 }, { "epoch": 4.080444444444445, "grad_norm": 3.3661651611328125, "learning_rate": 3.6814946619217085e-05, "loss": 0.5712, "step": 9181 }, { "epoch": 4.080888888888889, "grad_norm": 4.403657913208008, "learning_rate": 3.6797153024911034e-05, "loss": 1.1332, "step": 9182 }, { "epoch": 4.081333333333333, "grad_norm": 3.4443655014038086, "learning_rate": 3.6779359430604984e-05, "loss": 1.0131, "step": 9183 }, { "epoch": 4.081777777777778, "grad_norm": 6.885471343994141, "learning_rate": 3.6761565836298934e-05, "loss": 0.9145, "step": 9184 }, { "epoch": 4.082222222222223, "grad_norm": 4.67835807800293, "learning_rate": 3.6743772241992884e-05, "loss": 0.8626, "step": 9185 }, { "epoch": 4.082666666666666, "grad_norm": 4.347279071807861, "learning_rate": 3.672597864768684e-05, "loss": 0.753, "step": 9186 }, { "epoch": 4.083111111111111, "grad_norm": 4.374654293060303, "learning_rate": 3.670818505338078e-05, "loss": 0.8974, "step": 9187 }, { "epoch": 4.083555555555556, "grad_norm": 3.8928260803222656, "learning_rate": 3.669039145907473e-05, "loss": 0.8108, "step": 9188 }, { "epoch": 4.084, "grad_norm": 5.292436122894287, "learning_rate": 3.667259786476869e-05, "loss": 0.9941, "step": 9189 }, { "epoch": 4.084444444444444, "grad_norm": 4.306451320648193, "learning_rate": 3.665480427046263e-05, "loss": 1.0679, "step": 9190 }, { "epoch": 4.084888888888889, "grad_norm": 4.150672435760498, "learning_rate": 3.663701067615659e-05, "loss": 0.7973, "step": 9191 }, { "epoch": 4.085333333333334, "grad_norm": 3.7112274169921875, "learning_rate": 3.661921708185054e-05, "loss": 0.7108, "step": 9192 }, { "epoch": 4.0857777777777775, "grad_norm": 4.035175323486328, "learning_rate": 3.660142348754448e-05, "loss": 0.8418, "step": 9193 }, { "epoch": 4.086222222222222, "grad_norm": 4.420337677001953, "learning_rate": 3.658362989323844e-05, "loss": 0.8937, "step": 9194 }, { "epoch": 4.086666666666667, "grad_norm": 4.49367618560791, "learning_rate": 3.656583629893239e-05, "loss": 1.1035, "step": 9195 }, { "epoch": 4.087111111111111, "grad_norm": 5.570310115814209, "learning_rate": 3.654804270462634e-05, "loss": 0.7943, "step": 9196 }, { "epoch": 4.087555555555555, "grad_norm": 7.375543117523193, "learning_rate": 3.653024911032029e-05, "loss": 1.0006, "step": 9197 }, { "epoch": 4.088, "grad_norm": 3.2930474281311035, "learning_rate": 3.651245551601424e-05, "loss": 0.3692, "step": 9198 }, { "epoch": 4.088444444444445, "grad_norm": 2.375638961791992, "learning_rate": 3.6494661921708186e-05, "loss": 0.151, "step": 9199 }, { "epoch": 4.088888888888889, "grad_norm": 3.8127799034118652, "learning_rate": 3.6476868327402136e-05, "loss": 0.318, "step": 9200 }, { "epoch": 4.089333333333333, "grad_norm": 0.2315405160188675, "learning_rate": 3.6459074733096086e-05, "loss": 0.0145, "step": 9201 }, { "epoch": 4.089777777777778, "grad_norm": 2.930769920349121, "learning_rate": 3.644128113879004e-05, "loss": 1.5272, "step": 9202 }, { "epoch": 4.090222222222223, "grad_norm": 2.6000430583953857, "learning_rate": 3.6423487544483985e-05, "loss": 1.2693, "step": 9203 }, { "epoch": 4.0906666666666665, "grad_norm": 3.497532367706299, "learning_rate": 3.6405693950177935e-05, "loss": 1.5588, "step": 9204 }, { "epoch": 4.091111111111111, "grad_norm": 3.512805223464966, "learning_rate": 3.638790035587189e-05, "loss": 1.3456, "step": 9205 }, { "epoch": 4.091555555555556, "grad_norm": 4.103816032409668, "learning_rate": 3.6370106761565835e-05, "loss": 0.9345, "step": 9206 }, { "epoch": 4.092, "grad_norm": 3.1070504188537598, "learning_rate": 3.635231316725979e-05, "loss": 1.1946, "step": 9207 }, { "epoch": 4.092444444444444, "grad_norm": 3.410989999771118, "learning_rate": 3.633451957295374e-05, "loss": 1.0064, "step": 9208 }, { "epoch": 4.092888888888889, "grad_norm": 2.981509208679199, "learning_rate": 3.6316725978647684e-05, "loss": 0.8163, "step": 9209 }, { "epoch": 4.093333333333334, "grad_norm": 2.2597262859344482, "learning_rate": 3.629893238434164e-05, "loss": 0.5533, "step": 9210 }, { "epoch": 4.0937777777777775, "grad_norm": 2.4587838649749756, "learning_rate": 3.628113879003559e-05, "loss": 0.6522, "step": 9211 }, { "epoch": 4.094222222222222, "grad_norm": 3.2743875980377197, "learning_rate": 3.626334519572954e-05, "loss": 0.9227, "step": 9212 }, { "epoch": 4.094666666666667, "grad_norm": 3.8894238471984863, "learning_rate": 3.624555160142349e-05, "loss": 1.2398, "step": 9213 }, { "epoch": 4.095111111111111, "grad_norm": 4.327610969543457, "learning_rate": 3.622775800711744e-05, "loss": 1.296, "step": 9214 }, { "epoch": 4.095555555555555, "grad_norm": 3.491788864135742, "learning_rate": 3.6209964412811396e-05, "loss": 0.9961, "step": 9215 }, { "epoch": 4.096, "grad_norm": 4.091619968414307, "learning_rate": 3.619217081850534e-05, "loss": 1.0178, "step": 9216 }, { "epoch": 4.096444444444445, "grad_norm": 5.12504243850708, "learning_rate": 3.617437722419929e-05, "loss": 0.9988, "step": 9217 }, { "epoch": 4.096888888888889, "grad_norm": 5.116464614868164, "learning_rate": 3.6156583629893245e-05, "loss": 1.3894, "step": 9218 }, { "epoch": 4.097333333333333, "grad_norm": 3.9038383960723877, "learning_rate": 3.613879003558719e-05, "loss": 1.2835, "step": 9219 }, { "epoch": 4.097777777777778, "grad_norm": 4.344372272491455, "learning_rate": 3.6120996441281144e-05, "loss": 1.0891, "step": 9220 }, { "epoch": 4.098222222222223, "grad_norm": 4.03400993347168, "learning_rate": 3.6103202846975094e-05, "loss": 1.1626, "step": 9221 }, { "epoch": 4.0986666666666665, "grad_norm": 4.054080486297607, "learning_rate": 3.608540925266904e-05, "loss": 1.0427, "step": 9222 }, { "epoch": 4.099111111111111, "grad_norm": 4.079352855682373, "learning_rate": 3.606761565836299e-05, "loss": 1.0579, "step": 9223 }, { "epoch": 4.099555555555556, "grad_norm": 3.899838924407959, "learning_rate": 3.604982206405694e-05, "loss": 0.7473, "step": 9224 }, { "epoch": 4.1, "grad_norm": 3.900310754776001, "learning_rate": 3.6032028469750886e-05, "loss": 1.0974, "step": 9225 }, { "epoch": 4.100444444444444, "grad_norm": 3.1842355728149414, "learning_rate": 3.601423487544484e-05, "loss": 0.7394, "step": 9226 }, { "epoch": 4.100888888888889, "grad_norm": 3.721182346343994, "learning_rate": 3.599644128113879e-05, "loss": 0.7716, "step": 9227 }, { "epoch": 4.101333333333334, "grad_norm": 5.3094353675842285, "learning_rate": 3.597864768683274e-05, "loss": 0.9151, "step": 9228 }, { "epoch": 4.1017777777777775, "grad_norm": 3.914445638656616, "learning_rate": 3.596085409252669e-05, "loss": 1.0772, "step": 9229 }, { "epoch": 4.102222222222222, "grad_norm": 4.377343654632568, "learning_rate": 3.594306049822064e-05, "loss": 0.9153, "step": 9230 }, { "epoch": 4.102666666666667, "grad_norm": 3.879788637161255, "learning_rate": 3.59252669039146e-05, "loss": 0.6954, "step": 9231 }, { "epoch": 4.103111111111111, "grad_norm": 4.046523571014404, "learning_rate": 3.590747330960854e-05, "loss": 0.8208, "step": 9232 }, { "epoch": 4.103555555555555, "grad_norm": 4.044562339782715, "learning_rate": 3.588967971530249e-05, "loss": 0.9976, "step": 9233 }, { "epoch": 4.104, "grad_norm": 3.7858481407165527, "learning_rate": 3.587188612099645e-05, "loss": 0.9062, "step": 9234 }, { "epoch": 4.104444444444445, "grad_norm": 2.641073226928711, "learning_rate": 3.585409252669039e-05, "loss": 0.4488, "step": 9235 }, { "epoch": 4.104888888888889, "grad_norm": 3.873842239379883, "learning_rate": 3.5836298932384346e-05, "loss": 0.8443, "step": 9236 }, { "epoch": 4.105333333333333, "grad_norm": 3.6468522548675537, "learning_rate": 3.5818505338078296e-05, "loss": 0.8935, "step": 9237 }, { "epoch": 4.105777777777778, "grad_norm": 4.058321952819824, "learning_rate": 3.580071174377224e-05, "loss": 0.7038, "step": 9238 }, { "epoch": 4.106222222222222, "grad_norm": 4.622478008270264, "learning_rate": 3.5782918149466196e-05, "loss": 1.1684, "step": 9239 }, { "epoch": 4.1066666666666665, "grad_norm": 3.3128762245178223, "learning_rate": 3.5765124555160145e-05, "loss": 0.6763, "step": 9240 }, { "epoch": 4.107111111111111, "grad_norm": 3.4522128105163574, "learning_rate": 3.5747330960854095e-05, "loss": 0.7716, "step": 9241 }, { "epoch": 4.107555555555556, "grad_norm": 4.44683837890625, "learning_rate": 3.5729537366548045e-05, "loss": 1.36, "step": 9242 }, { "epoch": 4.108, "grad_norm": 4.440934181213379, "learning_rate": 3.5711743772241995e-05, "loss": 0.9069, "step": 9243 }, { "epoch": 4.108444444444444, "grad_norm": 3.85774302482605, "learning_rate": 3.5693950177935944e-05, "loss": 0.7827, "step": 9244 }, { "epoch": 4.108888888888889, "grad_norm": 5.362185955047607, "learning_rate": 3.5676156583629894e-05, "loss": 1.2746, "step": 9245 }, { "epoch": 4.109333333333334, "grad_norm": 3.934819221496582, "learning_rate": 3.5658362989323844e-05, "loss": 0.7316, "step": 9246 }, { "epoch": 4.1097777777777775, "grad_norm": 4.532813549041748, "learning_rate": 3.56405693950178e-05, "loss": 0.6302, "step": 9247 }, { "epoch": 4.110222222222222, "grad_norm": 4.29311990737915, "learning_rate": 3.562277580071174e-05, "loss": 0.5404, "step": 9248 }, { "epoch": 4.110666666666667, "grad_norm": 3.867619276046753, "learning_rate": 3.560498220640569e-05, "loss": 0.8902, "step": 9249 }, { "epoch": 4.111111111111111, "grad_norm": 5.4382805824279785, "learning_rate": 3.558718861209965e-05, "loss": 0.8051, "step": 9250 }, { "epoch": 4.111555555555555, "grad_norm": 1.7659415006637573, "learning_rate": 3.556939501779359e-05, "loss": 0.8157, "step": 9251 }, { "epoch": 4.112, "grad_norm": 2.5079727172851562, "learning_rate": 3.555160142348755e-05, "loss": 1.3793, "step": 9252 }, { "epoch": 4.112444444444445, "grad_norm": 3.136521100997925, "learning_rate": 3.55338078291815e-05, "loss": 1.223, "step": 9253 }, { "epoch": 4.112888888888889, "grad_norm": 3.1797564029693604, "learning_rate": 3.551601423487544e-05, "loss": 1.2796, "step": 9254 }, { "epoch": 4.113333333333333, "grad_norm": 3.613799810409546, "learning_rate": 3.54982206405694e-05, "loss": 1.1687, "step": 9255 }, { "epoch": 4.113777777777778, "grad_norm": 3.8610999584198, "learning_rate": 3.548042704626335e-05, "loss": 0.957, "step": 9256 }, { "epoch": 4.114222222222223, "grad_norm": 3.316948890686035, "learning_rate": 3.54626334519573e-05, "loss": 1.4552, "step": 9257 }, { "epoch": 4.1146666666666665, "grad_norm": 3.1068124771118164, "learning_rate": 3.544483985765125e-05, "loss": 0.9491, "step": 9258 }, { "epoch": 4.115111111111111, "grad_norm": 3.8049232959747314, "learning_rate": 3.54270462633452e-05, "loss": 1.0955, "step": 9259 }, { "epoch": 4.115555555555556, "grad_norm": 3.263183355331421, "learning_rate": 3.540925266903915e-05, "loss": 0.8825, "step": 9260 }, { "epoch": 4.116, "grad_norm": 3.227997303009033, "learning_rate": 3.5391459074733096e-05, "loss": 1.1426, "step": 9261 }, { "epoch": 4.116444444444444, "grad_norm": 3.8824005126953125, "learning_rate": 3.5373665480427046e-05, "loss": 1.0386, "step": 9262 }, { "epoch": 4.116888888888889, "grad_norm": 4.455204486846924, "learning_rate": 3.5355871886121e-05, "loss": 1.0883, "step": 9263 }, { "epoch": 4.117333333333334, "grad_norm": 3.621575355529785, "learning_rate": 3.5338078291814945e-05, "loss": 1.4027, "step": 9264 }, { "epoch": 4.1177777777777775, "grad_norm": 3.8370847702026367, "learning_rate": 3.53202846975089e-05, "loss": 1.212, "step": 9265 }, { "epoch": 4.118222222222222, "grad_norm": 3.060487747192383, "learning_rate": 3.530249110320285e-05, "loss": 0.8656, "step": 9266 }, { "epoch": 4.118666666666667, "grad_norm": 3.36088228225708, "learning_rate": 3.5284697508896795e-05, "loss": 0.7663, "step": 9267 }, { "epoch": 4.119111111111111, "grad_norm": 3.684251308441162, "learning_rate": 3.526690391459075e-05, "loss": 0.9489, "step": 9268 }, { "epoch": 4.119555555555555, "grad_norm": 4.51201868057251, "learning_rate": 3.52491103202847e-05, "loss": 1.1845, "step": 9269 }, { "epoch": 4.12, "grad_norm": 4.410719394683838, "learning_rate": 3.5231316725978644e-05, "loss": 1.702, "step": 9270 }, { "epoch": 4.120444444444445, "grad_norm": 3.741725444793701, "learning_rate": 3.52135231316726e-05, "loss": 0.7647, "step": 9271 }, { "epoch": 4.120888888888889, "grad_norm": 4.066876411437988, "learning_rate": 3.519572953736655e-05, "loss": 1.046, "step": 9272 }, { "epoch": 4.121333333333333, "grad_norm": 3.6181998252868652, "learning_rate": 3.51779359430605e-05, "loss": 1.0439, "step": 9273 }, { "epoch": 4.121777777777778, "grad_norm": 4.158766746520996, "learning_rate": 3.516014234875445e-05, "loss": 0.8574, "step": 9274 }, { "epoch": 4.122222222222222, "grad_norm": 3.483020067214966, "learning_rate": 3.51423487544484e-05, "loss": 0.9312, "step": 9275 }, { "epoch": 4.1226666666666665, "grad_norm": 3.8150248527526855, "learning_rate": 3.5124555160142356e-05, "loss": 0.7764, "step": 9276 }, { "epoch": 4.123111111111111, "grad_norm": 4.526205062866211, "learning_rate": 3.51067615658363e-05, "loss": 0.9631, "step": 9277 }, { "epoch": 4.123555555555556, "grad_norm": 3.8249216079711914, "learning_rate": 3.508896797153025e-05, "loss": 0.7817, "step": 9278 }, { "epoch": 4.124, "grad_norm": 3.743093729019165, "learning_rate": 3.5071174377224205e-05, "loss": 0.9439, "step": 9279 }, { "epoch": 4.124444444444444, "grad_norm": 4.130136966705322, "learning_rate": 3.505338078291815e-05, "loss": 0.8861, "step": 9280 }, { "epoch": 4.124888888888889, "grad_norm": 4.320974826812744, "learning_rate": 3.5035587188612104e-05, "loss": 0.9908, "step": 9281 }, { "epoch": 4.125333333333334, "grad_norm": 4.112794876098633, "learning_rate": 3.5017793594306054e-05, "loss": 0.8566, "step": 9282 }, { "epoch": 4.1257777777777775, "grad_norm": 4.176024913787842, "learning_rate": 3.5e-05, "loss": 0.9719, "step": 9283 }, { "epoch": 4.126222222222222, "grad_norm": 3.905268669128418, "learning_rate": 3.4982206405693953e-05, "loss": 0.8443, "step": 9284 }, { "epoch": 4.126666666666667, "grad_norm": 4.007937431335449, "learning_rate": 3.49644128113879e-05, "loss": 0.6952, "step": 9285 }, { "epoch": 4.127111111111111, "grad_norm": 4.117457866668701, "learning_rate": 3.494661921708185e-05, "loss": 1.1521, "step": 9286 }, { "epoch": 4.127555555555555, "grad_norm": 4.604625225067139, "learning_rate": 3.49288256227758e-05, "loss": 0.7057, "step": 9287 }, { "epoch": 4.128, "grad_norm": 4.086756706237793, "learning_rate": 3.491103202846975e-05, "loss": 0.7674, "step": 9288 }, { "epoch": 4.128444444444445, "grad_norm": 3.6481192111968994, "learning_rate": 3.48932384341637e-05, "loss": 0.5105, "step": 9289 }, { "epoch": 4.128888888888889, "grad_norm": 3.9914069175720215, "learning_rate": 3.487544483985765e-05, "loss": 0.8233, "step": 9290 }, { "epoch": 4.129333333333333, "grad_norm": 4.540968418121338, "learning_rate": 3.48576512455516e-05, "loss": 1.2001, "step": 9291 }, { "epoch": 4.129777777777778, "grad_norm": 4.1771769523620605, "learning_rate": 3.483985765124556e-05, "loss": 0.759, "step": 9292 }, { "epoch": 4.130222222222222, "grad_norm": 4.386975288391113, "learning_rate": 3.48220640569395e-05, "loss": 0.8304, "step": 9293 }, { "epoch": 4.1306666666666665, "grad_norm": 3.9730632305145264, "learning_rate": 3.480427046263345e-05, "loss": 0.7385, "step": 9294 }, { "epoch": 4.131111111111111, "grad_norm": 3.7220194339752197, "learning_rate": 3.478647686832741e-05, "loss": 0.8544, "step": 9295 }, { "epoch": 4.131555555555556, "grad_norm": 3.411055564880371, "learning_rate": 3.476868327402135e-05, "loss": 0.7001, "step": 9296 }, { "epoch": 4.132, "grad_norm": 4.14982795715332, "learning_rate": 3.4750889679715307e-05, "loss": 0.9112, "step": 9297 }, { "epoch": 4.132444444444444, "grad_norm": 4.247723579406738, "learning_rate": 3.4733096085409256e-05, "loss": 0.8633, "step": 9298 }, { "epoch": 4.132888888888889, "grad_norm": 5.790535926818848, "learning_rate": 3.47153024911032e-05, "loss": 0.6881, "step": 9299 }, { "epoch": 4.133333333333334, "grad_norm": 3.3628830909729004, "learning_rate": 3.4697508896797156e-05, "loss": 0.5777, "step": 9300 }, { "epoch": 4.1337777777777776, "grad_norm": 2.9413001537323, "learning_rate": 3.4679715302491105e-05, "loss": 1.5624, "step": 9301 }, { "epoch": 4.134222222222222, "grad_norm": 2.9262659549713135, "learning_rate": 3.4661921708185055e-05, "loss": 1.451, "step": 9302 }, { "epoch": 4.134666666666667, "grad_norm": 2.8405566215515137, "learning_rate": 3.4644128113879005e-05, "loss": 1.6277, "step": 9303 }, { "epoch": 4.135111111111111, "grad_norm": 1.9037202596664429, "learning_rate": 3.4626334519572955e-05, "loss": 0.3974, "step": 9304 }, { "epoch": 4.135555555555555, "grad_norm": 3.3218352794647217, "learning_rate": 3.460854092526691e-05, "loss": 1.4065, "step": 9305 }, { "epoch": 4.136, "grad_norm": 3.091264009475708, "learning_rate": 3.4590747330960854e-05, "loss": 1.1051, "step": 9306 }, { "epoch": 4.136444444444445, "grad_norm": 3.341606616973877, "learning_rate": 3.4572953736654804e-05, "loss": 1.2214, "step": 9307 }, { "epoch": 4.136888888888889, "grad_norm": 3.4468531608581543, "learning_rate": 3.455516014234876e-05, "loss": 1.1427, "step": 9308 }, { "epoch": 4.137333333333333, "grad_norm": 3.611560583114624, "learning_rate": 3.45373665480427e-05, "loss": 1.0753, "step": 9309 }, { "epoch": 4.137777777777778, "grad_norm": 4.02140474319458, "learning_rate": 3.451957295373665e-05, "loss": 1.0676, "step": 9310 }, { "epoch": 4.138222222222222, "grad_norm": 3.795001268386841, "learning_rate": 3.450177935943061e-05, "loss": 1.0637, "step": 9311 }, { "epoch": 4.1386666666666665, "grad_norm": 3.871553659439087, "learning_rate": 3.448398576512455e-05, "loss": 1.1937, "step": 9312 }, { "epoch": 4.139111111111111, "grad_norm": 3.7541697025299072, "learning_rate": 3.446619217081851e-05, "loss": 0.9619, "step": 9313 }, { "epoch": 4.139555555555556, "grad_norm": 3.9635727405548096, "learning_rate": 3.444839857651246e-05, "loss": 0.9555, "step": 9314 }, { "epoch": 4.14, "grad_norm": 4.11905574798584, "learning_rate": 3.44306049822064e-05, "loss": 1.1721, "step": 9315 }, { "epoch": 4.140444444444444, "grad_norm": 4.109530448913574, "learning_rate": 3.441281138790036e-05, "loss": 1.0309, "step": 9316 }, { "epoch": 4.140888888888889, "grad_norm": 4.221639633178711, "learning_rate": 3.439501779359431e-05, "loss": 0.8867, "step": 9317 }, { "epoch": 4.141333333333334, "grad_norm": 4.1241068840026855, "learning_rate": 3.437722419928826e-05, "loss": 1.2, "step": 9318 }, { "epoch": 4.141777777777778, "grad_norm": 3.115713596343994, "learning_rate": 3.435943060498221e-05, "loss": 0.8329, "step": 9319 }, { "epoch": 4.142222222222222, "grad_norm": 4.011024475097656, "learning_rate": 3.434163701067616e-05, "loss": 0.9659, "step": 9320 }, { "epoch": 4.142666666666667, "grad_norm": 4.23065185546875, "learning_rate": 3.4323843416370113e-05, "loss": 0.834, "step": 9321 }, { "epoch": 4.143111111111111, "grad_norm": 4.004124641418457, "learning_rate": 3.4306049822064056e-05, "loss": 0.8905, "step": 9322 }, { "epoch": 4.143555555555555, "grad_norm": 4.145106315612793, "learning_rate": 3.4288256227758006e-05, "loss": 0.8814, "step": 9323 }, { "epoch": 4.144, "grad_norm": 3.7468135356903076, "learning_rate": 3.427046263345196e-05, "loss": 1.143, "step": 9324 }, { "epoch": 4.144444444444445, "grad_norm": 4.9056596755981445, "learning_rate": 3.4252669039145906e-05, "loss": 0.8847, "step": 9325 }, { "epoch": 4.144888888888889, "grad_norm": 3.6427161693573, "learning_rate": 3.423487544483986e-05, "loss": 0.7846, "step": 9326 }, { "epoch": 4.145333333333333, "grad_norm": 3.6018316745758057, "learning_rate": 3.421708185053381e-05, "loss": 1.1534, "step": 9327 }, { "epoch": 4.145777777777778, "grad_norm": 4.335727214813232, "learning_rate": 3.4199288256227755e-05, "loss": 0.7984, "step": 9328 }, { "epoch": 4.146222222222222, "grad_norm": 2.9448020458221436, "learning_rate": 3.418149466192171e-05, "loss": 0.6594, "step": 9329 }, { "epoch": 4.1466666666666665, "grad_norm": 5.207244396209717, "learning_rate": 3.416370106761566e-05, "loss": 0.9542, "step": 9330 }, { "epoch": 4.147111111111111, "grad_norm": 4.200593948364258, "learning_rate": 3.414590747330961e-05, "loss": 0.7464, "step": 9331 }, { "epoch": 4.147555555555556, "grad_norm": 3.8014614582061768, "learning_rate": 3.412811387900356e-05, "loss": 0.8468, "step": 9332 }, { "epoch": 4.148, "grad_norm": 4.140382289886475, "learning_rate": 3.411032028469751e-05, "loss": 1.0457, "step": 9333 }, { "epoch": 4.148444444444444, "grad_norm": 4.095797061920166, "learning_rate": 3.409252669039146e-05, "loss": 0.8541, "step": 9334 }, { "epoch": 4.148888888888889, "grad_norm": 4.9063825607299805, "learning_rate": 3.407473309608541e-05, "loss": 1.012, "step": 9335 }, { "epoch": 4.149333333333334, "grad_norm": 5.420862197875977, "learning_rate": 3.405693950177936e-05, "loss": 1.0987, "step": 9336 }, { "epoch": 4.149777777777778, "grad_norm": 4.148375988006592, "learning_rate": 3.4039145907473316e-05, "loss": 0.7677, "step": 9337 }, { "epoch": 4.150222222222222, "grad_norm": 4.134524345397949, "learning_rate": 3.402135231316726e-05, "loss": 0.7619, "step": 9338 }, { "epoch": 4.150666666666667, "grad_norm": 3.2314364910125732, "learning_rate": 3.400355871886121e-05, "loss": 0.6115, "step": 9339 }, { "epoch": 4.151111111111111, "grad_norm": 4.153497219085693, "learning_rate": 3.3985765124555165e-05, "loss": 1.0227, "step": 9340 }, { "epoch": 4.151555555555555, "grad_norm": 5.603724002838135, "learning_rate": 3.396797153024911e-05, "loss": 1.2158, "step": 9341 }, { "epoch": 4.152, "grad_norm": 4.0254716873168945, "learning_rate": 3.3950177935943064e-05, "loss": 0.9139, "step": 9342 }, { "epoch": 4.152444444444445, "grad_norm": 3.7253804206848145, "learning_rate": 3.3932384341637014e-05, "loss": 0.6698, "step": 9343 }, { "epoch": 4.152888888888889, "grad_norm": 3.8816728591918945, "learning_rate": 3.391459074733096e-05, "loss": 0.5557, "step": 9344 }, { "epoch": 4.153333333333333, "grad_norm": 4.364738941192627, "learning_rate": 3.3896797153024914e-05, "loss": 0.7052, "step": 9345 }, { "epoch": 4.153777777777778, "grad_norm": 4.066445827484131, "learning_rate": 3.387900355871886e-05, "loss": 0.6252, "step": 9346 }, { "epoch": 4.154222222222222, "grad_norm": 5.738955497741699, "learning_rate": 3.386120996441281e-05, "loss": 1.3448, "step": 9347 }, { "epoch": 4.1546666666666665, "grad_norm": 4.564332962036133, "learning_rate": 3.384341637010676e-05, "loss": 0.5821, "step": 9348 }, { "epoch": 4.155111111111111, "grad_norm": 4.382009983062744, "learning_rate": 3.382562277580071e-05, "loss": 0.7055, "step": 9349 }, { "epoch": 4.155555555555556, "grad_norm": 3.8401434421539307, "learning_rate": 3.380782918149467e-05, "loss": 0.4056, "step": 9350 }, { "epoch": 4.156, "grad_norm": 2.941746711730957, "learning_rate": 3.379003558718861e-05, "loss": 1.3041, "step": 9351 }, { "epoch": 4.156444444444444, "grad_norm": 3.0022342205047607, "learning_rate": 3.377224199288256e-05, "loss": 1.2223, "step": 9352 }, { "epoch": 4.156888888888889, "grad_norm": 3.1641345024108887, "learning_rate": 3.375444839857652e-05, "loss": 1.1629, "step": 9353 }, { "epoch": 4.157333333333334, "grad_norm": 3.8065378665924072, "learning_rate": 3.373665480427046e-05, "loss": 1.7449, "step": 9354 }, { "epoch": 4.157777777777778, "grad_norm": 3.80554461479187, "learning_rate": 3.371886120996441e-05, "loss": 1.2545, "step": 9355 }, { "epoch": 4.158222222222222, "grad_norm": 3.1130597591400146, "learning_rate": 3.370106761565837e-05, "loss": 0.8787, "step": 9356 }, { "epoch": 4.158666666666667, "grad_norm": 3.7158026695251465, "learning_rate": 3.368327402135231e-05, "loss": 1.171, "step": 9357 }, { "epoch": 4.159111111111111, "grad_norm": 3.599818468093872, "learning_rate": 3.366548042704627e-05, "loss": 1.0849, "step": 9358 }, { "epoch": 4.1595555555555555, "grad_norm": 3.1732017993927, "learning_rate": 3.3647686832740216e-05, "loss": 0.9308, "step": 9359 }, { "epoch": 4.16, "grad_norm": 3.3031604290008545, "learning_rate": 3.3629893238434166e-05, "loss": 0.8818, "step": 9360 }, { "epoch": 4.160444444444445, "grad_norm": 3.976465940475464, "learning_rate": 3.3612099644128116e-05, "loss": 1.2361, "step": 9361 }, { "epoch": 4.160888888888889, "grad_norm": 3.4227919578552246, "learning_rate": 3.3594306049822066e-05, "loss": 0.9153, "step": 9362 }, { "epoch": 4.161333333333333, "grad_norm": 4.071769714355469, "learning_rate": 3.3576512455516015e-05, "loss": 1.1655, "step": 9363 }, { "epoch": 4.161777777777778, "grad_norm": 3.795288324356079, "learning_rate": 3.3558718861209965e-05, "loss": 0.9261, "step": 9364 }, { "epoch": 4.162222222222222, "grad_norm": 4.52875280380249, "learning_rate": 3.3540925266903915e-05, "loss": 1.2094, "step": 9365 }, { "epoch": 4.1626666666666665, "grad_norm": 3.7887825965881348, "learning_rate": 3.352313167259787e-05, "loss": 0.9441, "step": 9366 }, { "epoch": 4.163111111111111, "grad_norm": 3.6934292316436768, "learning_rate": 3.3505338078291814e-05, "loss": 1.0685, "step": 9367 }, { "epoch": 4.163555555555556, "grad_norm": 4.112295150756836, "learning_rate": 3.3487544483985764e-05, "loss": 0.8924, "step": 9368 }, { "epoch": 4.164, "grad_norm": 4.231710910797119, "learning_rate": 3.346975088967972e-05, "loss": 1.2001, "step": 9369 }, { "epoch": 4.164444444444444, "grad_norm": 4.923578262329102, "learning_rate": 3.345195729537366e-05, "loss": 1.2478, "step": 9370 }, { "epoch": 4.164888888888889, "grad_norm": 4.2798895835876465, "learning_rate": 3.343416370106762e-05, "loss": 0.9576, "step": 9371 }, { "epoch": 4.165333333333333, "grad_norm": 3.816706895828247, "learning_rate": 3.341637010676157e-05, "loss": 0.9874, "step": 9372 }, { "epoch": 4.165777777777778, "grad_norm": 4.367424011230469, "learning_rate": 3.339857651245551e-05, "loss": 0.93, "step": 9373 }, { "epoch": 4.166222222222222, "grad_norm": 3.3792166709899902, "learning_rate": 3.338078291814947e-05, "loss": 0.7842, "step": 9374 }, { "epoch": 4.166666666666667, "grad_norm": 4.096624374389648, "learning_rate": 3.336298932384342e-05, "loss": 0.6063, "step": 9375 }, { "epoch": 4.167111111111111, "grad_norm": 3.6580982208251953, "learning_rate": 3.334519572953737e-05, "loss": 1.2232, "step": 9376 }, { "epoch": 4.1675555555555555, "grad_norm": 4.178760528564453, "learning_rate": 3.332740213523132e-05, "loss": 1.1404, "step": 9377 }, { "epoch": 4.168, "grad_norm": 2.8333945274353027, "learning_rate": 3.330960854092527e-05, "loss": 0.3454, "step": 9378 }, { "epoch": 4.168444444444445, "grad_norm": 4.03084135055542, "learning_rate": 3.329181494661922e-05, "loss": 0.9051, "step": 9379 }, { "epoch": 4.168888888888889, "grad_norm": 4.452610015869141, "learning_rate": 3.327402135231317e-05, "loss": 0.8157, "step": 9380 }, { "epoch": 4.169333333333333, "grad_norm": 3.308134078979492, "learning_rate": 3.325622775800712e-05, "loss": 0.5715, "step": 9381 }, { "epoch": 4.169777777777778, "grad_norm": 5.042409896850586, "learning_rate": 3.3238434163701074e-05, "loss": 0.9263, "step": 9382 }, { "epoch": 4.170222222222222, "grad_norm": 5.0167646408081055, "learning_rate": 3.3220640569395016e-05, "loss": 1.4425, "step": 9383 }, { "epoch": 4.1706666666666665, "grad_norm": 3.9779274463653564, "learning_rate": 3.3202846975088966e-05, "loss": 0.8751, "step": 9384 }, { "epoch": 4.171111111111111, "grad_norm": 3.9376795291900635, "learning_rate": 3.318505338078292e-05, "loss": 0.9257, "step": 9385 }, { "epoch": 4.171555555555556, "grad_norm": 6.37144136428833, "learning_rate": 3.3167259786476866e-05, "loss": 0.7895, "step": 9386 }, { "epoch": 4.172, "grad_norm": 3.9572556018829346, "learning_rate": 3.314946619217082e-05, "loss": 0.8676, "step": 9387 }, { "epoch": 4.172444444444444, "grad_norm": 4.135096073150635, "learning_rate": 3.313167259786477e-05, "loss": 0.977, "step": 9388 }, { "epoch": 4.172888888888889, "grad_norm": 4.981659412384033, "learning_rate": 3.311387900355872e-05, "loss": 0.8796, "step": 9389 }, { "epoch": 4.173333333333334, "grad_norm": 1.5012166500091553, "learning_rate": 3.309608540925267e-05, "loss": 0.1896, "step": 9390 }, { "epoch": 4.173777777777778, "grad_norm": 3.713898181915283, "learning_rate": 3.307829181494662e-05, "loss": 0.9846, "step": 9391 }, { "epoch": 4.174222222222222, "grad_norm": 4.347773551940918, "learning_rate": 3.306049822064057e-05, "loss": 0.9879, "step": 9392 }, { "epoch": 4.174666666666667, "grad_norm": 4.975940227508545, "learning_rate": 3.304270462633452e-05, "loss": 0.8384, "step": 9393 }, { "epoch": 4.175111111111111, "grad_norm": 4.148822784423828, "learning_rate": 3.302491103202847e-05, "loss": 0.621, "step": 9394 }, { "epoch": 4.1755555555555555, "grad_norm": 3.4947614669799805, "learning_rate": 3.300711743772243e-05, "loss": 0.6469, "step": 9395 }, { "epoch": 4.176, "grad_norm": 3.335649251937866, "learning_rate": 3.298932384341637e-05, "loss": 0.5926, "step": 9396 }, { "epoch": 4.176444444444445, "grad_norm": 5.286258220672607, "learning_rate": 3.297153024911032e-05, "loss": 1.1108, "step": 9397 }, { "epoch": 4.176888888888889, "grad_norm": 4.690072059631348, "learning_rate": 3.2953736654804276e-05, "loss": 0.9287, "step": 9398 }, { "epoch": 4.177333333333333, "grad_norm": 5.190115928649902, "learning_rate": 3.293594306049822e-05, "loss": 0.8765, "step": 9399 }, { "epoch": 4.177777777777778, "grad_norm": 5.4480719566345215, "learning_rate": 3.291814946619217e-05, "loss": 0.8428, "step": 9400 }, { "epoch": 4.178222222222222, "grad_norm": 1.7260178327560425, "learning_rate": 3.2900355871886125e-05, "loss": 0.6587, "step": 9401 }, { "epoch": 4.1786666666666665, "grad_norm": 3.0524849891662598, "learning_rate": 3.288256227758007e-05, "loss": 1.6461, "step": 9402 }, { "epoch": 4.179111111111111, "grad_norm": 3.80712890625, "learning_rate": 3.2864768683274024e-05, "loss": 1.5275, "step": 9403 }, { "epoch": 4.179555555555556, "grad_norm": 3.202948808670044, "learning_rate": 3.2846975088967974e-05, "loss": 1.101, "step": 9404 }, { "epoch": 4.18, "grad_norm": 3.789433002471924, "learning_rate": 3.2829181494661924e-05, "loss": 1.4379, "step": 9405 }, { "epoch": 4.180444444444444, "grad_norm": 3.376875162124634, "learning_rate": 3.2811387900355874e-05, "loss": 1.099, "step": 9406 }, { "epoch": 4.180888888888889, "grad_norm": 3.4530889987945557, "learning_rate": 3.279359430604982e-05, "loss": 1.5196, "step": 9407 }, { "epoch": 4.181333333333333, "grad_norm": 4.1841607093811035, "learning_rate": 3.277580071174377e-05, "loss": 1.3571, "step": 9408 }, { "epoch": 4.181777777777778, "grad_norm": 4.057764053344727, "learning_rate": 3.275800711743772e-05, "loss": 1.0025, "step": 9409 }, { "epoch": 4.182222222222222, "grad_norm": 3.6850550174713135, "learning_rate": 3.274021352313167e-05, "loss": 1.0223, "step": 9410 }, { "epoch": 4.182666666666667, "grad_norm": 3.958878517150879, "learning_rate": 3.272241992882563e-05, "loss": 1.1901, "step": 9411 }, { "epoch": 4.183111111111111, "grad_norm": 4.859387397766113, "learning_rate": 3.270462633451957e-05, "loss": 0.9504, "step": 9412 }, { "epoch": 4.1835555555555555, "grad_norm": 2.9225330352783203, "learning_rate": 3.268683274021352e-05, "loss": 0.5792, "step": 9413 }, { "epoch": 4.184, "grad_norm": 3.700796127319336, "learning_rate": 3.266903914590748e-05, "loss": 0.5557, "step": 9414 }, { "epoch": 4.184444444444445, "grad_norm": 2.9793245792388916, "learning_rate": 3.265124555160142e-05, "loss": 0.5418, "step": 9415 }, { "epoch": 4.184888888888889, "grad_norm": 4.47357702255249, "learning_rate": 3.263345195729538e-05, "loss": 1.2536, "step": 9416 }, { "epoch": 4.185333333333333, "grad_norm": 3.908679485321045, "learning_rate": 3.261565836298933e-05, "loss": 1.2, "step": 9417 }, { "epoch": 4.185777777777778, "grad_norm": 3.9256107807159424, "learning_rate": 3.259786476868328e-05, "loss": 0.7822, "step": 9418 }, { "epoch": 4.186222222222222, "grad_norm": 4.371975421905518, "learning_rate": 3.258007117437723e-05, "loss": 0.8867, "step": 9419 }, { "epoch": 4.1866666666666665, "grad_norm": 3.3875746726989746, "learning_rate": 3.2562277580071177e-05, "loss": 0.8726, "step": 9420 }, { "epoch": 4.187111111111111, "grad_norm": 4.142739295959473, "learning_rate": 3.2544483985765126e-05, "loss": 0.6472, "step": 9421 }, { "epoch": 4.187555555555556, "grad_norm": 3.9463632106781006, "learning_rate": 3.2526690391459076e-05, "loss": 1.1603, "step": 9422 }, { "epoch": 4.188, "grad_norm": 5.386812210083008, "learning_rate": 3.2508896797153026e-05, "loss": 0.9674, "step": 9423 }, { "epoch": 4.188444444444444, "grad_norm": 5.0619001388549805, "learning_rate": 3.2491103202846975e-05, "loss": 1.3021, "step": 9424 }, { "epoch": 4.188888888888889, "grad_norm": 4.491568565368652, "learning_rate": 3.2473309608540925e-05, "loss": 1.1475, "step": 9425 }, { "epoch": 4.189333333333333, "grad_norm": 4.122431755065918, "learning_rate": 3.2455516014234875e-05, "loss": 0.8021, "step": 9426 }, { "epoch": 4.189777777777778, "grad_norm": 4.491034507751465, "learning_rate": 3.243772241992883e-05, "loss": 1.0464, "step": 9427 }, { "epoch": 4.190222222222222, "grad_norm": 3.6117091178894043, "learning_rate": 3.2419928825622774e-05, "loss": 0.8858, "step": 9428 }, { "epoch": 4.190666666666667, "grad_norm": 5.300388813018799, "learning_rate": 3.2402135231316724e-05, "loss": 1.0057, "step": 9429 }, { "epoch": 4.191111111111111, "grad_norm": 3.5130858421325684, "learning_rate": 3.238434163701068e-05, "loss": 0.4761, "step": 9430 }, { "epoch": 4.1915555555555555, "grad_norm": 4.297408103942871, "learning_rate": 3.2366548042704623e-05, "loss": 0.8196, "step": 9431 }, { "epoch": 4.192, "grad_norm": 4.353086471557617, "learning_rate": 3.234875444839858e-05, "loss": 1.031, "step": 9432 }, { "epoch": 4.192444444444445, "grad_norm": 3.7016918659210205, "learning_rate": 3.233096085409253e-05, "loss": 0.8314, "step": 9433 }, { "epoch": 4.192888888888889, "grad_norm": 3.4741709232330322, "learning_rate": 3.231316725978648e-05, "loss": 0.894, "step": 9434 }, { "epoch": 4.193333333333333, "grad_norm": 3.2105934619903564, "learning_rate": 3.229537366548043e-05, "loss": 0.5393, "step": 9435 }, { "epoch": 4.193777777777778, "grad_norm": 4.7250447273254395, "learning_rate": 3.227758007117438e-05, "loss": 1.047, "step": 9436 }, { "epoch": 4.194222222222222, "grad_norm": 4.391071796417236, "learning_rate": 3.225978647686833e-05, "loss": 0.7677, "step": 9437 }, { "epoch": 4.1946666666666665, "grad_norm": 2.0137879848480225, "learning_rate": 3.224199288256228e-05, "loss": 0.2664, "step": 9438 }, { "epoch": 4.195111111111111, "grad_norm": 4.647293567657471, "learning_rate": 3.222419928825623e-05, "loss": 0.6575, "step": 9439 }, { "epoch": 4.195555555555556, "grad_norm": 3.5285840034484863, "learning_rate": 3.2206405693950184e-05, "loss": 0.8251, "step": 9440 }, { "epoch": 4.196, "grad_norm": 4.494802474975586, "learning_rate": 3.218861209964413e-05, "loss": 0.7617, "step": 9441 }, { "epoch": 4.196444444444444, "grad_norm": 3.300382614135742, "learning_rate": 3.217081850533808e-05, "loss": 0.8305, "step": 9442 }, { "epoch": 4.196888888888889, "grad_norm": 5.507649898529053, "learning_rate": 3.2153024911032034e-05, "loss": 0.8494, "step": 9443 }, { "epoch": 4.197333333333333, "grad_norm": 4.006106853485107, "learning_rate": 3.2135231316725977e-05, "loss": 0.6544, "step": 9444 }, { "epoch": 4.197777777777778, "grad_norm": 4.67263126373291, "learning_rate": 3.2117437722419926e-05, "loss": 0.9304, "step": 9445 }, { "epoch": 4.198222222222222, "grad_norm": 4.910307884216309, "learning_rate": 3.209964412811388e-05, "loss": 0.8954, "step": 9446 }, { "epoch": 4.198666666666667, "grad_norm": 6.133388996124268, "learning_rate": 3.208185053380783e-05, "loss": 0.6887, "step": 9447 }, { "epoch": 4.199111111111111, "grad_norm": 4.447690010070801, "learning_rate": 3.206405693950178e-05, "loss": 0.9454, "step": 9448 }, { "epoch": 4.1995555555555555, "grad_norm": 4.388811111450195, "learning_rate": 3.204626334519573e-05, "loss": 0.8632, "step": 9449 }, { "epoch": 4.2, "grad_norm": 3.5809247493743896, "learning_rate": 3.202846975088968e-05, "loss": 0.1799, "step": 9450 }, { "epoch": 4.200444444444445, "grad_norm": 2.599862575531006, "learning_rate": 3.201067615658363e-05, "loss": 1.5166, "step": 9451 }, { "epoch": 4.200888888888889, "grad_norm": 3.278106451034546, "learning_rate": 3.199288256227758e-05, "loss": 1.6669, "step": 9452 }, { "epoch": 4.201333333333333, "grad_norm": 2.277113676071167, "learning_rate": 3.197508896797153e-05, "loss": 0.5748, "step": 9453 }, { "epoch": 4.201777777777778, "grad_norm": 3.8572428226470947, "learning_rate": 3.195729537366548e-05, "loss": 1.4989, "step": 9454 }, { "epoch": 4.202222222222222, "grad_norm": 3.595043659210205, "learning_rate": 3.193950177935943e-05, "loss": 1.0113, "step": 9455 }, { "epoch": 4.2026666666666666, "grad_norm": 3.511258840560913, "learning_rate": 3.192170818505339e-05, "loss": 1.4261, "step": 9456 }, { "epoch": 4.203111111111111, "grad_norm": 3.4274392127990723, "learning_rate": 3.190391459074733e-05, "loss": 1.0837, "step": 9457 }, { "epoch": 4.203555555555556, "grad_norm": 3.7889490127563477, "learning_rate": 3.188612099644128e-05, "loss": 1.3672, "step": 9458 }, { "epoch": 4.204, "grad_norm": 3.413290500640869, "learning_rate": 3.1868327402135236e-05, "loss": 1.0793, "step": 9459 }, { "epoch": 4.204444444444444, "grad_norm": 3.5661888122558594, "learning_rate": 3.185053380782918e-05, "loss": 1.704, "step": 9460 }, { "epoch": 4.204888888888889, "grad_norm": 3.6765122413635254, "learning_rate": 3.1832740213523135e-05, "loss": 1.1704, "step": 9461 }, { "epoch": 4.205333333333333, "grad_norm": 3.350315570831299, "learning_rate": 3.1814946619217085e-05, "loss": 1.5331, "step": 9462 }, { "epoch": 4.205777777777778, "grad_norm": 3.8041248321533203, "learning_rate": 3.1797153024911035e-05, "loss": 1.1945, "step": 9463 }, { "epoch": 4.206222222222222, "grad_norm": 3.120882987976074, "learning_rate": 3.1779359430604985e-05, "loss": 0.9308, "step": 9464 }, { "epoch": 4.206666666666667, "grad_norm": 3.490065574645996, "learning_rate": 3.1761565836298934e-05, "loss": 1.0944, "step": 9465 }, { "epoch": 4.207111111111111, "grad_norm": 3.804657459259033, "learning_rate": 3.1743772241992884e-05, "loss": 1.2981, "step": 9466 }, { "epoch": 4.2075555555555555, "grad_norm": 4.033463954925537, "learning_rate": 3.1725978647686834e-05, "loss": 0.8921, "step": 9467 }, { "epoch": 4.208, "grad_norm": 3.6122589111328125, "learning_rate": 3.1708185053380783e-05, "loss": 0.8588, "step": 9468 }, { "epoch": 4.208444444444444, "grad_norm": 4.304235935211182, "learning_rate": 3.169039145907473e-05, "loss": 1.2107, "step": 9469 }, { "epoch": 4.208888888888889, "grad_norm": 4.344990253448486, "learning_rate": 3.167259786476868e-05, "loss": 1.1185, "step": 9470 }, { "epoch": 4.209333333333333, "grad_norm": 3.5502078533172607, "learning_rate": 3.165480427046263e-05, "loss": 1.0842, "step": 9471 }, { "epoch": 4.209777777777778, "grad_norm": 4.145521640777588, "learning_rate": 3.163701067615659e-05, "loss": 1.2853, "step": 9472 }, { "epoch": 4.210222222222222, "grad_norm": 3.878098964691162, "learning_rate": 3.161921708185053e-05, "loss": 0.8788, "step": 9473 }, { "epoch": 4.210666666666667, "grad_norm": 3.4567415714263916, "learning_rate": 3.160142348754448e-05, "loss": 0.9139, "step": 9474 }, { "epoch": 4.211111111111111, "grad_norm": 3.87986421585083, "learning_rate": 3.158362989323844e-05, "loss": 0.7227, "step": 9475 }, { "epoch": 4.211555555555556, "grad_norm": 5.376968860626221, "learning_rate": 3.156583629893239e-05, "loss": 1.4257, "step": 9476 }, { "epoch": 4.212, "grad_norm": 5.025868892669678, "learning_rate": 3.154804270462634e-05, "loss": 0.8064, "step": 9477 }, { "epoch": 4.212444444444444, "grad_norm": 3.2380011081695557, "learning_rate": 3.153024911032029e-05, "loss": 0.6622, "step": 9478 }, { "epoch": 4.212888888888889, "grad_norm": 4.465753555297852, "learning_rate": 3.151245551601424e-05, "loss": 0.9084, "step": 9479 }, { "epoch": 4.213333333333333, "grad_norm": 4.312685489654541, "learning_rate": 3.149466192170819e-05, "loss": 1.0134, "step": 9480 }, { "epoch": 4.213777777777778, "grad_norm": 3.1639740467071533, "learning_rate": 3.147686832740214e-05, "loss": 0.6682, "step": 9481 }, { "epoch": 4.214222222222222, "grad_norm": 3.7357571125030518, "learning_rate": 3.1459074733096086e-05, "loss": 0.8452, "step": 9482 }, { "epoch": 4.214666666666667, "grad_norm": 2.6543757915496826, "learning_rate": 3.1441281138790036e-05, "loss": 0.5144, "step": 9483 }, { "epoch": 4.215111111111111, "grad_norm": 4.193578243255615, "learning_rate": 3.1423487544483986e-05, "loss": 0.6794, "step": 9484 }, { "epoch": 4.2155555555555555, "grad_norm": 3.771239757537842, "learning_rate": 3.140569395017794e-05, "loss": 0.7003, "step": 9485 }, { "epoch": 4.216, "grad_norm": 4.657698631286621, "learning_rate": 3.1387900355871885e-05, "loss": 1.0667, "step": 9486 }, { "epoch": 4.216444444444445, "grad_norm": 3.7996761798858643, "learning_rate": 3.1370106761565835e-05, "loss": 0.7508, "step": 9487 }, { "epoch": 4.216888888888889, "grad_norm": 3.673656940460205, "learning_rate": 3.135231316725979e-05, "loss": 0.9042, "step": 9488 }, { "epoch": 4.217333333333333, "grad_norm": 4.215774059295654, "learning_rate": 3.1334519572953734e-05, "loss": 0.5879, "step": 9489 }, { "epoch": 4.217777777777778, "grad_norm": 5.385339260101318, "learning_rate": 3.1316725978647684e-05, "loss": 1.1556, "step": 9490 }, { "epoch": 4.218222222222222, "grad_norm": 5.48359489440918, "learning_rate": 3.129893238434164e-05, "loss": 0.8134, "step": 9491 }, { "epoch": 4.218666666666667, "grad_norm": 4.134088516235352, "learning_rate": 3.128113879003559e-05, "loss": 0.9353, "step": 9492 }, { "epoch": 4.219111111111111, "grad_norm": 4.422386646270752, "learning_rate": 3.126334519572954e-05, "loss": 0.7743, "step": 9493 }, { "epoch": 4.219555555555556, "grad_norm": 4.430136680603027, "learning_rate": 3.124555160142349e-05, "loss": 0.8493, "step": 9494 }, { "epoch": 4.22, "grad_norm": 4.236592769622803, "learning_rate": 3.122775800711744e-05, "loss": 0.6579, "step": 9495 }, { "epoch": 4.220444444444444, "grad_norm": 3.639747142791748, "learning_rate": 3.120996441281139e-05, "loss": 0.756, "step": 9496 }, { "epoch": 4.220888888888889, "grad_norm": 4.818484783172607, "learning_rate": 3.119217081850534e-05, "loss": 1.1835, "step": 9497 }, { "epoch": 4.221333333333333, "grad_norm": 4.322108745574951, "learning_rate": 3.117437722419929e-05, "loss": 0.6088, "step": 9498 }, { "epoch": 4.221777777777778, "grad_norm": 4.885183334350586, "learning_rate": 3.115658362989324e-05, "loss": 0.8394, "step": 9499 }, { "epoch": 4.222222222222222, "grad_norm": 0.4897964298725128, "learning_rate": 3.113879003558719e-05, "loss": 0.0723, "step": 9500 } ], "logging_steps": 1, "max_steps": 11250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.299541558648013e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }