{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 903, "global_step": 3611, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00027693159789531985, "grad_norm": 24.09097671508789, "learning_rate": 4.000000000000001e-06, "loss": 52.9022, "step": 1 }, { "epoch": 0.00027693159789531985, "eval_loss": 6.285526275634766, "eval_runtime": 810.5492, "eval_samples_per_second": 7.504, "eval_steps_per_second": 1.877, "step": 1 }, { "epoch": 0.0005538631957906397, "grad_norm": 26.264270782470703, "learning_rate": 8.000000000000001e-06, "loss": 50.5719, "step": 2 }, { "epoch": 0.0008307947936859596, "grad_norm": 24.111114501953125, "learning_rate": 1.2e-05, "loss": 51.6265, "step": 3 }, { "epoch": 0.0011077263915812794, "grad_norm": 23.482765197753906, "learning_rate": 1.6000000000000003e-05, "loss": 48.2097, "step": 4 }, { "epoch": 0.0013846579894765993, "grad_norm": 23.02947235107422, "learning_rate": 2e-05, "loss": 50.7815, "step": 5 }, { "epoch": 0.001661589587371919, "grad_norm": 21.88750648498535, "learning_rate": 2.4e-05, "loss": 51.859, "step": 6 }, { "epoch": 0.001938521185267239, "grad_norm": 23.62565803527832, "learning_rate": 2.8000000000000003e-05, "loss": 52.6551, "step": 7 }, { "epoch": 0.002215452783162559, "grad_norm": 28.487407684326172, "learning_rate": 3.2000000000000005e-05, "loss": 46.7997, "step": 8 }, { "epoch": 0.0024923843810578787, "grad_norm": 30.373289108276367, "learning_rate": 3.6e-05, "loss": 46.1472, "step": 9 }, { "epoch": 0.0027693159789531985, "grad_norm": 27.043048858642578, "learning_rate": 4e-05, "loss": 51.6249, "step": 10 }, { "epoch": 0.0030462475768485184, "grad_norm": 32.95696258544922, "learning_rate": 4.4000000000000006e-05, "loss": 39.4519, "step": 11 }, { "epoch": 0.003323179174743838, "grad_norm": 26.835208892822266, "learning_rate": 4.8e-05, "loss": 42.0588, "step": 12 }, { "epoch": 0.003600110772639158, "grad_norm": 29.825239181518555, "learning_rate": 5.2000000000000004e-05, "loss": 43.2945, "step": 13 }, { "epoch": 0.003877042370534478, "grad_norm": 29.57949447631836, "learning_rate": 5.6000000000000006e-05, "loss": 42.6722, "step": 14 }, { "epoch": 0.004153973968429798, "grad_norm": 35.65092849731445, "learning_rate": 6e-05, "loss": 46.1249, "step": 15 }, { "epoch": 0.004430905566325118, "grad_norm": 33.953556060791016, "learning_rate": 6.400000000000001e-05, "loss": 38.0391, "step": 16 }, { "epoch": 0.004707837164220438, "grad_norm": 36.93070983886719, "learning_rate": 6.800000000000001e-05, "loss": 40.2576, "step": 17 }, { "epoch": 0.004984768762115757, "grad_norm": 37.30794143676758, "learning_rate": 7.2e-05, "loss": 35.7331, "step": 18 }, { "epoch": 0.005261700360011078, "grad_norm": 35.6324577331543, "learning_rate": 7.6e-05, "loss": 33.1261, "step": 19 }, { "epoch": 0.005538631957906397, "grad_norm": 37.69329833984375, "learning_rate": 8e-05, "loss": 34.4139, "step": 20 }, { "epoch": 0.005815563555801717, "grad_norm": 39.204811096191406, "learning_rate": 8.4e-05, "loss": 25.1644, "step": 21 }, { "epoch": 0.006092495153697037, "grad_norm": 77.47728729248047, "learning_rate": 8.800000000000001e-05, "loss": 24.5212, "step": 22 }, { "epoch": 0.006369426751592357, "grad_norm": 35.82951354980469, "learning_rate": 9.200000000000001e-05, "loss": 23.2199, "step": 23 }, { "epoch": 0.006646358349487676, "grad_norm": 37.444766998291016, "learning_rate": 9.6e-05, "loss": 22.59, "step": 24 }, { "epoch": 0.006923289947382997, "grad_norm": 42.77293014526367, "learning_rate": 0.0001, "loss": 24.103, "step": 25 }, { "epoch": 0.007200221545278316, "grad_norm": 34.63228988647461, "learning_rate": 0.00010400000000000001, "loss": 21.5396, "step": 26 }, { "epoch": 0.007477153143173636, "grad_norm": 33.11638259887695, "learning_rate": 0.00010800000000000001, "loss": 18.6901, "step": 27 }, { "epoch": 0.007754084741068956, "grad_norm": 25.6596736907959, "learning_rate": 0.00011200000000000001, "loss": 16.7677, "step": 28 }, { "epoch": 0.008031016338964275, "grad_norm": 26.76543426513672, "learning_rate": 0.000116, "loss": 15.0301, "step": 29 }, { "epoch": 0.008307947936859596, "grad_norm": 27.160602569580078, "learning_rate": 0.00012, "loss": 12.1681, "step": 30 }, { "epoch": 0.008584879534754916, "grad_norm": 27.516878128051758, "learning_rate": 0.000124, "loss": 11.442, "step": 31 }, { "epoch": 0.008861811132650235, "grad_norm": 31.20770263671875, "learning_rate": 0.00012800000000000002, "loss": 7.6272, "step": 32 }, { "epoch": 0.009138742730545555, "grad_norm": 20.336532592773438, "learning_rate": 0.000132, "loss": 6.3337, "step": 33 }, { "epoch": 0.009415674328440876, "grad_norm": 26.660675048828125, "learning_rate": 0.00013600000000000003, "loss": 4.5571, "step": 34 }, { "epoch": 0.009692605926336195, "grad_norm": 21.37610626220703, "learning_rate": 0.00014, "loss": 3.659, "step": 35 }, { "epoch": 0.009969537524231515, "grad_norm": 32.23893356323242, "learning_rate": 0.000144, "loss": 5.3739, "step": 36 }, { "epoch": 0.010246469122126834, "grad_norm": 24.701919555664062, "learning_rate": 0.000148, "loss": 3.8149, "step": 37 }, { "epoch": 0.010523400720022155, "grad_norm": 22.42169189453125, "learning_rate": 0.000152, "loss": 3.7777, "step": 38 }, { "epoch": 0.010800332317917475, "grad_norm": 16.242782592773438, "learning_rate": 0.00015600000000000002, "loss": 3.3434, "step": 39 }, { "epoch": 0.011077263915812794, "grad_norm": 12.208995819091797, "learning_rate": 0.00016, "loss": 2.9402, "step": 40 }, { "epoch": 0.011354195513708113, "grad_norm": 20.679357528686523, "learning_rate": 0.000164, "loss": 3.7932, "step": 41 }, { "epoch": 0.011631127111603435, "grad_norm": 15.438067436218262, "learning_rate": 0.000168, "loss": 4.0106, "step": 42 }, { "epoch": 0.011908058709498754, "grad_norm": 18.89388656616211, "learning_rate": 0.000172, "loss": 3.9148, "step": 43 }, { "epoch": 0.012184990307394073, "grad_norm": 10.173064231872559, "learning_rate": 0.00017600000000000002, "loss": 2.9731, "step": 44 }, { "epoch": 0.012461921905289393, "grad_norm": 12.305123329162598, "learning_rate": 0.00018, "loss": 3.1847, "step": 45 }, { "epoch": 0.012738853503184714, "grad_norm": 10.65303897857666, "learning_rate": 0.00018400000000000003, "loss": 3.2399, "step": 46 }, { "epoch": 0.013015785101080033, "grad_norm": 12.510346412658691, "learning_rate": 0.000188, "loss": 3.1796, "step": 47 }, { "epoch": 0.013292716698975353, "grad_norm": 17.95038604736328, "learning_rate": 0.000192, "loss": 3.3959, "step": 48 }, { "epoch": 0.013569648296870672, "grad_norm": 7.766382217407227, "learning_rate": 0.000196, "loss": 3.1379, "step": 49 }, { "epoch": 0.013846579894765993, "grad_norm": 18.828136444091797, "learning_rate": 0.0002, "loss": 4.2675, "step": 50 }, { "epoch": 0.014123511492661313, "grad_norm": 8.210630416870117, "learning_rate": 0.0001999999610842169, "loss": 2.8736, "step": 51 }, { "epoch": 0.014400443090556632, "grad_norm": 6.19714879989624, "learning_rate": 0.00019999984433689786, "loss": 2.9424, "step": 52 }, { "epoch": 0.014677374688451952, "grad_norm": 8.041394233703613, "learning_rate": 0.0001999996497581338, "loss": 3.0541, "step": 53 }, { "epoch": 0.014954306286347273, "grad_norm": 8.640958786010742, "learning_rate": 0.00019999937734807612, "loss": 3.2354, "step": 54 }, { "epoch": 0.015231237884242592, "grad_norm": 6.604707717895508, "learning_rate": 0.00019999902710693683, "loss": 3.5656, "step": 55 }, { "epoch": 0.015508169482137912, "grad_norm": 5.787939548492432, "learning_rate": 0.00019999859903498856, "loss": 2.9423, "step": 56 }, { "epoch": 0.015785101080033233, "grad_norm": 5.111888408660889, "learning_rate": 0.00019999809313256446, "loss": 2.926, "step": 57 }, { "epoch": 0.01606203267792855, "grad_norm": 5.36300802230835, "learning_rate": 0.0001999975094000583, "loss": 2.9573, "step": 58 }, { "epoch": 0.01633896427582387, "grad_norm": 5.115900039672852, "learning_rate": 0.00019999684783792443, "loss": 2.8367, "step": 59 }, { "epoch": 0.016615895873719193, "grad_norm": 6.6548075675964355, "learning_rate": 0.00019999610844667772, "loss": 2.8389, "step": 60 }, { "epoch": 0.01689282747161451, "grad_norm": 3.7862396240234375, "learning_rate": 0.00019999529122689363, "loss": 2.7203, "step": 61 }, { "epoch": 0.01716975906950983, "grad_norm": 4.995016098022461, "learning_rate": 0.00019999439617920825, "loss": 3.084, "step": 62 }, { "epoch": 0.01744669066740515, "grad_norm": 7.811636447906494, "learning_rate": 0.00019999342330431823, "loss": 3.8168, "step": 63 }, { "epoch": 0.01772362226530047, "grad_norm": 8.833688735961914, "learning_rate": 0.00019999237260298072, "loss": 3.2426, "step": 64 }, { "epoch": 0.01800055386319579, "grad_norm": 7.167825222015381, "learning_rate": 0.00019999124407601353, "loss": 3.4158, "step": 65 }, { "epoch": 0.01827748546109111, "grad_norm": 9.22640609741211, "learning_rate": 0.000199990037724295, "loss": 3.6612, "step": 66 }, { "epoch": 0.01855441705898643, "grad_norm": 7.312063217163086, "learning_rate": 0.00019998875354876406, "loss": 3.4862, "step": 67 }, { "epoch": 0.01883134865688175, "grad_norm": 7.195916652679443, "learning_rate": 0.0001999873915504202, "loss": 3.1429, "step": 68 }, { "epoch": 0.01910828025477707, "grad_norm": 5.664394855499268, "learning_rate": 0.00019998595173032347, "loss": 2.7398, "step": 69 }, { "epoch": 0.01938521185267239, "grad_norm": 8.402835845947266, "learning_rate": 0.00019998443408959454, "loss": 2.8071, "step": 70 }, { "epoch": 0.019662143450567708, "grad_norm": 6.3753790855407715, "learning_rate": 0.00019998283862941458, "loss": 2.9036, "step": 71 }, { "epoch": 0.01993907504846303, "grad_norm": 6.662612438201904, "learning_rate": 0.0001999811653510254, "loss": 3.0893, "step": 72 }, { "epoch": 0.02021600664635835, "grad_norm": 7.037993907928467, "learning_rate": 0.00019997941425572928, "loss": 3.5855, "step": 73 }, { "epoch": 0.020492938244253668, "grad_norm": 4.904363632202148, "learning_rate": 0.00019997758534488915, "loss": 3.2274, "step": 74 }, { "epoch": 0.02076986984214899, "grad_norm": 9.387070655822754, "learning_rate": 0.0001999756786199285, "loss": 4.6668, "step": 75 }, { "epoch": 0.02104680144004431, "grad_norm": 2.9691836833953857, "learning_rate": 0.00019997369408233134, "loss": 3.1311, "step": 76 }, { "epoch": 0.021323733037939628, "grad_norm": 4.634636878967285, "learning_rate": 0.00019997163173364233, "loss": 3.2597, "step": 77 }, { "epoch": 0.02160066463583495, "grad_norm": 5.3558759689331055, "learning_rate": 0.00019996949157546654, "loss": 3.3337, "step": 78 }, { "epoch": 0.021877596233730267, "grad_norm": 4.282164096832275, "learning_rate": 0.00019996727360946972, "loss": 3.2675, "step": 79 }, { "epoch": 0.022154527831625588, "grad_norm": 4.271265506744385, "learning_rate": 0.00019996497783737817, "loss": 3.049, "step": 80 }, { "epoch": 0.02243145942952091, "grad_norm": 3.7361857891082764, "learning_rate": 0.0001999626042609787, "loss": 2.9496, "step": 81 }, { "epoch": 0.022708391027416227, "grad_norm": 6.172094345092773, "learning_rate": 0.00019996015288211872, "loss": 2.8293, "step": 82 }, { "epoch": 0.022985322625311548, "grad_norm": 4.567731857299805, "learning_rate": 0.00019995762370270614, "loss": 3.11, "step": 83 }, { "epoch": 0.02326225422320687, "grad_norm": 9.828329086303711, "learning_rate": 0.00019995501672470951, "loss": 3.1826, "step": 84 }, { "epoch": 0.023539185821102187, "grad_norm": 7.35584831237793, "learning_rate": 0.00019995233195015783, "loss": 3.1093, "step": 85 }, { "epoch": 0.023816117418997508, "grad_norm": 6.213433265686035, "learning_rate": 0.00019994956938114075, "loss": 3.0147, "step": 86 }, { "epoch": 0.024093049016892826, "grad_norm": 7.855972766876221, "learning_rate": 0.0001999467290198084, "loss": 3.8945, "step": 87 }, { "epoch": 0.024369980614788147, "grad_norm": 6.687296390533447, "learning_rate": 0.00019994381086837148, "loss": 3.1185, "step": 88 }, { "epoch": 0.024646912212683468, "grad_norm": 5.172341823577881, "learning_rate": 0.00019994081492910124, "loss": 3.0799, "step": 89 }, { "epoch": 0.024923843810578786, "grad_norm": 4.361382484436035, "learning_rate": 0.00019993774120432946, "loss": 2.8538, "step": 90 }, { "epoch": 0.025200775408474107, "grad_norm": 2.921813726425171, "learning_rate": 0.00019993458969644844, "loss": 3.1521, "step": 91 }, { "epoch": 0.025477707006369428, "grad_norm": 5.353600025177002, "learning_rate": 0.0001999313604079111, "loss": 3.3836, "step": 92 }, { "epoch": 0.025754638604264746, "grad_norm": 24.217308044433594, "learning_rate": 0.00019992805334123078, "loss": 3.0601, "step": 93 }, { "epoch": 0.026031570202160067, "grad_norm": 15.985791206359863, "learning_rate": 0.0001999246684989815, "loss": 3.2496, "step": 94 }, { "epoch": 0.026308501800055388, "grad_norm": 5.215633869171143, "learning_rate": 0.00019992120588379773, "loss": 3.2689, "step": 95 }, { "epoch": 0.026585433397950706, "grad_norm": 5.993437767028809, "learning_rate": 0.00019991766549837438, "loss": 3.0728, "step": 96 }, { "epoch": 0.026862364995846027, "grad_norm": 8.502284049987793, "learning_rate": 0.00019991404734546708, "loss": 3.1705, "step": 97 }, { "epoch": 0.027139296593741345, "grad_norm": 8.067936897277832, "learning_rate": 0.00019991035142789186, "loss": 2.809, "step": 98 }, { "epoch": 0.027416228191636666, "grad_norm": 4.59942626953125, "learning_rate": 0.00019990657774852534, "loss": 2.972, "step": 99 }, { "epoch": 0.027693159789531987, "grad_norm": 11.113409042358398, "learning_rate": 0.0001999027263103046, "loss": 4.6655, "step": 100 }, { "epoch": 0.027970091387427305, "grad_norm": 7.234707832336426, "learning_rate": 0.00019989879711622725, "loss": 3.2355, "step": 101 }, { "epoch": 0.028247022985322626, "grad_norm": 2.6375296115875244, "learning_rate": 0.00019989479016935152, "loss": 2.6118, "step": 102 }, { "epoch": 0.028523954583217947, "grad_norm": 3.2083632946014404, "learning_rate": 0.00019989070547279605, "loss": 3.017, "step": 103 }, { "epoch": 0.028800886181113265, "grad_norm": 3.726649522781372, "learning_rate": 0.00019988654302974, "loss": 3.1657, "step": 104 }, { "epoch": 0.029077817779008586, "grad_norm": 3.17037296295166, "learning_rate": 0.00019988230284342308, "loss": 2.8929, "step": 105 }, { "epoch": 0.029354749376903903, "grad_norm": 3.549494504928589, "learning_rate": 0.00019987798491714548, "loss": 2.9011, "step": 106 }, { "epoch": 0.029631680974799224, "grad_norm": 3.223787784576416, "learning_rate": 0.0001998735892542679, "loss": 3.0911, "step": 107 }, { "epoch": 0.029908612572694546, "grad_norm": 10.136894226074219, "learning_rate": 0.0001998691158582116, "loss": 3.3907, "step": 108 }, { "epoch": 0.030185544170589863, "grad_norm": 4.363787651062012, "learning_rate": 0.00019986456473245826, "loss": 3.1103, "step": 109 }, { "epoch": 0.030462475768485184, "grad_norm": 3.338764190673828, "learning_rate": 0.00019985993588055013, "loss": 2.8275, "step": 110 }, { "epoch": 0.030739407366380506, "grad_norm": 4.156650543212891, "learning_rate": 0.00019985522930608985, "loss": 3.1066, "step": 111 }, { "epoch": 0.031016338964275823, "grad_norm": 3.8609375953674316, "learning_rate": 0.0001998504450127407, "loss": 2.9508, "step": 112 }, { "epoch": 0.03129327056217114, "grad_norm": 18.497459411621094, "learning_rate": 0.00019984558300422628, "loss": 3.165, "step": 113 }, { "epoch": 0.031570202160066466, "grad_norm": 3.3081798553466797, "learning_rate": 0.00019984064328433084, "loss": 2.9892, "step": 114 }, { "epoch": 0.03184713375796178, "grad_norm": 5.705508232116699, "learning_rate": 0.000199835625856899, "loss": 3.1684, "step": 115 }, { "epoch": 0.0321240653558571, "grad_norm": 5.110528469085693, "learning_rate": 0.00019983053072583596, "loss": 3.1865, "step": 116 }, { "epoch": 0.032400996953752426, "grad_norm": 5.05802059173584, "learning_rate": 0.0001998253578951073, "loss": 2.9766, "step": 117 }, { "epoch": 0.03267792855164774, "grad_norm": 7.192070484161377, "learning_rate": 0.00019982010736873908, "loss": 3.2792, "step": 118 }, { "epoch": 0.03295486014954306, "grad_norm": 3.830031156539917, "learning_rate": 0.00019981477915081793, "loss": 2.721, "step": 119 }, { "epoch": 0.033231791747438386, "grad_norm": 3.6124696731567383, "learning_rate": 0.00019980937324549084, "loss": 2.6529, "step": 120 }, { "epoch": 0.0335087233453337, "grad_norm": 3.6310627460479736, "learning_rate": 0.00019980388965696534, "loss": 2.9436, "step": 121 }, { "epoch": 0.03378565494322902, "grad_norm": 6.552757740020752, "learning_rate": 0.00019979832838950938, "loss": 3.1585, "step": 122 }, { "epoch": 0.034062586541124346, "grad_norm": 3.328778028488159, "learning_rate": 0.00019979268944745137, "loss": 2.8573, "step": 123 }, { "epoch": 0.03433951813901966, "grad_norm": 4.673979759216309, "learning_rate": 0.00019978697283518023, "loss": 3.4607, "step": 124 }, { "epoch": 0.03461644973691498, "grad_norm": 5.156442165374756, "learning_rate": 0.00019978117855714526, "loss": 3.9396, "step": 125 }, { "epoch": 0.0348933813348103, "grad_norm": 20.98853874206543, "learning_rate": 0.00019977530661785618, "loss": 3.2191, "step": 126 }, { "epoch": 0.03517031293270562, "grad_norm": 2.979680299758911, "learning_rate": 0.0001997693570218833, "loss": 3.0521, "step": 127 }, { "epoch": 0.03544724453060094, "grad_norm": 3.1123874187469482, "learning_rate": 0.00019976332977385727, "loss": 2.9523, "step": 128 }, { "epoch": 0.03572417612849626, "grad_norm": 4.905653953552246, "learning_rate": 0.00019975722487846918, "loss": 3.0467, "step": 129 }, { "epoch": 0.03600110772639158, "grad_norm": 2.7707560062408447, "learning_rate": 0.00019975104234047053, "loss": 3.1212, "step": 130 }, { "epoch": 0.0362780393242869, "grad_norm": 3.286872148513794, "learning_rate": 0.00019974478216467335, "loss": 3.0796, "step": 131 }, { "epoch": 0.03655497092218222, "grad_norm": 4.147380352020264, "learning_rate": 0.00019973844435594996, "loss": 3.1645, "step": 132 }, { "epoch": 0.03683190252007754, "grad_norm": 3.7690138816833496, "learning_rate": 0.00019973202891923324, "loss": 2.9335, "step": 133 }, { "epoch": 0.03710883411797286, "grad_norm": 3.537510633468628, "learning_rate": 0.0001997255358595164, "loss": 3.0739, "step": 134 }, { "epoch": 0.03738576571586818, "grad_norm": 2.9214696884155273, "learning_rate": 0.0001997189651818531, "loss": 2.9609, "step": 135 }, { "epoch": 0.0376626973137635, "grad_norm": 4.733245372772217, "learning_rate": 0.00019971231689135737, "loss": 2.8016, "step": 136 }, { "epoch": 0.03793962891165882, "grad_norm": 6.125110149383545, "learning_rate": 0.0001997055909932037, "loss": 3.333, "step": 137 }, { "epoch": 0.03821656050955414, "grad_norm": 4.021219730377197, "learning_rate": 0.00019969878749262696, "loss": 3.4326, "step": 138 }, { "epoch": 0.03849349210744946, "grad_norm": 6.539845943450928, "learning_rate": 0.00019969190639492244, "loss": 3.0441, "step": 139 }, { "epoch": 0.03877042370534478, "grad_norm": 3.1108386516571045, "learning_rate": 0.00019968494770544577, "loss": 2.7918, "step": 140 }, { "epoch": 0.0390473553032401, "grad_norm": 2.605855703353882, "learning_rate": 0.00019967791142961304, "loss": 2.6988, "step": 141 }, { "epoch": 0.039324286901135416, "grad_norm": 2.4704034328460693, "learning_rate": 0.00019967079757290067, "loss": 2.9355, "step": 142 }, { "epoch": 0.03960121849903074, "grad_norm": 5.313930511474609, "learning_rate": 0.00019966360614084549, "loss": 2.9783, "step": 143 }, { "epoch": 0.03987815009692606, "grad_norm": 4.905053615570068, "learning_rate": 0.00019965633713904472, "loss": 2.7292, "step": 144 }, { "epoch": 0.040155081694821376, "grad_norm": 6.81724214553833, "learning_rate": 0.00019964899057315591, "loss": 3.4628, "step": 145 }, { "epoch": 0.0404320132927167, "grad_norm": 3.764613151550293, "learning_rate": 0.00019964156644889706, "loss": 2.9632, "step": 146 }, { "epoch": 0.04070894489061202, "grad_norm": 3.9981393814086914, "learning_rate": 0.00019963406477204644, "loss": 2.6181, "step": 147 }, { "epoch": 0.040985876488507336, "grad_norm": 26.09824562072754, "learning_rate": 0.0001996264855484427, "loss": 3.2855, "step": 148 }, { "epoch": 0.04126280808640266, "grad_norm": 3.9404141902923584, "learning_rate": 0.00019961882878398492, "loss": 2.9334, "step": 149 }, { "epoch": 0.04153973968429798, "grad_norm": 8.878583908081055, "learning_rate": 0.00019961109448463248, "loss": 4.5557, "step": 150 }, { "epoch": 0.041816671282193296, "grad_norm": 21.1249942779541, "learning_rate": 0.00019960328265640506, "loss": 3.4292, "step": 151 }, { "epoch": 0.04209360288008862, "grad_norm": 3.0727407932281494, "learning_rate": 0.00019959539330538273, "loss": 2.7764, "step": 152 }, { "epoch": 0.04237053447798394, "grad_norm": 16.201549530029297, "learning_rate": 0.00019958742643770593, "loss": 3.2883, "step": 153 }, { "epoch": 0.042647466075879256, "grad_norm": 11.132794380187988, "learning_rate": 0.0001995793820595754, "loss": 3.4929, "step": 154 }, { "epoch": 0.04292439767377458, "grad_norm": 3.189621686935425, "learning_rate": 0.0001995712601772522, "loss": 3.0601, "step": 155 }, { "epoch": 0.0432013292716699, "grad_norm": 5.149763107299805, "learning_rate": 0.0001995630607970577, "loss": 3.0358, "step": 156 }, { "epoch": 0.043478260869565216, "grad_norm": 3.466712474822998, "learning_rate": 0.00019955478392537362, "loss": 2.9031, "step": 157 }, { "epoch": 0.043755192467460534, "grad_norm": 1.9142956733703613, "learning_rate": 0.00019954642956864196, "loss": 2.8122, "step": 158 }, { "epoch": 0.04403212406535586, "grad_norm": 2.5402872562408447, "learning_rate": 0.00019953799773336507, "loss": 2.8495, "step": 159 }, { "epoch": 0.044309055663251176, "grad_norm": 3.6350300312042236, "learning_rate": 0.00019952948842610554, "loss": 3.1537, "step": 160 }, { "epoch": 0.044585987261146494, "grad_norm": 4.203430652618408, "learning_rate": 0.00019952090165348637, "loss": 2.7296, "step": 161 }, { "epoch": 0.04486291885904182, "grad_norm": 3.4311680793762207, "learning_rate": 0.0001995122374221907, "loss": 3.0541, "step": 162 }, { "epoch": 0.045139850456937136, "grad_norm": 3.866399049758911, "learning_rate": 0.00019950349573896213, "loss": 3.0068, "step": 163 }, { "epoch": 0.045416782054832454, "grad_norm": 3.7808001041412354, "learning_rate": 0.00019949467661060433, "loss": 3.0596, "step": 164 }, { "epoch": 0.04569371365272778, "grad_norm": 4.888076305389404, "learning_rate": 0.00019948578004398148, "loss": 3.3471, "step": 165 }, { "epoch": 0.045970645250623096, "grad_norm": 3.015810251235962, "learning_rate": 0.00019947680604601783, "loss": 2.868, "step": 166 }, { "epoch": 0.046247576848518414, "grad_norm": 2.8251118659973145, "learning_rate": 0.00019946775462369805, "loss": 2.7384, "step": 167 }, { "epoch": 0.04652450844641374, "grad_norm": 4.449611186981201, "learning_rate": 0.00019945862578406697, "loss": 3.2198, "step": 168 }, { "epoch": 0.046801440044309056, "grad_norm": 5.678244590759277, "learning_rate": 0.00019944941953422968, "loss": 3.2162, "step": 169 }, { "epoch": 0.047078371642204374, "grad_norm": 8.139033317565918, "learning_rate": 0.0001994401358813516, "loss": 2.859, "step": 170 }, { "epoch": 0.0473553032400997, "grad_norm": 6.555064678192139, "learning_rate": 0.00019943077483265833, "loss": 3.0907, "step": 171 }, { "epoch": 0.047632234837995016, "grad_norm": 4.014291763305664, "learning_rate": 0.00019942133639543573, "loss": 2.679, "step": 172 }, { "epoch": 0.047909166435890334, "grad_norm": 3.2168991565704346, "learning_rate": 0.0001994118205770298, "loss": 2.7681, "step": 173 }, { "epoch": 0.04818609803378565, "grad_norm": 7.394007205963135, "learning_rate": 0.000199402227384847, "loss": 3.5997, "step": 174 }, { "epoch": 0.048463029631680976, "grad_norm": 16.315059661865234, "learning_rate": 0.00019939255682635372, "loss": 4.6607, "step": 175 }, { "epoch": 0.048739961229576294, "grad_norm": 7.469921588897705, "learning_rate": 0.0001993828089090768, "loss": 3.3174, "step": 176 }, { "epoch": 0.04901689282747161, "grad_norm": 3.1955409049987793, "learning_rate": 0.00019937298364060312, "loss": 2.6456, "step": 177 }, { "epoch": 0.049293824425366936, "grad_norm": 6.934082508087158, "learning_rate": 0.00019936308102857993, "loss": 2.7126, "step": 178 }, { "epoch": 0.049570756023262254, "grad_norm": 2.4637463092803955, "learning_rate": 0.00019935310108071453, "loss": 2.608, "step": 179 }, { "epoch": 0.04984768762115757, "grad_norm": 2.381906747817993, "learning_rate": 0.0001993430438047745, "loss": 2.8141, "step": 180 }, { "epoch": 0.050124619219052896, "grad_norm": 12.824419975280762, "learning_rate": 0.00019933290920858752, "loss": 2.8778, "step": 181 }, { "epoch": 0.050401550816948214, "grad_norm": 2.076671838760376, "learning_rate": 0.00019932269730004153, "loss": 2.8058, "step": 182 }, { "epoch": 0.05067848241484353, "grad_norm": 2.2687172889709473, "learning_rate": 0.00019931240808708468, "loss": 2.9901, "step": 183 }, { "epoch": 0.050955414012738856, "grad_norm": 13.4617919921875, "learning_rate": 0.00019930204157772515, "loss": 2.9649, "step": 184 }, { "epoch": 0.051232345610634174, "grad_norm": 1.9112941026687622, "learning_rate": 0.00019929159778003138, "loss": 2.6419, "step": 185 }, { "epoch": 0.05150927720852949, "grad_norm": 3.7725117206573486, "learning_rate": 0.00019928107670213194, "loss": 2.993, "step": 186 }, { "epoch": 0.051786208806424816, "grad_norm": 4.537817478179932, "learning_rate": 0.00019927047835221554, "loss": 2.8517, "step": 187 }, { "epoch": 0.052063140404320134, "grad_norm": 2.7799930572509766, "learning_rate": 0.00019925980273853105, "loss": 2.9868, "step": 188 }, { "epoch": 0.05234007200221545, "grad_norm": 3.7512753009796143, "learning_rate": 0.00019924904986938754, "loss": 2.9711, "step": 189 }, { "epoch": 0.052617003600110776, "grad_norm": 3.921067714691162, "learning_rate": 0.00019923821975315403, "loss": 3.1084, "step": 190 }, { "epoch": 0.052893935198006094, "grad_norm": 6.35083532333374, "learning_rate": 0.00019922731239825978, "loss": 3.294, "step": 191 }, { "epoch": 0.05317086679590141, "grad_norm": 8.527310371398926, "learning_rate": 0.0001992163278131942, "loss": 2.9271, "step": 192 }, { "epoch": 0.05344779839379673, "grad_norm": 4.5866193771362305, "learning_rate": 0.00019920526600650674, "loss": 3.0789, "step": 193 }, { "epoch": 0.053724729991692054, "grad_norm": 2.8476200103759766, "learning_rate": 0.000199194126986807, "loss": 2.9648, "step": 194 }, { "epoch": 0.05400166158958737, "grad_norm": 4.191553592681885, "learning_rate": 0.00019918291076276466, "loss": 3.2052, "step": 195 }, { "epoch": 0.05427859318748269, "grad_norm": 12.183512687683105, "learning_rate": 0.00019917161734310943, "loss": 3.8112, "step": 196 }, { "epoch": 0.054555524785378014, "grad_norm": 3.692249059677124, "learning_rate": 0.0001991602467366312, "loss": 2.8357, "step": 197 }, { "epoch": 0.05483245638327333, "grad_norm": 6.3173394203186035, "learning_rate": 0.00019914879895217987, "loss": 3.1846, "step": 198 }, { "epoch": 0.05510938798116865, "grad_norm": 6.148683547973633, "learning_rate": 0.00019913727399866545, "loss": 2.8057, "step": 199 }, { "epoch": 0.055386319579063974, "grad_norm": 3.915633201599121, "learning_rate": 0.00019912567188505795, "loss": 3.6289, "step": 200 }, { "epoch": 0.05566325117695929, "grad_norm": 2.9523115158081055, "learning_rate": 0.00019911399262038753, "loss": 2.9399, "step": 201 }, { "epoch": 0.05594018277485461, "grad_norm": 2.9612538814544678, "learning_rate": 0.0001991022362137443, "loss": 2.9523, "step": 202 }, { "epoch": 0.056217114372749934, "grad_norm": 3.0619959831237793, "learning_rate": 0.0001990904026742785, "loss": 3.1371, "step": 203 }, { "epoch": 0.05649404597064525, "grad_norm": 2.0102360248565674, "learning_rate": 0.00019907849201120033, "loss": 3.1148, "step": 204 }, { "epoch": 0.05677097756854057, "grad_norm": 3.8012619018554688, "learning_rate": 0.00019906650423378005, "loss": 2.9833, "step": 205 }, { "epoch": 0.057047909166435894, "grad_norm": 3.3233330249786377, "learning_rate": 0.00019905443935134791, "loss": 2.5441, "step": 206 }, { "epoch": 0.05732484076433121, "grad_norm": 3.41413950920105, "learning_rate": 0.00019904229737329425, "loss": 3.0065, "step": 207 }, { "epoch": 0.05760177236222653, "grad_norm": 3.2364377975463867, "learning_rate": 0.00019903007830906935, "loss": 3.0149, "step": 208 }, { "epoch": 0.05787870396012185, "grad_norm": 2.9118869304656982, "learning_rate": 0.00019901778216818345, "loss": 2.8871, "step": 209 }, { "epoch": 0.05815563555801717, "grad_norm": 1.9055243730545044, "learning_rate": 0.00019900540896020688, "loss": 2.6669, "step": 210 }, { "epoch": 0.05843256715591249, "grad_norm": 2.51979923248291, "learning_rate": 0.0001989929586947699, "loss": 2.903, "step": 211 }, { "epoch": 0.05870949875380781, "grad_norm": 6.010900497436523, "learning_rate": 0.00019898043138156268, "loss": 3.3382, "step": 212 }, { "epoch": 0.05898643035170313, "grad_norm": 3.1433265209198, "learning_rate": 0.00019896782703033555, "loss": 2.7796, "step": 213 }, { "epoch": 0.05926336194959845, "grad_norm": 3.7899844646453857, "learning_rate": 0.00019895514565089855, "loss": 3.1045, "step": 214 }, { "epoch": 0.05954029354749377, "grad_norm": 4.300248146057129, "learning_rate": 0.00019894238725312184, "loss": 2.8547, "step": 215 }, { "epoch": 0.05981722514538909, "grad_norm": 4.351460933685303, "learning_rate": 0.0001989295518469355, "loss": 3.1198, "step": 216 }, { "epoch": 0.06009415674328441, "grad_norm": 14.191424369812012, "learning_rate": 0.0001989166394423295, "loss": 2.9771, "step": 217 }, { "epoch": 0.06037108834117973, "grad_norm": 8.077985763549805, "learning_rate": 0.00019890365004935379, "loss": 2.4881, "step": 218 }, { "epoch": 0.06064801993907505, "grad_norm": 5.447228908538818, "learning_rate": 0.00019889058367811822, "loss": 2.7146, "step": 219 }, { "epoch": 0.06092495153697037, "grad_norm": 2.0953845977783203, "learning_rate": 0.00019887744033879253, "loss": 2.6072, "step": 220 }, { "epoch": 0.06120188313486569, "grad_norm": 3.406092643737793, "learning_rate": 0.0001988642200416064, "loss": 2.8637, "step": 221 }, { "epoch": 0.06147881473276101, "grad_norm": 4.163158893585205, "learning_rate": 0.00019885092279684936, "loss": 2.4997, "step": 222 }, { "epoch": 0.06175574633065633, "grad_norm": 3.912731647491455, "learning_rate": 0.00019883754861487095, "loss": 2.8006, "step": 223 }, { "epoch": 0.06203267792855165, "grad_norm": 4.682294845581055, "learning_rate": 0.0001988240975060804, "loss": 2.9797, "step": 224 }, { "epoch": 0.062309609526446964, "grad_norm": 7.165354251861572, "learning_rate": 0.000198810569480947, "loss": 3.9649, "step": 225 }, { "epoch": 0.06258654112434228, "grad_norm": 1.584733247756958, "learning_rate": 0.00019879696454999977, "loss": 2.7378, "step": 226 }, { "epoch": 0.0628634727222376, "grad_norm": 3.165452241897583, "learning_rate": 0.00019878328272382768, "loss": 2.8733, "step": 227 }, { "epoch": 0.06314040432013293, "grad_norm": 2.215864419937134, "learning_rate": 0.0001987695240130795, "loss": 2.8075, "step": 228 }, { "epoch": 0.06341733591802824, "grad_norm": 2.2456507682800293, "learning_rate": 0.00019875568842846382, "loss": 2.7353, "step": 229 }, { "epoch": 0.06369426751592357, "grad_norm": 3.5446226596832275, "learning_rate": 0.0001987417759807491, "loss": 2.8128, "step": 230 }, { "epoch": 0.06397119911381889, "grad_norm": 2.351714611053467, "learning_rate": 0.0001987277866807637, "loss": 2.6989, "step": 231 }, { "epoch": 0.0642481307117142, "grad_norm": 1.9342094659805298, "learning_rate": 0.00019871372053939558, "loss": 2.8764, "step": 232 }, { "epoch": 0.06452506230960953, "grad_norm": 2.296945571899414, "learning_rate": 0.00019869957756759273, "loss": 2.7458, "step": 233 }, { "epoch": 0.06480199390750485, "grad_norm": 4.668132781982422, "learning_rate": 0.0001986853577763628, "loss": 3.1173, "step": 234 }, { "epoch": 0.06507892550540016, "grad_norm": 7.3507256507873535, "learning_rate": 0.0001986710611767733, "loss": 3.1764, "step": 235 }, { "epoch": 0.06535585710329549, "grad_norm": 7.586774826049805, "learning_rate": 0.00019865668777995147, "loss": 3.2926, "step": 236 }, { "epoch": 0.06563278870119081, "grad_norm": 7.294936656951904, "learning_rate": 0.0001986422375970844, "loss": 3.0098, "step": 237 }, { "epoch": 0.06590972029908612, "grad_norm": 3.664259195327759, "learning_rate": 0.00019862771063941886, "loss": 2.5577, "step": 238 }, { "epoch": 0.06618665189698145, "grad_norm": 1.538918375968933, "learning_rate": 0.00019861310691826143, "loss": 2.7598, "step": 239 }, { "epoch": 0.06646358349487677, "grad_norm": 5.194944381713867, "learning_rate": 0.00019859842644497837, "loss": 3.0205, "step": 240 }, { "epoch": 0.06674051509277208, "grad_norm": 3.615199327468872, "learning_rate": 0.00019858366923099574, "loss": 2.8842, "step": 241 }, { "epoch": 0.0670174466906674, "grad_norm": 2.8806331157684326, "learning_rate": 0.00019856883528779934, "loss": 2.8442, "step": 242 }, { "epoch": 0.06729437828856273, "grad_norm": 3.9716224670410156, "learning_rate": 0.00019855392462693466, "loss": 2.935, "step": 243 }, { "epoch": 0.06757130988645804, "grad_norm": 3.705488920211792, "learning_rate": 0.00019853893726000683, "loss": 3.1247, "step": 244 }, { "epoch": 0.06784824148435337, "grad_norm": 4.608498573303223, "learning_rate": 0.00019852387319868085, "loss": 2.9861, "step": 245 }, { "epoch": 0.06812517308224869, "grad_norm": 3.4963815212249756, "learning_rate": 0.00019850873245468126, "loss": 2.5856, "step": 246 }, { "epoch": 0.068402104680144, "grad_norm": 2.8581809997558594, "learning_rate": 0.00019849351503979233, "loss": 2.8976, "step": 247 }, { "epoch": 0.06867903627803933, "grad_norm": 4.240769863128662, "learning_rate": 0.00019847822096585803, "loss": 2.8831, "step": 248 }, { "epoch": 0.06895596787593464, "grad_norm": 13.28221321105957, "learning_rate": 0.00019846285024478202, "loss": 3.1482, "step": 249 }, { "epoch": 0.06923289947382996, "grad_norm": 11.003414154052734, "learning_rate": 0.00019844740288852748, "loss": 4.2067, "step": 250 }, { "epoch": 0.06950983107172529, "grad_norm": 2.4025285243988037, "learning_rate": 0.00019843187890911739, "loss": 2.8043, "step": 251 }, { "epoch": 0.0697867626696206, "grad_norm": 3.831378936767578, "learning_rate": 0.0001984162783186343, "loss": 3.1313, "step": 252 }, { "epoch": 0.07006369426751592, "grad_norm": 4.178756237030029, "learning_rate": 0.0001984006011292204, "loss": 3.4844, "step": 253 }, { "epoch": 0.07034062586541125, "grad_norm": 2.4836204051971436, "learning_rate": 0.00019838484735307748, "loss": 3.1151, "step": 254 }, { "epoch": 0.07061755746330656, "grad_norm": 3.50260591506958, "learning_rate": 0.00019836901700246695, "loss": 2.7573, "step": 255 }, { "epoch": 0.07089448906120188, "grad_norm": 2.2057502269744873, "learning_rate": 0.0001983531100897098, "loss": 2.7927, "step": 256 }, { "epoch": 0.0711714206590972, "grad_norm": 3.3176257610321045, "learning_rate": 0.0001983371266271867, "loss": 3.1583, "step": 257 }, { "epoch": 0.07144835225699252, "grad_norm": 43.778228759765625, "learning_rate": 0.00019832106662733774, "loss": 3.1222, "step": 258 }, { "epoch": 0.07172528385488784, "grad_norm": 2.3279812335968018, "learning_rate": 0.0001983049301026627, "loss": 2.7614, "step": 259 }, { "epoch": 0.07200221545278317, "grad_norm": 2.8496177196502686, "learning_rate": 0.00019828871706572095, "loss": 2.7267, "step": 260 }, { "epoch": 0.07227914705067848, "grad_norm": 2.8000481128692627, "learning_rate": 0.00019827242752913125, "loss": 3.2881, "step": 261 }, { "epoch": 0.0725560786485738, "grad_norm": 3.695194721221924, "learning_rate": 0.0001982560615055721, "loss": 2.9656, "step": 262 }, { "epoch": 0.07283301024646913, "grad_norm": 4.314358711242676, "learning_rate": 0.00019823961900778135, "loss": 2.8443, "step": 263 }, { "epoch": 0.07310994184436444, "grad_norm": 3.7146267890930176, "learning_rate": 0.00019822310004855652, "loss": 3.1693, "step": 264 }, { "epoch": 0.07338687344225976, "grad_norm": 3.826491117477417, "learning_rate": 0.0001982065046407545, "loss": 3.0863, "step": 265 }, { "epoch": 0.07366380504015509, "grad_norm": 4.198593616485596, "learning_rate": 0.0001981898327972918, "loss": 2.846, "step": 266 }, { "epoch": 0.0739407366380504, "grad_norm": 3.1046805381774902, "learning_rate": 0.0001981730845311444, "loss": 2.6716, "step": 267 }, { "epoch": 0.07421766823594572, "grad_norm": 2.8134427070617676, "learning_rate": 0.00019815625985534772, "loss": 2.8223, "step": 268 }, { "epoch": 0.07449459983384105, "grad_norm": 2.7822787761688232, "learning_rate": 0.00019813935878299662, "loss": 2.6138, "step": 269 }, { "epoch": 0.07477153143173636, "grad_norm": 4.507596969604492, "learning_rate": 0.00019812238132724556, "loss": 2.6792, "step": 270 }, { "epoch": 0.07504846302963168, "grad_norm": 3.206634283065796, "learning_rate": 0.0001981053275013083, "loss": 2.8331, "step": 271 }, { "epoch": 0.075325394627527, "grad_norm": 1.985059380531311, "learning_rate": 0.00019808819731845809, "loss": 2.4029, "step": 272 }, { "epoch": 0.07560232622542232, "grad_norm": 4.91208028793335, "learning_rate": 0.00019807099079202764, "loss": 2.7546, "step": 273 }, { "epoch": 0.07587925782331764, "grad_norm": 6.044703960418701, "learning_rate": 0.0001980537079354091, "loss": 3.9813, "step": 274 }, { "epoch": 0.07615618942121297, "grad_norm": 7.671187400817871, "learning_rate": 0.0001980363487620539, "loss": 4.2854, "step": 275 }, { "epoch": 0.07643312101910828, "grad_norm": 4.184301853179932, "learning_rate": 0.00019801891328547302, "loss": 3.0334, "step": 276 }, { "epoch": 0.0767100526170036, "grad_norm": 3.0849671363830566, "learning_rate": 0.00019800140151923675, "loss": 2.56, "step": 277 }, { "epoch": 0.07698698421489893, "grad_norm": 2.949519395828247, "learning_rate": 0.0001979838134769748, "loss": 3.0188, "step": 278 }, { "epoch": 0.07726391581279424, "grad_norm": 2.7396950721740723, "learning_rate": 0.00019796614917237616, "loss": 2.8281, "step": 279 }, { "epoch": 0.07754084741068956, "grad_norm": 1.4398963451385498, "learning_rate": 0.00019794840861918925, "loss": 2.6675, "step": 280 }, { "epoch": 0.07781777900858489, "grad_norm": 1.690563440322876, "learning_rate": 0.00019793059183122186, "loss": 2.9267, "step": 281 }, { "epoch": 0.0780947106064802, "grad_norm": 2.1529276371002197, "learning_rate": 0.00019791269882234102, "loss": 2.9394, "step": 282 }, { "epoch": 0.07837164220437552, "grad_norm": 2.6295652389526367, "learning_rate": 0.0001978947296064732, "loss": 2.9206, "step": 283 }, { "epoch": 0.07864857380227083, "grad_norm": 2.713244676589966, "learning_rate": 0.00019787668419760408, "loss": 2.7909, "step": 284 }, { "epoch": 0.07892550540016616, "grad_norm": 2.071441173553467, "learning_rate": 0.0001978585626097787, "loss": 2.9111, "step": 285 }, { "epoch": 0.07920243699806148, "grad_norm": 2.382073163986206, "learning_rate": 0.00019784036485710136, "loss": 2.6643, "step": 286 }, { "epoch": 0.07947936859595679, "grad_norm": 2.6257436275482178, "learning_rate": 0.00019782209095373565, "loss": 2.7276, "step": 287 }, { "epoch": 0.07975630019385212, "grad_norm": 2.6829757690429688, "learning_rate": 0.00019780374091390448, "loss": 2.3726, "step": 288 }, { "epoch": 0.08003323179174744, "grad_norm": 4.82999849319458, "learning_rate": 0.00019778531475188996, "loss": 3.1286, "step": 289 }, { "epoch": 0.08031016338964275, "grad_norm": 4.4267401695251465, "learning_rate": 0.0001977668124820334, "loss": 3.0422, "step": 290 }, { "epoch": 0.08058709498753808, "grad_norm": 6.372005462646484, "learning_rate": 0.00019774823411873547, "loss": 3.2041, "step": 291 }, { "epoch": 0.0808640265854334, "grad_norm": 3.2059247493743896, "learning_rate": 0.000197729579676456, "loss": 3.0659, "step": 292 }, { "epoch": 0.08114095818332871, "grad_norm": 4.272333145141602, "learning_rate": 0.000197710849169714, "loss": 3.1635, "step": 293 }, { "epoch": 0.08141788978122404, "grad_norm": 3.1921799182891846, "learning_rate": 0.00019769204261308774, "loss": 2.8431, "step": 294 }, { "epoch": 0.08169482137911936, "grad_norm": 2.311687469482422, "learning_rate": 0.00019767316002121467, "loss": 2.7492, "step": 295 }, { "epoch": 0.08197175297701467, "grad_norm": 5.720884799957275, "learning_rate": 0.00019765420140879135, "loss": 3.0525, "step": 296 }, { "epoch": 0.08224868457491, "grad_norm": 4.252990245819092, "learning_rate": 0.00019763516679057364, "loss": 2.785, "step": 297 }, { "epoch": 0.08252561617280532, "grad_norm": 3.961559772491455, "learning_rate": 0.00019761605618137643, "loss": 3.0525, "step": 298 }, { "epoch": 0.08280254777070063, "grad_norm": 5.757613182067871, "learning_rate": 0.00019759686959607383, "loss": 3.1933, "step": 299 }, { "epoch": 0.08307947936859596, "grad_norm": 6.132602691650391, "learning_rate": 0.00019757760704959903, "loss": 3.8208, "step": 300 }, { "epoch": 0.08335641096649128, "grad_norm": 4.055561065673828, "learning_rate": 0.00019755826855694442, "loss": 2.9217, "step": 301 }, { "epoch": 0.08363334256438659, "grad_norm": 3.2529594898223877, "learning_rate": 0.00019753885413316142, "loss": 3.1784, "step": 302 }, { "epoch": 0.08391027416228192, "grad_norm": 2.9904534816741943, "learning_rate": 0.0001975193637933606, "loss": 2.9318, "step": 303 }, { "epoch": 0.08418720576017724, "grad_norm": 3.231020927429199, "learning_rate": 0.00019749979755271155, "loss": 3.032, "step": 304 }, { "epoch": 0.08446413735807255, "grad_norm": 1.5338724851608276, "learning_rate": 0.00019748015542644302, "loss": 2.8338, "step": 305 }, { "epoch": 0.08474106895596788, "grad_norm": 2.9697089195251465, "learning_rate": 0.0001974604374298428, "loss": 2.7904, "step": 306 }, { "epoch": 0.0850180005538632, "grad_norm": 1.9385228157043457, "learning_rate": 0.00019744064357825767, "loss": 2.7301, "step": 307 }, { "epoch": 0.08529493215175851, "grad_norm": 2.643333911895752, "learning_rate": 0.00019742077388709353, "loss": 2.6874, "step": 308 }, { "epoch": 0.08557186374965384, "grad_norm": 2.934795379638672, "learning_rate": 0.00019740082837181526, "loss": 3.0575, "step": 309 }, { "epoch": 0.08584879534754916, "grad_norm": 2.615273952484131, "learning_rate": 0.00019738080704794677, "loss": 2.8407, "step": 310 }, { "epoch": 0.08612572694544447, "grad_norm": 2.9731595516204834, "learning_rate": 0.00019736070993107093, "loss": 2.8984, "step": 311 }, { "epoch": 0.0864026585433398, "grad_norm": 2.208294153213501, "learning_rate": 0.00019734053703682972, "loss": 2.7479, "step": 312 }, { "epoch": 0.08667959014123512, "grad_norm": 4.021787643432617, "learning_rate": 0.00019732028838092397, "loss": 3.1724, "step": 313 }, { "epoch": 0.08695652173913043, "grad_norm": 3.455578327178955, "learning_rate": 0.00019729996397911356, "loss": 3.1545, "step": 314 }, { "epoch": 0.08723345333702576, "grad_norm": 3.4853105545043945, "learning_rate": 0.00019727956384721723, "loss": 3.0364, "step": 315 }, { "epoch": 0.08751038493492107, "grad_norm": 1.7653639316558838, "learning_rate": 0.00019725908800111275, "loss": 2.8587, "step": 316 }, { "epoch": 0.08778731653281639, "grad_norm": 3.85060715675354, "learning_rate": 0.0001972385364567368, "loss": 2.8053, "step": 317 }, { "epoch": 0.08806424813071172, "grad_norm": 3.2734038829803467, "learning_rate": 0.00019721790923008499, "loss": 2.9927, "step": 318 }, { "epoch": 0.08834117972860703, "grad_norm": 3.687723159790039, "learning_rate": 0.00019719720633721178, "loss": 2.7902, "step": 319 }, { "epoch": 0.08861811132650235, "grad_norm": 3.100510358810425, "learning_rate": 0.00019717642779423057, "loss": 2.9339, "step": 320 }, { "epoch": 0.08889504292439768, "grad_norm": 2.6783623695373535, "learning_rate": 0.00019715557361731358, "loss": 2.6569, "step": 321 }, { "epoch": 0.08917197452229299, "grad_norm": 4.09005880355835, "learning_rate": 0.00019713464382269203, "loss": 2.9551, "step": 322 }, { "epoch": 0.08944890612018831, "grad_norm": 3.5520811080932617, "learning_rate": 0.00019711363842665587, "loss": 2.8597, "step": 323 }, { "epoch": 0.08972583771808364, "grad_norm": 4.7246479988098145, "learning_rate": 0.00019709255744555389, "loss": 2.9633, "step": 324 }, { "epoch": 0.09000276931597895, "grad_norm": 11.66451644897461, "learning_rate": 0.00019707140089579377, "loss": 4.6186, "step": 325 }, { "epoch": 0.09027970091387427, "grad_norm": 3.7271742820739746, "learning_rate": 0.00019705016879384201, "loss": 2.6191, "step": 326 }, { "epoch": 0.0905566325117696, "grad_norm": 4.173647880554199, "learning_rate": 0.00019702886115622388, "loss": 2.7914, "step": 327 }, { "epoch": 0.09083356410966491, "grad_norm": 2.0531363487243652, "learning_rate": 0.00019700747799952342, "loss": 2.8658, "step": 328 }, { "epoch": 0.09111049570756023, "grad_norm": 2.2616448402404785, "learning_rate": 0.0001969860193403835, "loss": 2.5976, "step": 329 }, { "epoch": 0.09138742730545556, "grad_norm": 2.0705738067626953, "learning_rate": 0.0001969644851955057, "loss": 2.6588, "step": 330 }, { "epoch": 0.09166435890335087, "grad_norm": 2.060875177383423, "learning_rate": 0.00019694287558165042, "loss": 2.6176, "step": 331 }, { "epoch": 0.09194129050124619, "grad_norm": 2.033216953277588, "learning_rate": 0.00019692119051563676, "loss": 2.7997, "step": 332 }, { "epoch": 0.09221822209914152, "grad_norm": 1.9967268705368042, "learning_rate": 0.00019689943001434252, "loss": 2.5866, "step": 333 }, { "epoch": 0.09249515369703683, "grad_norm": 2.241673707962036, "learning_rate": 0.00019687759409470426, "loss": 2.7305, "step": 334 }, { "epoch": 0.09277208529493215, "grad_norm": 3.0635623931884766, "learning_rate": 0.0001968556827737172, "loss": 3.0122, "step": 335 }, { "epoch": 0.09304901689282748, "grad_norm": 2.500697374343872, "learning_rate": 0.00019683369606843528, "loss": 2.7502, "step": 336 }, { "epoch": 0.09332594849072279, "grad_norm": 2.314696788787842, "learning_rate": 0.0001968116339959711, "loss": 3.0437, "step": 337 }, { "epoch": 0.09360288008861811, "grad_norm": 2.801692247390747, "learning_rate": 0.00019678949657349587, "loss": 2.8603, "step": 338 }, { "epoch": 0.09387981168651344, "grad_norm": 2.075801134109497, "learning_rate": 0.00019676728381823956, "loss": 2.9299, "step": 339 }, { "epoch": 0.09415674328440875, "grad_norm": 1.7948641777038574, "learning_rate": 0.00019674499574749067, "loss": 2.646, "step": 340 }, { "epoch": 0.09443367488230407, "grad_norm": 2.968350887298584, "learning_rate": 0.00019672263237859638, "loss": 2.7008, "step": 341 }, { "epoch": 0.0947106064801994, "grad_norm": 4.761722087860107, "learning_rate": 0.0001967001937289624, "loss": 3.13, "step": 342 }, { "epoch": 0.09498753807809471, "grad_norm": 3.11910343170166, "learning_rate": 0.00019667767981605314, "loss": 2.8321, "step": 343 }, { "epoch": 0.09526446967599003, "grad_norm": 4.676784515380859, "learning_rate": 0.00019665509065739149, "loss": 3.0559, "step": 344 }, { "epoch": 0.09554140127388536, "grad_norm": 2.8339955806732178, "learning_rate": 0.00019663242627055897, "loss": 2.7198, "step": 345 }, { "epoch": 0.09581833287178067, "grad_norm": 3.6793911457061768, "learning_rate": 0.00019660968667319561, "loss": 2.7427, "step": 346 }, { "epoch": 0.09609526446967599, "grad_norm": 1.839174747467041, "learning_rate": 0.000196586871883, "loss": 2.5297, "step": 347 }, { "epoch": 0.0963721960675713, "grad_norm": 3.2524006366729736, "learning_rate": 0.00019656398191772928, "loss": 2.6822, "step": 348 }, { "epoch": 0.09664912766546663, "grad_norm": 4.079854488372803, "learning_rate": 0.000196541016795199, "loss": 3.0894, "step": 349 }, { "epoch": 0.09692605926336195, "grad_norm": 5.71146297454834, "learning_rate": 0.00019651797653328333, "loss": 3.7383, "step": 350 }, { "epoch": 0.09720299086125726, "grad_norm": 2.944096088409424, "learning_rate": 0.00019649486114991485, "loss": 2.6763, "step": 351 }, { "epoch": 0.09747992245915259, "grad_norm": 3.183020830154419, "learning_rate": 0.0001964716706630846, "loss": 2.8159, "step": 352 }, { "epoch": 0.09775685405704791, "grad_norm": 2.9949750900268555, "learning_rate": 0.00019644840509084216, "loss": 3.2972, "step": 353 }, { "epoch": 0.09803378565494322, "grad_norm": 2.2809813022613525, "learning_rate": 0.00019642506445129545, "loss": 2.8126, "step": 354 }, { "epoch": 0.09831071725283855, "grad_norm": 1.4152350425720215, "learning_rate": 0.00019640164876261085, "loss": 2.6471, "step": 355 }, { "epoch": 0.09858764885073387, "grad_norm": 2.2027335166931152, "learning_rate": 0.00019637815804301315, "loss": 2.818, "step": 356 }, { "epoch": 0.09886458044862918, "grad_norm": 4.638129711151123, "learning_rate": 0.00019635459231078558, "loss": 2.8069, "step": 357 }, { "epoch": 0.09914151204652451, "grad_norm": 5.524168491363525, "learning_rate": 0.00019633095158426967, "loss": 3.0269, "step": 358 }, { "epoch": 0.09941844364441983, "grad_norm": 5.6533708572387695, "learning_rate": 0.00019630723588186545, "loss": 3.221, "step": 359 }, { "epoch": 0.09969537524231514, "grad_norm": 5.9534077644348145, "learning_rate": 0.00019628344522203115, "loss": 3.6454, "step": 360 }, { "epoch": 0.09997230684021047, "grad_norm": 2.3247969150543213, "learning_rate": 0.00019625957962328343, "loss": 3.2326, "step": 361 }, { "epoch": 0.10024923843810579, "grad_norm": 2.465853452682495, "learning_rate": 0.00019623563910419725, "loss": 2.7036, "step": 362 }, { "epoch": 0.1005261700360011, "grad_norm": 2.02891206741333, "learning_rate": 0.0001962116236834059, "loss": 2.7621, "step": 363 }, { "epoch": 0.10080310163389643, "grad_norm": 2.185584306716919, "learning_rate": 0.000196187533379601, "loss": 2.9568, "step": 364 }, { "epoch": 0.10108003323179175, "grad_norm": 1.705273985862732, "learning_rate": 0.00019616336821153235, "loss": 2.6296, "step": 365 }, { "epoch": 0.10135696482968706, "grad_norm": 3.615710973739624, "learning_rate": 0.0001961391281980081, "loss": 3.4777, "step": 366 }, { "epoch": 0.10163389642758239, "grad_norm": 5.6888957023620605, "learning_rate": 0.00019611481335789463, "loss": 3.3798, "step": 367 }, { "epoch": 0.10191082802547771, "grad_norm": 2.6681437492370605, "learning_rate": 0.00019609042371011655, "loss": 3.1311, "step": 368 }, { "epoch": 0.10218775962337302, "grad_norm": 2.4236390590667725, "learning_rate": 0.00019606595927365675, "loss": 2.7262, "step": 369 }, { "epoch": 0.10246469122126835, "grad_norm": 2.5280492305755615, "learning_rate": 0.00019604142006755625, "loss": 2.7093, "step": 370 }, { "epoch": 0.10274162281916367, "grad_norm": 2.658708333969116, "learning_rate": 0.0001960168061109143, "loss": 3.0635, "step": 371 }, { "epoch": 0.10301855441705898, "grad_norm": 6.890584468841553, "learning_rate": 0.0001959921174228883, "loss": 3.1091, "step": 372 }, { "epoch": 0.10329548601495431, "grad_norm": 3.1579232215881348, "learning_rate": 0.00019596735402269388, "loss": 3.1025, "step": 373 }, { "epoch": 0.10357241761284963, "grad_norm": 3.9812111854553223, "learning_rate": 0.00019594251592960479, "loss": 2.9105, "step": 374 }, { "epoch": 0.10384934921074494, "grad_norm": 6.3323187828063965, "learning_rate": 0.00019591760316295292, "loss": 4.0633, "step": 375 }, { "epoch": 0.10412628080864027, "grad_norm": 22.30817985534668, "learning_rate": 0.00019589261574212818, "loss": 3.3351, "step": 376 }, { "epoch": 0.10440321240653559, "grad_norm": 5.525020122528076, "learning_rate": 0.00019586755368657878, "loss": 2.7819, "step": 377 }, { "epoch": 0.1046801440044309, "grad_norm": 3.135986328125, "learning_rate": 0.0001958424170158108, "loss": 2.7154, "step": 378 }, { "epoch": 0.10495707560232623, "grad_norm": 5.768637180328369, "learning_rate": 0.0001958172057493886, "loss": 3.3564, "step": 379 }, { "epoch": 0.10523400720022155, "grad_norm": 2.8191492557525635, "learning_rate": 0.0001957919199069345, "loss": 3.0567, "step": 380 }, { "epoch": 0.10551093879811686, "grad_norm": 2.9175446033477783, "learning_rate": 0.0001957665595081288, "loss": 2.9957, "step": 381 }, { "epoch": 0.10578787039601219, "grad_norm": 2.604487180709839, "learning_rate": 0.00019574112457270993, "loss": 2.7528, "step": 382 }, { "epoch": 0.1060648019939075, "grad_norm": 2.066667079925537, "learning_rate": 0.00019571561512047428, "loss": 3.0062, "step": 383 }, { "epoch": 0.10634173359180282, "grad_norm": 3.283092975616455, "learning_rate": 0.0001956900311712763, "loss": 2.9717, "step": 384 }, { "epoch": 0.10661866518969815, "grad_norm": 1.97118079662323, "learning_rate": 0.00019566437274502833, "loss": 2.6684, "step": 385 }, { "epoch": 0.10689559678759346, "grad_norm": 1.9094443321228027, "learning_rate": 0.00019563863986170077, "loss": 2.8776, "step": 386 }, { "epoch": 0.10717252838548878, "grad_norm": 2.3373727798461914, "learning_rate": 0.0001956128325413219, "loss": 2.6434, "step": 387 }, { "epoch": 0.10744945998338411, "grad_norm": 5.657153129577637, "learning_rate": 0.00019558695080397796, "loss": 2.9741, "step": 388 }, { "epoch": 0.10772639158127942, "grad_norm": 3.7068936824798584, "learning_rate": 0.0001955609946698131, "loss": 2.8159, "step": 389 }, { "epoch": 0.10800332317917474, "grad_norm": 2.457514524459839, "learning_rate": 0.00019553496415902945, "loss": 3.0508, "step": 390 }, { "epoch": 0.10828025477707007, "grad_norm": 3.0572688579559326, "learning_rate": 0.00019550885929188686, "loss": 2.9603, "step": 391 }, { "epoch": 0.10855718637496538, "grad_norm": 3.693903684616089, "learning_rate": 0.00019548268008870329, "loss": 2.9386, "step": 392 }, { "epoch": 0.1088341179728607, "grad_norm": 5.157990455627441, "learning_rate": 0.00019545642656985428, "loss": 2.8769, "step": 393 }, { "epoch": 0.10911104957075603, "grad_norm": 4.02393913269043, "learning_rate": 0.00019543009875577346, "loss": 3.116, "step": 394 }, { "epoch": 0.10938798116865134, "grad_norm": 2.50396728515625, "learning_rate": 0.00019540369666695213, "loss": 2.7121, "step": 395 }, { "epoch": 0.10966491276654666, "grad_norm": 2.8286375999450684, "learning_rate": 0.00019537722032393945, "loss": 2.7944, "step": 396 }, { "epoch": 0.10994184436444199, "grad_norm": 2.6441237926483154, "learning_rate": 0.0001953506697473424, "loss": 2.7884, "step": 397 }, { "epoch": 0.1102187759623373, "grad_norm": 1.8171393871307373, "learning_rate": 0.0001953240449578257, "loss": 2.9018, "step": 398 }, { "epoch": 0.11049570756023262, "grad_norm": 5.175047874450684, "learning_rate": 0.0001952973459761118, "loss": 3.4427, "step": 399 }, { "epoch": 0.11077263915812795, "grad_norm": 7.9577860832214355, "learning_rate": 0.00019527057282298102, "loss": 4.3943, "step": 400 }, { "epoch": 0.11104957075602326, "grad_norm": 3.050788402557373, "learning_rate": 0.00019524372551927126, "loss": 2.8492, "step": 401 }, { "epoch": 0.11132650235391858, "grad_norm": 2.6500747203826904, "learning_rate": 0.0001952168040858782, "loss": 3.1055, "step": 402 }, { "epoch": 0.11160343395181391, "grad_norm": 2.206071615219116, "learning_rate": 0.00019518980854375521, "loss": 2.9417, "step": 403 }, { "epoch": 0.11188036554970922, "grad_norm": 2.746514081954956, "learning_rate": 0.0001951627389139134, "loss": 2.7294, "step": 404 }, { "epoch": 0.11215729714760454, "grad_norm": 2.190720319747925, "learning_rate": 0.00019513559521742142, "loss": 2.7132, "step": 405 }, { "epoch": 0.11243422874549987, "grad_norm": 1.882227897644043, "learning_rate": 0.00019510837747540566, "loss": 2.7519, "step": 406 }, { "epoch": 0.11271116034339518, "grad_norm": 3.664625644683838, "learning_rate": 0.00019508108570905013, "loss": 2.953, "step": 407 }, { "epoch": 0.1129880919412905, "grad_norm": 2.6911027431488037, "learning_rate": 0.0001950537199395964, "loss": 2.7795, "step": 408 }, { "epoch": 0.11326502353918583, "grad_norm": 2.9484004974365234, "learning_rate": 0.00019502628018834372, "loss": 3.057, "step": 409 }, { "epoch": 0.11354195513708114, "grad_norm": 2.3461923599243164, "learning_rate": 0.00019499876647664885, "loss": 2.8541, "step": 410 }, { "epoch": 0.11381888673497646, "grad_norm": 2.187255620956421, "learning_rate": 0.00019497117882592618, "loss": 2.6141, "step": 411 }, { "epoch": 0.11409581833287179, "grad_norm": 2.4059369564056396, "learning_rate": 0.00019494351725764752, "loss": 2.8761, "step": 412 }, { "epoch": 0.1143727499307671, "grad_norm": 4.065573692321777, "learning_rate": 0.00019491578179334243, "loss": 3.5152, "step": 413 }, { "epoch": 0.11464968152866242, "grad_norm": 2.2918248176574707, "learning_rate": 0.00019488797245459773, "loss": 2.8289, "step": 414 }, { "epoch": 0.11492661312655773, "grad_norm": 2.248016834259033, "learning_rate": 0.00019486008926305798, "loss": 2.8703, "step": 415 }, { "epoch": 0.11520354472445306, "grad_norm": 3.0664451122283936, "learning_rate": 0.000194832132240425, "loss": 2.7859, "step": 416 }, { "epoch": 0.11548047632234838, "grad_norm": 3.346709966659546, "learning_rate": 0.00019480410140845826, "loss": 2.8393, "step": 417 }, { "epoch": 0.1157574079202437, "grad_norm": 3.573076009750366, "learning_rate": 0.00019477599678897455, "loss": 3.0581, "step": 418 }, { "epoch": 0.11603433951813902, "grad_norm": 1.8886693716049194, "learning_rate": 0.00019474781840384816, "loss": 2.6688, "step": 419 }, { "epoch": 0.11631127111603434, "grad_norm": 3.444385051727295, "learning_rate": 0.00019471956627501076, "loss": 2.8626, "step": 420 }, { "epoch": 0.11658820271392965, "grad_norm": 2.1609256267547607, "learning_rate": 0.00019469124042445138, "loss": 2.8123, "step": 421 }, { "epoch": 0.11686513431182498, "grad_norm": 3.179786443710327, "learning_rate": 0.00019466284087421654, "loss": 2.8059, "step": 422 }, { "epoch": 0.1171420659097203, "grad_norm": 2.9083259105682373, "learning_rate": 0.00019463436764641007, "loss": 2.6698, "step": 423 }, { "epoch": 0.11741899750761561, "grad_norm": 6.696518898010254, "learning_rate": 0.00019460582076319302, "loss": 3.1366, "step": 424 }, { "epoch": 0.11769592910551094, "grad_norm": 4.36393928527832, "learning_rate": 0.000194577200246784, "loss": 3.3773, "step": 425 }, { "epoch": 0.11797286070340626, "grad_norm": 3.4995036125183105, "learning_rate": 0.00019454850611945872, "loss": 2.8375, "step": 426 }, { "epoch": 0.11824979230130157, "grad_norm": 2.442023277282715, "learning_rate": 0.0001945197384035503, "loss": 2.6869, "step": 427 }, { "epoch": 0.1185267238991969, "grad_norm": 3.9752767086029053, "learning_rate": 0.0001944908971214491, "loss": 3.051, "step": 428 }, { "epoch": 0.11880365549709222, "grad_norm": 2.235665798187256, "learning_rate": 0.00019446198229560276, "loss": 3.1079, "step": 429 }, { "epoch": 0.11908058709498753, "grad_norm": 1.734859824180603, "learning_rate": 0.00019443299394851616, "loss": 2.6128, "step": 430 }, { "epoch": 0.11935751869288286, "grad_norm": 2.163276433944702, "learning_rate": 0.0001944039321027513, "loss": 2.8891, "step": 431 }, { "epoch": 0.11963445029077818, "grad_norm": 2.1512417793273926, "learning_rate": 0.00019437479678092754, "loss": 3.2543, "step": 432 }, { "epoch": 0.1199113818886735, "grad_norm": 2.4014272689819336, "learning_rate": 0.0001943455880057213, "loss": 2.9538, "step": 433 }, { "epoch": 0.12018831348656882, "grad_norm": 2.0314533710479736, "learning_rate": 0.00019431630579986632, "loss": 2.7067, "step": 434 }, { "epoch": 0.12046524508446414, "grad_norm": 3.206453323364258, "learning_rate": 0.00019428695018615334, "loss": 3.1961, "step": 435 }, { "epoch": 0.12074217668235945, "grad_norm": 1.3990651369094849, "learning_rate": 0.00019425752118743027, "loss": 2.9877, "step": 436 }, { "epoch": 0.12101910828025478, "grad_norm": 2.4721100330352783, "learning_rate": 0.0001942280188266022, "loss": 3.0155, "step": 437 }, { "epoch": 0.1212960398781501, "grad_norm": 1.831649661064148, "learning_rate": 0.00019419844312663128, "loss": 2.8572, "step": 438 }, { "epoch": 0.12157297147604541, "grad_norm": 2.862865924835205, "learning_rate": 0.00019416879411053673, "loss": 2.9184, "step": 439 }, { "epoch": 0.12184990307394074, "grad_norm": 3.795285224914551, "learning_rate": 0.00019413907180139482, "loss": 3.9096, "step": 440 }, { "epoch": 0.12212683467183606, "grad_norm": 2.3346879482269287, "learning_rate": 0.00019410927622233893, "loss": 2.732, "step": 441 }, { "epoch": 0.12240376626973137, "grad_norm": 6.56414270401001, "learning_rate": 0.0001940794073965594, "loss": 2.9084, "step": 442 }, { "epoch": 0.1226806978676267, "grad_norm": 3.394613742828369, "learning_rate": 0.00019404946534730364, "loss": 3.0567, "step": 443 }, { "epoch": 0.12295762946552202, "grad_norm": 2.5969808101654053, "learning_rate": 0.00019401945009787594, "loss": 2.7193, "step": 444 }, { "epoch": 0.12323456106341733, "grad_norm": 2.7926831245422363, "learning_rate": 0.00019398936167163772, "loss": 2.4651, "step": 445 }, { "epoch": 0.12351149266131266, "grad_norm": 3.630769729614258, "learning_rate": 0.00019395920009200723, "loss": 2.8965, "step": 446 }, { "epoch": 0.12378842425920798, "grad_norm": 1.7607187032699585, "learning_rate": 0.0001939289653824597, "loss": 2.5714, "step": 447 }, { "epoch": 0.1240653558571033, "grad_norm": 4.170645236968994, "learning_rate": 0.00019389865756652732, "loss": 2.9125, "step": 448 }, { "epoch": 0.12434228745499862, "grad_norm": 5.252502918243408, "learning_rate": 0.0001938682766677991, "loss": 3.238, "step": 449 }, { "epoch": 0.12461921905289393, "grad_norm": 9.120099067687988, "learning_rate": 0.00019383782270992098, "loss": 4.1508, "step": 450 }, { "epoch": 0.12489615065078925, "grad_norm": 2.3955490589141846, "learning_rate": 0.00019380729571659574, "loss": 3.0882, "step": 451 }, { "epoch": 0.12517308224868456, "grad_norm": 2.8864545822143555, "learning_rate": 0.000193776695711583, "loss": 2.94, "step": 452 }, { "epoch": 0.1254500138465799, "grad_norm": 4.40331506729126, "learning_rate": 0.00019374602271869925, "loss": 3.0988, "step": 453 }, { "epoch": 0.1257269454444752, "grad_norm": 2.07773494720459, "learning_rate": 0.00019371527676181777, "loss": 2.8605, "step": 454 }, { "epoch": 0.12600387704237054, "grad_norm": 2.9056828022003174, "learning_rate": 0.00019368445786486862, "loss": 3.1449, "step": 455 }, { "epoch": 0.12628080864026586, "grad_norm": 2.314566135406494, "learning_rate": 0.00019365356605183863, "loss": 2.7058, "step": 456 }, { "epoch": 0.1265577402381612, "grad_norm": 3.2491626739501953, "learning_rate": 0.00019362260134677135, "loss": 2.8069, "step": 457 }, { "epoch": 0.12683467183605648, "grad_norm": 3.122917890548706, "learning_rate": 0.00019359156377376713, "loss": 2.9945, "step": 458 }, { "epoch": 0.1271116034339518, "grad_norm": 2.2590036392211914, "learning_rate": 0.00019356045335698296, "loss": 2.707, "step": 459 }, { "epoch": 0.12738853503184713, "grad_norm": 3.042433023452759, "learning_rate": 0.0001935292701206326, "loss": 2.9189, "step": 460 }, { "epoch": 0.12766546662974246, "grad_norm": 2.72328519821167, "learning_rate": 0.00019349801408898647, "loss": 3.2472, "step": 461 }, { "epoch": 0.12794239822763778, "grad_norm": 2.6480560302734375, "learning_rate": 0.00019346668528637156, "loss": 2.9503, "step": 462 }, { "epoch": 0.1282193298255331, "grad_norm": 3.1094019412994385, "learning_rate": 0.0001934352837371716, "loss": 3.017, "step": 463 }, { "epoch": 0.1284962614234284, "grad_norm": 5.252288341522217, "learning_rate": 0.00019340380946582695, "loss": 3.051, "step": 464 }, { "epoch": 0.12877319302132373, "grad_norm": 2.4221389293670654, "learning_rate": 0.0001933722624968345, "loss": 2.9024, "step": 465 }, { "epoch": 0.12905012461921905, "grad_norm": 2.4375619888305664, "learning_rate": 0.00019334064285474772, "loss": 2.7439, "step": 466 }, { "epoch": 0.12932705621711438, "grad_norm": 5.243897914886475, "learning_rate": 0.00019330895056417671, "loss": 3.147, "step": 467 }, { "epoch": 0.1296039878150097, "grad_norm": 3.8889989852905273, "learning_rate": 0.00019327718564978805, "loss": 2.8308, "step": 468 }, { "epoch": 0.129880919412905, "grad_norm": 3.177091360092163, "learning_rate": 0.00019324534813630487, "loss": 2.7943, "step": 469 }, { "epoch": 0.13015785101080032, "grad_norm": 3.306352376937866, "learning_rate": 0.00019321343804850683, "loss": 2.539, "step": 470 }, { "epoch": 0.13043478260869565, "grad_norm": 1.4211779832839966, "learning_rate": 0.00019318145541123005, "loss": 2.9397, "step": 471 }, { "epoch": 0.13071171420659097, "grad_norm": 2.220968723297119, "learning_rate": 0.0001931494002493671, "loss": 2.9446, "step": 472 }, { "epoch": 0.1309886458044863, "grad_norm": 4.130578994750977, "learning_rate": 0.00019311727258786702, "loss": 3.1133, "step": 473 }, { "epoch": 0.13126557740238162, "grad_norm": 3.1133687496185303, "learning_rate": 0.00019308507245173527, "loss": 3.1137, "step": 474 }, { "epoch": 0.13154250900027692, "grad_norm": 8.326321601867676, "learning_rate": 0.00019305279986603374, "loss": 4.4815, "step": 475 }, { "epoch": 0.13181944059817224, "grad_norm": 1.7736440896987915, "learning_rate": 0.00019302045485588068, "loss": 2.7976, "step": 476 }, { "epoch": 0.13209637219606757, "grad_norm": 2.2228281497955322, "learning_rate": 0.00019298803744645067, "loss": 2.8211, "step": 477 }, { "epoch": 0.1323733037939629, "grad_norm": 2.290677070617676, "learning_rate": 0.00019295554766297476, "loss": 2.7382, "step": 478 }, { "epoch": 0.13265023539185822, "grad_norm": 2.2529354095458984, "learning_rate": 0.0001929229855307402, "loss": 3.0548, "step": 479 }, { "epoch": 0.13292716698975354, "grad_norm": 1.641726016998291, "learning_rate": 0.00019289035107509066, "loss": 2.9073, "step": 480 }, { "epoch": 0.13320409858764884, "grad_norm": 2.6638753414154053, "learning_rate": 0.00019285764432142602, "loss": 2.8904, "step": 481 }, { "epoch": 0.13348103018554416, "grad_norm": 2.0153493881225586, "learning_rate": 0.00019282486529520243, "loss": 2.813, "step": 482 }, { "epoch": 0.1337579617834395, "grad_norm": 2.089580535888672, "learning_rate": 0.00019279201402193236, "loss": 3.0678, "step": 483 }, { "epoch": 0.1340348933813348, "grad_norm": 5.025186061859131, "learning_rate": 0.00019275909052718447, "loss": 2.6991, "step": 484 }, { "epoch": 0.13431182497923014, "grad_norm": 2.2681591510772705, "learning_rate": 0.00019272609483658362, "loss": 2.9733, "step": 485 }, { "epoch": 0.13458875657712546, "grad_norm": 3.0130293369293213, "learning_rate": 0.00019269302697581087, "loss": 2.8882, "step": 486 }, { "epoch": 0.13486568817502076, "grad_norm": 2.0153720378875732, "learning_rate": 0.00019265988697060346, "loss": 2.6695, "step": 487 }, { "epoch": 0.13514261977291608, "grad_norm": 3.0560152530670166, "learning_rate": 0.00019262667484675475, "loss": 2.8081, "step": 488 }, { "epoch": 0.1354195513708114, "grad_norm": 2.948871374130249, "learning_rate": 0.00019259339063011432, "loss": 2.9432, "step": 489 }, { "epoch": 0.13569648296870673, "grad_norm": 1.9918317794799805, "learning_rate": 0.00019256003434658773, "loss": 2.5663, "step": 490 }, { "epoch": 0.13597341456660206, "grad_norm": 2.153956651687622, "learning_rate": 0.00019252660602213674, "loss": 2.8127, "step": 491 }, { "epoch": 0.13625034616449738, "grad_norm": 2.756939649581909, "learning_rate": 0.00019249310568277909, "loss": 2.7582, "step": 492 }, { "epoch": 0.13652727776239268, "grad_norm": 2.7311301231384277, "learning_rate": 0.0001924595333545887, "loss": 2.3999, "step": 493 }, { "epoch": 0.136804209360288, "grad_norm": 3.7439494132995605, "learning_rate": 0.00019242588906369536, "loss": 2.8778, "step": 494 }, { "epoch": 0.13708114095818333, "grad_norm": 2.8746988773345947, "learning_rate": 0.00019239217283628497, "loss": 2.7263, "step": 495 }, { "epoch": 0.13735807255607865, "grad_norm": 1.9332902431488037, "learning_rate": 0.00019235838469859942, "loss": 2.7315, "step": 496 }, { "epoch": 0.13763500415397398, "grad_norm": 2.4865143299102783, "learning_rate": 0.00019232452467693657, "loss": 2.6248, "step": 497 }, { "epoch": 0.13791193575186927, "grad_norm": 2.1645824909210205, "learning_rate": 0.00019229059279765012, "loss": 2.7823, "step": 498 }, { "epoch": 0.1381888673497646, "grad_norm": 3.73016619682312, "learning_rate": 0.00019225658908714983, "loss": 3.0859, "step": 499 }, { "epoch": 0.13846579894765992, "grad_norm": 9.9523344039917, "learning_rate": 0.00019222251357190132, "loss": 4.6159, "step": 500 }, { "epoch": 0.13874273054555525, "grad_norm": 2.717419385910034, "learning_rate": 0.0001921883662784261, "loss": 2.9515, "step": 501 }, { "epoch": 0.13901966214345057, "grad_norm": 2.07205867767334, "learning_rate": 0.00019215414723330158, "loss": 2.764, "step": 502 }, { "epoch": 0.1392965937413459, "grad_norm": 2.078087091445923, "learning_rate": 0.00019211985646316093, "loss": 2.9914, "step": 503 }, { "epoch": 0.1395735253392412, "grad_norm": 1.61542546749115, "learning_rate": 0.00019208549399469318, "loss": 2.5703, "step": 504 }, { "epoch": 0.13985045693713652, "grad_norm": 1.77694833278656, "learning_rate": 0.0001920510598546432, "loss": 2.773, "step": 505 }, { "epoch": 0.14012738853503184, "grad_norm": 1.663738489151001, "learning_rate": 0.00019201655406981164, "loss": 2.6654, "step": 506 }, { "epoch": 0.14040432013292717, "grad_norm": 1.5260848999023438, "learning_rate": 0.00019198197666705488, "loss": 2.6962, "step": 507 }, { "epoch": 0.1406812517308225, "grad_norm": 1.298994541168213, "learning_rate": 0.00019194732767328505, "loss": 2.9068, "step": 508 }, { "epoch": 0.14095818332871782, "grad_norm": 1.8828246593475342, "learning_rate": 0.00019191260711547001, "loss": 3.1734, "step": 509 }, { "epoch": 0.14123511492661311, "grad_norm": 2.350924491882324, "learning_rate": 0.00019187781502063328, "loss": 2.9027, "step": 510 }, { "epoch": 0.14151204652450844, "grad_norm": 2.7922251224517822, "learning_rate": 0.00019184295141585415, "loss": 2.9525, "step": 511 }, { "epoch": 0.14178897812240376, "grad_norm": 2.086071491241455, "learning_rate": 0.00019180801632826748, "loss": 2.7488, "step": 512 }, { "epoch": 0.1420659097202991, "grad_norm": 4.257290840148926, "learning_rate": 0.00019177300978506377, "loss": 3.0051, "step": 513 }, { "epoch": 0.1423428413181944, "grad_norm": 4.1589508056640625, "learning_rate": 0.0001917379318134892, "loss": 3.1527, "step": 514 }, { "epoch": 0.14261977291608974, "grad_norm": 2.253479242324829, "learning_rate": 0.0001917027824408455, "loss": 2.9246, "step": 515 }, { "epoch": 0.14289670451398503, "grad_norm": 2.4339916706085205, "learning_rate": 0.00019166756169448993, "loss": 2.8065, "step": 516 }, { "epoch": 0.14317363611188036, "grad_norm": 1.9720395803451538, "learning_rate": 0.00019163226960183542, "loss": 3.0954, "step": 517 }, { "epoch": 0.14345056770977568, "grad_norm": 2.3299455642700195, "learning_rate": 0.00019159690619035034, "loss": 2.7443, "step": 518 }, { "epoch": 0.143727499307671, "grad_norm": 2.715243101119995, "learning_rate": 0.00019156147148755855, "loss": 2.7845, "step": 519 }, { "epoch": 0.14400443090556633, "grad_norm": 1.9602571725845337, "learning_rate": 0.00019152596552103948, "loss": 2.6434, "step": 520 }, { "epoch": 0.14428136250346166, "grad_norm": 1.381770372390747, "learning_rate": 0.00019149038831842792, "loss": 2.8227, "step": 521 }, { "epoch": 0.14455829410135695, "grad_norm": 3.3014719486236572, "learning_rate": 0.0001914547399074142, "loss": 3.0247, "step": 522 }, { "epoch": 0.14483522569925228, "grad_norm": 2.787161111831665, "learning_rate": 0.0001914190203157441, "loss": 3.0022, "step": 523 }, { "epoch": 0.1451121572971476, "grad_norm": 5.41040563583374, "learning_rate": 0.0001913832295712186, "loss": 3.4196, "step": 524 }, { "epoch": 0.14538908889504293, "grad_norm": 7.442314624786377, "learning_rate": 0.0001913473677016943, "loss": 4.4476, "step": 525 }, { "epoch": 0.14566602049293825, "grad_norm": 1.430985689163208, "learning_rate": 0.00019131143473508303, "loss": 2.5798, "step": 526 }, { "epoch": 0.14594295209083358, "grad_norm": 1.5545196533203125, "learning_rate": 0.000191275430699352, "loss": 2.9568, "step": 527 }, { "epoch": 0.14621988368872887, "grad_norm": 2.2793681621551514, "learning_rate": 0.00019123935562252365, "loss": 2.9475, "step": 528 }, { "epoch": 0.1464968152866242, "grad_norm": 1.8129557371139526, "learning_rate": 0.00019120320953267586, "loss": 2.8559, "step": 529 }, { "epoch": 0.14677374688451952, "grad_norm": 1.2468783855438232, "learning_rate": 0.0001911669924579416, "loss": 2.596, "step": 530 }, { "epoch": 0.14705067848241485, "grad_norm": 2.115652084350586, "learning_rate": 0.00019113070442650928, "loss": 2.6525, "step": 531 }, { "epoch": 0.14732761008031017, "grad_norm": 2.7340188026428223, "learning_rate": 0.00019109434546662243, "loss": 2.6955, "step": 532 }, { "epoch": 0.14760454167820547, "grad_norm": 1.7179347276687622, "learning_rate": 0.00019105791560657974, "loss": 3.0466, "step": 533 }, { "epoch": 0.1478814732761008, "grad_norm": 1.8532159328460693, "learning_rate": 0.0001910214148747352, "loss": 2.9043, "step": 534 }, { "epoch": 0.14815840487399612, "grad_norm": 2.033792734146118, "learning_rate": 0.00019098484329949786, "loss": 2.8517, "step": 535 }, { "epoch": 0.14843533647189144, "grad_norm": 1.300536036491394, "learning_rate": 0.00019094820090933195, "loss": 2.9107, "step": 536 }, { "epoch": 0.14871226806978677, "grad_norm": 3.2628746032714844, "learning_rate": 0.00019091148773275687, "loss": 2.829, "step": 537 }, { "epoch": 0.1489891996676821, "grad_norm": 1.9415562152862549, "learning_rate": 0.00019087470379834702, "loss": 2.6483, "step": 538 }, { "epoch": 0.1492661312655774, "grad_norm": 3.041368007659912, "learning_rate": 0.0001908378491347319, "loss": 3.1466, "step": 539 }, { "epoch": 0.14954306286347271, "grad_norm": 5.678069114685059, "learning_rate": 0.00019080092377059607, "loss": 2.8426, "step": 540 }, { "epoch": 0.14981999446136804, "grad_norm": 3.12460994720459, "learning_rate": 0.00019076392773467916, "loss": 2.9311, "step": 541 }, { "epoch": 0.15009692605926336, "grad_norm": 2.576343536376953, "learning_rate": 0.00019072686105577576, "loss": 2.8172, "step": 542 }, { "epoch": 0.1503738576571587, "grad_norm": 3.4362986087799072, "learning_rate": 0.0001906897237627354, "loss": 3.1724, "step": 543 }, { "epoch": 0.150650789255054, "grad_norm": 1.4936844110488892, "learning_rate": 0.00019065251588446265, "loss": 2.6596, "step": 544 }, { "epoch": 0.1509277208529493, "grad_norm": 1.8611481189727783, "learning_rate": 0.00019061523744991698, "loss": 2.7357, "step": 545 }, { "epoch": 0.15120465245084463, "grad_norm": 2.7158865928649902, "learning_rate": 0.00019057788848811277, "loss": 3.0875, "step": 546 }, { "epoch": 0.15148158404873996, "grad_norm": 2.2609493732452393, "learning_rate": 0.00019054046902811932, "loss": 2.7561, "step": 547 }, { "epoch": 0.15175851564663528, "grad_norm": 2.341095209121704, "learning_rate": 0.00019050297909906075, "loss": 2.8444, "step": 548 }, { "epoch": 0.1520354472445306, "grad_norm": 3.354525327682495, "learning_rate": 0.0001904654187301161, "loss": 3.2922, "step": 549 }, { "epoch": 0.15231237884242593, "grad_norm": 4.269205570220947, "learning_rate": 0.0001904277879505192, "loss": 3.528, "step": 550 }, { "epoch": 0.15258931044032123, "grad_norm": 3.570624589920044, "learning_rate": 0.00019039008678955864, "loss": 3.1134, "step": 551 }, { "epoch": 0.15286624203821655, "grad_norm": 2.4145169258117676, "learning_rate": 0.00019035231527657782, "loss": 2.7928, "step": 552 }, { "epoch": 0.15314317363611188, "grad_norm": 1.96170175075531, "learning_rate": 0.0001903144734409749, "loss": 2.8577, "step": 553 }, { "epoch": 0.1534201052340072, "grad_norm": 1.7383220195770264, "learning_rate": 0.0001902765613122028, "loss": 2.7327, "step": 554 }, { "epoch": 0.15369703683190253, "grad_norm": 2.1005501747131348, "learning_rate": 0.0001902385789197691, "loss": 2.948, "step": 555 }, { "epoch": 0.15397396842979785, "grad_norm": 2.3060429096221924, "learning_rate": 0.0001902005262932361, "loss": 2.7076, "step": 556 }, { "epoch": 0.15425090002769315, "grad_norm": 2.3842906951904297, "learning_rate": 0.00019016240346222079, "loss": 2.6827, "step": 557 }, { "epoch": 0.15452783162558847, "grad_norm": 2.296273708343506, "learning_rate": 0.00019012421045639473, "loss": 2.811, "step": 558 }, { "epoch": 0.1548047632234838, "grad_norm": 2.9906840324401855, "learning_rate": 0.0001900859473054841, "loss": 2.5892, "step": 559 }, { "epoch": 0.15508169482137912, "grad_norm": 1.905619502067566, "learning_rate": 0.00019004761403926978, "loss": 3.0653, "step": 560 }, { "epoch": 0.15535862641927445, "grad_norm": 3.003749370574951, "learning_rate": 0.00019000921068758708, "loss": 3.0541, "step": 561 }, { "epoch": 0.15563555801716977, "grad_norm": 3.384355068206787, "learning_rate": 0.000189970737280326, "loss": 3.0699, "step": 562 }, { "epoch": 0.15591248961506507, "grad_norm": 1.637637972831726, "learning_rate": 0.00018993219384743097, "loss": 2.6486, "step": 563 }, { "epoch": 0.1561894212129604, "grad_norm": 3.134511947631836, "learning_rate": 0.00018989358041890094, "loss": 3.2672, "step": 564 }, { "epoch": 0.15646635281085572, "grad_norm": 18.818422317504883, "learning_rate": 0.00018985489702478934, "loss": 2.9634, "step": 565 }, { "epoch": 0.15674328440875104, "grad_norm": 2.783501386642456, "learning_rate": 0.00018981614369520405, "loss": 2.824, "step": 566 }, { "epoch": 0.15702021600664637, "grad_norm": 3.167635440826416, "learning_rate": 0.00018977732046030746, "loss": 2.6968, "step": 567 }, { "epoch": 0.15729714760454167, "grad_norm": 3.8831708431243896, "learning_rate": 0.00018973842735031622, "loss": 2.8965, "step": 568 }, { "epoch": 0.157574079202437, "grad_norm": 3.5270354747772217, "learning_rate": 0.00018969946439550148, "loss": 2.5994, "step": 569 }, { "epoch": 0.15785101080033231, "grad_norm": 4.177906036376953, "learning_rate": 0.00018966043162618873, "loss": 2.9521, "step": 570 }, { "epoch": 0.15812794239822764, "grad_norm": 2.400700807571411, "learning_rate": 0.00018962132907275776, "loss": 2.9044, "step": 571 }, { "epoch": 0.15840487399612296, "grad_norm": 3.270749568939209, "learning_rate": 0.00018958215676564275, "loss": 3.0326, "step": 572 }, { "epoch": 0.1586818055940183, "grad_norm": 3.104177713394165, "learning_rate": 0.00018954291473533206, "loss": 2.8663, "step": 573 }, { "epoch": 0.15895873719191358, "grad_norm": 3.4584054946899414, "learning_rate": 0.0001895036030123684, "loss": 2.9584, "step": 574 }, { "epoch": 0.1592356687898089, "grad_norm": 4.127494812011719, "learning_rate": 0.00018946422162734872, "loss": 3.607, "step": 575 }, { "epoch": 0.15951260038770423, "grad_norm": 1.8092525005340576, "learning_rate": 0.00018942477061092413, "loss": 2.6754, "step": 576 }, { "epoch": 0.15978953198559956, "grad_norm": 2.801778554916382, "learning_rate": 0.0001893852499938, "loss": 2.9081, "step": 577 }, { "epoch": 0.16006646358349488, "grad_norm": 2.5070884227752686, "learning_rate": 0.00018934565980673583, "loss": 3.2574, "step": 578 }, { "epoch": 0.1603433951813902, "grad_norm": 1.5043915510177612, "learning_rate": 0.0001893060000805453, "loss": 3.0069, "step": 579 }, { "epoch": 0.1606203267792855, "grad_norm": 2.809873580932617, "learning_rate": 0.0001892662708460962, "loss": 2.6756, "step": 580 }, { "epoch": 0.16089725837718083, "grad_norm": 2.6917896270751953, "learning_rate": 0.0001892264721343104, "loss": 2.8422, "step": 581 }, { "epoch": 0.16117418997507615, "grad_norm": 2.8489990234375, "learning_rate": 0.00018918660397616388, "loss": 2.8668, "step": 582 }, { "epoch": 0.16145112157297148, "grad_norm": 1.7083117961883545, "learning_rate": 0.0001891466664026866, "loss": 2.8933, "step": 583 }, { "epoch": 0.1617280531708668, "grad_norm": 2.7621779441833496, "learning_rate": 0.00018910665944496264, "loss": 3.0861, "step": 584 }, { "epoch": 0.16200498476876213, "grad_norm": 1.6034456491470337, "learning_rate": 0.00018906658313413006, "loss": 3.081, "step": 585 }, { "epoch": 0.16228191636665742, "grad_norm": 2.1977531909942627, "learning_rate": 0.00018902643750138084, "loss": 2.7185, "step": 586 }, { "epoch": 0.16255884796455275, "grad_norm": 2.6708223819732666, "learning_rate": 0.00018898622257796098, "loss": 2.8751, "step": 587 }, { "epoch": 0.16283577956244807, "grad_norm": 2.5136728286743164, "learning_rate": 0.0001889459383951704, "loss": 3.0376, "step": 588 }, { "epoch": 0.1631127111603434, "grad_norm": 2.253354549407959, "learning_rate": 0.00018890558498436282, "loss": 2.9694, "step": 589 }, { "epoch": 0.16338964275823872, "grad_norm": 2.0030698776245117, "learning_rate": 0.00018886516237694607, "loss": 2.9248, "step": 590 }, { "epoch": 0.16366657435613405, "grad_norm": 3.905414342880249, "learning_rate": 0.0001888246706043816, "loss": 3.0461, "step": 591 }, { "epoch": 0.16394350595402934, "grad_norm": 3.2316644191741943, "learning_rate": 0.00018878410969818484, "loss": 2.9037, "step": 592 }, { "epoch": 0.16422043755192467, "grad_norm": 3.1154937744140625, "learning_rate": 0.0001887434796899249, "loss": 2.8276, "step": 593 }, { "epoch": 0.16449736914982, "grad_norm": 2.8231682777404785, "learning_rate": 0.00018870278061122484, "loss": 2.9152, "step": 594 }, { "epoch": 0.16477430074771532, "grad_norm": 2.0204408168792725, "learning_rate": 0.00018866201249376137, "loss": 2.7603, "step": 595 }, { "epoch": 0.16505123234561064, "grad_norm": 2.0140509605407715, "learning_rate": 0.00018862117536926496, "loss": 2.5941, "step": 596 }, { "epoch": 0.16532816394350594, "grad_norm": 3.0981736183166504, "learning_rate": 0.00018858026926951973, "loss": 2.9203, "step": 597 }, { "epoch": 0.16560509554140126, "grad_norm": 3.244689464569092, "learning_rate": 0.0001885392942263636, "loss": 2.7306, "step": 598 }, { "epoch": 0.1658820271392966, "grad_norm": 5.177836894989014, "learning_rate": 0.00018849825027168803, "loss": 3.3335, "step": 599 }, { "epoch": 0.16615895873719191, "grad_norm": 5.458366870880127, "learning_rate": 0.00018845713743743822, "loss": 3.7187, "step": 600 }, { "epoch": 0.16643589033508724, "grad_norm": 2.1460788249969482, "learning_rate": 0.00018841595575561294, "loss": 2.7101, "step": 601 }, { "epoch": 0.16671282193298256, "grad_norm": 1.636173963546753, "learning_rate": 0.00018837470525826452, "loss": 2.7277, "step": 602 }, { "epoch": 0.16698975353087786, "grad_norm": 3.3788509368896484, "learning_rate": 0.00018833338597749882, "loss": 3.2524, "step": 603 }, { "epoch": 0.16726668512877318, "grad_norm": 3.1274211406707764, "learning_rate": 0.00018829199794547535, "loss": 3.0485, "step": 604 }, { "epoch": 0.1675436167266685, "grad_norm": 3.001844644546509, "learning_rate": 0.00018825054119440705, "loss": 2.8739, "step": 605 }, { "epoch": 0.16782054832456383, "grad_norm": 2.3479843139648438, "learning_rate": 0.00018820901575656034, "loss": 2.9218, "step": 606 }, { "epoch": 0.16809747992245916, "grad_norm": 3.5614748001098633, "learning_rate": 0.00018816742166425515, "loss": 3.1383, "step": 607 }, { "epoch": 0.16837441152035448, "grad_norm": 2.6405105590820312, "learning_rate": 0.00018812575894986474, "loss": 2.7645, "step": 608 }, { "epoch": 0.16865134311824978, "grad_norm": 3.3058698177337646, "learning_rate": 0.00018808402764581596, "loss": 2.9221, "step": 609 }, { "epoch": 0.1689282747161451, "grad_norm": 3.376081705093384, "learning_rate": 0.00018804222778458885, "loss": 2.8451, "step": 610 }, { "epoch": 0.16920520631404043, "grad_norm": 2.419116497039795, "learning_rate": 0.00018800035939871697, "loss": 3.0378, "step": 611 }, { "epoch": 0.16948213791193575, "grad_norm": 3.627453327178955, "learning_rate": 0.00018795842252078705, "loss": 2.9525, "step": 612 }, { "epoch": 0.16975906950983108, "grad_norm": 2.9806618690490723, "learning_rate": 0.0001879164171834393, "loss": 3.2352, "step": 613 }, { "epoch": 0.1700360011077264, "grad_norm": 3.6521472930908203, "learning_rate": 0.0001878743434193671, "loss": 3.1307, "step": 614 }, { "epoch": 0.1703129327056217, "grad_norm": 2.553753137588501, "learning_rate": 0.0001878322012613171, "loss": 2.8143, "step": 615 }, { "epoch": 0.17058986430351702, "grad_norm": 3.0864615440368652, "learning_rate": 0.00018778999074208925, "loss": 2.8784, "step": 616 }, { "epoch": 0.17086679590141235, "grad_norm": 2.491281509399414, "learning_rate": 0.00018774771189453658, "loss": 2.7147, "step": 617 }, { "epoch": 0.17114372749930767, "grad_norm": 3.605449914932251, "learning_rate": 0.0001877053647515655, "loss": 3.0916, "step": 618 }, { "epoch": 0.171420659097203, "grad_norm": 2.559891939163208, "learning_rate": 0.00018766294934613535, "loss": 2.5182, "step": 619 }, { "epoch": 0.17169759069509832, "grad_norm": 2.9058518409729004, "learning_rate": 0.00018762046571125872, "loss": 2.8692, "step": 620 }, { "epoch": 0.17197452229299362, "grad_norm": 2.3772425651550293, "learning_rate": 0.00018757791388000135, "loss": 2.7498, "step": 621 }, { "epoch": 0.17225145389088894, "grad_norm": 2.9739794731140137, "learning_rate": 0.00018753529388548194, "loss": 2.9448, "step": 622 }, { "epoch": 0.17252838548878427, "grad_norm": 4.4551215171813965, "learning_rate": 0.00018749260576087228, "loss": 3.3577, "step": 623 }, { "epoch": 0.1728053170866796, "grad_norm": 2.452760934829712, "learning_rate": 0.00018744984953939726, "loss": 3.1046, "step": 624 }, { "epoch": 0.17308224868457492, "grad_norm": 4.560608386993408, "learning_rate": 0.00018740702525433468, "loss": 3.9237, "step": 625 }, { "epoch": 0.17335918028247024, "grad_norm": 1.832899808883667, "learning_rate": 0.0001873641329390154, "loss": 2.9019, "step": 626 }, { "epoch": 0.17363611188036554, "grad_norm": 3.598841428756714, "learning_rate": 0.00018732117262682313, "loss": 3.0795, "step": 627 }, { "epoch": 0.17391304347826086, "grad_norm": 2.3927810192108154, "learning_rate": 0.00018727814435119458, "loss": 2.8281, "step": 628 }, { "epoch": 0.1741899750761562, "grad_norm": 1.6871562004089355, "learning_rate": 0.0001872350481456193, "loss": 2.709, "step": 629 }, { "epoch": 0.17446690667405151, "grad_norm": 1.5506254434585571, "learning_rate": 0.0001871918840436398, "loss": 2.4343, "step": 630 }, { "epoch": 0.17474383827194684, "grad_norm": 1.5275046825408936, "learning_rate": 0.0001871486520788513, "loss": 2.6755, "step": 631 }, { "epoch": 0.17502076986984214, "grad_norm": 1.5423846244812012, "learning_rate": 0.000187105352284902, "loss": 2.5921, "step": 632 }, { "epoch": 0.17529770146773746, "grad_norm": 1.8302452564239502, "learning_rate": 0.0001870619846954927, "loss": 3.0291, "step": 633 }, { "epoch": 0.17557463306563278, "grad_norm": 2.1343085765838623, "learning_rate": 0.0001870185493443772, "loss": 2.7965, "step": 634 }, { "epoch": 0.1758515646635281, "grad_norm": 2.0958521366119385, "learning_rate": 0.00018697504626536185, "loss": 2.6242, "step": 635 }, { "epoch": 0.17612849626142343, "grad_norm": 1.856468915939331, "learning_rate": 0.00018693147549230576, "loss": 2.7097, "step": 636 }, { "epoch": 0.17640542785931876, "grad_norm": 1.8141541481018066, "learning_rate": 0.00018688783705912075, "loss": 2.7367, "step": 637 }, { "epoch": 0.17668235945721406, "grad_norm": 2.124817132949829, "learning_rate": 0.00018684413099977136, "loss": 2.8737, "step": 638 }, { "epoch": 0.17695929105510938, "grad_norm": 3.436328649520874, "learning_rate": 0.0001868003573482746, "loss": 3.1817, "step": 639 }, { "epoch": 0.1772362226530047, "grad_norm": 1.8416308164596558, "learning_rate": 0.0001867565161387003, "loss": 3.2015, "step": 640 }, { "epoch": 0.17751315425090003, "grad_norm": 3.4383628368377686, "learning_rate": 0.00018671260740517066, "loss": 2.7034, "step": 641 }, { "epoch": 0.17779008584879535, "grad_norm": 3.328667163848877, "learning_rate": 0.00018666863118186057, "loss": 3.1435, "step": 642 }, { "epoch": 0.17806701744669068, "grad_norm": 2.535160541534424, "learning_rate": 0.00018662458750299743, "loss": 2.6899, "step": 643 }, { "epoch": 0.17834394904458598, "grad_norm": 3.590573787689209, "learning_rate": 0.0001865804764028611, "loss": 2.64, "step": 644 }, { "epoch": 0.1786208806424813, "grad_norm": 2.0103766918182373, "learning_rate": 0.000186536297915784, "loss": 2.9379, "step": 645 }, { "epoch": 0.17889781224037662, "grad_norm": 1.850217342376709, "learning_rate": 0.00018649205207615084, "loss": 2.9077, "step": 646 }, { "epoch": 0.17917474383827195, "grad_norm": 3.9196999073028564, "learning_rate": 0.00018644773891839892, "loss": 3.0329, "step": 647 }, { "epoch": 0.17945167543616727, "grad_norm": 4.188611030578613, "learning_rate": 0.00018640335847701787, "loss": 2.9961, "step": 648 }, { "epoch": 0.1797286070340626, "grad_norm": 3.173583745956421, "learning_rate": 0.0001863589107865496, "loss": 3.0233, "step": 649 }, { "epoch": 0.1800055386319579, "grad_norm": 4.154677391052246, "learning_rate": 0.00018631439588158856, "loss": 3.8583, "step": 650 }, { "epoch": 0.18028247022985322, "grad_norm": 2.180216073989868, "learning_rate": 0.00018626981379678132, "loss": 2.9147, "step": 651 }, { "epoch": 0.18055940182774854, "grad_norm": 1.7871785163879395, "learning_rate": 0.00018622516456682684, "loss": 2.7395, "step": 652 }, { "epoch": 0.18083633342564387, "grad_norm": 1.6536489725112915, "learning_rate": 0.00018618044822647632, "loss": 2.7182, "step": 653 }, { "epoch": 0.1811132650235392, "grad_norm": 2.316704273223877, "learning_rate": 0.00018613566481053315, "loss": 2.7115, "step": 654 }, { "epoch": 0.18139019662143452, "grad_norm": 1.456023931503296, "learning_rate": 0.00018609081435385302, "loss": 2.9228, "step": 655 }, { "epoch": 0.18166712821932982, "grad_norm": 2.3114585876464844, "learning_rate": 0.00018604589689134372, "loss": 2.5995, "step": 656 }, { "epoch": 0.18194405981722514, "grad_norm": 1.963617205619812, "learning_rate": 0.0001860009124579652, "loss": 2.5466, "step": 657 }, { "epoch": 0.18222099141512046, "grad_norm": 1.4744776487350464, "learning_rate": 0.00018595586108872953, "loss": 2.7022, "step": 658 }, { "epoch": 0.1824979230130158, "grad_norm": 2.431297540664673, "learning_rate": 0.00018591074281870099, "loss": 2.7521, "step": 659 }, { "epoch": 0.1827748546109111, "grad_norm": 2.815333843231201, "learning_rate": 0.00018586555768299571, "loss": 2.9846, "step": 660 }, { "epoch": 0.18305178620880644, "grad_norm": 2.565122365951538, "learning_rate": 0.00018582030571678208, "loss": 2.789, "step": 661 }, { "epoch": 0.18332871780670174, "grad_norm": 2.2769696712493896, "learning_rate": 0.0001857749869552804, "loss": 2.9159, "step": 662 }, { "epoch": 0.18360564940459706, "grad_norm": 2.010282039642334, "learning_rate": 0.0001857296014337629, "loss": 2.7838, "step": 663 }, { "epoch": 0.18388258100249238, "grad_norm": 3.214738368988037, "learning_rate": 0.00018568414918755397, "loss": 3.1515, "step": 664 }, { "epoch": 0.1841595126003877, "grad_norm": 4.236786365509033, "learning_rate": 0.0001856386302520297, "loss": 3.2493, "step": 665 }, { "epoch": 0.18443644419828303, "grad_norm": 2.729323625564575, "learning_rate": 0.00018559304466261823, "loss": 2.7869, "step": 666 }, { "epoch": 0.18471337579617833, "grad_norm": 2.8418619632720947, "learning_rate": 0.0001855473924547995, "loss": 2.7512, "step": 667 }, { "epoch": 0.18499030739407366, "grad_norm": 2.5104856491088867, "learning_rate": 0.00018550167366410543, "loss": 2.7103, "step": 668 }, { "epoch": 0.18526723899196898, "grad_norm": 2.7290761470794678, "learning_rate": 0.00018545588832611956, "loss": 2.9332, "step": 669 }, { "epoch": 0.1855441705898643, "grad_norm": 2.2149202823638916, "learning_rate": 0.00018541003647647744, "loss": 2.5608, "step": 670 }, { "epoch": 0.18582110218775963, "grad_norm": 2.0544304847717285, "learning_rate": 0.0001853641181508662, "loss": 2.775, "step": 671 }, { "epoch": 0.18609803378565495, "grad_norm": 2.9898555278778076, "learning_rate": 0.0001853181333850248, "loss": 3.0448, "step": 672 }, { "epoch": 0.18637496538355025, "grad_norm": 2.771472215652466, "learning_rate": 0.00018527208221474396, "loss": 2.99, "step": 673 }, { "epoch": 0.18665189698144558, "grad_norm": 2.860121726989746, "learning_rate": 0.00018522596467586598, "loss": 2.9894, "step": 674 }, { "epoch": 0.1869288285793409, "grad_norm": 3.368523120880127, "learning_rate": 0.00018517978080428486, "loss": 3.1418, "step": 675 }, { "epoch": 0.18720576017723622, "grad_norm": 1.9053947925567627, "learning_rate": 0.00018513353063594627, "loss": 2.9306, "step": 676 }, { "epoch": 0.18748269177513155, "grad_norm": 3.7781260013580322, "learning_rate": 0.00018508721420684742, "loss": 3.1047, "step": 677 }, { "epoch": 0.18775962337302687, "grad_norm": 2.74504017829895, "learning_rate": 0.0001850408315530371, "loss": 2.5796, "step": 678 }, { "epoch": 0.18803655497092217, "grad_norm": 3.8274240493774414, "learning_rate": 0.00018499438271061568, "loss": 3.0927, "step": 679 }, { "epoch": 0.1883134865688175, "grad_norm": 2.835418701171875, "learning_rate": 0.00018494786771573496, "loss": 3.0944, "step": 680 }, { "epoch": 0.18859041816671282, "grad_norm": 1.6601074934005737, "learning_rate": 0.00018490128660459835, "loss": 2.8352, "step": 681 }, { "epoch": 0.18886734976460814, "grad_norm": 1.8874528408050537, "learning_rate": 0.00018485463941346066, "loss": 2.9458, "step": 682 }, { "epoch": 0.18914428136250347, "grad_norm": 2.7685647010803223, "learning_rate": 0.00018480792617862814, "loss": 2.9203, "step": 683 }, { "epoch": 0.1894212129603988, "grad_norm": 2.1622962951660156, "learning_rate": 0.0001847611469364584, "loss": 2.8604, "step": 684 }, { "epoch": 0.1896981445582941, "grad_norm": 2.5799427032470703, "learning_rate": 0.0001847143017233604, "loss": 2.7162, "step": 685 }, { "epoch": 0.18997507615618942, "grad_norm": 1.5721246004104614, "learning_rate": 0.00018466739057579462, "loss": 3.0661, "step": 686 }, { "epoch": 0.19025200775408474, "grad_norm": 2.5846004486083984, "learning_rate": 0.00018462041353027266, "loss": 3.0392, "step": 687 }, { "epoch": 0.19052893935198006, "grad_norm": 2.5718348026275635, "learning_rate": 0.00018457337062335752, "loss": 2.6367, "step": 688 }, { "epoch": 0.1908058709498754, "grad_norm": 2.7954459190368652, "learning_rate": 0.00018452626189166345, "loss": 3.039, "step": 689 }, { "epoch": 0.1910828025477707, "grad_norm": 1.9670569896697998, "learning_rate": 0.00018447908737185585, "loss": 2.9294, "step": 690 }, { "epoch": 0.191359734145666, "grad_norm": 2.261406421661377, "learning_rate": 0.00018443184710065146, "loss": 2.8604, "step": 691 }, { "epoch": 0.19163666574356134, "grad_norm": 3.316488742828369, "learning_rate": 0.00018438454111481808, "loss": 2.6537, "step": 692 }, { "epoch": 0.19191359734145666, "grad_norm": 3.1909279823303223, "learning_rate": 0.00018433716945117474, "loss": 2.8554, "step": 693 }, { "epoch": 0.19219052893935198, "grad_norm": 1.9768377542495728, "learning_rate": 0.0001842897321465915, "loss": 2.7122, "step": 694 }, { "epoch": 0.1924674605372473, "grad_norm": 3.4397292137145996, "learning_rate": 0.00018424222923798958, "loss": 2.492, "step": 695 }, { "epoch": 0.1927443921351426, "grad_norm": 1.6669995784759521, "learning_rate": 0.0001841946607623412, "loss": 2.6234, "step": 696 }, { "epoch": 0.19302132373303793, "grad_norm": 2.8505983352661133, "learning_rate": 0.00018414702675666975, "loss": 2.7902, "step": 697 }, { "epoch": 0.19329825533093326, "grad_norm": 3.3573968410491943, "learning_rate": 0.0001840993272580494, "loss": 2.9162, "step": 698 }, { "epoch": 0.19357518692882858, "grad_norm": 6.864968299865723, "learning_rate": 0.0001840515623036055, "loss": 3.9297, "step": 699 }, { "epoch": 0.1938521185267239, "grad_norm": 9.950575828552246, "learning_rate": 0.00018400373193051424, "loss": 4.8462, "step": 700 }, { "epoch": 0.19412905012461923, "grad_norm": 1.9874237775802612, "learning_rate": 0.0001839558361760027, "loss": 2.9369, "step": 701 }, { "epoch": 0.19440598172251453, "grad_norm": 2.468550443649292, "learning_rate": 0.000183907875077349, "loss": 2.9755, "step": 702 }, { "epoch": 0.19468291332040985, "grad_norm": 1.8362083435058594, "learning_rate": 0.00018385984867188193, "loss": 2.7249, "step": 703 }, { "epoch": 0.19495984491830518, "grad_norm": 3.345048189163208, "learning_rate": 0.0001838117569969812, "loss": 3.008, "step": 704 }, { "epoch": 0.1952367765162005, "grad_norm": 2.1463112831115723, "learning_rate": 0.00018376360009007736, "loss": 2.7318, "step": 705 }, { "epoch": 0.19551370811409582, "grad_norm": 1.7631304264068604, "learning_rate": 0.00018371537798865162, "loss": 2.7002, "step": 706 }, { "epoch": 0.19579063971199115, "grad_norm": 2.6849911212921143, "learning_rate": 0.00018366709073023606, "loss": 2.9509, "step": 707 }, { "epoch": 0.19606757130988645, "grad_norm": 2.5786337852478027, "learning_rate": 0.0001836187383524134, "loss": 3.0554, "step": 708 }, { "epoch": 0.19634450290778177, "grad_norm": 12.970073699951172, "learning_rate": 0.00018357032089281702, "loss": 2.7985, "step": 709 }, { "epoch": 0.1966214345056771, "grad_norm": 2.3600611686706543, "learning_rate": 0.00018352183838913098, "loss": 2.8667, "step": 710 }, { "epoch": 0.19689836610357242, "grad_norm": 2.256000518798828, "learning_rate": 0.00018347329087909002, "loss": 2.8481, "step": 711 }, { "epoch": 0.19717529770146774, "grad_norm": 2.0184097290039062, "learning_rate": 0.00018342467840047939, "loss": 3.0471, "step": 712 }, { "epoch": 0.19745222929936307, "grad_norm": 3.4170594215393066, "learning_rate": 0.00018337600099113495, "loss": 2.9388, "step": 713 }, { "epoch": 0.19772916089725837, "grad_norm": 2.170293092727661, "learning_rate": 0.00018332725868894313, "loss": 2.9068, "step": 714 }, { "epoch": 0.1980060924951537, "grad_norm": 2.9727580547332764, "learning_rate": 0.00018327845153184077, "loss": 3.1206, "step": 715 }, { "epoch": 0.19828302409304902, "grad_norm": 5.886566638946533, "learning_rate": 0.00018322957955781526, "loss": 3.09, "step": 716 }, { "epoch": 0.19855995569094434, "grad_norm": 11.844672203063965, "learning_rate": 0.0001831806428049044, "loss": 2.7557, "step": 717 }, { "epoch": 0.19883688728883966, "grad_norm": 3.298863172531128, "learning_rate": 0.00018313164131119654, "loss": 2.6253, "step": 718 }, { "epoch": 0.199113818886735, "grad_norm": 18.608060836791992, "learning_rate": 0.00018308257511483018, "loss": 3.1732, "step": 719 }, { "epoch": 0.19939075048463029, "grad_norm": 2.1521401405334473, "learning_rate": 0.00018303344425399436, "loss": 2.6386, "step": 720 }, { "epoch": 0.1996676820825256, "grad_norm": 2.3799684047698975, "learning_rate": 0.00018298424876692844, "loss": 2.761, "step": 721 }, { "epoch": 0.19994461368042094, "grad_norm": 3.3127503395080566, "learning_rate": 0.00018293498869192197, "loss": 2.8244, "step": 722 }, { "epoch": 0.20022154527831626, "grad_norm": 2.539456844329834, "learning_rate": 0.00018288566406731484, "loss": 2.4344, "step": 723 }, { "epoch": 0.20049847687621158, "grad_norm": 2.227818727493286, "learning_rate": 0.00018283627493149721, "loss": 2.816, "step": 724 }, { "epoch": 0.2007754084741069, "grad_norm": 6.158308029174805, "learning_rate": 0.00018278682132290944, "loss": 4.2033, "step": 725 }, { "epoch": 0.2010523400720022, "grad_norm": 2.4608898162841797, "learning_rate": 0.000182737303280042, "loss": 2.7077, "step": 726 }, { "epoch": 0.20132927166989753, "grad_norm": 3.126945734024048, "learning_rate": 0.00018268772084143562, "loss": 3.1508, "step": 727 }, { "epoch": 0.20160620326779286, "grad_norm": 1.4004101753234863, "learning_rate": 0.000182638074045681, "loss": 2.542, "step": 728 }, { "epoch": 0.20188313486568818, "grad_norm": 6.214250564575195, "learning_rate": 0.00018258836293141907, "loss": 2.8699, "step": 729 }, { "epoch": 0.2021600664635835, "grad_norm": 2.3528897762298584, "learning_rate": 0.0001825385875373408, "loss": 2.7283, "step": 730 }, { "epoch": 0.2024369980614788, "grad_norm": 3.6349143981933594, "learning_rate": 0.00018248874790218706, "loss": 3.2385, "step": 731 }, { "epoch": 0.20271392965937413, "grad_norm": 2.3201217651367188, "learning_rate": 0.0001824388440647489, "loss": 2.6714, "step": 732 }, { "epoch": 0.20299086125726945, "grad_norm": 2.7398152351379395, "learning_rate": 0.00018238887606386727, "loss": 2.8433, "step": 733 }, { "epoch": 0.20326779285516478, "grad_norm": 3.190858840942383, "learning_rate": 0.000182338843938433, "loss": 2.8483, "step": 734 }, { "epoch": 0.2035447244530601, "grad_norm": 4.260762691497803, "learning_rate": 0.0001822887477273869, "loss": 3.0106, "step": 735 }, { "epoch": 0.20382165605095542, "grad_norm": 1.6161547899246216, "learning_rate": 0.0001822385874697196, "loss": 2.6834, "step": 736 }, { "epoch": 0.20409858764885072, "grad_norm": 3.6264712810516357, "learning_rate": 0.0001821883632044717, "loss": 2.6701, "step": 737 }, { "epoch": 0.20437551924674605, "grad_norm": 6.149306297302246, "learning_rate": 0.00018213807497073346, "loss": 2.8069, "step": 738 }, { "epoch": 0.20465245084464137, "grad_norm": 3.0085160732269287, "learning_rate": 0.000182087722807645, "loss": 2.7487, "step": 739 }, { "epoch": 0.2049293824425367, "grad_norm": 3.4432058334350586, "learning_rate": 0.0001820373067543962, "loss": 3.0368, "step": 740 }, { "epoch": 0.20520631404043202, "grad_norm": 4.097099304199219, "learning_rate": 0.0001819868268502267, "loss": 3.002, "step": 741 }, { "epoch": 0.20548324563832734, "grad_norm": 3.168367624282837, "learning_rate": 0.00018193628313442576, "loss": 2.9108, "step": 742 }, { "epoch": 0.20576017723622264, "grad_norm": 3.36014986038208, "learning_rate": 0.00018188567564633238, "loss": 2.5995, "step": 743 }, { "epoch": 0.20603710883411797, "grad_norm": 2.911134719848633, "learning_rate": 0.00018183500442533514, "loss": 2.7216, "step": 744 }, { "epoch": 0.2063140404320133, "grad_norm": 2.1692538261413574, "learning_rate": 0.00018178426951087224, "loss": 2.9378, "step": 745 }, { "epoch": 0.20659097202990861, "grad_norm": 2.408252000808716, "learning_rate": 0.00018173347094243146, "loss": 2.9515, "step": 746 }, { "epoch": 0.20686790362780394, "grad_norm": 2.2805047035217285, "learning_rate": 0.00018168260875955013, "loss": 2.5644, "step": 747 }, { "epoch": 0.20714483522569926, "grad_norm": 3.825239658355713, "learning_rate": 0.00018163168300181507, "loss": 3.0186, "step": 748 }, { "epoch": 0.20742176682359456, "grad_norm": 6.781472682952881, "learning_rate": 0.00018158069370886266, "loss": 3.1843, "step": 749 }, { "epoch": 0.20769869842148989, "grad_norm": 6.31098747253418, "learning_rate": 0.00018152964092037855, "loss": 3.6861, "step": 750 }, { "epoch": 0.2079756300193852, "grad_norm": 5.607875823974609, "learning_rate": 0.00018147852467609803, "loss": 3.0506, "step": 751 }, { "epoch": 0.20825256161728053, "grad_norm": 2.671266794204712, "learning_rate": 0.0001814273450158056, "loss": 2.6357, "step": 752 }, { "epoch": 0.20852949321517586, "grad_norm": 2.7472641468048096, "learning_rate": 0.00018137610197933523, "loss": 3.0306, "step": 753 }, { "epoch": 0.20880642481307118, "grad_norm": 1.4980647563934326, "learning_rate": 0.0001813247956065702, "loss": 2.8052, "step": 754 }, { "epoch": 0.20908335641096648, "grad_norm": 2.9251890182495117, "learning_rate": 0.00018127342593744297, "loss": 3.1088, "step": 755 }, { "epoch": 0.2093602880088618, "grad_norm": 2.2277612686157227, "learning_rate": 0.00018122199301193548, "loss": 2.9202, "step": 756 }, { "epoch": 0.20963721960675713, "grad_norm": 2.4915273189544678, "learning_rate": 0.0001811704968700787, "loss": 3.0571, "step": 757 }, { "epoch": 0.20991415120465245, "grad_norm": 2.2147316932678223, "learning_rate": 0.0001811189375519529, "loss": 2.7547, "step": 758 }, { "epoch": 0.21019108280254778, "grad_norm": 1.6362735033035278, "learning_rate": 0.00018106731509768753, "loss": 2.7775, "step": 759 }, { "epoch": 0.2104680144004431, "grad_norm": 2.174014091491699, "learning_rate": 0.00018101562954746112, "loss": 2.7198, "step": 760 }, { "epoch": 0.2107449459983384, "grad_norm": 3.81546688079834, "learning_rate": 0.00018096388094150138, "loss": 2.9381, "step": 761 }, { "epoch": 0.21102187759623373, "grad_norm": 2.7602310180664062, "learning_rate": 0.00018091206932008502, "loss": 2.796, "step": 762 }, { "epoch": 0.21129880919412905, "grad_norm": 3.135514736175537, "learning_rate": 0.00018086019472353786, "loss": 2.7376, "step": 763 }, { "epoch": 0.21157574079202437, "grad_norm": 14.77597427368164, "learning_rate": 0.00018080825719223468, "loss": 2.8534, "step": 764 }, { "epoch": 0.2118526723899197, "grad_norm": 3.037137269973755, "learning_rate": 0.00018075625676659932, "loss": 3.238, "step": 765 }, { "epoch": 0.212129603987815, "grad_norm": 4.679478168487549, "learning_rate": 0.00018070419348710452, "loss": 3.0818, "step": 766 }, { "epoch": 0.21240653558571032, "grad_norm": 3.6584432125091553, "learning_rate": 0.00018065206739427192, "loss": 2.8912, "step": 767 }, { "epoch": 0.21268346718360565, "grad_norm": 7.0400166511535645, "learning_rate": 0.00018059987852867208, "loss": 3.1298, "step": 768 }, { "epoch": 0.21296039878150097, "grad_norm": 1.9162108898162842, "learning_rate": 0.00018054762693092444, "loss": 3.2406, "step": 769 }, { "epoch": 0.2132373303793963, "grad_norm": 2.555687189102173, "learning_rate": 0.00018049531264169723, "loss": 2.4545, "step": 770 }, { "epoch": 0.21351426197729162, "grad_norm": 2.5481579303741455, "learning_rate": 0.0001804429357017074, "loss": 2.9135, "step": 771 }, { "epoch": 0.21379119357518692, "grad_norm": 2.2174980640411377, "learning_rate": 0.0001803904961517209, "loss": 2.6785, "step": 772 }, { "epoch": 0.21406812517308224, "grad_norm": 2.345719814300537, "learning_rate": 0.00018033799403255208, "loss": 2.976, "step": 773 }, { "epoch": 0.21434505677097757, "grad_norm": 2.987143039703369, "learning_rate": 0.00018028542938506426, "loss": 3.103, "step": 774 }, { "epoch": 0.2146219883688729, "grad_norm": 5.510335922241211, "learning_rate": 0.00018023280225016932, "loss": 3.8119, "step": 775 }, { "epoch": 0.21489891996676821, "grad_norm": 1.4958183765411377, "learning_rate": 0.0001801801126688278, "loss": 2.6603, "step": 776 }, { "epoch": 0.21517585156466354, "grad_norm": 2.7116470336914062, "learning_rate": 0.0001801273606820488, "loss": 2.9782, "step": 777 }, { "epoch": 0.21545278316255884, "grad_norm": 1.8536920547485352, "learning_rate": 0.00018007454633089, "loss": 2.929, "step": 778 }, { "epoch": 0.21572971476045416, "grad_norm": 2.6260993480682373, "learning_rate": 0.0001800216696564576, "loss": 3.168, "step": 779 }, { "epoch": 0.21600664635834949, "grad_norm": 1.8232653141021729, "learning_rate": 0.00017996873069990647, "loss": 2.7345, "step": 780 }, { "epoch": 0.2162835779562448, "grad_norm": 2.8388843536376953, "learning_rate": 0.00017991572950243977, "loss": 2.6816, "step": 781 }, { "epoch": 0.21656050955414013, "grad_norm": 2.5359623432159424, "learning_rate": 0.0001798626661053091, "loss": 2.8546, "step": 782 }, { "epoch": 0.21683744115203546, "grad_norm": 2.4679372310638428, "learning_rate": 0.0001798095405498146, "loss": 2.8639, "step": 783 }, { "epoch": 0.21711437274993076, "grad_norm": 48.03520584106445, "learning_rate": 0.00017975635287730473, "loss": 2.8886, "step": 784 }, { "epoch": 0.21739130434782608, "grad_norm": 1.9362636804580688, "learning_rate": 0.00017970310312917623, "loss": 2.8487, "step": 785 }, { "epoch": 0.2176682359457214, "grad_norm": 1.8206771612167358, "learning_rate": 0.00017964979134687427, "loss": 2.8701, "step": 786 }, { "epoch": 0.21794516754361673, "grad_norm": 1.7026715278625488, "learning_rate": 0.00017959641757189223, "loss": 2.7412, "step": 787 }, { "epoch": 0.21822209914151205, "grad_norm": 1.8954119682312012, "learning_rate": 0.00017954298184577172, "loss": 2.9769, "step": 788 }, { "epoch": 0.21849903073940738, "grad_norm": 2.224543571472168, "learning_rate": 0.00017948948421010264, "loss": 2.9588, "step": 789 }, { "epoch": 0.21877596233730268, "grad_norm": 3.9762964248657227, "learning_rate": 0.00017943592470652303, "loss": 2.9722, "step": 790 }, { "epoch": 0.219052893935198, "grad_norm": 1.5863521099090576, "learning_rate": 0.00017938230337671908, "loss": 2.8169, "step": 791 }, { "epoch": 0.21932982553309333, "grad_norm": 1.8167140483856201, "learning_rate": 0.00017932862026242514, "loss": 2.6676, "step": 792 }, { "epoch": 0.21960675713098865, "grad_norm": 3.5854008197784424, "learning_rate": 0.00017927487540542357, "loss": 2.8992, "step": 793 }, { "epoch": 0.21988368872888397, "grad_norm": 2.874563694000244, "learning_rate": 0.00017922106884754488, "loss": 2.8493, "step": 794 }, { "epoch": 0.2201606203267793, "grad_norm": 3.0138399600982666, "learning_rate": 0.00017916720063066753, "loss": 2.931, "step": 795 }, { "epoch": 0.2204375519246746, "grad_norm": 1.9415334463119507, "learning_rate": 0.00017911327079671803, "loss": 2.7398, "step": 796 }, { "epoch": 0.22071448352256992, "grad_norm": 2.626664400100708, "learning_rate": 0.00017905927938767076, "loss": 2.8844, "step": 797 }, { "epoch": 0.22099141512046525, "grad_norm": 2.3523318767547607, "learning_rate": 0.00017900522644554812, "loss": 2.6385, "step": 798 }, { "epoch": 0.22126834671836057, "grad_norm": 3.625702142715454, "learning_rate": 0.0001789511120124203, "loss": 3.1983, "step": 799 }, { "epoch": 0.2215452783162559, "grad_norm": 6.103048324584961, "learning_rate": 0.00017889693613040554, "loss": 3.9482, "step": 800 }, { "epoch": 0.2218222099141512, "grad_norm": 2.523581027984619, "learning_rate": 0.00017884269884166967, "loss": 2.9803, "step": 801 }, { "epoch": 0.22209914151204652, "grad_norm": 1.7089117765426636, "learning_rate": 0.00017878840018842643, "loss": 2.8673, "step": 802 }, { "epoch": 0.22237607310994184, "grad_norm": 1.8813852071762085, "learning_rate": 0.0001787340402129374, "loss": 2.8749, "step": 803 }, { "epoch": 0.22265300470783717, "grad_norm": 1.4631633758544922, "learning_rate": 0.00017867961895751163, "loss": 2.9742, "step": 804 }, { "epoch": 0.2229299363057325, "grad_norm": 2.468569040298462, "learning_rate": 0.0001786251364645062, "loss": 2.7563, "step": 805 }, { "epoch": 0.22320686790362781, "grad_norm": 3.15724515914917, "learning_rate": 0.00017857059277632563, "loss": 2.9215, "step": 806 }, { "epoch": 0.2234837995015231, "grad_norm": 2.501721143722534, "learning_rate": 0.00017851598793542208, "loss": 2.7974, "step": 807 }, { "epoch": 0.22376073109941844, "grad_norm": 1.597951889038086, "learning_rate": 0.00017846132198429544, "loss": 2.8209, "step": 808 }, { "epoch": 0.22403766269731376, "grad_norm": 2.3806540966033936, "learning_rate": 0.00017840659496549298, "loss": 2.8041, "step": 809 }, { "epoch": 0.22431459429520909, "grad_norm": 1.3796292543411255, "learning_rate": 0.00017835180692160968, "loss": 2.7755, "step": 810 }, { "epoch": 0.2245915258931044, "grad_norm": 2.67073392868042, "learning_rate": 0.0001782969578952879, "loss": 2.9294, "step": 811 }, { "epoch": 0.22486845749099973, "grad_norm": 1.391492486000061, "learning_rate": 0.0001782420479292175, "loss": 2.7277, "step": 812 }, { "epoch": 0.22514538908889503, "grad_norm": 3.6642210483551025, "learning_rate": 0.00017818707706613576, "loss": 3.2001, "step": 813 }, { "epoch": 0.22542232068679036, "grad_norm": 2.9600884914398193, "learning_rate": 0.00017813204534882738, "loss": 3.121, "step": 814 }, { "epoch": 0.22569925228468568, "grad_norm": 2.7705891132354736, "learning_rate": 0.00017807695282012436, "loss": 2.9563, "step": 815 }, { "epoch": 0.225976183882581, "grad_norm": 1.836990475654602, "learning_rate": 0.00017802179952290613, "loss": 2.6764, "step": 816 }, { "epoch": 0.22625311548047633, "grad_norm": 3.026313066482544, "learning_rate": 0.00017796658550009933, "loss": 2.9005, "step": 817 }, { "epoch": 0.22653004707837165, "grad_norm": 2.420330047607422, "learning_rate": 0.00017791131079467794, "loss": 2.7694, "step": 818 }, { "epoch": 0.22680697867626695, "grad_norm": 3.7374370098114014, "learning_rate": 0.0001778559754496631, "loss": 2.9352, "step": 819 }, { "epoch": 0.22708391027416228, "grad_norm": 2.7599310874938965, "learning_rate": 0.00017780057950812317, "loss": 2.5512, "step": 820 }, { "epoch": 0.2273608418720576, "grad_norm": 2.0051608085632324, "learning_rate": 0.0001777451230131737, "loss": 2.9223, "step": 821 }, { "epoch": 0.22763777346995293, "grad_norm": 4.079291820526123, "learning_rate": 0.0001776896060079773, "loss": 2.6139, "step": 822 }, { "epoch": 0.22791470506784825, "grad_norm": 2.510545015335083, "learning_rate": 0.0001776340285357438, "loss": 2.4995, "step": 823 }, { "epoch": 0.22819163666574357, "grad_norm": 4.3523406982421875, "learning_rate": 0.00017757839063972997, "loss": 3.0332, "step": 824 }, { "epoch": 0.22846856826363887, "grad_norm": 4.616940975189209, "learning_rate": 0.00017752269236323966, "loss": 3.6388, "step": 825 }, { "epoch": 0.2287454998615342, "grad_norm": 2.210371494293213, "learning_rate": 0.00017746693374962372, "loss": 2.7925, "step": 826 }, { "epoch": 0.22902243145942952, "grad_norm": 2.476367950439453, "learning_rate": 0.00017741111484227996, "loss": 2.9856, "step": 827 }, { "epoch": 0.22929936305732485, "grad_norm": 2.4525411128997803, "learning_rate": 0.00017735523568465305, "loss": 2.6746, "step": 828 }, { "epoch": 0.22957629465522017, "grad_norm": 2.463634490966797, "learning_rate": 0.00017729929632023472, "loss": 2.7279, "step": 829 }, { "epoch": 0.22985322625311547, "grad_norm": 2.8449554443359375, "learning_rate": 0.00017724329679256337, "loss": 2.9282, "step": 830 }, { "epoch": 0.2301301578510108, "grad_norm": 2.233956813812256, "learning_rate": 0.00017718723714522434, "loss": 2.7956, "step": 831 }, { "epoch": 0.23040708944890612, "grad_norm": 2.3343842029571533, "learning_rate": 0.00017713111742184975, "loss": 2.9016, "step": 832 }, { "epoch": 0.23068402104680144, "grad_norm": 1.9796818494796753, "learning_rate": 0.0001770749376661184, "loss": 2.8013, "step": 833 }, { "epoch": 0.23096095264469677, "grad_norm": 1.3682773113250732, "learning_rate": 0.00017701869792175593, "loss": 2.8476, "step": 834 }, { "epoch": 0.2312378842425921, "grad_norm": 1.4512349367141724, "learning_rate": 0.00017696239823253459, "loss": 2.7705, "step": 835 }, { "epoch": 0.2315148158404874, "grad_norm": 1.6896343231201172, "learning_rate": 0.0001769060386422733, "loss": 2.7434, "step": 836 }, { "epoch": 0.2317917474383827, "grad_norm": 2.7096025943756104, "learning_rate": 0.00017684961919483763, "loss": 2.9079, "step": 837 }, { "epoch": 0.23206867903627804, "grad_norm": 1.879219651222229, "learning_rate": 0.0001767931399341397, "loss": 2.9284, "step": 838 }, { "epoch": 0.23234561063417336, "grad_norm": 3.0728893280029297, "learning_rate": 0.00017673660090413823, "loss": 2.9426, "step": 839 }, { "epoch": 0.23262254223206869, "grad_norm": 3.179382562637329, "learning_rate": 0.00017668000214883845, "loss": 3.0977, "step": 840 }, { "epoch": 0.232899473829964, "grad_norm": 2.8578603267669678, "learning_rate": 0.000176623343712292, "loss": 2.7335, "step": 841 }, { "epoch": 0.2331764054278593, "grad_norm": 2.0931589603424072, "learning_rate": 0.00017656662563859702, "loss": 2.8404, "step": 842 }, { "epoch": 0.23345333702575463, "grad_norm": 4.9795074462890625, "learning_rate": 0.00017650984797189814, "loss": 3.2182, "step": 843 }, { "epoch": 0.23373026862364996, "grad_norm": 2.5955379009246826, "learning_rate": 0.00017645301075638634, "loss": 2.7909, "step": 844 }, { "epoch": 0.23400720022154528, "grad_norm": 2.148406744003296, "learning_rate": 0.0001763961140362988, "loss": 2.4648, "step": 845 }, { "epoch": 0.2342841318194406, "grad_norm": 2.5193843841552734, "learning_rate": 0.00017633915785591917, "loss": 2.9867, "step": 846 }, { "epoch": 0.23456106341733593, "grad_norm": 2.263282537460327, "learning_rate": 0.00017628214225957737, "loss": 2.7114, "step": 847 }, { "epoch": 0.23483799501523123, "grad_norm": 3.8058855533599854, "learning_rate": 0.00017622506729164948, "loss": 3.3003, "step": 848 }, { "epoch": 0.23511492661312655, "grad_norm": 3.6773746013641357, "learning_rate": 0.00017616793299655794, "loss": 3.128, "step": 849 }, { "epoch": 0.23539185821102188, "grad_norm": 5.275571346282959, "learning_rate": 0.00017611073941877113, "loss": 4.1249, "step": 850 }, { "epoch": 0.2356687898089172, "grad_norm": 1.4605493545532227, "learning_rate": 0.00017605348660280383, "loss": 2.6483, "step": 851 }, { "epoch": 0.23594572140681253, "grad_norm": 2.3449642658233643, "learning_rate": 0.00017599617459321674, "loss": 2.9185, "step": 852 }, { "epoch": 0.23622265300470785, "grad_norm": 1.8987805843353271, "learning_rate": 0.00017593880343461668, "loss": 2.5391, "step": 853 }, { "epoch": 0.23649958460260315, "grad_norm": 1.6489249467849731, "learning_rate": 0.00017588137317165657, "loss": 2.575, "step": 854 }, { "epoch": 0.23677651620049847, "grad_norm": 1.4591307640075684, "learning_rate": 0.00017582388384903522, "loss": 2.9114, "step": 855 }, { "epoch": 0.2370534477983938, "grad_norm": 1.4331414699554443, "learning_rate": 0.00017576633551149752, "loss": 2.7333, "step": 856 }, { "epoch": 0.23733037939628912, "grad_norm": 1.4502758979797363, "learning_rate": 0.0001757087282038343, "loss": 2.5141, "step": 857 }, { "epoch": 0.23760731099418445, "grad_norm": 2.1150856018066406, "learning_rate": 0.00017565106197088207, "loss": 2.9157, "step": 858 }, { "epoch": 0.23788424259207977, "grad_norm": 3.100188732147217, "learning_rate": 0.0001755933368575235, "loss": 2.8012, "step": 859 }, { "epoch": 0.23816117418997507, "grad_norm": 1.8641769886016846, "learning_rate": 0.00017553555290868687, "loss": 2.5764, "step": 860 }, { "epoch": 0.2384381057878704, "grad_norm": 1.5892128944396973, "learning_rate": 0.00017547771016934643, "loss": 2.947, "step": 861 }, { "epoch": 0.23871503738576572, "grad_norm": 1.791627287864685, "learning_rate": 0.00017541980868452197, "loss": 2.9522, "step": 862 }, { "epoch": 0.23899196898366104, "grad_norm": 2.1791608333587646, "learning_rate": 0.0001753618484992792, "loss": 2.9263, "step": 863 }, { "epoch": 0.23926890058155637, "grad_norm": 2.1276986598968506, "learning_rate": 0.0001753038296587294, "loss": 3.0579, "step": 864 }, { "epoch": 0.23954583217945166, "grad_norm": 1.7338223457336426, "learning_rate": 0.00017524575220802962, "loss": 2.9095, "step": 865 }, { "epoch": 0.239822763777347, "grad_norm": 1.9278991222381592, "learning_rate": 0.00017518761619238234, "loss": 2.6981, "step": 866 }, { "epoch": 0.2400996953752423, "grad_norm": 3.1734070777893066, "learning_rate": 0.00017512942165703578, "loss": 3.0285, "step": 867 }, { "epoch": 0.24037662697313764, "grad_norm": 1.7613427639007568, "learning_rate": 0.00017507116864728367, "loss": 2.678, "step": 868 }, { "epoch": 0.24065355857103296, "grad_norm": 1.979413390159607, "learning_rate": 0.00017501285720846523, "loss": 2.7302, "step": 869 }, { "epoch": 0.24093049016892829, "grad_norm": 2.640336513519287, "learning_rate": 0.00017495448738596517, "loss": 2.6632, "step": 870 }, { "epoch": 0.24120742176682358, "grad_norm": 1.7245389223098755, "learning_rate": 0.00017489605922521361, "loss": 2.5472, "step": 871 }, { "epoch": 0.2414843533647189, "grad_norm": 3.25132155418396, "learning_rate": 0.00017483757277168618, "loss": 2.6675, "step": 872 }, { "epoch": 0.24176128496261423, "grad_norm": 2.5426268577575684, "learning_rate": 0.0001747790280709037, "loss": 2.8879, "step": 873 }, { "epoch": 0.24203821656050956, "grad_norm": 2.1168370246887207, "learning_rate": 0.0001747204251684325, "loss": 3.1219, "step": 874 }, { "epoch": 0.24231514815840488, "grad_norm": 4.783705711364746, "learning_rate": 0.0001746617641098841, "loss": 3.5755, "step": 875 }, { "epoch": 0.2425920797563002, "grad_norm": 3.421607494354248, "learning_rate": 0.00017460304494091534, "loss": 3.0924, "step": 876 }, { "epoch": 0.2428690113541955, "grad_norm": 1.700899600982666, "learning_rate": 0.00017454426770722824, "loss": 2.8364, "step": 877 }, { "epoch": 0.24314594295209083, "grad_norm": 1.693171501159668, "learning_rate": 0.00017448543245457009, "loss": 2.8619, "step": 878 }, { "epoch": 0.24342287454998615, "grad_norm": 2.3933541774749756, "learning_rate": 0.00017442653922873327, "loss": 2.9582, "step": 879 }, { "epoch": 0.24369980614788148, "grad_norm": 1.2178289890289307, "learning_rate": 0.00017436758807555527, "loss": 2.6739, "step": 880 }, { "epoch": 0.2439767377457768, "grad_norm": 2.796721935272217, "learning_rate": 0.00017430857904091873, "loss": 2.6545, "step": 881 }, { "epoch": 0.24425366934367213, "grad_norm": 3.1098473072052, "learning_rate": 0.0001742495121707513, "loss": 2.8777, "step": 882 }, { "epoch": 0.24453060094156742, "grad_norm": 2.6938257217407227, "learning_rate": 0.00017419038751102564, "loss": 2.8174, "step": 883 }, { "epoch": 0.24480753253946275, "grad_norm": 2.1703615188598633, "learning_rate": 0.0001741312051077594, "loss": 2.6357, "step": 884 }, { "epoch": 0.24508446413735807, "grad_norm": 1.8589909076690674, "learning_rate": 0.00017407196500701517, "loss": 2.5496, "step": 885 }, { "epoch": 0.2453613957352534, "grad_norm": 2.470015525817871, "learning_rate": 0.00017401266725490048, "loss": 3.1467, "step": 886 }, { "epoch": 0.24563832733314872, "grad_norm": 1.9733291864395142, "learning_rate": 0.00017395331189756763, "loss": 2.5779, "step": 887 }, { "epoch": 0.24591525893104405, "grad_norm": 5.104317665100098, "learning_rate": 0.00017389389898121388, "loss": 2.9302, "step": 888 }, { "epoch": 0.24619219052893934, "grad_norm": 1.8060482740402222, "learning_rate": 0.00017383442855208124, "loss": 2.8656, "step": 889 }, { "epoch": 0.24646912212683467, "grad_norm": 3.628758430480957, "learning_rate": 0.00017377490065645642, "loss": 3.1442, "step": 890 }, { "epoch": 0.24674605372473, "grad_norm": 2.087409019470215, "learning_rate": 0.000173715315340671, "loss": 2.8172, "step": 891 }, { "epoch": 0.24702298532262532, "grad_norm": 2.6525936126708984, "learning_rate": 0.00017365567265110107, "loss": 2.9967, "step": 892 }, { "epoch": 0.24729991692052064, "grad_norm": 1.745984673500061, "learning_rate": 0.0001735959726341675, "loss": 2.9043, "step": 893 }, { "epoch": 0.24757684851841597, "grad_norm": 2.4623262882232666, "learning_rate": 0.00017353621533633583, "loss": 2.6212, "step": 894 }, { "epoch": 0.24785378011631126, "grad_norm": 1.3718273639678955, "learning_rate": 0.000173476400804116, "loss": 2.7948, "step": 895 }, { "epoch": 0.2481307117142066, "grad_norm": 1.9852874279022217, "learning_rate": 0.0001734165290840626, "loss": 2.7263, "step": 896 }, { "epoch": 0.2484076433121019, "grad_norm": 2.4187746047973633, "learning_rate": 0.0001733566002227748, "loss": 2.8173, "step": 897 }, { "epoch": 0.24868457490999724, "grad_norm": 2.9589130878448486, "learning_rate": 0.00017329661426689609, "loss": 3.0224, "step": 898 }, { "epoch": 0.24896150650789256, "grad_norm": 1.7410316467285156, "learning_rate": 0.00017323657126311454, "loss": 2.9229, "step": 899 }, { "epoch": 0.24923843810578786, "grad_norm": 3.568751811981201, "learning_rate": 0.00017317647125816255, "loss": 3.2714, "step": 900 }, { "epoch": 0.24951536970368318, "grad_norm": 7.780263423919678, "learning_rate": 0.00017311631429881686, "loss": 2.8428, "step": 901 }, { "epoch": 0.2497923013015785, "grad_norm": 1.9568160772323608, "learning_rate": 0.0001730561004318986, "loss": 2.6005, "step": 902 }, { "epoch": 0.2500692328994738, "grad_norm": 1.8098704814910889, "learning_rate": 0.00017299582970427317, "loss": 2.7199, "step": 903 }, { "epoch": 0.2500692328994738, "eval_loss": 0.36362597346305847, "eval_runtime": 808.5036, "eval_samples_per_second": 7.523, "eval_steps_per_second": 1.881, "step": 903 }, { "epoch": 0.25034616449736913, "grad_norm": 2.204132556915283, "learning_rate": 0.0001729355021628502, "loss": 2.7864, "step": 904 }, { "epoch": 0.25062309609526445, "grad_norm": 2.6328420639038086, "learning_rate": 0.00017287511785458358, "loss": 2.8572, "step": 905 }, { "epoch": 0.2509000276931598, "grad_norm": 1.911133885383606, "learning_rate": 0.0001728146768264714, "loss": 2.695, "step": 906 }, { "epoch": 0.2511769592910551, "grad_norm": 1.584742546081543, "learning_rate": 0.00017275417912555576, "loss": 2.8547, "step": 907 }, { "epoch": 0.2514538908889504, "grad_norm": 3.0724382400512695, "learning_rate": 0.00017269362479892304, "loss": 3.1775, "step": 908 }, { "epoch": 0.25173082248684575, "grad_norm": 2.9462833404541016, "learning_rate": 0.00017263301389370362, "loss": 2.9909, "step": 909 }, { "epoch": 0.2520077540847411, "grad_norm": 1.692214012145996, "learning_rate": 0.00017257234645707187, "loss": 2.5681, "step": 910 }, { "epoch": 0.2522846856826364, "grad_norm": 1.7105425596237183, "learning_rate": 0.00017251162253624624, "loss": 2.5708, "step": 911 }, { "epoch": 0.2525616172805317, "grad_norm": 1.436359167098999, "learning_rate": 0.00017245084217848913, "loss": 2.7043, "step": 912 }, { "epoch": 0.25283854887842705, "grad_norm": 1.3870446681976318, "learning_rate": 0.0001723900054311068, "loss": 2.8711, "step": 913 }, { "epoch": 0.2531154804763224, "grad_norm": 2.342883348464966, "learning_rate": 0.0001723291123414495, "loss": 2.8609, "step": 914 }, { "epoch": 0.25339241207421764, "grad_norm": 1.8659863471984863, "learning_rate": 0.00017226816295691123, "loss": 2.7894, "step": 915 }, { "epoch": 0.25366934367211297, "grad_norm": 2.27763032913208, "learning_rate": 0.0001722071573249298, "loss": 2.8256, "step": 916 }, { "epoch": 0.2539462752700083, "grad_norm": 2.1597366333007812, "learning_rate": 0.00017214609549298697, "loss": 2.4851, "step": 917 }, { "epoch": 0.2542232068679036, "grad_norm": 1.9769222736358643, "learning_rate": 0.00017208497750860805, "loss": 2.4697, "step": 918 }, { "epoch": 0.25450013846579894, "grad_norm": 2.303415536880493, "learning_rate": 0.00017202380341936212, "loss": 2.5831, "step": 919 }, { "epoch": 0.25477707006369427, "grad_norm": 2.2522268295288086, "learning_rate": 0.00017196257327286194, "loss": 2.8313, "step": 920 }, { "epoch": 0.2550540016615896, "grad_norm": 2.688744306564331, "learning_rate": 0.00017190128711676392, "loss": 2.8566, "step": 921 }, { "epoch": 0.2553309332594849, "grad_norm": 2.4292383193969727, "learning_rate": 0.00017183994499876798, "loss": 2.29, "step": 922 }, { "epoch": 0.25560786485738024, "grad_norm": 6.555003643035889, "learning_rate": 0.00017177854696661773, "loss": 3.0297, "step": 923 }, { "epoch": 0.25588479645527556, "grad_norm": 3.1714563369750977, "learning_rate": 0.00017171709306810012, "loss": 3.1388, "step": 924 }, { "epoch": 0.2561617280531709, "grad_norm": 6.414551734924316, "learning_rate": 0.00017165558335104577, "loss": 4.24, "step": 925 }, { "epoch": 0.2564386596510662, "grad_norm": 2.3539018630981445, "learning_rate": 0.00017159401786332864, "loss": 2.6739, "step": 926 }, { "epoch": 0.2567155912489615, "grad_norm": 1.710160255432129, "learning_rate": 0.00017153239665286605, "loss": 2.6524, "step": 927 }, { "epoch": 0.2569925228468568, "grad_norm": 2.419032573699951, "learning_rate": 0.0001714707197676188, "loss": 2.8205, "step": 928 }, { "epoch": 0.25726945444475213, "grad_norm": 2.155780076980591, "learning_rate": 0.000171408987255591, "loss": 2.5581, "step": 929 }, { "epoch": 0.25754638604264746, "grad_norm": 2.413773775100708, "learning_rate": 0.00017134719916482997, "loss": 2.9444, "step": 930 }, { "epoch": 0.2578233176405428, "grad_norm": 2.2128782272338867, "learning_rate": 0.00017128535554342643, "loss": 3.1043, "step": 931 }, { "epoch": 0.2581002492384381, "grad_norm": 3.0503475666046143, "learning_rate": 0.00017122345643951417, "loss": 2.7248, "step": 932 }, { "epoch": 0.25837718083633343, "grad_norm": 2.457817554473877, "learning_rate": 0.00017116150190127024, "loss": 2.4952, "step": 933 }, { "epoch": 0.25865411243422876, "grad_norm": 2.2783281803131104, "learning_rate": 0.00017109949197691485, "loss": 2.8066, "step": 934 }, { "epoch": 0.2589310440321241, "grad_norm": 3.6404027938842773, "learning_rate": 0.0001710374267147113, "loss": 3.5088, "step": 935 }, { "epoch": 0.2592079756300194, "grad_norm": 13.817218780517578, "learning_rate": 0.00017097530616296593, "loss": 2.9703, "step": 936 }, { "epoch": 0.25948490722791473, "grad_norm": 4.384236812591553, "learning_rate": 0.00017091313037002816, "loss": 2.7491, "step": 937 }, { "epoch": 0.25976183882581, "grad_norm": 36.29664993286133, "learning_rate": 0.00017085089938429038, "loss": 3.0183, "step": 938 }, { "epoch": 0.2600387704237053, "grad_norm": 2.643810987472534, "learning_rate": 0.00017078861325418797, "loss": 2.9312, "step": 939 }, { "epoch": 0.26031570202160065, "grad_norm": 2.287806272506714, "learning_rate": 0.0001707262720281991, "loss": 3.0836, "step": 940 }, { "epoch": 0.260592633619496, "grad_norm": 58.10905456542969, "learning_rate": 0.00017066387575484501, "loss": 3.0783, "step": 941 }, { "epoch": 0.2608695652173913, "grad_norm": 228.61538696289062, "learning_rate": 0.00017060142448268968, "loss": 3.4171, "step": 942 }, { "epoch": 0.2611464968152866, "grad_norm": 14.011260032653809, "learning_rate": 0.0001705389182603399, "loss": 2.582, "step": 943 }, { "epoch": 0.26142342841318195, "grad_norm": 3.490990400314331, "learning_rate": 0.00017047635713644528, "loss": 2.8077, "step": 944 }, { "epoch": 0.26170036001107727, "grad_norm": 1.8019381761550903, "learning_rate": 0.00017041374115969805, "loss": 2.9052, "step": 945 }, { "epoch": 0.2619772916089726, "grad_norm": 22.604679107666016, "learning_rate": 0.00017035107037883328, "loss": 3.0083, "step": 946 }, { "epoch": 0.2622542232068679, "grad_norm": 2.184748888015747, "learning_rate": 0.0001702883448426286, "loss": 3.1804, "step": 947 }, { "epoch": 0.26253115480476324, "grad_norm": 10.158811569213867, "learning_rate": 0.00017022556459990424, "loss": 2.8463, "step": 948 }, { "epoch": 0.26280808640265857, "grad_norm": 9.095647811889648, "learning_rate": 0.00017016272969952304, "loss": 3.5129, "step": 949 }, { "epoch": 0.26308501800055384, "grad_norm": 2.826272487640381, "learning_rate": 0.00017009984019039049, "loss": 3.419, "step": 950 }, { "epoch": 0.26336194959844916, "grad_norm": 3.0602076053619385, "learning_rate": 0.00017003689612145437, "loss": 2.8289, "step": 951 }, { "epoch": 0.2636388811963445, "grad_norm": 1.6230366230010986, "learning_rate": 0.00016997389754170507, "loss": 3.0666, "step": 952 }, { "epoch": 0.2639158127942398, "grad_norm": 1.6178385019302368, "learning_rate": 0.00016991084450017542, "loss": 2.7564, "step": 953 }, { "epoch": 0.26419274439213514, "grad_norm": 1.646022915840149, "learning_rate": 0.0001698477370459405, "loss": 2.8767, "step": 954 }, { "epoch": 0.26446967599003046, "grad_norm": 2.150141954421997, "learning_rate": 0.00016978457522811788, "loss": 2.8903, "step": 955 }, { "epoch": 0.2647466075879258, "grad_norm": 1.6503727436065674, "learning_rate": 0.00016972135909586742, "loss": 2.8843, "step": 956 }, { "epoch": 0.2650235391858211, "grad_norm": 2.434426784515381, "learning_rate": 0.00016965808869839116, "loss": 2.7896, "step": 957 }, { "epoch": 0.26530047078371644, "grad_norm": 2.7235426902770996, "learning_rate": 0.00016959476408493347, "loss": 2.73, "step": 958 }, { "epoch": 0.26557740238161176, "grad_norm": 2.0859873294830322, "learning_rate": 0.00016953138530478092, "loss": 3.0053, "step": 959 }, { "epoch": 0.2658543339795071, "grad_norm": 2.04044771194458, "learning_rate": 0.00016946795240726216, "loss": 2.813, "step": 960 }, { "epoch": 0.26613126557740235, "grad_norm": 1.8185943365097046, "learning_rate": 0.00016940446544174803, "loss": 2.8833, "step": 961 }, { "epoch": 0.2664081971752977, "grad_norm": 2.5354840755462646, "learning_rate": 0.00016934092445765144, "loss": 3.0756, "step": 962 }, { "epoch": 0.266685128773193, "grad_norm": 1.4968249797821045, "learning_rate": 0.00016927732950442728, "loss": 2.9121, "step": 963 }, { "epoch": 0.26696206037108833, "grad_norm": 15.088715553283691, "learning_rate": 0.0001692136806315726, "loss": 2.7988, "step": 964 }, { "epoch": 0.26723899196898365, "grad_norm": 1.238550066947937, "learning_rate": 0.00016914997788862622, "loss": 2.8324, "step": 965 }, { "epoch": 0.267515923566879, "grad_norm": 1.432619571685791, "learning_rate": 0.00016908622132516903, "loss": 2.5692, "step": 966 }, { "epoch": 0.2677928551647743, "grad_norm": 2.6180357933044434, "learning_rate": 0.00016902241099082374, "loss": 3.2998, "step": 967 }, { "epoch": 0.2680697867626696, "grad_norm": 1.5661367177963257, "learning_rate": 0.00016895854693525496, "loss": 2.6682, "step": 968 }, { "epoch": 0.26834671836056495, "grad_norm": 1.7412769794464111, "learning_rate": 0.00016889462920816902, "loss": 2.8722, "step": 969 }, { "epoch": 0.2686236499584603, "grad_norm": 1.6578108072280884, "learning_rate": 0.00016883065785931414, "loss": 2.6731, "step": 970 }, { "epoch": 0.2689005815563556, "grad_norm": 1.3283472061157227, "learning_rate": 0.00016876663293848026, "loss": 2.7675, "step": 971 }, { "epoch": 0.2691775131542509, "grad_norm": 1.5704797506332397, "learning_rate": 0.0001687025544954989, "loss": 2.7657, "step": 972 }, { "epoch": 0.2694544447521462, "grad_norm": 2.236462116241455, "learning_rate": 0.00016863842258024335, "loss": 3.0339, "step": 973 }, { "epoch": 0.2697313763500415, "grad_norm": 2.717695713043213, "learning_rate": 0.00016857423724262849, "loss": 3.0133, "step": 974 }, { "epoch": 0.27000830794793684, "grad_norm": 8.298233985900879, "learning_rate": 0.00016850999853261075, "loss": 3.9595, "step": 975 }, { "epoch": 0.27028523954583217, "grad_norm": 2.5315914154052734, "learning_rate": 0.0001684457065001882, "loss": 2.8995, "step": 976 }, { "epoch": 0.2705621711437275, "grad_norm": 2.19551944732666, "learning_rate": 0.00016838136119540022, "loss": 2.7672, "step": 977 }, { "epoch": 0.2708391027416228, "grad_norm": 1.4843145608901978, "learning_rate": 0.00016831696266832787, "loss": 2.7836, "step": 978 }, { "epoch": 0.27111603433951814, "grad_norm": 1.3755983114242554, "learning_rate": 0.00016825251096909343, "loss": 2.9166, "step": 979 }, { "epoch": 0.27139296593741347, "grad_norm": 2.4390645027160645, "learning_rate": 0.00016818800614786075, "loss": 2.4564, "step": 980 }, { "epoch": 0.2716698975353088, "grad_norm": 1.8554071187973022, "learning_rate": 0.00016812344825483492, "loss": 2.607, "step": 981 }, { "epoch": 0.2719468291332041, "grad_norm": 1.954175353050232, "learning_rate": 0.00016805883734026238, "loss": 3.053, "step": 982 }, { "epoch": 0.27222376073109944, "grad_norm": 1.6565766334533691, "learning_rate": 0.00016799417345443078, "loss": 2.6651, "step": 983 }, { "epoch": 0.27250069232899476, "grad_norm": 1.8020745515823364, "learning_rate": 0.00016792945664766907, "loss": 2.6281, "step": 984 }, { "epoch": 0.27277762392689003, "grad_norm": 2.614210367202759, "learning_rate": 0.0001678646869703473, "loss": 3.0437, "step": 985 }, { "epoch": 0.27305455552478536, "grad_norm": 1.832960844039917, "learning_rate": 0.00016779986447287677, "loss": 2.7291, "step": 986 }, { "epoch": 0.2733314871226807, "grad_norm": 2.2536306381225586, "learning_rate": 0.00016773498920570983, "loss": 3.2322, "step": 987 }, { "epoch": 0.273608418720576, "grad_norm": 2.4348583221435547, "learning_rate": 0.0001676700612193399, "loss": 2.9202, "step": 988 }, { "epoch": 0.27388535031847133, "grad_norm": 196.9659881591797, "learning_rate": 0.00016760508056430152, "loss": 3.5759, "step": 989 }, { "epoch": 0.27416228191636666, "grad_norm": 10.67332935333252, "learning_rate": 0.00016754004729117004, "loss": 2.7365, "step": 990 }, { "epoch": 0.274439213514262, "grad_norm": 1.9112107753753662, "learning_rate": 0.00016747496145056196, "loss": 2.8596, "step": 991 }, { "epoch": 0.2747161451121573, "grad_norm": 2.2627761363983154, "learning_rate": 0.0001674098230931346, "loss": 2.7513, "step": 992 }, { "epoch": 0.27499307671005263, "grad_norm": 1.7090967893600464, "learning_rate": 0.00016734463226958615, "loss": 2.8605, "step": 993 }, { "epoch": 0.27527000830794796, "grad_norm": 1.4351768493652344, "learning_rate": 0.0001672793890306556, "loss": 2.5909, "step": 994 }, { "epoch": 0.2755469399058433, "grad_norm": 1.6425434350967407, "learning_rate": 0.0001672140934271229, "loss": 2.6916, "step": 995 }, { "epoch": 0.27582387150373855, "grad_norm": 5.847080230712891, "learning_rate": 0.00016714874550980853, "loss": 3.4974, "step": 996 }, { "epoch": 0.2761008031016339, "grad_norm": 19.677980422973633, "learning_rate": 0.00016708334532957383, "loss": 4.3623, "step": 997 }, { "epoch": 0.2763777346995292, "grad_norm": 6.76720666885376, "learning_rate": 0.0001670178929373208, "loss": 4.0329, "step": 998 }, { "epoch": 0.2766546662974245, "grad_norm": 9.839909553527832, "learning_rate": 0.00016695238838399206, "loss": 3.7058, "step": 999 }, { "epoch": 0.27693159789531985, "grad_norm": 4.762269496917725, "learning_rate": 0.00016688683172057087, "loss": 3.7894, "step": 1000 }, { "epoch": 0.27720852949321517, "grad_norm": 1.6144564151763916, "learning_rate": 0.0001668212229980809, "loss": 2.6048, "step": 1001 }, { "epoch": 0.2774854610911105, "grad_norm": 2.5514297485351562, "learning_rate": 0.00016675556226758656, "loss": 2.943, "step": 1002 }, { "epoch": 0.2777623926890058, "grad_norm": 3.8206777572631836, "learning_rate": 0.00016668984958019255, "loss": 2.5958, "step": 1003 }, { "epoch": 0.27803932428690115, "grad_norm": 1.4613410234451294, "learning_rate": 0.0001666240849870441, "loss": 2.74, "step": 1004 }, { "epoch": 0.27831625588479647, "grad_norm": 1.8141924142837524, "learning_rate": 0.00016655826853932684, "loss": 2.9371, "step": 1005 }, { "epoch": 0.2785931874826918, "grad_norm": 1.7682191133499146, "learning_rate": 0.00016649240028826674, "loss": 2.6332, "step": 1006 }, { "epoch": 0.2788701190805871, "grad_norm": 2.4179537296295166, "learning_rate": 0.0001664264802851301, "loss": 3.1183, "step": 1007 }, { "epoch": 0.2791470506784824, "grad_norm": 2.072624444961548, "learning_rate": 0.00016636050858122344, "loss": 2.9504, "step": 1008 }, { "epoch": 0.2794239822763777, "grad_norm": 2.388211250305176, "learning_rate": 0.0001662944852278936, "loss": 2.5977, "step": 1009 }, { "epoch": 0.27970091387427304, "grad_norm": 2.1789684295654297, "learning_rate": 0.00016622841027652766, "loss": 2.941, "step": 1010 }, { "epoch": 0.27997784547216836, "grad_norm": 1.181632161140442, "learning_rate": 0.00016616228377855268, "loss": 2.882, "step": 1011 }, { "epoch": 0.2802547770700637, "grad_norm": 1.4988845586776733, "learning_rate": 0.00016609610578543598, "loss": 3.073, "step": 1012 }, { "epoch": 0.280531708667959, "grad_norm": 2.1757216453552246, "learning_rate": 0.000166029876348685, "loss": 2.6687, "step": 1013 }, { "epoch": 0.28080864026585434, "grad_norm": 2.580033779144287, "learning_rate": 0.00016596359551984704, "loss": 3.1823, "step": 1014 }, { "epoch": 0.28108557186374966, "grad_norm": 2.1309080123901367, "learning_rate": 0.00016589726335050956, "loss": 2.8604, "step": 1015 }, { "epoch": 0.281362503461645, "grad_norm": 2.224088191986084, "learning_rate": 0.00016583087989229997, "loss": 3.053, "step": 1016 }, { "epoch": 0.2816394350595403, "grad_norm": 1.8244150876998901, "learning_rate": 0.00016576444519688548, "loss": 3.1972, "step": 1017 }, { "epoch": 0.28191636665743564, "grad_norm": 2.5288257598876953, "learning_rate": 0.00016569795931597328, "loss": 2.8303, "step": 1018 }, { "epoch": 0.28219329825533096, "grad_norm": 1.8015689849853516, "learning_rate": 0.0001656314223013104, "loss": 2.6104, "step": 1019 }, { "epoch": 0.28247022985322623, "grad_norm": 8.759150505065918, "learning_rate": 0.0001655648342046836, "loss": 3.0135, "step": 1020 }, { "epoch": 0.28274716145112155, "grad_norm": 2.5134949684143066, "learning_rate": 0.00016549819507791943, "loss": 2.6073, "step": 1021 }, { "epoch": 0.2830240930490169, "grad_norm": 1.9912227392196655, "learning_rate": 0.00016543150497288424, "loss": 2.949, "step": 1022 }, { "epoch": 0.2833010246469122, "grad_norm": 2.0433785915374756, "learning_rate": 0.0001653647639414839, "loss": 2.6917, "step": 1023 }, { "epoch": 0.2835779562448075, "grad_norm": 1.5686930418014526, "learning_rate": 0.00016529797203566405, "loss": 3.1209, "step": 1024 }, { "epoch": 0.28385488784270285, "grad_norm": 4.718352317810059, "learning_rate": 0.00016523112930740986, "loss": 4.037, "step": 1025 }, { "epoch": 0.2841318194405982, "grad_norm": 2.3235116004943848, "learning_rate": 0.00016516423580874607, "loss": 2.7712, "step": 1026 }, { "epoch": 0.2844087510384935, "grad_norm": 1.234822392463684, "learning_rate": 0.00016509729159173696, "loss": 2.6417, "step": 1027 }, { "epoch": 0.2846856826363888, "grad_norm": 3.115663766860962, "learning_rate": 0.00016503029670848621, "loss": 3.3312, "step": 1028 }, { "epoch": 0.28496261423428415, "grad_norm": 1.5264135599136353, "learning_rate": 0.00016496325121113706, "loss": 2.9245, "step": 1029 }, { "epoch": 0.2852395458321795, "grad_norm": 1.799498438835144, "learning_rate": 0.000164896155151872, "loss": 2.9856, "step": 1030 }, { "epoch": 0.28551647743007474, "grad_norm": 1.908801794052124, "learning_rate": 0.000164829008582913, "loss": 2.6698, "step": 1031 }, { "epoch": 0.28579340902797007, "grad_norm": 1.5490243434906006, "learning_rate": 0.00016476181155652126, "loss": 2.6932, "step": 1032 }, { "epoch": 0.2860703406258654, "grad_norm": 2.0292749404907227, "learning_rate": 0.00016469456412499724, "loss": 2.6477, "step": 1033 }, { "epoch": 0.2863472722237607, "grad_norm": 1.593946099281311, "learning_rate": 0.00016462726634068075, "loss": 2.7718, "step": 1034 }, { "epoch": 0.28662420382165604, "grad_norm": 2.0860631465911865, "learning_rate": 0.00016455991825595066, "loss": 2.6695, "step": 1035 }, { "epoch": 0.28690113541955137, "grad_norm": 2.2195959091186523, "learning_rate": 0.00016449251992322505, "loss": 2.5107, "step": 1036 }, { "epoch": 0.2871780670174467, "grad_norm": 1.1555812358856201, "learning_rate": 0.0001644250713949611, "loss": 2.7165, "step": 1037 }, { "epoch": 0.287454998615342, "grad_norm": 1.8912162780761719, "learning_rate": 0.00016435757272365505, "loss": 2.9361, "step": 1038 }, { "epoch": 0.28773193021323734, "grad_norm": 1.9437050819396973, "learning_rate": 0.00016429002396184215, "loss": 2.8861, "step": 1039 }, { "epoch": 0.28800886181113267, "grad_norm": 1.3384404182434082, "learning_rate": 0.00016422242516209672, "loss": 2.7149, "step": 1040 }, { "epoch": 0.288285793409028, "grad_norm": 1.8033853769302368, "learning_rate": 0.0001641547763770319, "loss": 2.9564, "step": 1041 }, { "epoch": 0.2885627250069233, "grad_norm": 2.3526084423065186, "learning_rate": 0.00016408707765929985, "loss": 2.9201, "step": 1042 }, { "epoch": 0.2888396566048186, "grad_norm": 1.8857325315475464, "learning_rate": 0.0001640193290615915, "loss": 3.0058, "step": 1043 }, { "epoch": 0.2891165882027139, "grad_norm": 2.3668930530548096, "learning_rate": 0.00016395153063663667, "loss": 2.5814, "step": 1044 }, { "epoch": 0.28939351980060923, "grad_norm": 1.8008521795272827, "learning_rate": 0.00016388368243720392, "loss": 2.4955, "step": 1045 }, { "epoch": 0.28967045139850456, "grad_norm": 1.3481143712997437, "learning_rate": 0.00016381578451610062, "loss": 2.5505, "step": 1046 }, { "epoch": 0.2899473829963999, "grad_norm": 2.66656494140625, "learning_rate": 0.0001637478369261727, "loss": 2.4911, "step": 1047 }, { "epoch": 0.2902243145942952, "grad_norm": 1.821694016456604, "learning_rate": 0.0001636798397203049, "loss": 2.4427, "step": 1048 }, { "epoch": 0.29050124619219053, "grad_norm": 4.15843391418457, "learning_rate": 0.00016361179295142046, "loss": 3.3711, "step": 1049 }, { "epoch": 0.29077817779008586, "grad_norm": 4.490833282470703, "learning_rate": 0.00016354369667248128, "loss": 4.0469, "step": 1050 }, { "epoch": 0.2910551093879812, "grad_norm": 2.0667965412139893, "learning_rate": 0.00016347555093648776, "loss": 2.6667, "step": 1051 }, { "epoch": 0.2913320409858765, "grad_norm": 2.4750208854675293, "learning_rate": 0.0001634073557964788, "loss": 3.2475, "step": 1052 }, { "epoch": 0.29160897258377183, "grad_norm": 15.866964340209961, "learning_rate": 0.00016333911130553172, "loss": 2.8132, "step": 1053 }, { "epoch": 0.29188590418166716, "grad_norm": 2.471449375152588, "learning_rate": 0.00016327081751676227, "loss": 2.9777, "step": 1054 }, { "epoch": 0.2921628357795624, "grad_norm": 1.6980383396148682, "learning_rate": 0.0001632024744833246, "loss": 2.9916, "step": 1055 }, { "epoch": 0.29243976737745775, "grad_norm": 1.6648608446121216, "learning_rate": 0.00016313408225841118, "loss": 2.6989, "step": 1056 }, { "epoch": 0.2927166989753531, "grad_norm": 2.11301589012146, "learning_rate": 0.00016306564089525272, "loss": 2.9521, "step": 1057 }, { "epoch": 0.2929936305732484, "grad_norm": 1.940384030342102, "learning_rate": 0.0001629971504471182, "loss": 2.6839, "step": 1058 }, { "epoch": 0.2932705621711437, "grad_norm": 1.652581810951233, "learning_rate": 0.0001629286109673148, "loss": 2.9663, "step": 1059 }, { "epoch": 0.29354749376903905, "grad_norm": 1.4100010395050049, "learning_rate": 0.00016286002250918792, "loss": 2.8476, "step": 1060 }, { "epoch": 0.29382442536693437, "grad_norm": 2.3216185569763184, "learning_rate": 0.000162791385126121, "loss": 2.8777, "step": 1061 }, { "epoch": 0.2941013569648297, "grad_norm": 2.377795934677124, "learning_rate": 0.00016272269887153559, "loss": 2.8286, "step": 1062 }, { "epoch": 0.294378288562725, "grad_norm": 1.5239886045455933, "learning_rate": 0.00016265396379889127, "loss": 3.0177, "step": 1063 }, { "epoch": 0.29465522016062035, "grad_norm": 1.657238245010376, "learning_rate": 0.00016258517996168564, "loss": 2.6564, "step": 1064 }, { "epoch": 0.29493215175851567, "grad_norm": 3.806039571762085, "learning_rate": 0.00016251634741345423, "loss": 2.6983, "step": 1065 }, { "epoch": 0.29520908335641094, "grad_norm": 3.4286303520202637, "learning_rate": 0.0001624474662077705, "loss": 2.9889, "step": 1066 }, { "epoch": 0.29548601495430626, "grad_norm": 1.889386534690857, "learning_rate": 0.00016237853639824575, "loss": 2.8944, "step": 1067 }, { "epoch": 0.2957629465522016, "grad_norm": 2.8036301136016846, "learning_rate": 0.00016230955803852912, "loss": 3.0058, "step": 1068 }, { "epoch": 0.2960398781500969, "grad_norm": 2.555205821990967, "learning_rate": 0.0001622405311823076, "loss": 2.6748, "step": 1069 }, { "epoch": 0.29631680974799224, "grad_norm": 2.4662954807281494, "learning_rate": 0.00016217145588330584, "loss": 2.7295, "step": 1070 }, { "epoch": 0.29659374134588756, "grad_norm": 3.151988983154297, "learning_rate": 0.00016210233219528624, "loss": 2.9012, "step": 1071 }, { "epoch": 0.2968706729437829, "grad_norm": 1.1206333637237549, "learning_rate": 0.0001620331601720488, "loss": 2.6216, "step": 1072 }, { "epoch": 0.2971476045416782, "grad_norm": 2.9131011962890625, "learning_rate": 0.0001619639398674313, "loss": 2.7355, "step": 1073 }, { "epoch": 0.29742453613957354, "grad_norm": 3.1310834884643555, "learning_rate": 0.00016189467133530884, "loss": 3.048, "step": 1074 }, { "epoch": 0.29770146773746886, "grad_norm": 5.254419326782227, "learning_rate": 0.00016182535462959434, "loss": 4.0407, "step": 1075 }, { "epoch": 0.2979783993353642, "grad_norm": 1.8420734405517578, "learning_rate": 0.00016175598980423797, "loss": 3.0963, "step": 1076 }, { "epoch": 0.2982553309332595, "grad_norm": 1.8723719120025635, "learning_rate": 0.00016168657691322755, "loss": 2.793, "step": 1077 }, { "epoch": 0.2985322625311548, "grad_norm": 2.471306085586548, "learning_rate": 0.00016161711601058815, "loss": 2.654, "step": 1078 }, { "epoch": 0.2988091941290501, "grad_norm": 2.581939220428467, "learning_rate": 0.0001615476071503823, "loss": 3.0836, "step": 1079 }, { "epoch": 0.29908612572694543, "grad_norm": 2.18522572517395, "learning_rate": 0.00016147805038670986, "loss": 3.0257, "step": 1080 }, { "epoch": 0.29936305732484075, "grad_norm": 2.2486062049865723, "learning_rate": 0.00016140844577370791, "loss": 2.814, "step": 1081 }, { "epoch": 0.2996399889227361, "grad_norm": 2.070225238800049, "learning_rate": 0.00016133879336555085, "loss": 2.7629, "step": 1082 }, { "epoch": 0.2999169205206314, "grad_norm": 1.965770959854126, "learning_rate": 0.00016126909321645023, "loss": 2.7618, "step": 1083 }, { "epoch": 0.3001938521185267, "grad_norm": 3.0630273818969727, "learning_rate": 0.0001611993453806547, "loss": 2.7491, "step": 1084 }, { "epoch": 0.30047078371642205, "grad_norm": 1.745874047279358, "learning_rate": 0.00016112954991245022, "loss": 2.6349, "step": 1085 }, { "epoch": 0.3007477153143174, "grad_norm": 2.672760248184204, "learning_rate": 0.0001610597068661596, "loss": 2.659, "step": 1086 }, { "epoch": 0.3010246469122127, "grad_norm": 1.7880834341049194, "learning_rate": 0.00016098981629614277, "loss": 2.5338, "step": 1087 }, { "epoch": 0.301301578510108, "grad_norm": 1.5602076053619385, "learning_rate": 0.00016091987825679672, "loss": 2.9071, "step": 1088 }, { "epoch": 0.30157851010800335, "grad_norm": 2.032357931137085, "learning_rate": 0.0001608498928025553, "loss": 2.8791, "step": 1089 }, { "epoch": 0.3018554417058986, "grad_norm": 1.5926483869552612, "learning_rate": 0.00016077985998788925, "loss": 2.867, "step": 1090 }, { "epoch": 0.30213237330379394, "grad_norm": 3.0426523685455322, "learning_rate": 0.00016070977986730624, "loss": 3.0692, "step": 1091 }, { "epoch": 0.30240930490168927, "grad_norm": 1.8183352947235107, "learning_rate": 0.00016063965249535074, "loss": 2.8024, "step": 1092 }, { "epoch": 0.3026862364995846, "grad_norm": 1.9122745990753174, "learning_rate": 0.00016056947792660392, "loss": 2.7971, "step": 1093 }, { "epoch": 0.3029631680974799, "grad_norm": 1.4231312274932861, "learning_rate": 0.00016049925621568382, "loss": 2.4285, "step": 1094 }, { "epoch": 0.30324009969537524, "grad_norm": 1.6702405214309692, "learning_rate": 0.00016042898741724503, "loss": 2.5288, "step": 1095 }, { "epoch": 0.30351703129327057, "grad_norm": 2.072124719619751, "learning_rate": 0.0001603586715859789, "loss": 2.9134, "step": 1096 }, { "epoch": 0.3037939628911659, "grad_norm": 2.3705649375915527, "learning_rate": 0.00016028830877661335, "loss": 2.8175, "step": 1097 }, { "epoch": 0.3040708944890612, "grad_norm": 3.557770013809204, "learning_rate": 0.00016021789904391282, "loss": 3.0662, "step": 1098 }, { "epoch": 0.30434782608695654, "grad_norm": 2.803607940673828, "learning_rate": 0.00016014744244267833, "loss": 3.1365, "step": 1099 }, { "epoch": 0.30462475768485187, "grad_norm": 2.111748695373535, "learning_rate": 0.00016007693902774737, "loss": 3.2262, "step": 1100 }, { "epoch": 0.30490168928274713, "grad_norm": 2.4110352993011475, "learning_rate": 0.0001600063888539938, "loss": 2.7931, "step": 1101 }, { "epoch": 0.30517862088064246, "grad_norm": 1.7806950807571411, "learning_rate": 0.00015993579197632796, "loss": 2.8071, "step": 1102 }, { "epoch": 0.3054555524785378, "grad_norm": 1.2968131303787231, "learning_rate": 0.00015986514844969652, "loss": 2.594, "step": 1103 }, { "epoch": 0.3057324840764331, "grad_norm": 2.446486473083496, "learning_rate": 0.00015979445832908242, "loss": 2.6563, "step": 1104 }, { "epoch": 0.30600941567432843, "grad_norm": 1.2931456565856934, "learning_rate": 0.0001597237216695049, "loss": 2.8281, "step": 1105 }, { "epoch": 0.30628634727222376, "grad_norm": 2.0900042057037354, "learning_rate": 0.00015965293852601944, "loss": 2.7806, "step": 1106 }, { "epoch": 0.3065632788701191, "grad_norm": 2.632905960083008, "learning_rate": 0.0001595821089537176, "loss": 2.9521, "step": 1107 }, { "epoch": 0.3068402104680144, "grad_norm": 2.728095769882202, "learning_rate": 0.00015951123300772717, "loss": 2.8188, "step": 1108 }, { "epoch": 0.30711714206590973, "grad_norm": 1.6814217567443848, "learning_rate": 0.00015944031074321204, "loss": 2.4426, "step": 1109 }, { "epoch": 0.30739407366380506, "grad_norm": 1.18573796749115, "learning_rate": 0.0001593693422153721, "loss": 2.734, "step": 1110 }, { "epoch": 0.3076710052617004, "grad_norm": 2.4105420112609863, "learning_rate": 0.00015929832747944324, "loss": 3.0473, "step": 1111 }, { "epoch": 0.3079479368595957, "grad_norm": 2.6489510536193848, "learning_rate": 0.0001592272665906974, "loss": 3.3682, "step": 1112 }, { "epoch": 0.308224868457491, "grad_norm": 1.4610475301742554, "learning_rate": 0.00015915615960444233, "loss": 2.6737, "step": 1113 }, { "epoch": 0.3085018000553863, "grad_norm": 1.8061065673828125, "learning_rate": 0.00015908500657602174, "loss": 2.8074, "step": 1114 }, { "epoch": 0.3087787316532816, "grad_norm": 1.55250883102417, "learning_rate": 0.00015901380756081515, "loss": 2.9482, "step": 1115 }, { "epoch": 0.30905566325117695, "grad_norm": 1.9959843158721924, "learning_rate": 0.0001589425626142378, "loss": 3.1434, "step": 1116 }, { "epoch": 0.3093325948490723, "grad_norm": 1.5615752935409546, "learning_rate": 0.00015887127179174085, "loss": 2.9626, "step": 1117 }, { "epoch": 0.3096095264469676, "grad_norm": 1.5040863752365112, "learning_rate": 0.00015879993514881103, "loss": 3.0734, "step": 1118 }, { "epoch": 0.3098864580448629, "grad_norm": 2.326559066772461, "learning_rate": 0.0001587285527409707, "loss": 2.7417, "step": 1119 }, { "epoch": 0.31016338964275825, "grad_norm": 1.2279635667800903, "learning_rate": 0.00015865712462377802, "loss": 2.6415, "step": 1120 }, { "epoch": 0.31044032124065357, "grad_norm": 1.839401364326477, "learning_rate": 0.0001585856508528265, "loss": 2.6821, "step": 1121 }, { "epoch": 0.3107172528385489, "grad_norm": 1.4215173721313477, "learning_rate": 0.00015851413148374537, "loss": 2.674, "step": 1122 }, { "epoch": 0.3109941844364442, "grad_norm": 1.4573885202407837, "learning_rate": 0.00015844256657219927, "loss": 2.8351, "step": 1123 }, { "epoch": 0.31127111603433955, "grad_norm": 2.0855629444122314, "learning_rate": 0.00015837095617388827, "loss": 2.8772, "step": 1124 }, { "epoch": 0.3115480476322348, "grad_norm": 3.5365757942199707, "learning_rate": 0.00015829930034454783, "loss": 3.5358, "step": 1125 }, { "epoch": 0.31182497923013014, "grad_norm": 1.9074093103408813, "learning_rate": 0.00015822759913994888, "loss": 3.1212, "step": 1126 }, { "epoch": 0.31210191082802546, "grad_norm": 1.4721168279647827, "learning_rate": 0.00015815585261589753, "loss": 2.6182, "step": 1127 }, { "epoch": 0.3123788424259208, "grad_norm": 1.8700261116027832, "learning_rate": 0.00015808406082823528, "loss": 2.8233, "step": 1128 }, { "epoch": 0.3126557740238161, "grad_norm": 1.7332760095596313, "learning_rate": 0.0001580122238328387, "loss": 2.9136, "step": 1129 }, { "epoch": 0.31293270562171144, "grad_norm": 1.2909797430038452, "learning_rate": 0.00015794034168561982, "loss": 3.0554, "step": 1130 }, { "epoch": 0.31320963721960676, "grad_norm": 1.7902755737304688, "learning_rate": 0.00015786841444252546, "loss": 2.7218, "step": 1131 }, { "epoch": 0.3134865688175021, "grad_norm": 1.855206847190857, "learning_rate": 0.00015779644215953783, "loss": 3.0041, "step": 1132 }, { "epoch": 0.3137635004153974, "grad_norm": 1.2451910972595215, "learning_rate": 0.00015772442489267406, "loss": 2.6642, "step": 1133 }, { "epoch": 0.31404043201329274, "grad_norm": 1.616047739982605, "learning_rate": 0.00015765236269798627, "loss": 2.9032, "step": 1134 }, { "epoch": 0.31431736361118806, "grad_norm": 1.1861200332641602, "learning_rate": 0.00015758025563156167, "loss": 2.6461, "step": 1135 }, { "epoch": 0.31459429520908333, "grad_norm": 1.9919323921203613, "learning_rate": 0.00015750810374952226, "loss": 3.0355, "step": 1136 }, { "epoch": 0.31487122680697865, "grad_norm": 1.5378572940826416, "learning_rate": 0.00015743590710802505, "loss": 2.8951, "step": 1137 }, { "epoch": 0.315148158404874, "grad_norm": 1.3112939596176147, "learning_rate": 0.00015736366576326173, "loss": 2.727, "step": 1138 }, { "epoch": 0.3154250900027693, "grad_norm": 1.2822314500808716, "learning_rate": 0.00015729137977145893, "loss": 2.7409, "step": 1139 }, { "epoch": 0.31570202160066463, "grad_norm": 1.6411978006362915, "learning_rate": 0.00015721904918887796, "loss": 2.9995, "step": 1140 }, { "epoch": 0.31597895319855995, "grad_norm": 2.176082134246826, "learning_rate": 0.00015714667407181482, "loss": 2.84, "step": 1141 }, { "epoch": 0.3162558847964553, "grad_norm": 2.9109854698181152, "learning_rate": 0.00015707425447660021, "loss": 2.7467, "step": 1142 }, { "epoch": 0.3165328163943506, "grad_norm": 1.649181842803955, "learning_rate": 0.00015700179045959942, "loss": 2.6447, "step": 1143 }, { "epoch": 0.3168097479922459, "grad_norm": 2.1657958030700684, "learning_rate": 0.0001569292820772124, "loss": 2.8329, "step": 1144 }, { "epoch": 0.31708667959014125, "grad_norm": 2.1416025161743164, "learning_rate": 0.00015685672938587345, "loss": 2.6883, "step": 1145 }, { "epoch": 0.3173636111880366, "grad_norm": 2.3641982078552246, "learning_rate": 0.00015678413244205156, "loss": 2.6026, "step": 1146 }, { "epoch": 0.3176405427859319, "grad_norm": 2.626178741455078, "learning_rate": 0.00015671149130225005, "loss": 3.1148, "step": 1147 }, { "epoch": 0.31791747438382717, "grad_norm": 2.206259250640869, "learning_rate": 0.00015663880602300657, "loss": 2.8485, "step": 1148 }, { "epoch": 0.3181944059817225, "grad_norm": 2.893170118331909, "learning_rate": 0.00015656607666089334, "loss": 3.3303, "step": 1149 }, { "epoch": 0.3184713375796178, "grad_norm": 5.792006492614746, "learning_rate": 0.0001564933032725167, "loss": 4.157, "step": 1150 }, { "epoch": 0.31874826917751314, "grad_norm": 1.754286527633667, "learning_rate": 0.0001564204859145173, "loss": 2.7435, "step": 1151 }, { "epoch": 0.31902520077540847, "grad_norm": 1.5062384605407715, "learning_rate": 0.00015634762464357004, "loss": 2.7834, "step": 1152 }, { "epoch": 0.3193021323733038, "grad_norm": 1.6033989191055298, "learning_rate": 0.00015627471951638403, "loss": 2.8711, "step": 1153 }, { "epoch": 0.3195790639711991, "grad_norm": 3.486161231994629, "learning_rate": 0.0001562017705897024, "loss": 2.5715, "step": 1154 }, { "epoch": 0.31985599556909444, "grad_norm": 1.360491156578064, "learning_rate": 0.00015612877792030248, "loss": 2.769, "step": 1155 }, { "epoch": 0.32013292716698977, "grad_norm": 1.607000470161438, "learning_rate": 0.00015605574156499568, "loss": 2.6347, "step": 1156 }, { "epoch": 0.3204098587648851, "grad_norm": 1.483081579208374, "learning_rate": 0.0001559826615806272, "loss": 2.8894, "step": 1157 }, { "epoch": 0.3206867903627804, "grad_norm": 2.1092448234558105, "learning_rate": 0.00015590953802407644, "loss": 2.8086, "step": 1158 }, { "epoch": 0.32096372196067574, "grad_norm": 1.5998456478118896, "learning_rate": 0.00015583637095225656, "loss": 2.918, "step": 1159 }, { "epoch": 0.321240653558571, "grad_norm": 1.6553558111190796, "learning_rate": 0.00015576316042211467, "loss": 2.963, "step": 1160 }, { "epoch": 0.32151758515646633, "grad_norm": 2.043443441390991, "learning_rate": 0.00015568990649063164, "loss": 2.7848, "step": 1161 }, { "epoch": 0.32179451675436166, "grad_norm": 1.8811475038528442, "learning_rate": 0.00015561660921482217, "loss": 2.87, "step": 1162 }, { "epoch": 0.322071448352257, "grad_norm": 1.8709558248519897, "learning_rate": 0.00015554326865173469, "loss": 2.8862, "step": 1163 }, { "epoch": 0.3223483799501523, "grad_norm": 1.5902208089828491, "learning_rate": 0.00015546988485845125, "loss": 3.2132, "step": 1164 }, { "epoch": 0.32262531154804763, "grad_norm": 2.2096242904663086, "learning_rate": 0.00015539645789208772, "loss": 2.8945, "step": 1165 }, { "epoch": 0.32290224314594296, "grad_norm": 1.1872581243515015, "learning_rate": 0.00015532298780979336, "loss": 2.6953, "step": 1166 }, { "epoch": 0.3231791747438383, "grad_norm": 2.396256685256958, "learning_rate": 0.00015524947466875107, "loss": 2.9257, "step": 1167 }, { "epoch": 0.3234561063417336, "grad_norm": 1.961571455001831, "learning_rate": 0.00015517591852617736, "loss": 2.4849, "step": 1168 }, { "epoch": 0.32373303793962893, "grad_norm": 2.79026460647583, "learning_rate": 0.0001551023194393221, "loss": 2.7722, "step": 1169 }, { "epoch": 0.32400996953752426, "grad_norm": 1.9868056774139404, "learning_rate": 0.00015502867746546858, "loss": 2.6865, "step": 1170 }, { "epoch": 0.3242869011354195, "grad_norm": 1.6515250205993652, "learning_rate": 0.00015495499266193354, "loss": 2.9801, "step": 1171 }, { "epoch": 0.32456383273331485, "grad_norm": 1.5028882026672363, "learning_rate": 0.00015488126508606703, "loss": 2.7559, "step": 1172 }, { "epoch": 0.3248407643312102, "grad_norm": 2.3353559970855713, "learning_rate": 0.0001548074947952523, "loss": 2.845, "step": 1173 }, { "epoch": 0.3251176959291055, "grad_norm": 1.6773449182510376, "learning_rate": 0.00015473368184690597, "loss": 2.9368, "step": 1174 }, { "epoch": 0.3253946275270008, "grad_norm": 4.547130107879639, "learning_rate": 0.00015465982629847783, "loss": 4.1124, "step": 1175 }, { "epoch": 0.32567155912489615, "grad_norm": 1.3638783693313599, "learning_rate": 0.00015458592820745079, "loss": 2.9854, "step": 1176 }, { "epoch": 0.3259484907227915, "grad_norm": 1.6351149082183838, "learning_rate": 0.0001545119876313409, "loss": 2.6985, "step": 1177 }, { "epoch": 0.3262254223206868, "grad_norm": 2.6430916786193848, "learning_rate": 0.00015443800462769727, "loss": 3.2984, "step": 1178 }, { "epoch": 0.3265023539185821, "grad_norm": 1.3868046998977661, "learning_rate": 0.00015436397925410201, "loss": 2.6771, "step": 1179 }, { "epoch": 0.32677928551647745, "grad_norm": 2.804959535598755, "learning_rate": 0.00015428991156817029, "loss": 3.083, "step": 1180 }, { "epoch": 0.32705621711437277, "grad_norm": 1.8770880699157715, "learning_rate": 0.00015421580162755004, "loss": 2.8515, "step": 1181 }, { "epoch": 0.3273331487122681, "grad_norm": 1.4491260051727295, "learning_rate": 0.00015414164948992227, "loss": 2.7167, "step": 1182 }, { "epoch": 0.32761008031016337, "grad_norm": 1.5363740921020508, "learning_rate": 0.00015406745521300073, "loss": 2.74, "step": 1183 }, { "epoch": 0.3278870119080587, "grad_norm": 1.8011248111724854, "learning_rate": 0.00015399321885453202, "loss": 2.7164, "step": 1184 }, { "epoch": 0.328163943505954, "grad_norm": 2.1668336391448975, "learning_rate": 0.00015391894047229538, "loss": 2.9057, "step": 1185 }, { "epoch": 0.32844087510384934, "grad_norm": 1.4623689651489258, "learning_rate": 0.00015384462012410293, "loss": 2.7671, "step": 1186 }, { "epoch": 0.32871780670174466, "grad_norm": 1.0454082489013672, "learning_rate": 0.00015377025786779928, "loss": 2.982, "step": 1187 }, { "epoch": 0.32899473829964, "grad_norm": 1.496302604675293, "learning_rate": 0.0001536958537612618, "loss": 2.561, "step": 1188 }, { "epoch": 0.3292716698975353, "grad_norm": 2.425139904022217, "learning_rate": 0.00015362140786240035, "loss": 2.9257, "step": 1189 }, { "epoch": 0.32954860149543064, "grad_norm": 1.459738850593567, "learning_rate": 0.00015354692022915732, "loss": 3.065, "step": 1190 }, { "epoch": 0.32982553309332596, "grad_norm": 3.5508320331573486, "learning_rate": 0.00015347239091950763, "loss": 3.0286, "step": 1191 }, { "epoch": 0.3301024646912213, "grad_norm": 1.521572232246399, "learning_rate": 0.0001533978199914586, "loss": 2.7615, "step": 1192 }, { "epoch": 0.3303793962891166, "grad_norm": 2.401167392730713, "learning_rate": 0.00015332320750304992, "loss": 3.0155, "step": 1193 }, { "epoch": 0.3306563278870119, "grad_norm": 1.7472994327545166, "learning_rate": 0.00015324855351235372, "loss": 2.5185, "step": 1194 }, { "epoch": 0.3309332594849072, "grad_norm": 1.205310344696045, "learning_rate": 0.00015317385807747433, "loss": 2.5906, "step": 1195 }, { "epoch": 0.33121019108280253, "grad_norm": 1.574655294418335, "learning_rate": 0.0001530991212565484, "loss": 2.8073, "step": 1196 }, { "epoch": 0.33148712268069785, "grad_norm": 2.1971731185913086, "learning_rate": 0.00015302434310774472, "loss": 2.9192, "step": 1197 }, { "epoch": 0.3317640542785932, "grad_norm": 1.4974596500396729, "learning_rate": 0.00015294952368926433, "loss": 2.7797, "step": 1198 }, { "epoch": 0.3320409858764885, "grad_norm": 1.7164533138275146, "learning_rate": 0.00015287466305934037, "loss": 2.8779, "step": 1199 }, { "epoch": 0.33231791747438383, "grad_norm": 4.789784908294678, "learning_rate": 0.000152799761276238, "loss": 4.2732, "step": 1200 }, { "epoch": 0.33259484907227915, "grad_norm": 1.7506219148635864, "learning_rate": 0.00015272481839825452, "loss": 2.788, "step": 1201 }, { "epoch": 0.3328717806701745, "grad_norm": 2.5078749656677246, "learning_rate": 0.00015264983448371906, "loss": 3.158, "step": 1202 }, { "epoch": 0.3331487122680698, "grad_norm": 1.8120169639587402, "learning_rate": 0.00015257480959099282, "loss": 2.5553, "step": 1203 }, { "epoch": 0.3334256438659651, "grad_norm": 1.7837135791778564, "learning_rate": 0.0001524997437784689, "loss": 2.6184, "step": 1204 }, { "epoch": 0.33370257546386045, "grad_norm": 1.3695642948150635, "learning_rate": 0.00015242463710457208, "loss": 2.673, "step": 1205 }, { "epoch": 0.3339795070617557, "grad_norm": 1.6549372673034668, "learning_rate": 0.0001523494896277591, "loss": 2.6288, "step": 1206 }, { "epoch": 0.33425643865965105, "grad_norm": 2.5411343574523926, "learning_rate": 0.0001522743014065185, "loss": 2.8383, "step": 1207 }, { "epoch": 0.33453337025754637, "grad_norm": 2.1475119590759277, "learning_rate": 0.00015219907249937036, "loss": 2.9761, "step": 1208 }, { "epoch": 0.3348103018554417, "grad_norm": 2.9487884044647217, "learning_rate": 0.00015212380296486652, "loss": 2.8682, "step": 1209 }, { "epoch": 0.335087233453337, "grad_norm": 2.9902172088623047, "learning_rate": 0.00015204849286159052, "loss": 3.1509, "step": 1210 }, { "epoch": 0.33536416505123234, "grad_norm": 1.7279140949249268, "learning_rate": 0.0001519731422481573, "loss": 2.7741, "step": 1211 }, { "epoch": 0.33564109664912767, "grad_norm": 2.067688465118408, "learning_rate": 0.00015189775118321346, "loss": 2.9027, "step": 1212 }, { "epoch": 0.335918028247023, "grad_norm": 1.4905089139938354, "learning_rate": 0.00015182231972543707, "loss": 3.028, "step": 1213 }, { "epoch": 0.3361949598449183, "grad_norm": 1.398120403289795, "learning_rate": 0.0001517468479335376, "loss": 3.0139, "step": 1214 }, { "epoch": 0.33647189144281364, "grad_norm": 2.3304412364959717, "learning_rate": 0.00015167133586625587, "loss": 3.2774, "step": 1215 }, { "epoch": 0.33674882304070897, "grad_norm": 1.7044647932052612, "learning_rate": 0.00015159578358236422, "loss": 2.5921, "step": 1216 }, { "epoch": 0.3370257546386043, "grad_norm": 1.8110103607177734, "learning_rate": 0.00015152019114066608, "loss": 2.9911, "step": 1217 }, { "epoch": 0.33730268623649956, "grad_norm": 1.3686960935592651, "learning_rate": 0.0001514445585999962, "loss": 2.7397, "step": 1218 }, { "epoch": 0.3375796178343949, "grad_norm": 2.4159533977508545, "learning_rate": 0.00015136888601922072, "loss": 2.9673, "step": 1219 }, { "epoch": 0.3378565494322902, "grad_norm": 1.7092558145523071, "learning_rate": 0.00015129317345723665, "loss": 2.6965, "step": 1220 }, { "epoch": 0.33813348103018553, "grad_norm": 1.7533438205718994, "learning_rate": 0.00015121742097297237, "loss": 2.7309, "step": 1221 }, { "epoch": 0.33841041262808086, "grad_norm": 1.2231271266937256, "learning_rate": 0.00015114162862538713, "loss": 2.9981, "step": 1222 }, { "epoch": 0.3386873442259762, "grad_norm": 1.5578258037567139, "learning_rate": 0.00015106579647347136, "loss": 2.9053, "step": 1223 }, { "epoch": 0.3389642758238715, "grad_norm": 1.9765286445617676, "learning_rate": 0.0001509899245762464, "loss": 3.091, "step": 1224 }, { "epoch": 0.33924120742176683, "grad_norm": 4.126635551452637, "learning_rate": 0.00015091401299276456, "loss": 3.932, "step": 1225 }, { "epoch": 0.33951813901966216, "grad_norm": 2.4818685054779053, "learning_rate": 0.00015083806178210895, "loss": 2.709, "step": 1226 }, { "epoch": 0.3397950706175575, "grad_norm": 1.4024157524108887, "learning_rate": 0.00015076207100339363, "loss": 2.7262, "step": 1227 }, { "epoch": 0.3400720022154528, "grad_norm": 1.5914798974990845, "learning_rate": 0.00015068604071576344, "loss": 2.6496, "step": 1228 }, { "epoch": 0.3403489338133481, "grad_norm": 2.053748607635498, "learning_rate": 0.00015060997097839386, "loss": 2.8565, "step": 1229 }, { "epoch": 0.3406258654112434, "grad_norm": 1.3927969932556152, "learning_rate": 0.00015053386185049123, "loss": 2.8148, "step": 1230 }, { "epoch": 0.3409027970091387, "grad_norm": 1.5601234436035156, "learning_rate": 0.00015045771339129246, "loss": 2.6251, "step": 1231 }, { "epoch": 0.34117972860703405, "grad_norm": 2.194117546081543, "learning_rate": 0.00015038152566006509, "loss": 2.6468, "step": 1232 }, { "epoch": 0.3414566602049294, "grad_norm": 1.6312081813812256, "learning_rate": 0.00015030529871610718, "loss": 2.8919, "step": 1233 }, { "epoch": 0.3417335918028247, "grad_norm": 1.736798882484436, "learning_rate": 0.00015022903261874748, "loss": 2.6619, "step": 1234 }, { "epoch": 0.34201052340072, "grad_norm": 2.73242449760437, "learning_rate": 0.00015015272742734492, "loss": 3.0658, "step": 1235 }, { "epoch": 0.34228745499861535, "grad_norm": 2.544443130493164, "learning_rate": 0.00015007638320128913, "loss": 3.2522, "step": 1236 }, { "epoch": 0.3425643865965107, "grad_norm": 2.2239553928375244, "learning_rate": 0.00015000000000000001, "loss": 2.7484, "step": 1237 }, { "epoch": 0.342841318194406, "grad_norm": 1.7828017473220825, "learning_rate": 0.00014992357788292776, "loss": 2.6737, "step": 1238 }, { "epoch": 0.3431182497923013, "grad_norm": 1.5759692192077637, "learning_rate": 0.00014984711690955297, "loss": 2.7174, "step": 1239 }, { "epoch": 0.34339518139019665, "grad_norm": 1.4020365476608276, "learning_rate": 0.00014977061713938636, "loss": 2.8425, "step": 1240 }, { "epoch": 0.3436721129880919, "grad_norm": 1.759036660194397, "learning_rate": 0.0001496940786319689, "loss": 2.9138, "step": 1241 }, { "epoch": 0.34394904458598724, "grad_norm": 1.7949292659759521, "learning_rate": 0.00014961750144687174, "loss": 2.7142, "step": 1242 }, { "epoch": 0.34422597618388256, "grad_norm": 1.4500319957733154, "learning_rate": 0.0001495408856436961, "loss": 2.7205, "step": 1243 }, { "epoch": 0.3445029077817779, "grad_norm": 2.0495407581329346, "learning_rate": 0.00014946423128207322, "loss": 2.7994, "step": 1244 }, { "epoch": 0.3447798393796732, "grad_norm": 2.7560412883758545, "learning_rate": 0.00014938753842166444, "loss": 2.76, "step": 1245 }, { "epoch": 0.34505677097756854, "grad_norm": 2.827164888381958, "learning_rate": 0.00014931080712216104, "loss": 2.9787, "step": 1246 }, { "epoch": 0.34533370257546386, "grad_norm": 1.410880446434021, "learning_rate": 0.00014923403744328408, "loss": 3.0478, "step": 1247 }, { "epoch": 0.3456106341733592, "grad_norm": 1.8312749862670898, "learning_rate": 0.0001491572294447847, "loss": 2.7135, "step": 1248 }, { "epoch": 0.3458875657712545, "grad_norm": 3.0767037868499756, "learning_rate": 0.00014908038318644373, "loss": 3.2266, "step": 1249 }, { "epoch": 0.34616449736914984, "grad_norm": 5.677850723266602, "learning_rate": 0.00014900349872807182, "loss": 4.2348, "step": 1250 }, { "epoch": 0.34644142896704516, "grad_norm": 2.243927240371704, "learning_rate": 0.00014892657612950935, "loss": 2.9267, "step": 1251 }, { "epoch": 0.3467183605649405, "grad_norm": 2.399611234664917, "learning_rate": 0.0001488496154506264, "loss": 2.7608, "step": 1252 }, { "epoch": 0.34699529216283576, "grad_norm": 2.9935808181762695, "learning_rate": 0.00014877261675132265, "loss": 3.0759, "step": 1253 }, { "epoch": 0.3472722237607311, "grad_norm": 2.5465614795684814, "learning_rate": 0.0001486955800915274, "loss": 2.7247, "step": 1254 }, { "epoch": 0.3475491553586264, "grad_norm": 2.167867422103882, "learning_rate": 0.0001486185055311995, "loss": 2.6247, "step": 1255 }, { "epoch": 0.34782608695652173, "grad_norm": 2.093701124191284, "learning_rate": 0.00014854139313032726, "loss": 2.7427, "step": 1256 }, { "epoch": 0.34810301855441705, "grad_norm": 2.3673460483551025, "learning_rate": 0.00014846424294892848, "loss": 2.7806, "step": 1257 }, { "epoch": 0.3483799501523124, "grad_norm": 2.3842132091522217, "learning_rate": 0.0001483870550470504, "loss": 2.9437, "step": 1258 }, { "epoch": 0.3486568817502077, "grad_norm": 2.410362482070923, "learning_rate": 0.0001483098294847695, "loss": 2.848, "step": 1259 }, { "epoch": 0.34893381334810303, "grad_norm": 2.2598326206207275, "learning_rate": 0.00014823256632219168, "loss": 2.9541, "step": 1260 }, { "epoch": 0.34921074494599835, "grad_norm": 2.2390294075012207, "learning_rate": 0.0001481552656194521, "loss": 2.9345, "step": 1261 }, { "epoch": 0.3494876765438937, "grad_norm": 1.682488203048706, "learning_rate": 0.00014807792743671506, "loss": 2.7376, "step": 1262 }, { "epoch": 0.349764608141789, "grad_norm": 2.382782220840454, "learning_rate": 0.00014800055183417408, "loss": 2.9122, "step": 1263 }, { "epoch": 0.35004153973968427, "grad_norm": 2.7201778888702393, "learning_rate": 0.00014792313887205182, "loss": 3.1305, "step": 1264 }, { "epoch": 0.3503184713375796, "grad_norm": 2.838747501373291, "learning_rate": 0.0001478456886106, "loss": 2.876, "step": 1265 }, { "epoch": 0.3505954029354749, "grad_norm": 2.1791601181030273, "learning_rate": 0.00014776820111009936, "loss": 2.9689, "step": 1266 }, { "epoch": 0.35087233453337024, "grad_norm": 2.376283645629883, "learning_rate": 0.0001476906764308597, "loss": 2.8344, "step": 1267 }, { "epoch": 0.35114926613126557, "grad_norm": 1.7677280902862549, "learning_rate": 0.00014761311463321958, "loss": 2.8116, "step": 1268 }, { "epoch": 0.3514261977291609, "grad_norm": 2.8490333557128906, "learning_rate": 0.00014753551577754664, "loss": 2.9138, "step": 1269 }, { "epoch": 0.3517031293270562, "grad_norm": 1.9829556941986084, "learning_rate": 0.0001474578799242373, "loss": 2.7738, "step": 1270 }, { "epoch": 0.35198006092495154, "grad_norm": 2.358916997909546, "learning_rate": 0.00014738020713371666, "loss": 2.9176, "step": 1271 }, { "epoch": 0.35225699252284687, "grad_norm": 2.4208502769470215, "learning_rate": 0.0001473024974664388, "loss": 2.9986, "step": 1272 }, { "epoch": 0.3525339241207422, "grad_norm": 1.8187532424926758, "learning_rate": 0.00014722475098288625, "loss": 2.9038, "step": 1273 }, { "epoch": 0.3528108557186375, "grad_norm": 2.0144996643066406, "learning_rate": 0.0001471469677435704, "loss": 2.8533, "step": 1274 }, { "epoch": 0.35308778731653284, "grad_norm": 2.9319944381713867, "learning_rate": 0.00014706914780903113, "loss": 3.6354, "step": 1275 }, { "epoch": 0.3533647189144281, "grad_norm": 1.9010802507400513, "learning_rate": 0.0001469912912398369, "loss": 2.853, "step": 1276 }, { "epoch": 0.35364165051232344, "grad_norm": 2.424229621887207, "learning_rate": 0.00014691339809658473, "loss": 2.8508, "step": 1277 }, { "epoch": 0.35391858211021876, "grad_norm": 2.659932851791382, "learning_rate": 0.00014683546843990007, "loss": 2.8557, "step": 1278 }, { "epoch": 0.3541955137081141, "grad_norm": 1.6898329257965088, "learning_rate": 0.00014675750233043679, "loss": 2.9636, "step": 1279 }, { "epoch": 0.3544724453060094, "grad_norm": 1.7070603370666504, "learning_rate": 0.0001466794998288771, "loss": 2.6075, "step": 1280 }, { "epoch": 0.35474937690390473, "grad_norm": 2.196258068084717, "learning_rate": 0.0001466014609959316, "loss": 2.8587, "step": 1281 }, { "epoch": 0.35502630850180006, "grad_norm": 2.935746669769287, "learning_rate": 0.00014652338589233913, "loss": 2.6526, "step": 1282 }, { "epoch": 0.3553032400996954, "grad_norm": 2.032379627227783, "learning_rate": 0.0001464452745788668, "loss": 2.5506, "step": 1283 }, { "epoch": 0.3555801716975907, "grad_norm": 4.194707870483398, "learning_rate": 0.00014636712711630978, "loss": 2.7623, "step": 1284 }, { "epoch": 0.35585710329548603, "grad_norm": 2.7065067291259766, "learning_rate": 0.00014628894356549156, "loss": 2.569, "step": 1285 }, { "epoch": 0.35613403489338136, "grad_norm": 1.7280241250991821, "learning_rate": 0.00014621072398726356, "loss": 2.9045, "step": 1286 }, { "epoch": 0.3564109664912767, "grad_norm": 2.0148704051971436, "learning_rate": 0.0001461324684425053, "loss": 3.1877, "step": 1287 }, { "epoch": 0.35668789808917195, "grad_norm": 9.018928527832031, "learning_rate": 0.0001460541769921244, "loss": 3.0865, "step": 1288 }, { "epoch": 0.3569648296870673, "grad_norm": 4.8206562995910645, "learning_rate": 0.00014597584969705616, "loss": 2.631, "step": 1289 }, { "epoch": 0.3572417612849626, "grad_norm": 1.6684445142745972, "learning_rate": 0.00014589748661826402, "loss": 2.6218, "step": 1290 }, { "epoch": 0.3575186928828579, "grad_norm": 1.579044222831726, "learning_rate": 0.00014581908781673928, "loss": 2.9571, "step": 1291 }, { "epoch": 0.35779562448075325, "grad_norm": 1.2074787616729736, "learning_rate": 0.0001457406533535008, "loss": 2.7196, "step": 1292 }, { "epoch": 0.3580725560786486, "grad_norm": 2.3285152912139893, "learning_rate": 0.00014566218328959543, "loss": 2.6068, "step": 1293 }, { "epoch": 0.3583494876765439, "grad_norm": 1.3609609603881836, "learning_rate": 0.00014558367768609766, "loss": 2.5368, "step": 1294 }, { "epoch": 0.3586264192744392, "grad_norm": 1.4789910316467285, "learning_rate": 0.0001455051366041096, "loss": 2.7763, "step": 1295 }, { "epoch": 0.35890335087233455, "grad_norm": 1.3611867427825928, "learning_rate": 0.000145426560104761, "loss": 2.6266, "step": 1296 }, { "epoch": 0.3591802824702299, "grad_norm": 1.7433539628982544, "learning_rate": 0.00014534794824920923, "loss": 2.9559, "step": 1297 }, { "epoch": 0.3594572140681252, "grad_norm": 1.8061453104019165, "learning_rate": 0.00014526930109863906, "loss": 2.7681, "step": 1298 }, { "epoch": 0.35973414566602047, "grad_norm": 2.418627977371216, "learning_rate": 0.00014519061871426286, "loss": 3.0256, "step": 1299 }, { "epoch": 0.3600110772639158, "grad_norm": 2.375866174697876, "learning_rate": 0.00014511190115732033, "loss": 3.3018, "step": 1300 }, { "epoch": 0.3602880088618111, "grad_norm": 1.1713021993637085, "learning_rate": 0.00014503314848907857, "loss": 2.8696, "step": 1301 }, { "epoch": 0.36056494045970644, "grad_norm": 2.7128472328186035, "learning_rate": 0.00014495436077083203, "loss": 2.8759, "step": 1302 }, { "epoch": 0.36084187205760176, "grad_norm": 1.0512962341308594, "learning_rate": 0.00014487553806390245, "loss": 2.5979, "step": 1303 }, { "epoch": 0.3611188036554971, "grad_norm": 4.019785404205322, "learning_rate": 0.0001447966804296387, "loss": 2.7857, "step": 1304 }, { "epoch": 0.3613957352533924, "grad_norm": 2.5276665687561035, "learning_rate": 0.000144717787929417, "loss": 3.0544, "step": 1305 }, { "epoch": 0.36167266685128774, "grad_norm": 2.3366951942443848, "learning_rate": 0.00014463886062464056, "loss": 2.9433, "step": 1306 }, { "epoch": 0.36194959844918306, "grad_norm": 1.6152623891830444, "learning_rate": 0.00014455989857673976, "loss": 2.6153, "step": 1307 }, { "epoch": 0.3622265300470784, "grad_norm": 2.1880078315734863, "learning_rate": 0.000144480901847172, "loss": 2.5427, "step": 1308 }, { "epoch": 0.3625034616449737, "grad_norm": 1.9220170974731445, "learning_rate": 0.00014440187049742165, "loss": 2.9394, "step": 1309 }, { "epoch": 0.36278039324286904, "grad_norm": 1.6536104679107666, "learning_rate": 0.00014432280458900009, "loss": 2.6208, "step": 1310 }, { "epoch": 0.3630573248407643, "grad_norm": 2.5638813972473145, "learning_rate": 0.00014424370418344553, "loss": 2.5677, "step": 1311 }, { "epoch": 0.36333425643865963, "grad_norm": 1.7210147380828857, "learning_rate": 0.00014416456934232308, "loss": 2.701, "step": 1312 }, { "epoch": 0.36361118803655496, "grad_norm": 1.981189250946045, "learning_rate": 0.00014408540012722455, "loss": 2.8845, "step": 1313 }, { "epoch": 0.3638881196344503, "grad_norm": 2.588027238845825, "learning_rate": 0.00014400619659976863, "loss": 2.8426, "step": 1314 }, { "epoch": 0.3641650512323456, "grad_norm": 1.580614447593689, "learning_rate": 0.00014392695882160073, "loss": 2.8997, "step": 1315 }, { "epoch": 0.36444198283024093, "grad_norm": 1.2011154890060425, "learning_rate": 0.00014384768685439273, "loss": 2.6991, "step": 1316 }, { "epoch": 0.36471891442813625, "grad_norm": 3.265779733657837, "learning_rate": 0.00014376838075984334, "loss": 3.0898, "step": 1317 }, { "epoch": 0.3649958460260316, "grad_norm": 2.203280448913574, "learning_rate": 0.00014368904059967769, "loss": 2.6934, "step": 1318 }, { "epoch": 0.3652727776239269, "grad_norm": 2.0738298892974854, "learning_rate": 0.00014360966643564747, "loss": 2.8216, "step": 1319 }, { "epoch": 0.3655497092218222, "grad_norm": 2.641373634338379, "learning_rate": 0.0001435302583295308, "loss": 2.854, "step": 1320 }, { "epoch": 0.36582664081971755, "grad_norm": 1.9381299018859863, "learning_rate": 0.00014345081634313238, "loss": 2.7057, "step": 1321 }, { "epoch": 0.3661035724176129, "grad_norm": 1.7109977006912231, "learning_rate": 0.00014337134053828306, "loss": 2.9296, "step": 1322 }, { "epoch": 0.36638050401550815, "grad_norm": 1.5071414709091187, "learning_rate": 0.00014329183097684008, "loss": 2.6535, "step": 1323 }, { "epoch": 0.36665743561340347, "grad_norm": 1.8007851839065552, "learning_rate": 0.00014321228772068702, "loss": 2.8942, "step": 1324 }, { "epoch": 0.3669343672112988, "grad_norm": 3.320955753326416, "learning_rate": 0.00014313271083173364, "loss": 3.8501, "step": 1325 }, { "epoch": 0.3672112988091941, "grad_norm": 1.723261833190918, "learning_rate": 0.00014305310037191583, "loss": 2.4756, "step": 1326 }, { "epoch": 0.36748823040708944, "grad_norm": 1.984232783317566, "learning_rate": 0.00014297345640319574, "loss": 2.8204, "step": 1327 }, { "epoch": 0.36776516200498477, "grad_norm": 2.312904119491577, "learning_rate": 0.0001428937789875615, "loss": 2.8424, "step": 1328 }, { "epoch": 0.3680420936028801, "grad_norm": 1.8733426332473755, "learning_rate": 0.0001428140681870272, "loss": 2.7003, "step": 1329 }, { "epoch": 0.3683190252007754, "grad_norm": 2.389039993286133, "learning_rate": 0.00014273432406363313, "loss": 2.5219, "step": 1330 }, { "epoch": 0.36859595679867074, "grad_norm": 2.193798780441284, "learning_rate": 0.0001426545466794453, "loss": 3.1102, "step": 1331 }, { "epoch": 0.36887288839656607, "grad_norm": 1.861578345298767, "learning_rate": 0.00014257473609655575, "loss": 2.7627, "step": 1332 }, { "epoch": 0.3691498199944614, "grad_norm": 2.2872440814971924, "learning_rate": 0.00014249489237708226, "loss": 2.7749, "step": 1333 }, { "epoch": 0.36942675159235666, "grad_norm": 2.083334445953369, "learning_rate": 0.0001424150155831685, "loss": 2.8019, "step": 1334 }, { "epoch": 0.369703683190252, "grad_norm": 2.032106876373291, "learning_rate": 0.00014233510577698377, "loss": 2.8242, "step": 1335 }, { "epoch": 0.3699806147881473, "grad_norm": 1.8062784671783447, "learning_rate": 0.0001422551630207232, "loss": 2.7238, "step": 1336 }, { "epoch": 0.37025754638604264, "grad_norm": 2.386137008666992, "learning_rate": 0.00014217518737660742, "loss": 2.7767, "step": 1337 }, { "epoch": 0.37053447798393796, "grad_norm": 2.164215087890625, "learning_rate": 0.00014209517890688279, "loss": 3.1791, "step": 1338 }, { "epoch": 0.3708114095818333, "grad_norm": 2.2616584300994873, "learning_rate": 0.00014201513767382108, "loss": 2.9295, "step": 1339 }, { "epoch": 0.3710883411797286, "grad_norm": 2.6709892749786377, "learning_rate": 0.00014193506373971968, "loss": 2.8127, "step": 1340 }, { "epoch": 0.37136527277762393, "grad_norm": 2.624154567718506, "learning_rate": 0.0001418549571669014, "loss": 2.9689, "step": 1341 }, { "epoch": 0.37164220437551926, "grad_norm": 3.4025349617004395, "learning_rate": 0.00014177481801771439, "loss": 3.2772, "step": 1342 }, { "epoch": 0.3719191359734146, "grad_norm": 2.731710433959961, "learning_rate": 0.00014169464635453222, "loss": 2.5917, "step": 1343 }, { "epoch": 0.3721960675713099, "grad_norm": 2.406323194503784, "learning_rate": 0.00014161444223975383, "loss": 3.0855, "step": 1344 }, { "epoch": 0.37247299916920523, "grad_norm": 2.2487471103668213, "learning_rate": 0.00014153420573580322, "loss": 2.7417, "step": 1345 }, { "epoch": 0.3727499307671005, "grad_norm": 1.1252191066741943, "learning_rate": 0.0001414539369051298, "loss": 2.7274, "step": 1346 }, { "epoch": 0.3730268623649958, "grad_norm": 0.8789071440696716, "learning_rate": 0.00014137363581020803, "loss": 2.8867, "step": 1347 }, { "epoch": 0.37330379396289115, "grad_norm": 2.283622980117798, "learning_rate": 0.0001412933025135375, "loss": 2.8808, "step": 1348 }, { "epoch": 0.3735807255607865, "grad_norm": 2.586961030960083, "learning_rate": 0.0001412129370776429, "loss": 2.8784, "step": 1349 }, { "epoch": 0.3738576571586818, "grad_norm": 6.691732406616211, "learning_rate": 0.0001411325395650739, "loss": 3.9918, "step": 1350 }, { "epoch": 0.3741345887565771, "grad_norm": 3.5954439640045166, "learning_rate": 0.0001410521100384051, "loss": 2.8741, "step": 1351 }, { "epoch": 0.37441152035447245, "grad_norm": 3.1264543533325195, "learning_rate": 0.0001409716485602361, "loss": 3.2264, "step": 1352 }, { "epoch": 0.3746884519523678, "grad_norm": 2.292165994644165, "learning_rate": 0.00014089115519319134, "loss": 2.7182, "step": 1353 }, { "epoch": 0.3749653835502631, "grad_norm": 1.8987075090408325, "learning_rate": 0.00014081062999992005, "loss": 2.7812, "step": 1354 }, { "epoch": 0.3752423151481584, "grad_norm": 1.3458808660507202, "learning_rate": 0.00014073007304309625, "loss": 2.8054, "step": 1355 }, { "epoch": 0.37551924674605375, "grad_norm": 1.2150719165802002, "learning_rate": 0.0001406494843854187, "loss": 2.8242, "step": 1356 }, { "epoch": 0.37579617834394907, "grad_norm": 1.3368840217590332, "learning_rate": 0.00014056886408961078, "loss": 2.6038, "step": 1357 }, { "epoch": 0.37607310994184434, "grad_norm": 1.6765084266662598, "learning_rate": 0.00014048821221842052, "loss": 2.4216, "step": 1358 }, { "epoch": 0.37635004153973967, "grad_norm": 1.8513046503067017, "learning_rate": 0.0001404075288346206, "loss": 2.6081, "step": 1359 }, { "epoch": 0.376626973137635, "grad_norm": 2.7676756381988525, "learning_rate": 0.00014032681400100812, "loss": 2.8518, "step": 1360 }, { "epoch": 0.3769039047355303, "grad_norm": 2.3777315616607666, "learning_rate": 0.00014024606778040467, "loss": 2.5985, "step": 1361 }, { "epoch": 0.37718083633342564, "grad_norm": 1.4810510873794556, "learning_rate": 0.00014016529023565632, "loss": 2.9121, "step": 1362 }, { "epoch": 0.37745776793132096, "grad_norm": 2.7724931240081787, "learning_rate": 0.00014008448142963355, "loss": 2.6499, "step": 1363 }, { "epoch": 0.3777346995292163, "grad_norm": 1.3956025838851929, "learning_rate": 0.00014000364142523103, "loss": 2.8069, "step": 1364 }, { "epoch": 0.3780116311271116, "grad_norm": 2.4681835174560547, "learning_rate": 0.0001399227702853679, "loss": 2.8497, "step": 1365 }, { "epoch": 0.37828856272500694, "grad_norm": 1.7103861570358276, "learning_rate": 0.00013984186807298736, "loss": 2.6658, "step": 1366 }, { "epoch": 0.37856549432290226, "grad_norm": 2.5172946453094482, "learning_rate": 0.0001397609348510569, "loss": 2.9519, "step": 1367 }, { "epoch": 0.3788424259207976, "grad_norm": 1.6615102291107178, "learning_rate": 0.0001396799706825681, "loss": 2.6468, "step": 1368 }, { "epoch": 0.37911935751869286, "grad_norm": 1.980188012123108, "learning_rate": 0.00013959897563053662, "loss": 2.7114, "step": 1369 }, { "epoch": 0.3793962891165882, "grad_norm": 1.8296494483947754, "learning_rate": 0.00013951794975800222, "loss": 2.406, "step": 1370 }, { "epoch": 0.3796732207144835, "grad_norm": 1.7942696809768677, "learning_rate": 0.00013943689312802864, "loss": 2.5578, "step": 1371 }, { "epoch": 0.37995015231237883, "grad_norm": 2.6955957412719727, "learning_rate": 0.0001393558058037034, "loss": 2.8052, "step": 1372 }, { "epoch": 0.38022708391027416, "grad_norm": 2.0833001136779785, "learning_rate": 0.00013927468784813814, "loss": 3.0135, "step": 1373 }, { "epoch": 0.3805040155081695, "grad_norm": 1.5874693393707275, "learning_rate": 0.00013919353932446822, "loss": 2.8376, "step": 1374 }, { "epoch": 0.3807809471060648, "grad_norm": 6.984336853027344, "learning_rate": 0.00013911236029585274, "loss": 4.5221, "step": 1375 }, { "epoch": 0.38105787870396013, "grad_norm": 52.98382568359375, "learning_rate": 0.0001390311508254747, "loss": 3.6362, "step": 1376 }, { "epoch": 0.38133481030185545, "grad_norm": 1.5272222757339478, "learning_rate": 0.00013894991097654068, "loss": 2.8065, "step": 1377 }, { "epoch": 0.3816117418997508, "grad_norm": 1.4378056526184082, "learning_rate": 0.00013886864081228087, "loss": 2.6339, "step": 1378 }, { "epoch": 0.3818886734976461, "grad_norm": 2.1655452251434326, "learning_rate": 0.0001387873403959492, "loss": 2.9441, "step": 1379 }, { "epoch": 0.3821656050955414, "grad_norm": 2.181152105331421, "learning_rate": 0.000138706009790823, "loss": 2.7229, "step": 1380 }, { "epoch": 0.3824425366934367, "grad_norm": 1.6623657941818237, "learning_rate": 0.0001386246490602031, "loss": 2.8498, "step": 1381 }, { "epoch": 0.382719468291332, "grad_norm": 2.1751708984375, "learning_rate": 0.00013854325826741394, "loss": 2.6604, "step": 1382 }, { "epoch": 0.38299639988922735, "grad_norm": 1.7822431325912476, "learning_rate": 0.00013846183747580318, "loss": 2.7339, "step": 1383 }, { "epoch": 0.38327333148712267, "grad_norm": 2.052023410797119, "learning_rate": 0.00013838038674874193, "loss": 2.8987, "step": 1384 }, { "epoch": 0.383550263085018, "grad_norm": 1.9398444890975952, "learning_rate": 0.00013829890614962457, "loss": 2.8354, "step": 1385 }, { "epoch": 0.3838271946829133, "grad_norm": 1.7574689388275146, "learning_rate": 0.0001382173957418687, "loss": 2.9091, "step": 1386 }, { "epoch": 0.38410412628080864, "grad_norm": 1.563295602798462, "learning_rate": 0.00013813585558891518, "loss": 2.5467, "step": 1387 }, { "epoch": 0.38438105787870397, "grad_norm": 1.682442307472229, "learning_rate": 0.00013805428575422793, "loss": 2.6808, "step": 1388 }, { "epoch": 0.3846579894765993, "grad_norm": 2.0732030868530273, "learning_rate": 0.00013797268630129413, "loss": 2.7206, "step": 1389 }, { "epoch": 0.3849349210744946, "grad_norm": 1.4792330265045166, "learning_rate": 0.0001378910572936238, "loss": 2.8664, "step": 1390 }, { "epoch": 0.38521185267238994, "grad_norm": 2.685608148574829, "learning_rate": 0.00013780939879475012, "loss": 3.2621, "step": 1391 }, { "epoch": 0.3854887842702852, "grad_norm": 1.362554669380188, "learning_rate": 0.0001377277108682292, "loss": 2.8747, "step": 1392 }, { "epoch": 0.38576571586818054, "grad_norm": 1.9244787693023682, "learning_rate": 0.00013764599357764002, "loss": 2.6234, "step": 1393 }, { "epoch": 0.38604264746607586, "grad_norm": 1.9200278520584106, "learning_rate": 0.0001375642469865844, "loss": 2.565, "step": 1394 }, { "epoch": 0.3863195790639712, "grad_norm": 1.5326173305511475, "learning_rate": 0.00013748247115868704, "loss": 2.785, "step": 1395 }, { "epoch": 0.3865965106618665, "grad_norm": 1.5612823963165283, "learning_rate": 0.0001374006661575953, "loss": 2.6662, "step": 1396 }, { "epoch": 0.38687344225976183, "grad_norm": 0.9551043510437012, "learning_rate": 0.00013731883204697932, "loss": 2.6571, "step": 1397 }, { "epoch": 0.38715037385765716, "grad_norm": 1.4987437725067139, "learning_rate": 0.0001372369688905319, "loss": 2.7669, "step": 1398 }, { "epoch": 0.3874273054555525, "grad_norm": 3.515704870223999, "learning_rate": 0.00013715507675196836, "loss": 3.3099, "step": 1399 }, { "epoch": 0.3877042370534478, "grad_norm": 6.313563346862793, "learning_rate": 0.00013707315569502665, "loss": 4.2518, "step": 1400 }, { "epoch": 0.38798116865134313, "grad_norm": 5.393907070159912, "learning_rate": 0.00013699120578346727, "loss": 3.1773, "step": 1401 }, { "epoch": 0.38825810024923846, "grad_norm": 3.510133743286133, "learning_rate": 0.00013690922708107302, "loss": 2.8114, "step": 1402 }, { "epoch": 0.3885350318471338, "grad_norm": 3.648080825805664, "learning_rate": 0.00013682721965164926, "loss": 2.8152, "step": 1403 }, { "epoch": 0.38881196344502905, "grad_norm": 1.3213528394699097, "learning_rate": 0.0001367451835590237, "loss": 2.5302, "step": 1404 }, { "epoch": 0.3890888950429244, "grad_norm": 1.6521259546279907, "learning_rate": 0.0001366631188670462, "loss": 2.5972, "step": 1405 }, { "epoch": 0.3893658266408197, "grad_norm": 2.0183587074279785, "learning_rate": 0.0001365810256395891, "loss": 3.1236, "step": 1406 }, { "epoch": 0.389642758238715, "grad_norm": 17.155466079711914, "learning_rate": 0.0001364989039405468, "loss": 3.1734, "step": 1407 }, { "epoch": 0.38991968983661035, "grad_norm": 1.8948163986206055, "learning_rate": 0.0001364167538338359, "loss": 2.656, "step": 1408 }, { "epoch": 0.3901966214345057, "grad_norm": 1.8051021099090576, "learning_rate": 0.00013633457538339514, "loss": 3.0139, "step": 1409 }, { "epoch": 0.390473553032401, "grad_norm": 2.2459874153137207, "learning_rate": 0.00013625236865318533, "loss": 2.9797, "step": 1410 }, { "epoch": 0.3907504846302963, "grad_norm": 2.143657684326172, "learning_rate": 0.00013617013370718914, "loss": 2.7228, "step": 1411 }, { "epoch": 0.39102741622819165, "grad_norm": 2.0867865085601807, "learning_rate": 0.00013608787060941143, "loss": 2.9089, "step": 1412 }, { "epoch": 0.391304347826087, "grad_norm": 1.8627058267593384, "learning_rate": 0.0001360055794238788, "loss": 2.9256, "step": 1413 }, { "epoch": 0.3915812794239823, "grad_norm": 1.8171625137329102, "learning_rate": 0.00013592326021463977, "loss": 3.2421, "step": 1414 }, { "epoch": 0.3918582110218776, "grad_norm": 2.448659896850586, "learning_rate": 0.00013584091304576467, "loss": 2.6718, "step": 1415 }, { "epoch": 0.3921351426197729, "grad_norm": 1.933889627456665, "learning_rate": 0.00013575853798134563, "loss": 2.7015, "step": 1416 }, { "epoch": 0.3924120742176682, "grad_norm": 2.313974618911743, "learning_rate": 0.0001356761350854964, "loss": 2.6308, "step": 1417 }, { "epoch": 0.39268900581556354, "grad_norm": 1.7314043045043945, "learning_rate": 0.0001355937044223525, "loss": 2.721, "step": 1418 }, { "epoch": 0.39296593741345887, "grad_norm": 2.7365071773529053, "learning_rate": 0.00013551124605607097, "loss": 2.579, "step": 1419 }, { "epoch": 0.3932428690113542, "grad_norm": 1.9637887477874756, "learning_rate": 0.00013542876005083045, "loss": 2.9192, "step": 1420 }, { "epoch": 0.3935198006092495, "grad_norm": 1.607652187347412, "learning_rate": 0.0001353462464708311, "loss": 2.6256, "step": 1421 }, { "epoch": 0.39379673220714484, "grad_norm": 2.0072946548461914, "learning_rate": 0.0001352637053802945, "loss": 2.7192, "step": 1422 }, { "epoch": 0.39407366380504016, "grad_norm": 4.196632385253906, "learning_rate": 0.00013518113684346373, "loss": 3.2017, "step": 1423 }, { "epoch": 0.3943505954029355, "grad_norm": 1.5665082931518555, "learning_rate": 0.00013509854092460312, "loss": 2.7653, "step": 1424 }, { "epoch": 0.3946275270008308, "grad_norm": 6.20534086227417, "learning_rate": 0.00013501591768799842, "loss": 4.5799, "step": 1425 }, { "epoch": 0.39490445859872614, "grad_norm": 1.3128141164779663, "learning_rate": 0.00013493326719795653, "loss": 2.6134, "step": 1426 }, { "epoch": 0.3951813901966214, "grad_norm": 1.984251618385315, "learning_rate": 0.00013485058951880568, "loss": 2.8511, "step": 1427 }, { "epoch": 0.39545832179451673, "grad_norm": 2.113044500350952, "learning_rate": 0.00013476788471489517, "loss": 2.8686, "step": 1428 }, { "epoch": 0.39573525339241206, "grad_norm": 2.3141541481018066, "learning_rate": 0.0001346851528505954, "loss": 2.9546, "step": 1429 }, { "epoch": 0.3960121849903074, "grad_norm": 2.277184009552002, "learning_rate": 0.00013460239399029796, "loss": 2.688, "step": 1430 }, { "epoch": 0.3962891165882027, "grad_norm": 2.426490068435669, "learning_rate": 0.00013451960819841537, "loss": 2.7827, "step": 1431 }, { "epoch": 0.39656604818609803, "grad_norm": 2.0649800300598145, "learning_rate": 0.00013443679553938102, "loss": 2.8816, "step": 1432 }, { "epoch": 0.39684297978399335, "grad_norm": 1.6541078090667725, "learning_rate": 0.00013435395607764935, "loss": 2.7599, "step": 1433 }, { "epoch": 0.3971199113818887, "grad_norm": 1.7938566207885742, "learning_rate": 0.00013427108987769566, "loss": 2.5367, "step": 1434 }, { "epoch": 0.397396842979784, "grad_norm": 2.0776562690734863, "learning_rate": 0.0001341881970040159, "loss": 3.1443, "step": 1435 }, { "epoch": 0.39767377457767933, "grad_norm": 1.541983962059021, "learning_rate": 0.000134105277521127, "loss": 2.7567, "step": 1436 }, { "epoch": 0.39795070617557465, "grad_norm": 1.157357096672058, "learning_rate": 0.00013402233149356643, "loss": 2.6953, "step": 1437 }, { "epoch": 0.39822763777347, "grad_norm": 2.230661153793335, "learning_rate": 0.00013393935898589238, "loss": 2.8742, "step": 1438 }, { "epoch": 0.39850456937136525, "grad_norm": 2.2509617805480957, "learning_rate": 0.00013385636006268368, "loss": 2.9729, "step": 1439 }, { "epoch": 0.39878150096926057, "grad_norm": 1.4444016218185425, "learning_rate": 0.00013377333478853967, "loss": 2.7014, "step": 1440 }, { "epoch": 0.3990584325671559, "grad_norm": 1.4741573333740234, "learning_rate": 0.00013369028322808023, "loss": 2.9002, "step": 1441 }, { "epoch": 0.3993353641650512, "grad_norm": 1.3836803436279297, "learning_rate": 0.0001336072054459457, "loss": 2.6069, "step": 1442 }, { "epoch": 0.39961229576294655, "grad_norm": 1.7925223112106323, "learning_rate": 0.00013352410150679683, "loss": 2.9793, "step": 1443 }, { "epoch": 0.39988922736084187, "grad_norm": 1.3602858781814575, "learning_rate": 0.00013344097147531469, "loss": 2.7197, "step": 1444 }, { "epoch": 0.4001661589587372, "grad_norm": 2.3307301998138428, "learning_rate": 0.0001333578154162007, "loss": 2.7677, "step": 1445 }, { "epoch": 0.4004430905566325, "grad_norm": 2.375763416290283, "learning_rate": 0.00013327463339417653, "loss": 2.7946, "step": 1446 }, { "epoch": 0.40072002215452784, "grad_norm": 1.6489684581756592, "learning_rate": 0.000133191425473984, "loss": 2.7883, "step": 1447 }, { "epoch": 0.40099695375242317, "grad_norm": 1.7327191829681396, "learning_rate": 0.0001331081917203852, "loss": 2.8872, "step": 1448 }, { "epoch": 0.4012738853503185, "grad_norm": 2.1036574840545654, "learning_rate": 0.00013302493219816223, "loss": 3.1464, "step": 1449 }, { "epoch": 0.4015508169482138, "grad_norm": 4.300667762756348, "learning_rate": 0.00013294164697211732, "loss": 4.0743, "step": 1450 }, { "epoch": 0.4018277485461091, "grad_norm": 3.243358612060547, "learning_rate": 0.0001328583361070726, "loss": 3.1086, "step": 1451 }, { "epoch": 0.4021046801440044, "grad_norm": 1.1075637340545654, "learning_rate": 0.0001327749996678703, "loss": 2.6491, "step": 1452 }, { "epoch": 0.40238161174189974, "grad_norm": 2.0181403160095215, "learning_rate": 0.0001326916377193724, "loss": 2.6164, "step": 1453 }, { "epoch": 0.40265854333979506, "grad_norm": 1.3411420583724976, "learning_rate": 0.00013260825032646083, "loss": 2.5476, "step": 1454 }, { "epoch": 0.4029354749376904, "grad_norm": 1.5511304140090942, "learning_rate": 0.00013252483755403735, "loss": 2.9308, "step": 1455 }, { "epoch": 0.4032124065355857, "grad_norm": 1.9710975885391235, "learning_rate": 0.00013244139946702336, "loss": 2.8186, "step": 1456 }, { "epoch": 0.40348933813348103, "grad_norm": 1.543416976928711, "learning_rate": 0.0001323579361303601, "loss": 2.5958, "step": 1457 }, { "epoch": 0.40376626973137636, "grad_norm": 1.2788368463516235, "learning_rate": 0.00013227444760900836, "loss": 2.7018, "step": 1458 }, { "epoch": 0.4040432013292717, "grad_norm": 2.103099822998047, "learning_rate": 0.00013219093396794852, "loss": 2.6884, "step": 1459 }, { "epoch": 0.404320132927167, "grad_norm": 1.5627678632736206, "learning_rate": 0.00013210739527218064, "loss": 2.9022, "step": 1460 }, { "epoch": 0.40459706452506233, "grad_norm": 1.1994236707687378, "learning_rate": 0.00013202383158672412, "loss": 2.9379, "step": 1461 }, { "epoch": 0.4048739961229576, "grad_norm": 1.8488538265228271, "learning_rate": 0.00013194024297661793, "loss": 2.8452, "step": 1462 }, { "epoch": 0.4051509277208529, "grad_norm": 1.3858424425125122, "learning_rate": 0.00013185662950692035, "loss": 2.5327, "step": 1463 }, { "epoch": 0.40542785931874825, "grad_norm": 1.5252209901809692, "learning_rate": 0.00013177299124270911, "loss": 2.8002, "step": 1464 }, { "epoch": 0.4057047909166436, "grad_norm": 2.618840456008911, "learning_rate": 0.00013168932824908115, "loss": 2.7631, "step": 1465 }, { "epoch": 0.4059817225145389, "grad_norm": 1.9298254251480103, "learning_rate": 0.0001316056405911527, "loss": 3.064, "step": 1466 }, { "epoch": 0.4062586541124342, "grad_norm": 2.1758623123168945, "learning_rate": 0.00013152192833405918, "loss": 2.7877, "step": 1467 }, { "epoch": 0.40653558571032955, "grad_norm": 1.194083333015442, "learning_rate": 0.0001314381915429551, "loss": 2.8273, "step": 1468 }, { "epoch": 0.4068125173082249, "grad_norm": 2.225982666015625, "learning_rate": 0.0001313544302830142, "loss": 2.6538, "step": 1469 }, { "epoch": 0.4070894489061202, "grad_norm": 2.0385043621063232, "learning_rate": 0.00013127064461942913, "loss": 2.8584, "step": 1470 }, { "epoch": 0.4073663805040155, "grad_norm": 1.949174165725708, "learning_rate": 0.00013118683461741155, "loss": 2.7196, "step": 1471 }, { "epoch": 0.40764331210191085, "grad_norm": 0.9881244897842407, "learning_rate": 0.00013110300034219216, "loss": 2.606, "step": 1472 }, { "epoch": 0.4079202436998062, "grad_norm": 3.028315544128418, "learning_rate": 0.00013101914185902045, "loss": 2.9581, "step": 1473 }, { "epoch": 0.40819717529770144, "grad_norm": 2.262873888015747, "learning_rate": 0.00013093525923316482, "loss": 3.0206, "step": 1474 }, { "epoch": 0.40847410689559677, "grad_norm": 3.912444829940796, "learning_rate": 0.00013085135252991238, "loss": 4.0442, "step": 1475 }, { "epoch": 0.4087510384934921, "grad_norm": 1.941103219985962, "learning_rate": 0.00013076742181456914, "loss": 2.9139, "step": 1476 }, { "epoch": 0.4090279700913874, "grad_norm": 1.6083106994628906, "learning_rate": 0.00013068346715245957, "loss": 2.5113, "step": 1477 }, { "epoch": 0.40930490168928274, "grad_norm": 1.6494433879852295, "learning_rate": 0.00013059948860892696, "loss": 2.6644, "step": 1478 }, { "epoch": 0.40958183328717807, "grad_norm": 1.2976964712142944, "learning_rate": 0.00013051548624933314, "loss": 2.4618, "step": 1479 }, { "epoch": 0.4098587648850734, "grad_norm": 1.8355804681777954, "learning_rate": 0.0001304314601390584, "loss": 2.9373, "step": 1480 }, { "epoch": 0.4101356964829687, "grad_norm": 1.3326092958450317, "learning_rate": 0.00013034741034350161, "loss": 2.6393, "step": 1481 }, { "epoch": 0.41041262808086404, "grad_norm": 1.550118088722229, "learning_rate": 0.00013026333692808006, "loss": 2.7761, "step": 1482 }, { "epoch": 0.41068955967875936, "grad_norm": 2.0922064781188965, "learning_rate": 0.00013017923995822938, "loss": 2.9785, "step": 1483 }, { "epoch": 0.4109664912766547, "grad_norm": 1.86045241355896, "learning_rate": 0.00013009511949940358, "loss": 2.5891, "step": 1484 }, { "epoch": 0.41124342287455, "grad_norm": 1.8999186754226685, "learning_rate": 0.00013001097561707493, "loss": 2.4681, "step": 1485 }, { "epoch": 0.4115203544724453, "grad_norm": 0.8147444725036621, "learning_rate": 0.00012992680837673392, "loss": 2.7766, "step": 1486 }, { "epoch": 0.4117972860703406, "grad_norm": 1.3010146617889404, "learning_rate": 0.00012984261784388922, "loss": 2.5141, "step": 1487 }, { "epoch": 0.41207421766823593, "grad_norm": 1.1852285861968994, "learning_rate": 0.00012975840408406767, "loss": 2.6221, "step": 1488 }, { "epoch": 0.41235114926613126, "grad_norm": 33.655174255371094, "learning_rate": 0.00012967416716281414, "loss": 2.9547, "step": 1489 }, { "epoch": 0.4126280808640266, "grad_norm": 1.393959641456604, "learning_rate": 0.00012958990714569154, "loss": 2.6042, "step": 1490 }, { "epoch": 0.4129050124619219, "grad_norm": 2.544478416442871, "learning_rate": 0.0001295056240982808, "loss": 2.6323, "step": 1491 }, { "epoch": 0.41318194405981723, "grad_norm": 1.5582716464996338, "learning_rate": 0.00012942131808618067, "loss": 2.9899, "step": 1492 }, { "epoch": 0.41345887565771255, "grad_norm": 1.7537245750427246, "learning_rate": 0.00012933698917500788, "loss": 2.6883, "step": 1493 }, { "epoch": 0.4137358072556079, "grad_norm": 1.275985836982727, "learning_rate": 0.00012925263743039693, "loss": 2.7156, "step": 1494 }, { "epoch": 0.4140127388535032, "grad_norm": 1.2964093685150146, "learning_rate": 0.0001291682629180001, "loss": 2.7783, "step": 1495 }, { "epoch": 0.41428967045139853, "grad_norm": 1.6701529026031494, "learning_rate": 0.0001290838657034874, "loss": 2.7645, "step": 1496 }, { "epoch": 0.4145666020492938, "grad_norm": 1.611763834953308, "learning_rate": 0.00012899944585254656, "loss": 2.8794, "step": 1497 }, { "epoch": 0.4148435336471891, "grad_norm": 1.6163182258605957, "learning_rate": 0.00012891500343088278, "loss": 2.9532, "step": 1498 }, { "epoch": 0.41512046524508445, "grad_norm": 2.047060489654541, "learning_rate": 0.00012883053850421897, "loss": 3.1522, "step": 1499 }, { "epoch": 0.41539739684297977, "grad_norm": 3.8433516025543213, "learning_rate": 0.00012874605113829553, "loss": 4.1045, "step": 1500 }, { "epoch": 0.4156743284408751, "grad_norm": 2.038623094558716, "learning_rate": 0.0001286615413988702, "loss": 2.7263, "step": 1501 }, { "epoch": 0.4159512600387704, "grad_norm": 1.697388768196106, "learning_rate": 0.00012857700935171835, "loss": 2.7766, "step": 1502 }, { "epoch": 0.41622819163666575, "grad_norm": 1.923000693321228, "learning_rate": 0.00012849245506263256, "loss": 2.8755, "step": 1503 }, { "epoch": 0.41650512323456107, "grad_norm": 1.566849708557129, "learning_rate": 0.00012840787859742266, "loss": 2.9213, "step": 1504 }, { "epoch": 0.4167820548324564, "grad_norm": 1.4963854551315308, "learning_rate": 0.00012832328002191599, "loss": 2.6829, "step": 1505 }, { "epoch": 0.4170589864303517, "grad_norm": 1.7939183712005615, "learning_rate": 0.00012823865940195684, "loss": 2.5174, "step": 1506 }, { "epoch": 0.41733591802824704, "grad_norm": 1.2367196083068848, "learning_rate": 0.00012815401680340682, "loss": 2.9059, "step": 1507 }, { "epoch": 0.41761284962614237, "grad_norm": 1.3630605936050415, "learning_rate": 0.00012806935229214456, "loss": 2.7343, "step": 1508 }, { "epoch": 0.41788978122403764, "grad_norm": 10.827095985412598, "learning_rate": 0.00012798466593406583, "loss": 2.8514, "step": 1509 }, { "epoch": 0.41816671282193296, "grad_norm": 1.4653106927871704, "learning_rate": 0.00012789995779508327, "loss": 2.8998, "step": 1510 }, { "epoch": 0.4184436444198283, "grad_norm": 1.8175551891326904, "learning_rate": 0.00012781522794112657, "loss": 2.8933, "step": 1511 }, { "epoch": 0.4187205760177236, "grad_norm": 1.614170789718628, "learning_rate": 0.00012773047643814235, "loss": 2.7967, "step": 1512 }, { "epoch": 0.41899750761561894, "grad_norm": 1.9364715814590454, "learning_rate": 0.000127645703352094, "loss": 2.7361, "step": 1513 }, { "epoch": 0.41927443921351426, "grad_norm": 2.4313740730285645, "learning_rate": 0.00012756090874896172, "loss": 3.164, "step": 1514 }, { "epoch": 0.4195513708114096, "grad_norm": 1.7980979681015015, "learning_rate": 0.0001274760926947425, "loss": 3.0288, "step": 1515 }, { "epoch": 0.4198283024093049, "grad_norm": 1.5087634325027466, "learning_rate": 0.00012739125525545002, "loss": 2.9844, "step": 1516 }, { "epoch": 0.42010523400720023, "grad_norm": 1.5665369033813477, "learning_rate": 0.00012730639649711454, "loss": 2.6869, "step": 1517 }, { "epoch": 0.42038216560509556, "grad_norm": 1.50995934009552, "learning_rate": 0.00012722151648578302, "loss": 2.8659, "step": 1518 }, { "epoch": 0.4206590972029909, "grad_norm": 1.8009982109069824, "learning_rate": 0.00012713661528751888, "loss": 2.5628, "step": 1519 }, { "epoch": 0.4209360288008862, "grad_norm": 1.7115381956100464, "learning_rate": 0.00012705169296840202, "loss": 2.5867, "step": 1520 }, { "epoch": 0.4212129603987815, "grad_norm": 1.8676518201828003, "learning_rate": 0.00012696674959452887, "loss": 2.7795, "step": 1521 }, { "epoch": 0.4214898919966768, "grad_norm": 1.6781612634658813, "learning_rate": 0.00012688178523201214, "loss": 2.7978, "step": 1522 }, { "epoch": 0.4217668235945721, "grad_norm": 2.0392162799835205, "learning_rate": 0.0001267967999469809, "loss": 2.8138, "step": 1523 }, { "epoch": 0.42204375519246745, "grad_norm": 2.3748555183410645, "learning_rate": 0.00012671179380558062, "loss": 3.1233, "step": 1524 }, { "epoch": 0.4223206867903628, "grad_norm": 5.822400093078613, "learning_rate": 0.00012662676687397284, "loss": 4.4801, "step": 1525 }, { "epoch": 0.4225976183882581, "grad_norm": 1.2015067338943481, "learning_rate": 0.00012654171921833534, "loss": 2.6184, "step": 1526 }, { "epoch": 0.4228745499861534, "grad_norm": 1.7967123985290527, "learning_rate": 0.00012645665090486212, "loss": 2.6241, "step": 1527 }, { "epoch": 0.42315148158404875, "grad_norm": 1.2164345979690552, "learning_rate": 0.0001263715619997631, "loss": 2.4707, "step": 1528 }, { "epoch": 0.4234284131819441, "grad_norm": 1.5270050764083862, "learning_rate": 0.00012628645256926438, "loss": 2.5566, "step": 1529 }, { "epoch": 0.4237053447798394, "grad_norm": 1.3630717992782593, "learning_rate": 0.00012620132267960788, "loss": 2.8082, "step": 1530 }, { "epoch": 0.4239822763777347, "grad_norm": 1.9305917024612427, "learning_rate": 0.00012611617239705161, "loss": 2.859, "step": 1531 }, { "epoch": 0.42425920797563, "grad_norm": 1.6518409252166748, "learning_rate": 0.00012603100178786928, "loss": 2.688, "step": 1532 }, { "epoch": 0.4245361395735253, "grad_norm": 2.4489521980285645, "learning_rate": 0.00012594581091835062, "loss": 2.9602, "step": 1533 }, { "epoch": 0.42481307117142064, "grad_norm": 3.1953954696655273, "learning_rate": 0.0001258605998548009, "loss": 2.9738, "step": 1534 }, { "epoch": 0.42509000276931597, "grad_norm": 1.9063570499420166, "learning_rate": 0.00012577536866354135, "loss": 2.8507, "step": 1535 }, { "epoch": 0.4253669343672113, "grad_norm": 1.2348942756652832, "learning_rate": 0.00012569011741090863, "loss": 2.2727, "step": 1536 }, { "epoch": 0.4256438659651066, "grad_norm": 1.3155874013900757, "learning_rate": 0.00012560484616325513, "loss": 2.7515, "step": 1537 }, { "epoch": 0.42592079756300194, "grad_norm": 1.3448861837387085, "learning_rate": 0.0001255195549869489, "loss": 2.7228, "step": 1538 }, { "epoch": 0.42619772916089727, "grad_norm": 1.4402925968170166, "learning_rate": 0.0001254342439483733, "loss": 2.7855, "step": 1539 }, { "epoch": 0.4264746607587926, "grad_norm": 1.7898362874984741, "learning_rate": 0.0001253489131139273, "loss": 2.8563, "step": 1540 }, { "epoch": 0.4267515923566879, "grad_norm": 1.267905592918396, "learning_rate": 0.0001252635625500252, "loss": 2.9075, "step": 1541 }, { "epoch": 0.42702852395458324, "grad_norm": 1.5784574747085571, "learning_rate": 0.0001251781923230967, "loss": 2.6532, "step": 1542 }, { "epoch": 0.42730545555247856, "grad_norm": 2.0321223735809326, "learning_rate": 0.00012509280249958674, "loss": 2.7816, "step": 1543 }, { "epoch": 0.42758238715037383, "grad_norm": 1.8454991579055786, "learning_rate": 0.00012500739314595563, "loss": 2.7301, "step": 1544 }, { "epoch": 0.42785931874826916, "grad_norm": 1.7442742586135864, "learning_rate": 0.00012492196432867874, "loss": 2.719, "step": 1545 }, { "epoch": 0.4281362503461645, "grad_norm": 2.122340202331543, "learning_rate": 0.00012483651611424666, "loss": 2.9033, "step": 1546 }, { "epoch": 0.4284131819440598, "grad_norm": 1.8782052993774414, "learning_rate": 0.00012475104856916511, "loss": 2.6263, "step": 1547 }, { "epoch": 0.42869011354195513, "grad_norm": 1.387713074684143, "learning_rate": 0.0001246655617599548, "loss": 2.5894, "step": 1548 }, { "epoch": 0.42896704513985046, "grad_norm": 1.0579252243041992, "learning_rate": 0.00012458005575315147, "loss": 3.0989, "step": 1549 }, { "epoch": 0.4292439767377458, "grad_norm": 4.583604335784912, "learning_rate": 0.0001244945306153058, "loss": 4.2209, "step": 1550 }, { "epoch": 0.4295209083356411, "grad_norm": 1.498977541923523, "learning_rate": 0.00012440898641298328, "loss": 2.6145, "step": 1551 }, { "epoch": 0.42979783993353643, "grad_norm": 1.6750907897949219, "learning_rate": 0.00012432342321276436, "loss": 2.6988, "step": 1552 }, { "epoch": 0.43007477153143175, "grad_norm": 2.225656270980835, "learning_rate": 0.0001242378410812442, "loss": 2.9361, "step": 1553 }, { "epoch": 0.4303517031293271, "grad_norm": 2.292717933654785, "learning_rate": 0.0001241522400850327, "loss": 2.9439, "step": 1554 }, { "epoch": 0.4306286347272224, "grad_norm": 1.349919319152832, "learning_rate": 0.00012406662029075446, "loss": 2.628, "step": 1555 }, { "epoch": 0.4309055663251177, "grad_norm": 2.0709357261657715, "learning_rate": 0.00012398098176504872, "loss": 2.6342, "step": 1556 }, { "epoch": 0.431182497923013, "grad_norm": 1.249013900756836, "learning_rate": 0.0001238953245745693, "loss": 2.7055, "step": 1557 }, { "epoch": 0.4314594295209083, "grad_norm": 1.735985517501831, "learning_rate": 0.0001238096487859845, "loss": 2.658, "step": 1558 }, { "epoch": 0.43173636111880365, "grad_norm": 1.6668094396591187, "learning_rate": 0.0001237239544659771, "loss": 2.8608, "step": 1559 }, { "epoch": 0.43201329271669897, "grad_norm": 1.760405421257019, "learning_rate": 0.0001236382416812444, "loss": 2.6073, "step": 1560 }, { "epoch": 0.4322902243145943, "grad_norm": 1.0794938802719116, "learning_rate": 0.000123552510498498, "loss": 2.6189, "step": 1561 }, { "epoch": 0.4325671559124896, "grad_norm": 2.733625650405884, "learning_rate": 0.00012346676098446378, "loss": 3.4352, "step": 1562 }, { "epoch": 0.43284408751038495, "grad_norm": 2.011298418045044, "learning_rate": 0.00012338099320588193, "loss": 2.7116, "step": 1563 }, { "epoch": 0.43312101910828027, "grad_norm": 1.5651980638504028, "learning_rate": 0.0001232952072295069, "loss": 2.9265, "step": 1564 }, { "epoch": 0.4333979507061756, "grad_norm": 2.379638671875, "learning_rate": 0.00012320940312210725, "loss": 2.8316, "step": 1565 }, { "epoch": 0.4336748823040709, "grad_norm": 0.860095202922821, "learning_rate": 0.00012312358095046562, "loss": 2.7123, "step": 1566 }, { "epoch": 0.4339518139019662, "grad_norm": 1.6845221519470215, "learning_rate": 0.00012303774078137872, "loss": 2.7341, "step": 1567 }, { "epoch": 0.4342287454998615, "grad_norm": 1.8354023694992065, "learning_rate": 0.00012295188268165742, "loss": 2.9535, "step": 1568 }, { "epoch": 0.43450567709775684, "grad_norm": 1.8640836477279663, "learning_rate": 0.0001228660067181263, "loss": 2.5892, "step": 1569 }, { "epoch": 0.43478260869565216, "grad_norm": 1.708167552947998, "learning_rate": 0.00012278011295762406, "loss": 2.7308, "step": 1570 }, { "epoch": 0.4350595402935475, "grad_norm": 2.214993476867676, "learning_rate": 0.00012269420146700312, "loss": 2.6486, "step": 1571 }, { "epoch": 0.4353364718914428, "grad_norm": 1.652694582939148, "learning_rate": 0.00012260827231312974, "loss": 2.951, "step": 1572 }, { "epoch": 0.43561340348933814, "grad_norm": 1.8127185106277466, "learning_rate": 0.00012252232556288393, "loss": 2.5346, "step": 1573 }, { "epoch": 0.43589033508723346, "grad_norm": 1.5010112524032593, "learning_rate": 0.00012243636128315939, "loss": 3.0699, "step": 1574 }, { "epoch": 0.4361672666851288, "grad_norm": 3.595019578933716, "learning_rate": 0.00012235037954086344, "loss": 3.8706, "step": 1575 }, { "epoch": 0.4364441982830241, "grad_norm": 2.001720428466797, "learning_rate": 0.00012226438040291703, "loss": 2.8349, "step": 1576 }, { "epoch": 0.43672112988091943, "grad_norm": 1.3290728330612183, "learning_rate": 0.0001221783639362547, "loss": 2.7987, "step": 1577 }, { "epoch": 0.43699806147881476, "grad_norm": 2.3499648571014404, "learning_rate": 0.0001220923302078243, "loss": 2.571, "step": 1578 }, { "epoch": 0.43727499307671003, "grad_norm": 1.8382792472839355, "learning_rate": 0.0001220062792845873, "loss": 2.715, "step": 1579 }, { "epoch": 0.43755192467460535, "grad_norm": 2.051205635070801, "learning_rate": 0.00012192021123351847, "loss": 2.7925, "step": 1580 }, { "epoch": 0.4378288562725007, "grad_norm": 2.2316954135894775, "learning_rate": 0.00012183412612160592, "loss": 2.7037, "step": 1581 }, { "epoch": 0.438105787870396, "grad_norm": 1.6185784339904785, "learning_rate": 0.00012174802401585106, "loss": 2.8406, "step": 1582 }, { "epoch": 0.4383827194682913, "grad_norm": 1.404736876487732, "learning_rate": 0.00012166190498326849, "loss": 2.6622, "step": 1583 }, { "epoch": 0.43865965106618665, "grad_norm": 1.912427544593811, "learning_rate": 0.00012157576909088599, "loss": 2.8172, "step": 1584 }, { "epoch": 0.438936582664082, "grad_norm": 2.464752674102783, "learning_rate": 0.00012148961640574448, "loss": 3.0364, "step": 1585 }, { "epoch": 0.4392135142619773, "grad_norm": 1.2723188400268555, "learning_rate": 0.00012140344699489797, "loss": 2.9328, "step": 1586 }, { "epoch": 0.4394904458598726, "grad_norm": 1.8504717350006104, "learning_rate": 0.0001213172609254134, "loss": 2.9263, "step": 1587 }, { "epoch": 0.43976737745776795, "grad_norm": 2.0745506286621094, "learning_rate": 0.0001212310582643708, "loss": 2.8592, "step": 1588 }, { "epoch": 0.4400443090556633, "grad_norm": 1.8228636980056763, "learning_rate": 0.00012114483907886308, "loss": 2.6974, "step": 1589 }, { "epoch": 0.4403212406535586, "grad_norm": 2.4309403896331787, "learning_rate": 0.00012105860343599587, "loss": 3.0777, "step": 1590 }, { "epoch": 0.44059817225145387, "grad_norm": 1.2440286874771118, "learning_rate": 0.00012097235140288778, "loss": 2.6633, "step": 1591 }, { "epoch": 0.4408751038493492, "grad_norm": 1.2311338186264038, "learning_rate": 0.00012088608304667013, "loss": 2.9672, "step": 1592 }, { "epoch": 0.4411520354472445, "grad_norm": 1.4380064010620117, "learning_rate": 0.00012079979843448695, "loss": 2.7795, "step": 1593 }, { "epoch": 0.44142896704513984, "grad_norm": 1.5805429220199585, "learning_rate": 0.00012071349763349484, "loss": 2.8181, "step": 1594 }, { "epoch": 0.44170589864303517, "grad_norm": 1.2909247875213623, "learning_rate": 0.00012062718071086317, "loss": 2.7215, "step": 1595 }, { "epoch": 0.4419828302409305, "grad_norm": 1.5421581268310547, "learning_rate": 0.00012054084773377364, "loss": 2.9609, "step": 1596 }, { "epoch": 0.4422597618388258, "grad_norm": 1.854814887046814, "learning_rate": 0.0001204544987694206, "loss": 2.6241, "step": 1597 }, { "epoch": 0.44253669343672114, "grad_norm": 3.275557279586792, "learning_rate": 0.00012036813388501085, "loss": 3.2255, "step": 1598 }, { "epoch": 0.44281362503461646, "grad_norm": 1.4313879013061523, "learning_rate": 0.00012028175314776344, "loss": 2.7054, "step": 1599 }, { "epoch": 0.4430905566325118, "grad_norm": 3.1973049640655518, "learning_rate": 0.00012019535662490992, "loss": 3.1645, "step": 1600 }, { "epoch": 0.4433674882304071, "grad_norm": 1.480504035949707, "learning_rate": 0.00012010894438369404, "loss": 2.9397, "step": 1601 }, { "epoch": 0.4436444198283024, "grad_norm": 2.36940598487854, "learning_rate": 0.00012002251649137179, "loss": 3.103, "step": 1602 }, { "epoch": 0.4439213514261977, "grad_norm": 1.381364345550537, "learning_rate": 0.00011993607301521137, "loss": 2.5824, "step": 1603 }, { "epoch": 0.44419828302409303, "grad_norm": 1.7203359603881836, "learning_rate": 0.00011984961402249311, "loss": 2.6152, "step": 1604 }, { "epoch": 0.44447521462198836, "grad_norm": 1.387699007987976, "learning_rate": 0.00011976313958050933, "loss": 2.6626, "step": 1605 }, { "epoch": 0.4447521462198837, "grad_norm": 1.2170424461364746, "learning_rate": 0.0001196766497565645, "loss": 2.5803, "step": 1606 }, { "epoch": 0.445029077817779, "grad_norm": 1.3350768089294434, "learning_rate": 0.00011959014461797497, "loss": 3.0191, "step": 1607 }, { "epoch": 0.44530600941567433, "grad_norm": 1.831148624420166, "learning_rate": 0.00011950362423206907, "loss": 2.6024, "step": 1608 }, { "epoch": 0.44558294101356966, "grad_norm": 1.5087878704071045, "learning_rate": 0.00011941708866618697, "loss": 2.7182, "step": 1609 }, { "epoch": 0.445859872611465, "grad_norm": 1.560234546661377, "learning_rate": 0.00011933053798768066, "loss": 2.8327, "step": 1610 }, { "epoch": 0.4461368042093603, "grad_norm": 1.6062860488891602, "learning_rate": 0.00011924397226391384, "loss": 2.7957, "step": 1611 }, { "epoch": 0.44641373580725563, "grad_norm": 1.3197059631347656, "learning_rate": 0.00011915739156226202, "loss": 2.9193, "step": 1612 }, { "epoch": 0.44669066740515095, "grad_norm": 1.0454370975494385, "learning_rate": 0.00011907079595011231, "loss": 2.9694, "step": 1613 }, { "epoch": 0.4469675990030462, "grad_norm": 1.8847899436950684, "learning_rate": 0.0001189841854948634, "loss": 3.1058, "step": 1614 }, { "epoch": 0.44724453060094155, "grad_norm": 1.7060329914093018, "learning_rate": 0.00011889756026392562, "loss": 3.068, "step": 1615 }, { "epoch": 0.4475214621988369, "grad_norm": 1.8782567977905273, "learning_rate": 0.00011881092032472073, "loss": 2.6358, "step": 1616 }, { "epoch": 0.4477983937967322, "grad_norm": 1.6177935600280762, "learning_rate": 0.00011872426574468187, "loss": 2.5237, "step": 1617 }, { "epoch": 0.4480753253946275, "grad_norm": 1.101818323135376, "learning_rate": 0.00011863759659125376, "loss": 2.6257, "step": 1618 }, { "epoch": 0.44835225699252285, "grad_norm": 1.8147088289260864, "learning_rate": 0.00011855091293189234, "loss": 2.5104, "step": 1619 }, { "epoch": 0.44862918859041817, "grad_norm": 1.6569304466247559, "learning_rate": 0.0001184642148340648, "loss": 2.9456, "step": 1620 }, { "epoch": 0.4489061201883135, "grad_norm": 1.4350814819335938, "learning_rate": 0.00011837750236524967, "loss": 2.6975, "step": 1621 }, { "epoch": 0.4491830517862088, "grad_norm": 1.4598429203033447, "learning_rate": 0.00011829077559293664, "loss": 2.7264, "step": 1622 }, { "epoch": 0.44945998338410414, "grad_norm": 1.9063446521759033, "learning_rate": 0.00011820403458462647, "loss": 3.161, "step": 1623 }, { "epoch": 0.44973691498199947, "grad_norm": 1.749877691268921, "learning_rate": 0.00011811727940783108, "loss": 3.0542, "step": 1624 }, { "epoch": 0.45001384657989474, "grad_norm": 3.7590386867523193, "learning_rate": 0.00011803051013007336, "loss": 3.8809, "step": 1625 }, { "epoch": 0.45029077817779006, "grad_norm": 1.150923728942871, "learning_rate": 0.00011794372681888721, "loss": 2.7087, "step": 1626 }, { "epoch": 0.4505677097756854, "grad_norm": 1.7766743898391724, "learning_rate": 0.00011785692954181746, "loss": 3.1327, "step": 1627 }, { "epoch": 0.4508446413735807, "grad_norm": 1.7171368598937988, "learning_rate": 0.00011777011836641978, "loss": 3.0311, "step": 1628 }, { "epoch": 0.45112157297147604, "grad_norm": 1.6420807838439941, "learning_rate": 0.00011768329336026062, "loss": 2.8227, "step": 1629 }, { "epoch": 0.45139850456937136, "grad_norm": 1.121830940246582, "learning_rate": 0.00011759645459091731, "loss": 2.5743, "step": 1630 }, { "epoch": 0.4516754361672667, "grad_norm": 1.3861570358276367, "learning_rate": 0.0001175096021259778, "loss": 2.6474, "step": 1631 }, { "epoch": 0.451952367765162, "grad_norm": 1.0731266736984253, "learning_rate": 0.0001174227360330407, "loss": 2.8231, "step": 1632 }, { "epoch": 0.45222929936305734, "grad_norm": 1.2852985858917236, "learning_rate": 0.00011733585637971527, "loss": 2.6447, "step": 1633 }, { "epoch": 0.45250623096095266, "grad_norm": 1.646578311920166, "learning_rate": 0.0001172489632336213, "loss": 2.6627, "step": 1634 }, { "epoch": 0.452783162558848, "grad_norm": 1.2219154834747314, "learning_rate": 0.00011716205666238908, "loss": 2.4261, "step": 1635 }, { "epoch": 0.4530600941567433, "grad_norm": 2.7012853622436523, "learning_rate": 0.0001170751367336594, "loss": 2.7471, "step": 1636 }, { "epoch": 0.4533370257546386, "grad_norm": 1.415909767150879, "learning_rate": 0.00011698820351508335, "loss": 2.6987, "step": 1637 }, { "epoch": 0.4536139573525339, "grad_norm": 1.3519766330718994, "learning_rate": 0.00011690125707432244, "loss": 2.9456, "step": 1638 }, { "epoch": 0.4538908889504292, "grad_norm": 1.2360529899597168, "learning_rate": 0.00011681429747904842, "loss": 2.5995, "step": 1639 }, { "epoch": 0.45416782054832455, "grad_norm": 2.9068734645843506, "learning_rate": 0.00011672732479694338, "loss": 3.1497, "step": 1640 }, { "epoch": 0.4544447521462199, "grad_norm": 1.7364581823349, "learning_rate": 0.0001166403390956994, "loss": 2.7091, "step": 1641 }, { "epoch": 0.4547216837441152, "grad_norm": 1.1984012126922607, "learning_rate": 0.00011655334044301892, "loss": 2.6776, "step": 1642 }, { "epoch": 0.4549986153420105, "grad_norm": 2.4119842052459717, "learning_rate": 0.00011646632890661431, "loss": 2.5245, "step": 1643 }, { "epoch": 0.45527554693990585, "grad_norm": 1.5090466737747192, "learning_rate": 0.00011637930455420798, "loss": 2.5598, "step": 1644 }, { "epoch": 0.4555524785378012, "grad_norm": 1.5950679779052734, "learning_rate": 0.00011629226745353242, "loss": 2.5069, "step": 1645 }, { "epoch": 0.4558294101356965, "grad_norm": 2.2791571617126465, "learning_rate": 0.00011620521767232988, "loss": 2.6385, "step": 1646 }, { "epoch": 0.4561063417335918, "grad_norm": 1.426064372062683, "learning_rate": 0.00011611815527835266, "loss": 2.8296, "step": 1647 }, { "epoch": 0.45638327333148715, "grad_norm": 1.7675575017929077, "learning_rate": 0.00011603108033936271, "loss": 2.5534, "step": 1648 }, { "epoch": 0.4566602049293824, "grad_norm": 1.77476966381073, "learning_rate": 0.00011594399292313192, "loss": 3.0106, "step": 1649 }, { "epoch": 0.45693713652727774, "grad_norm": 4.373526573181152, "learning_rate": 0.00011585689309744166, "loss": 4.0497, "step": 1650 }, { "epoch": 0.45721406812517307, "grad_norm": 1.977187156677246, "learning_rate": 0.00011576978093008317, "loss": 2.6723, "step": 1651 }, { "epoch": 0.4574909997230684, "grad_norm": 2.139688014984131, "learning_rate": 0.00011568265648885721, "loss": 2.4705, "step": 1652 }, { "epoch": 0.4577679313209637, "grad_norm": 1.7218035459518433, "learning_rate": 0.00011559551984157404, "loss": 2.761, "step": 1653 }, { "epoch": 0.45804486291885904, "grad_norm": 1.5717421770095825, "learning_rate": 0.00011550837105605354, "loss": 2.5358, "step": 1654 }, { "epoch": 0.45832179451675437, "grad_norm": 2.267744541168213, "learning_rate": 0.00011542121020012497, "loss": 2.9883, "step": 1655 }, { "epoch": 0.4585987261146497, "grad_norm": 1.3252437114715576, "learning_rate": 0.00011533403734162696, "loss": 2.6564, "step": 1656 }, { "epoch": 0.458875657712545, "grad_norm": 1.4101369380950928, "learning_rate": 0.00011524685254840752, "loss": 2.7786, "step": 1657 }, { "epoch": 0.45915258931044034, "grad_norm": 1.5715200901031494, "learning_rate": 0.00011515965588832393, "loss": 2.735, "step": 1658 }, { "epoch": 0.45942952090833566, "grad_norm": 1.70425283908844, "learning_rate": 0.00011507244742924274, "loss": 2.7144, "step": 1659 }, { "epoch": 0.45970645250623093, "grad_norm": 2.2731897830963135, "learning_rate": 0.00011498522723903966, "loss": 2.8134, "step": 1660 }, { "epoch": 0.45998338410412626, "grad_norm": 2.732503652572632, "learning_rate": 0.00011489799538559953, "loss": 2.8805, "step": 1661 }, { "epoch": 0.4602603157020216, "grad_norm": 1.5472544431686401, "learning_rate": 0.00011481075193681625, "loss": 2.8424, "step": 1662 }, { "epoch": 0.4605372472999169, "grad_norm": 1.6334117650985718, "learning_rate": 0.00011472349696059275, "loss": 2.7526, "step": 1663 }, { "epoch": 0.46081417889781223, "grad_norm": 1.8457192182540894, "learning_rate": 0.000114636230524841, "loss": 3.1816, "step": 1664 }, { "epoch": 0.46109111049570756, "grad_norm": 1.5061466693878174, "learning_rate": 0.00011454895269748176, "loss": 2.8831, "step": 1665 }, { "epoch": 0.4613680420936029, "grad_norm": 2.3925230503082275, "learning_rate": 0.00011446166354644479, "loss": 2.8143, "step": 1666 }, { "epoch": 0.4616449736914982, "grad_norm": 1.9008785486221313, "learning_rate": 0.00011437436313966856, "loss": 2.8392, "step": 1667 }, { "epoch": 0.46192190528939353, "grad_norm": 0.9940817952156067, "learning_rate": 0.0001142870515451004, "loss": 2.6159, "step": 1668 }, { "epoch": 0.46219883688728886, "grad_norm": 2.2903690338134766, "learning_rate": 0.00011419972883069623, "loss": 3.1102, "step": 1669 }, { "epoch": 0.4624757684851842, "grad_norm": 1.6850765943527222, "learning_rate": 0.00011411239506442073, "loss": 2.7338, "step": 1670 }, { "epoch": 0.4627527000830795, "grad_norm": 2.8360297679901123, "learning_rate": 0.0001140250503142471, "loss": 2.7281, "step": 1671 }, { "epoch": 0.4630296316809748, "grad_norm": 1.7955050468444824, "learning_rate": 0.00011393769464815718, "loss": 2.5796, "step": 1672 }, { "epoch": 0.4633065632788701, "grad_norm": 2.1702804565429688, "learning_rate": 0.0001138503281341412, "loss": 3.0308, "step": 1673 }, { "epoch": 0.4635834948767654, "grad_norm": 1.1264212131500244, "learning_rate": 0.00011376295084019792, "loss": 2.9888, "step": 1674 }, { "epoch": 0.46386042647466075, "grad_norm": 5.740417957305908, "learning_rate": 0.00011367556283433445, "loss": 4.6839, "step": 1675 }, { "epoch": 0.46413735807255607, "grad_norm": 1.6483699083328247, "learning_rate": 0.00011358816418456624, "loss": 2.6919, "step": 1676 }, { "epoch": 0.4644142896704514, "grad_norm": 1.5567094087600708, "learning_rate": 0.000113500754958917, "loss": 2.6648, "step": 1677 }, { "epoch": 0.4646912212683467, "grad_norm": 1.4033349752426147, "learning_rate": 0.00011341333522541873, "loss": 2.5723, "step": 1678 }, { "epoch": 0.46496815286624205, "grad_norm": 1.4402943849563599, "learning_rate": 0.00011332590505211159, "loss": 2.7539, "step": 1679 }, { "epoch": 0.46524508446413737, "grad_norm": 1.549176573753357, "learning_rate": 0.00011323846450704381, "loss": 2.756, "step": 1680 }, { "epoch": 0.4655220160620327, "grad_norm": 2.345027446746826, "learning_rate": 0.00011315101365827177, "loss": 2.9999, "step": 1681 }, { "epoch": 0.465798947659928, "grad_norm": 1.1298154592514038, "learning_rate": 0.00011306355257385985, "loss": 2.6519, "step": 1682 }, { "epoch": 0.46607587925782334, "grad_norm": 2.0884218215942383, "learning_rate": 0.00011297608132188035, "loss": 2.8539, "step": 1683 }, { "epoch": 0.4663528108557186, "grad_norm": 1.6356163024902344, "learning_rate": 0.00011288859997041353, "loss": 2.7576, "step": 1684 }, { "epoch": 0.46662974245361394, "grad_norm": 2.01662540435791, "learning_rate": 0.00011280110858754749, "loss": 2.8964, "step": 1685 }, { "epoch": 0.46690667405150926, "grad_norm": 1.871184229850769, "learning_rate": 0.00011271360724137812, "loss": 2.7625, "step": 1686 }, { "epoch": 0.4671836056494046, "grad_norm": 0.8342902064323425, "learning_rate": 0.00011262609600000913, "loss": 2.7472, "step": 1687 }, { "epoch": 0.4674605372472999, "grad_norm": 2.2938618659973145, "learning_rate": 0.00011253857493155188, "loss": 2.9402, "step": 1688 }, { "epoch": 0.46773746884519524, "grad_norm": 1.5873247385025024, "learning_rate": 0.00011245104410412537, "loss": 2.9376, "step": 1689 }, { "epoch": 0.46801440044309056, "grad_norm": 1.8236016035079956, "learning_rate": 0.00011236350358585618, "loss": 2.7425, "step": 1690 }, { "epoch": 0.4682913320409859, "grad_norm": 2.1926429271698, "learning_rate": 0.0001122759534448786, "loss": 2.9251, "step": 1691 }, { "epoch": 0.4685682636388812, "grad_norm": 2.688744068145752, "learning_rate": 0.00011218839374933413, "loss": 3.0736, "step": 1692 }, { "epoch": 0.46884519523677654, "grad_norm": 2.1417758464813232, "learning_rate": 0.00011210082456737192, "loss": 2.7558, "step": 1693 }, { "epoch": 0.46912212683467186, "grad_norm": 1.4778650999069214, "learning_rate": 0.00011201324596714844, "loss": 2.7607, "step": 1694 }, { "epoch": 0.46939905843256713, "grad_norm": 1.9460140466690063, "learning_rate": 0.00011192565801682744, "loss": 2.9039, "step": 1695 }, { "epoch": 0.46967599003046245, "grad_norm": 1.354663610458374, "learning_rate": 0.00011183806078458003, "loss": 2.8078, "step": 1696 }, { "epoch": 0.4699529216283578, "grad_norm": 1.3584784269332886, "learning_rate": 0.00011175045433858456, "loss": 2.8078, "step": 1697 }, { "epoch": 0.4702298532262531, "grad_norm": 2.069406032562256, "learning_rate": 0.00011166283874702638, "loss": 2.8907, "step": 1698 }, { "epoch": 0.4705067848241484, "grad_norm": 1.4888218641281128, "learning_rate": 0.00011157521407809815, "loss": 2.8859, "step": 1699 }, { "epoch": 0.47078371642204375, "grad_norm": 3.220172643661499, "learning_rate": 0.00011148758039999952, "loss": 3.7449, "step": 1700 }, { "epoch": 0.4710606480199391, "grad_norm": 1.4528577327728271, "learning_rate": 0.00011139993778093711, "loss": 2.6247, "step": 1701 }, { "epoch": 0.4713375796178344, "grad_norm": 0.8638042211532593, "learning_rate": 0.00011131228628912462, "loss": 2.6005, "step": 1702 }, { "epoch": 0.4716145112157297, "grad_norm": 2.3900153636932373, "learning_rate": 0.00011122462599278254, "loss": 3.0792, "step": 1703 }, { "epoch": 0.47189144281362505, "grad_norm": 1.8319112062454224, "learning_rate": 0.00011113695696013824, "loss": 2.6175, "step": 1704 }, { "epoch": 0.4721683744115204, "grad_norm": 1.171964168548584, "learning_rate": 0.0001110492792594259, "loss": 2.6471, "step": 1705 }, { "epoch": 0.4724453060094157, "grad_norm": 1.1825437545776367, "learning_rate": 0.00011096159295888646, "loss": 2.7537, "step": 1706 }, { "epoch": 0.47272223760731097, "grad_norm": 1.0841095447540283, "learning_rate": 0.00011087389812676754, "loss": 2.7579, "step": 1707 }, { "epoch": 0.4729991692052063, "grad_norm": 2.2057273387908936, "learning_rate": 0.0001107861948313234, "loss": 2.9958, "step": 1708 }, { "epoch": 0.4732761008031016, "grad_norm": 2.306844711303711, "learning_rate": 0.0001106984831408149, "loss": 3.1338, "step": 1709 }, { "epoch": 0.47355303240099694, "grad_norm": 1.9412044286727905, "learning_rate": 0.0001106107631235094, "loss": 2.8777, "step": 1710 }, { "epoch": 0.47382996399889227, "grad_norm": 2.0891644954681396, "learning_rate": 0.00011052303484768076, "loss": 2.8106, "step": 1711 }, { "epoch": 0.4741068955967876, "grad_norm": 1.4442548751831055, "learning_rate": 0.0001104352983816093, "loss": 2.8836, "step": 1712 }, { "epoch": 0.4743838271946829, "grad_norm": 1.5611284971237183, "learning_rate": 0.00011034755379358166, "loss": 2.72, "step": 1713 }, { "epoch": 0.47466075879257824, "grad_norm": 1.841544508934021, "learning_rate": 0.00011025980115189086, "loss": 2.7075, "step": 1714 }, { "epoch": 0.47493769039047357, "grad_norm": 1.9898557662963867, "learning_rate": 0.00011017204052483613, "loss": 2.8212, "step": 1715 }, { "epoch": 0.4752146219883689, "grad_norm": 1.2505582571029663, "learning_rate": 0.00011008427198072294, "loss": 2.8529, "step": 1716 }, { "epoch": 0.4754915535862642, "grad_norm": 1.7798211574554443, "learning_rate": 0.00010999649558786291, "loss": 2.7741, "step": 1717 }, { "epoch": 0.47576848518415954, "grad_norm": 1.4395679235458374, "learning_rate": 0.00010990871141457382, "loss": 2.9254, "step": 1718 }, { "epoch": 0.4760454167820548, "grad_norm": 1.0878676176071167, "learning_rate": 0.00010982091952917943, "loss": 2.8418, "step": 1719 }, { "epoch": 0.47632234837995013, "grad_norm": 1.452322244644165, "learning_rate": 0.00010973312000000956, "loss": 2.5341, "step": 1720 }, { "epoch": 0.47659927997784546, "grad_norm": 0.7994594573974609, "learning_rate": 0.00010964531289539996, "loss": 2.7033, "step": 1721 }, { "epoch": 0.4768762115757408, "grad_norm": 2.953162908554077, "learning_rate": 0.00010955749828369224, "loss": 3.1346, "step": 1722 }, { "epoch": 0.4771531431736361, "grad_norm": 1.1681833267211914, "learning_rate": 0.00010946967623323394, "loss": 2.8322, "step": 1723 }, { "epoch": 0.47743007477153143, "grad_norm": 2.5147387981414795, "learning_rate": 0.00010938184681237833, "loss": 2.9224, "step": 1724 }, { "epoch": 0.47770700636942676, "grad_norm": 3.0960545539855957, "learning_rate": 0.00010929401008948437, "loss": 3.5451, "step": 1725 }, { "epoch": 0.4779839379673221, "grad_norm": 1.759600043296814, "learning_rate": 0.00010920616613291682, "loss": 3.0218, "step": 1726 }, { "epoch": 0.4782608695652174, "grad_norm": 2.5177507400512695, "learning_rate": 0.00010911831501104597, "loss": 2.9697, "step": 1727 }, { "epoch": 0.47853780116311273, "grad_norm": 1.8307939767837524, "learning_rate": 0.00010903045679224772, "loss": 2.6927, "step": 1728 }, { "epoch": 0.47881473276100806, "grad_norm": 1.502057671546936, "learning_rate": 0.00010894259154490354, "loss": 2.6628, "step": 1729 }, { "epoch": 0.4790916643589033, "grad_norm": 1.1559317111968994, "learning_rate": 0.0001088547193374003, "loss": 2.8771, "step": 1730 }, { "epoch": 0.47936859595679865, "grad_norm": 1.6867003440856934, "learning_rate": 0.00010876684023813029, "loss": 3.0452, "step": 1731 }, { "epoch": 0.479645527554694, "grad_norm": 1.1228195428848267, "learning_rate": 0.00010867895431549122, "loss": 2.617, "step": 1732 }, { "epoch": 0.4799224591525893, "grad_norm": 2.0624401569366455, "learning_rate": 0.00010859106163788608, "loss": 2.8471, "step": 1733 }, { "epoch": 0.4801993907504846, "grad_norm": 2.141110420227051, "learning_rate": 0.00010850316227372312, "loss": 2.9351, "step": 1734 }, { "epoch": 0.48047632234837995, "grad_norm": 1.0106289386749268, "learning_rate": 0.0001084152562914158, "loss": 2.757, "step": 1735 }, { "epoch": 0.48075325394627527, "grad_norm": 1.6883660554885864, "learning_rate": 0.00010832734375938269, "loss": 2.8802, "step": 1736 }, { "epoch": 0.4810301855441706, "grad_norm": 1.8972655534744263, "learning_rate": 0.00010823942474604752, "loss": 2.8388, "step": 1737 }, { "epoch": 0.4813071171420659, "grad_norm": 2.012760639190674, "learning_rate": 0.00010815149931983902, "loss": 2.9814, "step": 1738 }, { "epoch": 0.48158404873996125, "grad_norm": 1.6839152574539185, "learning_rate": 0.00010806356754919091, "loss": 2.8423, "step": 1739 }, { "epoch": 0.48186098033785657, "grad_norm": 1.588602900505066, "learning_rate": 0.0001079756295025419, "loss": 2.4987, "step": 1740 }, { "epoch": 0.4821379119357519, "grad_norm": 2.094052791595459, "learning_rate": 0.00010788768524833555, "loss": 2.9114, "step": 1741 }, { "epoch": 0.48241484353364716, "grad_norm": 2.5224688053131104, "learning_rate": 0.0001077997348550202, "loss": 3.3598, "step": 1742 }, { "epoch": 0.4826917751315425, "grad_norm": 2.207646608352661, "learning_rate": 0.00010771177839104907, "loss": 2.9569, "step": 1743 }, { "epoch": 0.4829687067294378, "grad_norm": 1.835942268371582, "learning_rate": 0.00010762381592488002, "loss": 2.6772, "step": 1744 }, { "epoch": 0.48324563832733314, "grad_norm": 1.3604185581207275, "learning_rate": 0.00010753584752497565, "loss": 2.6844, "step": 1745 }, { "epoch": 0.48352256992522846, "grad_norm": 1.3155308961868286, "learning_rate": 0.0001074478732598031, "loss": 2.7029, "step": 1746 }, { "epoch": 0.4837995015231238, "grad_norm": 2.120432138442993, "learning_rate": 0.00010735989319783418, "loss": 2.6089, "step": 1747 }, { "epoch": 0.4840764331210191, "grad_norm": 1.0304890871047974, "learning_rate": 0.0001072719074075451, "loss": 2.577, "step": 1748 }, { "epoch": 0.48435336471891444, "grad_norm": 1.767606258392334, "learning_rate": 0.00010718391595741657, "loss": 2.8686, "step": 1749 }, { "epoch": 0.48463029631680976, "grad_norm": 4.137774467468262, "learning_rate": 0.00010709591891593376, "loss": 4.0072, "step": 1750 }, { "epoch": 0.4849072279147051, "grad_norm": 1.735060691833496, "learning_rate": 0.00010700791635158611, "loss": 2.7465, "step": 1751 }, { "epoch": 0.4851841595126004, "grad_norm": 1.67653489112854, "learning_rate": 0.0001069199083328674, "loss": 2.8397, "step": 1752 }, { "epoch": 0.48546109111049573, "grad_norm": 1.4346954822540283, "learning_rate": 0.00010683189492827567, "loss": 2.7064, "step": 1753 }, { "epoch": 0.485738022708391, "grad_norm": 1.6197758913040161, "learning_rate": 0.00010674387620631308, "loss": 2.8717, "step": 1754 }, { "epoch": 0.48601495430628633, "grad_norm": 1.7235110998153687, "learning_rate": 0.00010665585223548604, "loss": 2.5654, "step": 1755 }, { "epoch": 0.48629188590418165, "grad_norm": 1.6998274326324463, "learning_rate": 0.00010656782308430498, "loss": 2.8146, "step": 1756 }, { "epoch": 0.486568817502077, "grad_norm": 1.2824156284332275, "learning_rate": 0.00010647978882128431, "loss": 2.7132, "step": 1757 }, { "epoch": 0.4868457490999723, "grad_norm": 1.7642462253570557, "learning_rate": 0.00010639174951494253, "loss": 2.9732, "step": 1758 }, { "epoch": 0.4871226806978676, "grad_norm": 1.8423540592193604, "learning_rate": 0.00010630370523380202, "loss": 2.9273, "step": 1759 }, { "epoch": 0.48739961229576295, "grad_norm": 3.762970209121704, "learning_rate": 0.00010621565604638897, "loss": 2.8981, "step": 1760 }, { "epoch": 0.4876765438936583, "grad_norm": 2.405668258666992, "learning_rate": 0.00010612760202123346, "loss": 2.6629, "step": 1761 }, { "epoch": 0.4879534754915536, "grad_norm": 1.2584848403930664, "learning_rate": 0.00010603954322686935, "loss": 2.6843, "step": 1762 }, { "epoch": 0.4882304070894489, "grad_norm": 1.6214781999588013, "learning_rate": 0.00010595147973183415, "loss": 2.8037, "step": 1763 }, { "epoch": 0.48850733868734425, "grad_norm": 1.881946325302124, "learning_rate": 0.00010586341160466904, "loss": 3.0972, "step": 1764 }, { "epoch": 0.4887842702852395, "grad_norm": 1.7505426406860352, "learning_rate": 0.00010577533891391884, "loss": 2.8725, "step": 1765 }, { "epoch": 0.48906120188313484, "grad_norm": 2.402600049972534, "learning_rate": 0.00010568726172813193, "loss": 2.8975, "step": 1766 }, { "epoch": 0.48933813348103017, "grad_norm": 2.2055821418762207, "learning_rate": 0.00010559918011586014, "loss": 2.9041, "step": 1767 }, { "epoch": 0.4896150650789255, "grad_norm": 2.2101831436157227, "learning_rate": 0.00010551109414565878, "loss": 2.9287, "step": 1768 }, { "epoch": 0.4898919966768208, "grad_norm": 2.1074306964874268, "learning_rate": 0.00010542300388608652, "loss": 2.8658, "step": 1769 }, { "epoch": 0.49016892827471614, "grad_norm": 2.2460777759552, "learning_rate": 0.00010533490940570541, "loss": 2.598, "step": 1770 }, { "epoch": 0.49044585987261147, "grad_norm": 1.70467209815979, "learning_rate": 0.00010524681077308077, "loss": 2.8554, "step": 1771 }, { "epoch": 0.4907227914705068, "grad_norm": 1.0729258060455322, "learning_rate": 0.00010515870805678109, "loss": 2.7869, "step": 1772 }, { "epoch": 0.4909997230684021, "grad_norm": 1.8093794584274292, "learning_rate": 0.00010507060132537813, "loss": 3.181, "step": 1773 }, { "epoch": 0.49127665466629744, "grad_norm": 1.5102819204330444, "learning_rate": 0.00010498249064744679, "loss": 2.9491, "step": 1774 }, { "epoch": 0.49155358626419277, "grad_norm": 3.7557647228240967, "learning_rate": 0.00010489437609156492, "loss": 3.8743, "step": 1775 }, { "epoch": 0.4918305178620881, "grad_norm": 3.593411445617676, "learning_rate": 0.00010480625772631347, "loss": 2.7411, "step": 1776 }, { "epoch": 0.49210744945998336, "grad_norm": 2.1997084617614746, "learning_rate": 0.00010471813562027634, "loss": 2.8769, "step": 1777 }, { "epoch": 0.4923843810578787, "grad_norm": 1.8847730159759521, "learning_rate": 0.00010463000984204038, "loss": 2.8433, "step": 1778 }, { "epoch": 0.492661312655774, "grad_norm": 1.8461335897445679, "learning_rate": 0.00010454188046019524, "loss": 2.6241, "step": 1779 }, { "epoch": 0.49293824425366933, "grad_norm": 1.412909984588623, "learning_rate": 0.00010445374754333344, "loss": 2.8483, "step": 1780 }, { "epoch": 0.49321517585156466, "grad_norm": 2.3904054164886475, "learning_rate": 0.00010436561116005012, "loss": 3.296, "step": 1781 }, { "epoch": 0.49349210744946, "grad_norm": 1.9982287883758545, "learning_rate": 0.00010427747137894328, "loss": 2.91, "step": 1782 }, { "epoch": 0.4937690390473553, "grad_norm": 2.10746169090271, "learning_rate": 0.00010418932826861349, "loss": 2.8191, "step": 1783 }, { "epoch": 0.49404597064525063, "grad_norm": 1.9534249305725098, "learning_rate": 0.00010410118189766387, "loss": 2.6736, "step": 1784 }, { "epoch": 0.49432290224314596, "grad_norm": 1.4590779542922974, "learning_rate": 0.00010401303233470014, "loss": 2.5901, "step": 1785 }, { "epoch": 0.4945998338410413, "grad_norm": 1.5505417585372925, "learning_rate": 0.00010392487964833051, "loss": 2.7745, "step": 1786 }, { "epoch": 0.4948767654389366, "grad_norm": 1.32901930809021, "learning_rate": 0.00010383672390716556, "loss": 2.7089, "step": 1787 }, { "epoch": 0.49515369703683193, "grad_norm": 1.4571396112442017, "learning_rate": 0.00010374856517981832, "loss": 3.0791, "step": 1788 }, { "epoch": 0.4954306286347272, "grad_norm": 1.337172508239746, "learning_rate": 0.0001036604035349041, "loss": 2.7896, "step": 1789 }, { "epoch": 0.4957075602326225, "grad_norm": 2.150773286819458, "learning_rate": 0.00010357223904104045, "loss": 2.8772, "step": 1790 }, { "epoch": 0.49598449183051785, "grad_norm": 2.0338151454925537, "learning_rate": 0.00010348407176684723, "loss": 2.8992, "step": 1791 }, { "epoch": 0.4962614234284132, "grad_norm": 1.7818877696990967, "learning_rate": 0.00010339590178094638, "loss": 2.8217, "step": 1792 }, { "epoch": 0.4965383550263085, "grad_norm": 1.5991873741149902, "learning_rate": 0.00010330772915196199, "loss": 2.8831, "step": 1793 }, { "epoch": 0.4968152866242038, "grad_norm": 4.544161796569824, "learning_rate": 0.00010321955394852018, "loss": 3.1074, "step": 1794 }, { "epoch": 0.49709221822209915, "grad_norm": 1.8289427757263184, "learning_rate": 0.00010313137623924913, "loss": 2.7945, "step": 1795 }, { "epoch": 0.49736914981999447, "grad_norm": 1.4768778085708618, "learning_rate": 0.00010304319609277888, "loss": 2.8313, "step": 1796 }, { "epoch": 0.4976460814178898, "grad_norm": 2.174567222595215, "learning_rate": 0.00010295501357774146, "loss": 2.9793, "step": 1797 }, { "epoch": 0.4979230130157851, "grad_norm": 3.003870725631714, "learning_rate": 0.00010286682876277069, "loss": 2.7108, "step": 1798 }, { "epoch": 0.49819994461368045, "grad_norm": 1.7297585010528564, "learning_rate": 0.0001027786417165022, "loss": 2.899, "step": 1799 }, { "epoch": 0.4984768762115757, "grad_norm": 2.468169689178467, "learning_rate": 0.00010269045250757332, "loss": 3.4341, "step": 1800 }, { "epoch": 0.49875380780947104, "grad_norm": 1.8460444211959839, "learning_rate": 0.00010260226120462315, "loss": 2.7304, "step": 1801 }, { "epoch": 0.49903073940736636, "grad_norm": 1.7109407186508179, "learning_rate": 0.00010251406787629232, "loss": 2.7065, "step": 1802 }, { "epoch": 0.4993076710052617, "grad_norm": 2.210998773574829, "learning_rate": 0.00010242587259122307, "loss": 2.7809, "step": 1803 }, { "epoch": 0.499584602603157, "grad_norm": 1.3211313486099243, "learning_rate": 0.0001023376754180592, "loss": 2.6021, "step": 1804 }, { "epoch": 0.49986153420105234, "grad_norm": 2.163815975189209, "learning_rate": 0.00010224947642544593, "loss": 2.6656, "step": 1805 }, { "epoch": 0.5001384657989476, "grad_norm": 1.8360047340393066, "learning_rate": 0.00010216127568202994, "loss": 3.0708, "step": 1806 }, { "epoch": 0.5001384657989476, "eval_loss": 0.3538380563259125, "eval_runtime": 801.5582, "eval_samples_per_second": 7.588, "eval_steps_per_second": 1.898, "step": 1806 }, { "epoch": 0.5004153973968429, "grad_norm": 1.0672329664230347, "learning_rate": 0.00010207307325645923, "loss": 2.6721, "step": 1807 }, { "epoch": 0.5006923289947383, "grad_norm": 1.2056281566619873, "learning_rate": 0.00010198486921738313, "loss": 2.6439, "step": 1808 }, { "epoch": 0.5009692605926336, "grad_norm": 2.3176612854003906, "learning_rate": 0.00010189666363345223, "loss": 2.9849, "step": 1809 }, { "epoch": 0.5012461921905289, "grad_norm": 2.192941427230835, "learning_rate": 0.00010180845657331832, "loss": 3.0648, "step": 1810 }, { "epoch": 0.5015231237884242, "grad_norm": 1.4033616781234741, "learning_rate": 0.00010172024810563434, "loss": 2.6953, "step": 1811 }, { "epoch": 0.5018000553863196, "grad_norm": 1.9293441772460938, "learning_rate": 0.0001016320382990543, "loss": 2.7721, "step": 1812 }, { "epoch": 0.5020769869842149, "grad_norm": 1.4902727603912354, "learning_rate": 0.00010154382722223333, "loss": 2.6199, "step": 1813 }, { "epoch": 0.5023539185821102, "grad_norm": 1.641219973564148, "learning_rate": 0.00010145561494382742, "loss": 2.8022, "step": 1814 }, { "epoch": 0.5026308501800055, "grad_norm": 2.4387571811676025, "learning_rate": 0.0001013674015324936, "loss": 2.9015, "step": 1815 }, { "epoch": 0.5029077817779009, "grad_norm": 2.294370651245117, "learning_rate": 0.00010127918705688978, "loss": 2.7876, "step": 1816 }, { "epoch": 0.5031847133757962, "grad_norm": 1.5861742496490479, "learning_rate": 0.00010119097158567461, "loss": 2.8373, "step": 1817 }, { "epoch": 0.5034616449736915, "grad_norm": 2.003089666366577, "learning_rate": 0.0001011027551875076, "loss": 2.5265, "step": 1818 }, { "epoch": 0.5037385765715868, "grad_norm": 1.5193865299224854, "learning_rate": 0.00010101453793104898, "loss": 2.617, "step": 1819 }, { "epoch": 0.5040155081694822, "grad_norm": 2.096163511276245, "learning_rate": 0.00010092631988495957, "loss": 2.9376, "step": 1820 }, { "epoch": 0.5042924397673775, "grad_norm": 1.358824610710144, "learning_rate": 0.0001008381011179009, "loss": 2.6227, "step": 1821 }, { "epoch": 0.5045693713652728, "grad_norm": 1.8758240938186646, "learning_rate": 0.00010074988169853503, "loss": 2.7398, "step": 1822 }, { "epoch": 0.5048463029631681, "grad_norm": 1.6822946071624756, "learning_rate": 0.00010066166169552444, "loss": 3.0444, "step": 1823 }, { "epoch": 0.5051232345610634, "grad_norm": 2.4263274669647217, "learning_rate": 0.00010057344117753222, "loss": 2.781, "step": 1824 }, { "epoch": 0.5054001661589588, "grad_norm": 3.9306766986846924, "learning_rate": 0.00010048522021322176, "loss": 3.7365, "step": 1825 }, { "epoch": 0.5056770977568541, "grad_norm": 1.615681529045105, "learning_rate": 0.00010039699887125678, "loss": 2.6159, "step": 1826 }, { "epoch": 0.5059540293547494, "grad_norm": 2.1996114253997803, "learning_rate": 0.00010030877722030136, "loss": 3.1963, "step": 1827 }, { "epoch": 0.5062309609526447, "grad_norm": 1.7249773740768433, "learning_rate": 0.00010022055532901984, "loss": 2.8976, "step": 1828 }, { "epoch": 0.50650789255054, "grad_norm": 1.4680061340332031, "learning_rate": 0.00010013233326607661, "loss": 2.9497, "step": 1829 }, { "epoch": 0.5067848241484353, "grad_norm": 1.1771354675292969, "learning_rate": 0.00010004411110013634, "loss": 2.7234, "step": 1830 }, { "epoch": 0.5070617557463306, "grad_norm": 2.575040340423584, "learning_rate": 9.99558888998637e-05, "loss": 3.0858, "step": 1831 }, { "epoch": 0.5073386873442259, "grad_norm": 2.529264450073242, "learning_rate": 9.986766673392344e-05, "loss": 2.6105, "step": 1832 }, { "epoch": 0.5076156189421213, "grad_norm": 1.6454603672027588, "learning_rate": 9.977944467098018e-05, "loss": 2.6702, "step": 1833 }, { "epoch": 0.5078925505400166, "grad_norm": 2.0674197673797607, "learning_rate": 9.969122277969865e-05, "loss": 2.8699, "step": 1834 }, { "epoch": 0.5081694821379119, "grad_norm": 3.0945839881896973, "learning_rate": 9.960300112874326e-05, "loss": 3.1807, "step": 1835 }, { "epoch": 0.5084464137358072, "grad_norm": 1.3725661039352417, "learning_rate": 9.951477978677826e-05, "loss": 2.8623, "step": 1836 }, { "epoch": 0.5087233453337026, "grad_norm": 1.615574836730957, "learning_rate": 9.942655882246781e-05, "loss": 3.1129, "step": 1837 }, { "epoch": 0.5090002769315979, "grad_norm": 2.0210142135620117, "learning_rate": 9.93383383044756e-05, "loss": 2.7283, "step": 1838 }, { "epoch": 0.5092772085294932, "grad_norm": 2.330843925476074, "learning_rate": 9.9250118301465e-05, "loss": 2.8052, "step": 1839 }, { "epoch": 0.5095541401273885, "grad_norm": 1.1833444833755493, "learning_rate": 9.916189888209912e-05, "loss": 2.967, "step": 1840 }, { "epoch": 0.5098310717252839, "grad_norm": 1.0780837535858154, "learning_rate": 9.907368011504045e-05, "loss": 2.8319, "step": 1841 }, { "epoch": 0.5101080033231792, "grad_norm": 1.9871891736984253, "learning_rate": 9.898546206895106e-05, "loss": 2.8931, "step": 1842 }, { "epoch": 0.5103849349210745, "grad_norm": 1.5821150541305542, "learning_rate": 9.88972448124924e-05, "loss": 2.9264, "step": 1843 }, { "epoch": 0.5106618665189698, "grad_norm": 1.4421741962432861, "learning_rate": 9.880902841432544e-05, "loss": 2.5686, "step": 1844 }, { "epoch": 0.5109387981168652, "grad_norm": 1.768431544303894, "learning_rate": 9.872081294311024e-05, "loss": 2.6354, "step": 1845 }, { "epoch": 0.5112157297147605, "grad_norm": 1.4881033897399902, "learning_rate": 9.863259846750641e-05, "loss": 2.4879, "step": 1846 }, { "epoch": 0.5114926613126558, "grad_norm": 1.8734623193740845, "learning_rate": 9.854438505617263e-05, "loss": 2.5476, "step": 1847 }, { "epoch": 0.5117695929105511, "grad_norm": 2.301177501678467, "learning_rate": 9.84561727777667e-05, "loss": 3.0524, "step": 1848 }, { "epoch": 0.5120465245084465, "grad_norm": 1.8294378519058228, "learning_rate": 9.836796170094571e-05, "loss": 2.6988, "step": 1849 }, { "epoch": 0.5123234561063418, "grad_norm": 3.631112813949585, "learning_rate": 9.827975189436571e-05, "loss": 3.7903, "step": 1850 }, { "epoch": 0.5126003877042371, "grad_norm": 1.5857528448104858, "learning_rate": 9.81915434266817e-05, "loss": 2.7797, "step": 1851 }, { "epoch": 0.5128773193021324, "grad_norm": 1.5357552766799927, "learning_rate": 9.81033363665478e-05, "loss": 2.6727, "step": 1852 }, { "epoch": 0.5131542509000276, "grad_norm": 2.5034070014953613, "learning_rate": 9.801513078261692e-05, "loss": 2.9577, "step": 1853 }, { "epoch": 0.513431182497923, "grad_norm": 2.242323875427246, "learning_rate": 9.792692674354079e-05, "loss": 2.8901, "step": 1854 }, { "epoch": 0.5137081140958183, "grad_norm": 2.157565116882324, "learning_rate": 9.783872431797008e-05, "loss": 2.9966, "step": 1855 }, { "epoch": 0.5139850456937136, "grad_norm": 1.0566247701644897, "learning_rate": 9.77505235745541e-05, "loss": 2.6681, "step": 1856 }, { "epoch": 0.5142619772916089, "grad_norm": 1.577938437461853, "learning_rate": 9.766232458194082e-05, "loss": 2.8405, "step": 1857 }, { "epoch": 0.5145389088895043, "grad_norm": 0.847844660282135, "learning_rate": 9.757412740877697e-05, "loss": 2.7065, "step": 1858 }, { "epoch": 0.5148158404873996, "grad_norm": 1.4090830087661743, "learning_rate": 9.748593212370773e-05, "loss": 2.5943, "step": 1859 }, { "epoch": 0.5150927720852949, "grad_norm": 1.3234925270080566, "learning_rate": 9.739773879537687e-05, "loss": 2.6805, "step": 1860 }, { "epoch": 0.5153697036831902, "grad_norm": 1.1672470569610596, "learning_rate": 9.73095474924267e-05, "loss": 2.8095, "step": 1861 }, { "epoch": 0.5156466352810856, "grad_norm": 1.7627800703048706, "learning_rate": 9.722135828349784e-05, "loss": 2.6748, "step": 1862 }, { "epoch": 0.5159235668789809, "grad_norm": 1.1423671245574951, "learning_rate": 9.713317123722933e-05, "loss": 2.7358, "step": 1863 }, { "epoch": 0.5162004984768762, "grad_norm": 3.5104258060455322, "learning_rate": 9.704498642225856e-05, "loss": 3.2547, "step": 1864 }, { "epoch": 0.5164774300747715, "grad_norm": 2.085096836090088, "learning_rate": 9.695680390722116e-05, "loss": 2.9059, "step": 1865 }, { "epoch": 0.5167543616726669, "grad_norm": 1.6688218116760254, "learning_rate": 9.68686237607509e-05, "loss": 2.8208, "step": 1866 }, { "epoch": 0.5170312932705622, "grad_norm": 1.3693195581436157, "learning_rate": 9.678044605147984e-05, "loss": 2.737, "step": 1867 }, { "epoch": 0.5173082248684575, "grad_norm": 1.1143624782562256, "learning_rate": 9.669227084803806e-05, "loss": 2.8413, "step": 1868 }, { "epoch": 0.5175851564663528, "grad_norm": 1.4536685943603516, "learning_rate": 9.660409821905363e-05, "loss": 2.6045, "step": 1869 }, { "epoch": 0.5178620880642482, "grad_norm": 1.3811722993850708, "learning_rate": 9.65159282331528e-05, "loss": 2.4863, "step": 1870 }, { "epoch": 0.5181390196621435, "grad_norm": 2.0206351280212402, "learning_rate": 9.642776095895959e-05, "loss": 2.6733, "step": 1871 }, { "epoch": 0.5184159512600388, "grad_norm": 1.589301586151123, "learning_rate": 9.633959646509592e-05, "loss": 2.573, "step": 1872 }, { "epoch": 0.5186928828579341, "grad_norm": 1.077090859413147, "learning_rate": 9.62514348201817e-05, "loss": 2.8819, "step": 1873 }, { "epoch": 0.5189698144558295, "grad_norm": 1.8220001459121704, "learning_rate": 9.616327609283445e-05, "loss": 2.8533, "step": 1874 }, { "epoch": 0.5192467460537248, "grad_norm": 4.240963459014893, "learning_rate": 9.607512035166951e-05, "loss": 4.215, "step": 1875 }, { "epoch": 0.51952367765162, "grad_norm": 1.659738540649414, "learning_rate": 9.598696766529987e-05, "loss": 2.5913, "step": 1876 }, { "epoch": 0.5198006092495153, "grad_norm": 2.6843607425689697, "learning_rate": 9.589881810233617e-05, "loss": 3.0707, "step": 1877 }, { "epoch": 0.5200775408474106, "grad_norm": 1.4031602144241333, "learning_rate": 9.581067173138653e-05, "loss": 2.9531, "step": 1878 }, { "epoch": 0.520354472445306, "grad_norm": 1.6640127897262573, "learning_rate": 9.572252862105673e-05, "loss": 2.8823, "step": 1879 }, { "epoch": 0.5206314040432013, "grad_norm": 1.616991639137268, "learning_rate": 9.563438883994992e-05, "loss": 2.8805, "step": 1880 }, { "epoch": 0.5209083356410966, "grad_norm": 1.947402000427246, "learning_rate": 9.554625245666658e-05, "loss": 2.8934, "step": 1881 }, { "epoch": 0.521185267238992, "grad_norm": 1.7645297050476074, "learning_rate": 9.545811953980477e-05, "loss": 2.6654, "step": 1882 }, { "epoch": 0.5214621988368873, "grad_norm": 1.5257445573806763, "learning_rate": 9.536999015795963e-05, "loss": 2.9309, "step": 1883 }, { "epoch": 0.5217391304347826, "grad_norm": 1.9620789289474487, "learning_rate": 9.528186437972368e-05, "loss": 2.8087, "step": 1884 }, { "epoch": 0.5220160620326779, "grad_norm": 1.9745149612426758, "learning_rate": 9.519374227368656e-05, "loss": 2.5654, "step": 1885 }, { "epoch": 0.5222929936305732, "grad_norm": 1.953885793685913, "learning_rate": 9.510562390843513e-05, "loss": 2.6027, "step": 1886 }, { "epoch": 0.5225699252284686, "grad_norm": 1.6615899801254272, "learning_rate": 9.501750935255322e-05, "loss": 2.7731, "step": 1887 }, { "epoch": 0.5228468568263639, "grad_norm": 1.7982279062271118, "learning_rate": 9.492939867462188e-05, "loss": 2.7539, "step": 1888 }, { "epoch": 0.5231237884242592, "grad_norm": 0.8459172248840332, "learning_rate": 9.484129194321896e-05, "loss": 2.8204, "step": 1889 }, { "epoch": 0.5234007200221545, "grad_norm": 1.4594982862472534, "learning_rate": 9.475318922691926e-05, "loss": 2.9914, "step": 1890 }, { "epoch": 0.5236776516200499, "grad_norm": 1.65652596950531, "learning_rate": 9.466509059429461e-05, "loss": 2.8122, "step": 1891 }, { "epoch": 0.5239545832179452, "grad_norm": 1.3848004341125488, "learning_rate": 9.457699611391352e-05, "loss": 2.8318, "step": 1892 }, { "epoch": 0.5242315148158405, "grad_norm": 1.2636313438415527, "learning_rate": 9.448890585434124e-05, "loss": 2.8518, "step": 1893 }, { "epoch": 0.5245084464137358, "grad_norm": 1.5513272285461426, "learning_rate": 9.440081988413987e-05, "loss": 2.7005, "step": 1894 }, { "epoch": 0.5247853780116312, "grad_norm": 1.6557773351669312, "learning_rate": 9.431273827186808e-05, "loss": 2.6995, "step": 1895 }, { "epoch": 0.5250623096095265, "grad_norm": 1.735222578048706, "learning_rate": 9.422466108608117e-05, "loss": 2.676, "step": 1896 }, { "epoch": 0.5253392412074218, "grad_norm": 0.8899129629135132, "learning_rate": 9.413658839533099e-05, "loss": 2.614, "step": 1897 }, { "epoch": 0.5256161728053171, "grad_norm": 2.3472046852111816, "learning_rate": 9.40485202681659e-05, "loss": 2.9799, "step": 1898 }, { "epoch": 0.5258931044032124, "grad_norm": 2.162396192550659, "learning_rate": 9.396045677313067e-05, "loss": 3.197, "step": 1899 }, { "epoch": 0.5261700360011077, "grad_norm": 4.4616169929504395, "learning_rate": 9.387239797876657e-05, "loss": 3.7858, "step": 1900 }, { "epoch": 0.526446967599003, "grad_norm": 1.3809014558792114, "learning_rate": 9.378434395361108e-05, "loss": 2.8094, "step": 1901 }, { "epoch": 0.5267238991968983, "grad_norm": 1.2681607007980347, "learning_rate": 9.369629476619801e-05, "loss": 2.649, "step": 1902 }, { "epoch": 0.5270008307947937, "grad_norm": 1.460354208946228, "learning_rate": 9.360825048505749e-05, "loss": 2.6172, "step": 1903 }, { "epoch": 0.527277762392689, "grad_norm": 1.7847353219985962, "learning_rate": 9.352021117871574e-05, "loss": 2.7001, "step": 1904 }, { "epoch": 0.5275546939905843, "grad_norm": 1.2793521881103516, "learning_rate": 9.343217691569506e-05, "loss": 2.9796, "step": 1905 }, { "epoch": 0.5278316255884796, "grad_norm": 1.5768253803253174, "learning_rate": 9.3344147764514e-05, "loss": 2.7394, "step": 1906 }, { "epoch": 0.528108557186375, "grad_norm": 1.7183265686035156, "learning_rate": 9.325612379368695e-05, "loss": 2.5647, "step": 1907 }, { "epoch": 0.5283854887842703, "grad_norm": 2.033125638961792, "learning_rate": 9.316810507172435e-05, "loss": 2.7214, "step": 1908 }, { "epoch": 0.5286624203821656, "grad_norm": 1.1333674192428589, "learning_rate": 9.308009166713263e-05, "loss": 2.6175, "step": 1909 }, { "epoch": 0.5289393519800609, "grad_norm": 1.6264960765838623, "learning_rate": 9.299208364841394e-05, "loss": 2.7965, "step": 1910 }, { "epoch": 0.5292162835779562, "grad_norm": 1.4833307266235352, "learning_rate": 9.290408108406626e-05, "loss": 2.9848, "step": 1911 }, { "epoch": 0.5294932151758516, "grad_norm": 1.5251394510269165, "learning_rate": 9.281608404258347e-05, "loss": 2.8359, "step": 1912 }, { "epoch": 0.5297701467737469, "grad_norm": 1.435335636138916, "learning_rate": 9.272809259245497e-05, "loss": 2.7089, "step": 1913 }, { "epoch": 0.5300470783716422, "grad_norm": 1.2831498384475708, "learning_rate": 9.264010680216583e-05, "loss": 2.8127, "step": 1914 }, { "epoch": 0.5303240099695375, "grad_norm": 1.6412091255187988, "learning_rate": 9.255212674019691e-05, "loss": 2.8101, "step": 1915 }, { "epoch": 0.5306009415674329, "grad_norm": 1.5583209991455078, "learning_rate": 9.246415247502437e-05, "loss": 2.671, "step": 1916 }, { "epoch": 0.5308778731653282, "grad_norm": 0.9175875782966614, "learning_rate": 9.237618407512e-05, "loss": 2.8082, "step": 1917 }, { "epoch": 0.5311548047632235, "grad_norm": 1.8662338256835938, "learning_rate": 9.228822160895095e-05, "loss": 2.5191, "step": 1918 }, { "epoch": 0.5314317363611188, "grad_norm": 1.1972931623458862, "learning_rate": 9.220026514497983e-05, "loss": 2.4665, "step": 1919 }, { "epoch": 0.5317086679590142, "grad_norm": 1.1261391639709473, "learning_rate": 9.211231475166447e-05, "loss": 2.6903, "step": 1920 }, { "epoch": 0.5319855995569095, "grad_norm": 1.0068024396896362, "learning_rate": 9.202437049745812e-05, "loss": 2.5293, "step": 1921 }, { "epoch": 0.5322625311548047, "grad_norm": 1.2183352708816528, "learning_rate": 9.193643245080913e-05, "loss": 2.6087, "step": 1922 }, { "epoch": 0.5325394627527, "grad_norm": 0.8475114107131958, "learning_rate": 9.184850068016099e-05, "loss": 2.6615, "step": 1923 }, { "epoch": 0.5328163943505954, "grad_norm": 2.806544780731201, "learning_rate": 9.176057525395252e-05, "loss": 3.0558, "step": 1924 }, { "epoch": 0.5330933259484907, "grad_norm": 4.840540885925293, "learning_rate": 9.167265624061735e-05, "loss": 4.3507, "step": 1925 }, { "epoch": 0.533370257546386, "grad_norm": 2.502866744995117, "learning_rate": 9.158474370858421e-05, "loss": 2.9533, "step": 1926 }, { "epoch": 0.5336471891442813, "grad_norm": 1.1832548379898071, "learning_rate": 9.149683772627689e-05, "loss": 2.6061, "step": 1927 }, { "epoch": 0.5339241207421767, "grad_norm": 1.4272520542144775, "learning_rate": 9.140893836211393e-05, "loss": 2.6801, "step": 1928 }, { "epoch": 0.534201052340072, "grad_norm": 2.6324303150177, "learning_rate": 9.132104568450879e-05, "loss": 2.8803, "step": 1929 }, { "epoch": 0.5344779839379673, "grad_norm": 1.6218551397323608, "learning_rate": 9.123315976186972e-05, "loss": 2.8786, "step": 1930 }, { "epoch": 0.5347549155358626, "grad_norm": 2.4477269649505615, "learning_rate": 9.114528066259975e-05, "loss": 3.1828, "step": 1931 }, { "epoch": 0.535031847133758, "grad_norm": 2.003542423248291, "learning_rate": 9.105740845509647e-05, "loss": 3.0627, "step": 1932 }, { "epoch": 0.5353087787316533, "grad_norm": 1.9033154249191284, "learning_rate": 9.09695432077523e-05, "loss": 2.8415, "step": 1933 }, { "epoch": 0.5355857103295486, "grad_norm": 1.7393341064453125, "learning_rate": 9.088168498895408e-05, "loss": 2.8617, "step": 1934 }, { "epoch": 0.5358626419274439, "grad_norm": 2.2522337436676025, "learning_rate": 9.07938338670832e-05, "loss": 2.8665, "step": 1935 }, { "epoch": 0.5361395735253393, "grad_norm": 3.2727909088134766, "learning_rate": 9.070598991051565e-05, "loss": 3.087, "step": 1936 }, { "epoch": 0.5364165051232346, "grad_norm": 1.747339129447937, "learning_rate": 9.061815318762172e-05, "loss": 2.9148, "step": 1937 }, { "epoch": 0.5366934367211299, "grad_norm": 1.5096392631530762, "learning_rate": 9.053032376676607e-05, "loss": 2.3975, "step": 1938 }, { "epoch": 0.5369703683190252, "grad_norm": 1.3254399299621582, "learning_rate": 9.044250171630778e-05, "loss": 2.593, "step": 1939 }, { "epoch": 0.5372472999169206, "grad_norm": 1.1389713287353516, "learning_rate": 9.035468710460006e-05, "loss": 2.6224, "step": 1940 }, { "epoch": 0.5375242315148159, "grad_norm": 1.5871886014938354, "learning_rate": 9.026687999999045e-05, "loss": 2.8183, "step": 1941 }, { "epoch": 0.5378011631127112, "grad_norm": 1.3882852792739868, "learning_rate": 9.017908047082059e-05, "loss": 2.8155, "step": 1942 }, { "epoch": 0.5380780947106065, "grad_norm": 1.0509225130081177, "learning_rate": 9.009128858542621e-05, "loss": 2.7038, "step": 1943 }, { "epoch": 0.5383550263085018, "grad_norm": 1.0775872468948364, "learning_rate": 9.000350441213708e-05, "loss": 2.4207, "step": 1944 }, { "epoch": 0.5386319579063972, "grad_norm": 0.9771863222122192, "learning_rate": 8.991572801927709e-05, "loss": 2.5425, "step": 1945 }, { "epoch": 0.5389088895042924, "grad_norm": 0.9501717686653137, "learning_rate": 8.982795947516392e-05, "loss": 2.7157, "step": 1946 }, { "epoch": 0.5391858211021877, "grad_norm": 2.0657010078430176, "learning_rate": 8.974019884810915e-05, "loss": 2.4117, "step": 1947 }, { "epoch": 0.539462752700083, "grad_norm": 1.415224313735962, "learning_rate": 8.965244620641835e-05, "loss": 2.9617, "step": 1948 }, { "epoch": 0.5397396842979784, "grad_norm": 3.221825361251831, "learning_rate": 8.956470161839072e-05, "loss": 3.4721, "step": 1949 }, { "epoch": 0.5400166158958737, "grad_norm": 4.618506908416748, "learning_rate": 8.947696515231925e-05, "loss": 4.0782, "step": 1950 }, { "epoch": 0.540293547493769, "grad_norm": 2.003098249435425, "learning_rate": 8.938923687649063e-05, "loss": 2.8448, "step": 1951 }, { "epoch": 0.5405704790916643, "grad_norm": 1.5508081912994385, "learning_rate": 8.930151685918513e-05, "loss": 2.8655, "step": 1952 }, { "epoch": 0.5408474106895597, "grad_norm": 2.273536443710327, "learning_rate": 8.92138051686766e-05, "loss": 2.9843, "step": 1953 }, { "epoch": 0.541124342287455, "grad_norm": 2.2980072498321533, "learning_rate": 8.912610187323248e-05, "loss": 3.1316, "step": 1954 }, { "epoch": 0.5414012738853503, "grad_norm": 1.8171645402908325, "learning_rate": 8.903840704111357e-05, "loss": 2.7456, "step": 1955 }, { "epoch": 0.5416782054832456, "grad_norm": 1.8774298429489136, "learning_rate": 8.895072074057411e-05, "loss": 2.837, "step": 1956 }, { "epoch": 0.541955137081141, "grad_norm": 1.7538028955459595, "learning_rate": 8.886304303986179e-05, "loss": 2.8964, "step": 1957 }, { "epoch": 0.5422320686790363, "grad_norm": 1.317739486694336, "learning_rate": 8.87753740072175e-05, "loss": 2.5798, "step": 1958 }, { "epoch": 0.5425090002769316, "grad_norm": 1.340731143951416, "learning_rate": 8.868771371087539e-05, "loss": 2.7468, "step": 1959 }, { "epoch": 0.5427859318748269, "grad_norm": 1.9598355293273926, "learning_rate": 8.86000622190629e-05, "loss": 2.6228, "step": 1960 }, { "epoch": 0.5430628634727223, "grad_norm": 1.5818569660186768, "learning_rate": 8.851241960000052e-05, "loss": 2.8423, "step": 1961 }, { "epoch": 0.5433397950706176, "grad_norm": 1.6524901390075684, "learning_rate": 8.842478592190188e-05, "loss": 2.8836, "step": 1962 }, { "epoch": 0.5436167266685129, "grad_norm": 1.3244292736053467, "learning_rate": 8.833716125297364e-05, "loss": 2.7687, "step": 1963 }, { "epoch": 0.5438936582664082, "grad_norm": 1.4066139459609985, "learning_rate": 8.82495456614155e-05, "loss": 2.7047, "step": 1964 }, { "epoch": 0.5441705898643036, "grad_norm": 0.9185974597930908, "learning_rate": 8.816193921541995e-05, "loss": 2.8846, "step": 1965 }, { "epoch": 0.5444475214621989, "grad_norm": 1.8787202835083008, "learning_rate": 8.807434198317257e-05, "loss": 2.7159, "step": 1966 }, { "epoch": 0.5447244530600942, "grad_norm": 1.6922743320465088, "learning_rate": 8.79867540328516e-05, "loss": 2.5007, "step": 1967 }, { "epoch": 0.5450013846579895, "grad_norm": 1.678337812423706, "learning_rate": 8.78991754326281e-05, "loss": 2.9345, "step": 1968 }, { "epoch": 0.5452783162558847, "grad_norm": 1.3174742460250854, "learning_rate": 8.781160625066588e-05, "loss": 2.8186, "step": 1969 }, { "epoch": 0.5455552478537801, "grad_norm": 1.35786771774292, "learning_rate": 8.772404655512145e-05, "loss": 2.5213, "step": 1970 }, { "epoch": 0.5458321794516754, "grad_norm": 1.3507778644561768, "learning_rate": 8.76364964141438e-05, "loss": 2.7484, "step": 1971 }, { "epoch": 0.5461091110495707, "grad_norm": 1.12258780002594, "learning_rate": 8.754895589587466e-05, "loss": 2.6031, "step": 1972 }, { "epoch": 0.546386042647466, "grad_norm": 1.8437641859054565, "learning_rate": 8.746142506844815e-05, "loss": 2.9779, "step": 1973 }, { "epoch": 0.5466629742453614, "grad_norm": 1.7055060863494873, "learning_rate": 8.737390399999086e-05, "loss": 2.9552, "step": 1974 }, { "epoch": 0.5469399058432567, "grad_norm": 3.4427578449249268, "learning_rate": 8.72863927586219e-05, "loss": 3.9147, "step": 1975 }, { "epoch": 0.547216837441152, "grad_norm": 1.0980651378631592, "learning_rate": 8.719889141245256e-05, "loss": 2.902, "step": 1976 }, { "epoch": 0.5474937690390473, "grad_norm": 1.8571419715881348, "learning_rate": 8.711140002958648e-05, "loss": 2.6336, "step": 1977 }, { "epoch": 0.5477707006369427, "grad_norm": 2.336714506149292, "learning_rate": 8.702391867811967e-05, "loss": 2.7682, "step": 1978 }, { "epoch": 0.548047632234838, "grad_norm": 2.097658157348633, "learning_rate": 8.693644742614017e-05, "loss": 2.7868, "step": 1979 }, { "epoch": 0.5483245638327333, "grad_norm": 1.3417327404022217, "learning_rate": 8.684898634172823e-05, "loss": 2.6392, "step": 1980 }, { "epoch": 0.5486014954306286, "grad_norm": 1.5336050987243652, "learning_rate": 8.676153549295622e-05, "loss": 2.6745, "step": 1981 }, { "epoch": 0.548878427028524, "grad_norm": 1.5942492485046387, "learning_rate": 8.667409494788844e-05, "loss": 2.6761, "step": 1982 }, { "epoch": 0.5491553586264193, "grad_norm": 1.1920937299728394, "learning_rate": 8.658666477458128e-05, "loss": 2.9014, "step": 1983 }, { "epoch": 0.5494322902243146, "grad_norm": 2.115769386291504, "learning_rate": 8.649924504108302e-05, "loss": 2.8111, "step": 1984 }, { "epoch": 0.5497092218222099, "grad_norm": 1.9710432291030884, "learning_rate": 8.641183581543381e-05, "loss": 2.6635, "step": 1985 }, { "epoch": 0.5499861534201053, "grad_norm": 1.1491562128067017, "learning_rate": 8.632443716566556e-05, "loss": 2.8456, "step": 1986 }, { "epoch": 0.5502630850180006, "grad_norm": 1.9223709106445312, "learning_rate": 8.623704915980209e-05, "loss": 3.2156, "step": 1987 }, { "epoch": 0.5505400166158959, "grad_norm": 1.284507155418396, "learning_rate": 8.614967186585882e-05, "loss": 2.8755, "step": 1988 }, { "epoch": 0.5508169482137912, "grad_norm": 1.6829004287719727, "learning_rate": 8.606230535184283e-05, "loss": 3.087, "step": 1989 }, { "epoch": 0.5510938798116866, "grad_norm": 0.9009906053543091, "learning_rate": 8.59749496857529e-05, "loss": 2.7426, "step": 1990 }, { "epoch": 0.5513708114095819, "grad_norm": 1.6485047340393066, "learning_rate": 8.588760493557932e-05, "loss": 2.6128, "step": 1991 }, { "epoch": 0.5516477430074771, "grad_norm": 1.8770582675933838, "learning_rate": 8.580027116930378e-05, "loss": 3.0946, "step": 1992 }, { "epoch": 0.5519246746053724, "grad_norm": 1.648174524307251, "learning_rate": 8.571294845489963e-05, "loss": 2.6195, "step": 1993 }, { "epoch": 0.5522016062032677, "grad_norm": 1.851514458656311, "learning_rate": 8.562563686033145e-05, "loss": 2.7964, "step": 1994 }, { "epoch": 0.5524785378011631, "grad_norm": 1.098483681678772, "learning_rate": 8.553833645355524e-05, "loss": 2.727, "step": 1995 }, { "epoch": 0.5527554693990584, "grad_norm": 2.0957040786743164, "learning_rate": 8.545104730251824e-05, "loss": 2.4808, "step": 1996 }, { "epoch": 0.5530324009969537, "grad_norm": 1.257961392402649, "learning_rate": 8.536376947515905e-05, "loss": 2.4464, "step": 1997 }, { "epoch": 0.553309332594849, "grad_norm": 1.606569766998291, "learning_rate": 8.527650303940725e-05, "loss": 2.5888, "step": 1998 }, { "epoch": 0.5535862641927444, "grad_norm": 2.9079225063323975, "learning_rate": 8.518924806318378e-05, "loss": 3.1359, "step": 1999 }, { "epoch": 0.5538631957906397, "grad_norm": 3.6829776763916016, "learning_rate": 8.510200461440051e-05, "loss": 3.8721, "step": 2000 }, { "epoch": 0.554140127388535, "grad_norm": 1.0013054609298706, "learning_rate": 8.501477276096033e-05, "loss": 2.6452, "step": 2001 }, { "epoch": 0.5544170589864303, "grad_norm": 1.6629724502563477, "learning_rate": 8.492755257075728e-05, "loss": 2.8157, "step": 2002 }, { "epoch": 0.5546939905843257, "grad_norm": 2.2238128185272217, "learning_rate": 8.48403441116761e-05, "loss": 3.124, "step": 2003 }, { "epoch": 0.554970922182221, "grad_norm": 1.7178497314453125, "learning_rate": 8.47531474515925e-05, "loss": 2.6766, "step": 2004 }, { "epoch": 0.5552478537801163, "grad_norm": 2.264681816101074, "learning_rate": 8.466596265837305e-05, "loss": 2.7763, "step": 2005 }, { "epoch": 0.5555247853780116, "grad_norm": 1.8057752847671509, "learning_rate": 8.457878979987507e-05, "loss": 2.9713, "step": 2006 }, { "epoch": 0.555801716975907, "grad_norm": 1.1006498336791992, "learning_rate": 8.449162894394646e-05, "loss": 2.6146, "step": 2007 }, { "epoch": 0.5560786485738023, "grad_norm": 1.485538363456726, "learning_rate": 8.440448015842597e-05, "loss": 2.6359, "step": 2008 }, { "epoch": 0.5563555801716976, "grad_norm": 1.3723620176315308, "learning_rate": 8.431734351114284e-05, "loss": 2.6276, "step": 2009 }, { "epoch": 0.5566325117695929, "grad_norm": 1.5234686136245728, "learning_rate": 8.423021906991684e-05, "loss": 3.0212, "step": 2010 }, { "epoch": 0.5569094433674883, "grad_norm": 2.5109102725982666, "learning_rate": 8.414310690255835e-05, "loss": 3.2222, "step": 2011 }, { "epoch": 0.5571863749653836, "grad_norm": 1.1213618516921997, "learning_rate": 8.405600707686814e-05, "loss": 2.6941, "step": 2012 }, { "epoch": 0.5574633065632789, "grad_norm": 1.080403208732605, "learning_rate": 8.396891966063727e-05, "loss": 2.7642, "step": 2013 }, { "epoch": 0.5577402381611742, "grad_norm": 1.604928970336914, "learning_rate": 8.388184472164736e-05, "loss": 2.6035, "step": 2014 }, { "epoch": 0.5580171697590696, "grad_norm": 1.7442255020141602, "learning_rate": 8.379478232767013e-05, "loss": 2.7144, "step": 2015 }, { "epoch": 0.5582941013569648, "grad_norm": 1.8785520792007446, "learning_rate": 8.370773254646762e-05, "loss": 2.7974, "step": 2016 }, { "epoch": 0.5585710329548601, "grad_norm": 1.1109877824783325, "learning_rate": 8.362069544579205e-05, "loss": 2.5566, "step": 2017 }, { "epoch": 0.5588479645527554, "grad_norm": 1.8859199285507202, "learning_rate": 8.353367109338575e-05, "loss": 2.542, "step": 2018 }, { "epoch": 0.5591248961506508, "grad_norm": 1.1205683946609497, "learning_rate": 8.34466595569811e-05, "loss": 2.8586, "step": 2019 }, { "epoch": 0.5594018277485461, "grad_norm": 1.04397714138031, "learning_rate": 8.335966090430062e-05, "loss": 2.6522, "step": 2020 }, { "epoch": 0.5596787593464414, "grad_norm": 1.1475187540054321, "learning_rate": 8.327267520305669e-05, "loss": 2.3847, "step": 2021 }, { "epoch": 0.5599556909443367, "grad_norm": 1.7953752279281616, "learning_rate": 8.318570252095157e-05, "loss": 2.8365, "step": 2022 }, { "epoch": 0.560232622542232, "grad_norm": 1.6694520711898804, "learning_rate": 8.309874292567758e-05, "loss": 3.0579, "step": 2023 }, { "epoch": 0.5605095541401274, "grad_norm": 2.378661632537842, "learning_rate": 8.301179648491669e-05, "loss": 3.0315, "step": 2024 }, { "epoch": 0.5607864857380227, "grad_norm": 3.761888027191162, "learning_rate": 8.292486326634061e-05, "loss": 4.1084, "step": 2025 }, { "epoch": 0.561063417335918, "grad_norm": 1.8311100006103516, "learning_rate": 8.283794333761093e-05, "loss": 2.8907, "step": 2026 }, { "epoch": 0.5613403489338133, "grad_norm": 2.8452889919281006, "learning_rate": 8.275103676637871e-05, "loss": 3.1021, "step": 2027 }, { "epoch": 0.5616172805317087, "grad_norm": 1.8134194612503052, "learning_rate": 8.266414362028475e-05, "loss": 2.8781, "step": 2028 }, { "epoch": 0.561894212129604, "grad_norm": 1.291551947593689, "learning_rate": 8.257726396695933e-05, "loss": 2.9156, "step": 2029 }, { "epoch": 0.5621711437274993, "grad_norm": 1.6090056896209717, "learning_rate": 8.249039787402224e-05, "loss": 2.832, "step": 2030 }, { "epoch": 0.5624480753253946, "grad_norm": 1.1308784484863281, "learning_rate": 8.240354540908269e-05, "loss": 2.6733, "step": 2031 }, { "epoch": 0.56272500692329, "grad_norm": 1.1666631698608398, "learning_rate": 8.231670663973939e-05, "loss": 2.6943, "step": 2032 }, { "epoch": 0.5630019385211853, "grad_norm": 1.3602391481399536, "learning_rate": 8.222988163358027e-05, "loss": 2.7505, "step": 2033 }, { "epoch": 0.5632788701190806, "grad_norm": 1.7966910600662231, "learning_rate": 8.214307045818254e-05, "loss": 2.8815, "step": 2034 }, { "epoch": 0.563555801716976, "grad_norm": 2.1865363121032715, "learning_rate": 8.20562731811128e-05, "loss": 2.7694, "step": 2035 }, { "epoch": 0.5638327333148713, "grad_norm": 2.047175645828247, "learning_rate": 8.196948986992666e-05, "loss": 2.9757, "step": 2036 }, { "epoch": 0.5641096649127666, "grad_norm": 1.7177631855010986, "learning_rate": 8.188272059216894e-05, "loss": 3.0657, "step": 2037 }, { "epoch": 0.5643865965106619, "grad_norm": 2.061433792114258, "learning_rate": 8.179596541537355e-05, "loss": 3.3796, "step": 2038 }, { "epoch": 0.5646635281085571, "grad_norm": 1.3437374830245972, "learning_rate": 8.17092244070634e-05, "loss": 3.1446, "step": 2039 }, { "epoch": 0.5649404597064525, "grad_norm": 1.3183518648147583, "learning_rate": 8.162249763475034e-05, "loss": 2.9589, "step": 2040 }, { "epoch": 0.5652173913043478, "grad_norm": 1.7386242151260376, "learning_rate": 8.153578516593524e-05, "loss": 2.8237, "step": 2041 }, { "epoch": 0.5654943229022431, "grad_norm": 1.5994588136672974, "learning_rate": 8.144908706810773e-05, "loss": 2.8691, "step": 2042 }, { "epoch": 0.5657712545001384, "grad_norm": 1.195237636566162, "learning_rate": 8.136240340874624e-05, "loss": 2.7326, "step": 2043 }, { "epoch": 0.5660481860980338, "grad_norm": 2.3551249504089355, "learning_rate": 8.127573425531814e-05, "loss": 3.0542, "step": 2044 }, { "epoch": 0.5663251176959291, "grad_norm": 1.8700072765350342, "learning_rate": 8.118907967527934e-05, "loss": 2.5069, "step": 2045 }, { "epoch": 0.5666020492938244, "grad_norm": 1.3926947116851807, "learning_rate": 8.110243973607439e-05, "loss": 2.6471, "step": 2046 }, { "epoch": 0.5668789808917197, "grad_norm": 2.0446665287017822, "learning_rate": 8.10158145051366e-05, "loss": 3.096, "step": 2047 }, { "epoch": 0.567155912489615, "grad_norm": 2.3419084548950195, "learning_rate": 8.09292040498877e-05, "loss": 2.8887, "step": 2048 }, { "epoch": 0.5674328440875104, "grad_norm": 1.5965373516082764, "learning_rate": 8.084260843773799e-05, "loss": 2.7957, "step": 2049 }, { "epoch": 0.5677097756854057, "grad_norm": 2.7372281551361084, "learning_rate": 8.075602773608617e-05, "loss": 3.4179, "step": 2050 }, { "epoch": 0.567986707283301, "grad_norm": 1.4773602485656738, "learning_rate": 8.06694620123194e-05, "loss": 2.5247, "step": 2051 }, { "epoch": 0.5682636388811964, "grad_norm": 1.3511680364608765, "learning_rate": 8.058291133381304e-05, "loss": 2.929, "step": 2052 }, { "epoch": 0.5685405704790917, "grad_norm": 1.5803459882736206, "learning_rate": 8.049637576793094e-05, "loss": 2.8925, "step": 2053 }, { "epoch": 0.568817502076987, "grad_norm": 1.283042311668396, "learning_rate": 8.040985538202505e-05, "loss": 2.8373, "step": 2054 }, { "epoch": 0.5690944336748823, "grad_norm": 1.295939326286316, "learning_rate": 8.032335024343551e-05, "loss": 2.8411, "step": 2055 }, { "epoch": 0.5693713652727777, "grad_norm": 1.0056647062301636, "learning_rate": 8.02368604194907e-05, "loss": 2.8974, "step": 2056 }, { "epoch": 0.569648296870673, "grad_norm": 1.825939416885376, "learning_rate": 8.015038597750694e-05, "loss": 2.735, "step": 2057 }, { "epoch": 0.5699252284685683, "grad_norm": 1.263617992401123, "learning_rate": 8.006392698478862e-05, "loss": 2.811, "step": 2058 }, { "epoch": 0.5702021600664636, "grad_norm": 1.8211050033569336, "learning_rate": 7.997748350862822e-05, "loss": 3.0062, "step": 2059 }, { "epoch": 0.570479091664359, "grad_norm": 1.1094549894332886, "learning_rate": 7.989105561630598e-05, "loss": 2.9058, "step": 2060 }, { "epoch": 0.5707560232622543, "grad_norm": 1.377359390258789, "learning_rate": 7.980464337509006e-05, "loss": 2.6647, "step": 2061 }, { "epoch": 0.5710329548601495, "grad_norm": 1.1790040731430054, "learning_rate": 7.971824685223657e-05, "loss": 2.7497, "step": 2062 }, { "epoch": 0.5713098864580448, "grad_norm": 1.1479779481887817, "learning_rate": 7.96318661149892e-05, "loss": 2.726, "step": 2063 }, { "epoch": 0.5715868180559401, "grad_norm": 1.4475488662719727, "learning_rate": 7.954550123057939e-05, "loss": 2.7639, "step": 2064 }, { "epoch": 0.5718637496538355, "grad_norm": 1.5319483280181885, "learning_rate": 7.945915226622639e-05, "loss": 2.8743, "step": 2065 }, { "epoch": 0.5721406812517308, "grad_norm": 1.368962287902832, "learning_rate": 7.937281928913688e-05, "loss": 2.9782, "step": 2066 }, { "epoch": 0.5724176128496261, "grad_norm": 1.8736423254013062, "learning_rate": 7.928650236650514e-05, "loss": 2.857, "step": 2067 }, { "epoch": 0.5726945444475214, "grad_norm": 1.5408180952072144, "learning_rate": 7.920020156551307e-05, "loss": 2.5916, "step": 2068 }, { "epoch": 0.5729714760454168, "grad_norm": 1.3159997463226318, "learning_rate": 7.911391695332988e-05, "loss": 2.6835, "step": 2069 }, { "epoch": 0.5732484076433121, "grad_norm": 1.2888193130493164, "learning_rate": 7.902764859711223e-05, "loss": 2.6142, "step": 2070 }, { "epoch": 0.5735253392412074, "grad_norm": 1.3029569387435913, "learning_rate": 7.894139656400417e-05, "loss": 2.581, "step": 2071 }, { "epoch": 0.5738022708391027, "grad_norm": 1.5423462390899658, "learning_rate": 7.885516092113699e-05, "loss": 2.7403, "step": 2072 }, { "epoch": 0.5740792024369981, "grad_norm": 2.0652363300323486, "learning_rate": 7.876894173562919e-05, "loss": 3.1884, "step": 2073 }, { "epoch": 0.5743561340348934, "grad_norm": 2.0919711589813232, "learning_rate": 7.868273907458661e-05, "loss": 2.9806, "step": 2074 }, { "epoch": 0.5746330656327887, "grad_norm": 14.729402542114258, "learning_rate": 7.859655300510208e-05, "loss": 3.9938, "step": 2075 }, { "epoch": 0.574909997230684, "grad_norm": 2.284801721572876, "learning_rate": 7.851038359425553e-05, "loss": 3.2097, "step": 2076 }, { "epoch": 0.5751869288285794, "grad_norm": 1.1180181503295898, "learning_rate": 7.842423090911403e-05, "loss": 2.8244, "step": 2077 }, { "epoch": 0.5754638604264747, "grad_norm": 1.5196349620819092, "learning_rate": 7.833809501673155e-05, "loss": 2.8955, "step": 2078 }, { "epoch": 0.57574079202437, "grad_norm": 1.8050135374069214, "learning_rate": 7.825197598414895e-05, "loss": 2.8498, "step": 2079 }, { "epoch": 0.5760177236222653, "grad_norm": 1.4506398439407349, "learning_rate": 7.81658738783941e-05, "loss": 2.6669, "step": 2080 }, { "epoch": 0.5762946552201607, "grad_norm": 0.9477134943008423, "learning_rate": 7.807978876648154e-05, "loss": 2.6933, "step": 2081 }, { "epoch": 0.576571586818056, "grad_norm": 2.4606692790985107, "learning_rate": 7.799372071541272e-05, "loss": 2.8029, "step": 2082 }, { "epoch": 0.5768485184159513, "grad_norm": 1.804126262664795, "learning_rate": 7.790766979217571e-05, "loss": 2.7009, "step": 2083 }, { "epoch": 0.5771254500138466, "grad_norm": 2.0358893871307373, "learning_rate": 7.782163606374536e-05, "loss": 2.7267, "step": 2084 }, { "epoch": 0.577402381611742, "grad_norm": 1.8686617612838745, "learning_rate": 7.773561959708298e-05, "loss": 2.7292, "step": 2085 }, { "epoch": 0.5776793132096372, "grad_norm": 1.3002493381500244, "learning_rate": 7.76496204591366e-05, "loss": 3.0942, "step": 2086 }, { "epoch": 0.5779562448075325, "grad_norm": 1.4342671632766724, "learning_rate": 7.756363871684066e-05, "loss": 2.9262, "step": 2087 }, { "epoch": 0.5782331764054278, "grad_norm": 1.3472551107406616, "learning_rate": 7.747767443711609e-05, "loss": 2.9602, "step": 2088 }, { "epoch": 0.5785101080033231, "grad_norm": 1.9964295625686646, "learning_rate": 7.739172768687028e-05, "loss": 3.0501, "step": 2089 }, { "epoch": 0.5787870396012185, "grad_norm": 1.5070070028305054, "learning_rate": 7.730579853299691e-05, "loss": 2.8507, "step": 2090 }, { "epoch": 0.5790639711991138, "grad_norm": 1.8707159757614136, "learning_rate": 7.721988704237593e-05, "loss": 2.7029, "step": 2091 }, { "epoch": 0.5793409027970091, "grad_norm": 1.4172472953796387, "learning_rate": 7.713399328187371e-05, "loss": 3.0517, "step": 2092 }, { "epoch": 0.5796178343949044, "grad_norm": 1.8085438013076782, "learning_rate": 7.70481173183426e-05, "loss": 2.5827, "step": 2093 }, { "epoch": 0.5798947659927998, "grad_norm": 1.3533729314804077, "learning_rate": 7.696225921862126e-05, "loss": 2.5971, "step": 2094 }, { "epoch": 0.5801716975906951, "grad_norm": 1.7932556867599487, "learning_rate": 7.687641904953443e-05, "loss": 2.6052, "step": 2095 }, { "epoch": 0.5804486291885904, "grad_norm": 1.3073060512542725, "learning_rate": 7.67905968778928e-05, "loss": 2.7844, "step": 2096 }, { "epoch": 0.5807255607864857, "grad_norm": 1.4766454696655273, "learning_rate": 7.67047927704931e-05, "loss": 2.774, "step": 2097 }, { "epoch": 0.5810024923843811, "grad_norm": 1.1093850135803223, "learning_rate": 7.661900679411806e-05, "loss": 2.4939, "step": 2098 }, { "epoch": 0.5812794239822764, "grad_norm": 2.555997371673584, "learning_rate": 7.653323901553625e-05, "loss": 2.9841, "step": 2099 }, { "epoch": 0.5815563555801717, "grad_norm": 4.845314979553223, "learning_rate": 7.6447489501502e-05, "loss": 4.4183, "step": 2100 }, { "epoch": 0.581833287178067, "grad_norm": 3.8324882984161377, "learning_rate": 7.63617583187556e-05, "loss": 3.7171, "step": 2101 }, { "epoch": 0.5821102187759624, "grad_norm": 1.3293406963348389, "learning_rate": 7.627604553402291e-05, "loss": 2.6822, "step": 2102 }, { "epoch": 0.5823871503738577, "grad_norm": 1.512557029724121, "learning_rate": 7.619035121401554e-05, "loss": 2.8265, "step": 2103 }, { "epoch": 0.582664081971753, "grad_norm": 1.215285062789917, "learning_rate": 7.610467542543073e-05, "loss": 2.6078, "step": 2104 }, { "epoch": 0.5829410135696483, "grad_norm": 0.7829873561859131, "learning_rate": 7.60190182349513e-05, "loss": 2.6538, "step": 2105 }, { "epoch": 0.5832179451675437, "grad_norm": 2.242455005645752, "learning_rate": 7.593337970924554e-05, "loss": 2.7853, "step": 2106 }, { "epoch": 0.583494876765439, "grad_norm": 1.0244053602218628, "learning_rate": 7.584775991496732e-05, "loss": 2.7967, "step": 2107 }, { "epoch": 0.5837718083633343, "grad_norm": 2.7727086544036865, "learning_rate": 7.576215891875584e-05, "loss": 2.9253, "step": 2108 }, { "epoch": 0.5840487399612295, "grad_norm": 1.8041883707046509, "learning_rate": 7.567657678723565e-05, "loss": 2.6955, "step": 2109 }, { "epoch": 0.5843256715591248, "grad_norm": 2.026834726333618, "learning_rate": 7.559101358701673e-05, "loss": 2.5201, "step": 2110 }, { "epoch": 0.5846026031570202, "grad_norm": 1.2291382551193237, "learning_rate": 7.550546938469424e-05, "loss": 2.71, "step": 2111 }, { "epoch": 0.5848795347549155, "grad_norm": 1.3409702777862549, "learning_rate": 7.541994424684852e-05, "loss": 2.7428, "step": 2112 }, { "epoch": 0.5851564663528108, "grad_norm": 1.8270671367645264, "learning_rate": 7.53344382400452e-05, "loss": 2.8873, "step": 2113 }, { "epoch": 0.5854333979507061, "grad_norm": 2.524343729019165, "learning_rate": 7.52489514308349e-05, "loss": 3.2532, "step": 2114 }, { "epoch": 0.5857103295486015, "grad_norm": 1.66667902469635, "learning_rate": 7.516348388575336e-05, "loss": 2.6943, "step": 2115 }, { "epoch": 0.5859872611464968, "grad_norm": 2.1451516151428223, "learning_rate": 7.50780356713213e-05, "loss": 2.6747, "step": 2116 }, { "epoch": 0.5862641927443921, "grad_norm": 1.5665041208267212, "learning_rate": 7.499260685404442e-05, "loss": 2.6168, "step": 2117 }, { "epoch": 0.5865411243422874, "grad_norm": 1.6107885837554932, "learning_rate": 7.490719750041326e-05, "loss": 2.7984, "step": 2118 }, { "epoch": 0.5868180559401828, "grad_norm": 1.0318968296051025, "learning_rate": 7.482180767690334e-05, "loss": 2.6345, "step": 2119 }, { "epoch": 0.5870949875380781, "grad_norm": 1.556569218635559, "learning_rate": 7.473643744997483e-05, "loss": 2.885, "step": 2120 }, { "epoch": 0.5873719191359734, "grad_norm": 1.727760910987854, "learning_rate": 7.46510868860727e-05, "loss": 2.608, "step": 2121 }, { "epoch": 0.5876488507338687, "grad_norm": 1.6430820226669312, "learning_rate": 7.456575605162672e-05, "loss": 2.635, "step": 2122 }, { "epoch": 0.5879257823317641, "grad_norm": 1.2055941820144653, "learning_rate": 7.448044501305114e-05, "loss": 2.7858, "step": 2123 }, { "epoch": 0.5882027139296594, "grad_norm": 1.0556684732437134, "learning_rate": 7.439515383674485e-05, "loss": 2.8217, "step": 2124 }, { "epoch": 0.5884796455275547, "grad_norm": 3.524182081222534, "learning_rate": 7.43098825890914e-05, "loss": 3.5602, "step": 2125 }, { "epoch": 0.58875657712545, "grad_norm": 1.06099271774292, "learning_rate": 7.42246313364587e-05, "loss": 2.6188, "step": 2126 }, { "epoch": 0.5890335087233454, "grad_norm": 1.5477778911590576, "learning_rate": 7.413940014519907e-05, "loss": 2.6959, "step": 2127 }, { "epoch": 0.5893104403212407, "grad_norm": 1.5380752086639404, "learning_rate": 7.405418908164939e-05, "loss": 2.7056, "step": 2128 }, { "epoch": 0.589587371919136, "grad_norm": 1.5953619480133057, "learning_rate": 7.396899821213072e-05, "loss": 2.6368, "step": 2129 }, { "epoch": 0.5898643035170313, "grad_norm": 1.9024546146392822, "learning_rate": 7.38838276029484e-05, "loss": 2.8164, "step": 2130 }, { "epoch": 0.5901412351149267, "grad_norm": 1.7428923845291138, "learning_rate": 7.379867732039213e-05, "loss": 2.7139, "step": 2131 }, { "epoch": 0.5904181667128219, "grad_norm": 1.7151989936828613, "learning_rate": 7.371354743073566e-05, "loss": 2.6295, "step": 2132 }, { "epoch": 0.5906950983107172, "grad_norm": 1.5807915925979614, "learning_rate": 7.362843800023689e-05, "loss": 2.7809, "step": 2133 }, { "epoch": 0.5909720299086125, "grad_norm": 1.5456976890563965, "learning_rate": 7.354334909513791e-05, "loss": 2.6879, "step": 2134 }, { "epoch": 0.5912489615065079, "grad_norm": 1.4076478481292725, "learning_rate": 7.345828078166466e-05, "loss": 2.6555, "step": 2135 }, { "epoch": 0.5915258931044032, "grad_norm": 1.5420868396759033, "learning_rate": 7.337323312602718e-05, "loss": 2.6094, "step": 2136 }, { "epoch": 0.5918028247022985, "grad_norm": 1.3896031379699707, "learning_rate": 7.32882061944194e-05, "loss": 2.9571, "step": 2137 }, { "epoch": 0.5920797563001938, "grad_norm": 1.683767318725586, "learning_rate": 7.320320005301911e-05, "loss": 3.0786, "step": 2138 }, { "epoch": 0.5923566878980892, "grad_norm": 1.792648196220398, "learning_rate": 7.311821476798789e-05, "loss": 2.8892, "step": 2139 }, { "epoch": 0.5926336194959845, "grad_norm": 2.0883829593658447, "learning_rate": 7.303325040547116e-05, "loss": 3.319, "step": 2140 }, { "epoch": 0.5929105510938798, "grad_norm": 1.1606462001800537, "learning_rate": 7.2948307031598e-05, "loss": 2.8071, "step": 2141 }, { "epoch": 0.5931874826917751, "grad_norm": 1.9280019998550415, "learning_rate": 7.286338471248113e-05, "loss": 2.7506, "step": 2142 }, { "epoch": 0.5934644142896704, "grad_norm": 0.847400426864624, "learning_rate": 7.277848351421699e-05, "loss": 2.8121, "step": 2143 }, { "epoch": 0.5937413458875658, "grad_norm": 1.5167869329452515, "learning_rate": 7.269360350288547e-05, "loss": 2.6086, "step": 2144 }, { "epoch": 0.5940182774854611, "grad_norm": 1.0383349657058716, "learning_rate": 7.260874474455e-05, "loss": 2.715, "step": 2145 }, { "epoch": 0.5942952090833564, "grad_norm": 1.9108144044876099, "learning_rate": 7.25239073052575e-05, "loss": 2.9016, "step": 2146 }, { "epoch": 0.5945721406812517, "grad_norm": 1.9010608196258545, "learning_rate": 7.243909125103829e-05, "loss": 2.6386, "step": 2147 }, { "epoch": 0.5948490722791471, "grad_norm": 1.4881986379623413, "learning_rate": 7.235429664790603e-05, "loss": 2.7531, "step": 2148 }, { "epoch": 0.5951260038770424, "grad_norm": 1.6744920015335083, "learning_rate": 7.226952356185765e-05, "loss": 3.2777, "step": 2149 }, { "epoch": 0.5954029354749377, "grad_norm": 3.5773537158966064, "learning_rate": 7.218477205887344e-05, "loss": 3.7091, "step": 2150 }, { "epoch": 0.595679867072833, "grad_norm": 1.0836102962493896, "learning_rate": 7.210004220491673e-05, "loss": 2.7561, "step": 2151 }, { "epoch": 0.5959567986707284, "grad_norm": 1.4775099754333496, "learning_rate": 7.20153340659342e-05, "loss": 2.831, "step": 2152 }, { "epoch": 0.5962337302686237, "grad_norm": 1.6922560930252075, "learning_rate": 7.193064770785545e-05, "loss": 2.6291, "step": 2153 }, { "epoch": 0.596510661866519, "grad_norm": 1.7395472526550293, "learning_rate": 7.184598319659317e-05, "loss": 2.9307, "step": 2154 }, { "epoch": 0.5967875934644142, "grad_norm": 1.068514108657837, "learning_rate": 7.176134059804316e-05, "loss": 2.8556, "step": 2155 }, { "epoch": 0.5970645250623096, "grad_norm": 1.519996166229248, "learning_rate": 7.167671997808405e-05, "loss": 2.6497, "step": 2156 }, { "epoch": 0.5973414566602049, "grad_norm": 1.3461872339248657, "learning_rate": 7.159212140257734e-05, "loss": 2.6785, "step": 2157 }, { "epoch": 0.5976183882581002, "grad_norm": 1.668176293373108, "learning_rate": 7.150754493736749e-05, "loss": 2.9134, "step": 2158 }, { "epoch": 0.5978953198559955, "grad_norm": 1.2623629570007324, "learning_rate": 7.142299064828169e-05, "loss": 2.7, "step": 2159 }, { "epoch": 0.5981722514538909, "grad_norm": 1.7657973766326904, "learning_rate": 7.133845860112978e-05, "loss": 2.9739, "step": 2160 }, { "epoch": 0.5984491830517862, "grad_norm": 1.0858269929885864, "learning_rate": 7.12539488617045e-05, "loss": 2.5533, "step": 2161 }, { "epoch": 0.5987261146496815, "grad_norm": 1.3624961376190186, "learning_rate": 7.116946149578105e-05, "loss": 2.7661, "step": 2162 }, { "epoch": 0.5990030462475768, "grad_norm": 1.414823055267334, "learning_rate": 7.108499656911721e-05, "loss": 2.6578, "step": 2163 }, { "epoch": 0.5992799778454722, "grad_norm": 2.4731411933898926, "learning_rate": 7.100055414745346e-05, "loss": 3.1674, "step": 2164 }, { "epoch": 0.5995569094433675, "grad_norm": 1.1124169826507568, "learning_rate": 7.09161342965126e-05, "loss": 2.8148, "step": 2165 }, { "epoch": 0.5998338410412628, "grad_norm": 2.0510025024414062, "learning_rate": 7.08317370819999e-05, "loss": 3.0599, "step": 2166 }, { "epoch": 0.6001107726391581, "grad_norm": 1.8278013467788696, "learning_rate": 7.07473625696031e-05, "loss": 2.9529, "step": 2167 }, { "epoch": 0.6003877042370535, "grad_norm": 2.326231002807617, "learning_rate": 7.066301082499216e-05, "loss": 2.791, "step": 2168 }, { "epoch": 0.6006646358349488, "grad_norm": 1.308354139328003, "learning_rate": 7.057868191381936e-05, "loss": 2.6107, "step": 2169 }, { "epoch": 0.6009415674328441, "grad_norm": 1.7721244096755981, "learning_rate": 7.049437590171924e-05, "loss": 2.5734, "step": 2170 }, { "epoch": 0.6012184990307394, "grad_norm": 1.9452602863311768, "learning_rate": 7.041009285430848e-05, "loss": 2.8181, "step": 2171 }, { "epoch": 0.6014954306286348, "grad_norm": 1.4054052829742432, "learning_rate": 7.032583283718585e-05, "loss": 2.7896, "step": 2172 }, { "epoch": 0.6017723622265301, "grad_norm": 1.540339708328247, "learning_rate": 7.024159591593233e-05, "loss": 2.7709, "step": 2173 }, { "epoch": 0.6020492938244254, "grad_norm": 1.711843490600586, "learning_rate": 7.015738215611079e-05, "loss": 3.3467, "step": 2174 }, { "epoch": 0.6023262254223207, "grad_norm": 2.023811101913452, "learning_rate": 7.007319162326607e-05, "loss": 3.4878, "step": 2175 }, { "epoch": 0.602603157020216, "grad_norm": 0.9673023223876953, "learning_rate": 6.998902438292508e-05, "loss": 2.6065, "step": 2176 }, { "epoch": 0.6028800886181114, "grad_norm": 1.6274542808532715, "learning_rate": 6.990488050059644e-05, "loss": 2.6886, "step": 2177 }, { "epoch": 0.6031570202160067, "grad_norm": 2.1140644550323486, "learning_rate": 6.982076004177061e-05, "loss": 3.0166, "step": 2178 }, { "epoch": 0.6034339518139019, "grad_norm": 1.175340175628662, "learning_rate": 6.973666307191996e-05, "loss": 2.7278, "step": 2179 }, { "epoch": 0.6037108834117972, "grad_norm": 1.7920030355453491, "learning_rate": 6.965258965649841e-05, "loss": 2.8499, "step": 2180 }, { "epoch": 0.6039878150096926, "grad_norm": 1.4756419658660889, "learning_rate": 6.956853986094163e-05, "loss": 2.8933, "step": 2181 }, { "epoch": 0.6042647466075879, "grad_norm": 2.0288076400756836, "learning_rate": 6.948451375066689e-05, "loss": 2.9402, "step": 2182 }, { "epoch": 0.6045416782054832, "grad_norm": 1.8753596544265747, "learning_rate": 6.940051139107306e-05, "loss": 2.8504, "step": 2183 }, { "epoch": 0.6048186098033785, "grad_norm": 1.7512092590332031, "learning_rate": 6.931653284754042e-05, "loss": 2.8587, "step": 2184 }, { "epoch": 0.6050955414012739, "grad_norm": 0.7080219388008118, "learning_rate": 6.923257818543088e-05, "loss": 2.7284, "step": 2185 }, { "epoch": 0.6053724729991692, "grad_norm": 1.5267521142959595, "learning_rate": 6.914864747008762e-05, "loss": 3.099, "step": 2186 }, { "epoch": 0.6056494045970645, "grad_norm": 1.6850520372390747, "learning_rate": 6.906474076683519e-05, "loss": 2.7281, "step": 2187 }, { "epoch": 0.6059263361949598, "grad_norm": 1.2631269693374634, "learning_rate": 6.898085814097957e-05, "loss": 2.911, "step": 2188 }, { "epoch": 0.6062032677928552, "grad_norm": 1.7381616830825806, "learning_rate": 6.889699965780787e-05, "loss": 2.6787, "step": 2189 }, { "epoch": 0.6064801993907505, "grad_norm": 1.130190372467041, "learning_rate": 6.881316538258846e-05, "loss": 2.9246, "step": 2190 }, { "epoch": 0.6067571309886458, "grad_norm": 1.3303025960922241, "learning_rate": 6.87293553805709e-05, "loss": 3.1455, "step": 2191 }, { "epoch": 0.6070340625865411, "grad_norm": 1.4642069339752197, "learning_rate": 6.864556971698583e-05, "loss": 2.6106, "step": 2192 }, { "epoch": 0.6073109941844365, "grad_norm": 1.1034564971923828, "learning_rate": 6.856180845704488e-05, "loss": 2.7239, "step": 2193 }, { "epoch": 0.6075879257823318, "grad_norm": 1.9593942165374756, "learning_rate": 6.847807166594083e-05, "loss": 2.7019, "step": 2194 }, { "epoch": 0.6078648573802271, "grad_norm": 1.1505136489868164, "learning_rate": 6.839435940884731e-05, "loss": 2.6409, "step": 2195 }, { "epoch": 0.6081417889781224, "grad_norm": 1.1177148818969727, "learning_rate": 6.831067175091884e-05, "loss": 2.7674, "step": 2196 }, { "epoch": 0.6084187205760178, "grad_norm": 1.723169207572937, "learning_rate": 6.82270087572909e-05, "loss": 2.5711, "step": 2197 }, { "epoch": 0.6086956521739131, "grad_norm": 1.8692177534103394, "learning_rate": 6.814337049307966e-05, "loss": 2.8186, "step": 2198 }, { "epoch": 0.6089725837718084, "grad_norm": 1.2709190845489502, "learning_rate": 6.805975702338208e-05, "loss": 2.909, "step": 2199 }, { "epoch": 0.6092495153697037, "grad_norm": 3.416496515274048, "learning_rate": 6.797616841327591e-05, "loss": 3.9108, "step": 2200 }, { "epoch": 0.6095264469675991, "grad_norm": 1.9776725769042969, "learning_rate": 6.78926047278194e-05, "loss": 2.8953, "step": 2201 }, { "epoch": 0.6098033785654943, "grad_norm": 1.553315281867981, "learning_rate": 6.780906603205148e-05, "loss": 2.733, "step": 2202 }, { "epoch": 0.6100803101633896, "grad_norm": 1.8651763200759888, "learning_rate": 6.772555239099166e-05, "loss": 2.9544, "step": 2203 }, { "epoch": 0.6103572417612849, "grad_norm": 1.6313016414642334, "learning_rate": 6.764206386963991e-05, "loss": 2.6848, "step": 2204 }, { "epoch": 0.6106341733591802, "grad_norm": 1.7985180616378784, "learning_rate": 6.755860053297662e-05, "loss": 2.5212, "step": 2205 }, { "epoch": 0.6109111049570756, "grad_norm": 1.282670497894287, "learning_rate": 6.747516244596267e-05, "loss": 2.8279, "step": 2206 }, { "epoch": 0.6111880365549709, "grad_norm": 2.2696824073791504, "learning_rate": 6.73917496735392e-05, "loss": 2.9487, "step": 2207 }, { "epoch": 0.6114649681528662, "grad_norm": 1.2766075134277344, "learning_rate": 6.730836228062763e-05, "loss": 2.6696, "step": 2208 }, { "epoch": 0.6117418997507615, "grad_norm": 1.6926491260528564, "learning_rate": 6.722500033212974e-05, "loss": 2.8545, "step": 2209 }, { "epoch": 0.6120188313486569, "grad_norm": 1.9839706420898438, "learning_rate": 6.714166389292744e-05, "loss": 2.5835, "step": 2210 }, { "epoch": 0.6122957629465522, "grad_norm": 1.2539972066879272, "learning_rate": 6.70583530278827e-05, "loss": 3.0236, "step": 2211 }, { "epoch": 0.6125726945444475, "grad_norm": 1.5419610738754272, "learning_rate": 6.697506780183778e-05, "loss": 2.8228, "step": 2212 }, { "epoch": 0.6128496261423428, "grad_norm": 1.4121469259262085, "learning_rate": 6.689180827961481e-05, "loss": 3.0115, "step": 2213 }, { "epoch": 0.6131265577402382, "grad_norm": 1.744856834411621, "learning_rate": 6.680857452601598e-05, "loss": 2.7659, "step": 2214 }, { "epoch": 0.6134034893381335, "grad_norm": 2.4373672008514404, "learning_rate": 6.67253666058235e-05, "loss": 3.2158, "step": 2215 }, { "epoch": 0.6136804209360288, "grad_norm": 1.1923551559448242, "learning_rate": 6.664218458379933e-05, "loss": 2.6316, "step": 2216 }, { "epoch": 0.6139573525339241, "grad_norm": 1.2512011528015137, "learning_rate": 6.65590285246853e-05, "loss": 2.6847, "step": 2217 }, { "epoch": 0.6142342841318195, "grad_norm": 1.0812313556671143, "learning_rate": 6.647589849320318e-05, "loss": 2.5979, "step": 2218 }, { "epoch": 0.6145112157297148, "grad_norm": 1.2180825471878052, "learning_rate": 6.639279455405432e-05, "loss": 2.6674, "step": 2219 }, { "epoch": 0.6147881473276101, "grad_norm": 0.8857952356338501, "learning_rate": 6.630971677191978e-05, "loss": 2.6687, "step": 2220 }, { "epoch": 0.6150650789255054, "grad_norm": 1.5782952308654785, "learning_rate": 6.622666521146036e-05, "loss": 2.7259, "step": 2221 }, { "epoch": 0.6153420105234008, "grad_norm": 2.128387212753296, "learning_rate": 6.614363993731636e-05, "loss": 2.8938, "step": 2222 }, { "epoch": 0.6156189421212961, "grad_norm": 0.9730496406555176, "learning_rate": 6.606064101410765e-05, "loss": 2.5285, "step": 2223 }, { "epoch": 0.6158958737191914, "grad_norm": 1.520876407623291, "learning_rate": 6.597766850643361e-05, "loss": 3.1274, "step": 2224 }, { "epoch": 0.6161728053170866, "grad_norm": 2.834122657775879, "learning_rate": 6.589472247887305e-05, "loss": 3.5855, "step": 2225 }, { "epoch": 0.616449736914982, "grad_norm": 1.961204171180725, "learning_rate": 6.58118029959841e-05, "loss": 2.9324, "step": 2226 }, { "epoch": 0.6167266685128773, "grad_norm": 1.0947190523147583, "learning_rate": 6.572891012230438e-05, "loss": 2.5908, "step": 2227 }, { "epoch": 0.6170036001107726, "grad_norm": 1.51961350440979, "learning_rate": 6.564604392235066e-05, "loss": 2.8876, "step": 2228 }, { "epoch": 0.6172805317086679, "grad_norm": 2.206125020980835, "learning_rate": 6.556320446061902e-05, "loss": 2.8758, "step": 2229 }, { "epoch": 0.6175574633065632, "grad_norm": 1.4404202699661255, "learning_rate": 6.548039180158466e-05, "loss": 2.9083, "step": 2230 }, { "epoch": 0.6178343949044586, "grad_norm": 1.103936791419983, "learning_rate": 6.539760600970205e-05, "loss": 2.7771, "step": 2231 }, { "epoch": 0.6181113265023539, "grad_norm": 1.6521354913711548, "learning_rate": 6.531484714940461e-05, "loss": 2.7143, "step": 2232 }, { "epoch": 0.6183882581002492, "grad_norm": 1.1557974815368652, "learning_rate": 6.523211528510487e-05, "loss": 2.8761, "step": 2233 }, { "epoch": 0.6186651896981445, "grad_norm": 0.951933741569519, "learning_rate": 6.514941048119435e-05, "loss": 2.7942, "step": 2234 }, { "epoch": 0.6189421212960399, "grad_norm": 1.1569383144378662, "learning_rate": 6.50667328020435e-05, "loss": 2.7508, "step": 2235 }, { "epoch": 0.6192190528939352, "grad_norm": 1.6564579010009766, "learning_rate": 6.498408231200159e-05, "loss": 2.6084, "step": 2236 }, { "epoch": 0.6194959844918305, "grad_norm": 1.7123055458068848, "learning_rate": 6.490145907539689e-05, "loss": 2.4802, "step": 2237 }, { "epoch": 0.6197729160897258, "grad_norm": 1.9307233095169067, "learning_rate": 6.481886315653632e-05, "loss": 2.8049, "step": 2238 }, { "epoch": 0.6200498476876212, "grad_norm": 1.2563745975494385, "learning_rate": 6.47362946197055e-05, "loss": 2.9327, "step": 2239 }, { "epoch": 0.6203267792855165, "grad_norm": 1.546926498413086, "learning_rate": 6.465375352916893e-05, "loss": 2.7569, "step": 2240 }, { "epoch": 0.6206037108834118, "grad_norm": 1.681423306465149, "learning_rate": 6.45712399491696e-05, "loss": 3.1465, "step": 2241 }, { "epoch": 0.6208806424813071, "grad_norm": 1.571089267730713, "learning_rate": 6.448875394392905e-05, "loss": 2.8521, "step": 2242 }, { "epoch": 0.6211575740792025, "grad_norm": 1.9650115966796875, "learning_rate": 6.440629557764753e-05, "loss": 2.8507, "step": 2243 }, { "epoch": 0.6214345056770978, "grad_norm": 1.1810646057128906, "learning_rate": 6.432386491450361e-05, "loss": 2.5333, "step": 2244 }, { "epoch": 0.6217114372749931, "grad_norm": 1.0897647142410278, "learning_rate": 6.42414620186544e-05, "loss": 2.8562, "step": 2245 }, { "epoch": 0.6219883688728884, "grad_norm": 0.9666537046432495, "learning_rate": 6.415908695423534e-05, "loss": 2.8046, "step": 2246 }, { "epoch": 0.6222653004707838, "grad_norm": 1.246010422706604, "learning_rate": 6.407673978536029e-05, "loss": 2.749, "step": 2247 }, { "epoch": 0.6225422320686791, "grad_norm": 1.62389075756073, "learning_rate": 6.399442057612122e-05, "loss": 2.8788, "step": 2248 }, { "epoch": 0.6228191636665743, "grad_norm": 1.7508984804153442, "learning_rate": 6.391212939058861e-05, "loss": 3.1259, "step": 2249 }, { "epoch": 0.6230960952644696, "grad_norm": 3.678032159805298, "learning_rate": 6.38298662928109e-05, "loss": 3.8598, "step": 2250 }, { "epoch": 0.623373026862365, "grad_norm": 1.0360453128814697, "learning_rate": 6.374763134681472e-05, "loss": 2.6876, "step": 2251 }, { "epoch": 0.6236499584602603, "grad_norm": 1.6751775741577148, "learning_rate": 6.366542461660488e-05, "loss": 3.1079, "step": 2252 }, { "epoch": 0.6239268900581556, "grad_norm": 1.4977526664733887, "learning_rate": 6.358324616616412e-05, "loss": 2.695, "step": 2253 }, { "epoch": 0.6242038216560509, "grad_norm": 1.6655688285827637, "learning_rate": 6.350109605945323e-05, "loss": 3.0501, "step": 2254 }, { "epoch": 0.6244807532539463, "grad_norm": 1.2852628231048584, "learning_rate": 6.341897436041093e-05, "loss": 3.0004, "step": 2255 }, { "epoch": 0.6247576848518416, "grad_norm": 1.231870412826538, "learning_rate": 6.333688113295384e-05, "loss": 2.5332, "step": 2256 }, { "epoch": 0.6250346164497369, "grad_norm": 1.5554944276809692, "learning_rate": 6.325481644097635e-05, "loss": 2.5779, "step": 2257 }, { "epoch": 0.6253115480476322, "grad_norm": 1.6678074598312378, "learning_rate": 6.317278034835077e-05, "loss": 2.7953, "step": 2258 }, { "epoch": 0.6255884796455276, "grad_norm": 1.0044925212860107, "learning_rate": 6.309077291892702e-05, "loss": 2.6219, "step": 2259 }, { "epoch": 0.6258654112434229, "grad_norm": 1.7450201511383057, "learning_rate": 6.300879421653277e-05, "loss": 2.8333, "step": 2260 }, { "epoch": 0.6261423428413182, "grad_norm": 2.0873069763183594, "learning_rate": 6.292684430497335e-05, "loss": 2.6769, "step": 2261 }, { "epoch": 0.6264192744392135, "grad_norm": 1.1980091333389282, "learning_rate": 6.284492324803167e-05, "loss": 2.7962, "step": 2262 }, { "epoch": 0.6266962060371088, "grad_norm": 1.5165520906448364, "learning_rate": 6.27630311094681e-05, "loss": 2.92, "step": 2263 }, { "epoch": 0.6269731376350042, "grad_norm": 1.4691338539123535, "learning_rate": 6.268116795302068e-05, "loss": 3.0009, "step": 2264 }, { "epoch": 0.6272500692328995, "grad_norm": 1.3578917980194092, "learning_rate": 6.25993338424047e-05, "loss": 2.5752, "step": 2265 }, { "epoch": 0.6275270008307948, "grad_norm": 1.5438745021820068, "learning_rate": 6.251752884131298e-05, "loss": 2.6971, "step": 2266 }, { "epoch": 0.6278039324286901, "grad_norm": 1.9460550546646118, "learning_rate": 6.243575301341561e-05, "loss": 2.7949, "step": 2267 }, { "epoch": 0.6280808640265855, "grad_norm": 1.9470924139022827, "learning_rate": 6.235400642236002e-05, "loss": 2.6991, "step": 2268 }, { "epoch": 0.6283577956244808, "grad_norm": 1.1154978275299072, "learning_rate": 6.227228913177081e-05, "loss": 2.8878, "step": 2269 }, { "epoch": 0.6286347272223761, "grad_norm": 1.426668405532837, "learning_rate": 6.21906012052499e-05, "loss": 2.8709, "step": 2270 }, { "epoch": 0.6289116588202714, "grad_norm": 1.4238054752349854, "learning_rate": 6.210894270637625e-05, "loss": 2.6498, "step": 2271 }, { "epoch": 0.6291885904181667, "grad_norm": 1.6661927700042725, "learning_rate": 6.202731369870591e-05, "loss": 2.978, "step": 2272 }, { "epoch": 0.629465522016062, "grad_norm": 1.2813878059387207, "learning_rate": 6.19457142457721e-05, "loss": 2.8663, "step": 2273 }, { "epoch": 0.6297424536139573, "grad_norm": 1.130333662033081, "learning_rate": 6.186414441108487e-05, "loss": 2.8716, "step": 2274 }, { "epoch": 0.6300193852118526, "grad_norm": 3.8945274353027344, "learning_rate": 6.178260425813131e-05, "loss": 3.9808, "step": 2275 }, { "epoch": 0.630296316809748, "grad_norm": 1.5076262950897217, "learning_rate": 6.170109385037545e-05, "loss": 2.7333, "step": 2276 }, { "epoch": 0.6305732484076433, "grad_norm": 1.3850371837615967, "learning_rate": 6.161961325125809e-05, "loss": 2.9242, "step": 2277 }, { "epoch": 0.6308501800055386, "grad_norm": 1.3438189029693604, "learning_rate": 6.153816252419682e-05, "loss": 3.0425, "step": 2278 }, { "epoch": 0.6311271116034339, "grad_norm": 1.1912837028503418, "learning_rate": 6.14567417325861e-05, "loss": 2.773, "step": 2279 }, { "epoch": 0.6314040432013293, "grad_norm": 1.4491552114486694, "learning_rate": 6.137535093979694e-05, "loss": 2.6852, "step": 2280 }, { "epoch": 0.6316809747992246, "grad_norm": 1.8994818925857544, "learning_rate": 6.129399020917705e-05, "loss": 2.9605, "step": 2281 }, { "epoch": 0.6319579063971199, "grad_norm": 0.9408001899719238, "learning_rate": 6.121265960405085e-05, "loss": 2.9011, "step": 2282 }, { "epoch": 0.6322348379950152, "grad_norm": 1.6518540382385254, "learning_rate": 6.113135918771915e-05, "loss": 2.9159, "step": 2283 }, { "epoch": 0.6325117695929106, "grad_norm": 0.8682531118392944, "learning_rate": 6.105008902345935e-05, "loss": 2.584, "step": 2284 }, { "epoch": 0.6327887011908059, "grad_norm": 2.14064884185791, "learning_rate": 6.096884917452531e-05, "loss": 2.7146, "step": 2285 }, { "epoch": 0.6330656327887012, "grad_norm": 1.1299852132797241, "learning_rate": 6.088763970414726e-05, "loss": 3.0074, "step": 2286 }, { "epoch": 0.6333425643865965, "grad_norm": 1.2496439218521118, "learning_rate": 6.080646067553182e-05, "loss": 2.9153, "step": 2287 }, { "epoch": 0.6336194959844919, "grad_norm": 1.5232523679733276, "learning_rate": 6.0725312151861866e-05, "loss": 2.6882, "step": 2288 }, { "epoch": 0.6338964275823872, "grad_norm": 1.2904765605926514, "learning_rate": 6.064419419629662e-05, "loss": 3.0187, "step": 2289 }, { "epoch": 0.6341733591802825, "grad_norm": 1.1545737981796265, "learning_rate": 6.0563106871971385e-05, "loss": 2.7736, "step": 2290 }, { "epoch": 0.6344502907781778, "grad_norm": 1.1595368385314941, "learning_rate": 6.048205024199778e-05, "loss": 2.8019, "step": 2291 }, { "epoch": 0.6347272223760732, "grad_norm": 1.647827386856079, "learning_rate": 6.04010243694634e-05, "loss": 2.8922, "step": 2292 }, { "epoch": 0.6350041539739685, "grad_norm": 2.090649366378784, "learning_rate": 6.0320029317431924e-05, "loss": 2.5161, "step": 2293 }, { "epoch": 0.6352810855718638, "grad_norm": 1.726109266281128, "learning_rate": 6.023906514894313e-05, "loss": 2.709, "step": 2294 }, { "epoch": 0.635558017169759, "grad_norm": 1.3686494827270508, "learning_rate": 6.015813192701267e-05, "loss": 2.967, "step": 2295 }, { "epoch": 0.6358349487676543, "grad_norm": 21.516817092895508, "learning_rate": 6.007722971463211e-05, "loss": 2.955, "step": 2296 }, { "epoch": 0.6361118803655497, "grad_norm": 1.523563027381897, "learning_rate": 5.999635857476897e-05, "loss": 2.693, "step": 2297 }, { "epoch": 0.636388811963445, "grad_norm": 1.3220926523208618, "learning_rate": 5.991551857036648e-05, "loss": 3.0974, "step": 2298 }, { "epoch": 0.6366657435613403, "grad_norm": 1.6534843444824219, "learning_rate": 5.983470976434369e-05, "loss": 2.9154, "step": 2299 }, { "epoch": 0.6369426751592356, "grad_norm": 4.144333362579346, "learning_rate": 5.9753932219595356e-05, "loss": 3.9512, "step": 2300 }, { "epoch": 0.637219606757131, "grad_norm": 1.7014082670211792, "learning_rate": 5.967318599899194e-05, "loss": 2.7434, "step": 2301 }, { "epoch": 0.6374965383550263, "grad_norm": 1.1944373846054077, "learning_rate": 5.9592471165379425e-05, "loss": 2.746, "step": 2302 }, { "epoch": 0.6377734699529216, "grad_norm": 1.1372411251068115, "learning_rate": 5.95117877815795e-05, "loss": 2.7947, "step": 2303 }, { "epoch": 0.6380504015508169, "grad_norm": 1.6045831441879272, "learning_rate": 5.943113591038928e-05, "loss": 2.8579, "step": 2304 }, { "epoch": 0.6383273331487123, "grad_norm": 1.6579948663711548, "learning_rate": 5.935051561458134e-05, "loss": 2.7143, "step": 2305 }, { "epoch": 0.6386042647466076, "grad_norm": 0.9162551760673523, "learning_rate": 5.926992695690378e-05, "loss": 2.5414, "step": 2306 }, { "epoch": 0.6388811963445029, "grad_norm": 1.522512435913086, "learning_rate": 5.918937000007999e-05, "loss": 2.6506, "step": 2307 }, { "epoch": 0.6391581279423982, "grad_norm": 2.271165609359741, "learning_rate": 5.910884480680867e-05, "loss": 2.9811, "step": 2308 }, { "epoch": 0.6394350595402936, "grad_norm": 1.3790143728256226, "learning_rate": 5.902835143976393e-05, "loss": 2.6826, "step": 2309 }, { "epoch": 0.6397119911381889, "grad_norm": 1.4355623722076416, "learning_rate": 5.894788996159493e-05, "loss": 2.7139, "step": 2310 }, { "epoch": 0.6399889227360842, "grad_norm": 1.4164738655090332, "learning_rate": 5.8867460434926125e-05, "loss": 2.7957, "step": 2311 }, { "epoch": 0.6402658543339795, "grad_norm": 1.381259799003601, "learning_rate": 5.8787062922357115e-05, "loss": 2.5392, "step": 2312 }, { "epoch": 0.6405427859318749, "grad_norm": 1.610293984413147, "learning_rate": 5.870669748646254e-05, "loss": 2.7615, "step": 2313 }, { "epoch": 0.6408197175297702, "grad_norm": 1.2964593172073364, "learning_rate": 5.862636418979198e-05, "loss": 2.737, "step": 2314 }, { "epoch": 0.6410966491276655, "grad_norm": 1.647544264793396, "learning_rate": 5.854606309487023e-05, "loss": 2.6559, "step": 2315 }, { "epoch": 0.6413735807255608, "grad_norm": 1.4856895208358765, "learning_rate": 5.8465794264196815e-05, "loss": 2.8889, "step": 2316 }, { "epoch": 0.6416505123234562, "grad_norm": 1.3166649341583252, "learning_rate": 5.8385557760246203e-05, "loss": 2.6272, "step": 2317 }, { "epoch": 0.6419274439213515, "grad_norm": 6.107063293457031, "learning_rate": 5.830535364546779e-05, "loss": 3.0925, "step": 2318 }, { "epoch": 0.6422043755192467, "grad_norm": 1.8524757623672485, "learning_rate": 5.822518198228565e-05, "loss": 2.5587, "step": 2319 }, { "epoch": 0.642481307117142, "grad_norm": 1.1845844984054565, "learning_rate": 5.8145042833098654e-05, "loss": 2.5099, "step": 2320 }, { "epoch": 0.6427582387150373, "grad_norm": 1.5074481964111328, "learning_rate": 5.8064936260280335e-05, "loss": 2.7683, "step": 2321 }, { "epoch": 0.6430351703129327, "grad_norm": 0.9466500282287598, "learning_rate": 5.798486232617898e-05, "loss": 2.5384, "step": 2322 }, { "epoch": 0.643312101910828, "grad_norm": 1.6233863830566406, "learning_rate": 5.790482109311723e-05, "loss": 2.8482, "step": 2323 }, { "epoch": 0.6435890335087233, "grad_norm": 1.5645900964736938, "learning_rate": 5.782481262339261e-05, "loss": 3.1355, "step": 2324 }, { "epoch": 0.6438659651066186, "grad_norm": 5.980441093444824, "learning_rate": 5.774483697927684e-05, "loss": 4.6119, "step": 2325 }, { "epoch": 0.644142896704514, "grad_norm": 1.4056565761566162, "learning_rate": 5.766489422301624e-05, "loss": 2.7215, "step": 2326 }, { "epoch": 0.6444198283024093, "grad_norm": 1.7121729850769043, "learning_rate": 5.7584984416831535e-05, "loss": 2.7691, "step": 2327 }, { "epoch": 0.6446967599003046, "grad_norm": 1.0393214225769043, "learning_rate": 5.750510762291775e-05, "loss": 2.8974, "step": 2328 }, { "epoch": 0.6449736914981999, "grad_norm": 1.7014375925064087, "learning_rate": 5.742526390344427e-05, "loss": 2.8694, "step": 2329 }, { "epoch": 0.6452506230960953, "grad_norm": 1.6189837455749512, "learning_rate": 5.734545332055471e-05, "loss": 2.6544, "step": 2330 }, { "epoch": 0.6455275546939906, "grad_norm": 0.8944665193557739, "learning_rate": 5.7265675936366925e-05, "loss": 2.6017, "step": 2331 }, { "epoch": 0.6458044862918859, "grad_norm": 2.0103485584259033, "learning_rate": 5.718593181297278e-05, "loss": 3.1379, "step": 2332 }, { "epoch": 0.6460814178897812, "grad_norm": 1.183464527130127, "learning_rate": 5.710622101243857e-05, "loss": 2.8474, "step": 2333 }, { "epoch": 0.6463583494876766, "grad_norm": 1.8213006258010864, "learning_rate": 5.702654359680428e-05, "loss": 2.7316, "step": 2334 }, { "epoch": 0.6466352810855719, "grad_norm": 1.498777985572815, "learning_rate": 5.694689962808417e-05, "loss": 2.9369, "step": 2335 }, { "epoch": 0.6469122126834672, "grad_norm": 2.145932197570801, "learning_rate": 5.68672891682664e-05, "loss": 2.756, "step": 2336 }, { "epoch": 0.6471891442813625, "grad_norm": 1.2577202320098877, "learning_rate": 5.6787712279313e-05, "loss": 2.5537, "step": 2337 }, { "epoch": 0.6474660758792579, "grad_norm": 1.0808194875717163, "learning_rate": 5.670816902315994e-05, "loss": 2.6741, "step": 2338 }, { "epoch": 0.6477430074771532, "grad_norm": 1.7351064682006836, "learning_rate": 5.662865946171696e-05, "loss": 3.1183, "step": 2339 }, { "epoch": 0.6480199390750485, "grad_norm": 1.717112421989441, "learning_rate": 5.654918365686766e-05, "loss": 2.9241, "step": 2340 }, { "epoch": 0.6482968706729438, "grad_norm": 1.0554273128509521, "learning_rate": 5.6469741670469144e-05, "loss": 2.8183, "step": 2341 }, { "epoch": 0.648573802270839, "grad_norm": 12.077258110046387, "learning_rate": 5.639033356435257e-05, "loss": 2.8132, "step": 2342 }, { "epoch": 0.6488507338687344, "grad_norm": 1.9363653659820557, "learning_rate": 5.631095940032235e-05, "loss": 2.3197, "step": 2343 }, { "epoch": 0.6491276654666297, "grad_norm": 1.1245683431625366, "learning_rate": 5.6231619240156694e-05, "loss": 2.7323, "step": 2344 }, { "epoch": 0.649404597064525, "grad_norm": 1.9193580150604248, "learning_rate": 5.615231314560727e-05, "loss": 2.5539, "step": 2345 }, { "epoch": 0.6496815286624203, "grad_norm": 1.237788200378418, "learning_rate": 5.607304117839929e-05, "loss": 2.4474, "step": 2346 }, { "epoch": 0.6499584602603157, "grad_norm": 0.9905593395233154, "learning_rate": 5.599380340023135e-05, "loss": 2.7252, "step": 2347 }, { "epoch": 0.650235391858211, "grad_norm": 1.0714815855026245, "learning_rate": 5.591459987277545e-05, "loss": 2.7466, "step": 2348 }, { "epoch": 0.6505123234561063, "grad_norm": 1.6997700929641724, "learning_rate": 5.5835430657676976e-05, "loss": 2.9485, "step": 2349 }, { "epoch": 0.6507892550540016, "grad_norm": 3.4924404621124268, "learning_rate": 5.575629581655445e-05, "loss": 3.946, "step": 2350 }, { "epoch": 0.651066186651897, "grad_norm": 1.172761082649231, "learning_rate": 5.567719541099992e-05, "loss": 2.8241, "step": 2351 }, { "epoch": 0.6513431182497923, "grad_norm": 1.1470800638198853, "learning_rate": 5.5598129502578344e-05, "loss": 2.5238, "step": 2352 }, { "epoch": 0.6516200498476876, "grad_norm": 1.4182355403900146, "learning_rate": 5.551909815282801e-05, "loss": 2.7498, "step": 2353 }, { "epoch": 0.651896981445583, "grad_norm": 1.447364330291748, "learning_rate": 5.544010142326026e-05, "loss": 2.9252, "step": 2354 }, { "epoch": 0.6521739130434783, "grad_norm": 1.1676846742630005, "learning_rate": 5.5361139375359494e-05, "loss": 2.7525, "step": 2355 }, { "epoch": 0.6524508446413736, "grad_norm": 1.3512132167816162, "learning_rate": 5.5282212070583015e-05, "loss": 2.6566, "step": 2356 }, { "epoch": 0.6527277762392689, "grad_norm": 1.911013126373291, "learning_rate": 5.520331957036134e-05, "loss": 2.9136, "step": 2357 }, { "epoch": 0.6530047078371642, "grad_norm": 2.6728639602661133, "learning_rate": 5.5124461936097605e-05, "loss": 3.1852, "step": 2358 }, { "epoch": 0.6532816394350596, "grad_norm": 2.6511623859405518, "learning_rate": 5.504563922916799e-05, "loss": 3.2279, "step": 2359 }, { "epoch": 0.6535585710329549, "grad_norm": 1.7919533252716064, "learning_rate": 5.496685151092145e-05, "loss": 2.777, "step": 2360 }, { "epoch": 0.6538355026308502, "grad_norm": 1.2835693359375, "learning_rate": 5.4888098842679694e-05, "loss": 2.9625, "step": 2361 }, { "epoch": 0.6541124342287455, "grad_norm": 1.2808218002319336, "learning_rate": 5.480938128573716e-05, "loss": 2.789, "step": 2362 }, { "epoch": 0.6543893658266409, "grad_norm": 1.460275650024414, "learning_rate": 5.473069890136093e-05, "loss": 2.7713, "step": 2363 }, { "epoch": 0.6546662974245362, "grad_norm": 1.2018828392028809, "learning_rate": 5.4652051750790825e-05, "loss": 2.8828, "step": 2364 }, { "epoch": 0.6549432290224314, "grad_norm": 1.755499243736267, "learning_rate": 5.4573439895238976e-05, "loss": 2.449, "step": 2365 }, { "epoch": 0.6552201606203267, "grad_norm": 1.0556343793869019, "learning_rate": 5.449486339589043e-05, "loss": 2.7824, "step": 2366 }, { "epoch": 0.655497092218222, "grad_norm": 2.3796725273132324, "learning_rate": 5.441632231390237e-05, "loss": 3.0264, "step": 2367 }, { "epoch": 0.6557740238161174, "grad_norm": 1.304976224899292, "learning_rate": 5.4337816710404585e-05, "loss": 2.5941, "step": 2368 }, { "epoch": 0.6560509554140127, "grad_norm": 1.4646837711334229, "learning_rate": 5.425934664649921e-05, "loss": 2.7932, "step": 2369 }, { "epoch": 0.656327887011908, "grad_norm": 1.8304128646850586, "learning_rate": 5.4180912183260756e-05, "loss": 2.7483, "step": 2370 }, { "epoch": 0.6566048186098034, "grad_norm": 1.6258549690246582, "learning_rate": 5.410251338173595e-05, "loss": 2.8172, "step": 2371 }, { "epoch": 0.6568817502076987, "grad_norm": 1.4259107112884521, "learning_rate": 5.402415030294384e-05, "loss": 2.9041, "step": 2372 }, { "epoch": 0.657158681805594, "grad_norm": 1.149336576461792, "learning_rate": 5.3945823007875676e-05, "loss": 2.8761, "step": 2373 }, { "epoch": 0.6574356134034893, "grad_norm": 1.3595845699310303, "learning_rate": 5.3867531557494674e-05, "loss": 2.8705, "step": 2374 }, { "epoch": 0.6577125450013847, "grad_norm": 8.332207679748535, "learning_rate": 5.378927601273648e-05, "loss": 4.7642, "step": 2375 }, { "epoch": 0.65798947659928, "grad_norm": 1.699044942855835, "learning_rate": 5.371105643450849e-05, "loss": 2.8592, "step": 2376 }, { "epoch": 0.6582664081971753, "grad_norm": 1.3306677341461182, "learning_rate": 5.363287288369023e-05, "loss": 2.7023, "step": 2377 }, { "epoch": 0.6585433397950706, "grad_norm": 1.9283133745193481, "learning_rate": 5.355472542113325e-05, "loss": 2.8911, "step": 2378 }, { "epoch": 0.658820271392966, "grad_norm": 1.7714849710464478, "learning_rate": 5.347661410766087e-05, "loss": 2.8308, "step": 2379 }, { "epoch": 0.6590972029908613, "grad_norm": 1.4751148223876953, "learning_rate": 5.3398539004068415e-05, "loss": 2.7754, "step": 2380 }, { "epoch": 0.6593741345887566, "grad_norm": 1.3037173748016357, "learning_rate": 5.3320500171122914e-05, "loss": 2.9529, "step": 2381 }, { "epoch": 0.6596510661866519, "grad_norm": 1.1674219369888306, "learning_rate": 5.3242497669563264e-05, "loss": 3.0056, "step": 2382 }, { "epoch": 0.6599279977845472, "grad_norm": 0.8783468008041382, "learning_rate": 5.316453156009992e-05, "loss": 2.7169, "step": 2383 }, { "epoch": 0.6602049293824426, "grad_norm": 1.2833096981048584, "learning_rate": 5.308660190341528e-05, "loss": 2.6623, "step": 2384 }, { "epoch": 0.6604818609803379, "grad_norm": 1.4621068239212036, "learning_rate": 5.3008708760163106e-05, "loss": 2.7997, "step": 2385 }, { "epoch": 0.6607587925782332, "grad_norm": 2.31742525100708, "learning_rate": 5.293085219096889e-05, "loss": 2.8536, "step": 2386 }, { "epoch": 0.6610357241761285, "grad_norm": 0.8828467726707458, "learning_rate": 5.2853032256429614e-05, "loss": 2.4077, "step": 2387 }, { "epoch": 0.6613126557740238, "grad_norm": 1.3070199489593506, "learning_rate": 5.2775249017113796e-05, "loss": 2.9468, "step": 2388 }, { "epoch": 0.6615895873719191, "grad_norm": 0.896975040435791, "learning_rate": 5.2697502533561226e-05, "loss": 2.6763, "step": 2389 }, { "epoch": 0.6618665189698144, "grad_norm": 1.2044589519500732, "learning_rate": 5.2619792866283355e-05, "loss": 2.827, "step": 2390 }, { "epoch": 0.6621434505677097, "grad_norm": 1.2612204551696777, "learning_rate": 5.2542120075762755e-05, "loss": 2.7524, "step": 2391 }, { "epoch": 0.6624203821656051, "grad_norm": 0.9731190204620361, "learning_rate": 5.246448422245337e-05, "loss": 2.671, "step": 2392 }, { "epoch": 0.6626973137635004, "grad_norm": 1.6314771175384521, "learning_rate": 5.238688536678042e-05, "loss": 2.684, "step": 2393 }, { "epoch": 0.6629742453613957, "grad_norm": 1.4474800825119019, "learning_rate": 5.230932356914032e-05, "loss": 2.8404, "step": 2394 }, { "epoch": 0.663251176959291, "grad_norm": 1.5060440301895142, "learning_rate": 5.223179888990062e-05, "loss": 2.7726, "step": 2395 }, { "epoch": 0.6635281085571864, "grad_norm": 1.4345086812973022, "learning_rate": 5.215431138939999e-05, "loss": 3.0741, "step": 2396 }, { "epoch": 0.6638050401550817, "grad_norm": 1.7585291862487793, "learning_rate": 5.207686112794822e-05, "loss": 2.6842, "step": 2397 }, { "epoch": 0.664081971752977, "grad_norm": 1.4770677089691162, "learning_rate": 5.199944816582592e-05, "loss": 2.7869, "step": 2398 }, { "epoch": 0.6643589033508723, "grad_norm": 2.3265128135681152, "learning_rate": 5.1922072563284986e-05, "loss": 3.0738, "step": 2399 }, { "epoch": 0.6646358349487677, "grad_norm": 3.2085912227630615, "learning_rate": 5.184473438054793e-05, "loss": 3.7115, "step": 2400 }, { "epoch": 0.664912766546663, "grad_norm": 1.9501490592956543, "learning_rate": 5.176743367780833e-05, "loss": 2.9766, "step": 2401 }, { "epoch": 0.6651896981445583, "grad_norm": 1.0873230695724487, "learning_rate": 5.1690170515230504e-05, "loss": 2.7145, "step": 2402 }, { "epoch": 0.6654666297424536, "grad_norm": 1.1702440977096558, "learning_rate": 5.1612944952949615e-05, "loss": 2.7145, "step": 2403 }, { "epoch": 0.665743561340349, "grad_norm": 1.6476362943649292, "learning_rate": 5.153575705107152e-05, "loss": 2.803, "step": 2404 }, { "epoch": 0.6660204929382443, "grad_norm": 1.682246446609497, "learning_rate": 5.145860686967274e-05, "loss": 2.8331, "step": 2405 }, { "epoch": 0.6662974245361396, "grad_norm": 1.1325786113739014, "learning_rate": 5.138149446880054e-05, "loss": 2.6439, "step": 2406 }, { "epoch": 0.6665743561340349, "grad_norm": 1.6570894718170166, "learning_rate": 5.130441990847259e-05, "loss": 2.9284, "step": 2407 }, { "epoch": 0.6668512877319303, "grad_norm": 1.747573971748352, "learning_rate": 5.122738324867737e-05, "loss": 2.7495, "step": 2408 }, { "epoch": 0.6671282193298256, "grad_norm": 1.1928094625473022, "learning_rate": 5.115038454937362e-05, "loss": 2.8352, "step": 2409 }, { "epoch": 0.6674051509277209, "grad_norm": 1.263839840888977, "learning_rate": 5.1073423870490665e-05, "loss": 2.7625, "step": 2410 }, { "epoch": 0.6676820825256162, "grad_norm": 1.6746209859848022, "learning_rate": 5.09965012719282e-05, "loss": 2.7107, "step": 2411 }, { "epoch": 0.6679590141235114, "grad_norm": 1.419972538948059, "learning_rate": 5.0919616813556284e-05, "loss": 2.7692, "step": 2412 }, { "epoch": 0.6682359457214068, "grad_norm": 1.947323203086853, "learning_rate": 5.0842770555215316e-05, "loss": 2.9443, "step": 2413 }, { "epoch": 0.6685128773193021, "grad_norm": 2.007458209991455, "learning_rate": 5.076596255671592e-05, "loss": 2.7715, "step": 2414 }, { "epoch": 0.6687898089171974, "grad_norm": 1.3769513368606567, "learning_rate": 5.068919287783902e-05, "loss": 3.0913, "step": 2415 }, { "epoch": 0.6690667405150927, "grad_norm": 1.4708313941955566, "learning_rate": 5.0612461578335526e-05, "loss": 2.757, "step": 2416 }, { "epoch": 0.6693436721129881, "grad_norm": 1.393153190612793, "learning_rate": 5.053576871792678e-05, "loss": 2.5286, "step": 2417 }, { "epoch": 0.6696206037108834, "grad_norm": 1.5462419986724854, "learning_rate": 5.0459114356303926e-05, "loss": 2.5453, "step": 2418 }, { "epoch": 0.6698975353087787, "grad_norm": 1.400042176246643, "learning_rate": 5.0382498553128265e-05, "loss": 2.4766, "step": 2419 }, { "epoch": 0.670174466906674, "grad_norm": 2.019869089126587, "learning_rate": 5.030592136803112e-05, "loss": 3.0333, "step": 2420 }, { "epoch": 0.6704513985045694, "grad_norm": 1.8234835863113403, "learning_rate": 5.022938286061369e-05, "loss": 3.2183, "step": 2421 }, { "epoch": 0.6707283301024647, "grad_norm": 1.0895123481750488, "learning_rate": 5.0152883090447044e-05, "loss": 2.5655, "step": 2422 }, { "epoch": 0.67100526170036, "grad_norm": 1.3848090171813965, "learning_rate": 5.0076422117072266e-05, "loss": 2.7499, "step": 2423 }, { "epoch": 0.6712821932982553, "grad_norm": 1.6396257877349854, "learning_rate": 5.000000000000002e-05, "loss": 3.0039, "step": 2424 }, { "epoch": 0.6715591248961507, "grad_norm": 2.707336664199829, "learning_rate": 4.9923616798710884e-05, "loss": 3.4594, "step": 2425 }, { "epoch": 0.671836056494046, "grad_norm": 1.0716488361358643, "learning_rate": 4.984727257265509e-05, "loss": 2.7283, "step": 2426 }, { "epoch": 0.6721129880919413, "grad_norm": 1.407761812210083, "learning_rate": 4.9770967381252557e-05, "loss": 2.7461, "step": 2427 }, { "epoch": 0.6723899196898366, "grad_norm": 1.3955081701278687, "learning_rate": 4.9694701283892796e-05, "loss": 2.8607, "step": 2428 }, { "epoch": 0.672666851287732, "grad_norm": 1.2413393259048462, "learning_rate": 4.9618474339934916e-05, "loss": 2.7155, "step": 2429 }, { "epoch": 0.6729437828856273, "grad_norm": 2.0657734870910645, "learning_rate": 4.954228660870758e-05, "loss": 3.2198, "step": 2430 }, { "epoch": 0.6732207144835226, "grad_norm": 1.1644593477249146, "learning_rate": 4.946613814950877e-05, "loss": 2.873, "step": 2431 }, { "epoch": 0.6734976460814179, "grad_norm": 1.8703086376190186, "learning_rate": 4.939002902160616e-05, "loss": 2.7932, "step": 2432 }, { "epoch": 0.6737745776793133, "grad_norm": 0.9289453029632568, "learning_rate": 4.9313959284236614e-05, "loss": 2.774, "step": 2433 }, { "epoch": 0.6740515092772086, "grad_norm": 1.319032073020935, "learning_rate": 4.9237928996606384e-05, "loss": 2.6889, "step": 2434 }, { "epoch": 0.6743284408751038, "grad_norm": 1.0665626525878906, "learning_rate": 4.9161938217891065e-05, "loss": 2.7208, "step": 2435 }, { "epoch": 0.6746053724729991, "grad_norm": 1.254766821861267, "learning_rate": 4.9085987007235466e-05, "loss": 2.6566, "step": 2436 }, { "epoch": 0.6748823040708944, "grad_norm": 1.3757987022399902, "learning_rate": 4.90100754237536e-05, "loss": 2.8939, "step": 2437 }, { "epoch": 0.6751592356687898, "grad_norm": 1.6149036884307861, "learning_rate": 4.8934203526528634e-05, "loss": 2.7772, "step": 2438 }, { "epoch": 0.6754361672666851, "grad_norm": 1.2694714069366455, "learning_rate": 4.88583713746129e-05, "loss": 2.7618, "step": 2439 }, { "epoch": 0.6757130988645804, "grad_norm": 1.7957725524902344, "learning_rate": 4.878257902702764e-05, "loss": 2.6743, "step": 2440 }, { "epoch": 0.6759900304624757, "grad_norm": 1.4955267906188965, "learning_rate": 4.870682654276336e-05, "loss": 2.9218, "step": 2441 }, { "epoch": 0.6762669620603711, "grad_norm": 1.2260961532592773, "learning_rate": 4.863111398077932e-05, "loss": 2.5895, "step": 2442 }, { "epoch": 0.6765438936582664, "grad_norm": 1.1047837734222412, "learning_rate": 4.85554414000038e-05, "loss": 2.6782, "step": 2443 }, { "epoch": 0.6768208252561617, "grad_norm": 1.667965292930603, "learning_rate": 4.8479808859333964e-05, "loss": 2.9919, "step": 2444 }, { "epoch": 0.677097756854057, "grad_norm": 1.1275628805160522, "learning_rate": 4.840421641763582e-05, "loss": 2.4726, "step": 2445 }, { "epoch": 0.6773746884519524, "grad_norm": 1.742850661277771, "learning_rate": 4.832866413374413e-05, "loss": 2.7017, "step": 2446 }, { "epoch": 0.6776516200498477, "grad_norm": 1.8891174793243408, "learning_rate": 4.8253152066462416e-05, "loss": 2.9787, "step": 2447 }, { "epoch": 0.677928551647743, "grad_norm": 1.3550724983215332, "learning_rate": 4.817768027456296e-05, "loss": 2.8453, "step": 2448 }, { "epoch": 0.6782054832456383, "grad_norm": 1.646095633506775, "learning_rate": 4.810224881678652e-05, "loss": 2.8204, "step": 2449 }, { "epoch": 0.6784824148435337, "grad_norm": 3.411214590072632, "learning_rate": 4.802685775184272e-05, "loss": 3.8809, "step": 2450 }, { "epoch": 0.678759346441429, "grad_norm": 1.5782161951065063, "learning_rate": 4.7951507138409516e-05, "loss": 2.5259, "step": 2451 }, { "epoch": 0.6790362780393243, "grad_norm": 1.3184013366699219, "learning_rate": 4.787619703513348e-05, "loss": 2.7041, "step": 2452 }, { "epoch": 0.6793132096372196, "grad_norm": 1.0506107807159424, "learning_rate": 4.780092750062967e-05, "loss": 2.5995, "step": 2453 }, { "epoch": 0.679590141235115, "grad_norm": 1.323937177658081, "learning_rate": 4.772569859348156e-05, "loss": 2.7424, "step": 2454 }, { "epoch": 0.6798670728330103, "grad_norm": 1.750764012336731, "learning_rate": 4.7650510372240895e-05, "loss": 2.8218, "step": 2455 }, { "epoch": 0.6801440044309056, "grad_norm": 1.2859998941421509, "learning_rate": 4.757536289542798e-05, "loss": 2.8217, "step": 2456 }, { "epoch": 0.6804209360288009, "grad_norm": 1.068601131439209, "learning_rate": 4.750025622153117e-05, "loss": 2.8105, "step": 2457 }, { "epoch": 0.6806978676266962, "grad_norm": 1.5132895708084106, "learning_rate": 4.742519040900719e-05, "loss": 2.8082, "step": 2458 }, { "epoch": 0.6809747992245915, "grad_norm": 1.374457836151123, "learning_rate": 4.735016551628095e-05, "loss": 2.7911, "step": 2459 }, { "epoch": 0.6812517308224868, "grad_norm": 1.1537971496582031, "learning_rate": 4.72751816017455e-05, "loss": 2.7979, "step": 2460 }, { "epoch": 0.6815286624203821, "grad_norm": 1.4131252765655518, "learning_rate": 4.720023872376199e-05, "loss": 2.808, "step": 2461 }, { "epoch": 0.6818055940182774, "grad_norm": 1.4175875186920166, "learning_rate": 4.712533694065964e-05, "loss": 2.5207, "step": 2462 }, { "epoch": 0.6820825256161728, "grad_norm": 1.0395691394805908, "learning_rate": 4.7050476310735704e-05, "loss": 2.8166, "step": 2463 }, { "epoch": 0.6823594572140681, "grad_norm": 1.8296945095062256, "learning_rate": 4.697565689225528e-05, "loss": 3.0081, "step": 2464 }, { "epoch": 0.6826363888119634, "grad_norm": 1.247381329536438, "learning_rate": 4.690087874345165e-05, "loss": 2.9156, "step": 2465 }, { "epoch": 0.6829133204098587, "grad_norm": 1.3310344219207764, "learning_rate": 4.68261419225257e-05, "loss": 2.7953, "step": 2466 }, { "epoch": 0.6831902520077541, "grad_norm": 1.7048262357711792, "learning_rate": 4.67514464876463e-05, "loss": 2.9409, "step": 2467 }, { "epoch": 0.6834671836056494, "grad_norm": 1.1385921239852905, "learning_rate": 4.6676792496950095e-05, "loss": 2.6641, "step": 2468 }, { "epoch": 0.6837441152035447, "grad_norm": 1.2046301364898682, "learning_rate": 4.660218000854143e-05, "loss": 2.7068, "step": 2469 }, { "epoch": 0.68402104680144, "grad_norm": 1.1131768226623535, "learning_rate": 4.6527609080492385e-05, "loss": 2.6678, "step": 2470 }, { "epoch": 0.6842979783993354, "grad_norm": 1.2756351232528687, "learning_rate": 4.6453079770842686e-05, "loss": 2.6196, "step": 2471 }, { "epoch": 0.6845749099972307, "grad_norm": 2.282907009124756, "learning_rate": 4.637859213759971e-05, "loss": 2.6622, "step": 2472 }, { "epoch": 0.684851841595126, "grad_norm": 1.1799778938293457, "learning_rate": 4.6304146238738196e-05, "loss": 2.5564, "step": 2473 }, { "epoch": 0.6851287731930213, "grad_norm": 1.7189338207244873, "learning_rate": 4.6229742132200746e-05, "loss": 2.8177, "step": 2474 }, { "epoch": 0.6854057047909167, "grad_norm": 2.7451953887939453, "learning_rate": 4.615537987589711e-05, "loss": 3.7004, "step": 2475 }, { "epoch": 0.685682636388812, "grad_norm": 1.2103180885314941, "learning_rate": 4.608105952770464e-05, "loss": 2.5203, "step": 2476 }, { "epoch": 0.6859595679867073, "grad_norm": 1.9908820390701294, "learning_rate": 4.600678114546801e-05, "loss": 2.9552, "step": 2477 }, { "epoch": 0.6862364995846026, "grad_norm": 1.184588074684143, "learning_rate": 4.593254478699928e-05, "loss": 2.84, "step": 2478 }, { "epoch": 0.686513431182498, "grad_norm": 1.2031408548355103, "learning_rate": 4.585835051007774e-05, "loss": 2.8386, "step": 2479 }, { "epoch": 0.6867903627803933, "grad_norm": 0.9946233034133911, "learning_rate": 4.578419837244997e-05, "loss": 2.7443, "step": 2480 }, { "epoch": 0.6870672943782886, "grad_norm": 1.1824885606765747, "learning_rate": 4.571008843182978e-05, "loss": 2.6317, "step": 2481 }, { "epoch": 0.6873442259761838, "grad_norm": 0.902035653591156, "learning_rate": 4.5636020745897975e-05, "loss": 2.6174, "step": 2482 }, { "epoch": 0.6876211575740792, "grad_norm": 1.003700852394104, "learning_rate": 4.5561995372302756e-05, "loss": 2.916, "step": 2483 }, { "epoch": 0.6878980891719745, "grad_norm": 1.6476397514343262, "learning_rate": 4.548801236865912e-05, "loss": 2.5126, "step": 2484 }, { "epoch": 0.6881750207698698, "grad_norm": 1.7157683372497559, "learning_rate": 4.5414071792549217e-05, "loss": 2.9392, "step": 2485 }, { "epoch": 0.6884519523677651, "grad_norm": 1.2440541982650757, "learning_rate": 4.534017370152218e-05, "loss": 2.9517, "step": 2486 }, { "epoch": 0.6887288839656605, "grad_norm": 1.5872433185577393, "learning_rate": 4.5266318153094076e-05, "loss": 2.7537, "step": 2487 }, { "epoch": 0.6890058155635558, "grad_norm": 1.8673579692840576, "learning_rate": 4.5192505204747716e-05, "loss": 2.785, "step": 2488 }, { "epoch": 0.6892827471614511, "grad_norm": 2.1917476654052734, "learning_rate": 4.511873491393304e-05, "loss": 2.5704, "step": 2489 }, { "epoch": 0.6895596787593464, "grad_norm": 1.8062355518341064, "learning_rate": 4.504500733806648e-05, "loss": 2.8255, "step": 2490 }, { "epoch": 0.6898366103572418, "grad_norm": 1.379191279411316, "learning_rate": 4.497132253453143e-05, "loss": 3.0313, "step": 2491 }, { "epoch": 0.6901135419551371, "grad_norm": 1.0113121271133423, "learning_rate": 4.489768056067792e-05, "loss": 2.6995, "step": 2492 }, { "epoch": 0.6903904735530324, "grad_norm": 1.1049953699111938, "learning_rate": 4.4824081473822646e-05, "loss": 2.8133, "step": 2493 }, { "epoch": 0.6906674051509277, "grad_norm": 1.151623010635376, "learning_rate": 4.475052533124893e-05, "loss": 2.6096, "step": 2494 }, { "epoch": 0.690944336748823, "grad_norm": 1.7494882345199585, "learning_rate": 4.467701219020667e-05, "loss": 2.5964, "step": 2495 }, { "epoch": 0.6912212683467184, "grad_norm": 1.7969138622283936, "learning_rate": 4.460354210791233e-05, "loss": 2.4509, "step": 2496 }, { "epoch": 0.6914981999446137, "grad_norm": 0.80321204662323, "learning_rate": 4.453011514154872e-05, "loss": 2.7233, "step": 2497 }, { "epoch": 0.691775131542509, "grad_norm": 1.1336066722869873, "learning_rate": 4.4456731348265344e-05, "loss": 2.6503, "step": 2498 }, { "epoch": 0.6920520631404043, "grad_norm": 2.478153944015503, "learning_rate": 4.438339078517785e-05, "loss": 3.31, "step": 2499 }, { "epoch": 0.6923289947382997, "grad_norm": 3.321981191635132, "learning_rate": 4.431009350936838e-05, "loss": 3.7714, "step": 2500 }, { "epoch": 0.692605926336195, "grad_norm": 1.3748323917388916, "learning_rate": 4.4236839577885345e-05, "loss": 2.969, "step": 2501 }, { "epoch": 0.6928828579340903, "grad_norm": 1.1666796207427979, "learning_rate": 4.416362904774345e-05, "loss": 2.4245, "step": 2502 }, { "epoch": 0.6931597895319856, "grad_norm": 1.0594323873519897, "learning_rate": 4.4090461975923566e-05, "loss": 2.6162, "step": 2503 }, { "epoch": 0.693436721129881, "grad_norm": 1.1525086164474487, "learning_rate": 4.401733841937279e-05, "loss": 2.4683, "step": 2504 }, { "epoch": 0.6937136527277762, "grad_norm": 2.1008377075195312, "learning_rate": 4.394425843500437e-05, "loss": 3.0293, "step": 2505 }, { "epoch": 0.6939905843256715, "grad_norm": 1.5007545948028564, "learning_rate": 4.387122207969748e-05, "loss": 2.9255, "step": 2506 }, { "epoch": 0.6942675159235668, "grad_norm": 1.5370593070983887, "learning_rate": 4.379822941029762e-05, "loss": 2.9777, "step": 2507 }, { "epoch": 0.6945444475214622, "grad_norm": 2.107943058013916, "learning_rate": 4.372528048361601e-05, "loss": 2.8037, "step": 2508 }, { "epoch": 0.6948213791193575, "grad_norm": 1.1811412572860718, "learning_rate": 4.3652375356429974e-05, "loss": 2.716, "step": 2509 }, { "epoch": 0.6950983107172528, "grad_norm": 1.3295549154281616, "learning_rate": 4.357951408548272e-05, "loss": 2.7155, "step": 2510 }, { "epoch": 0.6953752423151481, "grad_norm": 1.7516978979110718, "learning_rate": 4.350669672748332e-05, "loss": 2.7683, "step": 2511 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7216670513153076, "learning_rate": 4.3433923339106666e-05, "loss": 2.6673, "step": 2512 }, { "epoch": 0.6959291055109388, "grad_norm": 1.3093624114990234, "learning_rate": 4.336119397699341e-05, "loss": 2.8426, "step": 2513 }, { "epoch": 0.6962060371088341, "grad_norm": 1.3083853721618652, "learning_rate": 4.328850869775001e-05, "loss": 2.4453, "step": 2514 }, { "epoch": 0.6964829687067294, "grad_norm": 1.5862066745758057, "learning_rate": 4.321586755794843e-05, "loss": 2.7283, "step": 2515 }, { "epoch": 0.6967599003046248, "grad_norm": 0.8611969351768494, "learning_rate": 4.314327061412656e-05, "loss": 2.6224, "step": 2516 }, { "epoch": 0.6970368319025201, "grad_norm": 1.286521553993225, "learning_rate": 4.307071792278764e-05, "loss": 2.7307, "step": 2517 }, { "epoch": 0.6973137635004154, "grad_norm": 1.4638444185256958, "learning_rate": 4.299820954040058e-05, "loss": 2.827, "step": 2518 }, { "epoch": 0.6975906950983107, "grad_norm": 1.2252277135849, "learning_rate": 4.292574552339981e-05, "loss": 2.5911, "step": 2519 }, { "epoch": 0.6978676266962061, "grad_norm": 1.9520236253738403, "learning_rate": 4.2853325928185236e-05, "loss": 2.8218, "step": 2520 }, { "epoch": 0.6981445582941014, "grad_norm": 1.7045632600784302, "learning_rate": 4.2780950811122056e-05, "loss": 2.809, "step": 2521 }, { "epoch": 0.6984214898919967, "grad_norm": 1.407806158065796, "learning_rate": 4.2708620228541105e-05, "loss": 3.1178, "step": 2522 }, { "epoch": 0.698698421489892, "grad_norm": 1.407745361328125, "learning_rate": 4.26363342367383e-05, "loss": 3.02, "step": 2523 }, { "epoch": 0.6989753530877874, "grad_norm": 1.6395858526229858, "learning_rate": 4.256409289197495e-05, "loss": 3.1189, "step": 2524 }, { "epoch": 0.6992522846856827, "grad_norm": 2.629124879837036, "learning_rate": 4.2491896250477734e-05, "loss": 3.7059, "step": 2525 }, { "epoch": 0.699529216283578, "grad_norm": 1.2712671756744385, "learning_rate": 4.241974436843834e-05, "loss": 2.6529, "step": 2526 }, { "epoch": 0.6998061478814733, "grad_norm": 1.899190902709961, "learning_rate": 4.234763730201373e-05, "loss": 2.6366, "step": 2527 }, { "epoch": 0.7000830794793685, "grad_norm": 1.3905671834945679, "learning_rate": 4.227557510732596e-05, "loss": 2.9294, "step": 2528 }, { "epoch": 0.7003600110772639, "grad_norm": 1.6453317403793335, "learning_rate": 4.2203557840462214e-05, "loss": 2.8451, "step": 2529 }, { "epoch": 0.7006369426751592, "grad_norm": 2.208395004272461, "learning_rate": 4.213158555747454e-05, "loss": 3.1687, "step": 2530 }, { "epoch": 0.7009138742730545, "grad_norm": 1.2020130157470703, "learning_rate": 4.2059658314380234e-05, "loss": 2.8466, "step": 2531 }, { "epoch": 0.7011908058709498, "grad_norm": 1.0927914381027222, "learning_rate": 4.1987776167161294e-05, "loss": 2.6193, "step": 2532 }, { "epoch": 0.7014677374688452, "grad_norm": 2.179379940032959, "learning_rate": 4.191593917176475e-05, "loss": 2.5691, "step": 2533 }, { "epoch": 0.7017446690667405, "grad_norm": 1.823103666305542, "learning_rate": 4.184414738410248e-05, "loss": 2.8192, "step": 2534 }, { "epoch": 0.7020216006646358, "grad_norm": 1.3351765871047974, "learning_rate": 4.177240086005113e-05, "loss": 2.732, "step": 2535 }, { "epoch": 0.7022985322625311, "grad_norm": 1.3181465864181519, "learning_rate": 4.170069965545217e-05, "loss": 2.7464, "step": 2536 }, { "epoch": 0.7025754638604265, "grad_norm": 1.428551197052002, "learning_rate": 4.162904382611174e-05, "loss": 2.5484, "step": 2537 }, { "epoch": 0.7028523954583218, "grad_norm": 1.301132321357727, "learning_rate": 4.155743342780077e-05, "loss": 2.7892, "step": 2538 }, { "epoch": 0.7031293270562171, "grad_norm": 2.054124593734741, "learning_rate": 4.148586851625461e-05, "loss": 2.8681, "step": 2539 }, { "epoch": 0.7034062586541124, "grad_norm": 1.7845115661621094, "learning_rate": 4.1414349147173514e-05, "loss": 2.9608, "step": 2540 }, { "epoch": 0.7036831902520078, "grad_norm": 2.0754706859588623, "learning_rate": 4.1342875376222015e-05, "loss": 3.1372, "step": 2541 }, { "epoch": 0.7039601218499031, "grad_norm": 1.381613850593567, "learning_rate": 4.12714472590293e-05, "loss": 2.8474, "step": 2542 }, { "epoch": 0.7042370534477984, "grad_norm": 1.4121874570846558, "learning_rate": 4.1200064851189e-05, "loss": 2.6797, "step": 2543 }, { "epoch": 0.7045139850456937, "grad_norm": 2.215932607650757, "learning_rate": 4.112872820825915e-05, "loss": 2.6889, "step": 2544 }, { "epoch": 0.7047909166435891, "grad_norm": 1.5485789775848389, "learning_rate": 4.105743738576219e-05, "loss": 2.772, "step": 2545 }, { "epoch": 0.7050678482414844, "grad_norm": 0.7619416117668152, "learning_rate": 4.0986192439184864e-05, "loss": 2.6577, "step": 2546 }, { "epoch": 0.7053447798393797, "grad_norm": 1.0292936563491821, "learning_rate": 4.09149934239783e-05, "loss": 2.9299, "step": 2547 }, { "epoch": 0.705621711437275, "grad_norm": 1.4957592487335205, "learning_rate": 4.0843840395557664e-05, "loss": 2.7241, "step": 2548 }, { "epoch": 0.7058986430351704, "grad_norm": 0.8321310877799988, "learning_rate": 4.077273340930263e-05, "loss": 2.9524, "step": 2549 }, { "epoch": 0.7061755746330657, "grad_norm": 2.9555342197418213, "learning_rate": 4.070167252055676e-05, "loss": 3.4832, "step": 2550 }, { "epoch": 0.7064525062309609, "grad_norm": 3.6347153186798096, "learning_rate": 4.063065778462792e-05, "loss": 3.2612, "step": 2551 }, { "epoch": 0.7067294378288562, "grad_norm": 1.2320882081985474, "learning_rate": 4.055968925678797e-05, "loss": 2.6501, "step": 2552 }, { "epoch": 0.7070063694267515, "grad_norm": 1.7431206703186035, "learning_rate": 4.0488766992272865e-05, "loss": 3.2253, "step": 2553 }, { "epoch": 0.7072833010246469, "grad_norm": 1.1208981275558472, "learning_rate": 4.041789104628241e-05, "loss": 2.9444, "step": 2554 }, { "epoch": 0.7075602326225422, "grad_norm": 0.957042396068573, "learning_rate": 4.034706147398061e-05, "loss": 2.7775, "step": 2555 }, { "epoch": 0.7078371642204375, "grad_norm": 1.6962882280349731, "learning_rate": 4.027627833049511e-05, "loss": 2.7378, "step": 2556 }, { "epoch": 0.7081140958183328, "grad_norm": 1.5339020490646362, "learning_rate": 4.020554167091756e-05, "loss": 2.6384, "step": 2557 }, { "epoch": 0.7083910274162282, "grad_norm": 1.851544737815857, "learning_rate": 4.0134851550303496e-05, "loss": 2.8828, "step": 2558 }, { "epoch": 0.7086679590141235, "grad_norm": 0.9925166964530945, "learning_rate": 4.006420802367205e-05, "loss": 2.5696, "step": 2559 }, { "epoch": 0.7089448906120188, "grad_norm": 1.440911889076233, "learning_rate": 3.999361114600621e-05, "loss": 2.6636, "step": 2560 }, { "epoch": 0.7092218222099141, "grad_norm": 1.3923132419586182, "learning_rate": 3.9923060972252655e-05, "loss": 2.7548, "step": 2561 }, { "epoch": 0.7094987538078095, "grad_norm": 1.1634244918823242, "learning_rate": 3.9852557557321714e-05, "loss": 3.009, "step": 2562 }, { "epoch": 0.7097756854057048, "grad_norm": 1.1615941524505615, "learning_rate": 3.978210095608718e-05, "loss": 2.8798, "step": 2563 }, { "epoch": 0.7100526170036001, "grad_norm": 2.0248026847839355, "learning_rate": 3.971169122338668e-05, "loss": 2.9412, "step": 2564 }, { "epoch": 0.7103295486014954, "grad_norm": 1.8589743375778198, "learning_rate": 3.964132841402112e-05, "loss": 2.8227, "step": 2565 }, { "epoch": 0.7106064801993908, "grad_norm": 1.61398184299469, "learning_rate": 3.9571012582754985e-05, "loss": 2.7969, "step": 2566 }, { "epoch": 0.7108834117972861, "grad_norm": 1.543515682220459, "learning_rate": 3.9500743784316206e-05, "loss": 3.1315, "step": 2567 }, { "epoch": 0.7111603433951814, "grad_norm": 1.421244502067566, "learning_rate": 3.9430522073396084e-05, "loss": 2.7719, "step": 2568 }, { "epoch": 0.7114372749930767, "grad_norm": 1.5355627536773682, "learning_rate": 3.936034750464927e-05, "loss": 2.613, "step": 2569 }, { "epoch": 0.7117142065909721, "grad_norm": 1.0952552556991577, "learning_rate": 3.9290220132693756e-05, "loss": 2.6977, "step": 2570 }, { "epoch": 0.7119911381888674, "grad_norm": 1.621482253074646, "learning_rate": 3.922014001211077e-05, "loss": 2.7291, "step": 2571 }, { "epoch": 0.7122680697867627, "grad_norm": 2.1047937870025635, "learning_rate": 3.91501071974447e-05, "loss": 2.8873, "step": 2572 }, { "epoch": 0.712545001384658, "grad_norm": 1.1262298822402954, "learning_rate": 3.908012174320329e-05, "loss": 2.9252, "step": 2573 }, { "epoch": 0.7128219329825534, "grad_norm": 1.6226671934127808, "learning_rate": 3.901018370385724e-05, "loss": 2.7067, "step": 2574 }, { "epoch": 0.7130988645804486, "grad_norm": 2.3573596477508545, "learning_rate": 3.894029313384043e-05, "loss": 3.3536, "step": 2575 }, { "epoch": 0.7133757961783439, "grad_norm": 2.8473141193389893, "learning_rate": 3.88704500875498e-05, "loss": 3.2805, "step": 2576 }, { "epoch": 0.7136527277762392, "grad_norm": 1.263624668121338, "learning_rate": 3.880065461934529e-05, "loss": 2.7721, "step": 2577 }, { "epoch": 0.7139296593741346, "grad_norm": 1.3694475889205933, "learning_rate": 3.8730906783549795e-05, "loss": 2.8154, "step": 2578 }, { "epoch": 0.7142065909720299, "grad_norm": 0.9628450870513916, "learning_rate": 3.866120663444914e-05, "loss": 2.8262, "step": 2579 }, { "epoch": 0.7144835225699252, "grad_norm": 1.5288922786712646, "learning_rate": 3.85915542262921e-05, "loss": 2.9611, "step": 2580 }, { "epoch": 0.7147604541678205, "grad_norm": 1.7090123891830444, "learning_rate": 3.852194961329013e-05, "loss": 2.7685, "step": 2581 }, { "epoch": 0.7150373857657158, "grad_norm": 1.9330387115478516, "learning_rate": 3.8452392849617715e-05, "loss": 2.9505, "step": 2582 }, { "epoch": 0.7153143173636112, "grad_norm": 1.3753412961959839, "learning_rate": 3.838288398941187e-05, "loss": 2.9888, "step": 2583 }, { "epoch": 0.7155912489615065, "grad_norm": 1.3932002782821655, "learning_rate": 3.831342308677247e-05, "loss": 2.7885, "step": 2584 }, { "epoch": 0.7158681805594018, "grad_norm": 1.121376395225525, "learning_rate": 3.824401019576203e-05, "loss": 2.8153, "step": 2585 }, { "epoch": 0.7161451121572971, "grad_norm": 1.4889800548553467, "learning_rate": 3.817464537040571e-05, "loss": 2.7574, "step": 2586 }, { "epoch": 0.7164220437551925, "grad_norm": 0.9660783410072327, "learning_rate": 3.810532866469114e-05, "loss": 2.5652, "step": 2587 }, { "epoch": 0.7166989753530878, "grad_norm": 0.9699069261550903, "learning_rate": 3.803606013256871e-05, "loss": 2.8056, "step": 2588 }, { "epoch": 0.7169759069509831, "grad_norm": 0.8376581072807312, "learning_rate": 3.7966839827951196e-05, "loss": 2.7144, "step": 2589 }, { "epoch": 0.7172528385488784, "grad_norm": 1.4266955852508545, "learning_rate": 3.7897667804713756e-05, "loss": 2.9554, "step": 2590 }, { "epoch": 0.7175297701467738, "grad_norm": 1.1173572540283203, "learning_rate": 3.7828544116694176e-05, "loss": 2.5609, "step": 2591 }, { "epoch": 0.7178067017446691, "grad_norm": 1.240865707397461, "learning_rate": 3.775946881769241e-05, "loss": 2.7688, "step": 2592 }, { "epoch": 0.7180836333425644, "grad_norm": 1.3590009212493896, "learning_rate": 3.769044196147088e-05, "loss": 2.9172, "step": 2593 }, { "epoch": 0.7183605649404597, "grad_norm": 1.9907031059265137, "learning_rate": 3.762146360175427e-05, "loss": 2.5582, "step": 2594 }, { "epoch": 0.7186374965383551, "grad_norm": 1.6450612545013428, "learning_rate": 3.755253379222955e-05, "loss": 2.8313, "step": 2595 }, { "epoch": 0.7189144281362504, "grad_norm": 1.57441246509552, "learning_rate": 3.748365258654577e-05, "loss": 2.6225, "step": 2596 }, { "epoch": 0.7191913597341457, "grad_norm": 0.8331193327903748, "learning_rate": 3.741482003831439e-05, "loss": 2.6939, "step": 2597 }, { "epoch": 0.7194682913320409, "grad_norm": 0.8779714703559875, "learning_rate": 3.734603620110876e-05, "loss": 2.6911, "step": 2598 }, { "epoch": 0.7197452229299363, "grad_norm": 1.293872356414795, "learning_rate": 3.727730112846444e-05, "loss": 2.7169, "step": 2599 }, { "epoch": 0.7200221545278316, "grad_norm": 2.3049585819244385, "learning_rate": 3.720861487387901e-05, "loss": 3.6294, "step": 2600 }, { "epoch": 0.7202990861257269, "grad_norm": 1.5383538007736206, "learning_rate": 3.7139977490812087e-05, "loss": 2.8187, "step": 2601 }, { "epoch": 0.7205760177236222, "grad_norm": 1.1297353506088257, "learning_rate": 3.707138903268519e-05, "loss": 2.7199, "step": 2602 }, { "epoch": 0.7208529493215176, "grad_norm": 1.559241771697998, "learning_rate": 3.7002849552881815e-05, "loss": 2.7379, "step": 2603 }, { "epoch": 0.7211298809194129, "grad_norm": 1.2448389530181885, "learning_rate": 3.693435910474732e-05, "loss": 2.7702, "step": 2604 }, { "epoch": 0.7214068125173082, "grad_norm": 1.3755321502685547, "learning_rate": 3.686591774158882e-05, "loss": 2.8438, "step": 2605 }, { "epoch": 0.7216837441152035, "grad_norm": 1.7060540914535522, "learning_rate": 3.679752551667541e-05, "loss": 3.1769, "step": 2606 }, { "epoch": 0.7219606757130989, "grad_norm": 1.2973065376281738, "learning_rate": 3.672918248323774e-05, "loss": 2.9313, "step": 2607 }, { "epoch": 0.7222376073109942, "grad_norm": 0.9247957468032837, "learning_rate": 3.666088869446831e-05, "loss": 2.7732, "step": 2608 }, { "epoch": 0.7225145389088895, "grad_norm": 0.8578128814697266, "learning_rate": 3.659264420352122e-05, "loss": 2.7906, "step": 2609 }, { "epoch": 0.7227914705067848, "grad_norm": 1.6201186180114746, "learning_rate": 3.652444906351223e-05, "loss": 2.8034, "step": 2610 }, { "epoch": 0.7230684021046802, "grad_norm": 1.3790613412857056, "learning_rate": 3.645630332751871e-05, "loss": 2.8627, "step": 2611 }, { "epoch": 0.7233453337025755, "grad_norm": 1.420621395111084, "learning_rate": 3.638820704857954e-05, "loss": 2.9343, "step": 2612 }, { "epoch": 0.7236222653004708, "grad_norm": 1.4299603700637817, "learning_rate": 3.632016027969514e-05, "loss": 2.7154, "step": 2613 }, { "epoch": 0.7238991968983661, "grad_norm": 1.3731162548065186, "learning_rate": 3.6252163073827294e-05, "loss": 2.8569, "step": 2614 }, { "epoch": 0.7241761284962615, "grad_norm": 0.7518453598022461, "learning_rate": 3.618421548389942e-05, "loss": 2.7739, "step": 2615 }, { "epoch": 0.7244530600941568, "grad_norm": 0.7167025208473206, "learning_rate": 3.6116317562796085e-05, "loss": 2.7477, "step": 2616 }, { "epoch": 0.7247299916920521, "grad_norm": 1.2134454250335693, "learning_rate": 3.6048469363363345e-05, "loss": 2.4328, "step": 2617 }, { "epoch": 0.7250069232899474, "grad_norm": 10.561895370483398, "learning_rate": 3.5980670938408514e-05, "loss": 2.8107, "step": 2618 }, { "epoch": 0.7252838548878427, "grad_norm": 1.1714071035385132, "learning_rate": 3.5912922340700206e-05, "loss": 2.6758, "step": 2619 }, { "epoch": 0.7255607864857381, "grad_norm": 1.4013359546661377, "learning_rate": 3.5845223622968104e-05, "loss": 2.8461, "step": 2620 }, { "epoch": 0.7258377180836333, "grad_norm": 1.1809953451156616, "learning_rate": 3.577757483790329e-05, "loss": 2.8659, "step": 2621 }, { "epoch": 0.7261146496815286, "grad_norm": 1.393582820892334, "learning_rate": 3.5709976038157875e-05, "loss": 2.6336, "step": 2622 }, { "epoch": 0.7263915812794239, "grad_norm": 1.4034349918365479, "learning_rate": 3.564242727634499e-05, "loss": 2.73, "step": 2623 }, { "epoch": 0.7266685128773193, "grad_norm": 1.565304160118103, "learning_rate": 3.557492860503893e-05, "loss": 3.0669, "step": 2624 }, { "epoch": 0.7269454444752146, "grad_norm": 2.7715139389038086, "learning_rate": 3.550748007677497e-05, "loss": 3.332, "step": 2625 }, { "epoch": 0.7272223760731099, "grad_norm": 1.4593929052352905, "learning_rate": 3.544008174404938e-05, "loss": 2.7901, "step": 2626 }, { "epoch": 0.7274993076710052, "grad_norm": 1.7429101467132568, "learning_rate": 3.5372733659319256e-05, "loss": 2.5106, "step": 2627 }, { "epoch": 0.7277762392689006, "grad_norm": 1.2671338319778442, "learning_rate": 3.530543587500278e-05, "loss": 2.5782, "step": 2628 }, { "epoch": 0.7280531708667959, "grad_norm": 2.224750280380249, "learning_rate": 3.5238188443478795e-05, "loss": 2.864, "step": 2629 }, { "epoch": 0.7283301024646912, "grad_norm": 1.1255524158477783, "learning_rate": 3.517099141708703e-05, "loss": 2.4131, "step": 2630 }, { "epoch": 0.7286070340625865, "grad_norm": 1.5960115194320679, "learning_rate": 3.510384484812802e-05, "loss": 2.8515, "step": 2631 }, { "epoch": 0.7288839656604819, "grad_norm": 2.0236940383911133, "learning_rate": 3.5036748788862974e-05, "loss": 2.8617, "step": 2632 }, { "epoch": 0.7291608972583772, "grad_norm": 1.7315971851348877, "learning_rate": 3.49697032915138e-05, "loss": 2.6889, "step": 2633 }, { "epoch": 0.7294378288562725, "grad_norm": 1.8834228515625, "learning_rate": 3.4902708408263066e-05, "loss": 2.6978, "step": 2634 }, { "epoch": 0.7297147604541678, "grad_norm": 0.9408883452415466, "learning_rate": 3.483576419125397e-05, "loss": 2.5433, "step": 2635 }, { "epoch": 0.7299916920520632, "grad_norm": 1.3076893091201782, "learning_rate": 3.4768870692590147e-05, "loss": 3.1385, "step": 2636 }, { "epoch": 0.7302686236499585, "grad_norm": 1.5100501775741577, "learning_rate": 3.470202796433598e-05, "loss": 2.6895, "step": 2637 }, { "epoch": 0.7305455552478538, "grad_norm": 1.9968551397323608, "learning_rate": 3.463523605851613e-05, "loss": 2.5876, "step": 2638 }, { "epoch": 0.7308224868457491, "grad_norm": 1.1709613800048828, "learning_rate": 3.45684950271158e-05, "loss": 2.5664, "step": 2639 }, { "epoch": 0.7310994184436445, "grad_norm": 0.9514749050140381, "learning_rate": 3.450180492208058e-05, "loss": 2.6495, "step": 2640 }, { "epoch": 0.7313763500415398, "grad_norm": 1.7250149250030518, "learning_rate": 3.443516579531644e-05, "loss": 2.9695, "step": 2641 }, { "epoch": 0.7316532816394351, "grad_norm": 2.7542314529418945, "learning_rate": 3.436857769868963e-05, "loss": 2.8169, "step": 2642 }, { "epoch": 0.7319302132373304, "grad_norm": 1.302791953086853, "learning_rate": 3.4302040684026736e-05, "loss": 3.0475, "step": 2643 }, { "epoch": 0.7322071448352258, "grad_norm": 1.1405366659164429, "learning_rate": 3.423555480311457e-05, "loss": 2.685, "step": 2644 }, { "epoch": 0.732484076433121, "grad_norm": 1.650712251663208, "learning_rate": 3.416912010770005e-05, "loss": 2.65, "step": 2645 }, { "epoch": 0.7327610080310163, "grad_norm": 1.5690234899520874, "learning_rate": 3.410273664949045e-05, "loss": 2.4833, "step": 2646 }, { "epoch": 0.7330379396289116, "grad_norm": 0.912711501121521, "learning_rate": 3.403640448015298e-05, "loss": 2.6781, "step": 2647 }, { "epoch": 0.7333148712268069, "grad_norm": 1.6189790964126587, "learning_rate": 3.3970123651315046e-05, "loss": 2.6668, "step": 2648 }, { "epoch": 0.7335918028247023, "grad_norm": 1.7619707584381104, "learning_rate": 3.3903894214564026e-05, "loss": 2.7105, "step": 2649 }, { "epoch": 0.7338687344225976, "grad_norm": 3.2235915660858154, "learning_rate": 3.383771622144738e-05, "loss": 3.5835, "step": 2650 }, { "epoch": 0.7341456660204929, "grad_norm": 0.9661833047866821, "learning_rate": 3.3771589723472365e-05, "loss": 2.617, "step": 2651 }, { "epoch": 0.7344225976183882, "grad_norm": 1.3732293844223022, "learning_rate": 3.3705514772106415e-05, "loss": 2.8641, "step": 2652 }, { "epoch": 0.7346995292162836, "grad_norm": 0.6539769768714905, "learning_rate": 3.363949141877659e-05, "loss": 2.7455, "step": 2653 }, { "epoch": 0.7349764608141789, "grad_norm": 2.1681718826293945, "learning_rate": 3.3573519714869914e-05, "loss": 2.5595, "step": 2654 }, { "epoch": 0.7352533924120742, "grad_norm": 1.89735746383667, "learning_rate": 3.3507599711733286e-05, "loss": 2.5669, "step": 2655 }, { "epoch": 0.7355303240099695, "grad_norm": 1.5284838676452637, "learning_rate": 3.344173146067317e-05, "loss": 2.6923, "step": 2656 }, { "epoch": 0.7358072556078649, "grad_norm": 1.8220961093902588, "learning_rate": 3.337591501295592e-05, "loss": 2.6526, "step": 2657 }, { "epoch": 0.7360841872057602, "grad_norm": 1.5740773677825928, "learning_rate": 3.3310150419807474e-05, "loss": 2.8735, "step": 2658 }, { "epoch": 0.7363611188036555, "grad_norm": 1.1814055442810059, "learning_rate": 3.324443773241349e-05, "loss": 2.6313, "step": 2659 }, { "epoch": 0.7366380504015508, "grad_norm": 1.3792455196380615, "learning_rate": 3.317877700191909e-05, "loss": 2.7861, "step": 2660 }, { "epoch": 0.7369149819994462, "grad_norm": 2.363563299179077, "learning_rate": 3.311316827942918e-05, "loss": 2.2229, "step": 2661 }, { "epoch": 0.7371919135973415, "grad_norm": 1.0998777151107788, "learning_rate": 3.3047611616007945e-05, "loss": 2.8149, "step": 2662 }, { "epoch": 0.7374688451952368, "grad_norm": 1.206363320350647, "learning_rate": 3.2982107062679215e-05, "loss": 2.762, "step": 2663 }, { "epoch": 0.7377457767931321, "grad_norm": 0.9043403267860413, "learning_rate": 3.291665467042618e-05, "loss": 2.6419, "step": 2664 }, { "epoch": 0.7380227083910275, "grad_norm": 2.09918212890625, "learning_rate": 3.2851254490191495e-05, "loss": 2.7888, "step": 2665 }, { "epoch": 0.7382996399889228, "grad_norm": 1.7726424932479858, "learning_rate": 3.278590657287713e-05, "loss": 3.0603, "step": 2666 }, { "epoch": 0.7385765715868181, "grad_norm": 1.3921743631362915, "learning_rate": 3.272061096934439e-05, "loss": 2.6677, "step": 2667 }, { "epoch": 0.7388535031847133, "grad_norm": 1.326472520828247, "learning_rate": 3.26553677304139e-05, "loss": 2.7886, "step": 2668 }, { "epoch": 0.7391304347826086, "grad_norm": 1.3895070552825928, "learning_rate": 3.25901769068654e-05, "loss": 2.5025, "step": 2669 }, { "epoch": 0.739407366380504, "grad_norm": 1.382885217666626, "learning_rate": 3.252503854943806e-05, "loss": 2.7819, "step": 2670 }, { "epoch": 0.7396842979783993, "grad_norm": 1.269974946975708, "learning_rate": 3.2459952708829974e-05, "loss": 2.7521, "step": 2671 }, { "epoch": 0.7399612295762946, "grad_norm": 1.6350727081298828, "learning_rate": 3.2394919435698524e-05, "loss": 2.5446, "step": 2672 }, { "epoch": 0.74023816117419, "grad_norm": 1.1815497875213623, "learning_rate": 3.232993878066011e-05, "loss": 2.8673, "step": 2673 }, { "epoch": 0.7405150927720853, "grad_norm": 1.97178053855896, "learning_rate": 3.2265010794290195e-05, "loss": 2.8683, "step": 2674 }, { "epoch": 0.7407920243699806, "grad_norm": 4.066892147064209, "learning_rate": 3.220013552712325e-05, "loss": 4.1637, "step": 2675 }, { "epoch": 0.7410689559678759, "grad_norm": 1.6811068058013916, "learning_rate": 3.213531302965271e-05, "loss": 2.9421, "step": 2676 }, { "epoch": 0.7413458875657712, "grad_norm": 1.3643163442611694, "learning_rate": 3.2070543352330986e-05, "loss": 2.6982, "step": 2677 }, { "epoch": 0.7416228191636666, "grad_norm": 0.915283739566803, "learning_rate": 3.2005826545569214e-05, "loss": 2.9205, "step": 2678 }, { "epoch": 0.7418997507615619, "grad_norm": 1.3957204818725586, "learning_rate": 3.1941162659737647e-05, "loss": 2.5736, "step": 2679 }, { "epoch": 0.7421766823594572, "grad_norm": 1.4447449445724487, "learning_rate": 3.1876551745165084e-05, "loss": 2.4436, "step": 2680 }, { "epoch": 0.7424536139573525, "grad_norm": 1.5328930616378784, "learning_rate": 3.1811993852139255e-05, "loss": 2.7714, "step": 2681 }, { "epoch": 0.7427305455552479, "grad_norm": 1.225975751876831, "learning_rate": 3.174748903090658e-05, "loss": 2.7398, "step": 2682 }, { "epoch": 0.7430074771531432, "grad_norm": 2.1260430812835693, "learning_rate": 3.1683037331672206e-05, "loss": 3.0236, "step": 2683 }, { "epoch": 0.7432844087510385, "grad_norm": 1.9688694477081299, "learning_rate": 3.16186388045998e-05, "loss": 2.9523, "step": 2684 }, { "epoch": 0.7435613403489338, "grad_norm": 2.151796340942383, "learning_rate": 3.1554293499811826e-05, "loss": 2.4532, "step": 2685 }, { "epoch": 0.7438382719468292, "grad_norm": 1.460901141166687, "learning_rate": 3.149000146738927e-05, "loss": 2.7071, "step": 2686 }, { "epoch": 0.7441152035447245, "grad_norm": 1.6640523672103882, "learning_rate": 3.142576275737151e-05, "loss": 3.0941, "step": 2687 }, { "epoch": 0.7443921351426198, "grad_norm": 1.4269442558288574, "learning_rate": 3.136157741975668e-05, "loss": 2.5218, "step": 2688 }, { "epoch": 0.7446690667405151, "grad_norm": 2.3087620735168457, "learning_rate": 3.129744550450113e-05, "loss": 2.9008, "step": 2689 }, { "epoch": 0.7449459983384105, "grad_norm": 1.1492661237716675, "learning_rate": 3.123336706151978e-05, "loss": 2.8982, "step": 2690 }, { "epoch": 0.7452229299363057, "grad_norm": 2.044564723968506, "learning_rate": 3.1169342140685865e-05, "loss": 3.0544, "step": 2691 }, { "epoch": 0.745499861534201, "grad_norm": 2.40684175491333, "learning_rate": 3.110537079183104e-05, "loss": 2.7255, "step": 2692 }, { "epoch": 0.7457767931320963, "grad_norm": 1.2087938785552979, "learning_rate": 3.104145306474507e-05, "loss": 2.635, "step": 2693 }, { "epoch": 0.7460537247299917, "grad_norm": 1.0230350494384766, "learning_rate": 3.09775890091763e-05, "loss": 2.5559, "step": 2694 }, { "epoch": 0.746330656327887, "grad_norm": 1.2716301679611206, "learning_rate": 3.091377867483101e-05, "loss": 2.6127, "step": 2695 }, { "epoch": 0.7466075879257823, "grad_norm": 1.2278804779052734, "learning_rate": 3.08500221113738e-05, "loss": 2.8742, "step": 2696 }, { "epoch": 0.7468845195236776, "grad_norm": 1.224388599395752, "learning_rate": 3.078631936842742e-05, "loss": 2.9862, "step": 2697 }, { "epoch": 0.747161451121573, "grad_norm": 0.8031061291694641, "learning_rate": 3.0722670495572714e-05, "loss": 2.8578, "step": 2698 }, { "epoch": 0.7474383827194683, "grad_norm": 1.8247153759002686, "learning_rate": 3.065907554234858e-05, "loss": 2.6747, "step": 2699 }, { "epoch": 0.7477153143173636, "grad_norm": 3.208592414855957, "learning_rate": 3.059553455825198e-05, "loss": 3.7767, "step": 2700 }, { "epoch": 0.7479922459152589, "grad_norm": 1.3665844202041626, "learning_rate": 3.0532047592737875e-05, "loss": 2.7423, "step": 2701 }, { "epoch": 0.7482691775131542, "grad_norm": 1.562633991241455, "learning_rate": 3.046861469521909e-05, "loss": 2.8209, "step": 2702 }, { "epoch": 0.7485461091110496, "grad_norm": 0.8740465044975281, "learning_rate": 3.0405235915066553e-05, "loss": 2.8583, "step": 2703 }, { "epoch": 0.7488230407089449, "grad_norm": 1.4273957014083862, "learning_rate": 3.034191130160887e-05, "loss": 2.9819, "step": 2704 }, { "epoch": 0.7490999723068402, "grad_norm": 1.8278026580810547, "learning_rate": 3.0278640904132625e-05, "loss": 2.9547, "step": 2705 }, { "epoch": 0.7493769039047355, "grad_norm": 1.4489752054214478, "learning_rate": 3.021542477188213e-05, "loss": 2.5978, "step": 2706 }, { "epoch": 0.7496538355026309, "grad_norm": 1.9576112031936646, "learning_rate": 3.015226295405952e-05, "loss": 2.6865, "step": 2707 }, { "epoch": 0.7499307671005262, "grad_norm": 0.8638213276863098, "learning_rate": 3.0089155499824605e-05, "loss": 2.599, "step": 2708 }, { "epoch": 0.7502076986984215, "grad_norm": 1.1353836059570312, "learning_rate": 3.0026102458294924e-05, "loss": 2.9723, "step": 2709 }, { "epoch": 0.7502076986984215, "eval_loss": 0.3528745770454407, "eval_runtime": 804.2518, "eval_samples_per_second": 7.562, "eval_steps_per_second": 1.891, "step": 2709 }, { "epoch": 0.7504846302963168, "grad_norm": 1.2876255512237549, "learning_rate": 2.9963103878545662e-05, "loss": 2.6112, "step": 2710 }, { "epoch": 0.7507615618942122, "grad_norm": 1.2612682580947876, "learning_rate": 2.9900159809609517e-05, "loss": 2.7906, "step": 2711 }, { "epoch": 0.7510384934921075, "grad_norm": 1.1340705156326294, "learning_rate": 2.983727030047696e-05, "loss": 2.8441, "step": 2712 }, { "epoch": 0.7513154250900028, "grad_norm": 1.7764126062393188, "learning_rate": 2.9774435400095802e-05, "loss": 3.2773, "step": 2713 }, { "epoch": 0.7515923566878981, "grad_norm": 1.3928422927856445, "learning_rate": 2.9711655157371443e-05, "loss": 2.603, "step": 2714 }, { "epoch": 0.7518692882857934, "grad_norm": 1.1370984315872192, "learning_rate": 2.9648929621166742e-05, "loss": 2.827, "step": 2715 }, { "epoch": 0.7521462198836887, "grad_norm": 1.6648856401443481, "learning_rate": 2.9586258840301983e-05, "loss": 2.8053, "step": 2716 }, { "epoch": 0.752423151481584, "grad_norm": 1.1349536180496216, "learning_rate": 2.9523642863554746e-05, "loss": 3.0446, "step": 2717 }, { "epoch": 0.7527000830794793, "grad_norm": 1.5090161561965942, "learning_rate": 2.9461081739660103e-05, "loss": 2.8436, "step": 2718 }, { "epoch": 0.7529770146773747, "grad_norm": 1.2340972423553467, "learning_rate": 2.9398575517310355e-05, "loss": 3.0514, "step": 2719 }, { "epoch": 0.75325394627527, "grad_norm": 0.8456771969795227, "learning_rate": 2.9336124245154995e-05, "loss": 2.5057, "step": 2720 }, { "epoch": 0.7535308778731653, "grad_norm": 1.490369439125061, "learning_rate": 2.9273727971800935e-05, "loss": 2.7719, "step": 2721 }, { "epoch": 0.7538078094710606, "grad_norm": 1.4499380588531494, "learning_rate": 2.9211386745812084e-05, "loss": 2.5641, "step": 2722 }, { "epoch": 0.754084741068956, "grad_norm": 1.4747735261917114, "learning_rate": 2.9149100615709634e-05, "loss": 3.0209, "step": 2723 }, { "epoch": 0.7543616726668513, "grad_norm": 1.836558222770691, "learning_rate": 2.9086869629971836e-05, "loss": 2.9703, "step": 2724 }, { "epoch": 0.7546386042647466, "grad_norm": 2.954867124557495, "learning_rate": 2.902469383703409e-05, "loss": 3.6567, "step": 2725 }, { "epoch": 0.7549155358626419, "grad_norm": 1.0744221210479736, "learning_rate": 2.8962573285288695e-05, "loss": 2.8091, "step": 2726 }, { "epoch": 0.7551924674605373, "grad_norm": 0.9103502035140991, "learning_rate": 2.8900508023085172e-05, "loss": 2.6369, "step": 2727 }, { "epoch": 0.7554693990584326, "grad_norm": 1.605891227722168, "learning_rate": 2.8838498098729784e-05, "loss": 3.0776, "step": 2728 }, { "epoch": 0.7557463306563279, "grad_norm": 1.0748717784881592, "learning_rate": 2.8776543560485857e-05, "loss": 2.7977, "step": 2729 }, { "epoch": 0.7560232622542232, "grad_norm": 1.3585500717163086, "learning_rate": 2.8714644456573593e-05, "loss": 2.8631, "step": 2730 }, { "epoch": 0.7563001938521186, "grad_norm": 1.102792739868164, "learning_rate": 2.8652800835170035e-05, "loss": 2.6547, "step": 2731 }, { "epoch": 0.7565771254500139, "grad_norm": 1.1205133199691772, "learning_rate": 2.8591012744409018e-05, "loss": 2.8023, "step": 2732 }, { "epoch": 0.7568540570479092, "grad_norm": 1.9962104558944702, "learning_rate": 2.8529280232381206e-05, "loss": 2.9932, "step": 2733 }, { "epoch": 0.7571309886458045, "grad_norm": 2.3501553535461426, "learning_rate": 2.8467603347133997e-05, "loss": 2.86, "step": 2734 }, { "epoch": 0.7574079202436999, "grad_norm": 1.4866106510162354, "learning_rate": 2.8405982136671395e-05, "loss": 2.7031, "step": 2735 }, { "epoch": 0.7576848518415952, "grad_norm": 1.0783441066741943, "learning_rate": 2.8344416648954265e-05, "loss": 2.6901, "step": 2736 }, { "epoch": 0.7579617834394905, "grad_norm": 1.2151702642440796, "learning_rate": 2.82829069318999e-05, "loss": 2.826, "step": 2737 }, { "epoch": 0.7582387150373857, "grad_norm": 1.172462821006775, "learning_rate": 2.8221453033382306e-05, "loss": 2.7353, "step": 2738 }, { "epoch": 0.758515646635281, "grad_norm": 1.1474409103393555, "learning_rate": 2.816005500123203e-05, "loss": 2.923, "step": 2739 }, { "epoch": 0.7587925782331764, "grad_norm": 1.625393271446228, "learning_rate": 2.80987128832361e-05, "loss": 2.7295, "step": 2740 }, { "epoch": 0.7590695098310717, "grad_norm": 1.599900245666504, "learning_rate": 2.803742672713807e-05, "loss": 2.8418, "step": 2741 }, { "epoch": 0.759346441428967, "grad_norm": 1.1119273900985718, "learning_rate": 2.797619658063789e-05, "loss": 2.6902, "step": 2742 }, { "epoch": 0.7596233730268623, "grad_norm": 1.754237174987793, "learning_rate": 2.7915022491391995e-05, "loss": 2.5603, "step": 2743 }, { "epoch": 0.7599003046247577, "grad_norm": 1.3307304382324219, "learning_rate": 2.785390450701303e-05, "loss": 2.7549, "step": 2744 }, { "epoch": 0.760177236222653, "grad_norm": 1.1118719577789307, "learning_rate": 2.7792842675070206e-05, "loss": 2.8739, "step": 2745 }, { "epoch": 0.7604541678205483, "grad_norm": 1.6018906831741333, "learning_rate": 2.7731837043088826e-05, "loss": 2.8068, "step": 2746 }, { "epoch": 0.7607310994184436, "grad_norm": 1.2182482481002808, "learning_rate": 2.7670887658550536e-05, "loss": 2.6503, "step": 2747 }, { "epoch": 0.761008031016339, "grad_norm": 0.8037989735603333, "learning_rate": 2.7609994568893204e-05, "loss": 2.6132, "step": 2748 }, { "epoch": 0.7612849626142343, "grad_norm": 1.4756443500518799, "learning_rate": 2.7549157821510885e-05, "loss": 2.8708, "step": 2749 }, { "epoch": 0.7615618942121296, "grad_norm": 2.4815382957458496, "learning_rate": 2.7488377463753757e-05, "loss": 3.4011, "step": 2750 }, { "epoch": 0.7618388258100249, "grad_norm": 1.4734253883361816, "learning_rate": 2.7427653542928132e-05, "loss": 2.7257, "step": 2751 }, { "epoch": 0.7621157574079203, "grad_norm": 1.261542797088623, "learning_rate": 2.7366986106296434e-05, "loss": 2.7427, "step": 2752 }, { "epoch": 0.7623926890058156, "grad_norm": 1.117368459701538, "learning_rate": 2.730637520107696e-05, "loss": 2.8775, "step": 2753 }, { "epoch": 0.7626696206037109, "grad_norm": 1.3518387079238892, "learning_rate": 2.7245820874444272e-05, "loss": 2.5352, "step": 2754 }, { "epoch": 0.7629465522016062, "grad_norm": 5.788182258605957, "learning_rate": 2.7185323173528643e-05, "loss": 2.5623, "step": 2755 }, { "epoch": 0.7632234837995016, "grad_norm": 1.519083023071289, "learning_rate": 2.712488214541642e-05, "loss": 2.8207, "step": 2756 }, { "epoch": 0.7635004153973969, "grad_norm": 1.3019345998764038, "learning_rate": 2.7064497837149804e-05, "loss": 2.5104, "step": 2757 }, { "epoch": 0.7637773469952922, "grad_norm": 1.2609566450119019, "learning_rate": 2.7004170295726872e-05, "loss": 2.8641, "step": 2758 }, { "epoch": 0.7640542785931875, "grad_norm": 1.7451201677322388, "learning_rate": 2.6943899568101405e-05, "loss": 3.1349, "step": 2759 }, { "epoch": 0.7643312101910829, "grad_norm": 2.2196478843688965, "learning_rate": 2.688368570118317e-05, "loss": 2.6598, "step": 2760 }, { "epoch": 0.7646081417889781, "grad_norm": 0.8295835852622986, "learning_rate": 2.6823528741837488e-05, "loss": 2.8099, "step": 2761 }, { "epoch": 0.7648850733868734, "grad_norm": 1.176751732826233, "learning_rate": 2.6763428736885476e-05, "loss": 2.7665, "step": 2762 }, { "epoch": 0.7651620049847687, "grad_norm": 1.5936609506607056, "learning_rate": 2.6703385733103924e-05, "loss": 2.8111, "step": 2763 }, { "epoch": 0.765438936582664, "grad_norm": 1.1688146591186523, "learning_rate": 2.6643399777225232e-05, "loss": 2.8102, "step": 2764 }, { "epoch": 0.7657158681805594, "grad_norm": 1.168511152267456, "learning_rate": 2.6583470915937402e-05, "loss": 2.7653, "step": 2765 }, { "epoch": 0.7659927997784547, "grad_norm": 1.4523048400878906, "learning_rate": 2.652359919588403e-05, "loss": 2.6246, "step": 2766 }, { "epoch": 0.76626973137635, "grad_norm": 1.4812711477279663, "learning_rate": 2.6463784663664214e-05, "loss": 2.9475, "step": 2767 }, { "epoch": 0.7665466629742453, "grad_norm": 10.08879280090332, "learning_rate": 2.6404027365832473e-05, "loss": 2.7762, "step": 2768 }, { "epoch": 0.7668235945721407, "grad_norm": 1.3308957815170288, "learning_rate": 2.6344327348898958e-05, "loss": 2.4427, "step": 2769 }, { "epoch": 0.767100526170036, "grad_norm": 1.9766590595245361, "learning_rate": 2.628468465932904e-05, "loss": 2.39, "step": 2770 }, { "epoch": 0.7673774577679313, "grad_norm": 2.2545158863067627, "learning_rate": 2.622509934354359e-05, "loss": 2.5055, "step": 2771 }, { "epoch": 0.7676543893658266, "grad_norm": 1.3092906475067139, "learning_rate": 2.616557144791879e-05, "loss": 2.5807, "step": 2772 }, { "epoch": 0.767931320963722, "grad_norm": 1.561122179031372, "learning_rate": 2.6106101018786134e-05, "loss": 2.9317, "step": 2773 }, { "epoch": 0.7682082525616173, "grad_norm": 1.4232147932052612, "learning_rate": 2.6046688102432382e-05, "loss": 2.9531, "step": 2774 }, { "epoch": 0.7684851841595126, "grad_norm": 3.122957944869995, "learning_rate": 2.5987332745099558e-05, "loss": 3.8673, "step": 2775 }, { "epoch": 0.7687621157574079, "grad_norm": 2.0886476039886475, "learning_rate": 2.5928034992984872e-05, "loss": 2.9174, "step": 2776 }, { "epoch": 0.7690390473553033, "grad_norm": 1.1254757642745972, "learning_rate": 2.5868794892240612e-05, "loss": 2.6686, "step": 2777 }, { "epoch": 0.7693159789531986, "grad_norm": 1.8632172346115112, "learning_rate": 2.58096124889744e-05, "loss": 2.9362, "step": 2778 }, { "epoch": 0.7695929105510939, "grad_norm": 1.440072774887085, "learning_rate": 2.5750487829248726e-05, "loss": 2.4785, "step": 2779 }, { "epoch": 0.7698698421489892, "grad_norm": 1.0852787494659424, "learning_rate": 2.5691420959081292e-05, "loss": 2.6737, "step": 2780 }, { "epoch": 0.7701467737468846, "grad_norm": 1.4319770336151123, "learning_rate": 2.5632411924444744e-05, "loss": 2.6004, "step": 2781 }, { "epoch": 0.7704237053447799, "grad_norm": 1.5288673639297485, "learning_rate": 2.5573460771266744e-05, "loss": 2.8314, "step": 2782 }, { "epoch": 0.7707006369426752, "grad_norm": 1.1410452127456665, "learning_rate": 2.5514567545429913e-05, "loss": 2.6155, "step": 2783 }, { "epoch": 0.7709775685405704, "grad_norm": 1.6288542747497559, "learning_rate": 2.545573229277175e-05, "loss": 2.6698, "step": 2784 }, { "epoch": 0.7712545001384657, "grad_norm": 2.0513246059417725, "learning_rate": 2.5396955059084703e-05, "loss": 2.958, "step": 2785 }, { "epoch": 0.7715314317363611, "grad_norm": 1.474790334701538, "learning_rate": 2.5338235890115902e-05, "loss": 2.7685, "step": 2786 }, { "epoch": 0.7718083633342564, "grad_norm": 1.5853452682495117, "learning_rate": 2.527957483156753e-05, "loss": 2.7268, "step": 2787 }, { "epoch": 0.7720852949321517, "grad_norm": 1.7835679054260254, "learning_rate": 2.5220971929096317e-05, "loss": 2.7315, "step": 2788 }, { "epoch": 0.772362226530047, "grad_norm": 1.3402307033538818, "learning_rate": 2.5162427228313857e-05, "loss": 2.9819, "step": 2789 }, { "epoch": 0.7726391581279424, "grad_norm": 1.7315587997436523, "learning_rate": 2.5103940774786385e-05, "loss": 2.7618, "step": 2790 }, { "epoch": 0.7729160897258377, "grad_norm": 1.3723822832107544, "learning_rate": 2.504551261403487e-05, "loss": 2.8444, "step": 2791 }, { "epoch": 0.773193021323733, "grad_norm": 1.6342122554779053, "learning_rate": 2.4987142791534767e-05, "loss": 2.7073, "step": 2792 }, { "epoch": 0.7734699529216283, "grad_norm": 1.0601471662521362, "learning_rate": 2.4928831352716353e-05, "loss": 2.7918, "step": 2793 }, { "epoch": 0.7737468845195237, "grad_norm": 1.8162391185760498, "learning_rate": 2.4870578342964245e-05, "loss": 2.6307, "step": 2794 }, { "epoch": 0.774023816117419, "grad_norm": 1.343056559562683, "learning_rate": 2.481238380761769e-05, "loss": 2.5655, "step": 2795 }, { "epoch": 0.7743007477153143, "grad_norm": 2.555288076400757, "learning_rate": 2.4754247791970418e-05, "loss": 3.0405, "step": 2796 }, { "epoch": 0.7745776793132096, "grad_norm": 1.2910922765731812, "learning_rate": 2.4696170341270585e-05, "loss": 2.7417, "step": 2797 }, { "epoch": 0.774854610911105, "grad_norm": 1.4226361513137817, "learning_rate": 2.4638151500720806e-05, "loss": 2.7559, "step": 2798 }, { "epoch": 0.7751315425090003, "grad_norm": 1.2817487716674805, "learning_rate": 2.458019131547803e-05, "loss": 2.9169, "step": 2799 }, { "epoch": 0.7754084741068956, "grad_norm": 3.5929179191589355, "learning_rate": 2.452228983065361e-05, "loss": 3.5593, "step": 2800 }, { "epoch": 0.7756854057047909, "grad_norm": 1.4803215265274048, "learning_rate": 2.4464447091313104e-05, "loss": 2.8882, "step": 2801 }, { "epoch": 0.7759623373026863, "grad_norm": 1.2327439785003662, "learning_rate": 2.4406663142476525e-05, "loss": 2.6362, "step": 2802 }, { "epoch": 0.7762392689005816, "grad_norm": 1.368813157081604, "learning_rate": 2.434893802911795e-05, "loss": 2.8441, "step": 2803 }, { "epoch": 0.7765162004984769, "grad_norm": 1.8350450992584229, "learning_rate": 2.429127179616575e-05, "loss": 3.1696, "step": 2804 }, { "epoch": 0.7767931320963722, "grad_norm": 1.1270864009857178, "learning_rate": 2.4233664488502484e-05, "loss": 2.6051, "step": 2805 }, { "epoch": 0.7770700636942676, "grad_norm": 1.1075642108917236, "learning_rate": 2.417611615096479e-05, "loss": 2.7663, "step": 2806 }, { "epoch": 0.7773469952921629, "grad_norm": 1.2369346618652344, "learning_rate": 2.411862682834346e-05, "loss": 2.5637, "step": 2807 }, { "epoch": 0.7776239268900581, "grad_norm": 1.526141881942749, "learning_rate": 2.4061196565383337e-05, "loss": 2.6793, "step": 2808 }, { "epoch": 0.7779008584879534, "grad_norm": 1.2792106866836548, "learning_rate": 2.4003825406783308e-05, "loss": 2.6412, "step": 2809 }, { "epoch": 0.7781777900858488, "grad_norm": 1.113972783088684, "learning_rate": 2.394651339719618e-05, "loss": 2.6725, "step": 2810 }, { "epoch": 0.7784547216837441, "grad_norm": 1.1308820247650146, "learning_rate": 2.3889260581228878e-05, "loss": 2.6811, "step": 2811 }, { "epoch": 0.7787316532816394, "grad_norm": 1.0981258153915405, "learning_rate": 2.3832067003442093e-05, "loss": 2.7958, "step": 2812 }, { "epoch": 0.7790085848795347, "grad_norm": 1.6483042240142822, "learning_rate": 2.3774932708350505e-05, "loss": 3.0734, "step": 2813 }, { "epoch": 0.77928551647743, "grad_norm": 1.5144284963607788, "learning_rate": 2.3717857740422644e-05, "loss": 2.7932, "step": 2814 }, { "epoch": 0.7795624480753254, "grad_norm": 1.8899537324905396, "learning_rate": 2.366084214408084e-05, "loss": 2.9024, "step": 2815 }, { "epoch": 0.7798393796732207, "grad_norm": 1.5420624017715454, "learning_rate": 2.360388596370122e-05, "loss": 2.9661, "step": 2816 }, { "epoch": 0.780116311271116, "grad_norm": 1.57925546169281, "learning_rate": 2.3546989243613682e-05, "loss": 2.7884, "step": 2817 }, { "epoch": 0.7803932428690113, "grad_norm": 1.3874616622924805, "learning_rate": 2.3490152028101852e-05, "loss": 2.6057, "step": 2818 }, { "epoch": 0.7806701744669067, "grad_norm": 1.1913419961929321, "learning_rate": 2.343337436140295e-05, "loss": 2.6187, "step": 2819 }, { "epoch": 0.780947106064802, "grad_norm": 1.2663335800170898, "learning_rate": 2.337665628770803e-05, "loss": 2.6802, "step": 2820 }, { "epoch": 0.7812240376626973, "grad_norm": 1.302472472190857, "learning_rate": 2.331999785116159e-05, "loss": 2.9861, "step": 2821 }, { "epoch": 0.7815009692605926, "grad_norm": 1.3166072368621826, "learning_rate": 2.326339909586178e-05, "loss": 2.6539, "step": 2822 }, { "epoch": 0.781777900858488, "grad_norm": 1.3500840663909912, "learning_rate": 2.3206860065860302e-05, "loss": 2.7886, "step": 2823 }, { "epoch": 0.7820548324563833, "grad_norm": 1.730839729309082, "learning_rate": 2.3150380805162418e-05, "loss": 2.7772, "step": 2824 }, { "epoch": 0.7823317640542786, "grad_norm": 2.9924428462982178, "learning_rate": 2.3093961357726723e-05, "loss": 3.5744, "step": 2825 }, { "epoch": 0.782608695652174, "grad_norm": 1.2666369676589966, "learning_rate": 2.3037601767465454e-05, "loss": 2.5735, "step": 2826 }, { "epoch": 0.7828856272500693, "grad_norm": 1.5073728561401367, "learning_rate": 2.2981302078244105e-05, "loss": 2.7513, "step": 2827 }, { "epoch": 0.7831625588479646, "grad_norm": 1.7456878423690796, "learning_rate": 2.292506233388162e-05, "loss": 2.8464, "step": 2828 }, { "epoch": 0.7834394904458599, "grad_norm": 0.926898181438446, "learning_rate": 2.2868882578150285e-05, "loss": 2.8184, "step": 2829 }, { "epoch": 0.7837164220437552, "grad_norm": 1.0987428426742554, "learning_rate": 2.281276285477566e-05, "loss": 2.6226, "step": 2830 }, { "epoch": 0.7839933536416505, "grad_norm": 1.6241536140441895, "learning_rate": 2.2756703207436624e-05, "loss": 2.7239, "step": 2831 }, { "epoch": 0.7842702852395458, "grad_norm": 0.9334229230880737, "learning_rate": 2.2700703679765278e-05, "loss": 2.7746, "step": 2832 }, { "epoch": 0.7845472168374411, "grad_norm": 1.409681797027588, "learning_rate": 2.2644764315346956e-05, "loss": 3.1124, "step": 2833 }, { "epoch": 0.7848241484353364, "grad_norm": 1.6021548509597778, "learning_rate": 2.258888515772005e-05, "loss": 2.7046, "step": 2834 }, { "epoch": 0.7851010800332318, "grad_norm": 1.2131134271621704, "learning_rate": 2.25330662503763e-05, "loss": 2.5163, "step": 2835 }, { "epoch": 0.7853780116311271, "grad_norm": 1.2493963241577148, "learning_rate": 2.2477307636760357e-05, "loss": 2.6996, "step": 2836 }, { "epoch": 0.7856549432290224, "grad_norm": 1.7190215587615967, "learning_rate": 2.2421609360270045e-05, "loss": 2.9826, "step": 2837 }, { "epoch": 0.7859318748269177, "grad_norm": 1.164171814918518, "learning_rate": 2.236597146425622e-05, "loss": 2.9398, "step": 2838 }, { "epoch": 0.7862088064248131, "grad_norm": 1.6923491954803467, "learning_rate": 2.2310393992022704e-05, "loss": 2.5764, "step": 2839 }, { "epoch": 0.7864857380227084, "grad_norm": 1.948574185371399, "learning_rate": 2.2254876986826323e-05, "loss": 2.7705, "step": 2840 }, { "epoch": 0.7867626696206037, "grad_norm": 1.9062774181365967, "learning_rate": 2.2199420491876845e-05, "loss": 3.0385, "step": 2841 }, { "epoch": 0.787039601218499, "grad_norm": 0.9570973515510559, "learning_rate": 2.214402455033694e-05, "loss": 2.9905, "step": 2842 }, { "epoch": 0.7873165328163944, "grad_norm": 1.036166787147522, "learning_rate": 2.2088689205322068e-05, "loss": 2.8339, "step": 2843 }, { "epoch": 0.7875934644142897, "grad_norm": 1.1238036155700684, "learning_rate": 2.2033414499900685e-05, "loss": 2.6468, "step": 2844 }, { "epoch": 0.787870396012185, "grad_norm": 1.448442816734314, "learning_rate": 2.1978200477093902e-05, "loss": 2.5738, "step": 2845 }, { "epoch": 0.7881473276100803, "grad_norm": 1.4662296772003174, "learning_rate": 2.1923047179875654e-05, "loss": 2.8733, "step": 2846 }, { "epoch": 0.7884242592079757, "grad_norm": 1.1000827550888062, "learning_rate": 2.186795465117265e-05, "loss": 2.461, "step": 2847 }, { "epoch": 0.788701190805871, "grad_norm": 1.3521825075149536, "learning_rate": 2.1812922933864256e-05, "loss": 2.6673, "step": 2848 }, { "epoch": 0.7889781224037663, "grad_norm": 1.9187358617782593, "learning_rate": 2.1757952070782504e-05, "loss": 2.844, "step": 2849 }, { "epoch": 0.7892550540016616, "grad_norm": 1.8662562370300293, "learning_rate": 2.170304210471209e-05, "loss": 3.4556, "step": 2850 }, { "epoch": 0.789531985599557, "grad_norm": 1.1305601596832275, "learning_rate": 2.1648193078390332e-05, "loss": 2.9509, "step": 2851 }, { "epoch": 0.7898089171974523, "grad_norm": 1.4433000087738037, "learning_rate": 2.1593405034506997e-05, "loss": 2.9256, "step": 2852 }, { "epoch": 0.7900858487953476, "grad_norm": 1.4666862487792969, "learning_rate": 2.1538678015704595e-05, "loss": 2.648, "step": 2853 }, { "epoch": 0.7903627803932428, "grad_norm": 1.5469653606414795, "learning_rate": 2.148401206457793e-05, "loss": 3.1121, "step": 2854 }, { "epoch": 0.7906397119911381, "grad_norm": 1.8728482723236084, "learning_rate": 2.1429407223674402e-05, "loss": 2.537, "step": 2855 }, { "epoch": 0.7909166435890335, "grad_norm": 0.8866031169891357, "learning_rate": 2.1374863535493816e-05, "loss": 2.609, "step": 2856 }, { "epoch": 0.7911935751869288, "grad_norm": 1.1060009002685547, "learning_rate": 2.132038104248839e-05, "loss": 2.6987, "step": 2857 }, { "epoch": 0.7914705067848241, "grad_norm": 1.1832937002182007, "learning_rate": 2.126595978706265e-05, "loss": 2.7546, "step": 2858 }, { "epoch": 0.7917474383827194, "grad_norm": 1.523499608039856, "learning_rate": 2.121159981157359e-05, "loss": 2.7163, "step": 2859 }, { "epoch": 0.7920243699806148, "grad_norm": 1.5445829629898071, "learning_rate": 2.1157301158330357e-05, "loss": 2.5706, "step": 2860 }, { "epoch": 0.7923013015785101, "grad_norm": 1.2627346515655518, "learning_rate": 2.1103063869594486e-05, "loss": 2.8291, "step": 2861 }, { "epoch": 0.7925782331764054, "grad_norm": 1.1070663928985596, "learning_rate": 2.104888798757969e-05, "loss": 2.8669, "step": 2862 }, { "epoch": 0.7928551647743007, "grad_norm": 1.612951636314392, "learning_rate": 2.0994773554451908e-05, "loss": 2.7573, "step": 2863 }, { "epoch": 0.7931320963721961, "grad_norm": 1.42770516872406, "learning_rate": 2.0940720612329258e-05, "loss": 2.535, "step": 2864 }, { "epoch": 0.7934090279700914, "grad_norm": 0.787700355052948, "learning_rate": 2.0886729203281995e-05, "loss": 2.6253, "step": 2865 }, { "epoch": 0.7936859595679867, "grad_norm": 1.5732358694076538, "learning_rate": 2.08327993693325e-05, "loss": 2.9229, "step": 2866 }, { "epoch": 0.793962891165882, "grad_norm": 0.9501615762710571, "learning_rate": 2.0778931152455116e-05, "loss": 2.5643, "step": 2867 }, { "epoch": 0.7942398227637774, "grad_norm": 0.8824294805526733, "learning_rate": 2.072512459457645e-05, "loss": 2.7563, "step": 2868 }, { "epoch": 0.7945167543616727, "grad_norm": 1.1400706768035889, "learning_rate": 2.067137973757489e-05, "loss": 2.7726, "step": 2869 }, { "epoch": 0.794793685959568, "grad_norm": 1.441460132598877, "learning_rate": 2.0617696623280934e-05, "loss": 2.8705, "step": 2870 }, { "epoch": 0.7950706175574633, "grad_norm": 1.4153046607971191, "learning_rate": 2.056407529347699e-05, "loss": 2.7678, "step": 2871 }, { "epoch": 0.7953475491553587, "grad_norm": 1.5290249586105347, "learning_rate": 2.0510515789897377e-05, "loss": 2.4862, "step": 2872 }, { "epoch": 0.795624480753254, "grad_norm": 2.0305423736572266, "learning_rate": 2.045701815422829e-05, "loss": 3.056, "step": 2873 }, { "epoch": 0.7959014123511493, "grad_norm": 1.9372893571853638, "learning_rate": 2.0403582428107792e-05, "loss": 3.0894, "step": 2874 }, { "epoch": 0.7961783439490446, "grad_norm": 3.0845742225646973, "learning_rate": 2.0350208653125756e-05, "loss": 3.8301, "step": 2875 }, { "epoch": 0.79645527554694, "grad_norm": 1.420465350151062, "learning_rate": 2.0296896870823766e-05, "loss": 2.8496, "step": 2876 }, { "epoch": 0.7967322071448353, "grad_norm": 2.17264986038208, "learning_rate": 2.024364712269531e-05, "loss": 2.9893, "step": 2877 }, { "epoch": 0.7970091387427305, "grad_norm": 1.0084600448608398, "learning_rate": 2.0190459450185418e-05, "loss": 2.6218, "step": 2878 }, { "epoch": 0.7972860703406258, "grad_norm": 1.1605966091156006, "learning_rate": 2.0137333894690912e-05, "loss": 2.449, "step": 2879 }, { "epoch": 0.7975630019385211, "grad_norm": 1.5368242263793945, "learning_rate": 2.0084270497560263e-05, "loss": 2.9994, "step": 2880 }, { "epoch": 0.7978399335364165, "grad_norm": 1.3453278541564941, "learning_rate": 2.0031269300093535e-05, "loss": 2.8973, "step": 2881 }, { "epoch": 0.7981168651343118, "grad_norm": 1.261345624923706, "learning_rate": 1.9978330343542384e-05, "loss": 2.8822, "step": 2882 }, { "epoch": 0.7983937967322071, "grad_norm": 1.3080271482467651, "learning_rate": 1.992545366911004e-05, "loss": 2.8124, "step": 2883 }, { "epoch": 0.7986707283301024, "grad_norm": 1.1540530920028687, "learning_rate": 1.987263931795126e-05, "loss": 2.8198, "step": 2884 }, { "epoch": 0.7989476599279978, "grad_norm": 1.3719029426574707, "learning_rate": 1.9819887331172204e-05, "loss": 2.8129, "step": 2885 }, { "epoch": 0.7992245915258931, "grad_norm": 1.1547625064849854, "learning_rate": 1.976719774983069e-05, "loss": 2.9531, "step": 2886 }, { "epoch": 0.7995015231237884, "grad_norm": 1.4991424083709717, "learning_rate": 1.9714570614935756e-05, "loss": 2.9222, "step": 2887 }, { "epoch": 0.7997784547216837, "grad_norm": 1.1141403913497925, "learning_rate": 1.966200596744794e-05, "loss": 2.6606, "step": 2888 }, { "epoch": 0.8000553863195791, "grad_norm": 1.5983209609985352, "learning_rate": 1.9609503848279144e-05, "loss": 2.6946, "step": 2889 }, { "epoch": 0.8003323179174744, "grad_norm": 1.5049333572387695, "learning_rate": 1.9557064298292615e-05, "loss": 2.9702, "step": 2890 }, { "epoch": 0.8006092495153697, "grad_norm": 2.0525035858154297, "learning_rate": 1.9504687358302798e-05, "loss": 2.9155, "step": 2891 }, { "epoch": 0.800886181113265, "grad_norm": 1.595201015472412, "learning_rate": 1.945237306907558e-05, "loss": 2.6907, "step": 2892 }, { "epoch": 0.8011631127111604, "grad_norm": 1.6689269542694092, "learning_rate": 1.9400121471327924e-05, "loss": 3.053, "step": 2893 }, { "epoch": 0.8014400443090557, "grad_norm": 1.5648447275161743, "learning_rate": 1.9347932605728093e-05, "loss": 2.494, "step": 2894 }, { "epoch": 0.801716975906951, "grad_norm": 1.5639302730560303, "learning_rate": 1.9295806512895498e-05, "loss": 2.7028, "step": 2895 }, { "epoch": 0.8019939075048463, "grad_norm": 0.7483145594596863, "learning_rate": 1.924374323340068e-05, "loss": 2.8071, "step": 2896 }, { "epoch": 0.8022708391027417, "grad_norm": 1.3213709592819214, "learning_rate": 1.9191742807765323e-05, "loss": 2.8306, "step": 2897 }, { "epoch": 0.802547770700637, "grad_norm": 1.1647886037826538, "learning_rate": 1.9139805276462165e-05, "loss": 2.6671, "step": 2898 }, { "epoch": 0.8028247022985323, "grad_norm": 1.4693671464920044, "learning_rate": 1.9087930679915023e-05, "loss": 2.9852, "step": 2899 }, { "epoch": 0.8031016338964276, "grad_norm": 4.712059497833252, "learning_rate": 1.9036119058498637e-05, "loss": 4.4702, "step": 2900 }, { "epoch": 0.8033785654943228, "grad_norm": 1.3639471530914307, "learning_rate": 1.89843704525389e-05, "loss": 2.7426, "step": 2901 }, { "epoch": 0.8036554970922182, "grad_norm": 0.9283391237258911, "learning_rate": 1.893268490231249e-05, "loss": 2.9628, "step": 2902 }, { "epoch": 0.8039324286901135, "grad_norm": 1.0291162729263306, "learning_rate": 1.8881062448047104e-05, "loss": 2.773, "step": 2903 }, { "epoch": 0.8042093602880088, "grad_norm": 1.4412510395050049, "learning_rate": 1.882950312992131e-05, "loss": 2.6427, "step": 2904 }, { "epoch": 0.8044862918859041, "grad_norm": 1.680640697479248, "learning_rate": 1.877800698806452e-05, "loss": 2.6781, "step": 2905 }, { "epoch": 0.8047632234837995, "grad_norm": 1.6135386228561401, "learning_rate": 1.8726574062557012e-05, "loss": 3.0742, "step": 2906 }, { "epoch": 0.8050401550816948, "grad_norm": 1.0270028114318848, "learning_rate": 1.8675204393429814e-05, "loss": 2.7521, "step": 2907 }, { "epoch": 0.8053170866795901, "grad_norm": 1.7050650119781494, "learning_rate": 1.8623898020664786e-05, "loss": 2.9526, "step": 2908 }, { "epoch": 0.8055940182774854, "grad_norm": 1.2581156492233276, "learning_rate": 1.8572654984194392e-05, "loss": 2.8206, "step": 2909 }, { "epoch": 0.8058709498753808, "grad_norm": 1.4937968254089355, "learning_rate": 1.8521475323901994e-05, "loss": 2.7404, "step": 2910 }, { "epoch": 0.8061478814732761, "grad_norm": 2.0960092544555664, "learning_rate": 1.847035907962146e-05, "loss": 2.7561, "step": 2911 }, { "epoch": 0.8064248130711714, "grad_norm": 1.0080113410949707, "learning_rate": 1.8419306291137372e-05, "loss": 2.8846, "step": 2912 }, { "epoch": 0.8067017446690667, "grad_norm": 1.9581778049468994, "learning_rate": 1.836831699818492e-05, "loss": 2.7467, "step": 2913 }, { "epoch": 0.8069786762669621, "grad_norm": 1.0130141973495483, "learning_rate": 1.8317391240449876e-05, "loss": 2.8305, "step": 2914 }, { "epoch": 0.8072556078648574, "grad_norm": 1.2986866235733032, "learning_rate": 1.8266529057568548e-05, "loss": 2.6439, "step": 2915 }, { "epoch": 0.8075325394627527, "grad_norm": 1.231287956237793, "learning_rate": 1.821573048912778e-05, "loss": 2.935, "step": 2916 }, { "epoch": 0.807809471060648, "grad_norm": 0.8357569575309753, "learning_rate": 1.81649955746649e-05, "loss": 2.8332, "step": 2917 }, { "epoch": 0.8080864026585434, "grad_norm": 1.57582426071167, "learning_rate": 1.8114324353667632e-05, "loss": 2.7864, "step": 2918 }, { "epoch": 0.8083633342564387, "grad_norm": 1.5003948211669922, "learning_rate": 1.8063716865574266e-05, "loss": 2.5028, "step": 2919 }, { "epoch": 0.808640265854334, "grad_norm": 1.723107099533081, "learning_rate": 1.8013173149773323e-05, "loss": 2.9521, "step": 2920 }, { "epoch": 0.8089171974522293, "grad_norm": 1.3525755405426025, "learning_rate": 1.796269324560381e-05, "loss": 2.8175, "step": 2921 }, { "epoch": 0.8091941290501247, "grad_norm": 1.3799983263015747, "learning_rate": 1.791227719235502e-05, "loss": 2.5512, "step": 2922 }, { "epoch": 0.80947106064802, "grad_norm": 1.5028530359268188, "learning_rate": 1.7861925029266592e-05, "loss": 3.0429, "step": 2923 }, { "epoch": 0.8097479922459152, "grad_norm": 1.3339258432388306, "learning_rate": 1.781163679552831e-05, "loss": 3.0033, "step": 2924 }, { "epoch": 0.8100249238438105, "grad_norm": 4.6099677085876465, "learning_rate": 1.7761412530280398e-05, "loss": 4.3239, "step": 2925 }, { "epoch": 0.8103018554417059, "grad_norm": 2.1620473861694336, "learning_rate": 1.7711252272613122e-05, "loss": 3.0301, "step": 2926 }, { "epoch": 0.8105787870396012, "grad_norm": 1.1942365169525146, "learning_rate": 1.7661156061566985e-05, "loss": 2.5074, "step": 2927 }, { "epoch": 0.8108557186374965, "grad_norm": 1.6597949266433716, "learning_rate": 1.7611123936132734e-05, "loss": 2.7833, "step": 2928 }, { "epoch": 0.8111326502353918, "grad_norm": 1.9225363731384277, "learning_rate": 1.7561155935251094e-05, "loss": 2.7084, "step": 2929 }, { "epoch": 0.8114095818332872, "grad_norm": 1.8110785484313965, "learning_rate": 1.7511252097812947e-05, "loss": 2.7287, "step": 2930 }, { "epoch": 0.8116865134311825, "grad_norm": 1.7865407466888428, "learning_rate": 1.7461412462659233e-05, "loss": 2.9899, "step": 2931 }, { "epoch": 0.8119634450290778, "grad_norm": 1.4619214534759521, "learning_rate": 1.7411637068580953e-05, "loss": 2.5232, "step": 2932 }, { "epoch": 0.8122403766269731, "grad_norm": 1.4430086612701416, "learning_rate": 1.7361925954319003e-05, "loss": 2.8457, "step": 2933 }, { "epoch": 0.8125173082248685, "grad_norm": 1.6865593194961548, "learning_rate": 1.7312279158564415e-05, "loss": 2.3381, "step": 2934 }, { "epoch": 0.8127942398227638, "grad_norm": 0.8854038715362549, "learning_rate": 1.7262696719958004e-05, "loss": 2.8799, "step": 2935 }, { "epoch": 0.8130711714206591, "grad_norm": 1.9693397283554077, "learning_rate": 1.721317867709057e-05, "loss": 2.6524, "step": 2936 }, { "epoch": 0.8133481030185544, "grad_norm": 0.8433536291122437, "learning_rate": 1.7163725068502788e-05, "loss": 2.5193, "step": 2937 }, { "epoch": 0.8136250346164497, "grad_norm": 0.9649327397346497, "learning_rate": 1.7114335932685165e-05, "loss": 2.8103, "step": 2938 }, { "epoch": 0.8139019662143451, "grad_norm": 1.7423204183578491, "learning_rate": 1.706501130807806e-05, "loss": 3.0139, "step": 2939 }, { "epoch": 0.8141788978122404, "grad_norm": 1.8019976615905762, "learning_rate": 1.7015751233071585e-05, "loss": 2.6158, "step": 2940 }, { "epoch": 0.8144558294101357, "grad_norm": 1.1565155982971191, "learning_rate": 1.6966555746005662e-05, "loss": 2.5206, "step": 2941 }, { "epoch": 0.814732761008031, "grad_norm": 1.6771178245544434, "learning_rate": 1.691742488516983e-05, "loss": 2.6546, "step": 2942 }, { "epoch": 0.8150096926059264, "grad_norm": 1.364692211151123, "learning_rate": 1.6868358688803497e-05, "loss": 2.8094, "step": 2943 }, { "epoch": 0.8152866242038217, "grad_norm": 0.8680933117866516, "learning_rate": 1.6819357195095597e-05, "loss": 2.4831, "step": 2944 }, { "epoch": 0.815563555801717, "grad_norm": 1.2125277519226074, "learning_rate": 1.6770420442184764e-05, "loss": 2.6179, "step": 2945 }, { "epoch": 0.8158404873996123, "grad_norm": 1.1408050060272217, "learning_rate": 1.6721548468159264e-05, "loss": 2.6832, "step": 2946 }, { "epoch": 0.8161174189975077, "grad_norm": 1.2853754758834839, "learning_rate": 1.6672741311056894e-05, "loss": 2.6696, "step": 2947 }, { "epoch": 0.8163943505954029, "grad_norm": 1.2306122779846191, "learning_rate": 1.6623999008865053e-05, "loss": 2.7143, "step": 2948 }, { "epoch": 0.8166712821932982, "grad_norm": 2.095325469970703, "learning_rate": 1.657532159952062e-05, "loss": 3.2119, "step": 2949 }, { "epoch": 0.8169482137911935, "grad_norm": 3.535750150680542, "learning_rate": 1.652670912091001e-05, "loss": 3.7123, "step": 2950 }, { "epoch": 0.8172251453890889, "grad_norm": 1.0357232093811035, "learning_rate": 1.6478161610869026e-05, "loss": 2.7336, "step": 2951 }, { "epoch": 0.8175020769869842, "grad_norm": 0.9846844673156738, "learning_rate": 1.6429679107183016e-05, "loss": 2.5957, "step": 2952 }, { "epoch": 0.8177790085848795, "grad_norm": 1.018471121788025, "learning_rate": 1.638126164758663e-05, "loss": 2.674, "step": 2953 }, { "epoch": 0.8180559401827748, "grad_norm": 1.5538946390151978, "learning_rate": 1.6332909269763953e-05, "loss": 2.734, "step": 2954 }, { "epoch": 0.8183328717806702, "grad_norm": 1.7519913911819458, "learning_rate": 1.628462201134838e-05, "loss": 3.0673, "step": 2955 }, { "epoch": 0.8186098033785655, "grad_norm": 1.0945311784744263, "learning_rate": 1.6236399909922684e-05, "loss": 2.7443, "step": 2956 }, { "epoch": 0.8188867349764608, "grad_norm": 1.7660750150680542, "learning_rate": 1.6188243003018798e-05, "loss": 2.7689, "step": 2957 }, { "epoch": 0.8191636665743561, "grad_norm": 1.089036226272583, "learning_rate": 1.6140151328118115e-05, "loss": 2.5295, "step": 2958 }, { "epoch": 0.8194405981722515, "grad_norm": 1.1123430728912354, "learning_rate": 1.609212492265103e-05, "loss": 2.7192, "step": 2959 }, { "epoch": 0.8197175297701468, "grad_norm": 1.407126784324646, "learning_rate": 1.6044163823997283e-05, "loss": 2.821, "step": 2960 }, { "epoch": 0.8199944613680421, "grad_norm": 1.7229863405227661, "learning_rate": 1.59962680694858e-05, "loss": 2.6649, "step": 2961 }, { "epoch": 0.8202713929659374, "grad_norm": 1.345894455909729, "learning_rate": 1.5948437696394513e-05, "loss": 2.77, "step": 2962 }, { "epoch": 0.8205483245638328, "grad_norm": 1.4527201652526855, "learning_rate": 1.5900672741950607e-05, "loss": 2.879, "step": 2963 }, { "epoch": 0.8208252561617281, "grad_norm": 1.544024109840393, "learning_rate": 1.585297324333027e-05, "loss": 2.778, "step": 2964 }, { "epoch": 0.8211021877596234, "grad_norm": 1.425732970237732, "learning_rate": 1.5805339237658813e-05, "loss": 2.8205, "step": 2965 }, { "epoch": 0.8213791193575187, "grad_norm": 1.687116265296936, "learning_rate": 1.5757770762010438e-05, "loss": 2.9889, "step": 2966 }, { "epoch": 0.821656050955414, "grad_norm": 0.8597702383995056, "learning_rate": 1.5710267853408534e-05, "loss": 2.5228, "step": 2967 }, { "epoch": 0.8219329825533094, "grad_norm": 1.3650617599487305, "learning_rate": 1.566283054882528e-05, "loss": 2.4924, "step": 2968 }, { "epoch": 0.8222099141512047, "grad_norm": 1.316955327987671, "learning_rate": 1.561545888518192e-05, "loss": 2.7898, "step": 2969 }, { "epoch": 0.8224868457491, "grad_norm": 1.252253770828247, "learning_rate": 1.556815289934854e-05, "loss": 2.5309, "step": 2970 }, { "epoch": 0.8227637773469952, "grad_norm": 1.389169692993164, "learning_rate": 1.5520912628144145e-05, "loss": 2.5478, "step": 2971 }, { "epoch": 0.8230407089448906, "grad_norm": 1.2352834939956665, "learning_rate": 1.5473738108336566e-05, "loss": 2.7469, "step": 2972 }, { "epoch": 0.8233176405427859, "grad_norm": 1.3856765031814575, "learning_rate": 1.5426629376642475e-05, "loss": 2.6795, "step": 2973 }, { "epoch": 0.8235945721406812, "grad_norm": 2.619903802871704, "learning_rate": 1.537958646972737e-05, "loss": 3.0275, "step": 2974 }, { "epoch": 0.8238715037385765, "grad_norm": 3.9109251499176025, "learning_rate": 1.533260942420539e-05, "loss": 4.043, "step": 2975 }, { "epoch": 0.8241484353364719, "grad_norm": 1.729552984237671, "learning_rate": 1.5285698276639615e-05, "loss": 2.5781, "step": 2976 }, { "epoch": 0.8244253669343672, "grad_norm": 1.358718991279602, "learning_rate": 1.5238853063541658e-05, "loss": 2.4212, "step": 2977 }, { "epoch": 0.8247022985322625, "grad_norm": 0.9492407441139221, "learning_rate": 1.5192073821371889e-05, "loss": 2.7406, "step": 2978 }, { "epoch": 0.8249792301301578, "grad_norm": 1.7622709274291992, "learning_rate": 1.5145360586539336e-05, "loss": 2.8144, "step": 2979 }, { "epoch": 0.8252561617280532, "grad_norm": 1.3219594955444336, "learning_rate": 1.5098713395401643e-05, "loss": 2.7726, "step": 2980 }, { "epoch": 0.8255330933259485, "grad_norm": 1.4270638227462769, "learning_rate": 1.5052132284265042e-05, "loss": 2.5905, "step": 2981 }, { "epoch": 0.8258100249238438, "grad_norm": 1.323522925376892, "learning_rate": 1.5005617289384343e-05, "loss": 2.7863, "step": 2982 }, { "epoch": 0.8260869565217391, "grad_norm": 1.531290054321289, "learning_rate": 1.4959168446962935e-05, "loss": 2.9003, "step": 2983 }, { "epoch": 0.8263638881196345, "grad_norm": 1.3229002952575684, "learning_rate": 1.4912785793152583e-05, "loss": 2.7109, "step": 2984 }, { "epoch": 0.8266408197175298, "grad_norm": 1.4431756734848022, "learning_rate": 1.4866469364053749e-05, "loss": 2.8354, "step": 2985 }, { "epoch": 0.8269177513154251, "grad_norm": 1.6073932647705078, "learning_rate": 1.4820219195715146e-05, "loss": 2.4679, "step": 2986 }, { "epoch": 0.8271946829133204, "grad_norm": 1.714503526687622, "learning_rate": 1.4774035324134039e-05, "loss": 2.5459, "step": 2987 }, { "epoch": 0.8274716145112158, "grad_norm": 1.2907015085220337, "learning_rate": 1.4727917785256051e-05, "loss": 2.8861, "step": 2988 }, { "epoch": 0.8277485461091111, "grad_norm": 1.1760333776474, "learning_rate": 1.4681866614975227e-05, "loss": 2.6618, "step": 2989 }, { "epoch": 0.8280254777070064, "grad_norm": 1.5860620737075806, "learning_rate": 1.4635881849133826e-05, "loss": 2.8644, "step": 2990 }, { "epoch": 0.8283024093049017, "grad_norm": 1.7209746837615967, "learning_rate": 1.4589963523522577e-05, "loss": 2.994, "step": 2991 }, { "epoch": 0.8285793409027971, "grad_norm": 1.2861586809158325, "learning_rate": 1.4544111673880445e-05, "loss": 2.7188, "step": 2992 }, { "epoch": 0.8288562725006924, "grad_norm": 0.8272273540496826, "learning_rate": 1.4498326335894574e-05, "loss": 2.7651, "step": 2993 }, { "epoch": 0.8291332040985876, "grad_norm": 1.729760766029358, "learning_rate": 1.4452607545200492e-05, "loss": 3.0048, "step": 2994 }, { "epoch": 0.8294101356964829, "grad_norm": 1.0698480606079102, "learning_rate": 1.440695533738179e-05, "loss": 2.8259, "step": 2995 }, { "epoch": 0.8296870672943782, "grad_norm": 1.9651565551757812, "learning_rate": 1.4361369747970311e-05, "loss": 3.0114, "step": 2996 }, { "epoch": 0.8299639988922736, "grad_norm": 0.9492764472961426, "learning_rate": 1.4315850812446042e-05, "loss": 2.6462, "step": 2997 }, { "epoch": 0.8302409304901689, "grad_norm": 1.9277658462524414, "learning_rate": 1.4270398566237098e-05, "loss": 3.0011, "step": 2998 }, { "epoch": 0.8305178620880642, "grad_norm": 1.873567819595337, "learning_rate": 1.4225013044719615e-05, "loss": 3.2477, "step": 2999 }, { "epoch": 0.8307947936859595, "grad_norm": 3.1596426963806152, "learning_rate": 1.4179694283217937e-05, "loss": 3.7095, "step": 3000 }, { "epoch": 0.8310717252838549, "grad_norm": 1.6815263032913208, "learning_rate": 1.4134442317004304e-05, "loss": 2.6958, "step": 3001 }, { "epoch": 0.8313486568817502, "grad_norm": 2.4771888256073, "learning_rate": 1.4089257181299042e-05, "loss": 3.3317, "step": 3002 }, { "epoch": 0.8316255884796455, "grad_norm": 1.9468337297439575, "learning_rate": 1.4044138911270466e-05, "loss": 2.4712, "step": 3003 }, { "epoch": 0.8319025200775408, "grad_norm": 2.0584003925323486, "learning_rate": 1.3999087542034817e-05, "loss": 2.9929, "step": 3004 }, { "epoch": 0.8321794516754362, "grad_norm": 0.7015234231948853, "learning_rate": 1.3954103108656291e-05, "loss": 2.6684, "step": 3005 }, { "epoch": 0.8324563832733315, "grad_norm": 1.70437753200531, "learning_rate": 1.3909185646146983e-05, "loss": 2.883, "step": 3006 }, { "epoch": 0.8327333148712268, "grad_norm": 1.496725082397461, "learning_rate": 1.386433518946686e-05, "loss": 2.7889, "step": 3007 }, { "epoch": 0.8330102464691221, "grad_norm": 1.208085298538208, "learning_rate": 1.3819551773523688e-05, "loss": 2.6361, "step": 3008 }, { "epoch": 0.8332871780670175, "grad_norm": 0.9877991676330566, "learning_rate": 1.3774835433173172e-05, "loss": 2.6832, "step": 3009 }, { "epoch": 0.8335641096649128, "grad_norm": 1.1782431602478027, "learning_rate": 1.3730186203218697e-05, "loss": 2.543, "step": 3010 }, { "epoch": 0.8338410412628081, "grad_norm": 0.740976870059967, "learning_rate": 1.3685604118411455e-05, "loss": 2.6201, "step": 3011 }, { "epoch": 0.8341179728607034, "grad_norm": 1.768426775932312, "learning_rate": 1.3641089213450397e-05, "loss": 2.7182, "step": 3012 }, { "epoch": 0.8343949044585988, "grad_norm": 1.4269012212753296, "learning_rate": 1.3596641522982157e-05, "loss": 3.1137, "step": 3013 }, { "epoch": 0.8346718360564941, "grad_norm": 1.229787826538086, "learning_rate": 1.3552261081601091e-05, "loss": 2.8571, "step": 3014 }, { "epoch": 0.8349487676543894, "grad_norm": 1.51630437374115, "learning_rate": 1.3507947923849163e-05, "loss": 2.8817, "step": 3015 }, { "epoch": 0.8352256992522847, "grad_norm": 1.804336428642273, "learning_rate": 1.3463702084216046e-05, "loss": 2.7904, "step": 3016 }, { "epoch": 0.83550263085018, "grad_norm": 1.476218819618225, "learning_rate": 1.3419523597138884e-05, "loss": 2.6541, "step": 3017 }, { "epoch": 0.8357795624480753, "grad_norm": 0.7786409854888916, "learning_rate": 1.3375412497002593e-05, "loss": 2.7245, "step": 3018 }, { "epoch": 0.8360564940459706, "grad_norm": 0.9623320698738098, "learning_rate": 1.3331368818139445e-05, "loss": 2.3649, "step": 3019 }, { "epoch": 0.8363334256438659, "grad_norm": 0.8559485673904419, "learning_rate": 1.3287392594829384e-05, "loss": 2.4915, "step": 3020 }, { "epoch": 0.8366103572417612, "grad_norm": 1.2838448286056519, "learning_rate": 1.3243483861299721e-05, "loss": 2.7953, "step": 3021 }, { "epoch": 0.8368872888396566, "grad_norm": 1.485473871231079, "learning_rate": 1.3199642651725408e-05, "loss": 2.5919, "step": 3022 }, { "epoch": 0.8371642204375519, "grad_norm": 1.3391149044036865, "learning_rate": 1.315586900022867e-05, "loss": 2.6176, "step": 3023 }, { "epoch": 0.8374411520354472, "grad_norm": 1.5078611373901367, "learning_rate": 1.3112162940879225e-05, "loss": 2.8531, "step": 3024 }, { "epoch": 0.8377180836333425, "grad_norm": 1.8650858402252197, "learning_rate": 1.3068524507694258e-05, "loss": 3.1421, "step": 3025 }, { "epoch": 0.8379950152312379, "grad_norm": 1.3443483114242554, "learning_rate": 1.3024953734638168e-05, "loss": 2.6515, "step": 3026 }, { "epoch": 0.8382719468291332, "grad_norm": 1.4175198078155518, "learning_rate": 1.2981450655622796e-05, "loss": 2.8623, "step": 3027 }, { "epoch": 0.8385488784270285, "grad_norm": 1.2949321269989014, "learning_rate": 1.2938015304507279e-05, "loss": 2.8468, "step": 3028 }, { "epoch": 0.8388258100249238, "grad_norm": 1.6689027547836304, "learning_rate": 1.289464771509804e-05, "loss": 2.8973, "step": 3029 }, { "epoch": 0.8391027416228192, "grad_norm": 1.496212124824524, "learning_rate": 1.2851347921148693e-05, "loss": 2.9497, "step": 3030 }, { "epoch": 0.8393796732207145, "grad_norm": 1.646315097808838, "learning_rate": 1.2808115956360233e-05, "loss": 2.9308, "step": 3031 }, { "epoch": 0.8396566048186098, "grad_norm": 1.179675579071045, "learning_rate": 1.2764951854380714e-05, "loss": 2.7608, "step": 3032 }, { "epoch": 0.8399335364165051, "grad_norm": 0.8177387118339539, "learning_rate": 1.2721855648805448e-05, "loss": 2.673, "step": 3033 }, { "epoch": 0.8402104680144005, "grad_norm": 0.940900444984436, "learning_rate": 1.2678827373176894e-05, "loss": 2.6349, "step": 3034 }, { "epoch": 0.8404873996122958, "grad_norm": 2.1662635803222656, "learning_rate": 1.2635867060984619e-05, "loss": 2.5167, "step": 3035 }, { "epoch": 0.8407643312101911, "grad_norm": 1.6904385089874268, "learning_rate": 1.259297474566532e-05, "loss": 2.9469, "step": 3036 }, { "epoch": 0.8410412628080864, "grad_norm": 1.4025242328643799, "learning_rate": 1.2550150460602761e-05, "loss": 2.9757, "step": 3037 }, { "epoch": 0.8413181944059818, "grad_norm": 1.0857107639312744, "learning_rate": 1.2507394239127757e-05, "loss": 2.819, "step": 3038 }, { "epoch": 0.8415951260038771, "grad_norm": 1.5865957736968994, "learning_rate": 1.2464706114518088e-05, "loss": 3.0157, "step": 3039 }, { "epoch": 0.8418720576017724, "grad_norm": 0.9632927775382996, "learning_rate": 1.2422086119998688e-05, "loss": 2.6684, "step": 3040 }, { "epoch": 0.8421489891996676, "grad_norm": 1.4581258296966553, "learning_rate": 1.237953428874129e-05, "loss": 2.651, "step": 3041 }, { "epoch": 0.842425920797563, "grad_norm": 1.2029368877410889, "learning_rate": 1.2337050653864679e-05, "loss": 2.6975, "step": 3042 }, { "epoch": 0.8427028523954583, "grad_norm": 1.4727216958999634, "learning_rate": 1.2294635248434528e-05, "loss": 3.2892, "step": 3043 }, { "epoch": 0.8429797839933536, "grad_norm": 1.3219698667526245, "learning_rate": 1.2252288105463405e-05, "loss": 2.426, "step": 3044 }, { "epoch": 0.8432567155912489, "grad_norm": 1.317419409751892, "learning_rate": 1.2210009257910771e-05, "loss": 2.6523, "step": 3045 }, { "epoch": 0.8435336471891443, "grad_norm": 1.3205360174179077, "learning_rate": 1.21677987386829e-05, "loss": 2.7689, "step": 3046 }, { "epoch": 0.8438105787870396, "grad_norm": 1.6884167194366455, "learning_rate": 1.2125656580632938e-05, "loss": 2.8025, "step": 3047 }, { "epoch": 0.8440875103849349, "grad_norm": 1.3840197324752808, "learning_rate": 1.2083582816560701e-05, "loss": 2.8889, "step": 3048 }, { "epoch": 0.8443644419828302, "grad_norm": 2.7933177947998047, "learning_rate": 1.2041577479212963e-05, "loss": 3.0994, "step": 3049 }, { "epoch": 0.8446413735807256, "grad_norm": 3.1562044620513916, "learning_rate": 1.1999640601283069e-05, "loss": 3.9769, "step": 3050 }, { "epoch": 0.8449183051786209, "grad_norm": 1.0793648958206177, "learning_rate": 1.1957772215411156e-05, "loss": 2.9159, "step": 3051 }, { "epoch": 0.8451952367765162, "grad_norm": 1.145857334136963, "learning_rate": 1.1915972354184058e-05, "loss": 2.5923, "step": 3052 }, { "epoch": 0.8454721683744115, "grad_norm": 2.3265607357025146, "learning_rate": 1.1874241050135282e-05, "loss": 2.9068, "step": 3053 }, { "epoch": 0.8457490999723069, "grad_norm": 1.6332247257232666, "learning_rate": 1.1832578335744882e-05, "loss": 2.8704, "step": 3054 }, { "epoch": 0.8460260315702022, "grad_norm": 1.4872264862060547, "learning_rate": 1.1790984243439674e-05, "loss": 3.0174, "step": 3055 }, { "epoch": 0.8463029631680975, "grad_norm": 1.0704503059387207, "learning_rate": 1.1749458805592983e-05, "loss": 2.6838, "step": 3056 }, { "epoch": 0.8465798947659928, "grad_norm": 1.7933567762374878, "learning_rate": 1.1708002054524647e-05, "loss": 2.9514, "step": 3057 }, { "epoch": 0.8468568263638881, "grad_norm": 1.73945152759552, "learning_rate": 1.16666140225012e-05, "loss": 2.7772, "step": 3058 }, { "epoch": 0.8471337579617835, "grad_norm": 1.2374337911605835, "learning_rate": 1.1625294741735526e-05, "loss": 2.9232, "step": 3059 }, { "epoch": 0.8474106895596788, "grad_norm": 1.5635979175567627, "learning_rate": 1.1584044244387083e-05, "loss": 2.8568, "step": 3060 }, { "epoch": 0.8476876211575741, "grad_norm": 1.1509644985198975, "learning_rate": 1.1542862562561784e-05, "loss": 2.7165, "step": 3061 }, { "epoch": 0.8479645527554694, "grad_norm": 1.016371250152588, "learning_rate": 1.1501749728311994e-05, "loss": 2.7675, "step": 3062 }, { "epoch": 0.8482414843533648, "grad_norm": 1.336298942565918, "learning_rate": 1.1460705773636427e-05, "loss": 2.7798, "step": 3063 }, { "epoch": 0.84851841595126, "grad_norm": 1.0788416862487793, "learning_rate": 1.1419730730480305e-05, "loss": 2.6508, "step": 3064 }, { "epoch": 0.8487953475491553, "grad_norm": 1.292082667350769, "learning_rate": 1.1378824630735086e-05, "loss": 2.7736, "step": 3065 }, { "epoch": 0.8490722791470506, "grad_norm": 0.9430363774299622, "learning_rate": 1.1337987506238645e-05, "loss": 2.7468, "step": 3066 }, { "epoch": 0.849349210744946, "grad_norm": 1.2281850576400757, "learning_rate": 1.129721938877516e-05, "loss": 2.9235, "step": 3067 }, { "epoch": 0.8496261423428413, "grad_norm": 1.5270968675613403, "learning_rate": 1.1256520310075102e-05, "loss": 3.0424, "step": 3068 }, { "epoch": 0.8499030739407366, "grad_norm": 1.4194164276123047, "learning_rate": 1.1215890301815201e-05, "loss": 2.8131, "step": 3069 }, { "epoch": 0.8501800055386319, "grad_norm": 0.9814782738685608, "learning_rate": 1.1175329395618417e-05, "loss": 2.6454, "step": 3070 }, { "epoch": 0.8504569371365273, "grad_norm": 1.30820894241333, "learning_rate": 1.1134837623053961e-05, "loss": 2.7237, "step": 3071 }, { "epoch": 0.8507338687344226, "grad_norm": 1.776144027709961, "learning_rate": 1.1094415015637161e-05, "loss": 2.6911, "step": 3072 }, { "epoch": 0.8510108003323179, "grad_norm": 1.0699905157089233, "learning_rate": 1.1054061604829636e-05, "loss": 2.6947, "step": 3073 }, { "epoch": 0.8512877319302132, "grad_norm": 0.7591848969459534, "learning_rate": 1.101377742203903e-05, "loss": 2.8729, "step": 3074 }, { "epoch": 0.8515646635281086, "grad_norm": 2.7936182022094727, "learning_rate": 1.0973562498619172e-05, "loss": 3.5067, "step": 3075 }, { "epoch": 0.8518415951260039, "grad_norm": 0.8582979440689087, "learning_rate": 1.093341686586995e-05, "loss": 2.8096, "step": 3076 }, { "epoch": 0.8521185267238992, "grad_norm": 1.1236815452575684, "learning_rate": 1.0893340555037357e-05, "loss": 2.8091, "step": 3077 }, { "epoch": 0.8523954583217945, "grad_norm": 0.9093992114067078, "learning_rate": 1.085333359731341e-05, "loss": 2.7521, "step": 3078 }, { "epoch": 0.8526723899196899, "grad_norm": 1.0349127054214478, "learning_rate": 1.0813396023836142e-05, "loss": 2.7961, "step": 3079 }, { "epoch": 0.8529493215175852, "grad_norm": 1.7463757991790771, "learning_rate": 1.0773527865689625e-05, "loss": 2.8264, "step": 3080 }, { "epoch": 0.8532262531154805, "grad_norm": 1.2333402633666992, "learning_rate": 1.0733729153903794e-05, "loss": 2.744, "step": 3081 }, { "epoch": 0.8535031847133758, "grad_norm": 1.2324150800704956, "learning_rate": 1.06939999194547e-05, "loss": 2.5561, "step": 3082 }, { "epoch": 0.8537801163112712, "grad_norm": 1.3192819356918335, "learning_rate": 1.0654340193264179e-05, "loss": 2.8713, "step": 3083 }, { "epoch": 0.8540570479091665, "grad_norm": 1.1821292638778687, "learning_rate": 1.0614750006200014e-05, "loss": 2.6635, "step": 3084 }, { "epoch": 0.8543339795070618, "grad_norm": 1.5261622667312622, "learning_rate": 1.0575229389075891e-05, "loss": 2.5562, "step": 3085 }, { "epoch": 0.8546109111049571, "grad_norm": 0.770987868309021, "learning_rate": 1.0535778372651317e-05, "loss": 2.7767, "step": 3086 }, { "epoch": 0.8548878427028523, "grad_norm": 1.4389278888702393, "learning_rate": 1.0496396987631606e-05, "loss": 3.2229, "step": 3087 }, { "epoch": 0.8551647743007477, "grad_norm": 1.6177245378494263, "learning_rate": 1.0457085264667954e-05, "loss": 3.2858, "step": 3088 }, { "epoch": 0.855441705898643, "grad_norm": 1.5438570976257324, "learning_rate": 1.0417843234357282e-05, "loss": 2.7003, "step": 3089 }, { "epoch": 0.8557186374965383, "grad_norm": 1.566727638244629, "learning_rate": 1.0378670927242228e-05, "loss": 3.1135, "step": 3090 }, { "epoch": 0.8559955690944336, "grad_norm": 1.2584772109985352, "learning_rate": 1.0339568373811292e-05, "loss": 2.7638, "step": 3091 }, { "epoch": 0.856272500692329, "grad_norm": 1.2752087116241455, "learning_rate": 1.030053560449854e-05, "loss": 2.704, "step": 3092 }, { "epoch": 0.8565494322902243, "grad_norm": 1.7779330015182495, "learning_rate": 1.0261572649683803e-05, "loss": 2.5948, "step": 3093 }, { "epoch": 0.8568263638881196, "grad_norm": 1.5309356451034546, "learning_rate": 1.022267953969257e-05, "loss": 2.9297, "step": 3094 }, { "epoch": 0.8571032954860149, "grad_norm": 1.2379956245422363, "learning_rate": 1.0183856304795969e-05, "loss": 2.6137, "step": 3095 }, { "epoch": 0.8573802270839103, "grad_norm": 0.926654040813446, "learning_rate": 1.0145102975210675e-05, "loss": 2.8791, "step": 3096 }, { "epoch": 0.8576571586818056, "grad_norm": 1.6225202083587646, "learning_rate": 1.0106419581099092e-05, "loss": 2.8463, "step": 3097 }, { "epoch": 0.8579340902797009, "grad_norm": 1.2629817724227905, "learning_rate": 1.0067806152569048e-05, "loss": 3.2293, "step": 3098 }, { "epoch": 0.8582110218775962, "grad_norm": 1.8424757719039917, "learning_rate": 1.0029262719674015e-05, "loss": 2.7858, "step": 3099 }, { "epoch": 0.8584879534754916, "grad_norm": 3.2015435695648193, "learning_rate": 9.99078931241293e-06, "loss": 3.8198, "step": 3100 }, { "epoch": 0.8587648850733869, "grad_norm": 0.9091971516609192, "learning_rate": 9.952385960730248e-06, "loss": 2.6804, "step": 3101 }, { "epoch": 0.8590418166712822, "grad_norm": 1.7424923181533813, "learning_rate": 9.914052694515908e-06, "loss": 2.8593, "step": 3102 }, { "epoch": 0.8593187482691775, "grad_norm": 1.7614586353302002, "learning_rate": 9.875789543605296e-06, "loss": 2.7757, "step": 3103 }, { "epoch": 0.8595956798670729, "grad_norm": 1.0697795152664185, "learning_rate": 9.837596537779237e-06, "loss": 2.7479, "step": 3104 }, { "epoch": 0.8598726114649682, "grad_norm": 0.9285095930099487, "learning_rate": 9.799473706763884e-06, "loss": 2.6441, "step": 3105 }, { "epoch": 0.8601495430628635, "grad_norm": 1.5014824867248535, "learning_rate": 9.761421080230915e-06, "loss": 2.9274, "step": 3106 }, { "epoch": 0.8604264746607588, "grad_norm": 1.2792730331420898, "learning_rate": 9.723438687797226e-06, "loss": 2.7602, "step": 3107 }, { "epoch": 0.8607034062586542, "grad_norm": 1.130605936050415, "learning_rate": 9.685526559025115e-06, "loss": 2.5661, "step": 3108 }, { "epoch": 0.8609803378565495, "grad_norm": 1.2102701663970947, "learning_rate": 9.647684723422213e-06, "loss": 2.8934, "step": 3109 }, { "epoch": 0.8612572694544448, "grad_norm": 1.3663723468780518, "learning_rate": 9.60991321044139e-06, "loss": 2.5983, "step": 3110 }, { "epoch": 0.86153420105234, "grad_norm": 1.541733980178833, "learning_rate": 9.572212049480811e-06, "loss": 2.7917, "step": 3111 }, { "epoch": 0.8618111326502353, "grad_norm": 1.1841416358947754, "learning_rate": 9.53458126988388e-06, "loss": 2.6935, "step": 3112 }, { "epoch": 0.8620880642481307, "grad_norm": 1.7382968664169312, "learning_rate": 9.497020900939246e-06, "loss": 2.7467, "step": 3113 }, { "epoch": 0.862364995846026, "grad_norm": 1.642300009727478, "learning_rate": 9.459530971880681e-06, "loss": 2.6757, "step": 3114 }, { "epoch": 0.8626419274439213, "grad_norm": 2.0042853355407715, "learning_rate": 9.422111511887244e-06, "loss": 3.1407, "step": 3115 }, { "epoch": 0.8629188590418166, "grad_norm": 1.2511892318725586, "learning_rate": 9.384762550083037e-06, "loss": 3.0645, "step": 3116 }, { "epoch": 0.863195790639712, "grad_norm": 1.4200023412704468, "learning_rate": 9.347484115537364e-06, "loss": 2.8308, "step": 3117 }, { "epoch": 0.8634727222376073, "grad_norm": 1.5993705987930298, "learning_rate": 9.310276237264615e-06, "loss": 2.7952, "step": 3118 }, { "epoch": 0.8637496538355026, "grad_norm": 1.4115570783615112, "learning_rate": 9.27313894422428e-06, "loss": 2.5305, "step": 3119 }, { "epoch": 0.8640265854333979, "grad_norm": 0.9652894139289856, "learning_rate": 9.23607226532084e-06, "loss": 2.5732, "step": 3120 }, { "epoch": 0.8643035170312933, "grad_norm": 1.4319514036178589, "learning_rate": 9.199076229403924e-06, "loss": 2.8168, "step": 3121 }, { "epoch": 0.8645804486291886, "grad_norm": 1.3451462984085083, "learning_rate": 9.162150865268127e-06, "loss": 2.7569, "step": 3122 }, { "epoch": 0.8648573802270839, "grad_norm": 2.326890707015991, "learning_rate": 9.12529620165299e-06, "loss": 3.0164, "step": 3123 }, { "epoch": 0.8651343118249792, "grad_norm": 2.1529388427734375, "learning_rate": 9.088512267243143e-06, "loss": 3.2552, "step": 3124 }, { "epoch": 0.8654112434228746, "grad_norm": 5.216430187225342, "learning_rate": 9.051799090668046e-06, "loss": 4.5724, "step": 3125 }, { "epoch": 0.8656881750207699, "grad_norm": 1.3277052640914917, "learning_rate": 9.015156700502159e-06, "loss": 2.7651, "step": 3126 }, { "epoch": 0.8659651066186652, "grad_norm": 1.582177758216858, "learning_rate": 8.978585125264816e-06, "loss": 2.8223, "step": 3127 }, { "epoch": 0.8662420382165605, "grad_norm": 1.732556700706482, "learning_rate": 8.94208439342028e-06, "loss": 2.858, "step": 3128 }, { "epoch": 0.8665189698144559, "grad_norm": 1.2172104120254517, "learning_rate": 8.905654533377583e-06, "loss": 2.6793, "step": 3129 }, { "epoch": 0.8667959014123512, "grad_norm": 1.3058675527572632, "learning_rate": 8.869295573490733e-06, "loss": 2.6897, "step": 3130 }, { "epoch": 0.8670728330102465, "grad_norm": 1.4295732975006104, "learning_rate": 8.833007542058402e-06, "loss": 2.9477, "step": 3131 }, { "epoch": 0.8673497646081418, "grad_norm": 1.7179218530654907, "learning_rate": 8.796790467324179e-06, "loss": 2.6237, "step": 3132 }, { "epoch": 0.8676266962060372, "grad_norm": 1.7100985050201416, "learning_rate": 8.760644377476368e-06, "loss": 2.709, "step": 3133 }, { "epoch": 0.8679036278039324, "grad_norm": 1.3927521705627441, "learning_rate": 8.724569300648034e-06, "loss": 2.8626, "step": 3134 }, { "epoch": 0.8681805594018277, "grad_norm": 1.7715344429016113, "learning_rate": 8.688565264916982e-06, "loss": 2.8555, "step": 3135 }, { "epoch": 0.868457490999723, "grad_norm": 1.1118872165679932, "learning_rate": 8.652632298305707e-06, "loss": 2.5965, "step": 3136 }, { "epoch": 0.8687344225976183, "grad_norm": 1.287585735321045, "learning_rate": 8.616770428781419e-06, "loss": 3.0108, "step": 3137 }, { "epoch": 0.8690113541955137, "grad_norm": 1.5482505559921265, "learning_rate": 8.58097968425593e-06, "loss": 2.7597, "step": 3138 }, { "epoch": 0.869288285793409, "grad_norm": 1.3987518548965454, "learning_rate": 8.545260092585805e-06, "loss": 2.7233, "step": 3139 }, { "epoch": 0.8695652173913043, "grad_norm": 1.880951166152954, "learning_rate": 8.509611681572106e-06, "loss": 2.9494, "step": 3140 }, { "epoch": 0.8698421489891996, "grad_norm": 1.631243348121643, "learning_rate": 8.474034478960557e-06, "loss": 2.8259, "step": 3141 }, { "epoch": 0.870119080587095, "grad_norm": 1.595776081085205, "learning_rate": 8.438528512441467e-06, "loss": 2.7464, "step": 3142 }, { "epoch": 0.8703960121849903, "grad_norm": 1.6883162260055542, "learning_rate": 8.403093809649675e-06, "loss": 2.6377, "step": 3143 }, { "epoch": 0.8706729437828856, "grad_norm": 1.1031101942062378, "learning_rate": 8.367730398164574e-06, "loss": 2.5506, "step": 3144 }, { "epoch": 0.870949875380781, "grad_norm": 1.3834490776062012, "learning_rate": 8.33243830551006e-06, "loss": 2.6796, "step": 3145 }, { "epoch": 0.8712268069786763, "grad_norm": 1.4311162233352661, "learning_rate": 8.297217559154535e-06, "loss": 2.5472, "step": 3146 }, { "epoch": 0.8715037385765716, "grad_norm": 1.4056702852249146, "learning_rate": 8.262068186510808e-06, "loss": 2.5875, "step": 3147 }, { "epoch": 0.8717806701744669, "grad_norm": 1.9589285850524902, "learning_rate": 8.226990214936247e-06, "loss": 2.8412, "step": 3148 }, { "epoch": 0.8720576017723622, "grad_norm": 2.5369226932525635, "learning_rate": 8.19198367173255e-06, "loss": 3.3846, "step": 3149 }, { "epoch": 0.8723345333702576, "grad_norm": 2.807080030441284, "learning_rate": 8.157048584145865e-06, "loss": 3.6268, "step": 3150 }, { "epoch": 0.8726114649681529, "grad_norm": 1.582241177558899, "learning_rate": 8.12218497936672e-06, "loss": 2.9072, "step": 3151 }, { "epoch": 0.8728883965660482, "grad_norm": 1.4820480346679688, "learning_rate": 8.08739288453001e-06, "loss": 2.7435, "step": 3152 }, { "epoch": 0.8731653281639435, "grad_norm": 1.1095006465911865, "learning_rate": 8.05267232671495e-06, "loss": 2.6025, "step": 3153 }, { "epoch": 0.8734422597618389, "grad_norm": 1.2409456968307495, "learning_rate": 8.018023332945112e-06, "loss": 2.8085, "step": 3154 }, { "epoch": 0.8737191913597342, "grad_norm": 1.187393307685852, "learning_rate": 7.98344593018836e-06, "loss": 2.6978, "step": 3155 }, { "epoch": 0.8739961229576295, "grad_norm": 1.2349612712860107, "learning_rate": 7.948940145356775e-06, "loss": 2.936, "step": 3156 }, { "epoch": 0.8742730545555247, "grad_norm": 1.1934999227523804, "learning_rate": 7.914506005306832e-06, "loss": 2.7566, "step": 3157 }, { "epoch": 0.8745499861534201, "grad_norm": 0.7864140868186951, "learning_rate": 7.880143536839091e-06, "loss": 2.829, "step": 3158 }, { "epoch": 0.8748269177513154, "grad_norm": 1.670947551727295, "learning_rate": 7.845852766698426e-06, "loss": 2.8919, "step": 3159 }, { "epoch": 0.8751038493492107, "grad_norm": 1.2872366905212402, "learning_rate": 7.811633721573885e-06, "loss": 2.9856, "step": 3160 }, { "epoch": 0.875380780947106, "grad_norm": 1.2436797618865967, "learning_rate": 7.777486428098691e-06, "loss": 2.947, "step": 3161 }, { "epoch": 0.8756577125450014, "grad_norm": 1.3402278423309326, "learning_rate": 7.74341091285018e-06, "loss": 2.7753, "step": 3162 }, { "epoch": 0.8759346441428967, "grad_norm": 1.3854641914367676, "learning_rate": 7.709407202349917e-06, "loss": 2.7943, "step": 3163 }, { "epoch": 0.876211575740792, "grad_norm": 1.118638277053833, "learning_rate": 7.675475323063475e-06, "loss": 2.763, "step": 3164 }, { "epoch": 0.8764885073386873, "grad_norm": 1.0722171068191528, "learning_rate": 7.641615301400585e-06, "loss": 2.9056, "step": 3165 }, { "epoch": 0.8767654389365827, "grad_norm": 2.5819013118743896, "learning_rate": 7.607827163715042e-06, "loss": 2.7059, "step": 3166 }, { "epoch": 0.877042370534478, "grad_norm": 1.32217538356781, "learning_rate": 7.574110936304657e-06, "loss": 2.8119, "step": 3167 }, { "epoch": 0.8773193021323733, "grad_norm": 1.9732376337051392, "learning_rate": 7.540466645411326e-06, "loss": 2.4895, "step": 3168 }, { "epoch": 0.8775962337302686, "grad_norm": 0.826056957244873, "learning_rate": 7.5068943172209025e-06, "loss": 2.5382, "step": 3169 }, { "epoch": 0.877873165328164, "grad_norm": 1.436314582824707, "learning_rate": 7.473393977863297e-06, "loss": 2.7361, "step": 3170 }, { "epoch": 0.8781500969260593, "grad_norm": 1.0173486471176147, "learning_rate": 7.439965653412273e-06, "loss": 2.6879, "step": 3171 }, { "epoch": 0.8784270285239546, "grad_norm": 1.8656055927276611, "learning_rate": 7.406609369885708e-06, "loss": 2.7196, "step": 3172 }, { "epoch": 0.8787039601218499, "grad_norm": 1.4444622993469238, "learning_rate": 7.373325153245259e-06, "loss": 2.8648, "step": 3173 }, { "epoch": 0.8789808917197452, "grad_norm": 1.5233651399612427, "learning_rate": 7.340113029396567e-06, "loss": 2.8779, "step": 3174 }, { "epoch": 0.8792578233176406, "grad_norm": 1.651416301727295, "learning_rate": 7.306973024189145e-06, "loss": 3.2433, "step": 3175 }, { "epoch": 0.8795347549155359, "grad_norm": 1.8997817039489746, "learning_rate": 7.273905163416395e-06, "loss": 3.0981, "step": 3176 }, { "epoch": 0.8798116865134312, "grad_norm": 0.88230299949646, "learning_rate": 7.240909472815538e-06, "loss": 2.6861, "step": 3177 }, { "epoch": 0.8800886181113265, "grad_norm": 2.475217819213867, "learning_rate": 7.207985978067644e-06, "loss": 2.6416, "step": 3178 }, { "epoch": 0.8803655497092219, "grad_norm": 1.2994290590286255, "learning_rate": 7.175134704797592e-06, "loss": 2.7834, "step": 3179 }, { "epoch": 0.8806424813071172, "grad_norm": 1.7192754745483398, "learning_rate": 7.142355678574009e-06, "loss": 2.8564, "step": 3180 }, { "epoch": 0.8809194129050124, "grad_norm": 1.0855594873428345, "learning_rate": 7.109648924909373e-06, "loss": 2.6875, "step": 3181 }, { "epoch": 0.8811963445029077, "grad_norm": 1.334720492362976, "learning_rate": 7.077014469259813e-06, "loss": 2.6114, "step": 3182 }, { "epoch": 0.8814732761008031, "grad_norm": 1.205359935760498, "learning_rate": 7.044452337025265e-06, "loss": 2.7585, "step": 3183 }, { "epoch": 0.8817502076986984, "grad_norm": 1.6648601293563843, "learning_rate": 7.011962553549345e-06, "loss": 2.8264, "step": 3184 }, { "epoch": 0.8820271392965937, "grad_norm": 0.624901294708252, "learning_rate": 6.979545144119348e-06, "loss": 2.7539, "step": 3185 }, { "epoch": 0.882304070894489, "grad_norm": 1.6340166330337524, "learning_rate": 6.9472001339662695e-06, "loss": 2.8868, "step": 3186 }, { "epoch": 0.8825810024923844, "grad_norm": 1.4930720329284668, "learning_rate": 6.914927548264727e-06, "loss": 2.7927, "step": 3187 }, { "epoch": 0.8828579340902797, "grad_norm": 1.087523102760315, "learning_rate": 6.8827274121329944e-06, "loss": 2.7179, "step": 3188 }, { "epoch": 0.883134865688175, "grad_norm": 1.7288202047348022, "learning_rate": 6.8505997506329024e-06, "loss": 2.9288, "step": 3189 }, { "epoch": 0.8834117972860703, "grad_norm": 1.4685242176055908, "learning_rate": 6.818544588769959e-06, "loss": 2.745, "step": 3190 }, { "epoch": 0.8836887288839657, "grad_norm": 1.3567365407943726, "learning_rate": 6.786561951493176e-06, "loss": 2.9668, "step": 3191 }, { "epoch": 0.883965660481861, "grad_norm": 1.338721752166748, "learning_rate": 6.754651863695138e-06, "loss": 2.517, "step": 3192 }, { "epoch": 0.8842425920797563, "grad_norm": 1.0524381399154663, "learning_rate": 6.72281435021197e-06, "loss": 2.6162, "step": 3193 }, { "epoch": 0.8845195236776516, "grad_norm": 2.0696678161621094, "learning_rate": 6.691049435823327e-06, "loss": 2.6048, "step": 3194 }, { "epoch": 0.884796455275547, "grad_norm": 1.5093910694122314, "learning_rate": 6.659357145252287e-06, "loss": 2.8598, "step": 3195 }, { "epoch": 0.8850733868734423, "grad_norm": 0.9233435988426208, "learning_rate": 6.627737503165532e-06, "loss": 2.7454, "step": 3196 }, { "epoch": 0.8853503184713376, "grad_norm": 1.0932697057724, "learning_rate": 6.5961905341730635e-06, "loss": 2.6687, "step": 3197 }, { "epoch": 0.8856272500692329, "grad_norm": 1.8618035316467285, "learning_rate": 6.5647162628283965e-06, "loss": 2.8007, "step": 3198 }, { "epoch": 0.8859041816671283, "grad_norm": 2.263721466064453, "learning_rate": 6.533314713628458e-06, "loss": 3.4359, "step": 3199 }, { "epoch": 0.8861811132650236, "grad_norm": 3.9213695526123047, "learning_rate": 6.5019859110135635e-06, "loss": 4.21, "step": 3200 }, { "epoch": 0.8864580448629189, "grad_norm": 1.4590137004852295, "learning_rate": 6.470729879367399e-06, "loss": 2.9189, "step": 3201 }, { "epoch": 0.8867349764608142, "grad_norm": 1.1353839635849, "learning_rate": 6.439546643017047e-06, "loss": 2.6915, "step": 3202 }, { "epoch": 0.8870119080587096, "grad_norm": 1.0528885126113892, "learning_rate": 6.4084362262329055e-06, "loss": 2.6218, "step": 3203 }, { "epoch": 0.8872888396566048, "grad_norm": 1.8522472381591797, "learning_rate": 6.377398653228661e-06, "loss": 2.6753, "step": 3204 }, { "epoch": 0.8875657712545001, "grad_norm": 0.7795437574386597, "learning_rate": 6.34643394816139e-06, "loss": 2.7333, "step": 3205 }, { "epoch": 0.8878427028523954, "grad_norm": 1.7405046224594116, "learning_rate": 6.315542135131381e-06, "loss": 2.9016, "step": 3206 }, { "epoch": 0.8881196344502907, "grad_norm": 0.8221628069877625, "learning_rate": 6.284723238182233e-06, "loss": 2.8126, "step": 3207 }, { "epoch": 0.8883965660481861, "grad_norm": 1.2659366130828857, "learning_rate": 6.253977281300749e-06, "loss": 2.8429, "step": 3208 }, { "epoch": 0.8886734976460814, "grad_norm": 1.5786124467849731, "learning_rate": 6.22330428841702e-06, "loss": 2.8225, "step": 3209 }, { "epoch": 0.8889504292439767, "grad_norm": 1.8932335376739502, "learning_rate": 6.1927042834042935e-06, "loss": 3.0667, "step": 3210 }, { "epoch": 0.889227360841872, "grad_norm": 1.5503121614456177, "learning_rate": 6.1621772900790405e-06, "loss": 2.7404, "step": 3211 }, { "epoch": 0.8895042924397674, "grad_norm": 1.1766709089279175, "learning_rate": 6.131723332200923e-06, "loss": 2.7821, "step": 3212 }, { "epoch": 0.8897812240376627, "grad_norm": 1.92313814163208, "learning_rate": 6.101342433472679e-06, "loss": 2.7026, "step": 3213 }, { "epoch": 0.890058155635558, "grad_norm": 1.6753193140029907, "learning_rate": 6.071034617540294e-06, "loss": 2.9872, "step": 3214 }, { "epoch": 0.8903350872334533, "grad_norm": 0.690635085105896, "learning_rate": 6.040799907992778e-06, "loss": 2.8038, "step": 3215 }, { "epoch": 0.8906120188313487, "grad_norm": 0.9176428914070129, "learning_rate": 6.010638328362283e-06, "loss": 2.8574, "step": 3216 }, { "epoch": 0.890888950429244, "grad_norm": 1.0934897661209106, "learning_rate": 5.980549902124055e-06, "loss": 2.855, "step": 3217 }, { "epoch": 0.8911658820271393, "grad_norm": 0.8238447904586792, "learning_rate": 5.950534652696382e-06, "loss": 2.7193, "step": 3218 }, { "epoch": 0.8914428136250346, "grad_norm": 1.7612361907958984, "learning_rate": 5.9205926034406e-06, "loss": 2.7353, "step": 3219 }, { "epoch": 0.89171974522293, "grad_norm": 1.9202384948730469, "learning_rate": 5.890723777661078e-06, "loss": 2.6319, "step": 3220 }, { "epoch": 0.8919966768208253, "grad_norm": 1.6076610088348389, "learning_rate": 5.8609281986051975e-06, "loss": 2.529, "step": 3221 }, { "epoch": 0.8922736084187206, "grad_norm": 1.4485604763031006, "learning_rate": 5.831205889463287e-06, "loss": 2.9042, "step": 3222 }, { "epoch": 0.8925505400166159, "grad_norm": 1.318128228187561, "learning_rate": 5.801556873368741e-06, "loss": 2.9158, "step": 3223 }, { "epoch": 0.8928274716145113, "grad_norm": 1.570043921470642, "learning_rate": 5.771981173397811e-06, "loss": 3.2793, "step": 3224 }, { "epoch": 0.8931044032124066, "grad_norm": 2.3622982501983643, "learning_rate": 5.74247881256974e-06, "loss": 3.4243, "step": 3225 }, { "epoch": 0.8933813348103019, "grad_norm": 1.2357228994369507, "learning_rate": 5.713049813846683e-06, "loss": 2.8305, "step": 3226 }, { "epoch": 0.8936582664081971, "grad_norm": 1.3328142166137695, "learning_rate": 5.683694200133705e-06, "loss": 2.5218, "step": 3227 }, { "epoch": 0.8939351980060924, "grad_norm": 1.4435487985610962, "learning_rate": 5.654411994278685e-06, "loss": 2.6991, "step": 3228 }, { "epoch": 0.8942121296039878, "grad_norm": 1.5753285884857178, "learning_rate": 5.625203219072495e-06, "loss": 2.7081, "step": 3229 }, { "epoch": 0.8944890612018831, "grad_norm": 2.0387070178985596, "learning_rate": 5.596067897248725e-06, "loss": 2.5834, "step": 3230 }, { "epoch": 0.8947659927997784, "grad_norm": 1.1547656059265137, "learning_rate": 5.5670060514838805e-06, "loss": 2.8424, "step": 3231 }, { "epoch": 0.8950429243976737, "grad_norm": 1.4619250297546387, "learning_rate": 5.538017704397236e-06, "loss": 2.8464, "step": 3232 }, { "epoch": 0.8953198559955691, "grad_norm": 0.9576718807220459, "learning_rate": 5.509102878550887e-06, "loss": 2.725, "step": 3233 }, { "epoch": 0.8955967875934644, "grad_norm": 1.6311956644058228, "learning_rate": 5.480261596449698e-06, "loss": 2.8379, "step": 3234 }, { "epoch": 0.8958737191913597, "grad_norm": 1.448899269104004, "learning_rate": 5.451493880541292e-06, "loss": 2.5566, "step": 3235 }, { "epoch": 0.896150650789255, "grad_norm": 1.109121322631836, "learning_rate": 5.422799753216023e-06, "loss": 2.6754, "step": 3236 }, { "epoch": 0.8964275823871504, "grad_norm": 1.8016066551208496, "learning_rate": 5.394179236806973e-06, "loss": 2.9337, "step": 3237 }, { "epoch": 0.8967045139850457, "grad_norm": 1.277727484703064, "learning_rate": 5.365632353589967e-06, "loss": 2.7793, "step": 3238 }, { "epoch": 0.896981445582941, "grad_norm": 1.4791587591171265, "learning_rate": 5.337159125783453e-06, "loss": 2.8146, "step": 3239 }, { "epoch": 0.8972583771808363, "grad_norm": 1.8600664138793945, "learning_rate": 5.308759575548617e-06, "loss": 3.0441, "step": 3240 }, { "epoch": 0.8975353087787317, "grad_norm": 1.9175560474395752, "learning_rate": 5.280433724989264e-06, "loss": 3.1218, "step": 3241 }, { "epoch": 0.897812240376627, "grad_norm": 1.1490317583084106, "learning_rate": 5.252181596151862e-06, "loss": 2.7404, "step": 3242 }, { "epoch": 0.8980891719745223, "grad_norm": 1.471359372138977, "learning_rate": 5.224003211025452e-06, "loss": 2.5608, "step": 3243 }, { "epoch": 0.8983661035724176, "grad_norm": 1.593123435974121, "learning_rate": 5.195898591541748e-06, "loss": 2.506, "step": 3244 }, { "epoch": 0.898643035170313, "grad_norm": 1.383209466934204, "learning_rate": 5.167867759575007e-06, "loss": 2.5702, "step": 3245 }, { "epoch": 0.8989199667682083, "grad_norm": 1.4914791584014893, "learning_rate": 5.139910736942044e-06, "loss": 3.1071, "step": 3246 }, { "epoch": 0.8991968983661036, "grad_norm": 1.4775235652923584, "learning_rate": 5.112027545402276e-06, "loss": 2.6725, "step": 3247 }, { "epoch": 0.8994738299639989, "grad_norm": 0.9750004410743713, "learning_rate": 5.084218206657609e-06, "loss": 2.5507, "step": 3248 }, { "epoch": 0.8997507615618943, "grad_norm": 1.4835078716278076, "learning_rate": 5.056482742352486e-06, "loss": 2.9729, "step": 3249 }, { "epoch": 0.9000276931597895, "grad_norm": 3.6488654613494873, "learning_rate": 5.028821174073861e-06, "loss": 4.0052, "step": 3250 }, { "epoch": 0.9003046247576848, "grad_norm": 1.391434907913208, "learning_rate": 5.001233523351156e-06, "loss": 2.7086, "step": 3251 }, { "epoch": 0.9005815563555801, "grad_norm": 1.7492971420288086, "learning_rate": 4.973719811656286e-06, "loss": 3.0276, "step": 3252 }, { "epoch": 0.9008584879534755, "grad_norm": 1.4157118797302246, "learning_rate": 4.9462800604035964e-06, "loss": 2.6521, "step": 3253 }, { "epoch": 0.9011354195513708, "grad_norm": 0.8659539222717285, "learning_rate": 4.9189142909498945e-06, "loss": 3.0122, "step": 3254 }, { "epoch": 0.9014123511492661, "grad_norm": 1.3970427513122559, "learning_rate": 4.891622524594342e-06, "loss": 2.6576, "step": 3255 }, { "epoch": 0.9016892827471614, "grad_norm": 1.1650993824005127, "learning_rate": 4.864404782578591e-06, "loss": 2.6377, "step": 3256 }, { "epoch": 0.9019662143450567, "grad_norm": 1.8018288612365723, "learning_rate": 4.83726108608662e-06, "loss": 2.9516, "step": 3257 }, { "epoch": 0.9022431459429521, "grad_norm": 1.2030545473098755, "learning_rate": 4.810191456244784e-06, "loss": 2.6211, "step": 3258 }, { "epoch": 0.9025200775408474, "grad_norm": 1.2440369129180908, "learning_rate": 4.783195914121818e-06, "loss": 2.7023, "step": 3259 }, { "epoch": 0.9027970091387427, "grad_norm": 1.9231184720993042, "learning_rate": 4.756274480728773e-06, "loss": 2.9374, "step": 3260 }, { "epoch": 0.903073940736638, "grad_norm": 1.1988775730133057, "learning_rate": 4.729427177018986e-06, "loss": 2.9222, "step": 3261 }, { "epoch": 0.9033508723345334, "grad_norm": 1.8029758930206299, "learning_rate": 4.7026540238881976e-06, "loss": 2.9991, "step": 3262 }, { "epoch": 0.9036278039324287, "grad_norm": 1.3029099702835083, "learning_rate": 4.6759550421743295e-06, "loss": 2.8718, "step": 3263 }, { "epoch": 0.903904735530324, "grad_norm": 1.1065994501113892, "learning_rate": 4.649330252657613e-06, "loss": 2.7638, "step": 3264 }, { "epoch": 0.9041816671282193, "grad_norm": 1.46235191822052, "learning_rate": 4.622779676060562e-06, "loss": 2.6686, "step": 3265 }, { "epoch": 0.9044585987261147, "grad_norm": 0.9452422857284546, "learning_rate": 4.596303333047891e-06, "loss": 2.6912, "step": 3266 }, { "epoch": 0.90473553032401, "grad_norm": 1.7004419565200806, "learning_rate": 4.56990124422656e-06, "loss": 2.913, "step": 3267 }, { "epoch": 0.9050124619219053, "grad_norm": 1.664442777633667, "learning_rate": 4.54357343014572e-06, "loss": 2.8846, "step": 3268 }, { "epoch": 0.9052893935198006, "grad_norm": 1.6478939056396484, "learning_rate": 4.517319911296747e-06, "loss": 2.6547, "step": 3269 }, { "epoch": 0.905566325117696, "grad_norm": 0.8395318984985352, "learning_rate": 4.491140708113117e-06, "loss": 2.5089, "step": 3270 }, { "epoch": 0.9058432567155913, "grad_norm": 1.541677474975586, "learning_rate": 4.465035840970577e-06, "loss": 2.8615, "step": 3271 }, { "epoch": 0.9061201883134866, "grad_norm": 1.2489078044891357, "learning_rate": 4.439005330186896e-06, "loss": 2.9873, "step": 3272 }, { "epoch": 0.9063971199113819, "grad_norm": 1.431944727897644, "learning_rate": 4.413049196022057e-06, "loss": 2.8534, "step": 3273 }, { "epoch": 0.9066740515092772, "grad_norm": 1.1621243953704834, "learning_rate": 4.387167458678121e-06, "loss": 2.9404, "step": 3274 }, { "epoch": 0.9069509831071725, "grad_norm": 2.5406911373138428, "learning_rate": 4.361360138299242e-06, "loss": 3.4224, "step": 3275 }, { "epoch": 0.9072279147050678, "grad_norm": 1.2161071300506592, "learning_rate": 4.335627254971675e-06, "loss": 2.9378, "step": 3276 }, { "epoch": 0.9075048463029631, "grad_norm": 1.362507700920105, "learning_rate": 4.3099688287237204e-06, "loss": 2.7757, "step": 3277 }, { "epoch": 0.9077817779008585, "grad_norm": 1.1091976165771484, "learning_rate": 4.284384879525749e-06, "loss": 2.7355, "step": 3278 }, { "epoch": 0.9080587094987538, "grad_norm": 1.3528761863708496, "learning_rate": 4.2588754272900985e-06, "loss": 2.7222, "step": 3279 }, { "epoch": 0.9083356410966491, "grad_norm": 1.0884292125701904, "learning_rate": 4.233440491871232e-06, "loss": 2.7229, "step": 3280 }, { "epoch": 0.9086125726945444, "grad_norm": 1.2718223333358765, "learning_rate": 4.208080093065536e-06, "loss": 2.839, "step": 3281 }, { "epoch": 0.9088895042924398, "grad_norm": 1.9365698099136353, "learning_rate": 4.1827942506113995e-06, "loss": 2.8317, "step": 3282 }, { "epoch": 0.9091664358903351, "grad_norm": 1.0091325044631958, "learning_rate": 4.15758298418919e-06, "loss": 2.8692, "step": 3283 }, { "epoch": 0.9094433674882304, "grad_norm": 1.2271143198013306, "learning_rate": 4.132446313421246e-06, "loss": 2.6124, "step": 3284 }, { "epoch": 0.9097202990861257, "grad_norm": 1.6683666706085205, "learning_rate": 4.107384257871816e-06, "loss": 3.0068, "step": 3285 }, { "epoch": 0.909997230684021, "grad_norm": 0.7507969737052917, "learning_rate": 4.082396837047109e-06, "loss": 2.7824, "step": 3286 }, { "epoch": 0.9102741622819164, "grad_norm": 1.477518916130066, "learning_rate": 4.057484070395213e-06, "loss": 3.1318, "step": 3287 }, { "epoch": 0.9105510938798117, "grad_norm": 1.1982351541519165, "learning_rate": 4.0326459773061045e-06, "loss": 2.8544, "step": 3288 }, { "epoch": 0.910828025477707, "grad_norm": 1.218491554260254, "learning_rate": 4.00788257711171e-06, "loss": 2.7816, "step": 3289 }, { "epoch": 0.9111049570756024, "grad_norm": 1.1615190505981445, "learning_rate": 3.9831938890857346e-06, "loss": 2.8993, "step": 3290 }, { "epoch": 0.9113818886734977, "grad_norm": 1.3096328973770142, "learning_rate": 3.958579932443773e-06, "loss": 2.8446, "step": 3291 }, { "epoch": 0.911658820271393, "grad_norm": 1.5708379745483398, "learning_rate": 3.934040726343258e-06, "loss": 2.8288, "step": 3292 }, { "epoch": 0.9119357518692883, "grad_norm": 1.3243839740753174, "learning_rate": 3.909576289883454e-06, "loss": 2.856, "step": 3293 }, { "epoch": 0.9122126834671836, "grad_norm": 1.4913750886917114, "learning_rate": 3.885186642105376e-06, "loss": 2.6688, "step": 3294 }, { "epoch": 0.912489615065079, "grad_norm": 1.4231692552566528, "learning_rate": 3.8608718019919275e-06, "loss": 2.7812, "step": 3295 }, { "epoch": 0.9127665466629743, "grad_norm": 1.3118404150009155, "learning_rate": 3.836631788467671e-06, "loss": 2.483, "step": 3296 }, { "epoch": 0.9130434782608695, "grad_norm": 1.285291314125061, "learning_rate": 3.8124666203990135e-06, "loss": 2.5784, "step": 3297 }, { "epoch": 0.9133204098587648, "grad_norm": 1.377078652381897, "learning_rate": 3.788376316594089e-06, "loss": 2.8021, "step": 3298 }, { "epoch": 0.9135973414566602, "grad_norm": 1.9740731716156006, "learning_rate": 3.7643608958027543e-06, "loss": 2.8414, "step": 3299 }, { "epoch": 0.9138742730545555, "grad_norm": 4.069131374359131, "learning_rate": 3.74042037671658e-06, "loss": 4.0822, "step": 3300 }, { "epoch": 0.9141512046524508, "grad_norm": 1.1507346630096436, "learning_rate": 3.7165547779688616e-06, "loss": 2.5579, "step": 3301 }, { "epoch": 0.9144281362503461, "grad_norm": 1.3568830490112305, "learning_rate": 3.692764118134573e-06, "loss": 2.8035, "step": 3302 }, { "epoch": 0.9147050678482415, "grad_norm": 1.8859748840332031, "learning_rate": 3.669048415730314e-06, "loss": 2.5274, "step": 3303 }, { "epoch": 0.9149819994461368, "grad_norm": 1.5772771835327148, "learning_rate": 3.6454076892144418e-06, "loss": 2.6752, "step": 3304 }, { "epoch": 0.9152589310440321, "grad_norm": 0.878659725189209, "learning_rate": 3.62184195698686e-06, "loss": 2.6252, "step": 3305 }, { "epoch": 0.9155358626419274, "grad_norm": 1.0205022096633911, "learning_rate": 3.598351237389175e-06, "loss": 2.5583, "step": 3306 }, { "epoch": 0.9158127942398228, "grad_norm": 1.0380001068115234, "learning_rate": 3.5749355487045722e-06, "loss": 2.7153, "step": 3307 }, { "epoch": 0.9160897258377181, "grad_norm": 1.053176760673523, "learning_rate": 3.5515949091578514e-06, "loss": 2.5989, "step": 3308 }, { "epoch": 0.9163666574356134, "grad_norm": 1.5257909297943115, "learning_rate": 3.5283293369154036e-06, "loss": 2.8998, "step": 3309 }, { "epoch": 0.9166435890335087, "grad_norm": 1.6158723831176758, "learning_rate": 3.5051388500851766e-06, "loss": 2.8025, "step": 3310 }, { "epoch": 0.9169205206314041, "grad_norm": 0.8846004009246826, "learning_rate": 3.4820234667166996e-06, "loss": 2.7201, "step": 3311 }, { "epoch": 0.9171974522292994, "grad_norm": 1.761206865310669, "learning_rate": 3.458983204801014e-06, "loss": 2.8465, "step": 3312 }, { "epoch": 0.9174743838271947, "grad_norm": 1.3853119611740112, "learning_rate": 3.436018082270753e-06, "loss": 2.8522, "step": 3313 }, { "epoch": 0.91775131542509, "grad_norm": 1.5413858890533447, "learning_rate": 3.4131281170000083e-06, "loss": 2.688, "step": 3314 }, { "epoch": 0.9180282470229854, "grad_norm": 1.460121512413025, "learning_rate": 3.3903133268043952e-06, "loss": 2.881, "step": 3315 }, { "epoch": 0.9183051786208807, "grad_norm": 1.2880120277404785, "learning_rate": 3.3675737294410425e-06, "loss": 3.0607, "step": 3316 }, { "epoch": 0.918582110218776, "grad_norm": 1.3540161848068237, "learning_rate": 3.3449093426085154e-06, "loss": 2.8574, "step": 3317 }, { "epoch": 0.9188590418166713, "grad_norm": 1.5042144060134888, "learning_rate": 3.3223201839468587e-06, "loss": 3.0026, "step": 3318 }, { "epoch": 0.9191359734145667, "grad_norm": 1.1813321113586426, "learning_rate": 3.2998062710375864e-06, "loss": 2.7569, "step": 3319 }, { "epoch": 0.9194129050124619, "grad_norm": 2.0243005752563477, "learning_rate": 3.2773676214036374e-06, "loss": 2.8165, "step": 3320 }, { "epoch": 0.9196898366103572, "grad_norm": 1.6620597839355469, "learning_rate": 3.2550042525093196e-06, "loss": 2.6096, "step": 3321 }, { "epoch": 0.9199667682082525, "grad_norm": 1.7305388450622559, "learning_rate": 3.232716181760442e-06, "loss": 2.7571, "step": 3322 }, { "epoch": 0.9202436998061478, "grad_norm": 1.4501975774765015, "learning_rate": 3.2105034265041302e-06, "loss": 2.744, "step": 3323 }, { "epoch": 0.9205206314040432, "grad_norm": 1.5626119375228882, "learning_rate": 3.188366004028931e-06, "loss": 2.8442, "step": 3324 }, { "epoch": 0.9207975630019385, "grad_norm": 4.155477523803711, "learning_rate": 3.16630393156474e-06, "loss": 4.1207, "step": 3325 }, { "epoch": 0.9210744945998338, "grad_norm": 2.5009517669677734, "learning_rate": 3.1443172262828223e-06, "loss": 3.1146, "step": 3326 }, { "epoch": 0.9213514261977291, "grad_norm": 1.2323756217956543, "learning_rate": 3.1224059052957556e-06, "loss": 2.6249, "step": 3327 }, { "epoch": 0.9216283577956245, "grad_norm": 1.3302520513534546, "learning_rate": 3.1005699856574978e-06, "loss": 2.5258, "step": 3328 }, { "epoch": 0.9219052893935198, "grad_norm": 1.6659871339797974, "learning_rate": 3.0788094843632655e-06, "loss": 2.7854, "step": 3329 }, { "epoch": 0.9221822209914151, "grad_norm": 1.27170991897583, "learning_rate": 3.0571244183495775e-06, "loss": 2.5841, "step": 3330 }, { "epoch": 0.9224591525893104, "grad_norm": 1.3297101259231567, "learning_rate": 3.0355148044943105e-06, "loss": 2.8057, "step": 3331 }, { "epoch": 0.9227360841872058, "grad_norm": 1.2992569208145142, "learning_rate": 3.0139806596165334e-06, "loss": 2.5827, "step": 3332 }, { "epoch": 0.9230130157851011, "grad_norm": 1.3038079738616943, "learning_rate": 2.9925220004766054e-06, "loss": 2.7893, "step": 3333 }, { "epoch": 0.9232899473829964, "grad_norm": 1.5119189023971558, "learning_rate": 2.9711388437761445e-06, "loss": 2.4833, "step": 3334 }, { "epoch": 0.9235668789808917, "grad_norm": 1.4106827974319458, "learning_rate": 2.9498312061580047e-06, "loss": 2.6852, "step": 3335 }, { "epoch": 0.9238438105787871, "grad_norm": 1.4682044982910156, "learning_rate": 2.9285991042062313e-06, "loss": 2.7403, "step": 3336 }, { "epoch": 0.9241207421766824, "grad_norm": 1.1259634494781494, "learning_rate": 2.907442554446138e-06, "loss": 2.8119, "step": 3337 }, { "epoch": 0.9243976737745777, "grad_norm": 1.324251651763916, "learning_rate": 2.886361573344165e-06, "loss": 2.9538, "step": 3338 }, { "epoch": 0.924674605372473, "grad_norm": 1.2294390201568604, "learning_rate": 2.8653561773079764e-06, "loss": 2.7142, "step": 3339 }, { "epoch": 0.9249515369703684, "grad_norm": 1.514574646949768, "learning_rate": 2.844426382686416e-06, "loss": 2.6999, "step": 3340 }, { "epoch": 0.9252284685682637, "grad_norm": 2.112339735031128, "learning_rate": 2.8235722057694534e-06, "loss": 2.8516, "step": 3341 }, { "epoch": 0.925505400166159, "grad_norm": 1.1753405332565308, "learning_rate": 2.802793662788239e-06, "loss": 2.7128, "step": 3342 }, { "epoch": 0.9257823317640543, "grad_norm": 1.5247375965118408, "learning_rate": 2.7820907699150246e-06, "loss": 2.8626, "step": 3343 }, { "epoch": 0.9260592633619495, "grad_norm": 1.4732311964035034, "learning_rate": 2.7614635432632097e-06, "loss": 2.6998, "step": 3344 }, { "epoch": 0.9263361949598449, "grad_norm": 1.4961967468261719, "learning_rate": 2.740911998887252e-06, "loss": 2.6971, "step": 3345 }, { "epoch": 0.9266131265577402, "grad_norm": 1.5658138990402222, "learning_rate": 2.7204361527827903e-06, "loss": 2.8843, "step": 3346 }, { "epoch": 0.9268900581556355, "grad_norm": 1.5200576782226562, "learning_rate": 2.700036020886465e-06, "loss": 2.7221, "step": 3347 }, { "epoch": 0.9271669897535308, "grad_norm": 1.2744560241699219, "learning_rate": 2.679711619076031e-06, "loss": 2.4677, "step": 3348 }, { "epoch": 0.9274439213514262, "grad_norm": 1.6359666585922241, "learning_rate": 2.6594629631702783e-06, "loss": 3.1598, "step": 3349 }, { "epoch": 0.9277208529493215, "grad_norm": 2.137948751449585, "learning_rate": 2.639290068929057e-06, "loss": 2.9673, "step": 3350 }, { "epoch": 0.9279977845472168, "grad_norm": 1.848804235458374, "learning_rate": 2.619192952053251e-06, "loss": 2.8939, "step": 3351 }, { "epoch": 0.9282747161451121, "grad_norm": 1.4783754348754883, "learning_rate": 2.599171628184749e-06, "loss": 2.8871, "step": 3352 }, { "epoch": 0.9285516477430075, "grad_norm": 1.6100101470947266, "learning_rate": 2.5792261129064854e-06, "loss": 2.758, "step": 3353 }, { "epoch": 0.9288285793409028, "grad_norm": 1.2014853954315186, "learning_rate": 2.5593564217423314e-06, "loss": 2.6645, "step": 3354 }, { "epoch": 0.9291055109387981, "grad_norm": 1.5369969606399536, "learning_rate": 2.5395625701572167e-06, "loss": 2.8224, "step": 3355 }, { "epoch": 0.9293824425366934, "grad_norm": 1.3187780380249023, "learning_rate": 2.519844573556984e-06, "loss": 3.0235, "step": 3356 }, { "epoch": 0.9296593741345888, "grad_norm": 1.7744249105453491, "learning_rate": 2.500202447288458e-06, "loss": 2.8146, "step": 3357 }, { "epoch": 0.9299363057324841, "grad_norm": 1.2549601793289185, "learning_rate": 2.4806362066394195e-06, "loss": 2.6661, "step": 3358 }, { "epoch": 0.9302132373303794, "grad_norm": 1.2055906057357788, "learning_rate": 2.461145866838599e-06, "loss": 2.7666, "step": 3359 }, { "epoch": 0.9304901689282747, "grad_norm": 1.3747657537460327, "learning_rate": 2.4417314430555837e-06, "loss": 2.7744, "step": 3360 }, { "epoch": 0.9307671005261701, "grad_norm": 1.4448826313018799, "learning_rate": 2.4223929504009646e-06, "loss": 2.8195, "step": 3361 }, { "epoch": 0.9310440321240654, "grad_norm": 1.956508755683899, "learning_rate": 2.4031304039261905e-06, "loss": 2.7472, "step": 3362 }, { "epoch": 0.9313209637219607, "grad_norm": 1.444071650505066, "learning_rate": 2.3839438186235684e-06, "loss": 2.8139, "step": 3363 }, { "epoch": 0.931597895319856, "grad_norm": 1.4014155864715576, "learning_rate": 2.364833209426376e-06, "loss": 3.0162, "step": 3364 }, { "epoch": 0.9318748269177514, "grad_norm": 1.24951171875, "learning_rate": 2.3457985912086476e-06, "loss": 2.7421, "step": 3365 }, { "epoch": 0.9321517585156467, "grad_norm": 1.0533653497695923, "learning_rate": 2.3268399787853557e-06, "loss": 2.7673, "step": 3366 }, { "epoch": 0.9324286901135419, "grad_norm": 0.9333207607269287, "learning_rate": 2.307957386912263e-06, "loss": 2.517, "step": 3367 }, { "epoch": 0.9327056217114372, "grad_norm": 2.0277838706970215, "learning_rate": 2.289150830286013e-06, "loss": 2.9124, "step": 3368 }, { "epoch": 0.9329825533093326, "grad_norm": 1.2337300777435303, "learning_rate": 2.270420323544009e-06, "loss": 2.6665, "step": 3369 }, { "epoch": 0.9332594849072279, "grad_norm": 1.0412017107009888, "learning_rate": 2.251765881264534e-06, "loss": 2.511, "step": 3370 }, { "epoch": 0.9335364165051232, "grad_norm": 1.042595624923706, "learning_rate": 2.2331875179666175e-06, "loss": 2.778, "step": 3371 }, { "epoch": 0.9338133481030185, "grad_norm": 1.1678199768066406, "learning_rate": 2.214685248110071e-06, "loss": 2.5503, "step": 3372 }, { "epoch": 0.9340902797009139, "grad_norm": 2.1503164768218994, "learning_rate": 2.1962590860955314e-06, "loss": 3.1102, "step": 3373 }, { "epoch": 0.9343672112988092, "grad_norm": 1.0278449058532715, "learning_rate": 2.177909046264348e-06, "loss": 2.8729, "step": 3374 }, { "epoch": 0.9346441428967045, "grad_norm": 2.8905749320983887, "learning_rate": 2.1596351428986637e-06, "loss": 3.2821, "step": 3375 }, { "epoch": 0.9349210744945998, "grad_norm": 0.8825234770774841, "learning_rate": 2.1414373902213236e-06, "loss": 2.5416, "step": 3376 }, { "epoch": 0.9351980060924951, "grad_norm": 1.6184552907943726, "learning_rate": 2.123315802395942e-06, "loss": 2.6308, "step": 3377 }, { "epoch": 0.9354749376903905, "grad_norm": 1.0278406143188477, "learning_rate": 2.1052703935268147e-06, "loss": 2.6233, "step": 3378 }, { "epoch": 0.9357518692882858, "grad_norm": 0.7804761528968811, "learning_rate": 2.0873011776589957e-06, "loss": 2.7131, "step": 3379 }, { "epoch": 0.9360288008861811, "grad_norm": 1.2199102640151978, "learning_rate": 2.069408168778164e-06, "loss": 2.6202, "step": 3380 }, { "epoch": 0.9363057324840764, "grad_norm": 1.2146902084350586, "learning_rate": 2.051591380810769e-06, "loss": 2.7762, "step": 3381 }, { "epoch": 0.9365826640819718, "grad_norm": 1.7390179634094238, "learning_rate": 2.0338508276238734e-06, "loss": 2.8942, "step": 3382 }, { "epoch": 0.9368595956798671, "grad_norm": 0.6415250897407532, "learning_rate": 2.016186523025232e-06, "loss": 2.7395, "step": 3383 }, { "epoch": 0.9371365272777624, "grad_norm": 1.0076696872711182, "learning_rate": 1.998598480763247e-06, "loss": 2.8295, "step": 3384 }, { "epoch": 0.9374134588756577, "grad_norm": 1.5434540510177612, "learning_rate": 1.9810867145269805e-06, "loss": 2.86, "step": 3385 }, { "epoch": 0.9376903904735531, "grad_norm": 1.3561028242111206, "learning_rate": 1.963651237946107e-06, "loss": 3.063, "step": 3386 }, { "epoch": 0.9379673220714484, "grad_norm": 1.0174596309661865, "learning_rate": 1.9462920645909155e-06, "loss": 2.6779, "step": 3387 }, { "epoch": 0.9382442536693437, "grad_norm": 2.0525801181793213, "learning_rate": 1.929009207972354e-06, "loss": 2.6454, "step": 3388 }, { "epoch": 0.938521185267239, "grad_norm": 0.9554057717323303, "learning_rate": 1.911802681541919e-06, "loss": 2.5937, "step": 3389 }, { "epoch": 0.9387981168651343, "grad_norm": 1.5000263452529907, "learning_rate": 1.8946724986917187e-06, "loss": 3.0528, "step": 3390 }, { "epoch": 0.9390750484630296, "grad_norm": 1.3776977062225342, "learning_rate": 1.8776186727544554e-06, "loss": 2.8474, "step": 3391 }, { "epoch": 0.9393519800609249, "grad_norm": 1.2192586660385132, "learning_rate": 1.8606412170033783e-06, "loss": 2.9783, "step": 3392 }, { "epoch": 0.9396289116588202, "grad_norm": 1.9527432918548584, "learning_rate": 1.843740144652295e-06, "loss": 2.9976, "step": 3393 }, { "epoch": 0.9399058432567156, "grad_norm": 1.2904397249221802, "learning_rate": 1.8269154688556056e-06, "loss": 2.7887, "step": 3394 }, { "epoch": 0.9401827748546109, "grad_norm": 1.648537278175354, "learning_rate": 1.8101672027082018e-06, "loss": 2.5434, "step": 3395 }, { "epoch": 0.9404597064525062, "grad_norm": 0.8858701586723328, "learning_rate": 1.7934953592455117e-06, "loss": 2.6729, "step": 3396 }, { "epoch": 0.9407366380504015, "grad_norm": 2.0341806411743164, "learning_rate": 1.7768999514435226e-06, "loss": 2.6933, "step": 3397 }, { "epoch": 0.9410135696482969, "grad_norm": 1.5567057132720947, "learning_rate": 1.7603809922186687e-06, "loss": 2.9675, "step": 3398 }, { "epoch": 0.9412905012461922, "grad_norm": 1.515378713607788, "learning_rate": 1.7439384944279213e-06, "loss": 2.8218, "step": 3399 }, { "epoch": 0.9415674328440875, "grad_norm": 3.610714912414551, "learning_rate": 1.7275724708687435e-06, "loss": 3.9697, "step": 3400 }, { "epoch": 0.9418443644419828, "grad_norm": 1.809895634651184, "learning_rate": 1.711282934279068e-06, "loss": 2.6432, "step": 3401 }, { "epoch": 0.9421212960398782, "grad_norm": 2.0301547050476074, "learning_rate": 1.6950698973372869e-06, "loss": 2.9055, "step": 3402 }, { "epoch": 0.9423982276377735, "grad_norm": 1.4767478704452515, "learning_rate": 1.6789333726622725e-06, "loss": 2.6788, "step": 3403 }, { "epoch": 0.9426751592356688, "grad_norm": 1.59397554397583, "learning_rate": 1.6628733728133227e-06, "loss": 2.9936, "step": 3404 }, { "epoch": 0.9429520908335641, "grad_norm": 1.12782621383667, "learning_rate": 1.6468899102901946e-06, "loss": 2.9682, "step": 3405 }, { "epoch": 0.9432290224314595, "grad_norm": 2.1295478343963623, "learning_rate": 1.6309829975330592e-06, "loss": 3.0926, "step": 3406 }, { "epoch": 0.9435059540293548, "grad_norm": 2.399533987045288, "learning_rate": 1.6151526469225243e-06, "loss": 2.8278, "step": 3407 }, { "epoch": 0.9437828856272501, "grad_norm": 4.179904460906982, "learning_rate": 1.5993988707796005e-06, "loss": 3.0627, "step": 3408 }, { "epoch": 0.9440598172251454, "grad_norm": 1.0878640413284302, "learning_rate": 1.5837216813656908e-06, "loss": 2.7063, "step": 3409 }, { "epoch": 0.9443367488230408, "grad_norm": 1.21199631690979, "learning_rate": 1.5681210908826127e-06, "loss": 2.6272, "step": 3410 }, { "epoch": 0.9446136804209361, "grad_norm": 1.9922666549682617, "learning_rate": 1.5525971114725203e-06, "loss": 2.8212, "step": 3411 }, { "epoch": 0.9448906120188314, "grad_norm": 1.2448945045471191, "learning_rate": 1.5371497552180037e-06, "loss": 2.7175, "step": 3412 }, { "epoch": 0.9451675436167266, "grad_norm": 1.281944751739502, "learning_rate": 1.521779034141968e-06, "loss": 2.7586, "step": 3413 }, { "epoch": 0.9454444752146219, "grad_norm": 1.6437686681747437, "learning_rate": 1.506484960207677e-06, "loss": 2.8117, "step": 3414 }, { "epoch": 0.9457214068125173, "grad_norm": 1.3451311588287354, "learning_rate": 1.491267545318753e-06, "loss": 2.6612, "step": 3415 }, { "epoch": 0.9459983384104126, "grad_norm": 1.2874051332473755, "learning_rate": 1.4761268013191553e-06, "loss": 2.743, "step": 3416 }, { "epoch": 0.9462752700083079, "grad_norm": 1.589040994644165, "learning_rate": 1.4610627399931687e-06, "loss": 2.9678, "step": 3417 }, { "epoch": 0.9465522016062032, "grad_norm": 1.0552315711975098, "learning_rate": 1.4460753730653587e-06, "loss": 2.7698, "step": 3418 }, { "epoch": 0.9468291332040986, "grad_norm": 1.3050696849822998, "learning_rate": 1.4311647122006721e-06, "loss": 2.6855, "step": 3419 }, { "epoch": 0.9471060648019939, "grad_norm": 1.5626434087753296, "learning_rate": 1.4163307690042593e-06, "loss": 2.9976, "step": 3420 }, { "epoch": 0.9473829963998892, "grad_norm": 1.091338872909546, "learning_rate": 1.4015735550216514e-06, "loss": 2.6805, "step": 3421 }, { "epoch": 0.9476599279977845, "grad_norm": 1.2689846754074097, "learning_rate": 1.3868930817385939e-06, "loss": 2.5436, "step": 3422 }, { "epoch": 0.9479368595956799, "grad_norm": 1.3194886445999146, "learning_rate": 1.372289360581147e-06, "loss": 2.5677, "step": 3423 }, { "epoch": 0.9482137911935752, "grad_norm": 0.955805242061615, "learning_rate": 1.3577624029155966e-06, "loss": 2.7816, "step": 3424 }, { "epoch": 0.9484907227914705, "grad_norm": 3.8523759841918945, "learning_rate": 1.3433122200485316e-06, "loss": 4.0054, "step": 3425 }, { "epoch": 0.9487676543893658, "grad_norm": 1.4586676359176636, "learning_rate": 1.328938823226722e-06, "loss": 2.631, "step": 3426 }, { "epoch": 0.9490445859872612, "grad_norm": 1.378494381904602, "learning_rate": 1.314642223637208e-06, "loss": 2.685, "step": 3427 }, { "epoch": 0.9493215175851565, "grad_norm": 1.3128551244735718, "learning_rate": 1.3004224324073e-06, "loss": 2.5419, "step": 3428 }, { "epoch": 0.9495984491830518, "grad_norm": 1.3833677768707275, "learning_rate": 1.2862794606044337e-06, "loss": 2.7505, "step": 3429 }, { "epoch": 0.9498753807809471, "grad_norm": 1.1600041389465332, "learning_rate": 1.2722133192363372e-06, "loss": 2.6111, "step": 3430 }, { "epoch": 0.9501523123788425, "grad_norm": 1.772107481956482, "learning_rate": 1.2582240192508866e-06, "loss": 3.0549, "step": 3431 }, { "epoch": 0.9504292439767378, "grad_norm": 0.8794018030166626, "learning_rate": 1.2443115715362054e-06, "loss": 2.8327, "step": 3432 }, { "epoch": 0.9507061755746331, "grad_norm": 1.4521026611328125, "learning_rate": 1.2304759869205207e-06, "loss": 2.719, "step": 3433 }, { "epoch": 0.9509831071725284, "grad_norm": 1.116992473602295, "learning_rate": 1.216717276172341e-06, "loss": 2.8102, "step": 3434 }, { "epoch": 0.9512600387704238, "grad_norm": 0.8220580220222473, "learning_rate": 1.203035450000245e-06, "loss": 2.6858, "step": 3435 }, { "epoch": 0.9515369703683191, "grad_norm": 1.5655560493469238, "learning_rate": 1.1894305190530252e-06, "loss": 2.7253, "step": 3436 }, { "epoch": 0.9518139019662143, "grad_norm": 0.8458607792854309, "learning_rate": 1.1759024939196117e-06, "loss": 2.9535, "step": 3437 }, { "epoch": 0.9520908335641096, "grad_norm": 1.0517743825912476, "learning_rate": 1.162451385129082e-06, "loss": 2.7594, "step": 3438 }, { "epoch": 0.9523677651620049, "grad_norm": 1.4435462951660156, "learning_rate": 1.1490772031506392e-06, "loss": 2.7204, "step": 3439 }, { "epoch": 0.9526446967599003, "grad_norm": 1.073880910873413, "learning_rate": 1.1357799583936235e-06, "loss": 3.0155, "step": 3440 }, { "epoch": 0.9529216283577956, "grad_norm": 1.1127359867095947, "learning_rate": 1.1225596612075006e-06, "loss": 2.7524, "step": 3441 }, { "epoch": 0.9531985599556909, "grad_norm": 1.126267910003662, "learning_rate": 1.1094163218817955e-06, "loss": 2.8228, "step": 3442 }, { "epoch": 0.9534754915535862, "grad_norm": 1.1283173561096191, "learning_rate": 1.0963499506462249e-06, "loss": 2.6293, "step": 3443 }, { "epoch": 0.9537524231514816, "grad_norm": 1.1441411972045898, "learning_rate": 1.0833605576705096e-06, "loss": 2.8343, "step": 3444 }, { "epoch": 0.9540293547493769, "grad_norm": 1.3150324821472168, "learning_rate": 1.0704481530645293e-06, "loss": 2.6825, "step": 3445 }, { "epoch": 0.9543062863472722, "grad_norm": 1.40483558177948, "learning_rate": 1.0576127468781783e-06, "loss": 2.7361, "step": 3446 }, { "epoch": 0.9545832179451675, "grad_norm": 1.6336510181427002, "learning_rate": 1.044854349101476e-06, "loss": 2.6316, "step": 3447 }, { "epoch": 0.9548601495430629, "grad_norm": 1.2103753089904785, "learning_rate": 1.0321729696644911e-06, "loss": 2.8156, "step": 3448 }, { "epoch": 0.9551370811409582, "grad_norm": 1.1275039911270142, "learning_rate": 1.0195686184373166e-06, "loss": 2.8522, "step": 3449 }, { "epoch": 0.9554140127388535, "grad_norm": 3.2600536346435547, "learning_rate": 1.0070413052301275e-06, "loss": 3.5911, "step": 3450 }, { "epoch": 0.9556909443367488, "grad_norm": 1.913200855255127, "learning_rate": 9.945910397931246e-07, "loss": 2.8252, "step": 3451 }, { "epoch": 0.9559678759346442, "grad_norm": 1.0066627264022827, "learning_rate": 9.822178318165564e-07, "loss": 2.6377, "step": 3452 }, { "epoch": 0.9562448075325395, "grad_norm": 1.3210686445236206, "learning_rate": 9.699216909306753e-07, "loss": 2.7325, "step": 3453 }, { "epoch": 0.9565217391304348, "grad_norm": 1.6365430355072021, "learning_rate": 9.577026267057476e-07, "loss": 2.4892, "step": 3454 }, { "epoch": 0.9567986707283301, "grad_norm": 1.5990711450576782, "learning_rate": 9.455606486520885e-07, "loss": 2.7405, "step": 3455 }, { "epoch": 0.9570756023262255, "grad_norm": 1.4092278480529785, "learning_rate": 9.334957662199717e-07, "loss": 2.7796, "step": 3456 }, { "epoch": 0.9573525339241208, "grad_norm": 1.3785223960876465, "learning_rate": 9.215079887996858e-07, "loss": 2.9916, "step": 3457 }, { "epoch": 0.9576294655220161, "grad_norm": 1.4673542976379395, "learning_rate": 9.095973257215117e-07, "loss": 2.6861, "step": 3458 }, { "epoch": 0.9579063971199114, "grad_norm": 1.6413600444793701, "learning_rate": 8.97763786255712e-07, "loss": 2.5797, "step": 3459 }, { "epoch": 0.9581833287178066, "grad_norm": 1.5023274421691895, "learning_rate": 8.860073796124857e-07, "loss": 2.6855, "step": 3460 }, { "epoch": 0.958460260315702, "grad_norm": 1.5544533729553223, "learning_rate": 8.74328114942069e-07, "loss": 2.6957, "step": 3461 }, { "epoch": 0.9587371919135973, "grad_norm": 1.3404407501220703, "learning_rate": 8.627260013345795e-07, "loss": 2.8544, "step": 3462 }, { "epoch": 0.9590141235114926, "grad_norm": 1.093775749206543, "learning_rate": 8.512010478201493e-07, "loss": 2.7508, "step": 3463 }, { "epoch": 0.959291055109388, "grad_norm": 1.5489881038665771, "learning_rate": 8.397532633688254e-07, "loss": 2.8989, "step": 3464 }, { "epoch": 0.9595679867072833, "grad_norm": 1.147836685180664, "learning_rate": 8.283826568905917e-07, "loss": 2.9515, "step": 3465 }, { "epoch": 0.9598449183051786, "grad_norm": 1.5323371887207031, "learning_rate": 8.170892372353577e-07, "loss": 2.8796, "step": 3466 }, { "epoch": 0.9601218499030739, "grad_norm": 1.594389796257019, "learning_rate": 8.058730131930037e-07, "loss": 3.1553, "step": 3467 }, { "epoch": 0.9603987815009692, "grad_norm": 2.070909261703491, "learning_rate": 7.947339934932574e-07, "loss": 2.5193, "step": 3468 }, { "epoch": 0.9606757130988646, "grad_norm": 17.432004928588867, "learning_rate": 7.836721868058061e-07, "loss": 2.8423, "step": 3469 }, { "epoch": 0.9609526446967599, "grad_norm": 1.2177340984344482, "learning_rate": 7.726876017402296e-07, "loss": 2.6421, "step": 3470 }, { "epoch": 0.9612295762946552, "grad_norm": 1.468483567237854, "learning_rate": 7.617802468460001e-07, "loss": 2.6394, "step": 3471 }, { "epoch": 0.9615065078925505, "grad_norm": 0.8232468366622925, "learning_rate": 7.509501306124823e-07, "loss": 2.5258, "step": 3472 }, { "epoch": 0.9617834394904459, "grad_norm": 1.6739842891693115, "learning_rate": 7.401972614689335e-07, "loss": 3.0923, "step": 3473 }, { "epoch": 0.9620603710883412, "grad_norm": 2.3070802688598633, "learning_rate": 7.295216477844702e-07, "loss": 3.124, "step": 3474 }, { "epoch": 0.9623373026862365, "grad_norm": 3.783268928527832, "learning_rate": 7.189232978680793e-07, "loss": 3.9758, "step": 3475 }, { "epoch": 0.9626142342841318, "grad_norm": 1.2166013717651367, "learning_rate": 7.084022199686513e-07, "loss": 2.8477, "step": 3476 }, { "epoch": 0.9628911658820272, "grad_norm": 1.5633941888809204, "learning_rate": 6.979584222748803e-07, "loss": 2.8795, "step": 3477 }, { "epoch": 0.9631680974799225, "grad_norm": 1.97480046749115, "learning_rate": 6.87591912915353e-07, "loss": 2.8306, "step": 3478 }, { "epoch": 0.9634450290778178, "grad_norm": 2.1172704696655273, "learning_rate": 6.773026999584708e-07, "loss": 3.0142, "step": 3479 }, { "epoch": 0.9637219606757131, "grad_norm": 1.2924878597259521, "learning_rate": 6.670907914125058e-07, "loss": 2.8441, "step": 3480 }, { "epoch": 0.9639988922736085, "grad_norm": 1.5648305416107178, "learning_rate": 6.569561952255332e-07, "loss": 2.705, "step": 3481 }, { "epoch": 0.9642758238715038, "grad_norm": 1.0812360048294067, "learning_rate": 6.46898919285488e-07, "loss": 2.5468, "step": 3482 }, { "epoch": 0.964552755469399, "grad_norm": 0.9468382000923157, "learning_rate": 6.369189714200863e-07, "loss": 2.4775, "step": 3483 }, { "epoch": 0.9648296870672943, "grad_norm": 0.991826057434082, "learning_rate": 6.270163593968703e-07, "loss": 2.7163, "step": 3484 }, { "epoch": 0.9651066186651897, "grad_norm": 0.9778124094009399, "learning_rate": 6.171910909232193e-07, "loss": 2.9867, "step": 3485 }, { "epoch": 0.965383550263085, "grad_norm": 1.0859044790267944, "learning_rate": 6.074431736462938e-07, "loss": 3.0588, "step": 3486 }, { "epoch": 0.9656604818609803, "grad_norm": 1.5992988348007202, "learning_rate": 5.97772615153025e-07, "loss": 2.6721, "step": 3487 }, { "epoch": 0.9659374134588756, "grad_norm": 1.050748586654663, "learning_rate": 5.88179422970192e-07, "loss": 2.6772, "step": 3488 }, { "epoch": 0.966214345056771, "grad_norm": 1.10953950881958, "learning_rate": 5.786636045643112e-07, "loss": 2.8636, "step": 3489 }, { "epoch": 0.9664912766546663, "grad_norm": 0.9930030703544617, "learning_rate": 5.692251673416804e-07, "loss": 2.722, "step": 3490 }, { "epoch": 0.9667682082525616, "grad_norm": 1.4072052240371704, "learning_rate": 5.598641186484011e-07, "loss": 2.6757, "step": 3491 }, { "epoch": 0.9670451398504569, "grad_norm": 1.5075595378875732, "learning_rate": 5.505804657703228e-07, "loss": 2.6082, "step": 3492 }, { "epoch": 0.9673220714483522, "grad_norm": 1.1784552335739136, "learning_rate": 5.413742159330548e-07, "loss": 2.6231, "step": 3493 }, { "epoch": 0.9675990030462476, "grad_norm": 1.228947401046753, "learning_rate": 5.322453763019653e-07, "loss": 2.5158, "step": 3494 }, { "epoch": 0.9678759346441429, "grad_norm": 1.220169186592102, "learning_rate": 5.231939539821706e-07, "loss": 2.9059, "step": 3495 }, { "epoch": 0.9681528662420382, "grad_norm": 1.6655526161193848, "learning_rate": 5.142199560185357e-07, "loss": 2.9041, "step": 3496 }, { "epoch": 0.9684297978399335, "grad_norm": 1.830000400543213, "learning_rate": 5.053233893956621e-07, "loss": 2.8374, "step": 3497 }, { "epoch": 0.9687067294378289, "grad_norm": 1.6736990213394165, "learning_rate": 4.96504261037889e-07, "loss": 3.0461, "step": 3498 }, { "epoch": 0.9689836610357242, "grad_norm": 2.2138006687164307, "learning_rate": 4.877625778092809e-07, "loss": 3.2083, "step": 3499 }, { "epoch": 0.9692605926336195, "grad_norm": 3.2736976146698, "learning_rate": 4.790983465136401e-07, "loss": 3.673, "step": 3500 }, { "epoch": 0.9695375242315148, "grad_norm": 1.161478042602539, "learning_rate": 4.70511573894461e-07, "loss": 2.6695, "step": 3501 }, { "epoch": 0.9698144558294102, "grad_norm": 2.052083730697632, "learning_rate": 4.6200226663495326e-07, "loss": 2.6304, "step": 3502 }, { "epoch": 0.9700913874273055, "grad_norm": 1.8516638278961182, "learning_rate": 4.535704313580635e-07, "loss": 2.6462, "step": 3503 }, { "epoch": 0.9703683190252008, "grad_norm": 1.1855838298797607, "learning_rate": 4.4521607462640893e-07, "loss": 2.729, "step": 3504 }, { "epoch": 0.9706452506230961, "grad_norm": 1.260607361793518, "learning_rate": 4.3693920294232136e-07, "loss": 2.6698, "step": 3505 }, { "epoch": 0.9709221822209915, "grad_norm": 1.1047613620758057, "learning_rate": 4.2873982274781453e-07, "loss": 2.8026, "step": 3506 }, { "epoch": 0.9711991138188867, "grad_norm": 1.4178338050842285, "learning_rate": 4.2061794042460577e-07, "loss": 2.8202, "step": 3507 }, { "epoch": 0.971476045416782, "grad_norm": 1.8993240594863892, "learning_rate": 4.1257356229407184e-07, "loss": 2.9319, "step": 3508 }, { "epoch": 0.9717529770146773, "grad_norm": 1.7039676904678345, "learning_rate": 4.046066946172822e-07, "loss": 2.7073, "step": 3509 }, { "epoch": 0.9720299086125727, "grad_norm": 1.7674676179885864, "learning_rate": 3.967173435949656e-07, "loss": 2.7071, "step": 3510 }, { "epoch": 0.972306840210468, "grad_norm": 1.4723281860351562, "learning_rate": 3.889055153675547e-07, "loss": 2.6833, "step": 3511 }, { "epoch": 0.9725837718083633, "grad_norm": 1.910642147064209, "learning_rate": 3.81171216015086e-07, "loss": 3.0199, "step": 3512 }, { "epoch": 0.9728607034062586, "grad_norm": 0.9734626412391663, "learning_rate": 3.735144515572997e-07, "loss": 2.6637, "step": 3513 }, { "epoch": 0.973137635004154, "grad_norm": 1.7201459407806396, "learning_rate": 3.659352279535733e-07, "loss": 2.7869, "step": 3514 }, { "epoch": 0.9734145666020493, "grad_norm": 1.0668485164642334, "learning_rate": 3.5843355110294354e-07, "loss": 2.6127, "step": 3515 }, { "epoch": 0.9736914981999446, "grad_norm": 1.1746799945831299, "learning_rate": 3.510094268440844e-07, "loss": 2.762, "step": 3516 }, { "epoch": 0.9739684297978399, "grad_norm": 0.9444898962974548, "learning_rate": 3.4366286095528497e-07, "loss": 2.6246, "step": 3517 }, { "epoch": 0.9742453613957353, "grad_norm": 1.3913249969482422, "learning_rate": 3.3639385915451573e-07, "loss": 2.5367, "step": 3518 }, { "epoch": 0.9745222929936306, "grad_norm": 1.1144284009933472, "learning_rate": 3.292024270993399e-07, "loss": 2.727, "step": 3519 }, { "epoch": 0.9747992245915259, "grad_norm": 1.6452025175094604, "learning_rate": 3.220885703869803e-07, "loss": 3.2008, "step": 3520 }, { "epoch": 0.9750761561894212, "grad_norm": 0.894197940826416, "learning_rate": 3.1505229455424115e-07, "loss": 2.5495, "step": 3521 }, { "epoch": 0.9753530877873166, "grad_norm": 0.7937556505203247, "learning_rate": 3.0809360507757514e-07, "loss": 2.5862, "step": 3522 }, { "epoch": 0.9756300193852119, "grad_norm": 0.7896092534065247, "learning_rate": 3.0121250737304985e-07, "loss": 2.7862, "step": 3523 }, { "epoch": 0.9759069509831072, "grad_norm": 1.9969873428344727, "learning_rate": 2.9440900679631457e-07, "loss": 3.0699, "step": 3524 }, { "epoch": 0.9761838825810025, "grad_norm": 3.0025217533111572, "learning_rate": 2.876831086426557e-07, "loss": 3.7546, "step": 3525 }, { "epoch": 0.9764608141788979, "grad_norm": 1.1754149198532104, "learning_rate": 2.8103481814693023e-07, "loss": 2.7908, "step": 3526 }, { "epoch": 0.9767377457767932, "grad_norm": 0.6472751498222351, "learning_rate": 2.7446414048361015e-07, "loss": 2.5569, "step": 3527 }, { "epoch": 0.9770146773746885, "grad_norm": 1.319361686706543, "learning_rate": 2.6797108076677127e-07, "loss": 2.7662, "step": 3528 }, { "epoch": 0.9772916089725838, "grad_norm": 1.3683019876480103, "learning_rate": 2.615556440500377e-07, "loss": 2.8489, "step": 3529 }, { "epoch": 0.977568540570479, "grad_norm": 0.9338417053222656, "learning_rate": 2.55217835326671e-07, "loss": 2.7165, "step": 3530 }, { "epoch": 0.9778454721683744, "grad_norm": 1.454641580581665, "learning_rate": 2.4895765952948067e-07, "loss": 2.7523, "step": 3531 }, { "epoch": 0.9781224037662697, "grad_norm": 0.9676294326782227, "learning_rate": 2.42775121530836e-07, "loss": 2.7136, "step": 3532 }, { "epoch": 0.978399335364165, "grad_norm": 1.5383201837539673, "learning_rate": 2.3667022614273226e-07, "loss": 2.5843, "step": 3533 }, { "epoch": 0.9786762669620603, "grad_norm": 2.314225673675537, "learning_rate": 2.306429781166908e-07, "loss": 2.9224, "step": 3534 }, { "epoch": 0.9789531985599557, "grad_norm": 1.3681161403656006, "learning_rate": 2.2469338214382574e-07, "loss": 2.8437, "step": 3535 }, { "epoch": 0.979230130157851, "grad_norm": 0.9051282405853271, "learning_rate": 2.1882144285477746e-07, "loss": 2.7259, "step": 3536 }, { "epoch": 0.9795070617557463, "grad_norm": 1.2935221195220947, "learning_rate": 2.1302716481979012e-07, "loss": 2.7093, "step": 3537 }, { "epoch": 0.9797839933536416, "grad_norm": 2.2106070518493652, "learning_rate": 2.0731055254863408e-07, "loss": 3.1981, "step": 3538 }, { "epoch": 0.980060924951537, "grad_norm": 1.511698603630066, "learning_rate": 2.016716104906391e-07, "loss": 2.6531, "step": 3539 }, { "epoch": 0.9803378565494323, "grad_norm": 0.9122217297554016, "learning_rate": 1.9611034303468335e-07, "loss": 2.718, "step": 3540 }, { "epoch": 0.9806147881473276, "grad_norm": 1.3138456344604492, "learning_rate": 1.906267545091711e-07, "loss": 2.8714, "step": 3541 }, { "epoch": 0.9808917197452229, "grad_norm": 2.0339131355285645, "learning_rate": 1.8522084918208836e-07, "loss": 2.7607, "step": 3542 }, { "epoch": 0.9811686513431183, "grad_norm": 1.3019803762435913, "learning_rate": 1.7989263126093614e-07, "loss": 2.7719, "step": 3543 }, { "epoch": 0.9814455829410136, "grad_norm": 1.5723626613616943, "learning_rate": 1.7464210489273047e-07, "loss": 2.6476, "step": 3544 }, { "epoch": 0.9817225145389089, "grad_norm": 1.4210187196731567, "learning_rate": 1.694692741640469e-07, "loss": 2.8297, "step": 3545 }, { "epoch": 0.9819994461368042, "grad_norm": 2.068538188934326, "learning_rate": 1.6437414310098708e-07, "loss": 2.5935, "step": 3546 }, { "epoch": 0.9822763777346996, "grad_norm": 1.3514223098754883, "learning_rate": 1.5935671566916777e-07, "loss": 2.7994, "step": 3547 }, { "epoch": 0.9825533093325949, "grad_norm": 2.3161251544952393, "learning_rate": 1.5441699577372072e-07, "loss": 2.5894, "step": 3548 }, { "epoch": 0.9828302409304902, "grad_norm": 1.861956000328064, "learning_rate": 1.4955498725932604e-07, "loss": 3.2641, "step": 3549 }, { "epoch": 0.9831071725283855, "grad_norm": 3.413618564605713, "learning_rate": 1.4477069391014563e-07, "loss": 3.7602, "step": 3550 }, { "epoch": 0.9833841041262809, "grad_norm": 1.23565673828125, "learning_rate": 1.4006411944988973e-07, "loss": 2.8523, "step": 3551 }, { "epoch": 0.9836610357241762, "grad_norm": 1.245562195777893, "learning_rate": 1.3543526754173918e-07, "loss": 2.7221, "step": 3552 }, { "epoch": 0.9839379673220714, "grad_norm": 0.939725935459137, "learning_rate": 1.3088414178840104e-07, "loss": 2.708, "step": 3553 }, { "epoch": 0.9842148989199667, "grad_norm": 1.6134532690048218, "learning_rate": 1.2641074573209733e-07, "loss": 2.704, "step": 3554 }, { "epoch": 0.984491830517862, "grad_norm": 1.680004358291626, "learning_rate": 1.2201508285454298e-07, "loss": 3.0606, "step": 3555 }, { "epoch": 0.9847687621157574, "grad_norm": 1.0474539995193481, "learning_rate": 1.176971565769458e-07, "loss": 2.686, "step": 3556 }, { "epoch": 0.9850456937136527, "grad_norm": 1.3566621541976929, "learning_rate": 1.1345697026001745e-07, "loss": 2.6762, "step": 3557 }, { "epoch": 0.985322625311548, "grad_norm": 1.2527090311050415, "learning_rate": 1.092945272039625e-07, "loss": 2.7056, "step": 3558 }, { "epoch": 0.9855995569094433, "grad_norm": 1.1411058902740479, "learning_rate": 1.0520983064847833e-07, "loss": 2.6102, "step": 3559 }, { "epoch": 0.9858764885073387, "grad_norm": 1.2734065055847168, "learning_rate": 1.0120288377274412e-07, "loss": 2.9378, "step": 3560 }, { "epoch": 0.986153420105234, "grad_norm": 1.2071735858917236, "learning_rate": 9.727368969542072e-08, "loss": 2.6635, "step": 3561 }, { "epoch": 0.9864303517031293, "grad_norm": 1.6888424158096313, "learning_rate": 9.342225147467299e-08, "loss": 2.9836, "step": 3562 }, { "epoch": 0.9867072833010246, "grad_norm": 1.2385547161102295, "learning_rate": 8.964857210814748e-08, "loss": 2.8505, "step": 3563 }, { "epoch": 0.98698421489892, "grad_norm": 1.72456955909729, "learning_rate": 8.595265453292811e-08, "loss": 2.7266, "step": 3564 }, { "epoch": 0.9872611464968153, "grad_norm": 2.967880964279175, "learning_rate": 8.233450162562495e-08, "loss": 3.0941, "step": 3565 }, { "epoch": 0.9875380780947106, "grad_norm": 1.703004240989685, "learning_rate": 7.87941162023076e-08, "loss": 3.0268, "step": 3566 }, { "epoch": 0.9878150096926059, "grad_norm": 1.254521131515503, "learning_rate": 7.533150101849407e-08, "loss": 2.6175, "step": 3567 }, { "epoch": 0.9880919412905013, "grad_norm": 1.4605085849761963, "learning_rate": 7.194665876920636e-08, "loss": 2.9556, "step": 3568 }, { "epoch": 0.9883688728883966, "grad_norm": 1.6051185131072998, "learning_rate": 6.86395920889149e-08, "loss": 2.8252, "step": 3569 }, { "epoch": 0.9886458044862919, "grad_norm": 1.8399105072021484, "learning_rate": 6.541030355156075e-08, "loss": 2.5048, "step": 3570 }, { "epoch": 0.9889227360841872, "grad_norm": 1.030032992362976, "learning_rate": 6.225879567055559e-08, "loss": 2.5485, "step": 3571 }, { "epoch": 0.9891996676820826, "grad_norm": 1.6225028038024902, "learning_rate": 5.918507089877068e-08, "loss": 2.7388, "step": 3572 }, { "epoch": 0.9894765992799779, "grad_norm": 1.342437505722046, "learning_rate": 5.6189131628525684e-08, "loss": 2.5809, "step": 3573 }, { "epoch": 0.9897535308778732, "grad_norm": 1.1848745346069336, "learning_rate": 5.327098019159982e-08, "loss": 2.746, "step": 3574 }, { "epoch": 0.9900304624757685, "grad_norm": 2.597025156021118, "learning_rate": 5.043061885925404e-08, "loss": 3.5433, "step": 3575 }, { "epoch": 0.9903073940736639, "grad_norm": 1.5176355838775635, "learning_rate": 4.7668049842175544e-08, "loss": 2.7927, "step": 3576 }, { "epoch": 0.9905843256715591, "grad_norm": 1.9392883777618408, "learning_rate": 4.498327529051105e-08, "loss": 2.9864, "step": 3577 }, { "epoch": 0.9908612572694544, "grad_norm": 1.6624773740768433, "learning_rate": 4.237629729387793e-08, "loss": 2.6176, "step": 3578 }, { "epoch": 0.9911381888673497, "grad_norm": 1.1208992004394531, "learning_rate": 3.9847117881308685e-08, "loss": 2.6707, "step": 3579 }, { "epoch": 0.991415120465245, "grad_norm": 1.0683854818344116, "learning_rate": 3.739573902131754e-08, "loss": 2.5527, "step": 3580 }, { "epoch": 0.9916920520631404, "grad_norm": 2.0292575359344482, "learning_rate": 3.502216262184499e-08, "loss": 2.7259, "step": 3581 }, { "epoch": 0.9919689836610357, "grad_norm": 0.834223210811615, "learning_rate": 3.272639053029103e-08, "loss": 2.5719, "step": 3582 }, { "epoch": 0.992245915258931, "grad_norm": 1.624928593635559, "learning_rate": 3.05084245334708e-08, "loss": 2.9334, "step": 3583 }, { "epoch": 0.9925228468568263, "grad_norm": 1.3256527185440063, "learning_rate": 2.8368266357681194e-08, "loss": 2.6071, "step": 3584 }, { "epoch": 0.9927997784547217, "grad_norm": 1.2522574663162231, "learning_rate": 2.6305917668645318e-08, "loss": 2.7054, "step": 3585 }, { "epoch": 0.993076710052617, "grad_norm": 1.1217739582061768, "learning_rate": 2.432138007150142e-08, "loss": 3.0231, "step": 3586 }, { "epoch": 0.9933536416505123, "grad_norm": 2.0638465881347656, "learning_rate": 2.2414655110858383e-08, "loss": 3.0328, "step": 3587 }, { "epoch": 0.9936305732484076, "grad_norm": 1.2661341428756714, "learning_rate": 2.0585744270740226e-08, "loss": 2.5907, "step": 3588 }, { "epoch": 0.993907504846303, "grad_norm": 1.2835946083068848, "learning_rate": 1.8834648974630497e-08, "loss": 2.7709, "step": 3589 }, { "epoch": 0.9941844364441983, "grad_norm": 2.156313180923462, "learning_rate": 1.7161370585427883e-08, "loss": 2.6873, "step": 3590 }, { "epoch": 0.9944613680420936, "grad_norm": 1.453169345855713, "learning_rate": 1.55659104054795e-08, "loss": 2.6949, "step": 3591 }, { "epoch": 0.9947382996399889, "grad_norm": 1.0746440887451172, "learning_rate": 1.4048269676536497e-08, "loss": 2.8744, "step": 3592 }, { "epoch": 0.9950152312378843, "grad_norm": 1.2922847270965576, "learning_rate": 1.2608449579820658e-08, "loss": 2.6111, "step": 3593 }, { "epoch": 0.9952921628357796, "grad_norm": 1.3704134225845337, "learning_rate": 1.12464512359578e-08, "loss": 2.803, "step": 3594 }, { "epoch": 0.9955690944336749, "grad_norm": 2.0052356719970703, "learning_rate": 9.96227570501107e-09, "loss": 2.5901, "step": 3595 }, { "epoch": 0.9958460260315702, "grad_norm": 1.3504281044006348, "learning_rate": 8.755923986480952e-09, "loss": 2.5067, "step": 3596 }, { "epoch": 0.9961229576294656, "grad_norm": 1.0229085683822632, "learning_rate": 7.627397019294158e-09, "loss": 2.6755, "step": 3597 }, { "epoch": 0.9963998892273609, "grad_norm": 1.2293682098388672, "learning_rate": 6.576695681792533e-09, "loss": 2.8801, "step": 3598 }, { "epoch": 0.9966768208252562, "grad_norm": 1.2077336311340332, "learning_rate": 5.603820791755254e-09, "loss": 3.0605, "step": 3599 }, { "epoch": 0.9969537524231514, "grad_norm": 4.453784465789795, "learning_rate": 4.708773106376629e-09, "loss": 4.3675, "step": 3600 }, { "epoch": 0.9972306840210468, "grad_norm": 1.1345912218093872, "learning_rate": 3.891553322299401e-09, "loss": 2.6014, "step": 3601 }, { "epoch": 0.9975076156189421, "grad_norm": 1.3901625871658325, "learning_rate": 3.152162075581444e-09, "loss": 2.6007, "step": 3602 }, { "epoch": 0.9977845472168374, "grad_norm": 1.705886960029602, "learning_rate": 2.4905999416957593e-09, "loss": 2.8874, "step": 3603 }, { "epoch": 0.9980614788147327, "grad_norm": 1.9912961721420288, "learning_rate": 1.9068674355415815e-09, "loss": 2.6562, "step": 3604 }, { "epoch": 0.998338410412628, "grad_norm": 1.2312753200531006, "learning_rate": 1.4009650114554796e-09, "loss": 2.5166, "step": 3605 }, { "epoch": 0.9986153420105234, "grad_norm": 0.9925754070281982, "learning_rate": 9.728930631780487e-10, "loss": 2.7784, "step": 3606 }, { "epoch": 0.9988922736084187, "grad_norm": 1.4208216667175293, "learning_rate": 6.226519238983209e-10, "loss": 2.6608, "step": 3607 }, { "epoch": 0.999169205206314, "grad_norm": 1.1245412826538086, "learning_rate": 3.502418662093554e-10, "loss": 2.5002, "step": 3608 }, { "epoch": 0.9994461368042094, "grad_norm": 1.6515275239944458, "learning_rate": 1.5566310213044333e-10, "loss": 2.5429, "step": 3609 }, { "epoch": 0.9997230684021047, "grad_norm": 1.8428101539611816, "learning_rate": 3.891578310710742e-11, "loss": 2.9925, "step": 3610 }, { "epoch": 1.0, "grad_norm": 2.526169776916504, "learning_rate": 0.0, "loss": 3.4422, "step": 3611 } ], "logging_steps": 1, "max_steps": 3611, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 903, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.452436247864672e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }