{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984840828701365, "eval_steps": 500, "global_step": 494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00202122283981809, "grad_norm": 4.962096691131592, "learning_rate": 1.0000000000000002e-06, "loss": 1.6441, "step": 1 }, { "epoch": 0.00404244567963618, "grad_norm": 5.593231678009033, "learning_rate": 2.0000000000000003e-06, "loss": 1.8613, "step": 2 }, { "epoch": 0.00606366851945427, "grad_norm": 5.743273735046387, "learning_rate": 3e-06, "loss": 1.9149, "step": 3 }, { "epoch": 0.00808489135927236, "grad_norm": 5.530357360839844, "learning_rate": 4.000000000000001e-06, "loss": 2.0057, "step": 4 }, { "epoch": 0.01010611419909045, "grad_norm": 6.651333332061768, "learning_rate": 5e-06, "loss": 1.9692, "step": 5 }, { "epoch": 0.01212733703890854, "grad_norm": 6.602941513061523, "learning_rate": 6e-06, "loss": 2.4343, "step": 6 }, { "epoch": 0.01414855987872663, "grad_norm": 6.895396709442139, "learning_rate": 7.000000000000001e-06, "loss": 2.26, "step": 7 }, { "epoch": 0.01616978271854472, "grad_norm": 7.525021553039551, "learning_rate": 8.000000000000001e-06, "loss": 2.2767, "step": 8 }, { "epoch": 0.01819100555836281, "grad_norm": 7.5351762771606445, "learning_rate": 9e-06, "loss": 2.7438, "step": 9 }, { "epoch": 0.0202122283981809, "grad_norm": 7.658970832824707, "learning_rate": 1e-05, "loss": 2.764, "step": 10 }, { "epoch": 0.02223345123799899, "grad_norm": 8.046220779418945, "learning_rate": 1.1000000000000001e-05, "loss": 2.3894, "step": 11 }, { "epoch": 0.02425467407781708, "grad_norm": 8.3847017288208, "learning_rate": 1.2e-05, "loss": 2.5517, "step": 12 }, { "epoch": 0.02627589691763517, "grad_norm": 8.96577262878418, "learning_rate": 1.3000000000000001e-05, "loss": 2.2152, "step": 13 }, { "epoch": 0.02829711975745326, "grad_norm": 8.063103675842285, "learning_rate": 1.4000000000000001e-05, "loss": 2.1623, "step": 14 }, { "epoch": 0.03031834259727135, "grad_norm": 8.5758638381958, "learning_rate": 1.5e-05, "loss": 2.4497, "step": 15 }, { "epoch": 0.03233956543708944, "grad_norm": 8.477540969848633, "learning_rate": 1.6000000000000003e-05, "loss": 2.3183, "step": 16 }, { "epoch": 0.03436078827690753, "grad_norm": 8.865395545959473, "learning_rate": 1.7000000000000003e-05, "loss": 2.2435, "step": 17 }, { "epoch": 0.03638201111672562, "grad_norm": 8.725611686706543, "learning_rate": 1.8e-05, "loss": 2.1894, "step": 18 }, { "epoch": 0.03840323395654371, "grad_norm": 8.353998184204102, "learning_rate": 1.9e-05, "loss": 2.0811, "step": 19 }, { "epoch": 0.0404244567963618, "grad_norm": 8.999526977539062, "learning_rate": 2e-05, "loss": 2.2778, "step": 20 }, { "epoch": 0.04244567963617989, "grad_norm": 8.592598915100098, "learning_rate": 2.1e-05, "loss": 2.3943, "step": 21 }, { "epoch": 0.04446690247599798, "grad_norm": 7.57433557510376, "learning_rate": 2.2000000000000003e-05, "loss": 2.1125, "step": 22 }, { "epoch": 0.046488125315816066, "grad_norm": 8.0515775680542, "learning_rate": 2.3000000000000003e-05, "loss": 1.9887, "step": 23 }, { "epoch": 0.04850934815563416, "grad_norm": 7.530181884765625, "learning_rate": 2.4e-05, "loss": 2.2077, "step": 24 }, { "epoch": 0.050530570995452245, "grad_norm": 6.949326992034912, "learning_rate": 2.5e-05, "loss": 2.049, "step": 25 }, { "epoch": 0.05255179383527034, "grad_norm": 7.002259254455566, "learning_rate": 2.6000000000000002e-05, "loss": 1.9994, "step": 26 }, { "epoch": 0.05457301667508843, "grad_norm": 7.145877838134766, "learning_rate": 2.7000000000000002e-05, "loss": 1.7604, "step": 27 }, { "epoch": 0.05659423951490652, "grad_norm": 7.082208156585693, "learning_rate": 2.8000000000000003e-05, "loss": 2.0095, "step": 28 }, { "epoch": 0.05861546235472461, "grad_norm": 6.70477294921875, "learning_rate": 2.9e-05, "loss": 1.6048, "step": 29 }, { "epoch": 0.0606366851945427, "grad_norm": 8.728182792663574, "learning_rate": 3e-05, "loss": 2.2502, "step": 30 }, { "epoch": 0.06265790803436079, "grad_norm": 8.69613265991211, "learning_rate": 3.1e-05, "loss": 2.0332, "step": 31 }, { "epoch": 0.06467913087417888, "grad_norm": 8.603922843933105, "learning_rate": 3.2000000000000005e-05, "loss": 2.3491, "step": 32 }, { "epoch": 0.06670035371399696, "grad_norm": 7.335165977478027, "learning_rate": 3.3e-05, "loss": 1.7337, "step": 33 }, { "epoch": 0.06872157655381506, "grad_norm": 8.186851501464844, "learning_rate": 3.4000000000000007e-05, "loss": 1.9627, "step": 34 }, { "epoch": 0.07074279939363315, "grad_norm": 7.595352649688721, "learning_rate": 3.5e-05, "loss": 1.5682, "step": 35 }, { "epoch": 0.07276402223345124, "grad_norm": 7.205020904541016, "learning_rate": 3.6e-05, "loss": 1.7703, "step": 36 }, { "epoch": 0.07478524507326932, "grad_norm": 7.933116436004639, "learning_rate": 3.7e-05, "loss": 1.8315, "step": 37 }, { "epoch": 0.07680646791308741, "grad_norm": 7.590288162231445, "learning_rate": 3.8e-05, "loss": 1.8305, "step": 38 }, { "epoch": 0.07882769075290551, "grad_norm": 7.468386650085449, "learning_rate": 3.9000000000000006e-05, "loss": 1.6923, "step": 39 }, { "epoch": 0.0808489135927236, "grad_norm": 8.244772911071777, "learning_rate": 4e-05, "loss": 2.0191, "step": 40 }, { "epoch": 0.0828701364325417, "grad_norm": 8.714116096496582, "learning_rate": 4.1e-05, "loss": 1.9665, "step": 41 }, { "epoch": 0.08489135927235977, "grad_norm": 8.570602416992188, "learning_rate": 4.2e-05, "loss": 1.9418, "step": 42 }, { "epoch": 0.08691258211217787, "grad_norm": 7.338136196136475, "learning_rate": 4.3e-05, "loss": 1.1443, "step": 43 }, { "epoch": 0.08893380495199596, "grad_norm": 8.277491569519043, "learning_rate": 4.4000000000000006e-05, "loss": 1.5882, "step": 44 }, { "epoch": 0.09095502779181405, "grad_norm": 7.305893421173096, "learning_rate": 4.5e-05, "loss": 1.1692, "step": 45 }, { "epoch": 0.09297625063163213, "grad_norm": 7.610684871673584, "learning_rate": 4.600000000000001e-05, "loss": 1.3468, "step": 46 }, { "epoch": 0.09499747347145023, "grad_norm": 7.890575885772705, "learning_rate": 4.7e-05, "loss": 1.1566, "step": 47 }, { "epoch": 0.09701869631126832, "grad_norm": 8.964077949523926, "learning_rate": 4.8e-05, "loss": 1.635, "step": 48 }, { "epoch": 0.09903991915108641, "grad_norm": 9.524826049804688, "learning_rate": 4.9e-05, "loss": 1.4733, "step": 49 }, { "epoch": 0.10106114199090449, "grad_norm": 9.499811172485352, "learning_rate": 5e-05, "loss": 1.5931, "step": 50 }, { "epoch": 0.10308236483072258, "grad_norm": 5.5525898933410645, "learning_rate": 5.1000000000000006e-05, "loss": 1.5699, "step": 51 }, { "epoch": 0.10510358767054068, "grad_norm": 4.883670330047607, "learning_rate": 5.2000000000000004e-05, "loss": 1.3653, "step": 52 }, { "epoch": 0.10712481051035877, "grad_norm": 3.8409433364868164, "learning_rate": 5.300000000000001e-05, "loss": 1.2819, "step": 53 }, { "epoch": 0.10914603335017686, "grad_norm": 4.300581932067871, "learning_rate": 5.4000000000000005e-05, "loss": 1.5477, "step": 54 }, { "epoch": 0.11116725618999494, "grad_norm": 3.8485517501831055, "learning_rate": 5.500000000000001e-05, "loss": 1.4893, "step": 55 }, { "epoch": 0.11318847902981304, "grad_norm": 4.364621639251709, "learning_rate": 5.6000000000000006e-05, "loss": 1.6671, "step": 56 }, { "epoch": 0.11520970186963113, "grad_norm": 4.246096134185791, "learning_rate": 5.6999999999999996e-05, "loss": 1.6163, "step": 57 }, { "epoch": 0.11723092470944922, "grad_norm": 4.382571697235107, "learning_rate": 5.8e-05, "loss": 1.6223, "step": 58 }, { "epoch": 0.1192521475492673, "grad_norm": 4.406397819519043, "learning_rate": 5.9e-05, "loss": 1.5827, "step": 59 }, { "epoch": 0.1212733703890854, "grad_norm": 5.563169002532959, "learning_rate": 6e-05, "loss": 2.1763, "step": 60 }, { "epoch": 0.12329459322890349, "grad_norm": 5.388707160949707, "learning_rate": 6.1e-05, "loss": 1.9962, "step": 61 }, { "epoch": 0.12531581606872158, "grad_norm": 4.910810947418213, "learning_rate": 6.2e-05, "loss": 1.6651, "step": 62 }, { "epoch": 0.12733703890853967, "grad_norm": 5.668425559997559, "learning_rate": 6.3e-05, "loss": 1.8851, "step": 63 }, { "epoch": 0.12935826174835777, "grad_norm": 5.245799541473389, "learning_rate": 6.400000000000001e-05, "loss": 1.7111, "step": 64 }, { "epoch": 0.13137948458817586, "grad_norm": 5.701318264007568, "learning_rate": 6.500000000000001e-05, "loss": 2.0466, "step": 65 }, { "epoch": 0.13340070742799393, "grad_norm": 6.002028942108154, "learning_rate": 6.6e-05, "loss": 1.9546, "step": 66 }, { "epoch": 0.13542193026781202, "grad_norm": 5.405800819396973, "learning_rate": 6.7e-05, "loss": 1.6626, "step": 67 }, { "epoch": 0.1374431531076301, "grad_norm": 5.076318740844727, "learning_rate": 6.800000000000001e-05, "loss": 1.726, "step": 68 }, { "epoch": 0.1394643759474482, "grad_norm": 5.462904930114746, "learning_rate": 6.9e-05, "loss": 1.6945, "step": 69 }, { "epoch": 0.1414855987872663, "grad_norm": 5.7171783447265625, "learning_rate": 7e-05, "loss": 1.9263, "step": 70 }, { "epoch": 0.1435068216270844, "grad_norm": 5.716061592102051, "learning_rate": 7.1e-05, "loss": 1.4844, "step": 71 }, { "epoch": 0.14552804446690248, "grad_norm": 5.982063293457031, "learning_rate": 7.2e-05, "loss": 1.8287, "step": 72 }, { "epoch": 0.14754926730672058, "grad_norm": 5.261101722717285, "learning_rate": 7.3e-05, "loss": 1.5783, "step": 73 }, { "epoch": 0.14957049014653864, "grad_norm": 5.717907428741455, "learning_rate": 7.4e-05, "loss": 1.4726, "step": 74 }, { "epoch": 0.15159171298635674, "grad_norm": 5.534896373748779, "learning_rate": 7.500000000000001e-05, "loss": 1.5899, "step": 75 }, { "epoch": 0.15361293582617483, "grad_norm": 6.794299125671387, "learning_rate": 7.6e-05, "loss": 1.7261, "step": 76 }, { "epoch": 0.15563415866599292, "grad_norm": 6.486598014831543, "learning_rate": 7.7e-05, "loss": 1.7126, "step": 77 }, { "epoch": 0.15765538150581102, "grad_norm": 6.078768730163574, "learning_rate": 7.800000000000001e-05, "loss": 1.5131, "step": 78 }, { "epoch": 0.1596766043456291, "grad_norm": 7.305526256561279, "learning_rate": 7.900000000000001e-05, "loss": 2.1216, "step": 79 }, { "epoch": 0.1616978271854472, "grad_norm": 6.43522310256958, "learning_rate": 8e-05, "loss": 1.5078, "step": 80 }, { "epoch": 0.1637190500252653, "grad_norm": 6.868276119232178, "learning_rate": 8.1e-05, "loss": 1.6478, "step": 81 }, { "epoch": 0.1657402728650834, "grad_norm": 6.5684051513671875, "learning_rate": 8.2e-05, "loss": 1.554, "step": 82 }, { "epoch": 0.16776149570490145, "grad_norm": 7.237800121307373, "learning_rate": 8.3e-05, "loss": 1.776, "step": 83 }, { "epoch": 0.16978271854471955, "grad_norm": 10.40848445892334, "learning_rate": 8.4e-05, "loss": 1.3637, "step": 84 }, { "epoch": 0.17180394138453764, "grad_norm": 7.5290846824646, "learning_rate": 8.5e-05, "loss": 1.8149, "step": 85 }, { "epoch": 0.17382516422435573, "grad_norm": 6.535577297210693, "learning_rate": 8.6e-05, "loss": 1.5878, "step": 86 }, { "epoch": 0.17584638706417383, "grad_norm": 6.797990322113037, "learning_rate": 8.7e-05, "loss": 1.6962, "step": 87 }, { "epoch": 0.17786760990399192, "grad_norm": 8.046355247497559, "learning_rate": 8.800000000000001e-05, "loss": 1.6756, "step": 88 }, { "epoch": 0.17988883274381, "grad_norm": 6.245670318603516, "learning_rate": 8.900000000000001e-05, "loss": 1.4684, "step": 89 }, { "epoch": 0.1819100555836281, "grad_norm": 6.456711769104004, "learning_rate": 9e-05, "loss": 1.4074, "step": 90 }, { "epoch": 0.1839312784234462, "grad_norm": 6.714746475219727, "learning_rate": 9.1e-05, "loss": 1.4863, "step": 91 }, { "epoch": 0.18595250126326426, "grad_norm": 8.266717910766602, "learning_rate": 9.200000000000001e-05, "loss": 1.8342, "step": 92 }, { "epoch": 0.18797372410308236, "grad_norm": 7.780879497528076, "learning_rate": 9.300000000000001e-05, "loss": 1.9541, "step": 93 }, { "epoch": 0.18999494694290045, "grad_norm": 6.307599067687988, "learning_rate": 9.4e-05, "loss": 1.2528, "step": 94 }, { "epoch": 0.19201616978271854, "grad_norm": 7.502289295196533, "learning_rate": 9.5e-05, "loss": 1.5187, "step": 95 }, { "epoch": 0.19403739262253664, "grad_norm": 6.638027667999268, "learning_rate": 9.6e-05, "loss": 1.2167, "step": 96 }, { "epoch": 0.19605861546235473, "grad_norm": 7.040843963623047, "learning_rate": 9.7e-05, "loss": 1.3433, "step": 97 }, { "epoch": 0.19807983830217282, "grad_norm": 6.591531753540039, "learning_rate": 9.8e-05, "loss": 1.1483, "step": 98 }, { "epoch": 0.20010106114199092, "grad_norm": 8.779806137084961, "learning_rate": 9.900000000000001e-05, "loss": 1.8501, "step": 99 }, { "epoch": 0.20212228398180898, "grad_norm": 8.384221076965332, "learning_rate": 0.0001, "loss": 1.5389, "step": 100 }, { "epoch": 0.20414350682162707, "grad_norm": 4.096580982208252, "learning_rate": 9.999841055681184e-05, "loss": 1.4128, "step": 101 }, { "epoch": 0.20616472966144517, "grad_norm": 4.011407375335693, "learning_rate": 9.999364232830052e-05, "loss": 1.2588, "step": 102 }, { "epoch": 0.20818595250126326, "grad_norm": 3.9498050212860107, "learning_rate": 9.99856956176192e-05, "loss": 1.4214, "step": 103 }, { "epoch": 0.21020717534108135, "grad_norm": 3.8423962593078613, "learning_rate": 9.997457093000164e-05, "loss": 1.7436, "step": 104 }, { "epoch": 0.21222839818089945, "grad_norm": 3.859107255935669, "learning_rate": 9.996026897273024e-05, "loss": 1.5557, "step": 105 }, { "epoch": 0.21424962102071754, "grad_norm": 4.101138591766357, "learning_rate": 9.994279065509093e-05, "loss": 1.6637, "step": 106 }, { "epoch": 0.21627084386053563, "grad_norm": 4.0843963623046875, "learning_rate": 9.992213708831543e-05, "loss": 1.6069, "step": 107 }, { "epoch": 0.21829206670035373, "grad_norm": 4.65712833404541, "learning_rate": 9.989830958551057e-05, "loss": 1.8503, "step": 108 }, { "epoch": 0.2203132895401718, "grad_norm": 5.231738090515137, "learning_rate": 9.987130966157486e-05, "loss": 1.7857, "step": 109 }, { "epoch": 0.22233451237998988, "grad_norm": 5.475664138793945, "learning_rate": 9.984113903310206e-05, "loss": 2.1317, "step": 110 }, { "epoch": 0.22435573521980798, "grad_norm": 5.313016414642334, "learning_rate": 9.98077996182722e-05, "loss": 2.0104, "step": 111 }, { "epoch": 0.22637695805962607, "grad_norm": 4.913154125213623, "learning_rate": 9.97712935367295e-05, "loss": 1.6801, "step": 112 }, { "epoch": 0.22839818089944416, "grad_norm": 5.323668479919434, "learning_rate": 9.973162310944768e-05, "loss": 1.8818, "step": 113 }, { "epoch": 0.23041940373926226, "grad_norm": 5.191682815551758, "learning_rate": 9.968879085858234e-05, "loss": 1.6902, "step": 114 }, { "epoch": 0.23244062657908035, "grad_norm": 5.358806133270264, "learning_rate": 9.964279950731066e-05, "loss": 1.5777, "step": 115 }, { "epoch": 0.23446184941889844, "grad_norm": 4.940889358520508, "learning_rate": 9.959365197965824e-05, "loss": 1.4829, "step": 116 }, { "epoch": 0.23648307225871654, "grad_norm": 5.8027191162109375, "learning_rate": 9.954135140031321e-05, "loss": 1.743, "step": 117 }, { "epoch": 0.2385042950985346, "grad_norm": 6.13554048538208, "learning_rate": 9.948590109442754e-05, "loss": 1.6588, "step": 118 }, { "epoch": 0.2405255179383527, "grad_norm": 5.018206596374512, "learning_rate": 9.942730458740568e-05, "loss": 1.4198, "step": 119 }, { "epoch": 0.2425467407781708, "grad_norm": 6.798793792724609, "learning_rate": 9.936556560468037e-05, "loss": 1.9842, "step": 120 }, { "epoch": 0.24456796361798888, "grad_norm": 5.7152323722839355, "learning_rate": 9.930068807147584e-05, "loss": 1.739, "step": 121 }, { "epoch": 0.24658918645780697, "grad_norm": 5.72061824798584, "learning_rate": 9.923267611255825e-05, "loss": 1.5452, "step": 122 }, { "epoch": 0.24861040929762507, "grad_norm": 5.586998462677002, "learning_rate": 9.916153405197332e-05, "loss": 1.5731, "step": 123 }, { "epoch": 0.25063163213744316, "grad_norm": 5.702105522155762, "learning_rate": 9.908726641277167e-05, "loss": 1.8093, "step": 124 }, { "epoch": 0.25265285497726125, "grad_norm": 6.596114158630371, "learning_rate": 9.9009877916721e-05, "loss": 1.6054, "step": 125 }, { "epoch": 0.25467407781707935, "grad_norm": 5.10860013961792, "learning_rate": 9.892937348400601e-05, "loss": 1.4648, "step": 126 }, { "epoch": 0.25669530065689744, "grad_norm": 6.096838474273682, "learning_rate": 9.88457582329156e-05, "loss": 1.5438, "step": 127 }, { "epoch": 0.25871652349671553, "grad_norm": 6.351302146911621, "learning_rate": 9.875903747951742e-05, "loss": 1.7681, "step": 128 }, { "epoch": 0.2607377463365336, "grad_norm": 5.202528476715088, "learning_rate": 9.866921673731992e-05, "loss": 1.2863, "step": 129 }, { "epoch": 0.2627589691763517, "grad_norm": 6.953995227813721, "learning_rate": 9.857630171692174e-05, "loss": 2.0508, "step": 130 }, { "epoch": 0.26478019201616976, "grad_norm": 6.509955883026123, "learning_rate": 9.848029832564875e-05, "loss": 1.6179, "step": 131 }, { "epoch": 0.26680141485598785, "grad_norm": 6.635447025299072, "learning_rate": 9.838121266717839e-05, "loss": 1.7545, "step": 132 }, { "epoch": 0.26882263769580594, "grad_norm": 7.2970099449157715, "learning_rate": 9.827905104115166e-05, "loss": 1.7671, "step": 133 }, { "epoch": 0.27084386053562404, "grad_norm": 5.802951335906982, "learning_rate": 9.817381994277261e-05, "loss": 1.4303, "step": 134 }, { "epoch": 0.27286508337544213, "grad_norm": 5.940200328826904, "learning_rate": 9.80655260623953e-05, "loss": 1.6777, "step": 135 }, { "epoch": 0.2748863062152602, "grad_norm": 6.702271938323975, "learning_rate": 9.795417628509857e-05, "loss": 1.7169, "step": 136 }, { "epoch": 0.2769075290550783, "grad_norm": 5.698646068572998, "learning_rate": 9.783977769024821e-05, "loss": 1.4602, "step": 137 }, { "epoch": 0.2789287518948964, "grad_norm": 7.149135589599609, "learning_rate": 9.772233755104694e-05, "loss": 1.6813, "step": 138 }, { "epoch": 0.2809499747347145, "grad_norm": 6.527937412261963, "learning_rate": 9.760186333407189e-05, "loss": 1.5575, "step": 139 }, { "epoch": 0.2829711975745326, "grad_norm": 9.34347152709961, "learning_rate": 9.747836269880003e-05, "loss": 1.6409, "step": 140 }, { "epoch": 0.2849924204143507, "grad_norm": 5.21145486831665, "learning_rate": 9.735184349712109e-05, "loss": 1.0628, "step": 141 }, { "epoch": 0.2870136432541688, "grad_norm": 5.601922988891602, "learning_rate": 9.722231377283841e-05, "loss": 1.2334, "step": 142 }, { "epoch": 0.2890348660939869, "grad_norm": 6.080234050750732, "learning_rate": 9.708978176115751e-05, "loss": 1.1516, "step": 143 }, { "epoch": 0.29105608893380497, "grad_norm": 6.611027717590332, "learning_rate": 9.695425588816249e-05, "loss": 1.3611, "step": 144 }, { "epoch": 0.29307731177362306, "grad_norm": 5.696514129638672, "learning_rate": 9.681574477028039e-05, "loss": 1.0987, "step": 145 }, { "epoch": 0.29509853461344115, "grad_norm": 8.035621643066406, "learning_rate": 9.667425721373332e-05, "loss": 1.7412, "step": 146 }, { "epoch": 0.29711975745325925, "grad_norm": 8.804828643798828, "learning_rate": 9.65298022139786e-05, "loss": 1.7839, "step": 147 }, { "epoch": 0.2991409802930773, "grad_norm": 6.8422064781188965, "learning_rate": 9.638238895513687e-05, "loss": 1.3999, "step": 148 }, { "epoch": 0.3011622031328954, "grad_norm": 9.455986022949219, "learning_rate": 9.623202680940811e-05, "loss": 1.8304, "step": 149 }, { "epoch": 0.30318342597271347, "grad_norm": 7.243097305297852, "learning_rate": 9.607872533647584e-05, "loss": 1.2319, "step": 150 }, { "epoch": 0.30520464881253156, "grad_norm": 3.3011507987976074, "learning_rate": 9.592249428289934e-05, "loss": 1.2937, "step": 151 }, { "epoch": 0.30722587165234966, "grad_norm": 3.4213240146636963, "learning_rate": 9.5763343581494e-05, "loss": 1.4224, "step": 152 }, { "epoch": 0.30924709449216775, "grad_norm": 2.8826541900634766, "learning_rate": 9.56012833506997e-05, "loss": 1.0413, "step": 153 }, { "epoch": 0.31126831733198584, "grad_norm": 3.137408494949341, "learning_rate": 9.543632389393767e-05, "loss": 1.3462, "step": 154 }, { "epoch": 0.31328954017180394, "grad_norm": 3.6072700023651123, "learning_rate": 9.52684756989553e-05, "loss": 1.4831, "step": 155 }, { "epoch": 0.31531076301162203, "grad_norm": 3.7660746574401855, "learning_rate": 9.509774943715939e-05, "loss": 1.6384, "step": 156 }, { "epoch": 0.3173319858514401, "grad_norm": 4.279458522796631, "learning_rate": 9.492415596293769e-05, "loss": 1.8637, "step": 157 }, { "epoch": 0.3193532086912582, "grad_norm": 3.9801113605499268, "learning_rate": 9.474770631296881e-05, "loss": 1.47, "step": 158 }, { "epoch": 0.3213744315310763, "grad_norm": 4.086119174957275, "learning_rate": 9.456841170552053e-05, "loss": 1.7968, "step": 159 }, { "epoch": 0.3233956543708944, "grad_norm": 3.968665838241577, "learning_rate": 9.438628353973653e-05, "loss": 1.4681, "step": 160 }, { "epoch": 0.3254168772107125, "grad_norm": 4.774650573730469, "learning_rate": 9.420133339491171e-05, "loss": 1.9598, "step": 161 }, { "epoch": 0.3274381000505306, "grad_norm": 4.956033706665039, "learning_rate": 9.401357302975599e-05, "loss": 1.7753, "step": 162 }, { "epoch": 0.3294593228903487, "grad_norm": 4.571745872497559, "learning_rate": 9.382301438164672e-05, "loss": 1.5018, "step": 163 }, { "epoch": 0.3314805457301668, "grad_norm": 4.879904747009277, "learning_rate": 9.362966956586969e-05, "loss": 1.795, "step": 164 }, { "epoch": 0.3335017685699848, "grad_norm": 4.576776027679443, "learning_rate": 9.343355087484894e-05, "loss": 1.6005, "step": 165 }, { "epoch": 0.3355229914098029, "grad_norm": 4.379726409912109, "learning_rate": 9.323467077736511e-05, "loss": 1.3807, "step": 166 }, { "epoch": 0.337544214249621, "grad_norm": 5.608097553253174, "learning_rate": 9.303304191776291e-05, "loss": 1.7641, "step": 167 }, { "epoch": 0.3395654370894391, "grad_norm": 4.803214073181152, "learning_rate": 9.282867711514702e-05, "loss": 1.5029, "step": 168 }, { "epoch": 0.3415866599292572, "grad_norm": 5.842015266418457, "learning_rate": 9.262158936256717e-05, "loss": 1.9833, "step": 169 }, { "epoch": 0.3436078827690753, "grad_norm": 5.395236968994141, "learning_rate": 9.241179182619206e-05, "loss": 1.6689, "step": 170 }, { "epoch": 0.34562910560889337, "grad_norm": 6.057033061981201, "learning_rate": 9.219929784447231e-05, "loss": 1.9053, "step": 171 }, { "epoch": 0.34765032844871147, "grad_norm": 5.41602897644043, "learning_rate": 9.19841209272924e-05, "loss": 1.56, "step": 172 }, { "epoch": 0.34967155128852956, "grad_norm": 5.450588703155518, "learning_rate": 9.17662747551117e-05, "loss": 1.5914, "step": 173 }, { "epoch": 0.35169277412834765, "grad_norm": 5.975580215454102, "learning_rate": 9.154577317809482e-05, "loss": 1.7152, "step": 174 }, { "epoch": 0.35371399696816574, "grad_norm": 5.361056804656982, "learning_rate": 9.132263021523096e-05, "loss": 1.804, "step": 175 }, { "epoch": 0.35573521980798384, "grad_norm": 5.825102806091309, "learning_rate": 9.109686005344258e-05, "loss": 1.5358, "step": 176 }, { "epoch": 0.35775644264780193, "grad_norm": 5.946747779846191, "learning_rate": 9.086847704668351e-05, "loss": 1.964, "step": 177 }, { "epoch": 0.35977766548762, "grad_norm": 5.494548320770264, "learning_rate": 9.063749571502634e-05, "loss": 1.4712, "step": 178 }, { "epoch": 0.3617988883274381, "grad_norm": 5.114996433258057, "learning_rate": 9.040393074373921e-05, "loss": 1.5819, "step": 179 }, { "epoch": 0.3638201111672562, "grad_norm": 5.512078285217285, "learning_rate": 9.016779698235227e-05, "loss": 1.6159, "step": 180 }, { "epoch": 0.3658413340070743, "grad_norm": 5.07801628112793, "learning_rate": 8.992910944371342e-05, "loss": 1.4474, "step": 181 }, { "epoch": 0.3678625568468924, "grad_norm": 6.879641056060791, "learning_rate": 8.9687883303034e-05, "loss": 1.7858, "step": 182 }, { "epoch": 0.36988377968671043, "grad_norm": 5.092156887054443, "learning_rate": 8.94441338969238e-05, "loss": 1.4168, "step": 183 }, { "epoch": 0.3719050025265285, "grad_norm": 5.373355865478516, "learning_rate": 8.919787672241619e-05, "loss": 1.5532, "step": 184 }, { "epoch": 0.3739262253663466, "grad_norm": 5.264693737030029, "learning_rate": 8.894912743598268e-05, "loss": 1.3256, "step": 185 }, { "epoch": 0.3759474482061647, "grad_norm": 5.930539608001709, "learning_rate": 8.869790185253766e-05, "loss": 1.6318, "step": 186 }, { "epoch": 0.3779686710459828, "grad_norm": 6.245593547821045, "learning_rate": 8.84442159444328e-05, "loss": 1.6553, "step": 187 }, { "epoch": 0.3799898938858009, "grad_norm": 6.605845928192139, "learning_rate": 8.818808584044162e-05, "loss": 1.1972, "step": 188 }, { "epoch": 0.382011116725619, "grad_norm": 5.590306758880615, "learning_rate": 8.792952782473413e-05, "loss": 1.2016, "step": 189 }, { "epoch": 0.3840323395654371, "grad_norm": 6.213796138763428, "learning_rate": 8.76685583358414e-05, "loss": 1.393, "step": 190 }, { "epoch": 0.3860535624052552, "grad_norm": 6.904628753662109, "learning_rate": 8.740519396561044e-05, "loss": 1.5803, "step": 191 }, { "epoch": 0.3880747852450733, "grad_norm": 6.069091796875, "learning_rate": 8.713945145814946e-05, "loss": 1.4332, "step": 192 }, { "epoch": 0.39009600808489137, "grad_norm": 6.582734107971191, "learning_rate": 8.687134770876319e-05, "loss": 1.3049, "step": 193 }, { "epoch": 0.39211723092470946, "grad_norm": 5.841890811920166, "learning_rate": 8.660089976287875e-05, "loss": 1.3647, "step": 194 }, { "epoch": 0.39413845376452755, "grad_norm": 5.880384922027588, "learning_rate": 8.632812481496195e-05, "loss": 1.37, "step": 195 }, { "epoch": 0.39615967660434565, "grad_norm": 5.35994291305542, "learning_rate": 8.60530402074241e-05, "loss": 1.103, "step": 196 }, { "epoch": 0.39818089944416374, "grad_norm": 5.617462635040283, "learning_rate": 8.577566342951943e-05, "loss": 0.9969, "step": 197 }, { "epoch": 0.40020212228398183, "grad_norm": 5.336188793182373, "learning_rate": 8.549601211623316e-05, "loss": 0.9578, "step": 198 }, { "epoch": 0.4022233451237999, "grad_norm": 7.3241047859191895, "learning_rate": 8.521410404716028e-05, "loss": 1.4591, "step": 199 }, { "epoch": 0.40424456796361796, "grad_norm": 5.509421348571777, "learning_rate": 8.492995714537518e-05, "loss": 1.113, "step": 200 }, { "epoch": 0.40626579080343606, "grad_norm": 2.5618975162506104, "learning_rate": 8.464358947629218e-05, "loss": 1.4167, "step": 201 }, { "epoch": 0.40828701364325415, "grad_norm": 3.311715602874756, "learning_rate": 8.435501924651691e-05, "loss": 1.3956, "step": 202 }, { "epoch": 0.41030823648307224, "grad_norm": 2.9615299701690674, "learning_rate": 8.406426480268881e-05, "loss": 1.1263, "step": 203 }, { "epoch": 0.41232945932289033, "grad_norm": 3.3075735569000244, "learning_rate": 8.377134463031469e-05, "loss": 1.3577, "step": 204 }, { "epoch": 0.41435068216270843, "grad_norm": 3.4304211139678955, "learning_rate": 8.347627735259343e-05, "loss": 1.6109, "step": 205 }, { "epoch": 0.4163719050025265, "grad_norm": 3.1346213817596436, "learning_rate": 8.317908172923205e-05, "loss": 1.1557, "step": 206 }, { "epoch": 0.4183931278423446, "grad_norm": 3.6796648502349854, "learning_rate": 8.287977665525292e-05, "loss": 1.3844, "step": 207 }, { "epoch": 0.4204143506821627, "grad_norm": 3.5582165718078613, "learning_rate": 8.257838115979244e-05, "loss": 1.4043, "step": 208 }, { "epoch": 0.4224355735219808, "grad_norm": 4.690045356750488, "learning_rate": 8.227491440489133e-05, "loss": 1.9057, "step": 209 }, { "epoch": 0.4244567963617989, "grad_norm": 4.603220462799072, "learning_rate": 8.196939568427624e-05, "loss": 2.0138, "step": 210 }, { "epoch": 0.426478019201617, "grad_norm": 4.161036014556885, "learning_rate": 8.166184442213313e-05, "loss": 1.5959, "step": 211 }, { "epoch": 0.4284992420414351, "grad_norm": 4.600170612335205, "learning_rate": 8.135228017187237e-05, "loss": 1.656, "step": 212 }, { "epoch": 0.4305204648812532, "grad_norm": 4.487064838409424, "learning_rate": 8.10407226148855e-05, "loss": 1.8356, "step": 213 }, { "epoch": 0.43254168772107127, "grad_norm": 4.709319591522217, "learning_rate": 8.0727191559294e-05, "loss": 1.6691, "step": 214 }, { "epoch": 0.43456291056088936, "grad_norm": 4.594611644744873, "learning_rate": 8.041170693868985e-05, "loss": 1.5736, "step": 215 }, { "epoch": 0.43658413340070745, "grad_norm": 4.373988628387451, "learning_rate": 8.009428881086835e-05, "loss": 1.5365, "step": 216 }, { "epoch": 0.4386053562405255, "grad_norm": 4.379314422607422, "learning_rate": 7.977495735655272e-05, "loss": 1.3149, "step": 217 }, { "epoch": 0.4406265790803436, "grad_norm": 5.018416881561279, "learning_rate": 7.945373287811116e-05, "loss": 1.6653, "step": 218 }, { "epoch": 0.4426478019201617, "grad_norm": 5.0816969871521, "learning_rate": 7.913063579826601e-05, "loss": 1.6499, "step": 219 }, { "epoch": 0.44466902475997977, "grad_norm": 4.877917289733887, "learning_rate": 7.880568665879542e-05, "loss": 1.6308, "step": 220 }, { "epoch": 0.44669024759979786, "grad_norm": 4.950286388397217, "learning_rate": 7.847890611922721e-05, "loss": 1.4406, "step": 221 }, { "epoch": 0.44871147043961596, "grad_norm": 5.2699480056762695, "learning_rate": 7.815031495552549e-05, "loss": 1.5366, "step": 222 }, { "epoch": 0.45073269327943405, "grad_norm": 5.018312931060791, "learning_rate": 7.781993405876972e-05, "loss": 1.3646, "step": 223 }, { "epoch": 0.45275391611925214, "grad_norm": 5.903739929199219, "learning_rate": 7.748778443382658e-05, "loss": 1.713, "step": 224 }, { "epoch": 0.45477513895907024, "grad_norm": 5.679375648498535, "learning_rate": 7.715388719801438e-05, "loss": 1.6164, "step": 225 }, { "epoch": 0.45679636179888833, "grad_norm": 5.779827117919922, "learning_rate": 7.68182635797606e-05, "loss": 1.8518, "step": 226 }, { "epoch": 0.4588175846387064, "grad_norm": 5.680889129638672, "learning_rate": 7.648093491725223e-05, "loss": 1.6413, "step": 227 }, { "epoch": 0.4608388074785245, "grad_norm": 6.258683681488037, "learning_rate": 7.614192265707905e-05, "loss": 1.4253, "step": 228 }, { "epoch": 0.4628600303183426, "grad_norm": 6.006860733032227, "learning_rate": 7.580124835287013e-05, "loss": 1.5382, "step": 229 }, { "epoch": 0.4648812531581607, "grad_norm": 5.656060695648193, "learning_rate": 7.545893366392358e-05, "loss": 1.491, "step": 230 }, { "epoch": 0.4669024759979788, "grad_norm": 6.140045642852783, "learning_rate": 7.511500035382942e-05, "loss": 1.754, "step": 231 }, { "epoch": 0.4689236988377969, "grad_norm": 6.063406467437744, "learning_rate": 7.476947028908594e-05, "loss": 1.5948, "step": 232 }, { "epoch": 0.470944921677615, "grad_norm": 5.987018585205078, "learning_rate": 7.442236543770944e-05, "loss": 1.6062, "step": 233 }, { "epoch": 0.4729661445174331, "grad_norm": 5.543860912322998, "learning_rate": 7.407370786783757e-05, "loss": 1.5026, "step": 234 }, { "epoch": 0.4749873673572511, "grad_norm": 5.393790245056152, "learning_rate": 7.372351974632634e-05, "loss": 1.5174, "step": 235 }, { "epoch": 0.4770085901970692, "grad_norm": 6.32440710067749, "learning_rate": 7.33718233373407e-05, "loss": 1.6588, "step": 236 }, { "epoch": 0.4790298130368873, "grad_norm": 5.640255928039551, "learning_rate": 7.301864100093912e-05, "loss": 1.3006, "step": 237 }, { "epoch": 0.4810510358767054, "grad_norm": 5.789175033569336, "learning_rate": 7.266399519165192e-05, "loss": 1.5062, "step": 238 }, { "epoch": 0.4830722587165235, "grad_norm": 5.77946662902832, "learning_rate": 7.230790845705379e-05, "loss": 1.3896, "step": 239 }, { "epoch": 0.4850934815563416, "grad_norm": 5.353186130523682, "learning_rate": 7.195040343633007e-05, "loss": 1.321, "step": 240 }, { "epoch": 0.48711470439615967, "grad_norm": 7.649440765380859, "learning_rate": 7.159150285883756e-05, "loss": 2.033, "step": 241 }, { "epoch": 0.48913592723597776, "grad_norm": 5.809901237487793, "learning_rate": 7.123122954265941e-05, "loss": 1.2447, "step": 242 }, { "epoch": 0.49115715007579586, "grad_norm": 5.7666215896606445, "learning_rate": 7.086960639315436e-05, "loss": 1.3123, "step": 243 }, { "epoch": 0.49317837291561395, "grad_norm": 6.335599422454834, "learning_rate": 7.050665640150045e-05, "loss": 1.6371, "step": 244 }, { "epoch": 0.49519959575543204, "grad_norm": 5.487225532531738, "learning_rate": 7.014240264323334e-05, "loss": 1.2528, "step": 245 }, { "epoch": 0.49722081859525014, "grad_norm": 5.477118968963623, "learning_rate": 6.977686827677926e-05, "loss": 1.1761, "step": 246 }, { "epoch": 0.49924204143506823, "grad_norm": 4.509613037109375, "learning_rate": 6.941007654198254e-05, "loss": 1.0277, "step": 247 }, { "epoch": 0.5012632642748863, "grad_norm": 7.54964542388916, "learning_rate": 6.904205075862816e-05, "loss": 1.6264, "step": 248 }, { "epoch": 0.5032844871147044, "grad_norm": 7.848708152770996, "learning_rate": 6.867281432495912e-05, "loss": 1.4215, "step": 249 }, { "epoch": 0.5053057099545225, "grad_norm": 7.514161109924316, "learning_rate": 6.830239071618873e-05, "loss": 1.4708, "step": 250 }, { "epoch": 0.5073269327943406, "grad_norm": 2.8156673908233643, "learning_rate": 6.793080348300833e-05, "loss": 1.4503, "step": 251 }, { "epoch": 0.5093481556341587, "grad_norm": 2.978626012802124, "learning_rate": 6.755807625008974e-05, "loss": 1.17, "step": 252 }, { "epoch": 0.5113693784739768, "grad_norm": 3.320791482925415, "learning_rate": 6.718423271458343e-05, "loss": 1.4699, "step": 253 }, { "epoch": 0.5133906013137949, "grad_norm": 3.1808319091796875, "learning_rate": 6.680929664461185e-05, "loss": 1.2698, "step": 254 }, { "epoch": 0.515411824153613, "grad_norm": 3.3711514472961426, "learning_rate": 6.643329187775827e-05, "loss": 1.5507, "step": 255 }, { "epoch": 0.5174330469934311, "grad_norm": 3.6619150638580322, "learning_rate": 6.605624231955131e-05, "loss": 1.6664, "step": 256 }, { "epoch": 0.5194542698332492, "grad_norm": 4.01786994934082, "learning_rate": 6.567817194194507e-05, "loss": 1.7517, "step": 257 }, { "epoch": 0.5214754926730673, "grad_norm": 4.117989540100098, "learning_rate": 6.529910478179499e-05, "loss": 1.7831, "step": 258 }, { "epoch": 0.5234967155128853, "grad_norm": 3.7679736614227295, "learning_rate": 6.491906493932968e-05, "loss": 1.4514, "step": 259 }, { "epoch": 0.5255179383527034, "grad_norm": 4.316635608673096, "learning_rate": 6.45380765766187e-05, "loss": 1.6405, "step": 260 }, { "epoch": 0.5275391611925214, "grad_norm": 3.958988904953003, "learning_rate": 6.415616391603638e-05, "loss": 1.5774, "step": 261 }, { "epoch": 0.5295603840323395, "grad_norm": 4.332874774932861, "learning_rate": 6.377335123872177e-05, "loss": 1.8736, "step": 262 }, { "epoch": 0.5315816068721576, "grad_norm": 4.065393447875977, "learning_rate": 6.338966288303499e-05, "loss": 1.5071, "step": 263 }, { "epoch": 0.5336028297119757, "grad_norm": 4.553988456726074, "learning_rate": 6.300512324300975e-05, "loss": 1.806, "step": 264 }, { "epoch": 0.5356240525517938, "grad_norm": 4.563177108764648, "learning_rate": 6.261975676680252e-05, "loss": 1.567, "step": 265 }, { "epoch": 0.5376452753916119, "grad_norm": 4.3816142082214355, "learning_rate": 6.223358795513812e-05, "loss": 1.6037, "step": 266 }, { "epoch": 0.53966649823143, "grad_norm": 4.977108001708984, "learning_rate": 6.184664135975203e-05, "loss": 1.8076, "step": 267 }, { "epoch": 0.5416877210712481, "grad_norm": 4.56311559677124, "learning_rate": 6.145894158182944e-05, "loss": 1.6309, "step": 268 }, { "epoch": 0.5437089439110662, "grad_norm": 5.014670372009277, "learning_rate": 6.107051327044124e-05, "loss": 1.6022, "step": 269 }, { "epoch": 0.5457301667508843, "grad_norm": 4.538066864013672, "learning_rate": 6.068138112097674e-05, "loss": 1.494, "step": 270 }, { "epoch": 0.5477513895907024, "grad_norm": 5.387009143829346, "learning_rate": 6.029156987357373e-05, "loss": 1.7367, "step": 271 }, { "epoch": 0.5497726124305204, "grad_norm": 4.673946857452393, "learning_rate": 5.9901104311545487e-05, "loss": 1.6585, "step": 272 }, { "epoch": 0.5517938352703385, "grad_norm": 5.331272125244141, "learning_rate": 5.9510009259805085e-05, "loss": 1.7205, "step": 273 }, { "epoch": 0.5538150581101566, "grad_norm": 5.224307537078857, "learning_rate": 5.91183095832872e-05, "loss": 1.8472, "step": 274 }, { "epoch": 0.5558362809499747, "grad_norm": 4.787731170654297, "learning_rate": 5.872603018536713e-05, "loss": 1.5981, "step": 275 }, { "epoch": 0.5578575037897928, "grad_norm": 5.033946990966797, "learning_rate": 5.833319600627753e-05, "loss": 1.5519, "step": 276 }, { "epoch": 0.5598787266296109, "grad_norm": 5.653214931488037, "learning_rate": 5.793983202152282e-05, "loss": 1.9657, "step": 277 }, { "epoch": 0.561899949469429, "grad_norm": 5.225715160369873, "learning_rate": 5.7545963240291246e-05, "loss": 1.4663, "step": 278 }, { "epoch": 0.5639211723092471, "grad_norm": 5.952142715454102, "learning_rate": 5.715161470386485e-05, "loss": 1.8356, "step": 279 }, { "epoch": 0.5659423951490652, "grad_norm": 5.057675361633301, "learning_rate": 5.6756811484027425e-05, "loss": 1.6058, "step": 280 }, { "epoch": 0.5679636179888833, "grad_norm": 4.865301132202148, "learning_rate": 5.636157868147054e-05, "loss": 1.2382, "step": 281 }, { "epoch": 0.5699848408287014, "grad_norm": 5.245824337005615, "learning_rate": 5.596594142419759e-05, "loss": 1.4634, "step": 282 }, { "epoch": 0.5720060636685195, "grad_norm": 5.30856990814209, "learning_rate": 5.556992486592634e-05, "loss": 1.5013, "step": 283 }, { "epoch": 0.5740272865083376, "grad_norm": 6.301365375518799, "learning_rate": 5.517355418448961e-05, "loss": 1.683, "step": 284 }, { "epoch": 0.5760485093481557, "grad_norm": 5.439041614532471, "learning_rate": 5.477685458023459e-05, "loss": 1.4477, "step": 285 }, { "epoch": 0.5780697321879738, "grad_norm": 5.788546085357666, "learning_rate": 5.437985127442065e-05, "loss": 1.5466, "step": 286 }, { "epoch": 0.5800909550277918, "grad_norm": 4.990469932556152, "learning_rate": 5.3982569507615775e-05, "loss": 1.4082, "step": 287 }, { "epoch": 0.5821121778676099, "grad_norm": 6.371885776519775, "learning_rate": 5.3585034538091885e-05, "loss": 1.5582, "step": 288 }, { "epoch": 0.584133400707428, "grad_norm": 5.770207405090332, "learning_rate": 5.318727164021896e-05, "loss": 1.6081, "step": 289 }, { "epoch": 0.5861546235472461, "grad_norm": 5.396596908569336, "learning_rate": 5.278930610285813e-05, "loss": 1.1804, "step": 290 }, { "epoch": 0.5881758463870642, "grad_norm": 5.193579196929932, "learning_rate": 5.239116322775391e-05, "loss": 1.1155, "step": 291 }, { "epoch": 0.5901970692268823, "grad_norm": 6.787877559661865, "learning_rate": 5.1992868327925526e-05, "loss": 1.7875, "step": 292 }, { "epoch": 0.5922182920667004, "grad_norm": 5.3459696769714355, "learning_rate": 5.159444672605759e-05, "loss": 1.3469, "step": 293 }, { "epoch": 0.5942395149065185, "grad_norm": 5.520552635192871, "learning_rate": 5.119592375289015e-05, "loss": 1.187, "step": 294 }, { "epoch": 0.5962607377463366, "grad_norm": 5.6787519454956055, "learning_rate": 5.079732474560821e-05, "loss": 1.4493, "step": 295 }, { "epoch": 0.5982819605861546, "grad_norm": 5.608784198760986, "learning_rate": 5.0398675046230835e-05, "loss": 1.2803, "step": 296 }, { "epoch": 0.6003031834259727, "grad_norm": 4.6050872802734375, "learning_rate": 5e-05, "loss": 0.8699, "step": 297 }, { "epoch": 0.6023244062657908, "grad_norm": 5.878370761871338, "learning_rate": 4.960132495376918e-05, "loss": 1.3753, "step": 298 }, { "epoch": 0.6043456291056089, "grad_norm": 6.23378324508667, "learning_rate": 4.92026752543918e-05, "loss": 1.3375, "step": 299 }, { "epoch": 0.6063668519454269, "grad_norm": 8.689998626708984, "learning_rate": 4.8804076247109865e-05, "loss": 1.3833, "step": 300 }, { "epoch": 0.608388074785245, "grad_norm": 2.740164279937744, "learning_rate": 4.840555327394241e-05, "loss": 1.2242, "step": 301 }, { "epoch": 0.6104092976250631, "grad_norm": 2.697038173675537, "learning_rate": 4.800713167207449e-05, "loss": 1.2152, "step": 302 }, { "epoch": 0.6124305204648812, "grad_norm": 3.111619234085083, "learning_rate": 4.760883677224609e-05, "loss": 1.4117, "step": 303 }, { "epoch": 0.6144517433046993, "grad_norm": 2.8779137134552, "learning_rate": 4.721069389714188e-05, "loss": 1.1105, "step": 304 }, { "epoch": 0.6164729661445174, "grad_norm": 3.4639029502868652, "learning_rate": 4.681272835978107e-05, "loss": 1.4196, "step": 305 }, { "epoch": 0.6184941889843355, "grad_norm": 4.024080753326416, "learning_rate": 4.6414965461908126e-05, "loss": 1.9051, "step": 306 }, { "epoch": 0.6205154118241536, "grad_norm": 3.319389581680298, "learning_rate": 4.601743049238424e-05, "loss": 1.3579, "step": 307 }, { "epoch": 0.6225366346639717, "grad_norm": 4.286203384399414, "learning_rate": 4.562014872557935e-05, "loss": 1.8763, "step": 308 }, { "epoch": 0.6245578575037898, "grad_norm": 4.204199314117432, "learning_rate": 4.522314541976541e-05, "loss": 1.6859, "step": 309 }, { "epoch": 0.6265790803436079, "grad_norm": 4.099395751953125, "learning_rate": 4.482644581551039e-05, "loss": 1.5438, "step": 310 }, { "epoch": 0.628600303183426, "grad_norm": 4.58848237991333, "learning_rate": 4.443007513407368e-05, "loss": 1.9432, "step": 311 }, { "epoch": 0.6306215260232441, "grad_norm": 4.800745010375977, "learning_rate": 4.4034058575802424e-05, "loss": 1.7121, "step": 312 }, { "epoch": 0.6326427488630622, "grad_norm": 4.463362216949463, "learning_rate": 4.3638421318529474e-05, "loss": 1.6288, "step": 313 }, { "epoch": 0.6346639717028802, "grad_norm": 4.3990864753723145, "learning_rate": 4.324318851597258e-05, "loss": 1.5733, "step": 314 }, { "epoch": 0.6366851945426983, "grad_norm": 4.997748851776123, "learning_rate": 4.284838529613516e-05, "loss": 1.9203, "step": 315 }, { "epoch": 0.6387064173825164, "grad_norm": 5.128242015838623, "learning_rate": 4.2454036759708765e-05, "loss": 1.8486, "step": 316 }, { "epoch": 0.6407276402223345, "grad_norm": 4.875668525695801, "learning_rate": 4.2060167978477184e-05, "loss": 1.5951, "step": 317 }, { "epoch": 0.6427488630621526, "grad_norm": 4.408964157104492, "learning_rate": 4.166680399372248e-05, "loss": 1.3977, "step": 318 }, { "epoch": 0.6447700859019707, "grad_norm": 4.5807905197143555, "learning_rate": 4.1273969814632894e-05, "loss": 1.4649, "step": 319 }, { "epoch": 0.6467913087417888, "grad_norm": 5.094422817230225, "learning_rate": 4.0881690416712805e-05, "loss": 1.6607, "step": 320 }, { "epoch": 0.6488125315816069, "grad_norm": 4.936136722564697, "learning_rate": 4.0489990740194926e-05, "loss": 1.6117, "step": 321 }, { "epoch": 0.650833754421425, "grad_norm": 5.264697074890137, "learning_rate": 4.009889568845453e-05, "loss": 1.6412, "step": 322 }, { "epoch": 0.6528549772612431, "grad_norm": 4.067869663238525, "learning_rate": 3.9708430126426284e-05, "loss": 1.2319, "step": 323 }, { "epoch": 0.6548762001010612, "grad_norm": 5.502519607543945, "learning_rate": 3.9318618879023256e-05, "loss": 1.6435, "step": 324 }, { "epoch": 0.6568974229408793, "grad_norm": 5.370153903961182, "learning_rate": 3.892948672955877e-05, "loss": 1.6528, "step": 325 }, { "epoch": 0.6589186457806974, "grad_norm": 4.501315593719482, "learning_rate": 3.854105841817056e-05, "loss": 1.4033, "step": 326 }, { "epoch": 0.6609398686205155, "grad_norm": 5.458628177642822, "learning_rate": 3.815335864024799e-05, "loss": 1.7448, "step": 327 }, { "epoch": 0.6629610914603336, "grad_norm": 5.266726016998291, "learning_rate": 3.776641204486191e-05, "loss": 1.6844, "step": 328 }, { "epoch": 0.6649823143001516, "grad_norm": 4.972016334533691, "learning_rate": 3.738024323319749e-05, "loss": 1.66, "step": 329 }, { "epoch": 0.6670035371399696, "grad_norm": 4.560551643371582, "learning_rate": 3.699487675699026e-05, "loss": 1.2507, "step": 330 }, { "epoch": 0.6690247599797877, "grad_norm": 5.447690010070801, "learning_rate": 3.661033711696501e-05, "loss": 1.4381, "step": 331 }, { "epoch": 0.6710459828196058, "grad_norm": 6.1798787117004395, "learning_rate": 3.6226648761278235e-05, "loss": 1.6519, "step": 332 }, { "epoch": 0.6730672056594239, "grad_norm": 8.073100090026855, "learning_rate": 3.584383608396362e-05, "loss": 1.5615, "step": 333 }, { "epoch": 0.675088428499242, "grad_norm": 6.568238735198975, "learning_rate": 3.546192342338131e-05, "loss": 1.5244, "step": 334 }, { "epoch": 0.6771096513390601, "grad_norm": 5.141592979431152, "learning_rate": 3.508093506067034e-05, "loss": 1.3669, "step": 335 }, { "epoch": 0.6791308741788782, "grad_norm": 5.7515950202941895, "learning_rate": 3.470089521820502e-05, "loss": 1.4939, "step": 336 }, { "epoch": 0.6811520970186963, "grad_norm": 5.398025035858154, "learning_rate": 3.432182805805495e-05, "loss": 1.3243, "step": 337 }, { "epoch": 0.6831733198585144, "grad_norm": 5.3287272453308105, "learning_rate": 3.394375768044869e-05, "loss": 1.3026, "step": 338 }, { "epoch": 0.6851945426983325, "grad_norm": 5.701461315155029, "learning_rate": 3.3566708122241756e-05, "loss": 1.5187, "step": 339 }, { "epoch": 0.6872157655381506, "grad_norm": 6.3350067138671875, "learning_rate": 3.3190703355388166e-05, "loss": 1.9201, "step": 340 }, { "epoch": 0.6892369883779687, "grad_norm": 6.057320594787598, "learning_rate": 3.2815767285416576e-05, "loss": 1.3178, "step": 341 }, { "epoch": 0.6912582112177867, "grad_norm": 4.327114105224609, "learning_rate": 3.244192374991027e-05, "loss": 1.0027, "step": 342 }, { "epoch": 0.6932794340576048, "grad_norm": 5.175032615661621, "learning_rate": 3.2069196516991686e-05, "loss": 1.1705, "step": 343 }, { "epoch": 0.6953006568974229, "grad_norm": 5.2318644523620605, "learning_rate": 3.169760928381127e-05, "loss": 1.1488, "step": 344 }, { "epoch": 0.697321879737241, "grad_norm": 4.282010555267334, "learning_rate": 3.13271856750409e-05, "loss": 0.9302, "step": 345 }, { "epoch": 0.6993431025770591, "grad_norm": 6.124838829040527, "learning_rate": 3.095794924137184e-05, "loss": 1.328, "step": 346 }, { "epoch": 0.7013643254168772, "grad_norm": 4.892329216003418, "learning_rate": 3.058992345801747e-05, "loss": 1.0958, "step": 347 }, { "epoch": 0.7033855482566953, "grad_norm": 5.745382308959961, "learning_rate": 3.0223131723220756e-05, "loss": 1.224, "step": 348 }, { "epoch": 0.7054067710965134, "grad_norm": 7.191976547241211, "learning_rate": 2.9857597356766674e-05, "loss": 1.1652, "step": 349 }, { "epoch": 0.7074279939363315, "grad_norm": 5.166662693023682, "learning_rate": 2.9493343598499567e-05, "loss": 1.0203, "step": 350 }, { "epoch": 0.7094492167761496, "grad_norm": 2.4570610523223877, "learning_rate": 2.913039360684565e-05, "loss": 1.2315, "step": 351 }, { "epoch": 0.7114704396159677, "grad_norm": 2.5633747577667236, "learning_rate": 2.8768770457340575e-05, "loss": 1.2238, "step": 352 }, { "epoch": 0.7134916624557858, "grad_norm": 2.9392590522766113, "learning_rate": 2.8408497141162438e-05, "loss": 1.2994, "step": 353 }, { "epoch": 0.7155128852956039, "grad_norm": 2.7536635398864746, "learning_rate": 2.8049596563669932e-05, "loss": 1.0344, "step": 354 }, { "epoch": 0.717534108135422, "grad_norm": 3.7724685668945312, "learning_rate": 2.769209154294623e-05, "loss": 1.6256, "step": 355 }, { "epoch": 0.71955533097524, "grad_norm": 3.661170721054077, "learning_rate": 2.7336004808348093e-05, "loss": 1.4207, "step": 356 }, { "epoch": 0.7215765538150581, "grad_norm": 4.080316066741943, "learning_rate": 2.69813589990609e-05, "loss": 1.6689, "step": 357 }, { "epoch": 0.7235977766548762, "grad_norm": 4.936508655548096, "learning_rate": 2.662817666265932e-05, "loss": 1.996, "step": 358 }, { "epoch": 0.7256189994946943, "grad_norm": 4.0, "learning_rate": 2.6276480253673662e-05, "loss": 1.6679, "step": 359 }, { "epoch": 0.7276402223345124, "grad_norm": 4.3267903327941895, "learning_rate": 2.5926292132162433e-05, "loss": 1.6934, "step": 360 }, { "epoch": 0.7296614451743305, "grad_norm": 4.0602593421936035, "learning_rate": 2.5577634562290564e-05, "loss": 1.5712, "step": 361 }, { "epoch": 0.7316826680141486, "grad_norm": 4.487820148468018, "learning_rate": 2.5230529710914076e-05, "loss": 1.5552, "step": 362 }, { "epoch": 0.7337038908539667, "grad_norm": 4.205357074737549, "learning_rate": 2.4884999646170597e-05, "loss": 1.5065, "step": 363 }, { "epoch": 0.7357251136937848, "grad_norm": 4.995065689086914, "learning_rate": 2.4541066336076434e-05, "loss": 1.7604, "step": 364 }, { "epoch": 0.7377463365336028, "grad_norm": 4.657031059265137, "learning_rate": 2.4198751647129897e-05, "loss": 1.6113, "step": 365 }, { "epoch": 0.7397675593734209, "grad_norm": 4.953658103942871, "learning_rate": 2.3858077342920972e-05, "loss": 1.4499, "step": 366 }, { "epoch": 0.741788782213239, "grad_norm": 4.663423538208008, "learning_rate": 2.3519065082747778e-05, "loss": 1.6878, "step": 367 }, { "epoch": 0.743810005053057, "grad_norm": 4.570845127105713, "learning_rate": 2.3181736420239385e-05, "loss": 1.5128, "step": 368 }, { "epoch": 0.7458312278928751, "grad_norm": 4.937878608703613, "learning_rate": 2.2846112801985632e-05, "loss": 1.7156, "step": 369 }, { "epoch": 0.7478524507326932, "grad_norm": 4.761300086975098, "learning_rate": 2.251221556617344e-05, "loss": 1.7288, "step": 370 }, { "epoch": 0.7498736735725113, "grad_norm": 4.429453372955322, "learning_rate": 2.2180065941230277e-05, "loss": 1.4495, "step": 371 }, { "epoch": 0.7518948964123294, "grad_norm": 3.9287309646606445, "learning_rate": 2.1849685044474533e-05, "loss": 1.2037, "step": 372 }, { "epoch": 0.7539161192521475, "grad_norm": 5.036333084106445, "learning_rate": 2.15210938807728e-05, "loss": 1.4006, "step": 373 }, { "epoch": 0.7559373420919656, "grad_norm": 4.479243755340576, "learning_rate": 2.1194313341204597e-05, "loss": 1.3916, "step": 374 }, { "epoch": 0.7579585649317837, "grad_norm": 4.996969699859619, "learning_rate": 2.0869364201733987e-05, "loss": 1.2482, "step": 375 }, { "epoch": 0.7599797877716018, "grad_norm": 5.1381449699401855, "learning_rate": 2.054626712188886e-05, "loss": 1.6205, "step": 376 }, { "epoch": 0.7620010106114199, "grad_norm": 5.011663913726807, "learning_rate": 2.0225042643447283e-05, "loss": 1.6553, "step": 377 }, { "epoch": 0.764022233451238, "grad_norm": 4.932290554046631, "learning_rate": 1.990571118913166e-05, "loss": 1.3811, "step": 378 }, { "epoch": 0.7660434562910561, "grad_norm": 5.215028285980225, "learning_rate": 1.9588293061310163e-05, "loss": 1.4943, "step": 379 }, { "epoch": 0.7680646791308742, "grad_norm": 4.604588985443115, "learning_rate": 1.9272808440706026e-05, "loss": 1.1947, "step": 380 }, { "epoch": 0.7700859019706923, "grad_norm": 5.850764274597168, "learning_rate": 1.8959277385114514e-05, "loss": 1.4795, "step": 381 }, { "epoch": 0.7721071248105104, "grad_norm": 5.08169412612915, "learning_rate": 1.864771982812763e-05, "loss": 1.4163, "step": 382 }, { "epoch": 0.7741283476503285, "grad_norm": 5.118016719818115, "learning_rate": 1.8338155577866873e-05, "loss": 1.4816, "step": 383 }, { "epoch": 0.7761495704901465, "grad_norm": 5.448619842529297, "learning_rate": 1.8030604315723766e-05, "loss": 1.3162, "step": 384 }, { "epoch": 0.7781707933299646, "grad_norm": 6.636441707611084, "learning_rate": 1.7725085595108682e-05, "loss": 1.4221, "step": 385 }, { "epoch": 0.7801920161697827, "grad_norm": 5.057902812957764, "learning_rate": 1.7421618840207578e-05, "loss": 1.4411, "step": 386 }, { "epoch": 0.7822132390096008, "grad_norm": 4.244833946228027, "learning_rate": 1.71202233447471e-05, "loss": 1.0515, "step": 387 }, { "epoch": 0.7842344618494189, "grad_norm": 4.543421745300293, "learning_rate": 1.682091827076796e-05, "loss": 1.3154, "step": 388 }, { "epoch": 0.786255684689237, "grad_norm": 5.357529163360596, "learning_rate": 1.6523722647406576e-05, "loss": 1.3857, "step": 389 }, { "epoch": 0.7882769075290551, "grad_norm": 5.082853317260742, "learning_rate": 1.622865536968534e-05, "loss": 1.2635, "step": 390 }, { "epoch": 0.7902981303688732, "grad_norm": 5.57720947265625, "learning_rate": 1.5935735197311202e-05, "loss": 1.422, "step": 391 }, { "epoch": 0.7923193532086913, "grad_norm": 5.431817054748535, "learning_rate": 1.5644980753483107e-05, "loss": 1.2788, "step": 392 }, { "epoch": 0.7943405760485094, "grad_norm": 5.892661094665527, "learning_rate": 1.5356410523707825e-05, "loss": 1.5827, "step": 393 }, { "epoch": 0.7963617988883275, "grad_norm": 5.396627902984619, "learning_rate": 1.5070042854624834e-05, "loss": 1.2314, "step": 394 }, { "epoch": 0.7983830217281456, "grad_norm": 5.037806510925293, "learning_rate": 1.4785895952839734e-05, "loss": 1.2281, "step": 395 }, { "epoch": 0.8004042445679637, "grad_norm": 6.198902130126953, "learning_rate": 1.4503987883766857e-05, "loss": 1.3708, "step": 396 }, { "epoch": 0.8024254674077818, "grad_norm": 4.604989051818848, "learning_rate": 1.4224336570480573e-05, "loss": 0.9869, "step": 397 }, { "epoch": 0.8044466902475998, "grad_norm": 6.081021785736084, "learning_rate": 1.3946959792575915e-05, "loss": 1.3921, "step": 398 }, { "epoch": 0.8064679130874179, "grad_norm": 4.676353931427002, "learning_rate": 1.3671875185038063e-05, "loss": 0.9632, "step": 399 }, { "epoch": 0.8084891359272359, "grad_norm": 6.441154956817627, "learning_rate": 1.3399100237121265e-05, "loss": 1.4048, "step": 400 }, { "epoch": 0.810510358767054, "grad_norm": 2.4489996433258057, "learning_rate": 1.312865229123681e-05, "loss": 1.2112, "step": 401 }, { "epoch": 0.8125315816068721, "grad_norm": 2.628634214401245, "learning_rate": 1.2860548541850542e-05, "loss": 1.1693, "step": 402 }, { "epoch": 0.8145528044466902, "grad_norm": 3.1994330883026123, "learning_rate": 1.2594806034389556e-05, "loss": 1.3584, "step": 403 }, { "epoch": 0.8165740272865083, "grad_norm": 2.7979207038879395, "learning_rate": 1.2331441664158611e-05, "loss": 1.1963, "step": 404 }, { "epoch": 0.8185952501263264, "grad_norm": 3.172201156616211, "learning_rate": 1.2070472175265856e-05, "loss": 1.308, "step": 405 }, { "epoch": 0.8206164729661445, "grad_norm": 3.279130458831787, "learning_rate": 1.1811914159558374e-05, "loss": 1.4214, "step": 406 }, { "epoch": 0.8226376958059626, "grad_norm": 3.5428879261016846, "learning_rate": 1.155578405556722e-05, "loss": 1.5145, "step": 407 }, { "epoch": 0.8246589186457807, "grad_norm": 4.0169830322265625, "learning_rate": 1.1302098147462347e-05, "loss": 1.6019, "step": 408 }, { "epoch": 0.8266801414855988, "grad_norm": 4.363892078399658, "learning_rate": 1.1050872564017328e-05, "loss": 1.9656, "step": 409 }, { "epoch": 0.8287013643254169, "grad_norm": 4.119546890258789, "learning_rate": 1.0802123277583819e-05, "loss": 1.6649, "step": 410 }, { "epoch": 0.830722587165235, "grad_norm": 4.996504783630371, "learning_rate": 1.0555866103076212e-05, "loss": 1.8195, "step": 411 }, { "epoch": 0.832743810005053, "grad_norm": 4.646332263946533, "learning_rate": 1.0312116696966012e-05, "loss": 1.9406, "step": 412 }, { "epoch": 0.8347650328448711, "grad_norm": 4.677328109741211, "learning_rate": 1.0070890556286577e-05, "loss": 1.6832, "step": 413 }, { "epoch": 0.8367862556846892, "grad_norm": 5.113035202026367, "learning_rate": 9.832203017647745e-06, "loss": 1.7782, "step": 414 }, { "epoch": 0.8388074785245073, "grad_norm": 4.52169942855835, "learning_rate": 9.596069256260792e-06, "loss": 1.5927, "step": 415 }, { "epoch": 0.8408287013643254, "grad_norm": 4.952767848968506, "learning_rate": 9.362504284973683e-06, "loss": 1.9372, "step": 416 }, { "epoch": 0.8428499242041435, "grad_norm": 5.01066255569458, "learning_rate": 9.131522953316501e-06, "loss": 1.5843, "step": 417 }, { "epoch": 0.8448711470439616, "grad_norm": 4.628442764282227, "learning_rate": 8.903139946557438e-06, "loss": 1.4746, "step": 418 }, { "epoch": 0.8468923698837797, "grad_norm": 4.917454242706299, "learning_rate": 8.67736978476904e-06, "loss": 1.6411, "step": 419 }, { "epoch": 0.8489135927235978, "grad_norm": 5.071280002593994, "learning_rate": 8.45422682190517e-06, "loss": 1.6055, "step": 420 }, { "epoch": 0.8509348155634159, "grad_norm": 4.860715866088867, "learning_rate": 8.233725244888291e-06, "loss": 1.4997, "step": 421 }, { "epoch": 0.852956038403234, "grad_norm": 4.802791595458984, "learning_rate": 8.01587907270761e-06, "loss": 1.3955, "step": 422 }, { "epoch": 0.8549772612430521, "grad_norm": 4.843109607696533, "learning_rate": 7.800702155527696e-06, "loss": 1.4801, "step": 423 }, { "epoch": 0.8569984840828702, "grad_norm": 5.050382137298584, "learning_rate": 7.588208173807943e-06, "loss": 1.5216, "step": 424 }, { "epoch": 0.8590197069226883, "grad_norm": 4.331174373626709, "learning_rate": 7.378410637432847e-06, "loss": 1.4275, "step": 425 }, { "epoch": 0.8610409297625063, "grad_norm": 5.057369709014893, "learning_rate": 7.171322884852988e-06, "loss": 1.6479, "step": 426 }, { "epoch": 0.8630621526023244, "grad_norm": 5.058509826660156, "learning_rate": 6.966958082237096e-06, "loss": 1.483, "step": 427 }, { "epoch": 0.8650833754421425, "grad_norm": 5.39819860458374, "learning_rate": 6.765329222634892e-06, "loss": 1.6475, "step": 428 }, { "epoch": 0.8671045982819606, "grad_norm": 5.942454814910889, "learning_rate": 6.566449125151086e-06, "loss": 1.4512, "step": 429 }, { "epoch": 0.8691258211217787, "grad_norm": 5.583020210266113, "learning_rate": 6.370330434130317e-06, "loss": 1.5802, "step": 430 }, { "epoch": 0.8711470439615968, "grad_norm": 5.76262903213501, "learning_rate": 6.176985618353282e-06, "loss": 1.7253, "step": 431 }, { "epoch": 0.8731682668014149, "grad_norm": 5.057263374328613, "learning_rate": 5.9864269702440075e-06, "loss": 1.424, "step": 432 }, { "epoch": 0.875189489641233, "grad_norm": 7.444010257720947, "learning_rate": 5.798666605088293e-06, "loss": 1.4872, "step": 433 }, { "epoch": 0.877210712481051, "grad_norm": 4.746302127838135, "learning_rate": 5.613716460263485e-06, "loss": 1.1225, "step": 434 }, { "epoch": 0.8792319353208691, "grad_norm": 4.994941234588623, "learning_rate": 5.431588294479478e-06, "loss": 1.2954, "step": 435 }, { "epoch": 0.8812531581606872, "grad_norm": 5.951868057250977, "learning_rate": 5.2522936870311955e-06, "loss": 1.4107, "step": 436 }, { "epoch": 0.8832743810005053, "grad_norm": 5.421668529510498, "learning_rate": 5.0758440370623214e-06, "loss": 1.8054, "step": 437 }, { "epoch": 0.8852956038403234, "grad_norm": 6.007269382476807, "learning_rate": 4.902250562840621e-06, "loss": 1.6518, "step": 438 }, { "epoch": 0.8873168266801414, "grad_norm": 4.964114189147949, "learning_rate": 4.731524301044715e-06, "loss": 1.1705, "step": 439 }, { "epoch": 0.8893380495199595, "grad_norm": 5.869405269622803, "learning_rate": 4.563676106062331e-06, "loss": 1.6333, "step": 440 }, { "epoch": 0.8913592723597776, "grad_norm": 5.429330348968506, "learning_rate": 4.398716649300311e-06, "loss": 1.277, "step": 441 }, { "epoch": 0.8933804951995957, "grad_norm": 6.709824562072754, "learning_rate": 4.236656418506013e-06, "loss": 1.4216, "step": 442 }, { "epoch": 0.8954017180394138, "grad_norm": 5.117186546325684, "learning_rate": 4.077505717100666e-06, "loss": 1.2206, "step": 443 }, { "epoch": 0.8974229408792319, "grad_norm": 4.522552967071533, "learning_rate": 3.921274663524182e-06, "loss": 0.9311, "step": 444 }, { "epoch": 0.89944416371905, "grad_norm": 6.130990505218506, "learning_rate": 3.767973190591906e-06, "loss": 1.3967, "step": 445 }, { "epoch": 0.9014653865588681, "grad_norm": 4.977992057800293, "learning_rate": 3.6176110448631394e-06, "loss": 1.062, "step": 446 }, { "epoch": 0.9034866093986862, "grad_norm": 4.500341892242432, "learning_rate": 3.4701977860213953e-06, "loss": 0.9393, "step": 447 }, { "epoch": 0.9055078322385043, "grad_norm": 5.150080680847168, "learning_rate": 3.325742786266689e-06, "loss": 1.1028, "step": 448 }, { "epoch": 0.9075290550783224, "grad_norm": 5.322256088256836, "learning_rate": 3.184255229719624e-06, "loss": 1.0733, "step": 449 }, { "epoch": 0.9095502779181405, "grad_norm": 7.01747989654541, "learning_rate": 3.0457441118375283e-06, "loss": 1.5135, "step": 450 }, { "epoch": 0.9115715007579586, "grad_norm": 2.3920202255249023, "learning_rate": 2.91021823884251e-06, "loss": 1.0922, "step": 451 }, { "epoch": 0.9135927235977767, "grad_norm": 2.5058305263519287, "learning_rate": 2.7776862271615912e-06, "loss": 1.1713, "step": 452 }, { "epoch": 0.9156139464375948, "grad_norm": 2.848264217376709, "learning_rate": 2.6481565028789067e-06, "loss": 1.1941, "step": 453 }, { "epoch": 0.9176351692774128, "grad_norm": 3.537458896636963, "learning_rate": 2.5216373011999695e-06, "loss": 1.5756, "step": 454 }, { "epoch": 0.9196563921172309, "grad_norm": 3.5460147857666016, "learning_rate": 2.3981366659281134e-06, "loss": 1.3457, "step": 455 }, { "epoch": 0.921677614957049, "grad_norm": 4.265406608581543, "learning_rate": 2.277662448953066e-06, "loss": 1.7724, "step": 456 }, { "epoch": 0.9236988377968671, "grad_norm": 3.961560010910034, "learning_rate": 2.1602223097517913e-06, "loss": 1.7004, "step": 457 }, { "epoch": 0.9257200606366852, "grad_norm": 4.308675289154053, "learning_rate": 2.0458237149014347e-06, "loss": 1.7709, "step": 458 }, { "epoch": 0.9277412834765033, "grad_norm": 3.9854753017425537, "learning_rate": 1.9344739376047083e-06, "loss": 1.6308, "step": 459 }, { "epoch": 0.9297625063163214, "grad_norm": 4.271021366119385, "learning_rate": 1.8261800572274001e-06, "loss": 1.65, "step": 460 }, { "epoch": 0.9317837291561395, "grad_norm": 4.231322288513184, "learning_rate": 1.7209489588483395e-06, "loss": 1.529, "step": 461 }, { "epoch": 0.9338049519959576, "grad_norm": 3.8486905097961426, "learning_rate": 1.6187873328216142e-06, "loss": 1.3554, "step": 462 }, { "epoch": 0.9358261748357757, "grad_norm": 3.996654510498047, "learning_rate": 1.519701674351265e-06, "loss": 1.4264, "step": 463 }, { "epoch": 0.9378473976755938, "grad_norm": 4.656818389892578, "learning_rate": 1.4236982830782674e-06, "loss": 1.5068, "step": 464 }, { "epoch": 0.9398686205154119, "grad_norm": 4.660806179046631, "learning_rate": 1.3307832626800964e-06, "loss": 1.5453, "step": 465 }, { "epoch": 0.94188984335523, "grad_norm": 4.372664928436279, "learning_rate": 1.2409625204825803e-06, "loss": 1.6273, "step": 466 }, { "epoch": 0.943911066195048, "grad_norm": 4.731912612915039, "learning_rate": 1.1542417670844074e-06, "loss": 1.5745, "step": 467 }, { "epoch": 0.9459322890348661, "grad_norm": 4.093106269836426, "learning_rate": 1.0706265159939943e-06, "loss": 1.2102, "step": 468 }, { "epoch": 0.9479535118746841, "grad_norm": 4.671316623687744, "learning_rate": 9.901220832790103e-07, "loss": 1.3927, "step": 469 }, { "epoch": 0.9499747347145022, "grad_norm": 5.522085666656494, "learning_rate": 9.12733587228326e-07, "loss": 1.7408, "step": 470 }, { "epoch": 0.9519959575543203, "grad_norm": 5.045165061950684, "learning_rate": 8.384659480266732e-07, "loss": 1.4933, "step": 471 }, { "epoch": 0.9540171803941384, "grad_norm": 5.388870716094971, "learning_rate": 7.673238874417677e-07, "loss": 1.6984, "step": 472 }, { "epoch": 0.9560384032339565, "grad_norm": 4.820789813995361, "learning_rate": 6.993119285241601e-07, "loss": 1.5436, "step": 473 }, { "epoch": 0.9580596260737746, "grad_norm": 5.232425212860107, "learning_rate": 6.344343953196385e-07, "loss": 1.5351, "step": 474 }, { "epoch": 0.9600808489135927, "grad_norm": 4.9825639724731445, "learning_rate": 5.726954125943318e-07, "loss": 1.5842, "step": 475 }, { "epoch": 0.9621020717534108, "grad_norm": 5.142207145690918, "learning_rate": 5.140989055724687e-07, "loss": 1.6371, "step": 476 }, { "epoch": 0.9641232945932289, "grad_norm": 5.354437828063965, "learning_rate": 4.5864859968679506e-07, "loss": 1.4612, "step": 477 }, { "epoch": 0.966144517433047, "grad_norm": 5.071149826049805, "learning_rate": 4.0634802034176244e-07, "loss": 1.3784, "step": 478 }, { "epoch": 0.9681657402728651, "grad_norm": 5.375519275665283, "learning_rate": 3.572004926893413e-07, "loss": 1.5642, "step": 479 }, { "epoch": 0.9701869631126832, "grad_norm": 5.717057704925537, "learning_rate": 3.112091414176621e-07, "loss": 1.7185, "step": 480 }, { "epoch": 0.9722081859525012, "grad_norm": 4.669751167297363, "learning_rate": 2.6837689055232426e-07, "loss": 1.2948, "step": 481 }, { "epoch": 0.9742294087923193, "grad_norm": 4.622890949249268, "learning_rate": 2.287064632705005e-07, "loss": 1.1733, "step": 482 }, { "epoch": 0.9762506316321374, "grad_norm": 4.939849376678467, "learning_rate": 1.9220038172780842e-07, "loss": 1.3322, "step": 483 }, { "epoch": 0.9782718544719555, "grad_norm": 5.243642330169678, "learning_rate": 1.588609668979446e-07, "loss": 1.2098, "step": 484 }, { "epoch": 0.9802930773117736, "grad_norm": 5.232309818267822, "learning_rate": 1.286903384251581e-07, "loss": 1.2723, "step": 485 }, { "epoch": 0.9823143001515917, "grad_norm": 6.266340732574463, "learning_rate": 1.0169041448943039e-07, "loss": 1.2856, "step": 486 }, { "epoch": 0.9843355229914098, "grad_norm": 6.7853851318359375, "learning_rate": 7.78629116845786e-08, "loss": 1.2888, "step": 487 }, { "epoch": 0.9863567458312279, "grad_norm": 5.390272617340088, "learning_rate": 5.7209344909076036e-08, "loss": 1.271, "step": 488 }, { "epoch": 0.988377968671046, "grad_norm": 5.515243053436279, "learning_rate": 3.973102726976819e-08, "loss": 1.1683, "step": 489 }, { "epoch": 0.9903991915108641, "grad_norm": 5.492679595947266, "learning_rate": 2.542906999836725e-08, "loss": 1.2462, "step": 490 }, { "epoch": 0.9924204143506822, "grad_norm": 5.038405418395996, "learning_rate": 1.4304382380819769e-08, "loss": 1.1106, "step": 491 }, { "epoch": 0.9944416371905003, "grad_norm": 5.300425052642822, "learning_rate": 6.357671699486201e-09, "loss": 1.1875, "step": 492 }, { "epoch": 0.9964628600303184, "grad_norm": 5.133315086364746, "learning_rate": 1.5894431881657845e-09, "loss": 1.1627, "step": 493 }, { "epoch": 0.9984840828701365, "grad_norm": 5.80331563949585, "learning_rate": 0.0, "loss": 1.1667, "step": 494 } ], "logging_steps": 1, "max_steps": 494, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2725168948314112e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }