{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 601, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2e-05, "loss": 0.547, "step": 1 }, { "epoch": 0.0, "learning_rate": 4e-05, "loss": 0.5148, "step": 2 }, { "epoch": 0.0, "learning_rate": 6e-05, "loss": 0.5241, "step": 3 }, { "epoch": 0.01, "learning_rate": 8e-05, "loss": 0.3872, "step": 4 }, { "epoch": 0.01, "learning_rate": 0.0001, "loss": 0.3484, "step": 5 }, { "epoch": 0.01, "learning_rate": 0.00012, "loss": 0.2567, "step": 6 }, { "epoch": 0.01, "learning_rate": 0.00014, "loss": 0.2197, "step": 7 }, { "epoch": 0.01, "learning_rate": 0.00016, "loss": 0.2134, "step": 8 }, { "epoch": 0.01, "learning_rate": 0.00018, "loss": 0.188, "step": 9 }, { "epoch": 0.02, "learning_rate": 0.0002, "loss": 0.2067, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.00019999858715745195, "loss": 0.1878, "step": 11 }, { "epoch": 0.02, "learning_rate": 0.00019999434866973016, "loss": 0.1898, "step": 12 }, { "epoch": 0.02, "learning_rate": 0.00019998728465660105, "loss": 0.1904, "step": 13 }, { "epoch": 0.02, "learning_rate": 0.0001999773953176713, "loss": 0.157, "step": 14 }, { "epoch": 0.02, "learning_rate": 0.00019996468093238257, "loss": 0.1442, "step": 15 }, { "epoch": 0.03, "learning_rate": 0.00019994914186000328, "loss": 0.1884, "step": 16 }, { "epoch": 0.03, "learning_rate": 0.00019993077853961872, "loss": 0.1518, "step": 17 }, { "epoch": 0.03, "learning_rate": 0.00019990959149011848, "loss": 0.1615, "step": 18 }, { "epoch": 0.03, "learning_rate": 0.00019988558131018186, "loss": 0.1421, "step": 19 }, { "epoch": 0.03, "learning_rate": 0.00019985874867826096, "loss": 0.1438, "step": 20 }, { "epoch": 0.03, "learning_rate": 0.00019982909435256144, "loss": 0.1365, "step": 21 }, { "epoch": 0.04, "learning_rate": 0.00019979661917102115, "loss": 0.127, "step": 22 }, { "epoch": 0.04, "learning_rate": 0.00019976132405128647, "loss": 0.145, "step": 23 }, { "epoch": 0.04, "learning_rate": 0.00019972320999068636, "loss": 0.1647, "step": 24 }, { "epoch": 0.04, "learning_rate": 0.0001996822780662041, "loss": 0.137, "step": 25 }, { "epoch": 0.04, "learning_rate": 0.00019963852943444702, "loss": 0.1427, "step": 26 }, { "epoch": 0.04, "learning_rate": 0.0001995919653316137, "loss": 0.1156, "step": 27 }, { "epoch": 0.05, "learning_rate": 0.000199542587073459, "loss": 0.1295, "step": 28 }, { "epoch": 0.05, "learning_rate": 0.00019949039605525703, "loss": 0.1465, "step": 29 }, { "epoch": 0.05, "learning_rate": 0.00019943539375176164, "loss": 0.1371, "step": 30 }, { "epoch": 0.05, "learning_rate": 0.00019937758171716468, "loss": 0.1276, "step": 31 }, { "epoch": 0.05, "learning_rate": 0.00019931696158505223, "loss": 0.1166, "step": 32 }, { "epoch": 0.05, "learning_rate": 0.00019925353506835826, "loss": 0.1194, "step": 33 }, { "epoch": 0.06, "learning_rate": 0.00019918730395931649, "loss": 0.1146, "step": 34 }, { "epoch": 0.06, "learning_rate": 0.00019911827012940946, "loss": 0.1539, "step": 35 }, { "epoch": 0.06, "learning_rate": 0.0001990464355293158, "loss": 1.9511, "step": 36 }, { "epoch": 0.06, "learning_rate": 0.00019897180218885507, "loss": 0.1222, "step": 37 }, { "epoch": 0.06, "learning_rate": 0.00019889437221693053, "loss": 0.1223, "step": 38 }, { "epoch": 0.06, "learning_rate": 0.0001988141478014693, "loss": 0.1353, "step": 39 }, { "epoch": 0.07, "learning_rate": 0.00019873113120936074, "loss": 0.1265, "step": 40 }, { "epoch": 0.07, "learning_rate": 0.00019864532478639234, "loss": 0.1288, "step": 41 }, { "epoch": 0.07, "learning_rate": 0.00019855673095718336, "loss": 0.128, "step": 42 }, { "epoch": 0.07, "learning_rate": 0.0001984653522251165, "loss": 0.1297, "step": 43 }, { "epoch": 0.07, "learning_rate": 0.00019837119117226688, "loss": 0.1265, "step": 44 }, { "epoch": 0.07, "learning_rate": 0.0001982742504593294, "loss": 0.1269, "step": 45 }, { "epoch": 0.08, "learning_rate": 0.00019817453282554333, "loss": 0.1407, "step": 46 }, { "epoch": 0.08, "learning_rate": 0.00019807204108861502, "loss": 0.1311, "step": 47 }, { "epoch": 0.08, "learning_rate": 0.0001979667781446381, "loss": 0.1312, "step": 48 }, { "epoch": 0.08, "learning_rate": 0.00019785874696801202, "loss": 0.1335, "step": 49 }, { "epoch": 0.08, "learning_rate": 0.00019774795061135752, "loss": 0.121, "step": 50 }, { "epoch": 0.08, "learning_rate": 0.00019763439220543084, "loss": 0.1268, "step": 51 }, { "epoch": 0.09, "learning_rate": 0.00019751807495903484, "loss": 0.1353, "step": 52 }, { "epoch": 0.09, "learning_rate": 0.00019739900215892867, "loss": 0.1242, "step": 53 }, { "epoch": 0.09, "learning_rate": 0.0001972771771697347, "loss": 0.1048, "step": 54 }, { "epoch": 0.09, "learning_rate": 0.00019715260343384347, "loss": 0.1329, "step": 55 }, { "epoch": 0.09, "learning_rate": 0.00019702528447131646, "loss": 0.1197, "step": 56 }, { "epoch": 0.09, "learning_rate": 0.00019689522387978666, "loss": 0.1463, "step": 57 }, { "epoch": 0.1, "learning_rate": 0.00019676242533435678, "loss": 0.1296, "step": 58 }, { "epoch": 0.1, "learning_rate": 0.00019662689258749554, "loss": 0.1243, "step": 59 }, { "epoch": 0.1, "learning_rate": 0.00019648862946893158, "loss": 0.1257, "step": 60 }, { "epoch": 0.1, "learning_rate": 0.00019634763988554522, "loss": 0.1348, "step": 61 }, { "epoch": 0.1, "learning_rate": 0.0001962039278212581, "loss": 0.1128, "step": 62 }, { "epoch": 0.1, "learning_rate": 0.00019605749733692064, "loss": 0.1227, "step": 63 }, { "epoch": 0.11, "learning_rate": 0.00019590835257019714, "loss": 0.1329, "step": 64 }, { "epoch": 0.11, "learning_rate": 0.00019575649773544913, "loss": 0.1291, "step": 65 }, { "epoch": 0.11, "learning_rate": 0.00019560193712361596, "loss": 0.119, "step": 66 }, { "epoch": 0.11, "learning_rate": 0.00019544467510209388, "loss": 0.126, "step": 67 }, { "epoch": 0.11, "learning_rate": 0.00019528471611461235, "loss": 0.1158, "step": 68 }, { "epoch": 0.11, "learning_rate": 0.00019512206468110863, "loss": 0.1309, "step": 69 }, { "epoch": 0.12, "learning_rate": 0.00019495672539760007, "loss": 0.1203, "step": 70 }, { "epoch": 0.12, "learning_rate": 0.00019478870293605416, "loss": 0.107, "step": 71 }, { "epoch": 0.12, "learning_rate": 0.0001946180020442565, "loss": 0.1249, "step": 72 }, { "epoch": 0.12, "learning_rate": 0.00019444462754567682, "loss": 0.1163, "step": 73 }, { "epoch": 0.12, "learning_rate": 0.00019426858433933247, "loss": 0.1192, "step": 74 }, { "epoch": 0.12, "learning_rate": 0.00019408987739965005, "loss": 0.1205, "step": 75 }, { "epoch": 0.13, "learning_rate": 0.00019390851177632497, "loss": 0.1205, "step": 76 }, { "epoch": 0.13, "learning_rate": 0.00019372449259417857, "loss": 0.1247, "step": 77 }, { "epoch": 0.13, "learning_rate": 0.0001935378250530135, "loss": 0.1195, "step": 78 }, { "epoch": 0.13, "learning_rate": 0.00019334851442746664, "loss": 0.1131, "step": 79 }, { "epoch": 0.13, "learning_rate": 0.00019315656606686013, "loss": 0.1119, "step": 80 }, { "epoch": 0.13, "learning_rate": 0.00019296198539505013, "loss": 0.1137, "step": 81 }, { "epoch": 0.14, "learning_rate": 0.00019276477791027374, "loss": 0.1052, "step": 82 }, { "epoch": 0.14, "learning_rate": 0.00019256494918499346, "loss": 0.1204, "step": 83 }, { "epoch": 0.14, "learning_rate": 0.00019236250486573978, "loss": 0.1158, "step": 84 }, { "epoch": 0.14, "learning_rate": 0.00019215745067295169, "loss": 0.1151, "step": 85 }, { "epoch": 0.14, "learning_rate": 0.0001919497924008149, "loss": 0.1274, "step": 86 }, { "epoch": 0.14, "learning_rate": 0.00019173953591709828, "loss": 0.1229, "step": 87 }, { "epoch": 0.15, "learning_rate": 0.000191526687162988, "loss": 0.1194, "step": 88 }, { "epoch": 0.15, "learning_rate": 0.0001913112521529195, "loss": 0.1409, "step": 89 }, { "epoch": 0.15, "learning_rate": 0.00019109323697440782, "loss": 0.1274, "step": 90 }, { "epoch": 0.15, "learning_rate": 0.00019087264778787534, "loss": 0.1188, "step": 91 }, { "epoch": 0.15, "learning_rate": 0.00019064949082647786, "loss": 0.1149, "step": 92 }, { "epoch": 0.15, "learning_rate": 0.0001904237723959283, "loss": 0.115, "step": 93 }, { "epoch": 0.16, "learning_rate": 0.00019019549887431877, "loss": 0.1297, "step": 94 }, { "epoch": 0.16, "learning_rate": 0.00018996467671194016, "loss": 0.1286, "step": 95 }, { "epoch": 0.16, "learning_rate": 0.00018973131243109988, "loss": 0.1061, "step": 96 }, { "epoch": 0.16, "learning_rate": 0.00018949541262593762, "loss": 0.1189, "step": 97 }, { "epoch": 0.16, "learning_rate": 0.00018925698396223909, "loss": 0.1372, "step": 98 }, { "epoch": 0.16, "learning_rate": 0.0001890160331772474, "loss": 0.118, "step": 99 }, { "epoch": 0.17, "learning_rate": 0.00018877256707947306, "loss": 0.1109, "step": 100 }, { "epoch": 0.17, "learning_rate": 0.00018852659254850126, "loss": 0.1142, "step": 101 }, { "epoch": 0.17, "learning_rate": 0.00018827811653479768, "loss": 0.1117, "step": 102 }, { "epoch": 0.17, "learning_rate": 0.00018802714605951199, "loss": 0.1251, "step": 103 }, { "epoch": 0.17, "learning_rate": 0.00018777368821427953, "loss": 0.1153, "step": 104 }, { "epoch": 0.17, "learning_rate": 0.00018751775016102087, "loss": 0.1288, "step": 105 }, { "epoch": 0.18, "learning_rate": 0.00018725933913173938, "loss": 0.1261, "step": 106 }, { "epoch": 0.18, "learning_rate": 0.00018699846242831706, "loss": 0.1349, "step": 107 }, { "epoch": 0.18, "learning_rate": 0.00018673512742230802, "loss": 0.1145, "step": 108 }, { "epoch": 0.18, "learning_rate": 0.00018646934155473022, "loss": 0.1206, "step": 109 }, { "epoch": 0.18, "learning_rate": 0.0001862011123358554, "loss": 0.1169, "step": 110 }, { "epoch": 0.18, "learning_rate": 0.00018593044734499655, "loss": 0.1256, "step": 111 }, { "epoch": 0.19, "learning_rate": 0.00018565735423029404, "loss": 0.1233, "step": 112 }, { "epoch": 0.19, "learning_rate": 0.00018538184070849924, "loss": 0.1252, "step": 113 }, { "epoch": 0.19, "learning_rate": 0.00018510391456475676, "loss": 0.1174, "step": 114 }, { "epoch": 0.19, "learning_rate": 0.00018482358365238413, "loss": 0.1205, "step": 115 }, { "epoch": 0.19, "learning_rate": 0.0001845408558926502, "loss": 0.1298, "step": 116 }, { "epoch": 0.19, "learning_rate": 0.00018425573927455117, "loss": 0.1085, "step": 117 }, { "epoch": 0.2, "learning_rate": 0.0001839682418545848, "loss": 0.1363, "step": 118 }, { "epoch": 0.2, "learning_rate": 0.00018367837175652284, "loss": 0.1228, "step": 119 }, { "epoch": 0.2, "learning_rate": 0.0001833861371711814, "loss": 0.1259, "step": 120 }, { "epoch": 0.2, "learning_rate": 0.00018309154635618965, "loss": 0.1246, "step": 121 }, { "epoch": 0.2, "learning_rate": 0.00018279460763575637, "loss": 0.1288, "step": 122 }, { "epoch": 0.2, "learning_rate": 0.0001824953294004347, "loss": 0.1203, "step": 123 }, { "epoch": 0.21, "learning_rate": 0.00018219372010688515, "loss": 0.1166, "step": 124 }, { "epoch": 0.21, "learning_rate": 0.00018188978827763652, "loss": 0.1377, "step": 125 }, { "epoch": 0.21, "learning_rate": 0.00018158354250084527, "loss": 0.1364, "step": 126 }, { "epoch": 0.21, "learning_rate": 0.00018127499143005268, "loss": 0.1179, "step": 127 }, { "epoch": 0.21, "learning_rate": 0.00018096414378394028, "loss": 0.1308, "step": 128 }, { "epoch": 0.21, "learning_rate": 0.00018065100834608377, "loss": 0.1359, "step": 129 }, { "epoch": 0.22, "learning_rate": 0.00018033559396470454, "loss": 0.1253, "step": 130 }, { "epoch": 0.22, "learning_rate": 0.00018001790955241972, "loss": 0.1106, "step": 131 }, { "epoch": 0.22, "learning_rate": 0.0001796979640859904, "loss": 0.118, "step": 132 }, { "epoch": 0.22, "learning_rate": 0.000179375766606068, "loss": 0.1361, "step": 133 }, { "epoch": 0.22, "learning_rate": 0.0001790513262169386, "loss": 0.1388, "step": 134 }, { "epoch": 0.22, "learning_rate": 0.00017872465208626598, "loss": 0.1448, "step": 135 }, { "epoch": 0.23, "learning_rate": 0.00017839575344483238, "loss": 0.111, "step": 136 }, { "epoch": 0.23, "learning_rate": 0.00017806463958627762, "loss": 0.115, "step": 137 }, { "epoch": 0.23, "learning_rate": 0.00017773131986683672, "loss": 0.1198, "step": 138 }, { "epoch": 0.23, "learning_rate": 0.00017739580370507532, "loss": 0.1347, "step": 139 }, { "epoch": 0.23, "learning_rate": 0.00017705810058162353, "loss": 0.12, "step": 140 }, { "epoch": 0.23, "learning_rate": 0.00017671822003890823, "loss": 0.1258, "step": 141 }, { "epoch": 0.24, "learning_rate": 0.00017637617168088325, "loss": 0.1357, "step": 142 }, { "epoch": 0.24, "learning_rate": 0.0001760319651727581, "loss": 0.132, "step": 143 }, { "epoch": 0.24, "learning_rate": 0.0001756856102407247, "loss": 0.1134, "step": 144 }, { "epoch": 0.24, "learning_rate": 0.0001753371166716828, "loss": 0.1106, "step": 145 }, { "epoch": 0.24, "learning_rate": 0.00017498649431296322, "loss": 0.1133, "step": 146 }, { "epoch": 0.24, "learning_rate": 0.0001746337530720497, "loss": 0.1232, "step": 147 }, { "epoch": 0.25, "learning_rate": 0.00017427890291629893, "loss": 0.1249, "step": 148 }, { "epoch": 0.25, "learning_rate": 0.00017392195387265887, "loss": 0.1172, "step": 149 }, { "epoch": 0.25, "learning_rate": 0.00017356291602738542, "loss": 0.1112, "step": 150 }, { "epoch": 0.25, "learning_rate": 0.0001732017995257575, "loss": 0.1124, "step": 151 }, { "epoch": 0.25, "learning_rate": 0.00017283861457179022, "loss": 0.124, "step": 152 }, { "epoch": 0.25, "learning_rate": 0.00017247337142794678, "loss": 0.1116, "step": 153 }, { "epoch": 0.26, "learning_rate": 0.0001721060804148482, "loss": 0.1294, "step": 154 }, { "epoch": 0.26, "learning_rate": 0.0001717367519109819, "loss": 0.1185, "step": 155 }, { "epoch": 0.26, "learning_rate": 0.00017136539635240837, "loss": 0.1383, "step": 156 }, { "epoch": 0.26, "learning_rate": 0.0001709920242324663, "loss": 0.1204, "step": 157 }, { "epoch": 0.26, "learning_rate": 0.00017061664610147604, "loss": 0.1293, "step": 158 }, { "epoch": 0.26, "learning_rate": 0.0001702392725664415, "loss": 0.1303, "step": 159 }, { "epoch": 0.27, "learning_rate": 0.00016985991429075036, "loss": 0.1189, "step": 160 }, { "epoch": 0.27, "learning_rate": 0.00016947858199387294, "loss": 0.1431, "step": 161 }, { "epoch": 0.27, "learning_rate": 0.00016909528645105907, "loss": 0.1174, "step": 162 }, { "epoch": 0.27, "learning_rate": 0.00016871003849303382, "loss": 0.1306, "step": 163 }, { "epoch": 0.27, "learning_rate": 0.0001683228490056913, "loss": 0.1086, "step": 164 }, { "epoch": 0.27, "learning_rate": 0.00016793372892978713, "loss": 0.122, "step": 165 }, { "epoch": 0.28, "learning_rate": 0.00016754268926062938, "loss": 0.1081, "step": 166 }, { "epoch": 0.28, "learning_rate": 0.0001671497410477676, "loss": 0.1184, "step": 167 }, { "epoch": 0.28, "learning_rate": 0.00016675489539468092, "loss": 0.1238, "step": 168 }, { "epoch": 0.28, "learning_rate": 0.0001663581634584641, "loss": 0.1256, "step": 169 }, { "epoch": 0.28, "learning_rate": 0.0001659595564495124, "loss": 0.1242, "step": 170 }, { "epoch": 0.28, "learning_rate": 0.00016555908563120457, "loss": 0.1179, "step": 171 }, { "epoch": 0.29, "learning_rate": 0.0001651567623195849, "loss": 0.1173, "step": 172 }, { "epoch": 0.29, "learning_rate": 0.00016475259788304317, "loss": 0.1146, "step": 173 }, { "epoch": 0.29, "learning_rate": 0.00016434660374199376, "loss": 0.1254, "step": 174 }, { "epoch": 0.29, "learning_rate": 0.00016393879136855248, "loss": 0.1263, "step": 175 }, { "epoch": 0.29, "learning_rate": 0.00016352917228621284, "loss": 0.1181, "step": 176 }, { "epoch": 0.29, "learning_rate": 0.0001631177580695202, "loss": 0.1213, "step": 177 }, { "epoch": 0.3, "learning_rate": 0.00016270456034374474, "loss": 0.1156, "step": 178 }, { "epoch": 0.3, "learning_rate": 0.00016228959078455306, "loss": 0.1145, "step": 179 }, { "epoch": 0.3, "learning_rate": 0.0001618728611176781, "loss": 0.1385, "step": 180 }, { "epoch": 0.3, "learning_rate": 0.000161454383118588, "loss": 0.1228, "step": 181 }, { "epoch": 0.3, "learning_rate": 0.00016103416861215313, "loss": 0.1175, "step": 182 }, { "epoch": 0.3, "learning_rate": 0.00016061222947231225, "loss": 0.1328, "step": 183 }, { "epoch": 0.31, "learning_rate": 0.0001601885776217367, "loss": 0.1223, "step": 184 }, { "epoch": 0.31, "learning_rate": 0.00015976322503149373, "loss": 0.1244, "step": 185 }, { "epoch": 0.31, "learning_rate": 0.00015933618372070805, "loss": 0.1356, "step": 186 }, { "epoch": 0.31, "learning_rate": 0.00015890746575622231, "loss": 0.12, "step": 187 }, { "epoch": 0.31, "learning_rate": 0.00015847708325225618, "loss": 0.1254, "step": 188 }, { "epoch": 0.31, "learning_rate": 0.00015804504837006394, "loss": 0.1139, "step": 189 }, { "epoch": 0.32, "learning_rate": 0.00015761137331759084, "loss": 0.1251, "step": 190 }, { "epoch": 0.32, "learning_rate": 0.0001571760703491282, "loss": 0.1269, "step": 191 }, { "epoch": 0.32, "learning_rate": 0.00015673915176496713, "loss": 0.1203, "step": 192 }, { "epoch": 0.32, "learning_rate": 0.00015630062991105098, "loss": 0.108, "step": 193 }, { "epoch": 0.32, "learning_rate": 0.00015586051717862636, "loss": 0.1276, "step": 194 }, { "epoch": 0.32, "learning_rate": 0.0001554188260038932, "loss": 0.1164, "step": 195 }, { "epoch": 0.33, "learning_rate": 0.00015497556886765316, "loss": 0.1212, "step": 196 }, { "epoch": 0.33, "learning_rate": 0.0001545307582949571, "loss": 0.1358, "step": 197 }, { "epoch": 0.33, "learning_rate": 0.00015408440685475109, "loss": 0.1168, "step": 198 }, { "epoch": 0.33, "learning_rate": 0.0001536365271595212, "loss": 0.1237, "step": 199 }, { "epoch": 0.33, "learning_rate": 0.00015318713186493734, "loss": 0.136, "step": 200 }, { "epoch": 0.33, "learning_rate": 0.00015273623366949523, "loss": 0.1332, "step": 201 }, { "epoch": 0.34, "learning_rate": 0.0001522838453141581, "loss": 0.1122, "step": 202 }, { "epoch": 0.34, "learning_rate": 0.00015182997958199617, "loss": 0.118, "step": 203 }, { "epoch": 0.34, "learning_rate": 0.00015137464929782586, "loss": 0.1277, "step": 204 }, { "epoch": 0.34, "learning_rate": 0.00015091786732784716, "loss": 0.1117, "step": 205 }, { "epoch": 0.34, "learning_rate": 0.00015045964657928006, "loss": 0.1025, "step": 206 }, { "epoch": 0.34, "learning_rate": 0.00015000000000000001, "loss": 0.1251, "step": 207 }, { "epoch": 0.35, "learning_rate": 0.00014953894057817188, "loss": 0.142, "step": 208 }, { "epoch": 0.35, "learning_rate": 0.00014907648134188304, "loss": 0.1322, "step": 209 }, { "epoch": 0.35, "learning_rate": 0.0001486126353587752, "loss": 0.1236, "step": 210 }, { "epoch": 0.35, "learning_rate": 0.00014814741573567514, "loss": 0.1293, "step": 211 }, { "epoch": 0.35, "learning_rate": 0.0001476808356182245, "loss": 0.1157, "step": 212 }, { "epoch": 0.35, "learning_rate": 0.00014721290819050804, "loss": 0.1299, "step": 213 }, { "epoch": 0.36, "learning_rate": 0.0001467436466746814, "loss": 0.1052, "step": 214 }, { "epoch": 0.36, "learning_rate": 0.00014627306433059723, "loss": 0.1242, "step": 215 }, { "epoch": 0.36, "learning_rate": 0.00014580117445543077, "loss": 0.103, "step": 216 }, { "epoch": 0.36, "learning_rate": 0.00014532799038330385, "loss": 0.1274, "step": 217 }, { "epoch": 0.36, "learning_rate": 0.00014485352548490826, "loss": 0.12, "step": 218 }, { "epoch": 0.36, "learning_rate": 0.00014437779316712796, "loss": 0.1113, "step": 219 }, { "epoch": 0.37, "learning_rate": 0.00014390080687266013, "loss": 0.1244, "step": 220 }, { "epoch": 0.37, "learning_rate": 0.0001434225800796354, "loss": 0.1111, "step": 221 }, { "epoch": 0.37, "learning_rate": 0.000142943126301237, "loss": 0.1007, "step": 222 }, { "epoch": 0.37, "learning_rate": 0.00014246245908531882, "loss": 0.1175, "step": 223 }, { "epoch": 0.37, "learning_rate": 0.00014198059201402287, "loss": 0.1285, "step": 224 }, { "epoch": 0.37, "learning_rate": 0.00014149753870339507, "loss": 0.1523, "step": 225 }, { "epoch": 0.38, "learning_rate": 0.0001410133128030009, "loss": 0.1212, "step": 226 }, { "epoch": 0.38, "learning_rate": 0.00014052792799553934, "loss": 0.1053, "step": 227 }, { "epoch": 0.38, "learning_rate": 0.00014004139799645668, "loss": 0.1356, "step": 228 }, { "epoch": 0.38, "learning_rate": 0.0001395537365535585, "loss": 0.1316, "step": 229 }, { "epoch": 0.38, "learning_rate": 0.00013906495744662157, "loss": 0.1316, "step": 230 }, { "epoch": 0.38, "learning_rate": 0.00013857507448700423, "loss": 0.1209, "step": 231 }, { "epoch": 0.39, "learning_rate": 0.0001380841015172563, "loss": 0.1191, "step": 232 }, { "epoch": 0.39, "learning_rate": 0.00013759205241072782, "loss": 0.1207, "step": 233 }, { "epoch": 0.39, "learning_rate": 0.00013709894107117698, "loss": 0.1234, "step": 234 }, { "epoch": 0.39, "learning_rate": 0.00013660478143237746, "loss": 0.1269, "step": 235 }, { "epoch": 0.39, "learning_rate": 0.00013610958745772456, "loss": 0.1163, "step": 236 }, { "epoch": 0.39, "learning_rate": 0.00013561337313984054, "loss": 0.1369, "step": 237 }, { "epoch": 0.4, "learning_rate": 0.0001351161525001795, "loss": 0.1146, "step": 238 }, { "epoch": 0.4, "learning_rate": 0.00013461793958863087, "loss": 0.1078, "step": 239 }, { "epoch": 0.4, "learning_rate": 0.00013411874848312272, "loss": 0.1238, "step": 240 }, { "epoch": 0.4, "learning_rate": 0.0001336185932892237, "loss": 0.1116, "step": 241 }, { "epoch": 0.4, "learning_rate": 0.00013311748813974453, "loss": 0.1365, "step": 242 }, { "epoch": 0.4, "learning_rate": 0.0001326154471943388, "loss": 0.1286, "step": 243 }, { "epoch": 0.41, "learning_rate": 0.00013211248463910262, "loss": 0.1142, "step": 244 }, { "epoch": 0.41, "learning_rate": 0.000131608614686174, "loss": 0.1158, "step": 245 }, { "epoch": 0.41, "learning_rate": 0.0001311038515733311, "loss": 1.5948, "step": 246 }, { "epoch": 0.41, "learning_rate": 0.00013059820956358998, "loss": 0.1267, "step": 247 }, { "epoch": 0.41, "learning_rate": 0.00013009170294480147, "loss": 0.1171, "step": 248 }, { "epoch": 0.41, "learning_rate": 0.0001295843460292477, "loss": 0.128, "step": 249 }, { "epoch": 0.42, "learning_rate": 0.0001290761531532374, "loss": 0.1455, "step": 250 }, { "epoch": 0.42, "learning_rate": 0.0001285671386767009, "loss": 0.1207, "step": 251 }, { "epoch": 0.42, "learning_rate": 0.00012805731698278442, "loss": 0.1266, "step": 252 }, { "epoch": 0.42, "learning_rate": 0.00012754670247744354, "loss": 0.125, "step": 253 }, { "epoch": 0.42, "learning_rate": 0.0001270353095890363, "loss": 0.1238, "step": 254 }, { "epoch": 0.42, "learning_rate": 0.00012652315276791528, "loss": 0.125, "step": 255 }, { "epoch": 0.43, "learning_rate": 0.0001260102464860195, "loss": 0.1201, "step": 256 }, { "epoch": 0.43, "learning_rate": 0.00012549660523646528, "loss": 0.1227, "step": 257 }, { "epoch": 0.43, "learning_rate": 0.00012498224353313684, "loss": 0.1205, "step": 258 }, { "epoch": 0.43, "learning_rate": 0.00012446717591027624, "loss": 0.1327, "step": 259 }, { "epoch": 0.43, "learning_rate": 0.00012395141692207243, "loss": 0.1193, "step": 260 }, { "epoch": 0.43, "learning_rate": 0.00012343498114225038, "loss": 0.114, "step": 261 }, { "epoch": 0.44, "learning_rate": 0.00012291788316365888, "loss": 0.1143, "step": 262 }, { "epoch": 0.44, "learning_rate": 0.00012240013759785848, "loss": 0.1168, "step": 263 }, { "epoch": 0.44, "learning_rate": 0.00012188175907470847, "loss": 0.1348, "step": 264 }, { "epoch": 0.44, "learning_rate": 0.00012136276224195348, "loss": 0.1153, "step": 265 }, { "epoch": 0.44, "learning_rate": 0.00012084316176480973, "loss": 0.1334, "step": 266 }, { "epoch": 0.44, "learning_rate": 0.00012032297232555039, "loss": 0.1024, "step": 267 }, { "epoch": 0.45, "learning_rate": 0.00011980220862309097, "loss": 0.1257, "step": 268 }, { "epoch": 0.45, "learning_rate": 0.00011928088537257375, "loss": 0.1145, "step": 269 }, { "epoch": 0.45, "learning_rate": 0.00011875901730495215, "loss": 0.1288, "step": 270 }, { "epoch": 0.45, "learning_rate": 0.0001182366191665744, "loss": 0.127, "step": 271 }, { "epoch": 0.45, "learning_rate": 0.00011771370571876681, "loss": 0.118, "step": 272 }, { "epoch": 0.45, "learning_rate": 0.00011719029173741676, "loss": 0.1216, "step": 273 }, { "epoch": 0.46, "learning_rate": 0.00011666639201255506, "loss": 0.1261, "step": 274 }, { "epoch": 0.46, "learning_rate": 0.00011614202134793823, "loss": 0.1207, "step": 275 }, { "epoch": 0.46, "learning_rate": 0.00011561719456062994, "loss": 0.1165, "step": 276 }, { "epoch": 0.46, "learning_rate": 0.00011509192648058249, "loss": 0.1293, "step": 277 }, { "epoch": 0.46, "learning_rate": 0.00011456623195021778, "loss": 0.1447, "step": 278 }, { "epoch": 0.46, "learning_rate": 0.00011404012582400779, "loss": 2.074, "step": 279 }, { "epoch": 0.47, "learning_rate": 0.00011351362296805485, "loss": 0.1065, "step": 280 }, { "epoch": 0.47, "learning_rate": 0.00011298673825967183, "loss": 0.1089, "step": 281 }, { "epoch": 0.47, "learning_rate": 0.00011245948658696126, "loss": 0.1108, "step": 282 }, { "epoch": 0.47, "learning_rate": 0.00011193188284839517, "loss": 0.1295, "step": 283 }, { "epoch": 0.47, "learning_rate": 0.00011140394195239376, "loss": 0.1173, "step": 284 }, { "epoch": 0.47, "learning_rate": 0.00011087567881690422, "loss": 0.1269, "step": 285 }, { "epoch": 0.48, "learning_rate": 0.00011034710836897921, "loss": 0.1198, "step": 286 }, { "epoch": 0.48, "learning_rate": 0.00010981824554435518, "loss": 0.153, "step": 287 }, { "epoch": 0.48, "learning_rate": 0.00010928910528703007, "loss": 0.1144, "step": 288 }, { "epoch": 0.48, "learning_rate": 0.0001087597025488413, "loss": 0.1291, "step": 289 }, { "epoch": 0.48, "learning_rate": 0.00010823005228904314, "loss": 0.1168, "step": 290 }, { "epoch": 0.48, "learning_rate": 0.00010770016947388407, "loss": 0.1037, "step": 291 }, { "epoch": 0.49, "learning_rate": 0.00010717006907618377, "loss": 0.1261, "step": 292 }, { "epoch": 0.49, "learning_rate": 0.0001066397660749102, "loss": 0.1297, "step": 293 }, { "epoch": 0.49, "learning_rate": 0.00010610927545475624, "loss": 0.1124, "step": 294 }, { "epoch": 0.49, "learning_rate": 0.00010557861220571625, "loss": 0.1249, "step": 295 }, { "epoch": 0.49, "learning_rate": 0.0001050477913226626, "loss": 0.1205, "step": 296 }, { "epoch": 0.49, "learning_rate": 0.00010451682780492189, "loss": 0.1174, "step": 297 }, { "epoch": 0.5, "learning_rate": 0.00010398573665585105, "loss": 0.1213, "step": 298 }, { "epoch": 0.5, "learning_rate": 0.00010345453288241356, "loss": 0.1079, "step": 299 }, { "epoch": 0.5, "learning_rate": 0.00010292323149475527, "loss": 0.1022, "step": 300 }, { "epoch": 0.5, "learning_rate": 0.0001023918475057803, "loss": 0.1013, "step": 301 }, { "epoch": 0.5, "learning_rate": 0.00010186039593072685, "loss": 0.1288, "step": 302 }, { "epoch": 0.5, "learning_rate": 0.00010132889178674283, "loss": 0.1187, "step": 303 }, { "epoch": 0.51, "learning_rate": 0.00010079735009246167, "loss": 0.1162, "step": 304 }, { "epoch": 0.51, "learning_rate": 0.00010026578586757778, "loss": 0.1128, "step": 305 }, { "epoch": 0.51, "learning_rate": 9.973421413242225e-05, "loss": 0.1187, "step": 306 }, { "epoch": 0.51, "learning_rate": 9.920264990753837e-05, "loss": 0.1238, "step": 307 }, { "epoch": 0.51, "learning_rate": 9.867110821325717e-05, "loss": 0.128, "step": 308 }, { "epoch": 0.51, "learning_rate": 9.813960406927319e-05, "loss": 0.1277, "step": 309 }, { "epoch": 0.52, "learning_rate": 9.760815249421973e-05, "loss": 0.1131, "step": 310 }, { "epoch": 0.52, "learning_rate": 9.707676850524473e-05, "loss": 0.116, "step": 311 }, { "epoch": 0.52, "learning_rate": 9.654546711758645e-05, "loss": 0.1076, "step": 312 }, { "epoch": 0.52, "learning_rate": 9.601426334414898e-05, "loss": 0.1117, "step": 313 }, { "epoch": 0.52, "learning_rate": 9.548317219507815e-05, "loss": 0.1418, "step": 314 }, { "epoch": 0.52, "learning_rate": 9.495220867733738e-05, "loss": 0.1216, "step": 315 }, { "epoch": 0.53, "learning_rate": 9.442138779428376e-05, "loss": 0.1156, "step": 316 }, { "epoch": 0.53, "learning_rate": 9.38907245452438e-05, "loss": 0.1208, "step": 317 }, { "epoch": 0.53, "learning_rate": 9.33602339250898e-05, "loss": 0.1244, "step": 318 }, { "epoch": 0.53, "learning_rate": 9.282993092381625e-05, "loss": 0.124, "step": 319 }, { "epoch": 0.53, "learning_rate": 9.229983052611597e-05, "loss": 0.1151, "step": 320 }, { "epoch": 0.53, "learning_rate": 9.176994771095687e-05, "loss": 0.1076, "step": 321 }, { "epoch": 0.54, "learning_rate": 9.12402974511587e-05, "loss": 0.1269, "step": 322 }, { "epoch": 0.54, "learning_rate": 9.071089471296995e-05, "loss": 0.1077, "step": 323 }, { "epoch": 0.54, "learning_rate": 9.018175445564485e-05, "loss": 0.1229, "step": 324 }, { "epoch": 0.54, "learning_rate": 8.965289163102078e-05, "loss": 0.1107, "step": 325 }, { "epoch": 0.54, "learning_rate": 8.912432118309582e-05, "loss": 0.1057, "step": 326 }, { "epoch": 0.54, "learning_rate": 8.859605804760626e-05, "loss": 0.107, "step": 327 }, { "epoch": 0.55, "learning_rate": 8.806811715160485e-05, "loss": 0.1142, "step": 328 }, { "epoch": 0.55, "learning_rate": 8.754051341303875e-05, "loss": 0.1188, "step": 329 }, { "epoch": 0.55, "learning_rate": 8.70132617403282e-05, "loss": 0.1011, "step": 330 }, { "epoch": 0.55, "learning_rate": 8.648637703194516e-05, "loss": 0.1168, "step": 331 }, { "epoch": 0.55, "learning_rate": 8.595987417599225e-05, "loss": 0.1239, "step": 332 }, { "epoch": 0.55, "learning_rate": 8.543376804978224e-05, "loss": 0.1039, "step": 333 }, { "epoch": 0.56, "learning_rate": 8.490807351941753e-05, "loss": 0.1113, "step": 334 }, { "epoch": 0.56, "learning_rate": 8.438280543937011e-05, "loss": 0.1179, "step": 335 }, { "epoch": 0.56, "learning_rate": 8.385797865206178e-05, "loss": 0.1257, "step": 336 }, { "epoch": 0.56, "learning_rate": 8.333360798744496e-05, "loss": 0.1193, "step": 337 }, { "epoch": 0.56, "learning_rate": 8.280970826258329e-05, "loss": 0.1265, "step": 338 }, { "epoch": 0.56, "learning_rate": 8.228629428123319e-05, "loss": 0.1181, "step": 339 }, { "epoch": 0.57, "learning_rate": 8.176338083342561e-05, "loss": 0.113, "step": 340 }, { "epoch": 0.57, "learning_rate": 8.124098269504787e-05, "loss": 0.1159, "step": 341 }, { "epoch": 0.57, "learning_rate": 8.07191146274263e-05, "loss": 0.1301, "step": 342 }, { "epoch": 0.57, "learning_rate": 8.019779137690906e-05, "loss": 0.1203, "step": 343 }, { "epoch": 0.57, "learning_rate": 7.967702767444964e-05, "loss": 0.1164, "step": 344 }, { "epoch": 0.57, "learning_rate": 7.915683823519031e-05, "loss": 0.1198, "step": 345 }, { "epoch": 0.58, "learning_rate": 7.863723775804651e-05, "loss": 0.1027, "step": 346 }, { "epoch": 0.58, "learning_rate": 7.811824092529156e-05, "loss": 0.1379, "step": 347 }, { "epoch": 0.58, "learning_rate": 7.759986240214155e-05, "loss": 0.1112, "step": 348 }, { "epoch": 0.58, "learning_rate": 7.708211683634112e-05, "loss": 0.1413, "step": 349 }, { "epoch": 0.58, "learning_rate": 7.656501885774964e-05, "loss": 0.1247, "step": 350 }, { "epoch": 0.58, "learning_rate": 7.604858307792758e-05, "loss": 0.1168, "step": 351 }, { "epoch": 0.59, "learning_rate": 7.553282408972382e-05, "loss": 0.1218, "step": 352 }, { "epoch": 0.59, "learning_rate": 7.501775646686315e-05, "loss": 0.1177, "step": 353 }, { "epoch": 0.59, "learning_rate": 7.450339476353474e-05, "loss": 0.1244, "step": 354 }, { "epoch": 0.59, "learning_rate": 7.398975351398053e-05, "loss": 0.1132, "step": 355 }, { "epoch": 0.59, "learning_rate": 7.34768472320847e-05, "loss": 0.1257, "step": 356 }, { "epoch": 0.59, "learning_rate": 7.29646904109637e-05, "loss": 0.1247, "step": 357 }, { "epoch": 0.6, "learning_rate": 7.245329752255647e-05, "loss": 0.1196, "step": 358 }, { "epoch": 0.6, "learning_rate": 7.194268301721563e-05, "loss": 0.1139, "step": 359 }, { "epoch": 0.6, "learning_rate": 7.143286132329912e-05, "loss": 0.129, "step": 360 }, { "epoch": 0.6, "learning_rate": 7.092384684676262e-05, "loss": 0.1164, "step": 361 }, { "epoch": 0.6, "learning_rate": 7.041565397075232e-05, "loss": 0.1048, "step": 362 }, { "epoch": 0.6, "learning_rate": 6.990829705519852e-05, "loss": 0.1098, "step": 363 }, { "epoch": 0.61, "learning_rate": 6.940179043641005e-05, "loss": 0.1129, "step": 364 }, { "epoch": 0.61, "learning_rate": 6.889614842666892e-05, "loss": 0.1162, "step": 365 }, { "epoch": 0.61, "learning_rate": 6.839138531382603e-05, "loss": 0.1198, "step": 366 }, { "epoch": 0.61, "learning_rate": 6.788751536089739e-05, "loss": 0.1107, "step": 367 }, { "epoch": 0.61, "learning_rate": 6.738455280566124e-05, "loss": 0.1177, "step": 368 }, { "epoch": 0.61, "learning_rate": 6.68825118602555e-05, "loss": 0.1189, "step": 369 }, { "epoch": 0.62, "learning_rate": 6.638140671077633e-05, "loss": 0.1105, "step": 370 }, { "epoch": 0.62, "learning_rate": 6.58812515168773e-05, "loss": 0.1156, "step": 371 }, { "epoch": 0.62, "learning_rate": 6.538206041136915e-05, "loss": 0.1306, "step": 372 }, { "epoch": 0.62, "learning_rate": 6.488384749982053e-05, "loss": 0.1218, "step": 373 }, { "epoch": 0.62, "learning_rate": 6.438662686015947e-05, "loss": 0.1162, "step": 374 }, { "epoch": 0.62, "learning_rate": 6.389041254227547e-05, "loss": 0.0903, "step": 375 }, { "epoch": 0.63, "learning_rate": 6.339521856762254e-05, "loss": 0.106, "step": 376 }, { "epoch": 0.63, "learning_rate": 6.290105892882303e-05, "loss": 0.1147, "step": 377 }, { "epoch": 0.63, "learning_rate": 6.240794758927221e-05, "loss": 0.1274, "step": 378 }, { "epoch": 0.63, "learning_rate": 6.191589848274368e-05, "loss": 0.1024, "step": 379 }, { "epoch": 0.63, "learning_rate": 6.142492551299576e-05, "loss": 0.1056, "step": 380 }, { "epoch": 0.63, "learning_rate": 6.093504255337844e-05, "loss": 0.1185, "step": 381 }, { "epoch": 0.64, "learning_rate": 6.044626344644151e-05, "loss": 0.1125, "step": 382 }, { "epoch": 0.64, "learning_rate": 5.995860200354335e-05, "loss": 0.118, "step": 383 }, { "epoch": 0.64, "learning_rate": 5.9472072004460665e-05, "loss": 0.1193, "step": 384 }, { "epoch": 0.64, "learning_rate": 5.8986687196999135e-05, "loss": 0.1221, "step": 385 }, { "epoch": 0.64, "learning_rate": 5.8502461296604935e-05, "loss": 0.1215, "step": 386 }, { "epoch": 0.64, "learning_rate": 5.801940798597716e-05, "loss": 0.1191, "step": 387 }, { "epoch": 0.65, "learning_rate": 5.753754091468115e-05, "loss": 0.1258, "step": 388 }, { "epoch": 0.65, "learning_rate": 5.7056873698763034e-05, "loss": 0.1149, "step": 389 }, { "epoch": 0.65, "learning_rate": 5.6577419920364625e-05, "loss": 0.1275, "step": 390 }, { "epoch": 0.65, "learning_rate": 5.6099193127339864e-05, "loss": 0.1203, "step": 391 }, { "epoch": 0.65, "learning_rate": 5.562220683287205e-05, "loss": 0.1165, "step": 392 }, { "epoch": 0.65, "learning_rate": 5.5146474515091754e-05, "loss": 0.1248, "step": 393 }, { "epoch": 0.66, "learning_rate": 5.467200961669619e-05, "loss": 0.0989, "step": 394 }, { "epoch": 0.66, "learning_rate": 5.4198825544569234e-05, "loss": 0.1127, "step": 395 }, { "epoch": 0.66, "learning_rate": 5.372693566940277e-05, "loss": 0.1248, "step": 396 }, { "epoch": 0.66, "learning_rate": 5.325635332531864e-05, "loss": 0.1094, "step": 397 }, { "epoch": 0.66, "learning_rate": 5.278709180949195e-05, "loss": 0.1194, "step": 398 }, { "epoch": 0.66, "learning_rate": 5.2319164381775524e-05, "loss": 0.1178, "step": 399 }, { "epoch": 0.67, "learning_rate": 5.1852584264324866e-05, "loss": 0.123, "step": 400 }, { "epoch": 0.67, "learning_rate": 5.138736464122484e-05, "loss": 0.1102, "step": 401 }, { "epoch": 0.67, "learning_rate": 5.092351865811698e-05, "loss": 0.1215, "step": 402 }, { "epoch": 0.67, "learning_rate": 5.046105942182815e-05, "loss": 0.115, "step": 403 }, { "epoch": 0.67, "learning_rate": 5.000000000000002e-05, "loss": 0.1181, "step": 404 }, { "epoch": 0.67, "learning_rate": 4.9540353420719946e-05, "loss": 0.096, "step": 405 }, { "epoch": 0.68, "learning_rate": 4.908213267215287e-05, "loss": 0.1176, "step": 406 }, { "epoch": 0.68, "learning_rate": 4.8625350702174166e-05, "loss": 0.1229, "step": 407 }, { "epoch": 0.68, "learning_rate": 4.817002041800388e-05, "loss": 0.1177, "step": 408 }, { "epoch": 0.68, "learning_rate": 4.7716154685841944e-05, "loss": 0.1124, "step": 409 }, { "epoch": 0.68, "learning_rate": 4.726376633050479e-05, "loss": 0.1325, "step": 410 }, { "epoch": 0.68, "learning_rate": 4.68128681350627e-05, "loss": 0.1072, "step": 411 }, { "epoch": 0.69, "learning_rate": 4.636347284047877e-05, "loss": 0.1168, "step": 412 }, { "epoch": 0.69, "learning_rate": 4.5915593145248924e-05, "loss": 0.105, "step": 413 }, { "epoch": 0.69, "learning_rate": 4.546924170504292e-05, "loss": 0.1121, "step": 414 }, { "epoch": 0.69, "learning_rate": 4.502443113234688e-05, "loss": 0.1075, "step": 415 }, { "epoch": 0.69, "learning_rate": 4.4581173996106815e-05, "loss": 0.1266, "step": 416 }, { "epoch": 0.69, "learning_rate": 4.413948282137367e-05, "loss": 0.1161, "step": 417 }, { "epoch": 0.7, "learning_rate": 4.3699370088949066e-05, "loss": 0.114, "step": 418 }, { "epoch": 0.7, "learning_rate": 4.326084823503287e-05, "loss": 0.1145, "step": 419 }, { "epoch": 0.7, "learning_rate": 4.282392965087182e-05, "loss": 0.1174, "step": 420 }, { "epoch": 0.7, "learning_rate": 4.2388626682409194e-05, "loss": 0.1368, "step": 421 }, { "epoch": 0.7, "learning_rate": 4.1954951629936065e-05, "loss": 0.1133, "step": 422 }, { "epoch": 0.7, "learning_rate": 4.152291674774383e-05, "loss": 0.1073, "step": 423 }, { "epoch": 0.71, "learning_rate": 4.109253424377772e-05, "loss": 0.0974, "step": 424 }, { "epoch": 0.71, "learning_rate": 4.0663816279292024e-05, "loss": 0.1427, "step": 425 }, { "epoch": 0.71, "learning_rate": 4.02367749685063e-05, "loss": 0.1349, "step": 426 }, { "epoch": 0.71, "learning_rate": 3.981142237826332e-05, "loss": 0.118, "step": 427 }, { "epoch": 0.71, "learning_rate": 3.93877705276878e-05, "loss": 0.1023, "step": 428 }, { "epoch": 0.71, "learning_rate": 3.896583138784688e-05, "loss": 0.0998, "step": 429 }, { "epoch": 0.72, "learning_rate": 3.854561688141205e-05, "loss": 0.112, "step": 430 }, { "epoch": 0.72, "learning_rate": 3.812713888232193e-05, "loss": 0.12, "step": 431 }, { "epoch": 0.72, "learning_rate": 3.7710409215446986e-05, "loss": 0.1135, "step": 432 }, { "epoch": 0.72, "learning_rate": 3.729543965625526e-05, "loss": 0.1196, "step": 433 }, { "epoch": 0.72, "learning_rate": 3.6882241930479824e-05, "loss": 0.1124, "step": 434 }, { "epoch": 0.72, "learning_rate": 3.6470827713787194e-05, "loss": 0.1165, "step": 435 }, { "epoch": 0.73, "learning_rate": 3.606120863144753e-05, "loss": 0.1031, "step": 436 }, { "epoch": 0.73, "learning_rate": 3.5653396258006265e-05, "loss": 0.1115, "step": 437 }, { "epoch": 0.73, "learning_rate": 3.524740211695683e-05, "loss": 0.1261, "step": 438 }, { "epoch": 0.73, "learning_rate": 3.4843237680415156e-05, "loss": 0.1213, "step": 439 }, { "epoch": 0.73, "learning_rate": 3.444091436879545e-05, "loss": 0.1236, "step": 440 }, { "epoch": 0.73, "learning_rate": 3.4040443550487645e-05, "loss": 0.1157, "step": 441 }, { "epoch": 0.74, "learning_rate": 3.364183654153592e-05, "loss": 0.1072, "step": 442 }, { "epoch": 0.74, "learning_rate": 3.32451046053191e-05, "loss": 0.0938, "step": 443 }, { "epoch": 0.74, "learning_rate": 3.285025895223244e-05, "loss": 0.1236, "step": 444 }, { "epoch": 0.74, "learning_rate": 3.245731073937068e-05, "loss": 0.1369, "step": 445 }, { "epoch": 0.74, "learning_rate": 3.2066271070212874e-05, "loss": 0.1168, "step": 446 }, { "epoch": 0.74, "learning_rate": 3.167715099430873e-05, "loss": 0.0973, "step": 447 }, { "epoch": 0.75, "learning_rate": 3.1289961506966214e-05, "loss": 0.0985, "step": 448 }, { "epoch": 0.75, "learning_rate": 3.0904713548940936e-05, "loss": 0.1168, "step": 449 }, { "epoch": 0.75, "learning_rate": 3.052141800612709e-05, "loss": 0.1115, "step": 450 }, { "epoch": 0.75, "learning_rate": 3.0140085709249667e-05, "loss": 0.1155, "step": 451 }, { "epoch": 0.75, "learning_rate": 2.9760727433558522e-05, "loss": 0.1223, "step": 452 }, { "epoch": 0.75, "learning_rate": 2.938335389852397e-05, "loss": 0.1252, "step": 453 }, { "epoch": 0.76, "learning_rate": 2.9007975767533714e-05, "loss": 0.1272, "step": 454 }, { "epoch": 0.76, "learning_rate": 2.863460364759163e-05, "loss": 0.114, "step": 455 }, { "epoch": 0.76, "learning_rate": 2.8263248089018113e-05, "loss": 0.0958, "step": 456 }, { "epoch": 0.76, "learning_rate": 2.789391958515183e-05, "loss": 0.1294, "step": 457 }, { "epoch": 0.76, "learning_rate": 2.7526628572053227e-05, "loss": 0.1126, "step": 458 }, { "epoch": 0.76, "learning_rate": 2.7161385428209774e-05, "loss": 0.11, "step": 459 }, { "epoch": 0.77, "learning_rate": 2.679820047424253e-05, "loss": 0.1189, "step": 460 }, { "epoch": 0.77, "learning_rate": 2.6437083972614572e-05, "loss": 0.1115, "step": 461 }, { "epoch": 0.77, "learning_rate": 2.6078046127341137e-05, "loss": 0.1205, "step": 462 }, { "epoch": 0.77, "learning_rate": 2.5721097083701084e-05, "loss": 0.096, "step": 463 }, { "epoch": 0.77, "learning_rate": 2.5366246927950286e-05, "loss": 0.1064, "step": 464 }, { "epoch": 0.77, "learning_rate": 2.5013505687036786e-05, "loss": 0.1279, "step": 465 }, { "epoch": 0.78, "learning_rate": 2.4662883328317222e-05, "loss": 0.1124, "step": 466 }, { "epoch": 0.78, "learning_rate": 2.4314389759275335e-05, "loss": 0.1052, "step": 467 }, { "epoch": 0.78, "learning_rate": 2.3968034827241925e-05, "loss": 0.1103, "step": 468 }, { "epoch": 0.78, "learning_rate": 2.3623828319116748e-05, "loss": 0.1075, "step": 469 }, { "epoch": 0.78, "learning_rate": 2.3281779961091775e-05, "loss": 0.0996, "step": 470 }, { "epoch": 0.78, "learning_rate": 2.2941899418376466e-05, "loss": 0.0935, "step": 471 }, { "epoch": 0.79, "learning_rate": 2.2604196294924694e-05, "loss": 0.1096, "step": 472 }, { "epoch": 0.79, "learning_rate": 2.2268680133163277e-05, "loss": 0.1101, "step": 473 }, { "epoch": 0.79, "learning_rate": 2.1935360413722395e-05, "loss": 0.1223, "step": 474 }, { "epoch": 0.79, "learning_rate": 2.1604246555167638e-05, "loss": 0.1165, "step": 475 }, { "epoch": 0.79, "learning_rate": 2.1275347913734022e-05, "loss": 0.1182, "step": 476 }, { "epoch": 0.79, "learning_rate": 2.0948673783061422e-05, "loss": 0.1039, "step": 477 }, { "epoch": 0.8, "learning_rate": 2.0624233393932024e-05, "loss": 0.1117, "step": 478 }, { "epoch": 0.8, "learning_rate": 2.03020359140096e-05, "loss": 0.1178, "step": 479 }, { "epoch": 0.8, "learning_rate": 1.9982090447580303e-05, "loss": 0.1253, "step": 480 }, { "epoch": 0.8, "learning_rate": 1.966440603529549e-05, "loss": 0.1079, "step": 481 }, { "epoch": 0.8, "learning_rate": 1.9348991653916228e-05, "loss": 0.101, "step": 482 }, { "epoch": 0.8, "learning_rate": 1.9035856216059722e-05, "loss": 0.1012, "step": 483 }, { "epoch": 0.81, "learning_rate": 1.8725008569947365e-05, "loss": 0.121, "step": 484 }, { "epoch": 0.81, "learning_rate": 1.8416457499154728e-05, "loss": 0.1232, "step": 485 }, { "epoch": 0.81, "learning_rate": 1.811021172236348e-05, "loss": 0.1058, "step": 486 }, { "epoch": 0.81, "learning_rate": 1.7806279893114875e-05, "loss": 0.1078, "step": 487 }, { "epoch": 0.81, "learning_rate": 1.750467059956531e-05, "loss": 0.1188, "step": 488 }, { "epoch": 0.81, "learning_rate": 1.7205392364243623e-05, "loss": 0.118, "step": 489 }, { "epoch": 0.82, "learning_rate": 1.690845364381034e-05, "loss": 0.108, "step": 490 }, { "epoch": 0.82, "learning_rate": 1.6613862828818628e-05, "loss": 0.1035, "step": 491 }, { "epoch": 0.82, "learning_rate": 1.6321628243477194e-05, "loss": 0.1274, "step": 492 }, { "epoch": 0.82, "learning_rate": 1.603175814541522e-05, "loss": 0.1184, "step": 493 }, { "epoch": 0.82, "learning_rate": 1.5744260725448844e-05, "loss": 0.1038, "step": 494 }, { "epoch": 0.82, "learning_rate": 1.5459144107349787e-05, "loss": 0.1059, "step": 495 }, { "epoch": 0.83, "learning_rate": 1.5176416347615885e-05, "loss": 0.1079, "step": 496 }, { "epoch": 0.83, "learning_rate": 1.4896085435243279e-05, "loss": 0.1169, "step": 497 }, { "epoch": 0.83, "learning_rate": 1.4618159291500778e-05, "loss": 0.1092, "step": 498 }, { "epoch": 0.83, "learning_rate": 1.4342645769705977e-05, "loss": 0.1362, "step": 499 }, { "epoch": 0.83, "learning_rate": 1.406955265500346e-05, "loss": 0.1393, "step": 500 }, { "epoch": 0.83, "learning_rate": 1.3798887664144633e-05, "loss": 0.1252, "step": 501 }, { "epoch": 0.84, "learning_rate": 1.3530658445269783e-05, "loss": 0.1019, "step": 502 }, { "epoch": 0.84, "learning_rate": 1.3264872577692022e-05, "loss": 0.1204, "step": 503 }, { "epoch": 0.84, "learning_rate": 1.3001537571682965e-05, "loss": 0.1303, "step": 504 }, { "epoch": 0.84, "learning_rate": 1.2740660868260633e-05, "loss": 0.1178, "step": 505 }, { "epoch": 0.84, "learning_rate": 1.2482249838979142e-05, "loss": 0.1066, "step": 506 }, { "epoch": 0.84, "learning_rate": 1.2226311785720468e-05, "loss": 0.1184, "step": 507 }, { "epoch": 0.85, "learning_rate": 1.1972853940488015e-05, "loss": 0.1062, "step": 508 }, { "epoch": 0.85, "learning_rate": 1.1721883465202332e-05, "loss": 0.1105, "step": 509 }, { "epoch": 0.85, "learning_rate": 1.1473407451498753e-05, "loss": 0.1232, "step": 510 }, { "epoch": 0.85, "learning_rate": 1.122743292052697e-05, "loss": 0.1225, "step": 511 }, { "epoch": 0.85, "learning_rate": 1.0983966822752623e-05, "loss": 0.1021, "step": 512 }, { "epoch": 0.85, "learning_rate": 1.0743016037760945e-05, "loss": 0.1062, "step": 513 }, { "epoch": 0.86, "learning_rate": 1.0504587374062391e-05, "loss": 0.116, "step": 514 }, { "epoch": 0.86, "learning_rate": 1.026868756890016e-05, "loss": 0.1203, "step": 515 }, { "epoch": 0.86, "learning_rate": 1.003532328805986e-05, "loss": 0.117, "step": 516 }, { "epoch": 0.86, "learning_rate": 9.804501125681243e-06, "loss": 0.1043, "step": 517 }, { "epoch": 0.86, "learning_rate": 9.57622760407173e-06, "loss": 0.1162, "step": 518 }, { "epoch": 0.86, "learning_rate": 9.350509173522193e-06, "loss": 0.1154, "step": 519 }, { "epoch": 0.87, "learning_rate": 9.127352212124662e-06, "loss": 0.1088, "step": 520 }, { "epoch": 0.87, "learning_rate": 8.90676302559219e-06, "loss": 0.1187, "step": 521 }, { "epoch": 0.87, "learning_rate": 8.688747847080514e-06, "loss": 0.1084, "step": 522 }, { "epoch": 0.87, "learning_rate": 8.473312837012026e-06, "loss": 0.1071, "step": 523 }, { "epoch": 0.87, "learning_rate": 8.260464082901732e-06, "loss": 0.1241, "step": 524 }, { "epoch": 0.87, "learning_rate": 8.050207599185134e-06, "loss": 0.1227, "step": 525 }, { "epoch": 0.88, "learning_rate": 7.842549327048365e-06, "loss": 0.1189, "step": 526 }, { "epoch": 0.88, "learning_rate": 7.637495134260242e-06, "loss": 0.1025, "step": 527 }, { "epoch": 0.88, "learning_rate": 7.435050815006561e-06, "loss": 0.1024, "step": 528 }, { "epoch": 0.88, "learning_rate": 7.235222089726279e-06, "loss": 0.0958, "step": 529 }, { "epoch": 0.88, "learning_rate": 7.038014604949883e-06, "loss": 0.1155, "step": 530 }, { "epoch": 0.88, "learning_rate": 6.843433933139909e-06, "loss": 0.1107, "step": 531 }, { "epoch": 0.89, "learning_rate": 6.651485572533378e-06, "loss": 0.1109, "step": 532 }, { "epoch": 0.89, "learning_rate": 6.46217494698651e-06, "loss": 0.1211, "step": 533 }, { "epoch": 0.89, "learning_rate": 6.275507405821435e-06, "loss": 0.121, "step": 534 }, { "epoch": 0.89, "learning_rate": 6.091488223675057e-06, "loss": 0.114, "step": 535 }, { "epoch": 0.89, "learning_rate": 5.910122600349965e-06, "loss": 0.1285, "step": 536 }, { "epoch": 0.89, "learning_rate": 5.7314156606675496e-06, "loss": 0.0938, "step": 537 }, { "epoch": 0.9, "learning_rate": 5.5553724543231825e-06, "loss": 0.1107, "step": 538 }, { "epoch": 0.9, "learning_rate": 5.381997955743501e-06, "loss": 0.1255, "step": 539 }, { "epoch": 0.9, "learning_rate": 5.2112970639458745e-06, "loss": 0.1278, "step": 540 }, { "epoch": 0.9, "learning_rate": 5.043274602399939e-06, "loss": 0.1134, "step": 541 }, { "epoch": 0.9, "learning_rate": 4.87793531889138e-06, "loss": 0.1072, "step": 542 }, { "epoch": 0.9, "learning_rate": 4.715283885387678e-06, "loss": 0.1115, "step": 543 }, { "epoch": 0.91, "learning_rate": 4.555324897906132e-06, "loss": 0.1012, "step": 544 }, { "epoch": 0.91, "learning_rate": 4.398062876384046e-06, "loss": 0.1145, "step": 545 }, { "epoch": 0.91, "learning_rate": 4.2435022645509025e-06, "loss": 0.1079, "step": 546 }, { "epoch": 0.91, "learning_rate": 4.091647429802869e-06, "loss": 0.1133, "step": 547 }, { "epoch": 0.91, "learning_rate": 3.942502663079395e-06, "loss": 0.1078, "step": 548 }, { "epoch": 0.91, "learning_rate": 3.796072178741916e-06, "loss": 0.1113, "step": 549 }, { "epoch": 0.92, "learning_rate": 3.6523601144548003e-06, "loss": 3.1026, "step": 550 }, { "epoch": 0.92, "learning_rate": 3.5113705310684363e-06, "loss": 0.1238, "step": 551 }, { "epoch": 0.92, "learning_rate": 3.3731074125044726e-06, "loss": 0.1266, "step": 552 }, { "epoch": 0.92, "learning_rate": 3.2375746656432284e-06, "loss": 0.1344, "step": 553 }, { "epoch": 0.92, "learning_rate": 3.1047761202133597e-06, "loss": 0.1211, "step": 554 }, { "epoch": 0.92, "learning_rate": 2.974715528683547e-06, "loss": 0.1223, "step": 555 }, { "epoch": 0.93, "learning_rate": 2.8473965661565347e-06, "loss": 0.1148, "step": 556 }, { "epoch": 0.93, "learning_rate": 2.7228228302653034e-06, "loss": 0.1238, "step": 557 }, { "epoch": 0.93, "learning_rate": 2.600997841071329e-06, "loss": 0.0977, "step": 558 }, { "epoch": 0.93, "learning_rate": 2.4819250409651607e-06, "loss": 0.1203, "step": 559 }, { "epoch": 0.93, "learning_rate": 2.3656077945691803e-06, "loss": 0.1029, "step": 560 }, { "epoch": 0.93, "learning_rate": 2.2520493886424743e-06, "loss": 0.1305, "step": 561 }, { "epoch": 0.94, "learning_rate": 2.1412530319879887e-06, "loss": 0.1208, "step": 562 }, { "epoch": 0.94, "learning_rate": 2.0332218553618885e-06, "loss": 0.1234, "step": 563 }, { "epoch": 0.94, "learning_rate": 1.9279589113850084e-06, "loss": 0.1387, "step": 564 }, { "epoch": 0.94, "learning_rate": 1.825467174456652e-06, "loss": 0.1313, "step": 565 }, { "epoch": 0.94, "learning_rate": 1.725749540670596e-06, "loss": 0.1135, "step": 566 }, { "epoch": 0.94, "learning_rate": 1.6288088277331304e-06, "loss": 0.101, "step": 567 }, { "epoch": 0.95, "learning_rate": 1.5346477748835354e-06, "loss": 0.1131, "step": 568 }, { "epoch": 0.95, "learning_rate": 1.4432690428166528e-06, "loss": 0.1117, "step": 569 }, { "epoch": 0.95, "learning_rate": 1.3546752136076923e-06, "loss": 0.1107, "step": 570 }, { "epoch": 0.95, "learning_rate": 1.268868790639277e-06, "loss": 0.115, "step": 571 }, { "epoch": 0.95, "learning_rate": 1.1858521985307125e-06, "loss": 0.1163, "step": 572 }, { "epoch": 0.95, "learning_rate": 1.105627783069485e-06, "loss": 0.1131, "step": 573 }, { "epoch": 0.96, "learning_rate": 1.0281978111449375e-06, "loss": 0.1085, "step": 574 }, { "epoch": 0.96, "learning_rate": 9.535644706842317e-07, "loss": 0.1257, "step": 575 }, { "epoch": 0.96, "learning_rate": 8.817298705905641e-07, "loss": 0.1188, "step": 576 }, { "epoch": 0.96, "learning_rate": 8.126960406835249e-07, "loss": 0.1101, "step": 577 }, { "epoch": 0.96, "learning_rate": 7.464649316417438e-07, "loss": 0.1075, "step": 578 }, { "epoch": 0.96, "learning_rate": 6.830384149478008e-07, "loss": 0.0966, "step": 579 }, { "epoch": 0.97, "learning_rate": 6.224182828353242e-07, "loss": 0.1244, "step": 580 }, { "epoch": 0.97, "learning_rate": 5.64606248238364e-07, "loss": 0.1137, "step": 581 }, { "epoch": 0.97, "learning_rate": 5.096039447429534e-07, "loss": 0.1087, "step": 582 }, { "epoch": 0.97, "learning_rate": 4.57412926541001e-07, "loss": 0.1022, "step": 583 }, { "epoch": 0.97, "learning_rate": 4.0803466838631455e-07, "loss": 0.1104, "step": 584 }, { "epoch": 0.97, "learning_rate": 3.614705655529682e-07, "loss": 0.1169, "step": 585 }, { "epoch": 0.98, "learning_rate": 3.177219337958892e-07, "loss": 0.1176, "step": 586 }, { "epoch": 0.98, "learning_rate": 2.767900093136544e-07, "loss": 0.1273, "step": 587 }, { "epoch": 0.98, "learning_rate": 2.3867594871352926e-07, "loss": 0.1032, "step": 588 }, { "epoch": 0.98, "learning_rate": 2.0338082897886079e-07, "loss": 0.1342, "step": 589 }, { "epoch": 0.98, "learning_rate": 1.709056474385795e-07, "loss": 0.1124, "step": 590 }, { "epoch": 0.98, "learning_rate": 1.412513217390554e-07, "loss": 0.1231, "step": 591 }, { "epoch": 0.99, "learning_rate": 1.1441868981815207e-07, "loss": 0.1221, "step": 592 }, { "epoch": 0.99, "learning_rate": 9.040850988153438e-08, "loss": 0.1088, "step": 593 }, { "epoch": 0.99, "learning_rate": 6.922146038129684e-08, "loss": 0.1155, "step": 594 }, { "epoch": 0.99, "learning_rate": 5.08581399967345e-08, "loss": 0.1103, "step": 595 }, { "epoch": 0.99, "learning_rate": 3.53190676174453e-08, "loss": 0.1189, "step": 596 }, { "epoch": 0.99, "learning_rate": 2.260468232869739e-08, "loss": 0.1202, "step": 597 }, { "epoch": 1.0, "learning_rate": 1.2715343398972402e-08, "loss": 0.1187, "step": 598 }, { "epoch": 1.0, "learning_rate": 5.651330269840216e-09, "loss": 0.0985, "step": 599 }, { "epoch": 1.0, "learning_rate": 1.4128425480763874e-09, "loss": 0.1111, "step": 600 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 0.1056, "step": 601 } ], "logging_steps": 1, "max_steps": 601, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 188987860844544.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }