diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,7 +10,7 @@ "log_history": [ { "epoch": 0.0, - "learning_rate": 1.36986301369863e-07, + "learning_rate": 1.36986301369863e-08, "logits/chosen": -2.8295512199401855, "logits/rejected": -2.9639337062835693, "logps/chosen": -242.64569091796875, @@ -24,2079 +24,2079 @@ }, { "epoch": 0.02, - "learning_rate": 1.3698630136986302e-06, - "logits/chosen": -2.777900218963623, - "logits/rejected": -2.813075065612793, - "logps/chosen": -292.14288330078125, - "logps/rejected": -78.98306274414062, - "loss": 0.69, - "rewards/accuracies": 0.5069444179534912, - "rewards/chosen": 0.0018207718385383487, - "rewards/margins": 0.010101978667080402, - "rewards/rejected": -0.008281207643449306, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -2.778250217437744, + "logits/rejected": -2.813397169113159, + "logps/chosen": -292.17218017578125, + "logps/rejected": -78.88499450683594, + "loss": 0.6942, + "rewards/accuracies": 0.4097222089767456, + "rewards/chosen": -0.001105638686567545, + "rewards/margins": -0.0026314095593988895, + "rewards/rejected": 0.0015257701743394136, "step": 10 }, { "epoch": 0.04, - "learning_rate": 2.7397260273972604e-06, - "logits/chosen": -2.7663190364837646, - "logits/rejected": -2.758164167404175, - "logps/chosen": -299.78204345703125, - "logps/rejected": -81.72047424316406, - "loss": 0.6536, - "rewards/accuracies": 0.84375, - "rewards/chosen": 0.027563810348510742, - "rewards/margins": 0.08724220097064972, - "rewards/rejected": -0.05967838317155838, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -2.76747465133667, + "logits/rejected": -2.7585418224334717, + "logps/chosen": -300.1101989746094, + "logps/rejected": -81.14244079589844, + "loss": 0.6932, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.005251293070614338, + "rewards/margins": -0.003377618733793497, + "rewards/rejected": -0.001873674220405519, "step": 20 }, { "epoch": 0.06, - "learning_rate": 4.109589041095891e-06, - "logits/chosen": -2.770326852798462, - "logits/rejected": -2.78336763381958, - "logps/chosen": -289.27728271484375, - "logps/rejected": -81.4061508178711, - "loss": 0.5577, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.07835596799850464, - "rewards/margins": 0.328571081161499, - "rewards/rejected": -0.2502151131629944, + "learning_rate": 4.10958904109589e-07, + "logits/chosen": -2.776379108428955, + "logits/rejected": -2.7856156826019287, + "logps/chosen": -290.00897216796875, + "logps/rejected": -78.96559143066406, + "loss": 0.6885, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.005190015770494938, + "rewards/margins": 0.011348642408847809, + "rewards/rejected": -0.006158626172691584, "step": 30 }, { "epoch": 0.08, - "learning_rate": 5.479452054794521e-06, - "logits/chosen": -2.7817535400390625, - "logits/rejected": -2.8053927421569824, - "logps/chosen": -244.7366180419922, - "logps/rejected": -90.08441162109375, - "loss": 0.4426, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.07394310086965561, - "rewards/margins": 0.6627634763717651, - "rewards/rejected": -0.5888203382492065, + "learning_rate": 5.47945205479452e-07, + "logits/chosen": -2.7960293292999268, + "logits/rejected": -2.814054250717163, + "logps/chosen": -245.3889617919922, + "logps/rejected": -84.38166046142578, + "loss": 0.6823, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.008710930123925209, + "rewards/margins": 0.027255941182374954, + "rewards/rejected": -0.018545005470514297, "step": 40 }, { "epoch": 0.1, - "learning_rate": 6.849315068493151e-06, - "logits/chosen": -2.7496659755706787, - "logits/rejected": -2.807068109512329, - "logps/chosen": -251.8906707763672, - "logps/rejected": -90.27186584472656, - "loss": 0.3608, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.04582538083195686, - "rewards/margins": 1.008236289024353, - "rewards/rejected": -0.9624108076095581, + "learning_rate": 6.84931506849315e-07, + "logits/chosen": -2.7724318504333496, + "logits/rejected": -2.8249223232269287, + "logps/chosen": -252.29598999023438, + "logps/rejected": -80.92951965332031, + "loss": 0.6751, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.00529166916385293, + "rewards/margins": 0.03346817195415497, + "rewards/rejected": -0.028176506981253624, "step": 50 }, { "epoch": 0.12, - "learning_rate": 8.219178082191782e-06, - "logits/chosen": -2.813101291656494, - "logits/rejected": -2.7498257160186768, - "logps/chosen": -280.83966064453125, - "logps/rejected": -92.8672103881836, - "loss": 0.3055, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.10900793224573135, - "rewards/margins": 1.581032633781433, - "rewards/rejected": -1.4720247983932495, + "learning_rate": 8.21917808219178e-07, + "logits/chosen": -2.8436636924743652, + "logits/rejected": -2.7858223915100098, + "logps/chosen": -281.6982421875, + "logps/rejected": -78.57672119140625, + "loss": 0.6656, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": 0.0231462512165308, + "rewards/margins": 0.06612209975719452, + "rewards/rejected": -0.04297585040330887, "step": 60 }, { "epoch": 0.14, - "learning_rate": 9.589041095890411e-06, - "logits/chosen": -2.7718911170959473, - "logits/rejected": -2.7741990089416504, - "logps/chosen": -264.4299621582031, - "logps/rejected": -92.99862670898438, - "loss": 0.2573, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.06832887977361679, - "rewards/margins": 1.9427387714385986, - "rewards/rejected": -1.8744099140167236, + "learning_rate": 9.58904109589041e-07, + "logits/chosen": -2.8172953128814697, + "logits/rejected": -2.813781261444092, + "logps/chosen": -264.85235595703125, + "logps/rejected": -74.86351013183594, + "loss": 0.6541, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.026092741638422012, + "rewards/margins": 0.0869915708899498, + "rewards/rejected": -0.060898829251527786, "step": 70 }, { "epoch": 0.16, - "learning_rate": 1.0958904109589042e-05, - "logits/chosen": -2.751570701599121, - "logits/rejected": -2.764747142791748, - "logps/chosen": -257.4574279785156, - "logps/rejected": -94.42860412597656, - "loss": 0.2239, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.14372114837169647, - "rewards/margins": 2.0751771926879883, - "rewards/rejected": -2.218898296356201, + "learning_rate": 1.095890410958904e-06, + "logits/chosen": -2.8075528144836426, + "logits/rejected": -2.8162968158721924, + "logps/chosen": -255.69869995117188, + "logps/rejected": -73.08162689208984, + "loss": 0.6355, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0321514829993248, + "rewards/margins": 0.11635198444128036, + "rewards/rejected": -0.08420050889253616, "step": 80 }, { "epoch": 0.19, - "learning_rate": 1.2328767123287673e-05, - "logits/chosen": -2.7785003185272217, - "logits/rejected": -2.7658634185791016, - "logps/chosen": -271.41583251953125, - "logps/rejected": -113.24674987792969, - "loss": 0.1911, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.36074167490005493, - "rewards/margins": 2.577946186065674, - "rewards/rejected": -2.938687801361084, + "learning_rate": 1.232876712328767e-06, + "logits/chosen": -2.841773748397827, + "logits/rejected": -2.825883388519287, + "logps/chosen": -267.5022888183594, + "logps/rejected": -85.21376037597656, + "loss": 0.6119, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.030611341819167137, + "rewards/margins": 0.16599974036216736, + "rewards/rejected": -0.13538840413093567, "step": 90 }, { "epoch": 0.21, - "learning_rate": 1.3698630136986302e-05, - "logits/chosen": -2.7907283306121826, - "logits/rejected": -2.7734055519104004, - "logps/chosen": -291.3459167480469, - "logps/rejected": -113.92546081542969, - "loss": 0.1648, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.3865249752998352, - "rewards/margins": 2.9936206340789795, - "rewards/rejected": -3.38014554977417, + "learning_rate": 1.36986301369863e-06, + "logits/chosen": -2.852410316467285, + "logits/rejected": -2.8519997596740723, + "logps/chosen": -286.9059143066406, + "logps/rejected": -82.15430450439453, + "loss": 0.5808, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.057482797652482986, + "rewards/margins": 0.26051220297813416, + "rewards/rejected": -0.20302939414978027, "step": 100 }, { "epoch": 0.23, - "learning_rate": 1.5068493150684933e-05, - "logits/chosen": -2.6879987716674805, - "logits/rejected": -2.6891028881073, - "logps/chosen": -285.2283935546875, - "logps/rejected": -117.8741683959961, - "loss": 0.1551, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.7047951817512512, - "rewards/margins": 2.912633180618286, - "rewards/rejected": -3.6174283027648926, + "learning_rate": 1.5068493150684932e-06, + "logits/chosen": -2.7674930095672607, + "logits/rejected": -2.7689127922058105, + "logps/chosen": -277.5893249511719, + "logps/rejected": -84.25486755371094, + "loss": 0.5528, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.05910978466272354, + "rewards/margins": 0.3146067261695862, + "rewards/rejected": -0.25549691915512085, "step": 110 }, { "epoch": 0.25, - "learning_rate": 1.6438356164383563e-05, - "logits/chosen": -2.666584014892578, - "logits/rejected": -2.7262470722198486, - "logps/chosen": -284.8210754394531, - "logps/rejected": -121.16352844238281, - "loss": 0.1394, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.631528913974762, - "rewards/margins": 3.5603854656219482, - "rewards/rejected": -4.1919145584106445, + "learning_rate": 1.643835616438356e-06, + "logits/chosen": -2.7497572898864746, + "logits/rejected": -2.808637857437134, + "logps/chosen": -277.6904602050781, + "logps/rejected": -82.65525817871094, + "loss": 0.5245, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.08153042942285538, + "rewards/margins": 0.4226166605949402, + "rewards/rejected": -0.3410862386226654, "step": 120 }, { "epoch": 0.27, - "learning_rate": 1.7808219178082194e-05, - "logits/chosen": -2.6699535846710205, - "logits/rejected": -2.6761491298675537, - "logps/chosen": -292.54754638671875, - "logps/rejected": -131.53846740722656, - "loss": 0.1216, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.6828351020812988, - "rewards/margins": 3.826262950897217, - "rewards/rejected": -4.509098052978516, + "learning_rate": 1.780821917808219e-06, + "logits/chosen": -2.765390634536743, + "logits/rejected": -2.7765676975250244, + "logps/chosen": -284.7118835449219, + "logps/rejected": -90.37391662597656, + "loss": 0.5057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1007312536239624, + "rewards/margins": 0.49337419867515564, + "rewards/rejected": -0.39264291524887085, "step": 130 }, { "epoch": 0.29, - "learning_rate": 1.9178082191780822e-05, - "logits/chosen": -2.7449731826782227, - "logits/rejected": -2.744097948074341, - "logps/chosen": -302.683837890625, - "logps/rejected": -129.56527709960938, - "loss": 0.116, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.8527084589004517, - "rewards/margins": 3.741666078567505, - "rewards/rejected": -4.594374179840088, + "learning_rate": 1.917808219178082e-06, + "logits/chosen": -2.8435187339782715, + "logits/rejected": -2.848928213119507, + "logps/chosen": -293.25323486328125, + "logps/rejected": -88.73191833496094, + "loss": 0.4691, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.09035232663154602, + "rewards/margins": 0.6013891696929932, + "rewards/rejected": -0.5110368132591248, "step": 140 }, { "epoch": 0.31, - "learning_rate": 1.9938884644767e-05, - "logits/chosen": -2.6660311222076416, - "logits/rejected": -2.668618679046631, - "logps/chosen": -300.77288818359375, - "logps/rejected": -126.13216400146484, - "loss": 0.1032, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.9231691360473633, - "rewards/margins": 3.9592361450195312, - "rewards/rejected": -4.882404804229736, + "learning_rate": 1.9938884644766997e-06, + "logits/chosen": -2.788867235183716, + "logits/rejected": -2.797130584716797, + "logps/chosen": -290.54638671875, + "logps/rejected": -82.95954895019531, + "loss": 0.4398, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.09947594255208969, + "rewards/margins": 0.6646188497543335, + "rewards/rejected": -0.5651428699493408, "step": 150 }, { "epoch": 0.33, - "learning_rate": 1.9786096256684494e-05, - "logits/chosen": -2.6760621070861816, - "logits/rejected": -2.743537187576294, - "logps/chosen": -284.9643249511719, - "logps/rejected": -130.59217834472656, - "loss": 0.0995, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.2370917797088623, - "rewards/margins": 4.145981788635254, - "rewards/rejected": -5.383072853088379, + "learning_rate": 1.978609625668449e-06, + "logits/chosen": -2.809487819671631, + "logits/rejected": -2.8850138187408447, + "logps/chosen": -271.88629150390625, + "logps/rejected": -83.17223358154297, + "loss": 0.4204, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.07071445882320404, + "rewards/margins": 0.7117933034896851, + "rewards/rejected": -0.6410789489746094, "step": 160 }, { "epoch": 0.35, - "learning_rate": 1.9633307868601987e-05, - "logits/chosen": -2.602896213531494, - "logits/rejected": -2.5765960216522217, - "logps/chosen": -302.64300537109375, - "logps/rejected": -141.293212890625, - "loss": 0.0875, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.2406305074691772, - "rewards/margins": 4.966603755950928, - "rewards/rejected": -6.207234859466553, + "learning_rate": 1.9633307868601984e-06, + "logits/chosen": -2.7891199588775635, + "logits/rejected": -2.789405584335327, + "logps/chosen": -288.9054870605469, + "logps/rejected": -86.50032043457031, + "loss": 0.4053, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.13311851024627686, + "rewards/margins": 0.861064076423645, + "rewards/rejected": -0.7279455065727234, "step": 170 }, { "epoch": 0.37, - "learning_rate": 1.9480519480519483e-05, - "logits/chosen": -2.561760187149048, - "logits/rejected": -2.5683257579803467, - "logps/chosen": -293.40093994140625, - "logps/rejected": -149.14077758789062, - "loss": 0.0846, - "rewards/accuracies": 0.96875, - "rewards/chosen": -2.0210623741149902, - "rewards/margins": 4.921706676483154, - "rewards/rejected": -6.942769527435303, + "learning_rate": 1.948051948051948e-06, + "logits/chosen": -2.760305643081665, + "logits/rejected": -2.7756900787353516, + "logps/chosen": -271.7988586425781, + "logps/rejected": -88.26414489746094, + "loss": 0.3764, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.1391439437866211, + "rewards/margins": 0.9942502975463867, + "rewards/rejected": -0.8551063537597656, "step": 180 }, { "epoch": 0.39, - "learning_rate": 1.9327731092436976e-05, - "logits/chosen": -2.5644702911376953, - "logits/rejected": -2.569974422454834, - "logps/chosen": -295.0621032714844, - "logps/rejected": -151.10609436035156, - "loss": 0.0794, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.016458034515381, - "rewards/margins": 5.062263488769531, - "rewards/rejected": -7.078721046447754, + "learning_rate": 1.9327731092436974e-06, + "logits/chosen": -2.7618346214294434, + "logits/rejected": -2.7937939167022705, + "logps/chosen": -274.2486877441406, + "logps/rejected": -89.63475799560547, + "loss": 0.3633, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.06488532572984695, + "rewards/margins": 0.9964747428894043, + "rewards/rejected": -0.9315894246101379, "step": 190 }, { "epoch": 0.41, - "learning_rate": 1.9174942704354472e-05, - "logits/chosen": -2.6058998107910156, - "logits/rejected": -2.5582222938537598, - "logps/chosen": -290.7182922363281, - "logps/rejected": -156.05072021484375, - "loss": 0.0759, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -2.2009756565093994, - "rewards/margins": 5.653419017791748, - "rewards/rejected": -7.854394435882568, + "learning_rate": 1.917494270435447e-06, + "logits/chosen": -2.834862232208252, + "logits/rejected": -2.8097951412200928, + "logps/chosen": -267.5804748535156, + "logps/rejected": -88.04959869384766, + "loss": 0.3405, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.11280632019042969, + "rewards/margins": 1.167088508605957, + "rewards/rejected": -1.054282307624817, "step": 200 }, { "epoch": 0.43, - "learning_rate": 1.9022154316271965e-05, - "logits/chosen": -2.540133237838745, - "logits/rejected": -2.4774231910705566, - "logps/chosen": -295.8403015136719, - "logps/rejected": -166.9518280029297, - "loss": 0.0739, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.3658392429351807, - "rewards/margins": 5.26444149017334, - "rewards/rejected": -7.630280494689941, + "learning_rate": 1.9022154316271963e-06, + "logits/chosen": -2.789674997329712, + "logits/rejected": -2.7489635944366455, + "logps/chosen": -271.354736328125, + "logps/rejected": -101.02803039550781, + "loss": 0.3406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.08271731436252594, + "rewards/margins": 1.1206178665161133, + "rewards/rejected": -1.037900447845459, "step": 210 }, { "epoch": 0.45, - "learning_rate": 1.8869365928189458e-05, - "logits/chosen": -2.5680794715881348, - "logits/rejected": -2.503758430480957, - "logps/chosen": -281.4465637207031, - "logps/rejected": -157.9706268310547, - "loss": 0.0679, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.859816074371338, - "rewards/margins": 5.681013584136963, - "rewards/rejected": -8.5408296585083, + "learning_rate": 1.8869365928189456e-06, + "logits/chosen": -2.8190276622772217, + "logits/rejected": -2.7801876068115234, + "logps/chosen": -252.3684844970703, + "logps/rejected": -83.58897399902344, + "loss": 0.3366, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.047989681363105774, + "rewards/margins": 1.1506556272506714, + "rewards/rejected": -1.102665901184082, "step": 220 }, { "epoch": 0.47, - "learning_rate": 1.8716577540106954e-05, - "logits/chosen": -2.516791820526123, - "logits/rejected": -2.4941375255584717, - "logps/chosen": -320.51336669921875, - "logps/rejected": -158.91770935058594, - "loss": 0.0662, + "learning_rate": 1.8716577540106951e-06, + "logits/chosen": -2.7455031871795654, + "logits/rejected": -2.7612051963806152, + "logps/chosen": -296.0608825683594, + "logps/rejected": -85.1841049194336, + "loss": 0.3066, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.3171486854553223, - "rewards/margins": 6.3180742263793945, - "rewards/rejected": -8.635221481323242, + "rewards/chosen": 0.12809792160987854, + "rewards/margins": 1.3899600505828857, + "rewards/rejected": -1.2618623971939087, "step": 230 }, { "epoch": 0.49, - "learning_rate": 1.8563789152024447e-05, - "logits/chosen": -2.4835712909698486, - "logits/rejected": -2.477268934249878, - "logps/chosen": -311.7545471191406, - "logps/rejected": -168.81459045410156, - "loss": 0.0708, - "rewards/accuracies": 0.96875, - "rewards/chosen": -2.8363029956817627, - "rewards/margins": 6.439309597015381, - "rewards/rejected": -9.275611877441406, + "learning_rate": 1.8563789152024445e-06, + "logits/chosen": -2.775570869445801, + "logits/rejected": -2.819958209991455, + "logps/chosen": -282.4068908691406, + "logps/rejected": -89.96455383300781, + "loss": 0.3055, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.09846373647451401, + "rewards/margins": 1.4890722036361694, + "rewards/rejected": -1.390608549118042, "step": 240 }, { "epoch": 0.52, - "learning_rate": 1.8411000763941943e-05, - "logits/chosen": -2.5130648612976074, - "logits/rejected": -2.510387897491455, - "logps/chosen": -301.73126220703125, - "logps/rejected": -176.51712036132812, - "loss": 0.0716, - "rewards/accuracies": 0.96875, - "rewards/chosen": -3.6253724098205566, - "rewards/margins": 5.836529731750488, - "rewards/rejected": -9.461902618408203, + "learning_rate": 1.841100076394194e-06, + "logits/chosen": -2.799436092376709, + "logits/rejected": -2.8345470428466797, + "logps/chosen": -265.59735107421875, + "logps/rejected": -96.5230941772461, + "loss": 0.2986, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.011983467265963554, + "rewards/margins": 1.4505140781402588, + "rewards/rejected": -1.462497591972351, "step": 250 }, { "epoch": 0.54, - "learning_rate": 1.8258212375859436e-05, - "logits/chosen": -2.4818222522735596, - "logits/rejected": -2.4154343605041504, - "logps/chosen": -312.3952331542969, - "logps/rejected": -160.39041137695312, - "loss": 0.0714, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -2.6363685131073, - "rewards/margins": 6.308477401733398, - "rewards/rejected": -8.944845199584961, + "learning_rate": 1.8258212375859433e-06, + "logits/chosen": -2.767268419265747, + "logits/rejected": -2.7291500568389893, + "logps/chosen": -284.69512939453125, + "logps/rejected": -84.67759704589844, + "loss": 0.2975, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.13364484906196594, + "rewards/margins": 1.5072095394134521, + "rewards/rejected": -1.373564600944519, "step": 260 }, { "epoch": 0.56, - "learning_rate": 1.8105423987776932e-05, - "logits/chosen": -2.4748425483703613, - "logits/rejected": -2.4548239707946777, - "logps/chosen": -305.8868103027344, - "logps/rejected": -178.6342010498047, - "loss": 0.0662, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -2.9469079971313477, - "rewards/margins": 6.27431583404541, - "rewards/rejected": -9.221223831176758, + "learning_rate": 1.8105423987776928e-06, + "logits/chosen": -2.740736961364746, + "logits/rejected": -2.7466208934783936, + "logps/chosen": -276.2334899902344, + "logps/rejected": -102.246826171875, + "loss": 0.271, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.018421288579702377, + "rewards/margins": 1.600907325744629, + "rewards/rejected": -1.5824859142303467, "step": 270 }, { "epoch": 0.58, - "learning_rate": 1.7952635599694425e-05, - "logits/chosen": -2.5008432865142822, - "logits/rejected": -2.489522933959961, - "logps/chosen": -292.95355224609375, - "logps/rejected": -161.0408935546875, - "loss": 0.0728, + "learning_rate": 1.7952635599694424e-06, + "logits/chosen": -2.763476610183716, + "logits/rejected": -2.826669216156006, + "logps/chosen": -256.71234130859375, + "logps/rejected": -84.85465240478516, + "loss": 0.2796, "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.6459877490997314, - "rewards/margins": 5.44729471206665, - "rewards/rejected": -9.093282699584961, + "rewards/chosen": -0.021867703646421432, + "rewards/margins": 1.4527885913848877, + "rewards/rejected": -1.474656343460083, "step": 280 }, { "epoch": 0.6, - "learning_rate": 1.7799847211611917e-05, - "logits/chosen": -2.46510648727417, - "logits/rejected": -2.4538471698760986, - "logps/chosen": -309.40313720703125, - "logps/rejected": -177.50119018554688, - "loss": 0.0694, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.042060613632202, - "rewards/margins": 6.3933000564575195, - "rewards/rejected": -9.435359954833984, + "learning_rate": 1.7799847211611915e-06, + "logits/chosen": -2.778783082962036, + "logits/rejected": -2.7868287563323975, + "logps/chosen": -278.37847900390625, + "logps/rejected": -101.5394287109375, + "loss": 0.2617, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06040378659963608, + "rewards/margins": 1.8995882272720337, + "rewards/rejected": -1.8391841650009155, "step": 290 }, { "epoch": 0.62, - "learning_rate": 1.7647058823529414e-05, - "logits/chosen": -2.3893284797668457, - "logits/rejected": -2.334172487258911, - "logps/chosen": -310.794189453125, - "logps/rejected": -172.20497131347656, - "loss": 0.0712, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.50679087638855, - "rewards/margins": 5.94513463973999, - "rewards/rejected": -9.451925277709961, + "learning_rate": 1.764705882352941e-06, + "logits/chosen": -2.727238893508911, + "logits/rejected": -2.723588466644287, + "logps/chosen": -275.4689025878906, + "logps/rejected": -93.48865509033203, + "loss": 0.2612, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02573985978960991, + "rewards/margins": 1.606034517288208, + "rewards/rejected": -1.5802944898605347, "step": 300 }, { "epoch": 0.64, - "learning_rate": 1.7494270435446906e-05, - "logits/chosen": -2.454113721847534, - "logits/rejected": -2.4209561347961426, - "logps/chosen": -323.4365234375, - "logps/rejected": -188.5634765625, - "loss": 0.0634, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -4.002615451812744, - "rewards/margins": 6.827607154846191, - "rewards/rejected": -10.830221176147461, + "learning_rate": 1.7494270435446906e-06, + "logits/chosen": -2.8035991191864014, + "logits/rejected": -2.8192009925842285, + "logps/chosen": -284.1933898925781, + "logps/rejected": -100.13687896728516, + "loss": 0.2521, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.07830613851547241, + "rewards/margins": 1.9092552661895752, + "rewards/rejected": -1.9875609874725342, "step": 310 }, { "epoch": 0.66, - "learning_rate": 1.7341482047364403e-05, - "logits/chosen": -2.469594717025757, - "logits/rejected": -2.4255645275115967, - "logps/chosen": -308.19000244140625, - "logps/rejected": -187.37420654296875, - "loss": 0.059, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.496091365814209, - "rewards/margins": 6.612002372741699, - "rewards/rejected": -10.108095169067383, + "learning_rate": 1.7341482047364399e-06, + "logits/chosen": -2.8154501914978027, + "logits/rejected": -2.810351848602295, + "logps/chosen": -274.22906494140625, + "logps/rejected": -105.29779052734375, + "loss": 0.2418, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.09999797493219376, + "rewards/margins": 1.8004556894302368, + "rewards/rejected": -1.9004535675048828, "step": 320 }, { "epoch": 0.68, - "learning_rate": 1.7188693659281895e-05, - "logits/chosen": -2.459336042404175, - "logits/rejected": -2.4206173419952393, - "logps/chosen": -285.8677062988281, - "logps/rejected": -170.1533660888672, - "loss": 0.0648, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.3919098377227783, - "rewards/margins": 6.427476406097412, - "rewards/rejected": -9.819387435913086, + "learning_rate": 1.7188693659281894e-06, + "logits/chosen": -2.7782671451568604, + "logits/rejected": -2.7969181537628174, + "logps/chosen": -253.5823516845703, + "logps/rejected": -91.79356384277344, + "loss": 0.238, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16337540745735168, + "rewards/margins": 1.8200305700302124, + "rewards/rejected": -1.9834058284759521, "step": 330 }, { "epoch": 0.7, - "learning_rate": 1.703590527119939e-05, - "logits/chosen": -2.5112051963806152, - "logits/rejected": -2.4785618782043457, - "logps/chosen": -304.47003173828125, - "logps/rejected": -176.83731079101562, - "loss": 0.0632, - "rewards/accuracies": 0.96875, - "rewards/chosen": -2.8963208198547363, - "rewards/margins": 6.368965148925781, - "rewards/rejected": -9.265286445617676, + "learning_rate": 1.703590527119939e-06, + "logits/chosen": -2.7838528156280518, + "logits/rejected": -2.7845559120178223, + "logps/chosen": -276.6927795410156, + "logps/rejected": -105.1791763305664, + "loss": 0.2408, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.11859698593616486, + "rewards/margins": 1.980875015258789, + "rewards/rejected": -2.0994718074798584, "step": 340 }, { "epoch": 0.72, - "learning_rate": 1.6883116883116884e-05, - "logits/chosen": -2.5041980743408203, - "logits/rejected": -2.46270751953125, - "logps/chosen": -309.81536865234375, - "logps/rejected": -182.5047149658203, - "loss": 0.0637, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.1548614501953125, - "rewards/margins": 6.430181980133057, - "rewards/rejected": -9.585042953491211, + "learning_rate": 1.688311688311688e-06, + "logits/chosen": -2.8013651371002197, + "logits/rejected": -2.7973175048828125, + "logps/chosen": -280.02337646484375, + "logps/rejected": -108.97808837890625, + "loss": 0.2366, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17565980553627014, + "rewards/margins": 2.0567195415496826, + "rewards/rejected": -2.23237943649292, "step": 350 }, { "epoch": 0.74, - "learning_rate": 1.6730328495034377e-05, - "logits/chosen": -2.4566521644592285, - "logits/rejected": -2.449441432952881, - "logps/chosen": -324.7444152832031, - "logps/rejected": -174.9409637451172, - "loss": 0.0623, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.9833693504333496, - "rewards/margins": 6.45650577545166, - "rewards/rejected": -9.439874649047852, + "learning_rate": 1.6730328495034376e-06, + "logits/chosen": -2.7511441707611084, + "logits/rejected": -2.79335355758667, + "logps/chosen": -295.89630126953125, + "logps/rejected": -102.7479476928711, + "loss": 0.2329, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.09856131672859192, + "rewards/margins": 2.122011184692383, + "rewards/rejected": -2.2205727100372314, "step": 360 }, { "epoch": 0.76, - "learning_rate": 1.6577540106951873e-05, - "logits/chosen": -2.4364638328552246, - "logits/rejected": -2.3899149894714355, - "logps/chosen": -309.7897033691406, - "logps/rejected": -176.37049865722656, - "loss": 0.0646, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.2578327655792236, - "rewards/margins": 7.072564125061035, - "rewards/rejected": -10.33039665222168, + "learning_rate": 1.6577540106951871e-06, + "logits/chosen": -2.7538399696350098, + "logits/rejected": -2.7933411598205566, + "logps/chosen": -278.8785705566406, + "logps/rejected": -95.23030090332031, + "loss": 0.2309, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.1667167693376541, + "rewards/margins": 2.049661159515381, + "rewards/rejected": -2.2163777351379395, "step": 370 }, { "epoch": 0.78, - "learning_rate": 1.6424751718869366e-05, - "logits/chosen": -2.4285435676574707, - "logits/rejected": -2.3717868328094482, - "logps/chosen": -307.73907470703125, - "logps/rejected": -190.9008331298828, - "loss": 0.0538, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.86690092086792, - "rewards/margins": 7.263972282409668, - "rewards/rejected": -11.130870819091797, + "learning_rate": 1.6424751718869364e-06, + "logits/chosen": -2.7607922554016113, + "logits/rejected": -2.763124942779541, + "logps/chosen": -270.15484619140625, + "logps/rejected": -103.03666687011719, + "loss": 0.2252, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.10847791284322739, + "rewards/margins": 2.2359797954559326, + "rewards/rejected": -2.3444576263427734, "step": 380 }, { "epoch": 0.8, - "learning_rate": 1.6271963330786862e-05, - "logits/chosen": -2.475437879562378, - "logits/rejected": -2.4385433197021484, - "logps/chosen": -318.3887634277344, - "logps/rejected": -187.13783264160156, - "loss": 0.0524, + "learning_rate": 1.627196333078686e-06, + "logits/chosen": -2.7692110538482666, + "logits/rejected": -2.7727513313293457, + "logps/chosen": -284.1410217285156, + "logps/rejected": -103.25160217285156, + "loss": 0.2183, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.500490665435791, - "rewards/margins": 7.331036567687988, - "rewards/rejected": -10.831527709960938, + "rewards/chosen": -0.07571631669998169, + "rewards/margins": 2.3671867847442627, + "rewards/rejected": -2.4429030418395996, "step": 390 }, { "epoch": 0.82, - "learning_rate": 1.6119174942704355e-05, - "logits/chosen": -2.413388252258301, - "logits/rejected": -2.384582281112671, - "logps/chosen": -284.5127868652344, - "logps/rejected": -182.04469299316406, - "loss": 0.0547, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.9967052936553955, - "rewards/margins": 6.6313934326171875, - "rewards/rejected": -10.628097534179688, + "learning_rate": 1.6119174942704355e-06, + "logits/chosen": -2.713252544403076, + "logits/rejected": -2.695067882537842, + "logps/chosen": -246.3933563232422, + "logps/rejected": -98.11837768554688, + "loss": 0.2068, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.18476447463035583, + "rewards/margins": 2.0507025718688965, + "rewards/rejected": -2.2354671955108643, "step": 400 }, { "epoch": 0.85, - "learning_rate": 1.596638655462185e-05, - "logits/chosen": -2.4949584007263184, - "logits/rejected": -2.4177098274230957, - "logps/chosen": -306.1350402832031, - "logps/rejected": -188.74630737304688, - "loss": 0.0559, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.9294257164001465, - "rewards/margins": 7.267237663269043, - "rewards/rejected": -11.196664810180664, + "learning_rate": 1.5966386554621848e-06, + "logits/chosen": -2.788705348968506, + "logits/rejected": -2.7501461505889893, + "logps/chosen": -268.0991516113281, + "logps/rejected": -102.09769439697266, + "loss": 0.1976, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.12583544850349426, + "rewards/margins": 2.4059669971466064, + "rewards/rejected": -2.5318026542663574, "step": 410 }, { "epoch": 0.87, - "learning_rate": 1.5813598166539344e-05, - "logits/chosen": -2.4707770347595215, - "logits/rejected": -2.41619610786438, - "logps/chosen": -309.9191589355469, - "logps/rejected": -187.71530151367188, - "loss": 0.0653, + "learning_rate": 1.5813598166539341e-06, + "logits/chosen": -2.728642702102661, + "logits/rejected": -2.7183501720428467, + "logps/chosen": -277.0406188964844, + "logps/rejected": -108.7503890991211, + "loss": 0.2093, "rewards/accuracies": 0.96875, - "rewards/chosen": -3.556119441986084, - "rewards/margins": 7.09530782699585, - "rewards/rejected": -10.651426315307617, + "rewards/chosen": -0.2682625353336334, + "rewards/margins": 2.486672878265381, + "rewards/rejected": -2.7549355030059814, "step": 420 }, { "epoch": 0.89, - "learning_rate": 1.5660809778456837e-05, - "logits/chosen": -2.475254535675049, - "logits/rejected": -2.458305597305298, - "logps/chosen": -310.1326904296875, - "logps/rejected": -195.8196563720703, - "loss": 0.0569, + "learning_rate": 1.5660809778456837e-06, + "logits/chosen": -2.755733013153076, + "logits/rejected": -2.794435977935791, + "logps/chosen": -267.26983642578125, + "logps/rejected": -103.68330383300781, + "loss": 0.1918, "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.553369998931885, - "rewards/margins": 7.457148551940918, - "rewards/rejected": -12.010518074035645, + "rewards/chosen": -0.2670840322971344, + "rewards/margins": 2.529799461364746, + "rewards/rejected": -2.7968833446502686, "step": 430 }, { "epoch": 0.91, - "learning_rate": 1.5508021390374333e-05, - "logits/chosen": -2.456172227859497, - "logits/rejected": -2.4342832565307617, - "logps/chosen": -309.1450500488281, - "logps/rejected": -188.05943298339844, - "loss": 0.0547, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.021037578582764, - "rewards/margins": 7.251453399658203, - "rewards/rejected": -11.272489547729492, + "learning_rate": 1.550802139037433e-06, + "logits/chosen": -2.722480058670044, + "logits/rejected": -2.7511894702911377, + "logps/chosen": -271.193359375, + "logps/rejected": -101.18013000488281, + "loss": 0.1901, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.22587260603904724, + "rewards/margins": 2.3586864471435547, + "rewards/rejected": -2.584559202194214, "step": 440 }, { "epoch": 0.93, - "learning_rate": 1.5355233002291826e-05, - "logits/chosen": -2.4393460750579834, - "logits/rejected": -2.410473585128784, - "logps/chosen": -343.1289978027344, - "logps/rejected": -199.44427490234375, - "loss": 0.0661, - "rewards/accuracies": 0.96875, - "rewards/chosen": -4.531455039978027, - "rewards/margins": 7.504457950592041, - "rewards/rejected": -12.035911560058594, + "learning_rate": 1.5355233002291825e-06, + "logits/chosen": -2.724370241165161, + "logits/rejected": -2.754976511001587, + "logps/chosen": -300.2513122558594, + "logps/rejected": -107.4105224609375, + "loss": 0.2001, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.2436836212873459, + "rewards/margins": 2.5888543128967285, + "rewards/rejected": -2.83253812789917, "step": 450 }, { "epoch": 0.95, - "learning_rate": 1.5202444614209322e-05, - "logits/chosen": -2.4784836769104004, - "logits/rejected": -2.4135525226593018, - "logps/chosen": -338.821044921875, - "logps/rejected": -197.35238647460938, - "loss": 0.059, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -3.623051881790161, - "rewards/margins": 7.590781211853027, - "rewards/rejected": -11.213833808898926, + "learning_rate": 1.520244461420932e-06, + "logits/chosen": -2.7703804969787598, + "logits/rejected": -2.743251323699951, + "logps/chosen": -304.1031799316406, + "logps/rejected": -113.75921630859375, + "loss": 0.2065, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15126582980155945, + "rewards/margins": 2.703249931335449, + "rewards/rejected": -2.854515552520752, "step": 460 }, { "epoch": 0.97, - "learning_rate": 1.5049656226126816e-05, - "logits/chosen": -2.4022135734558105, - "logits/rejected": -2.3517110347747803, - "logps/chosen": -277.9537048339844, - "logps/rejected": -186.30238342285156, - "loss": 0.0601, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.108160018920898, - "rewards/margins": 7.290404319763184, - "rewards/rejected": -11.398564338684082, + "learning_rate": 1.5049656226126814e-06, + "logits/chosen": -2.7441253662109375, + "logits/rejected": -2.7229297161102295, + "logps/chosen": -240.0807342529297, + "logps/rejected": -100.44654846191406, + "loss": 0.2019, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.32086318731307983, + "rewards/margins": 2.4921185970306396, + "rewards/rejected": -2.8129820823669434, "step": 470 }, { "epoch": 0.99, - "learning_rate": 1.489686783804431e-05, - "logits/chosen": -2.406426429748535, - "logits/rejected": -2.3724937438964844, - "logps/chosen": -294.24114990234375, - "logps/rejected": -201.96246337890625, - "loss": 0.0636, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -4.248598575592041, - "rewards/margins": 7.405554294586182, - "rewards/rejected": -11.654152870178223, + "learning_rate": 1.4896867838044307e-06, + "logits/chosen": -2.704709529876709, + "logits/rejected": -2.7237319946289062, + "logps/chosen": -255.7570343017578, + "logps/rejected": -113.80975341796875, + "loss": 0.2024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.40018701553344727, + "rewards/margins": 2.4386959075927734, + "rewards/rejected": -2.8388831615448, "step": 480 }, { "epoch": 1.0, - "eval_logits/chosen": -2.4664840698242188, - "eval_logits/rejected": -2.395963430404663, - "eval_logps/chosen": -313.8589172363281, - "eval_logps/rejected": -244.22576904296875, - "eval_loss": 0.6041525602340698, + "eval_logits/chosen": -2.7680206298828125, + "eval_logits/rejected": -2.7271738052368164, + "eval_logps/chosen": -277.8106689453125, + "eval_logps/rejected": -191.96604919433594, + "eval_loss": 0.4197174608707428, "eval_rewards/accuracies": 0.80859375, - "eval_rewards/chosen": -4.002192974090576, - "eval_rewards/margins": 3.116774797439575, - "eval_rewards/rejected": -7.1189680099487305, - "eval_runtime": 258.6968, - "eval_samples_per_second": 7.731, + "eval_rewards/chosen": -0.3973674178123474, + "eval_rewards/margins": 1.4956284761428833, + "eval_rewards/rejected": -1.892995834350586, + "eval_runtime": 259.9183, + "eval_samples_per_second": 7.695, "eval_steps_per_second": 0.062, "step": 485 }, { "epoch": 1.01, - "learning_rate": 1.4744079449961804e-05, - "logits/chosen": -2.4473445415496826, - "logits/rejected": -2.4362194538116455, - "logps/chosen": -317.8018798828125, - "logps/rejected": -199.7606201171875, - "loss": 0.049, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.134568691253662, - "rewards/margins": 7.972777366638184, - "rewards/rejected": -12.10734748840332, + "learning_rate": 1.4744079449961802e-06, + "logits/chosen": -2.7513904571533203, + "logits/rejected": -2.8016788959503174, + "logps/chosen": -279.7330017089844, + "logps/rejected": -109.1887435913086, + "loss": 0.1934, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.32768136262893677, + "rewards/margins": 2.722477436065674, + "rewards/rejected": -3.0501585006713867, "step": 490 }, { "epoch": 1.03, - "learning_rate": 1.4591291061879298e-05, - "logits/chosen": -2.390632390975952, - "logits/rejected": -2.3262600898742676, - "logps/chosen": -316.8980712890625, - "logps/rejected": -207.3352508544922, - "loss": 0.0461, + "learning_rate": 1.4591291061879296e-06, + "logits/chosen": -2.705814838409424, + "logits/rejected": -2.6782376766204834, + "logps/chosen": -274.86480712890625, + "logps/rejected": -113.91358947753906, + "loss": 0.1797, "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -4.459061622619629, - "rewards/margins": 7.6829423904418945, - "rewards/rejected": -12.142004013061523, + "rewards/chosen": -0.2557370066642761, + "rewards/margins": 2.544098377227783, + "rewards/rejected": -2.799835205078125, "step": 500 }, { "epoch": 1.05, - "learning_rate": 1.4438502673796793e-05, - "logits/chosen": -2.405426025390625, - "logits/rejected": -2.3402185440063477, - "logps/chosen": -304.3172302246094, - "logps/rejected": -192.7342529296875, - "loss": 0.0456, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -4.710546970367432, - "rewards/margins": 7.46035099029541, - "rewards/rejected": -12.170897483825684, + "learning_rate": 1.443850267379679e-06, + "logits/chosen": -2.7348856925964355, + "logits/rejected": -2.7256247997283936, + "logps/chosen": -260.6488342285156, + "logps/rejected": -98.91087341308594, + "loss": 0.1826, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.3437056541442871, + "rewards/margins": 2.4448533058166504, + "rewards/rejected": -2.7885591983795166, "step": 510 }, { "epoch": 1.07, - "learning_rate": 1.4285714285714287e-05, - "logits/chosen": -2.456904649734497, - "logits/rejected": -2.399510622024536, - "logps/chosen": -314.65142822265625, - "logps/rejected": -207.9474334716797, - "loss": 0.0404, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.895995616912842, - "rewards/margins": 7.744017124176025, - "rewards/rejected": -11.640012741088867, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -2.799448013305664, + "logits/rejected": -2.778778076171875, + "logps/chosen": -279.9732360839844, + "logps/rejected": -124.17852783203125, + "loss": 0.1678, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4281793236732483, + "rewards/margins": 2.834944248199463, + "rewards/rejected": -3.2631232738494873, "step": 520 }, { "epoch": 1.09, - "learning_rate": 1.4132925897631782e-05, - "logits/chosen": -2.443506956100464, - "logits/rejected": -2.363826274871826, - "logps/chosen": -346.93341064453125, - "logps/rejected": -210.9061737060547, - "loss": 0.0368, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -4.365347862243652, - "rewards/margins": 8.533185005187988, - "rewards/rejected": -12.898531913757324, + "learning_rate": 1.413292589763178e-06, + "logits/chosen": -2.7671663761138916, + "logits/rejected": -2.7472527027130127, + "logps/chosen": -306.42181396484375, + "logps/rejected": -113.77569580078125, + "loss": 0.1743, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.31418782472610474, + "rewards/margins": 2.8712990283966064, + "rewards/rejected": -3.1854867935180664, "step": 530 }, { "epoch": 1.11, - "learning_rate": 1.3980137509549276e-05, - "logits/chosen": -2.422494411468506, - "logits/rejected": -2.381016969680786, - "logps/chosen": -290.3888854980469, - "logps/rejected": -198.80857849121094, - "loss": 0.0441, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.720973968505859, - "rewards/margins": 8.11204719543457, - "rewards/rejected": -12.833023071289062, + "learning_rate": 1.3980137509549275e-06, + "logits/chosen": -2.7520015239715576, + "logits/rejected": -2.772326946258545, + "logps/chosen": -247.4855499267578, + "logps/rejected": -101.20677185058594, + "loss": 0.1779, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4306362271308899, + "rewards/margins": 2.6422040462493896, + "rewards/rejected": -3.072840452194214, "step": 540 }, { "epoch": 1.13, - "learning_rate": 1.3827349121466769e-05, - "logits/chosen": -2.3721251487731934, - "logits/rejected": -2.3095550537109375, - "logps/chosen": -311.7290344238281, - "logps/rejected": -209.42178344726562, - "loss": 0.047, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -4.692753791809082, - "rewards/margins": 8.287328720092773, - "rewards/rejected": -12.980082511901855, + "learning_rate": 1.3827349121466768e-06, + "logits/chosen": -2.685715436935425, + "logits/rejected": -2.705381155014038, + "logps/chosen": -269.1335144042969, + "logps/rejected": -111.51566314697266, + "loss": 0.1742, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.4332028329372406, + "rewards/margins": 2.756269693374634, + "rewards/rejected": -3.1894726753234863, "step": 550 }, { "epoch": 1.15, - "learning_rate": 1.3674560733384263e-05, - "logits/chosen": -2.430870532989502, - "logits/rejected": -2.4249157905578613, - "logps/chosen": -336.8976135253906, - "logps/rejected": -211.49765014648438, - "loss": 0.0416, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -4.568190574645996, - "rewards/margins": 8.739889144897461, - "rewards/rejected": -13.308080673217773, + "learning_rate": 1.3674560733384261e-06, + "logits/chosen": -2.737140417098999, + "logits/rejected": -2.7757163047790527, + "logps/chosen": -294.99530029296875, + "logps/rejected": -112.22420501708984, + "loss": 0.163, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.3779616951942444, + "rewards/margins": 3.002772331237793, + "rewards/rejected": -3.3807339668273926, "step": 560 }, { "epoch": 1.18, - "learning_rate": 1.3521772345301758e-05, - "logits/chosen": -2.4312033653259277, - "logits/rejected": -2.3765575885772705, - "logps/chosen": -332.5270690917969, - "logps/rejected": -210.74942016601562, - "loss": 0.0425, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -4.29726505279541, - "rewards/margins": 8.961139678955078, - "rewards/rejected": -13.258404731750488, + "learning_rate": 1.3521772345301757e-06, + "logits/chosen": -2.765671730041504, + "logits/rejected": -2.776446580886841, + "logps/chosen": -291.44732666015625, + "logps/rejected": -112.7077865600586, + "loss": 0.1604, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.1892886459827423, + "rewards/margins": 3.2649528980255127, + "rewards/rejected": -3.4542412757873535, "step": 570 }, { "epoch": 1.2, - "learning_rate": 1.3368983957219252e-05, - "logits/chosen": -2.3930814266204834, - "logits/rejected": -2.3834025859832764, - "logps/chosen": -290.7808837890625, - "logps/rejected": -206.78366088867188, - "loss": 0.0474, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -4.984532356262207, - "rewards/margins": 8.283910751342773, - "rewards/rejected": -13.26844310760498, + "learning_rate": 1.3368983957219252e-06, + "logits/chosen": -2.711365222930908, + "logits/rejected": -2.761141300201416, + "logps/chosen": -246.60989379882812, + "logps/rejected": -106.89306640625, + "loss": 0.1662, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.5674314498901367, + "rewards/margins": 2.7119510173797607, + "rewards/rejected": -3.2793827056884766, "step": 580 }, { "epoch": 1.22, - "learning_rate": 1.3216195569136747e-05, - "logits/chosen": -2.4135966300964355, - "logits/rejected": -2.3652544021606445, - "logps/chosen": -301.96429443359375, - "logps/rejected": -199.56053161621094, - "loss": 0.0487, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -4.061481475830078, - "rewards/margins": 8.369136810302734, - "rewards/rejected": -12.430618286132812, + "learning_rate": 1.3216195569136745e-06, + "logits/chosen": -2.706845283508301, + "logits/rejected": -2.7259907722473145, + "logps/chosen": -266.33453369140625, + "logps/rejected": -109.5294189453125, + "loss": 0.1638, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.49850577116012573, + "rewards/margins": 2.929001569747925, + "rewards/rejected": -3.4275078773498535, "step": 590 }, { "epoch": 1.24, - "learning_rate": 1.3063407181054241e-05, - "logits/chosen": -2.4600701332092285, - "logits/rejected": -2.3951315879821777, - "logps/chosen": -320.52459716796875, - "logps/rejected": -210.39572143554688, - "loss": 0.0493, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.6905012130737305, - "rewards/margins": 8.37987232208252, - "rewards/rejected": -13.07037353515625, + "learning_rate": 1.306340718105424e-06, + "logits/chosen": -2.7694671154022217, + "logits/rejected": -2.7433857917785645, + "logps/chosen": -279.09429931640625, + "logps/rejected": -113.45039367675781, + "loss": 0.1663, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5474687218666077, + "rewards/margins": 2.8283703327178955, + "rewards/rejected": -3.3758392333984375, "step": 600 }, { "epoch": 1.26, - "learning_rate": 1.2910618792971734e-05, - "logits/chosen": -2.376004219055176, - "logits/rejected": -2.2585413455963135, - "logps/chosen": -317.3911437988281, - "logps/rejected": -210.0692901611328, - "loss": 0.048, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.278079986572266, - "rewards/margins": 8.980937004089355, - "rewards/rejected": -14.259016036987305, + "learning_rate": 1.2910618792971732e-06, + "logits/chosen": -2.724958896636963, + "logits/rejected": -2.6607446670532227, + "logps/chosen": -268.52862548828125, + "logps/rejected": -99.50247955322266, + "loss": 0.1663, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.39182907342910767, + "rewards/margins": 2.8105075359344482, + "rewards/rejected": -3.2023367881774902, "step": 610 }, { "epoch": 1.28, - "learning_rate": 1.2757830404889229e-05, - "logits/chosen": -2.427064895629883, - "logits/rejected": -2.379770517349243, - "logps/chosen": -339.13507080078125, - "logps/rejected": -217.5348358154297, - "loss": 0.0366, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.716102123260498, - "rewards/margins": 8.656492233276367, - "rewards/rejected": -13.372593879699707, + "learning_rate": 1.2757830404889227e-06, + "logits/chosen": -2.7382755279541016, + "logits/rejected": -2.747498035430908, + "logps/chosen": -295.7679748535156, + "logps/rejected": -118.82535552978516, + "loss": 0.1486, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.37939247488975525, + "rewards/margins": 3.1222548484802246, + "rewards/rejected": -3.5016472339630127, "step": 620 }, { "epoch": 1.3, - "learning_rate": 1.2605042016806723e-05, - "logits/chosen": -2.4690704345703125, - "logits/rejected": -2.4057745933532715, - "logps/chosen": -324.30926513671875, - "logps/rejected": -215.36129760742188, - "loss": 0.0407, + "learning_rate": 1.2605042016806722e-06, + "logits/chosen": -2.7766032218933105, + "logits/rejected": -2.7410993576049805, + "logps/chosen": -282.5389099121094, + "logps/rejected": -115.2748794555664, + "loss": 0.153, "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.616096496582031, - "rewards/margins": 9.027668952941895, - "rewards/rejected": -13.643765449523926, + "rewards/chosen": -0.4390658736228943, + "rewards/margins": 3.1960551738739014, + "rewards/rejected": -3.6351211071014404, "step": 630 }, { "epoch": 1.32, - "learning_rate": 1.2452253628724218e-05, - "logits/chosen": -2.4484169483184814, - "logits/rejected": -2.408360004425049, - "logps/chosen": -343.5368957519531, - "logps/rejected": -223.6841583251953, - "loss": 0.0397, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.842722415924072, - "rewards/margins": 8.921377182006836, - "rewards/rejected": -13.764101028442383, + "learning_rate": 1.2452253628724215e-06, + "logits/chosen": -2.7750675678253174, + "logits/rejected": -2.781956195831299, + "logps/chosen": -299.9912109375, + "logps/rejected": -124.12471008300781, + "loss": 0.1546, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.4881526827812195, + "rewards/margins": 3.3200020790100098, + "rewards/rejected": -3.808154582977295, "step": 640 }, { "epoch": 1.34, - "learning_rate": 1.2299465240641712e-05, - "logits/chosen": -2.38913893699646, - "logits/rejected": -2.293104648590088, - "logps/chosen": -335.9158630371094, - "logps/rejected": -223.899658203125, - "loss": 0.045, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.496028900146484, - "rewards/margins": 8.91499137878418, - "rewards/rejected": -14.41102123260498, + "learning_rate": 1.229946524064171e-06, + "logits/chosen": -2.7297987937927246, + "logits/rejected": -2.668787956237793, + "logps/chosen": -285.3520202636719, + "logps/rejected": -114.99342346191406, + "loss": 0.1551, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.43964657187461853, + "rewards/margins": 3.080749988555908, + "rewards/rejected": -3.5203967094421387, "step": 650 }, { "epoch": 1.36, - "learning_rate": 1.2146676852559206e-05, - "logits/chosen": -2.3704442977905273, - "logits/rejected": -2.312631130218506, - "logps/chosen": -335.7287292480469, - "logps/rejected": -221.09228515625, - "loss": 0.0409, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -5.297285079956055, - "rewards/margins": 8.652203559875488, - "rewards/rejected": -13.949490547180176, + "learning_rate": 1.2146676852559206e-06, + "logits/chosen": -2.70336651802063, + "logits/rejected": -2.7211241722106934, + "logps/chosen": -288.2897644042969, + "logps/rejected": -117.46827697753906, + "loss": 0.1598, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5533924102783203, + "rewards/margins": 3.033695697784424, + "rewards/rejected": -3.587088108062744, "step": 660 }, { "epoch": 1.38, - "learning_rate": 1.1993888464476701e-05, - "logits/chosen": -2.36325740814209, - "logits/rejected": -2.3269762992858887, - "logps/chosen": -335.0709533691406, - "logps/rejected": -224.7407684326172, - "loss": 0.0417, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.079594612121582, - "rewards/margins": 8.919063568115234, - "rewards/rejected": -14.998659133911133, + "learning_rate": 1.19938884644767e-06, + "logits/chosen": -2.7294211387634277, + "logits/rejected": -2.776686191558838, + "logps/chosen": -279.3514709472656, + "logps/rejected": -110.04444885253906, + "loss": 0.1681, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.5076431035995483, + "rewards/margins": 3.0213823318481445, + "rewards/rejected": -3.5290253162384033, "step": 670 }, { "epoch": 1.4, - "learning_rate": 1.1841100076394194e-05, - "logits/chosen": -2.380213737487793, - "logits/rejected": -2.342026472091675, - "logps/chosen": -319.50146484375, - "logps/rejected": -216.95565795898438, - "loss": 0.0387, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.395963668823242, - "rewards/margins": 8.671093940734863, - "rewards/rejected": -14.067057609558105, + "learning_rate": 1.1841100076394192e-06, + "logits/chosen": -2.749238967895508, + "logits/rejected": -2.7573282718658447, + "logps/chosen": -270.27874755859375, + "logps/rejected": -111.33082580566406, + "loss": 0.1629, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4736942648887634, + "rewards/margins": 3.030879259109497, + "rewards/rejected": -3.5045738220214844, "step": 680 }, { "epoch": 1.42, - "learning_rate": 1.1688311688311688e-05, - "logits/chosen": -2.3510377407073975, - "logits/rejected": -2.320303440093994, - "logps/chosen": -352.9103088378906, - "logps/rejected": -234.5736083984375, - "loss": 0.0322, - "rewards/accuracies": 0.96875, - "rewards/chosen": -5.800360202789307, - "rewards/margins": 9.392560005187988, - "rewards/rejected": -15.19292163848877, + "learning_rate": 1.1688311688311688e-06, + "logits/chosen": -2.7193169593811035, + "logits/rejected": -2.7363669872283936, + "logps/chosen": -299.65753173828125, + "logps/rejected": -120.58744812011719, + "loss": 0.1436, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.4750828146934509, + "rewards/margins": 3.319221019744873, + "rewards/rejected": -3.794304370880127, "step": 690 }, { "epoch": 1.44, - "learning_rate": 1.1535523300229183e-05, - "logits/chosen": -2.3935928344726562, - "logits/rejected": -2.379652738571167, - "logps/chosen": -327.9419860839844, - "logps/rejected": -199.2997589111328, - "loss": 0.0573, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.665543556213379, - "rewards/margins": 7.67110538482666, - "rewards/rejected": -12.336648941040039, + "learning_rate": 1.153552330022918e-06, + "logits/chosen": -2.7201449871063232, + "logits/rejected": -2.7599716186523438, + "logps/chosen": -285.97869873046875, + "logps/rejected": -111.79976654052734, + "loss": 0.1637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.46921300888061523, + "rewards/margins": 3.1174368858337402, + "rewards/rejected": -3.5866501331329346, "step": 700 }, { "epoch": 1.46, - "learning_rate": 1.1382734912146677e-05, - "logits/chosen": -2.370570659637451, - "logits/rejected": -2.384655475616455, - "logps/chosen": -322.21661376953125, - "logps/rejected": -200.2702178955078, - "loss": 0.0441, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.125889301300049, - "rewards/margins": 8.075443267822266, - "rewards/rejected": -12.201333999633789, + "learning_rate": 1.1382734912146676e-06, + "logits/chosen": -2.683954954147339, + "logits/rejected": -2.760031223297119, + "logps/chosen": -285.8088073730469, + "logps/rejected": -114.0859375, + "loss": 0.1569, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.4851107597351074, + "rewards/margins": 3.097794771194458, + "rewards/rejected": -3.5829052925109863, "step": 710 }, { "epoch": 1.48, - "learning_rate": 1.1229946524064172e-05, - "logits/chosen": -2.3510353565216064, - "logits/rejected": -2.3337533473968506, - "logps/chosen": -299.6921081542969, - "logps/rejected": -213.6602020263672, - "loss": 0.0458, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -4.6531476974487305, - "rewards/margins": 8.622300148010254, - "rewards/rejected": -13.2754487991333, + "learning_rate": 1.1229946524064172e-06, + "logits/chosen": -2.695244550704956, + "logits/rejected": -2.7127084732055664, + "logps/chosen": -258.01812744140625, + "logps/rejected": -116.52490234375, + "loss": 0.148, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.48574456572532654, + "rewards/margins": 3.0761725902557373, + "rewards/rejected": -3.5619170665740967, "step": 720 }, { "epoch": 1.51, - "learning_rate": 1.1077158135981668e-05, - "logits/chosen": -2.3112430572509766, - "logits/rejected": -2.2705492973327637, - "logps/chosen": -301.2213439941406, - "logps/rejected": -215.2222900390625, - "loss": 0.0478, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.078838348388672, - "rewards/margins": 8.204427719116211, - "rewards/rejected": -14.2832670211792, + "learning_rate": 1.1077158135981665e-06, + "logits/chosen": -2.6948394775390625, + "logits/rejected": -2.7225587368011475, + "logps/chosen": -247.1420440673828, + "logps/rejected": -108.1529312133789, + "loss": 0.1546, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6709108352661133, + "rewards/margins": 2.9054205417633057, + "rewards/rejected": -3.5763309001922607, "step": 730 }, { "epoch": 1.53, - "learning_rate": 1.0924369747899159e-05, - "logits/chosen": -2.406601667404175, - "logits/rejected": -2.351696252822876, - "logps/chosen": -320.53375244140625, - "logps/rejected": -221.86679077148438, - "loss": 0.0416, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.614253044128418, - "rewards/margins": 8.879283905029297, - "rewards/rejected": -14.493537902832031, + "learning_rate": 1.0924369747899158e-06, + "logits/chosen": -2.776315450668335, + "logits/rejected": -2.776728868484497, + "logps/chosen": -270.6875915527344, + "logps/rejected": -114.87939453125, + "loss": 0.1503, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6296391487121582, + "rewards/margins": 3.1651525497436523, + "rewards/rejected": -3.7947916984558105, "step": 740 }, { "epoch": 1.55, - "learning_rate": 1.0771581359816653e-05, - "logits/chosen": -2.368277072906494, - "logits/rejected": -2.260631561279297, - "logps/chosen": -297.88189697265625, - "logps/rejected": -224.37228393554688, - "loss": 0.0403, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.92215633392334, - "rewards/margins": 8.878290176391602, - "rewards/rejected": -13.800447463989258, + "learning_rate": 1.0771581359816653e-06, + "logits/chosen": -2.733975887298584, + "logits/rejected": -2.6755785942077637, + "logps/chosen": -253.8512420654297, + "logps/rejected": -123.96826171875, + "loss": 0.1483, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5190945267677307, + "rewards/margins": 3.2409470081329346, + "rewards/rejected": -3.7600414752960205, "step": 750 }, { "epoch": 1.57, - "learning_rate": 1.0618792971734148e-05, - "logits/chosen": -2.3125646114349365, - "logits/rejected": -2.2867493629455566, - "logps/chosen": -360.1884460449219, - "logps/rejected": -216.7639617919922, - "loss": 0.0436, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -4.890374660491943, - "rewards/margins": 8.949200630187988, - "rewards/rejected": -13.839574813842773, + "learning_rate": 1.0618792971734147e-06, + "logits/chosen": -2.6686155796051025, + "logits/rejected": -2.7576305866241455, + "logps/chosen": -315.3510437011719, + "logps/rejected": -116.48152160644531, + "loss": 0.1401, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.4066368043422699, + "rewards/margins": 3.404694080352783, + "rewards/rejected": -3.811330795288086, "step": 760 }, { "epoch": 1.59, - "learning_rate": 1.0466004583651644e-05, - "logits/chosen": -2.3664181232452393, - "logits/rejected": -2.3008503913879395, - "logps/chosen": -329.5279541015625, - "logps/rejected": -223.3599090576172, - "loss": 0.0379, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -5.103986740112305, - "rewards/margins": 8.835712432861328, - "rewards/rejected": -13.939699172973633, + "learning_rate": 1.0466004583651642e-06, + "logits/chosen": -2.7223763465881348, + "logits/rejected": -2.7043228149414062, + "logps/chosen": -285.30767822265625, + "logps/rejected": -123.26036071777344, + "loss": 0.1394, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.6819589734077454, + "rewards/margins": 3.2477848529815674, + "rewards/rejected": -3.929744005203247, "step": 770 }, { "epoch": 1.61, - "learning_rate": 1.0313216195569139e-05, - "logits/chosen": -2.351306200027466, - "logits/rejected": -2.31449556350708, - "logps/chosen": -317.2604064941406, - "logps/rejected": -220.0719757080078, - "loss": 0.0347, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.727662086486816, - "rewards/margins": 9.234613418579102, - "rewards/rejected": -14.962274551391602, + "learning_rate": 1.0313216195569137e-06, + "logits/chosen": -2.710700511932373, + "logits/rejected": -2.748891592025757, + "logps/chosen": -267.1021423339844, + "logps/rejected": -108.77484130859375, + "loss": 0.15, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7118343710899353, + "rewards/margins": 3.120725154876709, + "rewards/rejected": -3.832559585571289, "step": 780 }, { "epoch": 1.63, - "learning_rate": 1.0160427807486633e-05, - "logits/chosen": -2.3417956829071045, - "logits/rejected": -2.2895348072052, - "logps/chosen": -328.92681884765625, - "logps/rejected": -225.81887817382812, - "loss": 0.0435, - "rewards/accuracies": 0.96875, - "rewards/chosen": -5.588496685028076, - "rewards/margins": 8.750701904296875, - "rewards/rejected": -14.339200019836426, + "learning_rate": 1.016042780748663e-06, + "logits/chosen": -2.6949081420898438, + "logits/rejected": -2.6823809146881104, + "logps/chosen": -278.98101806640625, + "logps/rejected": -121.89730072021484, + "loss": 0.1449, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.5939213037490845, + "rewards/margins": 3.3531200885772705, + "rewards/rejected": -3.9470412731170654, "step": 790 }, { "epoch": 1.65, - "learning_rate": 1.0007639419404128e-05, - "logits/chosen": -2.3706483840942383, - "logits/rejected": -2.329810857772827, - "logps/chosen": -322.5093688964844, - "logps/rejected": -217.6160430908203, - "loss": 0.0426, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.1367669105529785, - "rewards/margins": 9.024019241333008, - "rewards/rejected": -14.160784721374512, + "learning_rate": 1.0007639419404126e-06, + "logits/chosen": -2.710453748703003, + "logits/rejected": -2.720715045928955, + "logps/chosen": -277.281982421875, + "logps/rejected": -114.69615173339844, + "loss": 0.1473, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6140307188034058, + "rewards/margins": 3.254765748977661, + "rewards/rejected": -3.8687965869903564, "step": 800 }, { "epoch": 1.67, - "learning_rate": 9.85485103132162e-06, - "logits/chosen": -2.3830759525299072, - "logits/rejected": -2.3143844604492188, - "logps/chosen": -335.95965576171875, - "logps/rejected": -224.92111206054688, - "loss": 0.041, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.424968719482422, - "rewards/margins": 9.111248970031738, - "rewards/rejected": -14.536218643188477, + "learning_rate": 9.85485103132162e-07, + "logits/chosen": -2.7251040935516357, + "logits/rejected": -2.693974494934082, + "logps/chosen": -286.97686767578125, + "logps/rejected": -119.7470703125, + "loss": 0.1369, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5266898274421692, + "rewards/margins": 3.49212384223938, + "rewards/rejected": -4.018813133239746, "step": 810 }, { "epoch": 1.69, - "learning_rate": 9.702062643239115e-06, - "logits/chosen": -2.399111747741699, - "logits/rejected": -2.3377134799957275, - "logps/chosen": -342.5674133300781, - "logps/rejected": -235.2672119140625, - "loss": 0.0517, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -5.91513729095459, - "rewards/margins": 9.157655715942383, - "rewards/rejected": -15.072793960571289, + "learning_rate": 9.702062643239114e-07, + "logits/chosen": -2.7554683685302734, + "logits/rejected": -2.762106418609619, + "logps/chosen": -289.8918762207031, + "logps/rejected": -125.37986755371094, + "loss": 0.1401, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.6475861668586731, + "rewards/margins": 3.4364724159240723, + "rewards/rejected": -4.08405876159668, "step": 820 }, { "epoch": 1.71, - "learning_rate": 9.54927425515661e-06, - "logits/chosen": -2.371589183807373, - "logits/rejected": -2.301905632019043, - "logps/chosen": -327.13055419921875, - "logps/rejected": -220.7751007080078, - "loss": 0.0341, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.895151615142822, - "rewards/margins": 8.910463333129883, - "rewards/rejected": -13.80561351776123, + "learning_rate": 9.549274255156608e-07, + "logits/chosen": -2.7048392295837402, + "logits/rejected": -2.6878910064697266, + "logps/chosen": -284.31219482421875, + "logps/rejected": -122.46697998046875, + "loss": 0.1402, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.6133157014846802, + "rewards/margins": 3.361485242843628, + "rewards/rejected": -3.9748013019561768, "step": 830 }, { "epoch": 1.73, - "learning_rate": 9.396485867074104e-06, - "logits/chosen": -2.3354568481445312, - "logits/rejected": -2.2669918537139893, - "logps/chosen": -338.8708801269531, - "logps/rejected": -218.30947875976562, - "loss": 0.0458, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.41546630859375, - "rewards/margins": 8.734143257141113, - "rewards/rejected": -14.149609565734863, + "learning_rate": 9.396485867074102e-07, + "logits/chosen": -2.6750662326812744, + "logits/rejected": -2.696866750717163, + "logps/chosen": -291.42938232421875, + "logps/rejected": -113.9036865234375, + "loss": 0.1509, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.6713167428970337, + "rewards/margins": 3.037714719772339, + "rewards/rejected": -3.709031581878662, "step": 840 }, { "epoch": 1.75, - "learning_rate": 9.243697478991598e-06, - "logits/chosen": -2.3707711696624756, - "logits/rejected": -2.313260316848755, - "logps/chosen": -302.56170654296875, - "logps/rejected": -212.74365234375, - "loss": 0.0555, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.656682014465332, - "rewards/margins": 8.04932975769043, - "rewards/rejected": -13.706011772155762, + "learning_rate": 9.243697478991597e-07, + "logits/chosen": -2.68489933013916, + "logits/rejected": -2.6937882900238037, + "logps/chosen": -254.6746063232422, + "logps/rejected": -115.2020263671875, + "loss": 0.1543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8679746389389038, + "rewards/margins": 3.083875894546509, + "rewards/rejected": -3.951850414276123, "step": 850 }, { "epoch": 1.77, - "learning_rate": 9.090909090909091e-06, - "logits/chosen": -2.337785005569458, - "logits/rejected": -2.2829418182373047, - "logps/chosen": -330.53802490234375, - "logps/rejected": -225.6645965576172, - "loss": 0.035, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.462188243865967, - "rewards/margins": 9.399454116821289, - "rewards/rejected": -14.861642837524414, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": -2.6926121711730957, + "logits/rejected": -2.698666572570801, + "logps/chosen": -280.2935485839844, + "logps/rejected": -116.54022216796875, + "loss": 0.1349, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.43773943185806274, + "rewards/margins": 3.511465072631836, + "rewards/rejected": -3.949204683303833, "step": 860 }, { "epoch": 1.79, - "learning_rate": 8.938120702826586e-06, - "logits/chosen": -2.3577864170074463, - "logits/rejected": -2.2690200805664062, - "logps/chosen": -331.9044494628906, - "logps/rejected": -232.2950897216797, - "loss": 0.0348, + "learning_rate": 8.938120702826585e-07, + "logits/chosen": -2.740382671356201, + "logits/rejected": -2.68933367729187, + "logps/chosen": -276.387451171875, + "logps/rejected": -115.27424621582031, + "loss": 0.136, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.0897955894470215, - "rewards/margins": 9.550823211669922, - "rewards/rejected": -15.640619277954102, + "rewards/chosen": -0.5380962491035461, + "rewards/margins": 3.400437593460083, + "rewards/rejected": -3.9385337829589844, "step": 870 }, { "epoch": 1.81, - "learning_rate": 8.78533231474408e-06, - "logits/chosen": -2.4222211837768555, - "logits/rejected": -2.3889050483703613, - "logps/chosen": -339.27923583984375, - "logps/rejected": -226.44393920898438, - "loss": 0.0426, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.199553489685059, - "rewards/margins": 8.753616333007812, - "rewards/rejected": -13.953170776367188, + "learning_rate": 8.785332314744079e-07, + "logits/chosen": -2.7333688735961914, + "logits/rejected": -2.771758794784546, + "logps/chosen": -293.49578857421875, + "logps/rejected": -126.72651672363281, + "loss": 0.1379, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.6212050914764404, + "rewards/margins": 3.3602237701416016, + "rewards/rejected": -3.9814281463623047, "step": 880 }, { "epoch": 1.84, - "learning_rate": 8.632543926661574e-06, - "logits/chosen": -2.3915364742279053, - "logits/rejected": -2.3292319774627686, - "logps/chosen": -357.2469482421875, - "logps/rejected": -227.48550415039062, - "loss": 0.0419, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.296126365661621, - "rewards/margins": 9.113978385925293, - "rewards/rejected": -14.41010570526123, + "learning_rate": 8.632543926661573e-07, + "logits/chosen": -2.709484577178955, + "logits/rejected": -2.7399744987487793, + "logps/chosen": -309.5272521972656, + "logps/rejected": -123.85481262207031, + "loss": 0.1368, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5241509675979614, + "rewards/margins": 3.5228843688964844, + "rewards/rejected": -4.0470356941223145, "step": 890 }, { "epoch": 1.86, - "learning_rate": 8.479755538579069e-06, - "logits/chosen": -2.3756537437438965, - "logits/rejected": -2.32563853263855, - "logps/chosen": -318.2647399902344, - "logps/rejected": -227.1390838623047, - "loss": 0.0406, - "rewards/accuracies": 0.96875, - "rewards/chosen": -5.1770830154418945, - "rewards/margins": 9.713737487792969, - "rewards/rejected": -14.890820503234863, + "learning_rate": 8.479755538579067e-07, + "logits/chosen": -2.7088303565979004, + "logits/rejected": -2.7025811672210693, + "logps/chosen": -271.97784423828125, + "logps/rejected": -118.300537109375, + "loss": 0.1319, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.5483931303024292, + "rewards/margins": 3.45857310295105, + "rewards/rejected": -4.006965637207031, "step": 900 }, { "epoch": 1.88, - "learning_rate": 8.326967150496563e-06, - "logits/chosen": -2.41601300239563, - "logits/rejected": -2.297445297241211, - "logps/chosen": -348.1459655761719, - "logps/rejected": -232.72802734375, - "loss": 0.0424, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.221807479858398, - "rewards/margins": 9.922318458557129, - "rewards/rejected": -15.144124031066895, + "learning_rate": 8.326967150496562e-07, + "logits/chosen": -2.747811794281006, + "logits/rejected": -2.6834397315979004, + "logps/chosen": -301.4414367675781, + "logps/rejected": -123.6478271484375, + "loss": 0.1264, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.5513515472412109, + "rewards/margins": 3.684753894805908, + "rewards/rejected": -4.236104965209961, "step": 910 }, { "epoch": 1.9, - "learning_rate": 8.174178762414056e-06, - "logits/chosen": -2.425701856613159, - "logits/rejected": -2.3687078952789307, - "logps/chosen": -341.102294921875, - "logps/rejected": -232.12783813476562, - "loss": 0.0342, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -4.615460395812988, - "rewards/margins": 9.65539264678955, - "rewards/rejected": -14.270853042602539, + "learning_rate": 8.174178762414056e-07, + "logits/chosen": -2.7275288105010986, + "logits/rejected": -2.7216854095458984, + "logps/chosen": -300.93328857421875, + "logps/rejected": -133.4979248046875, + "loss": 0.1301, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.5985573530197144, + "rewards/margins": 3.8093056678771973, + "rewards/rejected": -4.407863140106201, "step": 920 }, { "epoch": 1.92, - "learning_rate": 8.02139037433155e-06, - "logits/chosen": -2.423699378967285, - "logits/rejected": -2.3600199222564697, - "logps/chosen": -321.5314025878906, - "logps/rejected": -228.6741943359375, - "loss": 0.0316, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": -2.7658352851867676, + "logits/rejected": -2.738119602203369, + "logps/chosen": -272.04449462890625, + "logps/rejected": -120.6009521484375, + "loss": 0.1252, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.644195556640625, - "rewards/margins": 9.397247314453125, - "rewards/rejected": -15.041444778442383, + "rewards/chosen": -0.6955040693283081, + "rewards/margins": 3.5386176109313965, + "rewards/rejected": -4.234121799468994, "step": 930 }, { "epoch": 1.94, - "learning_rate": 7.868601986249045e-06, - "logits/chosen": -2.4263432025909424, - "logits/rejected": -2.3628458976745605, - "logps/chosen": -330.029541015625, - "logps/rejected": -225.56637573242188, - "loss": 0.054, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -5.213741779327393, - "rewards/margins": 8.961358070373535, - "rewards/rejected": -14.175100326538086, + "learning_rate": 7.868601986249045e-07, + "logits/chosen": -2.7340145111083984, + "logits/rejected": -2.7205958366394043, + "logps/chosen": -286.5509948730469, + "logps/rejected": -126.727294921875, + "loss": 0.1468, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8658866882324219, + "rewards/margins": 3.4253039360046387, + "rewards/rejected": -4.2911906242370605, "step": 940 }, { "epoch": 1.96, - "learning_rate": 7.71581359816654e-06, - "logits/chosen": -2.3590340614318848, - "logits/rejected": -2.2720718383789062, - "logps/chosen": -272.0068359375, - "logps/rejected": -213.19833374023438, - "loss": 0.044, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.489508628845215, - "rewards/margins": 8.610600471496582, - "rewards/rejected": -14.100107192993164, + "learning_rate": 7.71581359816654e-07, + "logits/chosen": -2.7004799842834473, + "logits/rejected": -2.6438944339752197, + "logps/chosen": -226.2438507080078, + "logps/rejected": -113.9751205444336, + "loss": 0.1347, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.913210391998291, + "rewards/margins": 3.264578342437744, + "rewards/rejected": -4.177788734436035, "step": 950 }, { "epoch": 1.98, - "learning_rate": 7.563025210084034e-06, - "logits/chosen": -2.3800034523010254, - "logits/rejected": -2.3298089504241943, - "logps/chosen": -319.4668884277344, - "logps/rejected": -224.38330078125, - "loss": 0.0277, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.616637706756592, - "rewards/margins": 9.42906665802002, - "rewards/rejected": -15.045705795288086, + "learning_rate": 7.563025210084033e-07, + "logits/chosen": -2.7258245944976807, + "logits/rejected": -2.792160987854004, + "logps/chosen": -270.41314697265625, + "logps/rejected": -115.57466125488281, + "loss": 0.1153, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.7112643122673035, + "rewards/margins": 3.4535746574401855, + "rewards/rejected": -4.164839267730713, "step": 960 }, { "epoch": 2.0, - "learning_rate": 7.410236822001529e-06, - "logits/chosen": -2.35837984085083, - "logits/rejected": -2.2928102016448975, - "logps/chosen": -343.45587158203125, - "logps/rejected": -222.7772979736328, - "loss": 0.0443, + "learning_rate": 7.410236822001527e-07, + "logits/chosen": -2.682445526123047, + "logits/rejected": -2.680621862411499, + "logps/chosen": -300.4261474609375, + "logps/rejected": -123.0173110961914, + "loss": 0.1305, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -4.847626209259033, - "rewards/margins": 9.161903381347656, - "rewards/rejected": -14.009529113769531, + "rewards/chosen": -0.544654130935669, + "rewards/margins": 3.4888763427734375, + "rewards/rejected": -4.033530235290527, "step": 970 }, { "epoch": 2.0, - "eval_logits/chosen": -2.3850746154785156, - "eval_logits/rejected": -2.3026185035705566, - "eval_logps/chosen": -329.6904296875, - "eval_logps/rejected": -262.23040771484375, - "eval_loss": 0.7951104640960693, - "eval_rewards/accuracies": 0.73828125, - "eval_rewards/chosen": -5.5853447914123535, - "eval_rewards/margins": 3.3340888023376465, - "eval_rewards/rejected": -8.91943359375, - "eval_runtime": 258.3653, - "eval_samples_per_second": 7.741, + "eval_logits/chosen": -2.730585813522339, + "eval_logits/rejected": -2.6866092681884766, + "eval_logps/chosen": -281.4207763671875, + "eval_logps/rejected": -201.63304138183594, + "eval_loss": 0.3693665862083435, + "eval_rewards/accuracies": 0.82421875, + "eval_rewards/chosen": -0.7583777904510498, + "eval_rewards/margins": 2.1013174057006836, + "eval_rewards/rejected": -2.8596951961517334, + "eval_runtime": 259.2152, + "eval_samples_per_second": 7.716, "eval_steps_per_second": 0.062, "step": 970 }, { "epoch": 2.02, - "learning_rate": 7.257448433919023e-06, - "logits/chosen": -2.3800430297851562, - "logits/rejected": -2.3618946075439453, - "logps/chosen": -315.0664978027344, - "logps/rejected": -229.7184600830078, - "loss": 0.0275, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -5.884152889251709, - "rewards/margins": 9.261144638061523, - "rewards/rejected": -15.145297050476074, + "learning_rate": 7.257448433919023e-07, + "logits/chosen": -2.7307159900665283, + "logits/rejected": -2.760425090789795, + "logps/chosen": -263.69964599609375, + "logps/rejected": -121.00809478759766, + "loss": 0.1237, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7474689483642578, + "rewards/margins": 3.526792526245117, + "rewards/rejected": -4.274261474609375, "step": 980 }, { "epoch": 2.04, - "learning_rate": 7.104660045836517e-06, - "logits/chosen": -2.3531386852264404, - "logits/rejected": -2.3184380531311035, - "logps/chosen": -327.3660888671875, - "logps/rejected": -219.55581665039062, - "loss": 0.026, + "learning_rate": 7.104660045836516e-07, + "logits/chosen": -2.688023090362549, + "logits/rejected": -2.6981234550476074, + "logps/chosen": -283.63311767578125, + "logps/rejected": -119.1172866821289, + "loss": 0.1167, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -4.929337024688721, - "rewards/margins": 9.458481788635254, - "rewards/rejected": -14.3878173828125, + "rewards/chosen": -0.5560388565063477, + "rewards/margins": 3.7879276275634766, + "rewards/rejected": -4.343966484069824, "step": 990 }, { "epoch": 2.06, - "learning_rate": 6.951871657754011e-06, - "logits/chosen": -2.3432507514953613, - "logits/rejected": -2.2758610248565674, - "logps/chosen": -325.6359558105469, - "logps/rejected": -223.78164672851562, - "loss": 0.0353, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.556198596954346, - "rewards/margins": 9.385514259338379, - "rewards/rejected": -14.941709518432617, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": -2.6954102516174316, + "logits/rejected": -2.696408748626709, + "logps/chosen": -277.02764892578125, + "logps/rejected": -115.6692886352539, + "loss": 0.1243, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6953684091567993, + "rewards/margins": 3.4351089000701904, + "rewards/rejected": -4.130476951599121, "step": 1000 }, { "epoch": 2.08, - "learning_rate": 6.799083269671506e-06, - "logits/chosen": -2.3102869987487793, - "logits/rejected": -2.2865514755249023, - "logps/chosen": -304.4931640625, - "logps/rejected": -233.22525024414062, - "loss": 0.0308, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.142067909240723, - "rewards/margins": 9.77638053894043, - "rewards/rejected": -15.918451309204102, + "learning_rate": 6.799083269671506e-07, + "logits/chosen": -2.6869771480560303, + "logits/rejected": -2.7037670612335205, + "logps/chosen": -251.7620086669922, + "logps/rejected": -117.48094177246094, + "loss": 0.1317, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.8689507246017456, + "rewards/margins": 3.475069046020508, + "rewards/rejected": -4.344019889831543, "step": 1010 }, { "epoch": 2.1, - "learning_rate": 6.646294881588999e-06, - "logits/chosen": -2.318485736846924, - "logits/rejected": -2.2506542205810547, - "logps/chosen": -343.11334228515625, - "logps/rejected": -244.45883178710938, - "loss": 0.0273, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -6.299825191497803, - "rewards/margins": 10.17300033569336, - "rewards/rejected": -16.472827911376953, + "learning_rate": 6.646294881588999e-07, + "logits/chosen": -2.6988840103149414, + "logits/rejected": -2.6869451999664307, + "logps/chosen": -287.7177734375, + "logps/rejected": -123.6144790649414, + "loss": 0.119, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7602699995040894, + "rewards/margins": 3.628121852874756, + "rewards/rejected": -4.388391971588135, "step": 1020 }, { "epoch": 2.12, - "learning_rate": 6.493506493506494e-06, - "logits/chosen": -2.348334550857544, - "logits/rejected": -2.3145151138305664, - "logps/chosen": -362.18927001953125, - "logps/rejected": -244.39047241210938, - "loss": 0.0272, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.444186210632324, - "rewards/margins": 10.421486854553223, - "rewards/rejected": -16.865673065185547, + "learning_rate": 6.493506493506493e-07, + "logits/chosen": -2.7356371879577637, + "logits/rejected": -2.8164889812469482, + "logps/chosen": -305.6545715332031, + "logps/rejected": -119.76715087890625, + "loss": 0.1255, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7907172441482544, + "rewards/margins": 3.6126227378845215, + "rewards/rejected": -4.403339862823486, "step": 1030 }, { "epoch": 2.14, - "learning_rate": 6.340718105423988e-06, - "logits/chosen": -2.290079116821289, - "logits/rejected": -2.1995062828063965, - "logps/chosen": -333.0499572753906, - "logps/rejected": -232.38363647460938, - "loss": 0.0287, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.774559020996094, - "rewards/margins": 10.013866424560547, - "rewards/rejected": -15.788423538208008, + "learning_rate": 6.340718105423987e-07, + "logits/chosen": -2.647362232208252, + "logits/rejected": -2.6018130779266357, + "logps/chosen": -281.76092529296875, + "logps/rejected": -118.23252868652344, + "loss": 0.121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.64565110206604, + "rewards/margins": 3.727663516998291, + "rewards/rejected": -4.37331485748291, "step": 1040 }, { "epoch": 2.16, - "learning_rate": 6.187929717341482e-06, - "logits/chosen": -2.334545612335205, - "logits/rejected": -2.236114501953125, - "logps/chosen": -323.73876953125, - "logps/rejected": -227.9995880126953, - "loss": 0.033, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.580103874206543, - "rewards/margins": 10.12531566619873, - "rewards/rejected": -15.705419540405273, + "learning_rate": 6.187929717341482e-07, + "logits/chosen": -2.7285995483398438, + "logits/rejected": -2.67795467376709, + "logps/chosen": -273.88555908203125, + "logps/rejected": -112.9361801147461, + "loss": 0.1223, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.5947860479354858, + "rewards/margins": 3.604292392730713, + "rewards/rejected": -4.199078559875488, "step": 1050 }, { "epoch": 2.19, - "learning_rate": 6.0351413292589764e-06, - "logits/chosen": -2.3219540119171143, - "logits/rejected": -2.237586736679077, - "logps/chosen": -338.9471130371094, - "logps/rejected": -236.09146118164062, - "loss": 0.0303, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.9066386222839355, - "rewards/margins": 9.865789413452148, - "rewards/rejected": -15.772427558898926, + "learning_rate": 6.035141329258976e-07, + "logits/chosen": -2.718034505844116, + "logits/rejected": -2.6936326026916504, + "logps/chosen": -287.3531799316406, + "logps/rejected": -123.18647766113281, + "loss": 0.1196, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7472411394119263, + "rewards/margins": 3.734687328338623, + "rewards/rejected": -4.481928825378418, "step": 1060 }, { "epoch": 2.21, - "learning_rate": 5.882352941176471e-06, - "logits/chosen": -2.3054463863372803, - "logits/rejected": -2.2279181480407715, - "logps/chosen": -313.12384033203125, - "logps/rejected": -232.56820678710938, - "loss": 0.0389, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.444895267486572, - "rewards/margins": 9.621152877807617, - "rewards/rejected": -16.066049575805664, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -2.699462413787842, + "logits/rejected": -2.667182683944702, + "logps/chosen": -255.7397003173828, + "logps/rejected": -114.04801177978516, + "loss": 0.1287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.706480860710144, + "rewards/margins": 3.5075461864471436, + "rewards/rejected": -4.214027404785156, "step": 1070 }, { "epoch": 2.23, - "learning_rate": 5.729564553093966e-06, - "logits/chosen": -2.3564865589141846, - "logits/rejected": -2.3001480102539062, - "logps/chosen": -335.7102966308594, - "logps/rejected": -231.7644805908203, - "loss": 0.0265, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.110272407531738, - "rewards/margins": 9.538274765014648, - "rewards/rejected": -15.648547172546387, + "learning_rate": 5.729564553093965e-07, + "logits/chosen": -2.719064712524414, + "logits/rejected": -2.724663734436035, + "logps/chosen": -283.41607666015625, + "logps/rejected": -120.31791687011719, + "loss": 0.1205, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.8808507919311523, + "rewards/margins": 3.6230416297912598, + "rewards/rejected": -4.503891944885254, "step": 1080 }, { "epoch": 2.25, - "learning_rate": 5.576776165011459e-06, - "logits/chosen": -2.276754379272461, - "logits/rejected": -2.2232775688171387, - "logps/chosen": -332.77874755859375, - "logps/rejected": -249.8245849609375, - "loss": 0.03, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -6.621646881103516, - "rewards/margins": 10.335177421569824, - "rewards/rejected": -16.956823348999023, + "learning_rate": 5.576776165011459e-07, + "logits/chosen": -2.6613950729370117, + "logits/rejected": -2.669309377670288, + "logps/chosen": -274.6718444824219, + "logps/rejected": -123.7230453491211, + "loss": 0.1295, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.8109513521194458, + "rewards/margins": 3.535714626312256, + "rewards/rejected": -4.346665859222412, "step": 1090 }, { "epoch": 2.27, - "learning_rate": 5.423987776928954e-06, - "logits/chosen": -2.385565996170044, - "logits/rejected": -2.2726476192474365, - "logps/chosen": -356.91094970703125, - "logps/rejected": -251.0599822998047, - "loss": 0.0232, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -6.774880886077881, - "rewards/margins": 10.560681343078613, - "rewards/rejected": -17.335561752319336, + "learning_rate": 5.423987776928953e-07, + "logits/chosen": -2.752354860305786, + "logits/rejected": -2.689492702484131, + "logps/chosen": -297.46575927734375, + "logps/rejected": -123.75872802734375, + "loss": 0.1112, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8303642272949219, + "rewards/margins": 3.775070905685425, + "rewards/rejected": -4.605435371398926, "step": 1100 }, { "epoch": 2.29, - "learning_rate": 5.271199388846449e-06, - "logits/chosen": -2.301403045654297, - "logits/rejected": -2.238607406616211, - "logps/chosen": -329.09088134765625, - "logps/rejected": -261.70184326171875, - "loss": 0.0244, + "learning_rate": 5.271199388846448e-07, + "logits/chosen": -2.7163376808166504, + "logits/rejected": -2.7011306285858154, + "logps/chosen": -268.9468078613281, + "logps/rejected": -129.1416473388672, + "loss": 0.1147, "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.769029140472412, - "rewards/margins": 10.928190231323242, - "rewards/rejected": -17.697219848632812, + "rewards/chosen": -0.7546231150627136, + "rewards/margins": 3.6865787506103516, + "rewards/rejected": -4.441201686859131, "step": 1110 }, { "epoch": 2.31, - "learning_rate": 5.118411000763942e-06, - "logits/chosen": -2.344348907470703, - "logits/rejected": -2.238985061645508, - "logps/chosen": -337.75958251953125, - "logps/rejected": -255.0755157470703, - "loss": 0.0282, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.105334281921387, - "rewards/margins": 11.364806175231934, - "rewards/rejected": -17.470142364501953, + "learning_rate": 5.118411000763941e-07, + "logits/chosen": -2.7277450561523438, + "logits/rejected": -2.6973800659179688, + "logps/chosen": -282.5606689453125, + "logps/rejected": -124.7701416015625, + "loss": 0.1192, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5854411125183105, + "rewards/margins": 3.8541629314422607, + "rewards/rejected": -4.439603805541992, "step": 1120 }, { "epoch": 2.33, - "learning_rate": 4.965622612681437e-06, - "logits/chosen": -2.2969937324523926, - "logits/rejected": -2.2662580013275146, - "logps/chosen": -327.1737060546875, - "logps/rejected": -242.359619140625, - "loss": 0.0343, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.986081600189209, - "rewards/margins": 10.588550567626953, - "rewards/rejected": -16.57463264465332, + "learning_rate": 4.965622612681436e-07, + "logits/chosen": -2.6750576496124268, + "logits/rejected": -2.7182953357696533, + "logps/chosen": -274.384765625, + "logps/rejected": -122.15213775634766, + "loss": 0.1222, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7071852684020996, + "rewards/margins": 3.846698045730591, + "rewards/rejected": -4.5538835525512695, "step": 1130 }, { "epoch": 2.35, - "learning_rate": 4.812834224598931e-06, - "logits/chosen": -2.3156192302703857, - "logits/rejected": -2.268474578857422, - "logps/chosen": -317.2707824707031, - "logps/rejected": -234.99734497070312, - "loss": 0.0266, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.135073661804199, - "rewards/margins": 10.014872550964355, - "rewards/rejected": -16.149948120117188, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -2.676004648208618, + "logits/rejected": -2.6993675231933594, + "logps/chosen": -263.89752197265625, + "logps/rejected": -116.71295166015625, + "loss": 0.1107, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7977498769760132, + "rewards/margins": 3.523756742477417, + "rewards/rejected": -4.321506500244141, "step": 1140 }, { "epoch": 2.37, - "learning_rate": 4.660045836516425e-06, - "logits/chosen": -2.362359046936035, - "logits/rejected": -2.3042564392089844, - "logps/chosen": -341.3941650390625, - "logps/rejected": -235.54507446289062, - "loss": 0.0371, + "learning_rate": 4.660045836516425e-07, + "logits/chosen": -2.7055160999298096, + "logits/rejected": -2.7121243476867676, + "logps/chosen": -289.3218994140625, + "logps/rejected": -121.4267807006836, + "loss": 0.1189, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -5.8703131675720215, - "rewards/margins": 10.015909194946289, - "rewards/rejected": -15.886222839355469, + "rewards/chosen": -0.6630896329879761, + "rewards/margins": 3.811305522918701, + "rewards/rejected": -4.474394798278809, "step": 1150 }, { "epoch": 2.39, - "learning_rate": 4.5072574484339196e-06, - "logits/chosen": -2.2984766960144043, - "logits/rejected": -2.271493434906006, - "logps/chosen": -332.32489013671875, - "logps/rejected": -233.0104522705078, - "loss": 0.0332, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.160719394683838, - "rewards/margins": 9.819355010986328, - "rewards/rejected": -15.980074882507324, + "learning_rate": 4.5072574484339185e-07, + "logits/chosen": -2.6670591831207275, + "logits/rejected": -2.690974712371826, + "logps/chosen": -278.7057189941406, + "logps/rejected": -117.63764953613281, + "loss": 0.1214, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.798798143863678, + "rewards/margins": 3.643995761871338, + "rewards/rejected": -4.442793846130371, "step": 1160 }, { "epoch": 2.41, - "learning_rate": 4.354469060351414e-06, - "logits/chosen": -2.336785316467285, - "logits/rejected": -2.2462456226348877, - "logps/chosen": -321.7502746582031, - "logps/rejected": -244.50613403320312, - "loss": 0.0319, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.7888078689575195, - "rewards/margins": 9.919248580932617, - "rewards/rejected": -15.70805549621582, + "learning_rate": 4.3544690603514133e-07, + "logits/chosen": -2.686856269836426, + "logits/rejected": -2.6309545040130615, + "logps/chosen": -271.1042785644531, + "logps/rejected": -131.63259887695312, + "loss": 0.1233, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.7242040038108826, + "rewards/margins": 3.6964995861053467, + "rewards/rejected": -4.420703411102295, "step": 1170 }, { "epoch": 2.43, - "learning_rate": 4.201680672268908e-06, - "logits/chosen": -2.3283114433288574, - "logits/rejected": -2.3021600246429443, - "logps/chosen": -352.926513671875, - "logps/rejected": -241.5557098388672, - "loss": 0.0268, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.513485908508301, - "rewards/margins": 10.350736618041992, - "rewards/rejected": -15.864221572875977, + "learning_rate": 4.2016806722689076e-07, + "logits/chosen": -2.677506923675537, + "logits/rejected": -2.7353405952453613, + "logps/chosen": -304.0357360839844, + "logps/rejected": -128.03981018066406, + "loss": 0.1109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6244128942489624, + "rewards/margins": 3.888216733932495, + "rewards/rejected": -4.512629985809326, "step": 1180 }, { "epoch": 2.45, - "learning_rate": 4.048892284186402e-06, - "logits/chosen": -2.353079080581665, - "logits/rejected": -2.2632689476013184, - "logps/chosen": -317.30316162109375, - "logps/rejected": -238.99789428710938, - "loss": 0.0341, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.315035343170166, - "rewards/margins": 10.289588928222656, - "rewards/rejected": -16.60462188720703, + "learning_rate": 4.0488922841864013e-07, + "logits/chosen": -2.738661527633667, + "logits/rejected": -2.696394443511963, + "logps/chosen": -263.1926574707031, + "logps/rejected": -117.48931884765625, + "loss": 0.1266, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.903983473777771, + "rewards/margins": 3.5497829914093018, + "rewards/rejected": -4.453766345977783, "step": 1190 }, { "epoch": 2.47, - "learning_rate": 3.896103896103897e-06, - "logits/chosen": -2.347299337387085, - "logits/rejected": -2.2617671489715576, - "logps/chosen": -322.52716064453125, - "logps/rejected": -238.89794921875, - "loss": 0.0291, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.98498010635376, - "rewards/margins": 10.374175071716309, - "rewards/rejected": -16.359155654907227, + "learning_rate": 3.896103896103896e-07, + "logits/chosen": -2.721292018890381, + "logits/rejected": -2.6752657890319824, + "logps/chosen": -270.2406005859375, + "logps/rejected": -121.4746322631836, + "loss": 0.1118, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.7563194036483765, + "rewards/margins": 3.8605034351348877, + "rewards/rejected": -4.616823673248291, "step": 1200 }, { "epoch": 2.49, - "learning_rate": 3.7433155080213907e-06, - "logits/chosen": -2.3382067680358887, - "logits/rejected": -2.318660259246826, - "logps/chosen": -314.554443359375, - "logps/rejected": -241.98046875, - "loss": 0.0251, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.864360332489014, - "rewards/margins": 10.600669860839844, - "rewards/rejected": -16.465030670166016, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -2.6903328895568848, + "logits/rejected": -2.7279880046844482, + "logps/chosen": -263.3211975097656, + "logps/rejected": -120.23514556884766, + "loss": 0.1132, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7410348057746887, + "rewards/margins": 3.5494637489318848, + "rewards/rejected": -4.290497779846191, "step": 1210 }, { "epoch": 2.52, - "learning_rate": 3.5905271199388848e-06, - "logits/chosen": -2.33272123336792, - "logits/rejected": -2.2567524909973145, - "logps/chosen": -329.4327392578125, - "logps/rejected": -241.38650512695312, - "loss": 0.026, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.32674503326416, - "rewards/margins": 9.740303039550781, - "rewards/rejected": -16.067049026489258, + "learning_rate": 3.590527119938884e-07, + "logits/chosen": -2.697645425796509, + "logits/rejected": -2.676959753036499, + "logps/chosen": -275.2843322753906, + "logps/rejected": -126.24308776855469, + "loss": 0.1157, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9119027256965637, + "rewards/margins": 3.6408069133758545, + "rewards/rejected": -4.552709102630615, "step": 1220 }, { "epoch": 2.54, - "learning_rate": 3.4377387318563792e-06, - "logits/chosen": -2.304385185241699, - "logits/rejected": -2.242936849594116, - "logps/chosen": -350.80352783203125, - "logps/rejected": -257.4071350097656, - "loss": 0.0303, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.671088218688965, - "rewards/margins": 10.733926773071289, - "rewards/rejected": -17.40501594543457, + "learning_rate": 3.437738731856379e-07, + "logits/chosen": -2.704871654510498, + "logits/rejected": -2.6901638507843018, + "logps/chosen": -292.1809387207031, + "logps/rejected": -129.85531616210938, + "loss": 0.1134, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8088245391845703, + "rewards/margins": 3.841012477874756, + "rewards/rejected": -4.649837493896484, "step": 1230 }, { "epoch": 2.56, - "learning_rate": 3.2849503437738733e-06, - "logits/chosen": -2.3363845348358154, - "logits/rejected": -2.2699573040008545, - "logps/chosen": -340.39910888671875, - "logps/rejected": -249.5596160888672, - "loss": 0.0352, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.956596851348877, - "rewards/margins": 10.645536422729492, - "rewards/rejected": -16.60213279724121, + "learning_rate": 3.2849503437738727e-07, + "logits/chosen": -2.6897072792053223, + "logits/rejected": -2.6975724697113037, + "logps/chosen": -289.3966369628906, + "logps/rejected": -130.11814880371094, + "loss": 0.1215, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.8563445210456848, + "rewards/margins": 3.8016419410705566, + "rewards/rejected": -4.657986640930176, "step": 1240 }, { "epoch": 2.58, - "learning_rate": 3.1321619556913678e-06, - "logits/chosen": -2.289301633834839, - "logits/rejected": -2.259625196456909, - "logps/chosen": -358.3861083984375, - "logps/rejected": -251.97900390625, - "loss": 0.0381, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -6.069993019104004, - "rewards/margins": 10.830602645874023, - "rewards/rejected": -16.90059471130371, + "learning_rate": 3.1321619556913675e-07, + "logits/chosen": -2.6511547565460205, + "logits/rejected": -2.6918509006500244, + "logps/chosen": -304.6783447265625, + "logps/rejected": -128.91067504882812, + "loss": 0.1256, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.6992138028144836, + "rewards/margins": 3.894549608230591, + "rewards/rejected": -4.593764305114746, "step": 1250 }, { "epoch": 2.6, - "learning_rate": 2.979373567608862e-06, - "logits/chosen": -2.3595917224884033, - "logits/rejected": -2.281625270843506, - "logps/chosen": -336.3616638183594, - "logps/rejected": -242.55996704101562, - "loss": 0.0355, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.833378791809082, - "rewards/margins": 10.298261642456055, - "rewards/rejected": -16.131641387939453, + "learning_rate": 2.9793735676088617e-07, + "logits/chosen": -2.7180018424987793, + "logits/rejected": -2.73219633102417, + "logps/chosen": -287.21697998046875, + "logps/rejected": -128.64976501464844, + "loss": 0.1276, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9189088940620422, + "rewards/margins": 3.821709394454956, + "rewards/rejected": -4.7406182289123535, "step": 1260 }, { "epoch": 2.62, - "learning_rate": 2.826585179526356e-06, - "logits/chosen": -2.313523530960083, - "logits/rejected": -2.234619617462158, - "logps/chosen": -321.42913818359375, - "logps/rejected": -222.7084197998047, - "loss": 0.0296, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.917761325836182, - "rewards/margins": 9.67473030090332, - "rewards/rejected": -15.592493057250977, + "learning_rate": 2.8265851795263555e-07, + "logits/chosen": -2.6880316734313965, + "logits/rejected": -2.6963746547698975, + "logps/chosen": -270.5711364746094, + "logps/rejected": -109.63471984863281, + "loss": 0.1227, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.8319600820541382, + "rewards/margins": 3.4531593322753906, + "rewards/rejected": -4.285120010375977, "step": 1270 }, { "epoch": 2.64, - "learning_rate": 2.673796791443851e-06, - "logits/chosen": -2.324735164642334, - "logits/rejected": -2.2597193717956543, - "logps/chosen": -338.4864501953125, - "logps/rejected": -254.4070281982422, - "loss": 0.0285, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.023530006408691, - "rewards/margins": 11.340107917785645, - "rewards/rejected": -17.363636016845703, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -2.718834400177002, + "logits/rejected": -2.730088472366333, + "logps/chosen": -285.2646179199219, + "logps/rejected": -129.10108947753906, + "loss": 0.1177, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.7013461589813232, + "rewards/margins": 4.131698131561279, + "rewards/rejected": -4.833044052124023, "step": 1280 }, { "epoch": 2.66, - "learning_rate": 2.521008403361345e-06, - "logits/chosen": -2.2998228073120117, - "logits/rejected": -2.2498550415039062, - "logps/chosen": -314.4098815917969, - "logps/rejected": -228.45303344726562, - "loss": 0.0318, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -5.6659369468688965, - "rewards/margins": 9.735329627990723, - "rewards/rejected": -15.401266098022461, + "learning_rate": 2.5210084033613445e-07, + "logits/chosen": -2.6872308254241943, + "logits/rejected": -2.71856689453125, + "logps/chosen": -264.56591796875, + "logps/rejected": -117.89599609375, + "loss": 0.115, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.6815407872200012, + "rewards/margins": 3.6640231609344482, + "rewards/rejected": -4.3455634117126465, "step": 1290 }, { "epoch": 2.68, - "learning_rate": 2.368220015278839e-06, - "logits/chosen": -2.33604097366333, - "logits/rejected": -2.2575902938842773, - "logps/chosen": -367.1761474609375, - "logps/rejected": -240.1140594482422, - "loss": 0.034, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.646463394165039, - "rewards/margins": 10.95536994934082, - "rewards/rejected": -16.60183334350586, + "learning_rate": 2.3682200152788388e-07, + "logits/chosen": -2.723666191101074, + "logits/rejected": -2.7219574451446533, + "logps/chosen": -317.8732604980469, + "logps/rejected": -120.55049896240234, + "loss": 0.1072, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.7161759734153748, + "rewards/margins": 3.9293036460876465, + "rewards/rejected": -4.645480155944824, "step": 1300 }, { "epoch": 2.7, - "learning_rate": 2.2154316271963334e-06, - "logits/chosen": -2.279996871948242, - "logits/rejected": -2.220372200012207, - "logps/chosen": -315.04205322265625, - "logps/rejected": -238.9733123779297, - "loss": 0.0338, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -6.325307369232178, - "rewards/margins": 10.017477035522461, - "rewards/rejected": -16.342784881591797, + "learning_rate": 2.215431627196333e-07, + "logits/chosen": -2.6789979934692383, + "logits/rejected": -2.6807262897491455, + "logps/chosen": -260.33062744140625, + "logps/rejected": -120.8504409790039, + "loss": 0.1201, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.854164719581604, + "rewards/margins": 3.6763319969177246, + "rewards/rejected": -4.530496597290039, "step": 1310 }, { "epoch": 2.72, - "learning_rate": 2.0626432391138275e-06, - "logits/chosen": -2.3077378273010254, - "logits/rejected": -2.2532095909118652, - "logps/chosen": -306.40997314453125, - "logps/rejected": -244.31295776367188, - "loss": 0.0358, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.671900272369385, - "rewards/margins": 9.690256118774414, - "rewards/rejected": -16.36215591430664, + "learning_rate": 2.0626432391138274e-07, + "logits/chosen": -2.703617572784424, + "logits/rejected": -2.696929931640625, + "logps/chosen": -250.5372772216797, + "logps/rejected": -126.94450378417969, + "loss": 0.1262, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0846295356750488, + "rewards/margins": 3.540682554244995, + "rewards/rejected": -4.625311374664307, "step": 1320 }, { "epoch": 2.74, - "learning_rate": 1.9098548510313215e-06, - "logits/chosen": -2.3265862464904785, - "logits/rejected": -2.26108980178833, - "logps/chosen": -316.51409912109375, - "logps/rejected": -244.35745239257812, - "loss": 0.0299, + "learning_rate": 1.9098548510313214e-07, + "logits/chosen": -2.7046430110931396, + "logits/rejected": -2.691488027572632, + "logps/chosen": -267.9579162597656, + "logps/rejected": -123.9790267944336, + "loss": 0.1141, "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -5.494316577911377, - "rewards/margins": 11.21363353729248, - "rewards/rejected": -16.707950592041016, + "rewards/chosen": -0.6386908292770386, + "rewards/margins": 4.031415939331055, + "rewards/rejected": -4.670106887817383, "step": 1330 }, { "epoch": 2.76, - "learning_rate": 1.757066462948816e-06, - "logits/chosen": -2.313035011291504, - "logits/rejected": -2.2793192863464355, - "logps/chosen": -319.83758544921875, - "logps/rejected": -245.69088745117188, - "loss": 0.0341, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -5.988718032836914, - "rewards/margins": 10.558996200561523, - "rewards/rejected": -16.547714233398438, + "learning_rate": 1.757066462948816e-07, + "logits/chosen": -2.689009189605713, + "logits/rejected": -2.7033090591430664, + "logps/chosen": -266.8208312988281, + "logps/rejected": -126.87858581542969, + "loss": 0.1157, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6870437860488892, + "rewards/margins": 3.979443073272705, + "rewards/rejected": -4.666487216949463, "step": 1340 }, { "epoch": 2.78, - "learning_rate": 1.6042780748663103e-06, - "logits/chosen": -2.2976434230804443, - "logits/rejected": -2.2259607315063477, - "logps/chosen": -338.54229736328125, - "logps/rejected": -253.5747528076172, - "loss": 0.025, - "rewards/accuracies": 1.0, - "rewards/chosen": -7.011918067932129, - "rewards/margins": 10.417112350463867, - "rewards/rejected": -17.429031372070312, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": -2.70473051071167, + "logits/rejected": -2.6794614791870117, + "logps/chosen": -278.357421875, + "logps/rejected": -125.93829345703125, + "loss": 0.1152, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.9934309124946594, + "rewards/margins": 3.67195200920105, + "rewards/rejected": -4.665382385253906, "step": 1350 }, { "epoch": 2.8, - "learning_rate": 1.4514896867838045e-06, - "logits/chosen": -2.3067193031311035, - "logits/rejected": -2.264526844024658, - "logps/chosen": -356.2128601074219, - "logps/rejected": -264.0433044433594, - "loss": 0.0272, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -6.317415237426758, - "rewards/margins": 11.136034965515137, - "rewards/rejected": -17.45345115661621, + "learning_rate": 1.4514896867838044e-07, + "logits/chosen": -2.6879210472106934, + "logits/rejected": -2.7055411338806152, + "logps/chosen": -301.1689147949219, + "logps/rejected": -137.4368896484375, + "loss": 0.1134, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8130217790603638, + "rewards/margins": 3.979782819747925, + "rewards/rejected": -4.792804718017578, "step": 1360 }, { "epoch": 2.82, - "learning_rate": 1.2987012987012986e-06, - "logits/chosen": -2.2918033599853516, - "logits/rejected": -2.221585512161255, - "logps/chosen": -358.04180908203125, - "logps/rejected": -262.490478515625, - "loss": 0.0298, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.549307346343994, - "rewards/margins": 11.43887996673584, - "rewards/rejected": -17.98818588256836, + "learning_rate": 1.2987012987012984e-07, + "logits/chosen": -2.6889195442199707, + "logits/rejected": -2.677527666091919, + "logps/chosen": -300.85272216796875, + "logps/rejected": -131.1885986328125, + "loss": 0.1181, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.8304030299186707, + "rewards/margins": 4.027594089508057, + "rewards/rejected": -4.857996940612793, "step": 1370 }, { "epoch": 2.85, - "learning_rate": 1.145912910618793e-06, - "logits/chosen": -2.2997758388519287, - "logits/rejected": -2.217319965362549, - "logps/chosen": -385.3619384765625, - "logps/rejected": -264.56927490234375, - "loss": 0.0247, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -6.197786808013916, - "rewards/margins": 11.309389114379883, - "rewards/rejected": -17.50717544555664, + "learning_rate": 1.1459129106187928e-07, + "logits/chosen": -2.675020694732666, + "logits/rejected": -2.6782567501068115, + "logps/chosen": -329.69696044921875, + "logps/rejected": -135.40345764160156, + "loss": 0.1182, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.631290078163147, + "rewards/margins": 3.959303379058838, + "rewards/rejected": -4.5905938148498535, "step": 1380 }, { "epoch": 2.87, - "learning_rate": 9.931245225362874e-07, - "logits/chosen": -2.2803447246551514, - "logits/rejected": -2.218745708465576, - "logps/chosen": -334.13092041015625, - "logps/rejected": -247.9724578857422, - "loss": 0.0343, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.724381446838379, - "rewards/margins": 10.408391952514648, - "rewards/rejected": -17.13277244567871, + "learning_rate": 9.931245225362872e-08, + "logits/chosen": -2.6736457347869873, + "logits/rejected": -2.6810030937194824, + "logps/chosen": -274.8864440917969, + "logps/rejected": -124.03629302978516, + "loss": 0.1258, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7999323606491089, + "rewards/margins": 3.9392266273498535, + "rewards/rejected": -4.73915958404541, "step": 1390 }, { "epoch": 2.89, - "learning_rate": 8.403361344537816e-07, - "logits/chosen": -2.2882351875305176, - "logits/rejected": -2.1976969242095947, - "logps/chosen": -343.0736083984375, - "logps/rejected": -260.0362854003906, - "loss": 0.0316, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.6999921798706055, - "rewards/margins": 10.926143646240234, - "rewards/rejected": -17.626136779785156, + "learning_rate": 8.403361344537815e-08, + "logits/chosen": -2.699571132659912, + "logits/rejected": -2.701141834259033, + "logps/chosen": -283.51751708984375, + "logps/rejected": -130.84814453125, + "loss": 0.1212, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7443857192993164, + "rewards/margins": 3.96293568611145, + "rewards/rejected": -4.707321643829346, "step": 1400 }, { "epoch": 2.91, - "learning_rate": 6.875477463712758e-07, - "logits/chosen": -2.273560047149658, - "logits/rejected": -2.230538845062256, - "logps/chosen": -368.9632873535156, - "logps/rejected": -273.01080322265625, - "loss": 0.0258, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -6.649250030517578, - "rewards/margins": 11.343961715698242, - "rewards/rejected": -17.993213653564453, + "learning_rate": 6.875477463712758e-08, + "logits/chosen": -2.690229892730713, + "logits/rejected": -2.7361502647399902, + "logps/chosen": -310.0445556640625, + "logps/rejected": -141.15748596191406, + "loss": 0.1115, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.7573784589767456, + "rewards/margins": 4.050500869750977, + "rewards/rejected": -4.807879447937012, "step": 1410 }, { "epoch": 2.93, - "learning_rate": 5.347593582887701e-07, - "logits/chosen": -2.295637845993042, - "logits/rejected": -2.2116260528564453, - "logps/chosen": -355.9484558105469, - "logps/rejected": -262.65966796875, - "loss": 0.0361, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -6.842223167419434, - "rewards/margins": 11.27800178527832, - "rewards/rejected": -18.12022590637207, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -2.7038116455078125, + "logits/rejected": -2.715642213821411, + "logps/chosen": -294.73486328125, + "logps/rejected": -127.1126480102539, + "loss": 0.1176, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.720859944820404, + "rewards/margins": 3.844661235809326, + "rewards/rejected": -4.565520763397217, "step": 1420 }, { "epoch": 2.95, - "learning_rate": 3.819709702062643e-07, - "logits/chosen": -2.2705752849578857, - "logits/rejected": -2.191991090774536, - "logps/chosen": -316.24200439453125, - "logps/rejected": -252.88345336914062, - "loss": 0.032, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -6.750401973724365, - "rewards/margins": 11.157169342041016, - "rewards/rejected": -17.907573699951172, + "learning_rate": 3.8197097020626426e-08, + "logits/chosen": -2.695003032684326, + "logits/rejected": -2.7012951374053955, + "logps/chosen": -257.13287353515625, + "logps/rejected": -120.28269958496094, + "loss": 0.1259, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.8394849896430969, + "rewards/margins": 3.808011531829834, + "rewards/rejected": -4.647497177124023, "step": 1430 }, { "epoch": 2.97, - "learning_rate": 2.2918258212375862e-07, - "logits/chosen": -2.272953748703003, - "logits/rejected": -2.1898062229156494, - "logps/chosen": -342.50482177734375, - "logps/rejected": -259.5894775390625, - "loss": 0.025, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -7.047539710998535, - "rewards/margins": 10.942033767700195, - "rewards/rejected": -17.989574432373047, + "learning_rate": 2.291825821237586e-08, + "logits/chosen": -2.7045774459838867, + "logits/rejected": -2.679112672805786, + "logps/chosen": -280.9797058105469, + "logps/rejected": -127.39398193359375, + "loss": 0.1096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8950273394584656, + "rewards/margins": 3.8749961853027344, + "rewards/rejected": -4.770023345947266, "step": 1440 }, { "epoch": 2.99, - "learning_rate": 7.639419404125288e-08, - "logits/chosen": -2.346297025680542, - "logits/rejected": -2.2220358848571777, - "logps/chosen": -386.37823486328125, - "logps/rejected": -261.1239929199219, - "loss": 0.0238, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -6.26646614074707, - "rewards/margins": 11.493976593017578, - "rewards/rejected": -17.76044273376465, + "learning_rate": 7.639419404125286e-09, + "logits/chosen": -2.7298693656921387, + "logits/rejected": -2.673051357269287, + "logps/chosen": -329.364990234375, + "logps/rejected": -131.12486267089844, + "loss": 0.109, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.5651420950889587, + "rewards/margins": 4.195387840270996, + "rewards/rejected": -4.760529518127441, "step": 1450 }, { "epoch": 3.0, - "eval_logits/chosen": -2.3036882877349854, - "eval_logits/rejected": -2.218935012817383, - "eval_logps/chosen": -341.342041015625, - "eval_logps/rejected": -273.77117919921875, - "eval_loss": 1.0562912225723267, - "eval_rewards/accuracies": 0.72265625, - "eval_rewards/chosen": -6.750503063201904, - "eval_rewards/margins": 3.3230087757110596, - "eval_rewards/rejected": -10.07351303100586, - "eval_runtime": 258.6775, - "eval_samples_per_second": 7.732, + "eval_logits/chosen": -2.715555429458618, + "eval_logits/rejected": -2.6699323654174805, + "eval_logps/chosen": -282.458740234375, + "eval_logps/rejected": -204.27066040039062, + "eval_loss": 0.3553008437156677, + "eval_rewards/accuracies": 0.828125, + "eval_rewards/chosen": -0.8621728420257568, + "eval_rewards/margins": 2.261284112930298, + "eval_rewards/rejected": -3.123457193374634, + "eval_runtime": 259.2547, + "eval_samples_per_second": 7.714, "eval_steps_per_second": 0.062, "step": 1455 }, @@ -2104,9 +2104,9 @@ "epoch": 3.0, "step": 1455, "total_flos": 0.0, - "train_loss": 0.07034083745375122, - "train_runtime": 46831.0549, - "train_samples_per_second": 3.976, + "train_loss": 0.21351368668972423, + "train_runtime": 46913.4477, + "train_samples_per_second": 3.969, "train_steps_per_second": 0.031 } ],