diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,10 +11,10 @@ { "epoch": 0.0, "learning_rate": 1.36986301369863e-07, - "logits/chosen": -2.6635093688964844, - "logits/rejected": -2.7324111461639404, - "logps/chosen": -135.12002563476562, - "logps/rejected": -103.28743743896484, + "logits/chosen": -2.8295512199401855, + "logits/rejected": -2.9639337062835693, + "logps/chosen": -242.64569091796875, + "logps/rejected": -75.87144470214844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,2088 +25,2088 @@ { "epoch": 0.02, "learning_rate": 1.3698630136986302e-06, - "logits/chosen": -2.783435583114624, - "logits/rejected": -2.754120111465454, - "logps/chosen": -311.785400390625, - "logps/rejected": -273.2391357421875, - "loss": 0.6925, - "rewards/accuracies": 0.4513888955116272, - "rewards/chosen": -0.005032465327531099, - "rewards/margins": -0.004827913362532854, - "rewards/rejected": -0.00020455113553907722, + "logits/chosen": -2.777900218963623, + "logits/rejected": -2.813075065612793, + "logps/chosen": -292.14288330078125, + "logps/rejected": -78.98306274414062, + "loss": 0.69, + "rewards/accuracies": 0.5069444179534912, + "rewards/chosen": 0.0018207718385383487, + "rewards/margins": 0.010101978667080402, + "rewards/rejected": -0.008281207643449306, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.7397260273972604e-06, - "logits/chosen": -2.8342247009277344, - "logits/rejected": -2.8470585346221924, - "logps/chosen": -283.9891662597656, - "logps/rejected": -250.61019897460938, - "loss": 0.6861, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.006812130566686392, - "rewards/margins": 0.014938007108867168, - "rewards/rejected": -0.008125877007842064, + "logits/chosen": -2.7663190364837646, + "logits/rejected": -2.758164167404175, + "logps/chosen": -299.78204345703125, + "logps/rejected": -81.72047424316406, + "loss": 0.6536, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.027563810348510742, + "rewards/margins": 0.08724220097064972, + "rewards/rejected": -0.05967838317155838, "step": 20 }, { "epoch": 0.06, "learning_rate": 4.109589041095891e-06, - "logits/chosen": -2.816066026687622, - "logits/rejected": -2.8595542907714844, - "logps/chosen": -282.41351318359375, - "logps/rejected": -200.7602081298828, - "loss": 0.6719, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.009923343546688557, - "rewards/margins": 0.052515141665935516, - "rewards/rejected": -0.04259180277585983, + "logits/chosen": -2.770326852798462, + "logits/rejected": -2.78336763381958, + "logps/chosen": -289.27728271484375, + "logps/rejected": -81.4061508178711, + "loss": 0.5577, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.07835596799850464, + "rewards/margins": 0.328571081161499, + "rewards/rejected": -0.2502151131629944, "step": 30 }, { "epoch": 0.08, "learning_rate": 5.479452054794521e-06, - "logits/chosen": -2.8367600440979004, - "logits/rejected": -2.777651309967041, - "logps/chosen": -292.46136474609375, - "logps/rejected": -239.9720001220703, - "loss": 0.6352, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.041534725576639175, - "rewards/margins": 0.1540035903453827, - "rewards/rejected": -0.11246886104345322, + "logits/chosen": -2.7817535400390625, + "logits/rejected": -2.8053927421569824, + "logps/chosen": -244.7366180419922, + "logps/rejected": -90.08441162109375, + "loss": 0.4426, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.07394310086965561, + "rewards/margins": 0.6627634763717651, + "rewards/rejected": -0.5888203382492065, "step": 40 }, { "epoch": 0.1, "learning_rate": 6.849315068493151e-06, - "logits/chosen": -2.789390802383423, - "logits/rejected": -2.7937839031219482, - "logps/chosen": -296.8035583496094, - "logps/rejected": -220.61669921875, - "loss": 0.5922, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": 0.07272736728191376, - "rewards/margins": 0.3851665258407593, - "rewards/rejected": -0.3124391436576843, + "logits/chosen": -2.7496659755706787, + "logits/rejected": -2.807068109512329, + "logps/chosen": -251.8906707763672, + "logps/rejected": -90.27186584472656, + "loss": 0.3608, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.04582538083195686, + "rewards/margins": 1.008236289024353, + "rewards/rejected": -0.9624108076095581, "step": 50 }, { "epoch": 0.12, "learning_rate": 8.219178082191782e-06, - "logits/chosen": -2.8027291297912598, - "logits/rejected": -2.8161094188690186, - "logps/chosen": -298.8342590332031, - "logps/rejected": -248.74533081054688, - "loss": 0.5742, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.023758064955472946, - "rewards/margins": 0.40244102478027344, - "rewards/rejected": -0.4261991083621979, + "logits/chosen": -2.813101291656494, + "logits/rejected": -2.7498257160186768, + "logps/chosen": -280.83966064453125, + "logps/rejected": -92.8672103881836, + "loss": 0.3055, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.10900793224573135, + "rewards/margins": 1.581032633781433, + "rewards/rejected": -1.4720247983932495, "step": 60 }, { "epoch": 0.14, "learning_rate": 9.589041095890411e-06, - "logits/chosen": -2.8271241188049316, - "logits/rejected": -2.833664894104004, - "logps/chosen": -282.1733703613281, - "logps/rejected": -261.71844482421875, - "loss": 0.5511, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.05103034898638725, - "rewards/margins": 0.4532381594181061, - "rewards/rejected": -0.5042685270309448, + "logits/chosen": -2.7718911170959473, + "logits/rejected": -2.7741990089416504, + "logps/chosen": -264.4299621582031, + "logps/rejected": -92.99862670898438, + "loss": 0.2573, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.06832887977361679, + "rewards/margins": 1.9427387714385986, + "rewards/rejected": -1.8744099140167236, "step": 70 }, { "epoch": 0.16, "learning_rate": 1.0958904109589042e-05, - "logits/chosen": -2.8094613552093506, - "logits/rejected": -2.8167314529418945, - "logps/chosen": -302.01190185546875, - "logps/rejected": -241.74282836914062, - "loss": 0.538, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.05076650530099869, - "rewards/margins": 0.729707658290863, - "rewards/rejected": -0.7804741263389587, + "logits/chosen": -2.751570701599121, + "logits/rejected": -2.764747142791748, + "logps/chosen": -257.4574279785156, + "logps/rejected": -94.42860412597656, + "loss": 0.2239, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.14372114837169647, + "rewards/margins": 2.0751771926879883, + "rewards/rejected": -2.218898296356201, "step": 80 }, { "epoch": 0.19, "learning_rate": 1.2328767123287673e-05, - "logits/chosen": -2.7917096614837646, - "logits/rejected": -2.8236160278320312, - "logps/chosen": -255.85238647460938, - "logps/rejected": -210.2568359375, - "loss": 0.5137, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.12663120031356812, - "rewards/margins": 0.7104201316833496, - "rewards/rejected": -0.8370513916015625, + "logits/chosen": -2.7785003185272217, + "logits/rejected": -2.7658634185791016, + "logps/chosen": -271.41583251953125, + "logps/rejected": -113.24674987792969, + "loss": 0.1911, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.36074167490005493, + "rewards/margins": 2.577946186065674, + "rewards/rejected": -2.938687801361084, "step": 90 }, { "epoch": 0.21, "learning_rate": 1.3698630136986302e-05, - "logits/chosen": -2.8499982357025146, - "logits/rejected": -2.8203647136688232, - "logps/chosen": -261.9218444824219, - "logps/rejected": -243.7086639404297, - "loss": 0.5222, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.13231949508190155, - "rewards/margins": 0.815433144569397, - "rewards/rejected": -0.9477526545524597, + "logits/chosen": -2.7907283306121826, + "logits/rejected": -2.7734055519104004, + "logps/chosen": -291.3459167480469, + "logps/rejected": -113.92546081542969, + "loss": 0.1648, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3865249752998352, + "rewards/margins": 2.9936206340789795, + "rewards/rejected": -3.38014554977417, "step": 100 }, { "epoch": 0.23, "learning_rate": 1.5068493150684933e-05, - "logits/chosen": -2.8208096027374268, - "logits/rejected": -2.8195183277130127, - "logps/chosen": -278.68377685546875, - "logps/rejected": -239.5231170654297, - "loss": 0.4973, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.1940581500530243, - "rewards/margins": 0.9298496246337891, - "rewards/rejected": -1.1239076852798462, + "logits/chosen": -2.6879987716674805, + "logits/rejected": -2.6891028881073, + "logps/chosen": -285.2283935546875, + "logps/rejected": -117.8741683959961, + "loss": 0.1551, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.7047951817512512, + "rewards/margins": 2.912633180618286, + "rewards/rejected": -3.6174283027648926, "step": 110 }, { "epoch": 0.25, "learning_rate": 1.6438356164383563e-05, - "logits/chosen": -2.8535244464874268, - "logits/rejected": -2.8457090854644775, - "logps/chosen": -299.8865661621094, - "logps/rejected": -211.28964233398438, - "loss": 0.503, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.1717432737350464, - "rewards/margins": 0.9522634744644165, - "rewards/rejected": -1.124006748199463, + "logits/chosen": -2.666584014892578, + "logits/rejected": -2.7262470722198486, + "logps/chosen": -284.8210754394531, + "logps/rejected": -121.16352844238281, + "loss": 0.1394, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.631528913974762, + "rewards/margins": 3.5603854656219482, + "rewards/rejected": -4.1919145584106445, "step": 120 }, { "epoch": 0.27, "learning_rate": 1.7808219178082194e-05, - "logits/chosen": -2.806631565093994, - "logits/rejected": -2.8321399688720703, - "logps/chosen": -273.78619384765625, - "logps/rejected": -232.2666015625, - "loss": 0.5137, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.3173758387565613, - "rewards/margins": 0.8668048977851868, - "rewards/rejected": -1.1841806173324585, + "logits/chosen": -2.6699535846710205, + "logits/rejected": -2.6761491298675537, + "logps/chosen": -292.54754638671875, + "logps/rejected": -131.53846740722656, + "loss": 0.1216, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.6828351020812988, + "rewards/margins": 3.826262950897217, + "rewards/rejected": -4.509098052978516, "step": 130 }, { "epoch": 0.29, "learning_rate": 1.9178082191780822e-05, - "logits/chosen": -2.8575327396392822, - "logits/rejected": -2.8858590126037598, - "logps/chosen": -291.01837158203125, - "logps/rejected": -233.05001831054688, - "loss": 0.528, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.21833661198616028, - "rewards/margins": 0.7500525712966919, - "rewards/rejected": -0.9683893322944641, + "logits/chosen": -2.7449731826782227, + "logits/rejected": -2.744097948074341, + "logps/chosen": -302.683837890625, + "logps/rejected": -129.56527709960938, + "loss": 0.116, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8527084589004517, + "rewards/margins": 3.741666078567505, + "rewards/rejected": -4.594374179840088, "step": 140 }, { "epoch": 0.31, "learning_rate": 1.9938884644767e-05, - "logits/chosen": -2.8324332237243652, - "logits/rejected": -2.8657970428466797, - "logps/chosen": -263.90875244140625, - "logps/rejected": -248.7552490234375, - "loss": 0.5066, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.226444810628891, - "rewards/margins": 0.6770893335342407, - "rewards/rejected": -0.9035340547561646, + "logits/chosen": -2.6660311222076416, + "logits/rejected": -2.668618679046631, + "logps/chosen": -300.77288818359375, + "logps/rejected": -126.13216400146484, + "loss": 0.1032, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9231691360473633, + "rewards/margins": 3.9592361450195312, + "rewards/rejected": -4.882404804229736, "step": 150 }, { "epoch": 0.33, "learning_rate": 1.9786096256684494e-05, - "logits/chosen": -2.85012149810791, - "logits/rejected": -2.920163869857788, - "logps/chosen": -313.2906494140625, - "logps/rejected": -240.2455596923828, - "loss": 0.514, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.15915456414222717, - "rewards/margins": 0.9070942997932434, - "rewards/rejected": -1.066248893737793, + "logits/chosen": -2.6760621070861816, + "logits/rejected": -2.743537187576294, + "logps/chosen": -284.9643249511719, + "logps/rejected": -130.59217834472656, + "loss": 0.0995, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.2370917797088623, + "rewards/margins": 4.145981788635254, + "rewards/rejected": -5.383072853088379, "step": 160 }, { "epoch": 0.35, "learning_rate": 1.9633307868601987e-05, - "logits/chosen": -2.8070321083068848, - "logits/rejected": -2.8099472522735596, - "logps/chosen": -302.35736083984375, - "logps/rejected": -241.8223419189453, - "loss": 0.5346, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.07760269939899445, - "rewards/margins": 0.7661724090576172, - "rewards/rejected": -0.8437750935554504, + "logits/chosen": -2.602896213531494, + "logits/rejected": -2.5765960216522217, + "logps/chosen": -302.64300537109375, + "logps/rejected": -141.293212890625, + "loss": 0.0875, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.2406305074691772, + "rewards/margins": 4.966603755950928, + "rewards/rejected": -6.207234859466553, "step": 170 }, { "epoch": 0.37, "learning_rate": 1.9480519480519483e-05, - "logits/chosen": -2.7994544506073, - "logits/rejected": -2.8084394931793213, - "logps/chosen": -302.2978820800781, - "logps/rejected": -248.54586791992188, - "loss": 0.4909, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.09264491498470306, - "rewards/margins": 0.8108696937561035, - "rewards/rejected": -0.9035146832466125, + "logits/chosen": -2.561760187149048, + "logits/rejected": -2.5683257579803467, + "logps/chosen": -293.40093994140625, + "logps/rejected": -149.14077758789062, + "loss": 0.0846, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.0210623741149902, + "rewards/margins": 4.921706676483154, + "rewards/rejected": -6.942769527435303, "step": 180 }, { "epoch": 0.39, "learning_rate": 1.9327731092436976e-05, - "logits/chosen": -2.867279052734375, - "logits/rejected": -2.8662352561950684, - "logps/chosen": -294.74139404296875, - "logps/rejected": -247.1249542236328, - "loss": 0.5157, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.023864692077040672, - "rewards/margins": 1.0177843570709229, - "rewards/rejected": -1.0416491031646729, + "logits/chosen": -2.5644702911376953, + "logits/rejected": -2.569974422454834, + "logps/chosen": -295.0621032714844, + "logps/rejected": -151.10609436035156, + "loss": 0.0794, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.016458034515381, + "rewards/margins": 5.062263488769531, + "rewards/rejected": -7.078721046447754, "step": 190 }, { "epoch": 0.41, "learning_rate": 1.9174942704354472e-05, - "logits/chosen": -2.7849340438842773, - "logits/rejected": -2.8029227256774902, - "logps/chosen": -274.12408447265625, - "logps/rejected": -234.2401885986328, - "loss": 0.4979, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.18910327553749084, - "rewards/margins": 0.9559990763664246, - "rewards/rejected": -1.1451025009155273, + "logits/chosen": -2.6058998107910156, + "logits/rejected": -2.5582222938537598, + "logps/chosen": -290.7182922363281, + "logps/rejected": -156.05072021484375, + "loss": 0.0759, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.2009756565093994, + "rewards/margins": 5.653419017791748, + "rewards/rejected": -7.854394435882568, "step": 200 }, { "epoch": 0.43, "learning_rate": 1.9022154316271965e-05, - "logits/chosen": -2.886434555053711, - "logits/rejected": -2.8876872062683105, - "logps/chosen": -285.2301940917969, - "logps/rejected": -270.3388977050781, - "loss": 0.5246, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.0919012501835823, - "rewards/margins": 1.0152003765106201, - "rewards/rejected": -1.1071016788482666, + "logits/chosen": -2.540133237838745, + "logits/rejected": -2.4774231910705566, + "logps/chosen": -295.8403015136719, + "logps/rejected": -166.9518280029297, + "loss": 0.0739, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3658392429351807, + "rewards/margins": 5.26444149017334, + "rewards/rejected": -7.630280494689941, "step": 210 }, { "epoch": 0.45, "learning_rate": 1.8869365928189458e-05, - "logits/chosen": -2.8153045177459717, - "logits/rejected": -2.8722925186157227, - "logps/chosen": -295.7647399902344, - "logps/rejected": -241.56723022460938, - "loss": 0.4931, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.16732807457447052, - "rewards/margins": 0.9812033772468567, - "rewards/rejected": -1.1485313177108765, + "logits/chosen": -2.5680794715881348, + "logits/rejected": -2.503758430480957, + "logps/chosen": -281.4465637207031, + "logps/rejected": -157.9706268310547, + "loss": 0.0679, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.859816074371338, + "rewards/margins": 5.681013584136963, + "rewards/rejected": -8.5408296585083, "step": 220 }, { "epoch": 0.47, "learning_rate": 1.8716577540106954e-05, - "logits/chosen": -2.890263080596924, - "logits/rejected": -2.870927333831787, - "logps/chosen": -300.0990905761719, - "logps/rejected": -243.5838165283203, - "loss": 0.5021, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.1739131659269333, - "rewards/margins": 0.8733876943588257, - "rewards/rejected": -1.047300934791565, + "logits/chosen": -2.516791820526123, + "logits/rejected": -2.4941375255584717, + "logps/chosen": -320.51336669921875, + "logps/rejected": -158.91770935058594, + "loss": 0.0662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3171486854553223, + "rewards/margins": 6.3180742263793945, + "rewards/rejected": -8.635221481323242, "step": 230 }, { "epoch": 0.49, "learning_rate": 1.8563789152024447e-05, - "logits/chosen": -2.792865753173828, - "logits/rejected": -2.8081324100494385, - "logps/chosen": -249.1541748046875, - "logps/rejected": -221.9134979248047, - "loss": 0.4763, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.2561655640602112, - "rewards/margins": 1.2132002115249634, - "rewards/rejected": -1.4693658351898193, + "logits/chosen": -2.4835712909698486, + "logits/rejected": -2.477268934249878, + "logps/chosen": -311.7545471191406, + "logps/rejected": -168.81459045410156, + "loss": 0.0708, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.8363029956817627, + "rewards/margins": 6.439309597015381, + "rewards/rejected": -9.275611877441406, "step": 240 }, { "epoch": 0.52, "learning_rate": 1.8411000763941943e-05, - "logits/chosen": -2.780810594558716, - "logits/rejected": -2.8007028102874756, - "logps/chosen": -289.1332702636719, - "logps/rejected": -259.22491455078125, - "loss": 0.488, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.20490197837352753, - "rewards/margins": 0.9276278614997864, - "rewards/rejected": -1.132529854774475, + "logits/chosen": -2.5130648612976074, + "logits/rejected": -2.510387897491455, + "logps/chosen": -301.73126220703125, + "logps/rejected": -176.51712036132812, + "loss": 0.0716, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.6253724098205566, + "rewards/margins": 5.836529731750488, + "rewards/rejected": -9.461902618408203, "step": 250 }, { "epoch": 0.54, "learning_rate": 1.8258212375859436e-05, - "logits/chosen": -2.7935876846313477, - "logits/rejected": -2.795382022857666, - "logps/chosen": -300.5179748535156, - "logps/rejected": -260.3360900878906, - "loss": 0.4823, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.22746071219444275, - "rewards/margins": 1.0280539989471436, - "rewards/rejected": -1.2555148601531982, + "logits/chosen": -2.4818222522735596, + "logits/rejected": -2.4154343605041504, + "logps/chosen": -312.3952331542969, + "logps/rejected": -160.39041137695312, + "loss": 0.0714, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.6363685131073, + "rewards/margins": 6.308477401733398, + "rewards/rejected": -8.944845199584961, "step": 260 }, { "epoch": 0.56, "learning_rate": 1.8105423987776932e-05, - "logits/chosen": -2.8180222511291504, - "logits/rejected": -2.7675962448120117, - "logps/chosen": -264.4036560058594, - "logps/rejected": -242.5054931640625, - "loss": 0.4994, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.12344682216644287, - "rewards/margins": 1.0876801013946533, - "rewards/rejected": -1.2111269235610962, + "logits/chosen": -2.4748425483703613, + "logits/rejected": -2.4548239707946777, + "logps/chosen": -305.8868103027344, + "logps/rejected": -178.6342010498047, + "loss": 0.0662, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.9469079971313477, + "rewards/margins": 6.27431583404541, + "rewards/rejected": -9.221223831176758, "step": 270 }, { "epoch": 0.58, "learning_rate": 1.7952635599694425e-05, - "logits/chosen": -2.859412908554077, - "logits/rejected": -2.7997498512268066, - "logps/chosen": -274.4200744628906, - "logps/rejected": -253.32235717773438, - "loss": 0.4921, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.08681371808052063, - "rewards/margins": 0.9918006658554077, - "rewards/rejected": -1.078614354133606, + "logits/chosen": -2.5008432865142822, + "logits/rejected": -2.489522933959961, + "logps/chosen": -292.95355224609375, + "logps/rejected": -161.0408935546875, + "loss": 0.0728, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.6459877490997314, + "rewards/margins": 5.44729471206665, + "rewards/rejected": -9.093282699584961, "step": 280 }, { "epoch": 0.6, "learning_rate": 1.7799847211611917e-05, - "logits/chosen": -2.8540239334106445, - "logits/rejected": -2.8323752880096436, - "logps/chosen": -297.4104309082031, - "logps/rejected": -258.0458984375, - "loss": 0.4741, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.1035405844449997, - "rewards/margins": 1.009918451309204, - "rewards/rejected": -1.1134589910507202, + "logits/chosen": -2.46510648727417, + "logits/rejected": -2.4538471698760986, + "logps/chosen": -309.40313720703125, + "logps/rejected": -177.50119018554688, + "loss": 0.0694, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.042060613632202, + "rewards/margins": 6.3933000564575195, + "rewards/rejected": -9.435359954833984, "step": 290 }, { "epoch": 0.62, "learning_rate": 1.7647058823529414e-05, - "logits/chosen": -2.835106372833252, - "logits/rejected": -2.8273653984069824, - "logps/chosen": -251.33984375, - "logps/rejected": -243.59890747070312, - "loss": 0.4957, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.1399732530117035, - "rewards/margins": 0.9917305111885071, - "rewards/rejected": -1.1317037343978882, + "logits/chosen": -2.3893284797668457, + "logits/rejected": -2.334172487258911, + "logps/chosen": -310.794189453125, + "logps/rejected": -172.20497131347656, + "loss": 0.0712, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.50679087638855, + "rewards/margins": 5.94513463973999, + "rewards/rejected": -9.451925277709961, "step": 300 }, { "epoch": 0.64, "learning_rate": 1.7494270435446906e-05, - "logits/chosen": -2.8415441513061523, - "logits/rejected": -2.7994866371154785, - "logps/chosen": -260.0638732910156, - "logps/rejected": -227.3611602783203, - "loss": 0.5106, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.05403571575880051, - "rewards/margins": 1.2989342212677002, - "rewards/rejected": -1.3529701232910156, + "logits/chosen": -2.454113721847534, + "logits/rejected": -2.4209561347961426, + "logps/chosen": -323.4365234375, + "logps/rejected": -188.5634765625, + "loss": 0.0634, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.002615451812744, + "rewards/margins": 6.827607154846191, + "rewards/rejected": -10.830221176147461, "step": 310 }, { "epoch": 0.66, "learning_rate": 1.7341482047364403e-05, - "logits/chosen": -2.870739698410034, - "logits/rejected": -2.808722496032715, - "logps/chosen": -252.57870483398438, - "logps/rejected": -238.3760528564453, - "loss": 0.4714, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.3325572907924652, - "rewards/margins": 1.0449466705322266, - "rewards/rejected": -1.3775039911270142, + "logits/chosen": -2.469594717025757, + "logits/rejected": -2.4255645275115967, + "logps/chosen": -308.19000244140625, + "logps/rejected": -187.37420654296875, + "loss": 0.059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.496091365814209, + "rewards/margins": 6.612002372741699, + "rewards/rejected": -10.108095169067383, "step": 320 }, { "epoch": 0.68, "learning_rate": 1.7188693659281895e-05, - "logits/chosen": -2.870849132537842, - "logits/rejected": -2.8693222999572754, - "logps/chosen": -296.0721435546875, - "logps/rejected": -252.4921875, - "loss": 0.4776, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.14332377910614014, - "rewards/margins": 1.1690990924835205, - "rewards/rejected": -1.312422752380371, + "logits/chosen": -2.459336042404175, + "logits/rejected": -2.4206173419952393, + "logps/chosen": -285.8677062988281, + "logps/rejected": -170.1533660888672, + "loss": 0.0648, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.3919098377227783, + "rewards/margins": 6.427476406097412, + "rewards/rejected": -9.819387435913086, "step": 330 }, { "epoch": 0.7, "learning_rate": 1.703590527119939e-05, - "logits/chosen": -2.82586669921875, - "logits/rejected": -2.780189037322998, - "logps/chosen": -315.26727294921875, - "logps/rejected": -268.7083740234375, - "loss": 0.5107, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.26427751779556274, - "rewards/margins": 0.9479537010192871, - "rewards/rejected": -1.2122312784194946, + "logits/chosen": -2.5112051963806152, + "logits/rejected": -2.4785618782043457, + "logps/chosen": -304.47003173828125, + "logps/rejected": -176.83731079101562, + "loss": 0.0632, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.8963208198547363, + "rewards/margins": 6.368965148925781, + "rewards/rejected": -9.265286445617676, "step": 340 }, { "epoch": 0.72, "learning_rate": 1.6883116883116884e-05, - "logits/chosen": -2.8107895851135254, - "logits/rejected": -2.8060848712921143, - "logps/chosen": -242.031494140625, - "logps/rejected": -227.6324920654297, - "loss": 0.5167, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.1813223510980606, - "rewards/margins": 0.9393804669380188, - "rewards/rejected": -1.1207029819488525, + "logits/chosen": -2.5041980743408203, + "logits/rejected": -2.46270751953125, + "logps/chosen": -309.81536865234375, + "logps/rejected": -182.5047149658203, + "loss": 0.0637, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1548614501953125, + "rewards/margins": 6.430181980133057, + "rewards/rejected": -9.585042953491211, "step": 350 }, { "epoch": 0.74, "learning_rate": 1.6730328495034377e-05, - "logits/chosen": -2.8176794052124023, - "logits/rejected": -2.76552152633667, - "logps/chosen": -263.55047607421875, - "logps/rejected": -240.71484375, - "loss": 0.5146, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.1424541026353836, - "rewards/margins": 0.9593822360038757, - "rewards/rejected": -1.1018364429473877, + "logits/chosen": -2.4566521644592285, + "logits/rejected": -2.449441432952881, + "logps/chosen": -324.7444152832031, + "logps/rejected": -174.9409637451172, + "loss": 0.0623, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.9833693504333496, + "rewards/margins": 6.45650577545166, + "rewards/rejected": -9.439874649047852, "step": 360 }, { "epoch": 0.76, "learning_rate": 1.6577540106951873e-05, - "logits/chosen": -2.8674864768981934, - "logits/rejected": -2.8338735103607178, - "logps/chosen": -288.48687744140625, - "logps/rejected": -261.1170349121094, - "loss": 0.4945, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.24135860800743103, - "rewards/margins": 0.9816256761550903, - "rewards/rejected": -1.2229843139648438, + "logits/chosen": -2.4364638328552246, + "logits/rejected": -2.3899149894714355, + "logps/chosen": -309.7897033691406, + "logps/rejected": -176.37049865722656, + "loss": 0.0646, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.2578327655792236, + "rewards/margins": 7.072564125061035, + "rewards/rejected": -10.33039665222168, "step": 370 }, { "epoch": 0.78, "learning_rate": 1.6424751718869366e-05, - "logits/chosen": -2.8083739280700684, - "logits/rejected": -2.7698397636413574, - "logps/chosen": -243.06356811523438, - "logps/rejected": -258.90496826171875, - "loss": 0.4858, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.4036439061164856, - "rewards/margins": 1.0016567707061768, - "rewards/rejected": -1.4053006172180176, + "logits/chosen": -2.4285435676574707, + "logits/rejected": -2.3717868328094482, + "logps/chosen": -307.73907470703125, + "logps/rejected": -190.9008331298828, + "loss": 0.0538, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.86690092086792, + "rewards/margins": 7.263972282409668, + "rewards/rejected": -11.130870819091797, "step": 380 }, { "epoch": 0.8, "learning_rate": 1.6271963330786862e-05, - "logits/chosen": -2.8200416564941406, - "logits/rejected": -2.809105634689331, - "logps/chosen": -317.0741882324219, - "logps/rejected": -248.5664825439453, - "loss": 0.4763, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.23517903685569763, - "rewards/margins": 1.1138803958892822, - "rewards/rejected": -1.3490597009658813, + "logits/chosen": -2.475437879562378, + "logits/rejected": -2.4385433197021484, + "logps/chosen": -318.3887634277344, + "logps/rejected": -187.13783264160156, + "loss": 0.0524, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.500490665435791, + "rewards/margins": 7.331036567687988, + "rewards/rejected": -10.831527709960938, "step": 390 }, { "epoch": 0.82, "learning_rate": 1.6119174942704355e-05, - "logits/chosen": -2.860297679901123, - "logits/rejected": -2.8406786918640137, - "logps/chosen": -281.96527099609375, - "logps/rejected": -261.6462707519531, - "loss": 0.4637, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.20879487693309784, - "rewards/margins": 1.0736795663833618, - "rewards/rejected": -1.2824745178222656, + "logits/chosen": -2.413388252258301, + "logits/rejected": -2.384582281112671, + "logps/chosen": -284.5127868652344, + "logps/rejected": -182.04469299316406, + "loss": 0.0547, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.9967052936553955, + "rewards/margins": 6.6313934326171875, + "rewards/rejected": -10.628097534179688, "step": 400 }, { "epoch": 0.85, "learning_rate": 1.596638655462185e-05, - "logits/chosen": -2.8639817237854004, - "logits/rejected": -2.8506665229797363, - "logps/chosen": -295.0474853515625, - "logps/rejected": -243.6042938232422, - "loss": 0.4449, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.20277197659015656, - "rewards/margins": 1.3167582750320435, - "rewards/rejected": -1.519530177116394, + "logits/chosen": -2.4949584007263184, + "logits/rejected": -2.4177098274230957, + "logps/chosen": -306.1350402832031, + "logps/rejected": -188.74630737304688, + "loss": 0.0559, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9294257164001465, + "rewards/margins": 7.267237663269043, + "rewards/rejected": -11.196664810180664, "step": 410 }, { "epoch": 0.87, "learning_rate": 1.5813598166539344e-05, - "logits/chosen": -2.83903431892395, - "logits/rejected": -2.8222603797912598, - "logps/chosen": -279.1824035644531, - "logps/rejected": -243.2361297607422, - "loss": 0.4928, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4378887712955475, - "rewards/margins": 1.1546775102615356, - "rewards/rejected": -1.5925662517547607, + "logits/chosen": -2.4707770347595215, + "logits/rejected": -2.41619610786438, + "logps/chosen": -309.9191589355469, + "logps/rejected": -187.71530151367188, + "loss": 0.0653, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.556119441986084, + "rewards/margins": 7.09530782699585, + "rewards/rejected": -10.651426315307617, "step": 420 }, { "epoch": 0.89, "learning_rate": 1.5660809778456837e-05, - "logits/chosen": -2.8503293991088867, - "logits/rejected": -2.8215270042419434, - "logps/chosen": -260.7596130371094, - "logps/rejected": -272.5841979980469, - "loss": 0.49, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.2955819368362427, - "rewards/margins": 0.9733622670173645, - "rewards/rejected": -1.268944263458252, + "logits/chosen": -2.475254535675049, + "logits/rejected": -2.458305597305298, + "logps/chosen": -310.1326904296875, + "logps/rejected": -195.8196563720703, + "loss": 0.0569, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.553369998931885, + "rewards/margins": 7.457148551940918, + "rewards/rejected": -12.010518074035645, "step": 430 }, { "epoch": 0.91, "learning_rate": 1.5508021390374333e-05, - "logits/chosen": -2.820071220397949, - "logits/rejected": -2.7907521724700928, - "logps/chosen": -307.09576416015625, - "logps/rejected": -249.6613006591797, - "loss": 0.4867, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.3214530348777771, - "rewards/margins": 1.281368613243103, - "rewards/rejected": -1.602821707725525, + "logits/chosen": -2.456172227859497, + "logits/rejected": -2.4342832565307617, + "logps/chosen": -309.1450500488281, + "logps/rejected": -188.05943298339844, + "loss": 0.0547, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.021037578582764, + "rewards/margins": 7.251453399658203, + "rewards/rejected": -11.272489547729492, "step": 440 }, { "epoch": 0.93, "learning_rate": 1.5355233002291826e-05, - "logits/chosen": -2.8464276790618896, - "logits/rejected": -2.7966537475585938, - "logps/chosen": -296.7311706542969, - "logps/rejected": -254.079833984375, - "loss": 0.4991, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.35059598088264465, - "rewards/margins": 1.165145993232727, - "rewards/rejected": -1.5157420635223389, + "logits/chosen": -2.4393460750579834, + "logits/rejected": -2.410473585128784, + "logps/chosen": -343.1289978027344, + "logps/rejected": -199.44427490234375, + "loss": 0.0661, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.531455039978027, + "rewards/margins": 7.504457950592041, + "rewards/rejected": -12.035911560058594, "step": 450 }, { "epoch": 0.95, "learning_rate": 1.5202444614209322e-05, - "logits/chosen": -2.8699872493743896, - "logits/rejected": -2.8504984378814697, - "logps/chosen": -262.0877990722656, - "logps/rejected": -247.0086212158203, - "loss": 0.4908, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.2521810531616211, - "rewards/margins": 1.0113656520843506, - "rewards/rejected": -1.2635467052459717, + "logits/chosen": -2.4784836769104004, + "logits/rejected": -2.4135525226593018, + "logps/chosen": -338.821044921875, + "logps/rejected": -197.35238647460938, + "loss": 0.059, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.623051881790161, + "rewards/margins": 7.590781211853027, + "rewards/rejected": -11.213833808898926, "step": 460 }, { "epoch": 0.97, "learning_rate": 1.5049656226126816e-05, - "logits/chosen": -2.846945285797119, - "logits/rejected": -2.849860191345215, - "logps/chosen": -292.128173828125, - "logps/rejected": -255.2592315673828, - "loss": 0.48, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.06887436658143997, - "rewards/margins": 1.2138417959213257, - "rewards/rejected": -1.282716155052185, + "logits/chosen": -2.4022135734558105, + "logits/rejected": -2.3517110347747803, + "logps/chosen": -277.9537048339844, + "logps/rejected": -186.30238342285156, + "loss": 0.0601, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.108160018920898, + "rewards/margins": 7.290404319763184, + "rewards/rejected": -11.398564338684082, "step": 470 }, { "epoch": 0.99, "learning_rate": 1.489686783804431e-05, - "logits/chosen": -2.872967481613159, - "logits/rejected": -2.8441576957702637, - "logps/chosen": -263.8342590332031, - "logps/rejected": -239.5003204345703, - "loss": 0.4771, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3044741153717041, - "rewards/margins": 0.980197548866272, - "rewards/rejected": -1.2846715450286865, + "logits/chosen": -2.406426429748535, + "logits/rejected": -2.3724937438964844, + "logps/chosen": -294.24114990234375, + "logps/rejected": -201.96246337890625, + "loss": 0.0636, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.248598575592041, + "rewards/margins": 7.405554294586182, + "rewards/rejected": -11.654152870178223, "step": 480 }, { "epoch": 1.0, - "eval_logits/chosen": -2.8796305656433105, - "eval_logits/rejected": -2.8501133918762207, - "eval_logps/chosen": -274.4910583496094, - "eval_logps/rejected": -238.3082275390625, - "eval_loss": 0.4616946280002594, - "eval_rewards/accuracies": 0.7890625, - "eval_rewards/chosen": -0.0842861607670784, - "eval_rewards/margins": 1.2677102088928223, - "eval_rewards/rejected": -1.3519961833953857, - "eval_runtime": 253.7733, - "eval_samples_per_second": 7.881, - "eval_steps_per_second": 0.063, + "eval_logits/chosen": -2.4664840698242188, + "eval_logits/rejected": -2.395963430404663, + "eval_logps/chosen": -313.8589172363281, + "eval_logps/rejected": -244.22576904296875, + "eval_loss": 0.6041525602340698, + "eval_rewards/accuracies": 0.80859375, + "eval_rewards/chosen": -4.002192974090576, + "eval_rewards/margins": 3.116774797439575, + "eval_rewards/rejected": -7.1189680099487305, + "eval_runtime": 258.6968, + "eval_samples_per_second": 7.731, + "eval_steps_per_second": 0.062, "step": 485 }, { "epoch": 1.01, "learning_rate": 1.4744079449961804e-05, - "logits/chosen": -2.8916447162628174, - "logits/rejected": -2.836836576461792, - "logps/chosen": -317.30328369140625, - "logps/rejected": -261.12030029296875, - "loss": 0.4271, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.1437714546918869, - "rewards/margins": 1.377268671989441, - "rewards/rejected": -1.5210400819778442, + "logits/chosen": -2.4473445415496826, + "logits/rejected": -2.4362194538116455, + "logps/chosen": -317.8018798828125, + "logps/rejected": -199.7606201171875, + "loss": 0.049, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.134568691253662, + "rewards/margins": 7.972777366638184, + "rewards/rejected": -12.10734748840332, "step": 490 }, { "epoch": 1.03, "learning_rate": 1.4591291061879298e-05, - "logits/chosen": -2.8427810668945312, - "logits/rejected": -2.8195979595184326, - "logps/chosen": -262.42047119140625, - "logps/rejected": -256.4164733886719, - "loss": 0.4627, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.3208313584327698, - "rewards/margins": 1.1283810138702393, - "rewards/rejected": -1.4492123126983643, + "logits/chosen": -2.390632390975952, + "logits/rejected": -2.3262600898742676, + "logps/chosen": -316.8980712890625, + "logps/rejected": -207.3352508544922, + "loss": 0.0461, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.459061622619629, + "rewards/margins": 7.6829423904418945, + "rewards/rejected": -12.142004013061523, "step": 500 }, { "epoch": 1.05, "learning_rate": 1.4438502673796793e-05, - "logits/chosen": -2.8156418800354004, - "logits/rejected": -2.843306064605713, - "logps/chosen": -267.56536865234375, - "logps/rejected": -224.3167266845703, - "loss": 0.4201, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.24222290515899658, - "rewards/margins": 1.2655701637268066, - "rewards/rejected": -1.5077931880950928, + "logits/chosen": -2.405426025390625, + "logits/rejected": -2.3402185440063477, + "logps/chosen": -304.3172302246094, + "logps/rejected": -192.7342529296875, + "loss": 0.0456, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.710546970367432, + "rewards/margins": 7.46035099029541, + "rewards/rejected": -12.170897483825684, "step": 510 }, { "epoch": 1.07, "learning_rate": 1.4285714285714287e-05, - "logits/chosen": -2.7805769443511963, - "logits/rejected": -2.8112196922302246, - "logps/chosen": -287.56451416015625, - "logps/rejected": -262.3890075683594, - "loss": 0.4203, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.2784760892391205, - "rewards/margins": 1.08200204372406, - "rewards/rejected": -1.360478162765503, + "logits/chosen": -2.456904649734497, + "logits/rejected": -2.399510622024536, + "logps/chosen": -314.65142822265625, + "logps/rejected": -207.9474334716797, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.895995616912842, + "rewards/margins": 7.744017124176025, + "rewards/rejected": -11.640012741088867, "step": 520 }, { "epoch": 1.09, "learning_rate": 1.4132925897631782e-05, - "logits/chosen": -2.815563678741455, - "logits/rejected": -2.839988946914673, - "logps/chosen": -295.6175842285156, - "logps/rejected": -235.90164184570312, - "loss": 0.4009, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.23429016768932343, - "rewards/margins": 1.343583106994629, - "rewards/rejected": -1.5778734683990479, + "logits/chosen": -2.443506956100464, + "logits/rejected": -2.363826274871826, + "logps/chosen": -346.93341064453125, + "logps/rejected": -210.9061737060547, + "loss": 0.0368, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.365347862243652, + "rewards/margins": 8.533185005187988, + "rewards/rejected": -12.898531913757324, "step": 530 }, { "epoch": 1.11, "learning_rate": 1.3980137509549276e-05, - "logits/chosen": -2.8498940467834473, - "logits/rejected": -2.830718517303467, - "logps/chosen": -318.00909423828125, - "logps/rejected": -257.63348388671875, - "loss": 0.4186, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.3423912525177002, - "rewards/margins": 1.4794024229049683, - "rewards/rejected": -1.821793794631958, + "logits/chosen": -2.422494411468506, + "logits/rejected": -2.381016969680786, + "logps/chosen": -290.3888854980469, + "logps/rejected": -198.80857849121094, + "loss": 0.0441, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.720973968505859, + "rewards/margins": 8.11204719543457, + "rewards/rejected": -12.833023071289062, "step": 540 }, { "epoch": 1.13, "learning_rate": 1.3827349121466769e-05, - "logits/chosen": -2.7992899417877197, - "logits/rejected": -2.8059680461883545, - "logps/chosen": -258.46514892578125, - "logps/rejected": -210.663818359375, - "loss": 0.426, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.40961843729019165, - "rewards/margins": 1.0720117092132568, - "rewards/rejected": -1.4816303253173828, + "logits/chosen": -2.3721251487731934, + "logits/rejected": -2.3095550537109375, + "logps/chosen": -311.7290344238281, + "logps/rejected": -209.42178344726562, + "loss": 0.047, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.692753791809082, + "rewards/margins": 8.287328720092773, + "rewards/rejected": -12.980082511901855, "step": 550 }, { "epoch": 1.15, "learning_rate": 1.3674560733384263e-05, - "logits/chosen": -2.862682580947876, - "logits/rejected": -2.865583658218384, - "logps/chosen": -281.8017883300781, - "logps/rejected": -257.26690673828125, - "loss": 0.4007, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.2565487325191498, - "rewards/margins": 1.5748827457427979, - "rewards/rejected": -1.8314317464828491, + "logits/chosen": -2.430870532989502, + "logits/rejected": -2.4249157905578613, + "logps/chosen": -336.8976135253906, + "logps/rejected": -211.49765014648438, + "loss": 0.0416, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.568190574645996, + "rewards/margins": 8.739889144897461, + "rewards/rejected": -13.308080673217773, "step": 560 }, { "epoch": 1.18, "learning_rate": 1.3521772345301758e-05, - "logits/chosen": -2.8742198944091797, - "logits/rejected": -2.849208354949951, - "logps/chosen": -240.13259887695312, - "logps/rejected": -242.2512664794922, - "loss": 0.3911, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.2923469543457031, - "rewards/margins": 1.5814087390899658, - "rewards/rejected": -1.873755693435669, + "logits/chosen": -2.4312033653259277, + "logits/rejected": -2.3765575885772705, + "logps/chosen": -332.5270690917969, + "logps/rejected": -210.74942016601562, + "loss": 0.0425, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.29726505279541, + "rewards/margins": 8.961139678955078, + "rewards/rejected": -13.258404731750488, "step": 570 }, { "epoch": 1.2, "learning_rate": 1.3368983957219252e-05, - "logits/chosen": -2.8210272789001465, - "logits/rejected": -2.8307366371154785, - "logps/chosen": -277.41375732421875, - "logps/rejected": -270.2271728515625, - "loss": 0.4217, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.5435327887535095, - "rewards/margins": 1.276086688041687, - "rewards/rejected": -1.8196194171905518, + "logits/chosen": -2.3930814266204834, + "logits/rejected": -2.3834025859832764, + "logps/chosen": -290.7808837890625, + "logps/rejected": -206.78366088867188, + "loss": 0.0474, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.984532356262207, + "rewards/margins": 8.283910751342773, + "rewards/rejected": -13.26844310760498, "step": 580 }, { "epoch": 1.22, "learning_rate": 1.3216195569136747e-05, - "logits/chosen": -2.8047261238098145, - "logits/rejected": -2.8207616806030273, - "logps/chosen": -268.05999755859375, - "logps/rejected": -250.09188842773438, - "loss": 0.4192, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -0.2236345261335373, - "rewards/margins": 1.6592124700546265, - "rewards/rejected": -1.8828470706939697, + "logits/chosen": -2.4135966300964355, + "logits/rejected": -2.3652544021606445, + "logps/chosen": -301.96429443359375, + "logps/rejected": -199.56053161621094, + "loss": 0.0487, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.061481475830078, + "rewards/margins": 8.369136810302734, + "rewards/rejected": -12.430618286132812, "step": 590 }, { "epoch": 1.24, "learning_rate": 1.3063407181054241e-05, - "logits/chosen": -2.8321800231933594, - "logits/rejected": -2.813894748687744, - "logps/chosen": -247.3390655517578, - "logps/rejected": -245.7059326171875, - "loss": 0.4081, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.6002678871154785, - "rewards/margins": 1.3836045265197754, - "rewards/rejected": -1.983872652053833, + "logits/chosen": -2.4600701332092285, + "logits/rejected": -2.3951315879821777, + "logps/chosen": -320.52459716796875, + "logps/rejected": -210.39572143554688, + "loss": 0.0493, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.6905012130737305, + "rewards/margins": 8.37987232208252, + "rewards/rejected": -13.07037353515625, "step": 600 }, { "epoch": 1.26, "learning_rate": 1.2910618792971734e-05, - "logits/chosen": -2.8052244186401367, - "logits/rejected": -2.7987864017486572, - "logps/chosen": -252.0263214111328, - "logps/rejected": -265.32666015625, - "loss": 0.4056, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.32010939717292786, - "rewards/margins": 1.5006351470947266, - "rewards/rejected": -1.820744514465332, + "logits/chosen": -2.376004219055176, + "logits/rejected": -2.2585413455963135, + "logps/chosen": -317.3911437988281, + "logps/rejected": -210.0692901611328, + "loss": 0.048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.278079986572266, + "rewards/margins": 8.980937004089355, + "rewards/rejected": -14.259016036987305, "step": 610 }, { "epoch": 1.28, "learning_rate": 1.2757830404889229e-05, - "logits/chosen": -2.852949619293213, - "logits/rejected": -2.810314655303955, - "logps/chosen": -305.781982421875, - "logps/rejected": -255.346435546875, - "loss": 0.44, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.2824147343635559, - "rewards/margins": 1.5166089534759521, - "rewards/rejected": -1.7990238666534424, + "logits/chosen": -2.427064895629883, + "logits/rejected": -2.379770517349243, + "logps/chosen": -339.13507080078125, + "logps/rejected": -217.5348358154297, + "loss": 0.0366, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.716102123260498, + "rewards/margins": 8.656492233276367, + "rewards/rejected": -13.372593879699707, "step": 620 }, { "epoch": 1.3, "learning_rate": 1.2605042016806723e-05, - "logits/chosen": -2.8488757610321045, - "logits/rejected": -2.8255667686462402, - "logps/chosen": -272.29388427734375, - "logps/rejected": -245.6341552734375, - "loss": 0.42, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.3796336054801941, - "rewards/margins": 1.4470628499984741, - "rewards/rejected": -1.8266966342926025, + "logits/chosen": -2.4690704345703125, + "logits/rejected": -2.4057745933532715, + "logps/chosen": -324.30926513671875, + "logps/rejected": -215.36129760742188, + "loss": 0.0407, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.616096496582031, + "rewards/margins": 9.027668952941895, + "rewards/rejected": -13.643765449523926, "step": 630 }, { "epoch": 1.32, "learning_rate": 1.2452253628724218e-05, - "logits/chosen": -2.8429629802703857, - "logits/rejected": -2.837463140487671, - "logps/chosen": -262.43292236328125, - "logps/rejected": -261.91644287109375, - "loss": 0.4354, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.4664459228515625, - "rewards/margins": 1.388948917388916, - "rewards/rejected": -1.855394721031189, + "logits/chosen": -2.4484169483184814, + "logits/rejected": -2.408360004425049, + "logps/chosen": -343.5368957519531, + "logps/rejected": -223.6841583251953, + "loss": 0.0397, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -4.842722415924072, + "rewards/margins": 8.921377182006836, + "rewards/rejected": -13.764101028442383, "step": 640 }, { "epoch": 1.34, "learning_rate": 1.2299465240641712e-05, - "logits/chosen": -2.829761028289795, - "logits/rejected": -2.799344062805176, - "logps/chosen": -266.4898986816406, - "logps/rejected": -228.8533935546875, - "loss": 0.4393, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.6215249300003052, - "rewards/margins": 1.1069129705429077, - "rewards/rejected": -1.7284377813339233, + "logits/chosen": -2.38913893699646, + "logits/rejected": -2.293104648590088, + "logps/chosen": -335.9158630371094, + "logps/rejected": -223.899658203125, + "loss": 0.045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.496028900146484, + "rewards/margins": 8.91499137878418, + "rewards/rejected": -14.41102123260498, "step": 650 }, { "epoch": 1.36, "learning_rate": 1.2146676852559206e-05, - "logits/chosen": -2.8082187175750732, - "logits/rejected": -2.785370111465454, - "logps/chosen": -272.84552001953125, - "logps/rejected": -257.3983154296875, - "loss": 0.417, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.5420676469802856, - "rewards/margins": 1.0937607288360596, - "rewards/rejected": -1.6358283758163452, + "logits/chosen": -2.3704442977905273, + "logits/rejected": -2.312631130218506, + "logps/chosen": -335.7287292480469, + "logps/rejected": -221.09228515625, + "loss": 0.0409, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.297285079956055, + "rewards/margins": 8.652203559875488, + "rewards/rejected": -13.949490547180176, "step": 660 }, { "epoch": 1.38, "learning_rate": 1.1993888464476701e-05, - "logits/chosen": -2.796501636505127, - "logits/rejected": -2.753633975982666, - "logps/chosen": -275.73297119140625, - "logps/rejected": -261.51495361328125, - "loss": 0.4048, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.6535481214523315, - "rewards/margins": 1.5823442935943604, - "rewards/rejected": -2.2358925342559814, + "logits/chosen": -2.36325740814209, + "logits/rejected": -2.3269762992858887, + "logps/chosen": -335.0709533691406, + "logps/rejected": -224.7407684326172, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.079594612121582, + "rewards/margins": 8.919063568115234, + "rewards/rejected": -14.998659133911133, "step": 670 }, { "epoch": 1.4, "learning_rate": 1.1841100076394194e-05, - "logits/chosen": -2.8197669982910156, - "logits/rejected": -2.8018298149108887, - "logps/chosen": -279.07794189453125, - "logps/rejected": -264.37908935546875, - "loss": 0.4476, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.5116819143295288, - "rewards/margins": 1.3447668552398682, - "rewards/rejected": -1.856448769569397, + "logits/chosen": -2.380213737487793, + "logits/rejected": -2.342026472091675, + "logps/chosen": -319.50146484375, + "logps/rejected": -216.95565795898438, + "loss": 0.0387, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.395963668823242, + "rewards/margins": 8.671093940734863, + "rewards/rejected": -14.067057609558105, "step": 680 }, { "epoch": 1.42, "learning_rate": 1.1688311688311688e-05, - "logits/chosen": -2.844113826751709, - "logits/rejected": -2.844442844390869, - "logps/chosen": -252.5892791748047, - "logps/rejected": -259.8851623535156, - "loss": 0.4359, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.6136052012443542, - "rewards/margins": 1.2718920707702637, - "rewards/rejected": -1.8854974508285522, + "logits/chosen": -2.3510377407073975, + "logits/rejected": -2.320303440093994, + "logps/chosen": -352.9103088378906, + "logps/rejected": -234.5736083984375, + "loss": 0.0322, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.800360202789307, + "rewards/margins": 9.392560005187988, + "rewards/rejected": -15.19292163848877, "step": 690 }, { "epoch": 1.44, "learning_rate": 1.1535523300229183e-05, - "logits/chosen": -2.8703908920288086, - "logits/rejected": -2.8482773303985596, - "logps/chosen": -300.6127624511719, - "logps/rejected": -267.99188232421875, - "loss": 0.4299, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.30981573462486267, - "rewards/margins": 1.310162901878357, - "rewards/rejected": -1.619978666305542, + "logits/chosen": -2.3935928344726562, + "logits/rejected": -2.379652738571167, + "logps/chosen": -327.9419860839844, + "logps/rejected": -199.2997589111328, + "loss": 0.0573, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.665543556213379, + "rewards/margins": 7.67110538482666, + "rewards/rejected": -12.336648941040039, "step": 700 }, { "epoch": 1.46, "learning_rate": 1.1382734912146677e-05, - "logits/chosen": -2.807915449142456, - "logits/rejected": -2.7632646560668945, - "logps/chosen": -244.17822265625, - "logps/rejected": -210.9264678955078, - "loss": 0.4146, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.32631585001945496, - "rewards/margins": 1.2793928384780884, - "rewards/rejected": -1.6057088375091553, + "logits/chosen": -2.370570659637451, + "logits/rejected": -2.384655475616455, + "logps/chosen": -322.21661376953125, + "logps/rejected": -200.2702178955078, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.125889301300049, + "rewards/margins": 8.075443267822266, + "rewards/rejected": -12.201333999633789, "step": 710 }, { "epoch": 1.48, "learning_rate": 1.1229946524064172e-05, - "logits/chosen": -2.7481131553649902, - "logits/rejected": -2.7189228534698486, - "logps/chosen": -271.4867858886719, - "logps/rejected": -256.009521484375, - "loss": 0.4123, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.31545475125312805, - "rewards/margins": 1.4961215257644653, - "rewards/rejected": -1.811576247215271, + "logits/chosen": -2.3510353565216064, + "logits/rejected": -2.3337533473968506, + "logps/chosen": -299.6921081542969, + "logps/rejected": -213.6602020263672, + "loss": 0.0458, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.6531476974487305, + "rewards/margins": 8.622300148010254, + "rewards/rejected": -13.2754487991333, "step": 720 }, { "epoch": 1.51, "learning_rate": 1.1077158135981668e-05, - "logits/chosen": -2.80031681060791, - "logits/rejected": -2.7459282875061035, - "logps/chosen": -268.36669921875, - "logps/rejected": -244.3325958251953, - "loss": 0.4444, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.3755728006362915, - "rewards/margins": 1.379817008972168, - "rewards/rejected": -1.755389928817749, + "logits/chosen": -2.3112430572509766, + "logits/rejected": -2.2705492973327637, + "logps/chosen": -301.2213439941406, + "logps/rejected": -215.2222900390625, + "loss": 0.0478, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.078838348388672, + "rewards/margins": 8.204427719116211, + "rewards/rejected": -14.2832670211792, "step": 730 }, { "epoch": 1.53, "learning_rate": 1.0924369747899159e-05, - "logits/chosen": -2.809633255004883, - "logits/rejected": -2.85695219039917, - "logps/chosen": -293.177734375, - "logps/rejected": -231.4840087890625, - "loss": 0.4198, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.2760697901248932, - "rewards/margins": 1.443703293800354, - "rewards/rejected": -1.7197730541229248, + "logits/chosen": -2.406601667404175, + "logits/rejected": -2.351696252822876, + "logps/chosen": -320.53375244140625, + "logps/rejected": -221.86679077148438, + "loss": 0.0416, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.614253044128418, + "rewards/margins": 8.879283905029297, + "rewards/rejected": -14.493537902832031, "step": 740 }, { "epoch": 1.55, "learning_rate": 1.0771581359816653e-05, - "logits/chosen": -2.7558960914611816, - "logits/rejected": -2.803246021270752, - "logps/chosen": -261.3629455566406, - "logps/rejected": -246.388671875, - "loss": 0.39, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.4588204026222229, - "rewards/margins": 1.728859305381775, - "rewards/rejected": -2.1876797676086426, + "logits/chosen": -2.368277072906494, + "logits/rejected": -2.260631561279297, + "logps/chosen": -297.88189697265625, + "logps/rejected": -224.37228393554688, + "loss": 0.0403, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.92215633392334, + "rewards/margins": 8.878290176391602, + "rewards/rejected": -13.800447463989258, "step": 750 }, { "epoch": 1.57, "learning_rate": 1.0618792971734148e-05, - "logits/chosen": -2.78913950920105, - "logits/rejected": -2.7678422927856445, - "logps/chosen": -256.0678405761719, - "logps/rejected": -234.4947509765625, - "loss": 0.4066, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.471910297870636, - "rewards/margins": 1.3505375385284424, - "rewards/rejected": -1.8224480152130127, + "logits/chosen": -2.3125646114349365, + "logits/rejected": -2.2867493629455566, + "logps/chosen": -360.1884460449219, + "logps/rejected": -216.7639617919922, + "loss": 0.0436, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.890374660491943, + "rewards/margins": 8.949200630187988, + "rewards/rejected": -13.839574813842773, "step": 760 }, { "epoch": 1.59, "learning_rate": 1.0466004583651644e-05, - "logits/chosen": -2.808622121810913, - "logits/rejected": -2.8128342628479004, - "logps/chosen": -263.07305908203125, - "logps/rejected": -242.57357788085938, - "loss": 0.4032, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.24994757771492004, - "rewards/margins": 1.3186602592468262, - "rewards/rejected": -1.5686078071594238, + "logits/chosen": -2.3664181232452393, + "logits/rejected": -2.3008503913879395, + "logps/chosen": -329.5279541015625, + "logps/rejected": -223.3599090576172, + "loss": 0.0379, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.103986740112305, + "rewards/margins": 8.835712432861328, + "rewards/rejected": -13.939699172973633, "step": 770 }, { "epoch": 1.61, "learning_rate": 1.0313216195569139e-05, - "logits/chosen": -2.8431007862091064, - "logits/rejected": -2.8500876426696777, - "logps/chosen": -289.2240295410156, - "logps/rejected": -240.93447875976562, - "loss": 0.3989, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -0.1801755428314209, - "rewards/margins": 1.5674879550933838, - "rewards/rejected": -1.7476632595062256, + "logits/chosen": -2.351306200027466, + "logits/rejected": -2.31449556350708, + "logps/chosen": -317.2604064941406, + "logps/rejected": -220.0719757080078, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.727662086486816, + "rewards/margins": 9.234613418579102, + "rewards/rejected": -14.962274551391602, "step": 780 }, { "epoch": 1.63, "learning_rate": 1.0160427807486633e-05, - "logits/chosen": -2.896554470062256, - "logits/rejected": -2.8891143798828125, - "logps/chosen": -304.2243957519531, - "logps/rejected": -279.3757019042969, - "loss": 0.3938, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.3372874855995178, - "rewards/margins": 1.5321928262710571, - "rewards/rejected": -1.8694803714752197, + "logits/chosen": -2.3417956829071045, + "logits/rejected": -2.2895348072052, + "logps/chosen": -328.92681884765625, + "logps/rejected": -225.81887817382812, + "loss": 0.0435, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.588496685028076, + "rewards/margins": 8.750701904296875, + "rewards/rejected": -14.339200019836426, "step": 790 }, { "epoch": 1.65, "learning_rate": 1.0007639419404128e-05, - "logits/chosen": -2.8317697048187256, - "logits/rejected": -2.7945523262023926, - "logps/chosen": -292.1405029296875, - "logps/rejected": -234.8839569091797, - "loss": 0.4294, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.5130658149719238, - "rewards/margins": 1.4586106538772583, - "rewards/rejected": -1.9716764688491821, + "logits/chosen": -2.3706483840942383, + "logits/rejected": -2.329810857772827, + "logps/chosen": -322.5093688964844, + "logps/rejected": -217.6160430908203, + "loss": 0.0426, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.1367669105529785, + "rewards/margins": 9.024019241333008, + "rewards/rejected": -14.160784721374512, "step": 800 }, { "epoch": 1.67, "learning_rate": 9.85485103132162e-06, - "logits/chosen": -2.875779151916504, - "logits/rejected": -2.8391237258911133, - "logps/chosen": -298.0716857910156, - "logps/rejected": -242.75662231445312, - "loss": 0.4371, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.41954541206359863, - "rewards/margins": 1.413554072380066, - "rewards/rejected": -1.833099603652954, + "logits/chosen": -2.3830759525299072, + "logits/rejected": -2.3143844604492188, + "logps/chosen": -335.95965576171875, + "logps/rejected": -224.92111206054688, + "loss": 0.041, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.424968719482422, + "rewards/margins": 9.111248970031738, + "rewards/rejected": -14.536218643188477, "step": 810 }, { "epoch": 1.69, "learning_rate": 9.702062643239115e-06, - "logits/chosen": -2.842196464538574, - "logits/rejected": -2.824432849884033, - "logps/chosen": -255.82620239257812, - "logps/rejected": -256.314453125, - "loss": 0.4196, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4787816107273102, - "rewards/margins": 1.262540578842163, - "rewards/rejected": -1.7413221597671509, + "logits/chosen": -2.399111747741699, + "logits/rejected": -2.3377134799957275, + "logps/chosen": -342.5674133300781, + "logps/rejected": -235.2672119140625, + "loss": 0.0517, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.91513729095459, + "rewards/margins": 9.157655715942383, + "rewards/rejected": -15.072793960571289, "step": 820 }, { "epoch": 1.71, "learning_rate": 9.54927425515661e-06, - "logits/chosen": -2.8649404048919678, - "logits/rejected": -2.807797908782959, - "logps/chosen": -296.7332458496094, - "logps/rejected": -261.4539489746094, - "loss": 0.4274, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.34954220056533813, - "rewards/margins": 1.1616686582565308, - "rewards/rejected": -1.5112109184265137, + "logits/chosen": -2.371589183807373, + "logits/rejected": -2.301905632019043, + "logps/chosen": -327.13055419921875, + "logps/rejected": -220.7751007080078, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.895151615142822, + "rewards/margins": 8.910463333129883, + "rewards/rejected": -13.80561351776123, "step": 830 }, { "epoch": 1.73, "learning_rate": 9.396485867074104e-06, - "logits/chosen": -2.8419387340545654, - "logits/rejected": -2.7805838584899902, - "logps/chosen": -271.58807373046875, - "logps/rejected": -239.6482391357422, - "loss": 0.4106, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.35656026005744934, - "rewards/margins": 1.2432340383529663, - "rewards/rejected": -1.5997945070266724, + "logits/chosen": -2.3354568481445312, + "logits/rejected": -2.2669918537139893, + "logps/chosen": -338.8708801269531, + "logps/rejected": -218.30947875976562, + "loss": 0.0458, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.41546630859375, + "rewards/margins": 8.734143257141113, + "rewards/rejected": -14.149609565734863, "step": 840 }, { "epoch": 1.75, "learning_rate": 9.243697478991598e-06, - "logits/chosen": -2.851105213165283, - "logits/rejected": -2.8537163734436035, - "logps/chosen": -285.19293212890625, - "logps/rejected": -249.2686004638672, - "loss": 0.4067, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.4058365225791931, - "rewards/margins": 1.1938354969024658, - "rewards/rejected": -1.5996720790863037, + "logits/chosen": -2.3707711696624756, + "logits/rejected": -2.313260316848755, + "logps/chosen": -302.56170654296875, + "logps/rejected": -212.74365234375, + "loss": 0.0555, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.656682014465332, + "rewards/margins": 8.04932975769043, + "rewards/rejected": -13.706011772155762, "step": 850 }, { "epoch": 1.77, "learning_rate": 9.090909090909091e-06, - "logits/chosen": -2.8415451049804688, - "logits/rejected": -2.816315174102783, - "logps/chosen": -246.9827117919922, - "logps/rejected": -244.11172485351562, - "loss": 0.4166, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.5251684784889221, - "rewards/margins": 1.321463942527771, - "rewards/rejected": -1.8466323614120483, + "logits/chosen": -2.337785005569458, + "logits/rejected": -2.2829418182373047, + "logps/chosen": -330.53802490234375, + "logps/rejected": -225.6645965576172, + "loss": 0.035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.462188243865967, + "rewards/margins": 9.399454116821289, + "rewards/rejected": -14.861642837524414, "step": 860 }, { "epoch": 1.79, "learning_rate": 8.938120702826586e-06, - "logits/chosen": -2.8182404041290283, - "logits/rejected": -2.7850711345672607, - "logps/chosen": -320.24407958984375, - "logps/rejected": -251.82949829101562, - "loss": 0.409, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.2882133424282074, - "rewards/margins": 1.439748764038086, - "rewards/rejected": -1.7279622554779053, + "logits/chosen": -2.3577864170074463, + "logits/rejected": -2.2690200805664062, + "logps/chosen": -331.9044494628906, + "logps/rejected": -232.2950897216797, + "loss": 0.0348, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.0897955894470215, + "rewards/margins": 9.550823211669922, + "rewards/rejected": -15.640619277954102, "step": 870 }, { "epoch": 1.81, "learning_rate": 8.78533231474408e-06, - "logits/chosen": -2.859550952911377, - "logits/rejected": -2.8387062549591064, - "logps/chosen": -251.30764770507812, - "logps/rejected": -253.18179321289062, - "loss": 0.4071, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.4950861930847168, - "rewards/margins": 1.2707871198654175, - "rewards/rejected": -1.7658733129501343, + "logits/chosen": -2.4222211837768555, + "logits/rejected": -2.3889050483703613, + "logps/chosen": -339.27923583984375, + "logps/rejected": -226.44393920898438, + "loss": 0.0426, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.199553489685059, + "rewards/margins": 8.753616333007812, + "rewards/rejected": -13.953170776367188, "step": 880 }, { "epoch": 1.84, "learning_rate": 8.632543926661574e-06, - "logits/chosen": -2.8016982078552246, - "logits/rejected": -2.7528939247131348, - "logps/chosen": -277.7475891113281, - "logps/rejected": -262.52215576171875, - "loss": 0.444, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.487938791513443, - "rewards/margins": 1.4403693675994873, - "rewards/rejected": -1.928308129310608, + "logits/chosen": -2.3915364742279053, + "logits/rejected": -2.3292319774627686, + "logps/chosen": -357.2469482421875, + "logps/rejected": -227.48550415039062, + "loss": 0.0419, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.296126365661621, + "rewards/margins": 9.113978385925293, + "rewards/rejected": -14.41010570526123, "step": 890 }, { "epoch": 1.86, "learning_rate": 8.479755538579069e-06, - "logits/chosen": -2.832099199295044, - "logits/rejected": -2.786726951599121, - "logps/chosen": -232.41787719726562, - "logps/rejected": -241.41775512695312, - "loss": 0.4149, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.5055242776870728, - "rewards/margins": 1.4381215572357178, - "rewards/rejected": -1.9436458349227905, + "logits/chosen": -2.3756537437438965, + "logits/rejected": -2.32563853263855, + "logps/chosen": -318.2647399902344, + "logps/rejected": -227.1390838623047, + "loss": 0.0406, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.1770830154418945, + "rewards/margins": 9.713737487792969, + "rewards/rejected": -14.890820503234863, "step": 900 }, { "epoch": 1.88, "learning_rate": 8.326967150496563e-06, - "logits/chosen": -2.793572425842285, - "logits/rejected": -2.8044943809509277, - "logps/chosen": -293.1367492675781, - "logps/rejected": -241.0701904296875, - "loss": 0.4144, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.40146392583847046, - "rewards/margins": 1.4329584836959839, - "rewards/rejected": -1.8344223499298096, + "logits/chosen": -2.41601300239563, + "logits/rejected": -2.297445297241211, + "logps/chosen": -348.1459655761719, + "logps/rejected": -232.72802734375, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.221807479858398, + "rewards/margins": 9.922318458557129, + "rewards/rejected": -15.144124031066895, "step": 910 }, { "epoch": 1.9, "learning_rate": 8.174178762414056e-06, - "logits/chosen": -2.861027240753174, - "logits/rejected": -2.802192449569702, - "logps/chosen": -302.29095458984375, - "logps/rejected": -264.388916015625, - "loss": 0.3921, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.377704381942749, - "rewards/margins": 1.487374186515808, - "rewards/rejected": -1.8650786876678467, + "logits/chosen": -2.425701856613159, + "logits/rejected": -2.3687078952789307, + "logps/chosen": -341.102294921875, + "logps/rejected": -232.12783813476562, + "loss": 0.0342, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.615460395812988, + "rewards/margins": 9.65539264678955, + "rewards/rejected": -14.270853042602539, "step": 920 }, { "epoch": 1.92, "learning_rate": 8.02139037433155e-06, - "logits/chosen": -2.856374502182007, - "logits/rejected": -2.875044584274292, - "logps/chosen": -282.31585693359375, - "logps/rejected": -235.76748657226562, - "loss": 0.4205, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.39734259247779846, - "rewards/margins": 1.3766673803329468, - "rewards/rejected": -1.7740100622177124, + "logits/chosen": -2.423699378967285, + "logits/rejected": -2.3600199222564697, + "logps/chosen": -321.5314025878906, + "logps/rejected": -228.6741943359375, + "loss": 0.0316, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.644195556640625, + "rewards/margins": 9.397247314453125, + "rewards/rejected": -15.041444778442383, "step": 930 }, { "epoch": 1.94, "learning_rate": 7.868601986249045e-06, - "logits/chosen": -2.8075997829437256, - "logits/rejected": -2.77669620513916, - "logps/chosen": -256.1700134277344, - "logps/rejected": -247.21896362304688, - "loss": 0.4157, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.3420465886592865, - "rewards/margins": 1.5288991928100586, - "rewards/rejected": -1.8709455728530884, + "logits/chosen": -2.4263432025909424, + "logits/rejected": -2.3628458976745605, + "logps/chosen": -330.029541015625, + "logps/rejected": -225.56637573242188, + "loss": 0.054, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.213741779327393, + "rewards/margins": 8.961358070373535, + "rewards/rejected": -14.175100326538086, "step": 940 }, { "epoch": 1.96, "learning_rate": 7.71581359816654e-06, - "logits/chosen": -2.8214058876037598, - "logits/rejected": -2.812399387359619, - "logps/chosen": -271.9144592285156, - "logps/rejected": -248.6433563232422, - "loss": 0.4086, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.5668919682502747, - "rewards/margins": 1.462934970855713, - "rewards/rejected": -2.0298266410827637, + "logits/chosen": -2.3590340614318848, + "logits/rejected": -2.2720718383789062, + "logps/chosen": -272.0068359375, + "logps/rejected": -213.19833374023438, + "loss": 0.044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.489508628845215, + "rewards/margins": 8.610600471496582, + "rewards/rejected": -14.100107192993164, "step": 950 }, { "epoch": 1.98, "learning_rate": 7.563025210084034e-06, - "logits/chosen": -2.855318069458008, - "logits/rejected": -2.8766427040100098, - "logps/chosen": -305.3939514160156, - "logps/rejected": -259.01739501953125, - "loss": 0.4274, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4556516706943512, - "rewards/margins": 1.3332924842834473, - "rewards/rejected": -1.7889440059661865, + "logits/chosen": -2.3800034523010254, + "logits/rejected": -2.3298089504241943, + "logps/chosen": -319.4668884277344, + "logps/rejected": -224.38330078125, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.616637706756592, + "rewards/margins": 9.42906665802002, + "rewards/rejected": -15.045705795288086, "step": 960 }, { "epoch": 2.0, "learning_rate": 7.410236822001529e-06, - "logits/chosen": -2.804309368133545, - "logits/rejected": -2.7984023094177246, - "logps/chosen": -269.3817443847656, - "logps/rejected": -236.9586639404297, - "loss": 0.4124, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.4466172754764557, - "rewards/margins": 1.4143751859664917, - "rewards/rejected": -1.860992431640625, + "logits/chosen": -2.35837984085083, + "logits/rejected": -2.2928102016448975, + "logps/chosen": -343.45587158203125, + "logps/rejected": -222.7772979736328, + "loss": 0.0443, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.847626209259033, + "rewards/margins": 9.161903381347656, + "rewards/rejected": -14.009529113769531, "step": 970 }, { "epoch": 2.0, - "eval_logits/chosen": -2.865886688232422, - "eval_logits/rejected": -2.8562960624694824, - "eval_logps/chosen": -277.1995849609375, - "eval_logps/rejected": -242.37806701660156, - "eval_loss": 0.4544542133808136, - "eval_rewards/accuracies": 0.81640625, - "eval_rewards/chosen": -0.3551396429538727, - "eval_rewards/margins": 1.4038398265838623, - "eval_rewards/rejected": -1.7589795589447021, - "eval_runtime": 253.9486, - "eval_samples_per_second": 7.876, - "eval_steps_per_second": 0.063, + "eval_logits/chosen": -2.3850746154785156, + "eval_logits/rejected": -2.3026185035705566, + "eval_logps/chosen": -329.6904296875, + "eval_logps/rejected": -262.23040771484375, + "eval_loss": 0.7951104640960693, + "eval_rewards/accuracies": 0.73828125, + "eval_rewards/chosen": -5.5853447914123535, + "eval_rewards/margins": 3.3340888023376465, + "eval_rewards/rejected": -8.91943359375, + "eval_runtime": 258.3653, + "eval_samples_per_second": 7.741, + "eval_steps_per_second": 0.062, "step": 970 }, { "epoch": 2.02, "learning_rate": 7.257448433919023e-06, - "logits/chosen": -2.8543519973754883, - "logits/rejected": -2.8081583976745605, - "logps/chosen": -279.414306640625, - "logps/rejected": -276.77752685546875, - "loss": 0.3522, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.36832618713378906, - "rewards/margins": 1.5619885921478271, - "rewards/rejected": -1.9303147792816162, + "logits/chosen": -2.3800430297851562, + "logits/rejected": -2.3618946075439453, + "logps/chosen": -315.0664978027344, + "logps/rejected": -229.7184600830078, + "loss": 0.0275, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.884152889251709, + "rewards/margins": 9.261144638061523, + "rewards/rejected": -15.145297050476074, "step": 980 }, { "epoch": 2.04, "learning_rate": 7.104660045836517e-06, - "logits/chosen": -2.8730249404907227, - "logits/rejected": -2.8948845863342285, - "logps/chosen": -318.85736083984375, - "logps/rejected": -257.4644470214844, - "loss": 0.3859, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.29034391045570374, - "rewards/margins": 1.8795220851898193, - "rewards/rejected": -2.1698660850524902, + "logits/chosen": -2.3531386852264404, + "logits/rejected": -2.3184380531311035, + "logps/chosen": -327.3660888671875, + "logps/rejected": -219.55581665039062, + "loss": 0.026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.929337024688721, + "rewards/margins": 9.458481788635254, + "rewards/rejected": -14.3878173828125, "step": 990 }, { "epoch": 2.06, "learning_rate": 6.951871657754011e-06, - "logits/chosen": -2.8562302589416504, - "logits/rejected": -2.834265947341919, - "logps/chosen": -259.0556640625, - "logps/rejected": -247.9607696533203, - "loss": 0.3666, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.5741233229637146, - "rewards/margins": 1.5469454526901245, - "rewards/rejected": -2.1210689544677734, + "logits/chosen": -2.3432507514953613, + "logits/rejected": -2.2758610248565674, + "logps/chosen": -325.6359558105469, + "logps/rejected": -223.78164672851562, + "loss": 0.0353, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.556198596954346, + "rewards/margins": 9.385514259338379, + "rewards/rejected": -14.941709518432617, "step": 1000 }, { "epoch": 2.08, "learning_rate": 6.799083269671506e-06, - "logits/chosen": -2.905282974243164, - "logits/rejected": -2.836683988571167, - "logps/chosen": -293.37139892578125, - "logps/rejected": -253.64810180664062, - "loss": 0.3664, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.5017197728157043, - "rewards/margins": 1.8150691986083984, - "rewards/rejected": -2.316789150238037, + "logits/chosen": -2.3102869987487793, + "logits/rejected": -2.2865514755249023, + "logps/chosen": -304.4931640625, + "logps/rejected": -233.22525024414062, + "loss": 0.0308, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.142067909240723, + "rewards/margins": 9.77638053894043, + "rewards/rejected": -15.918451309204102, "step": 1010 }, { "epoch": 2.1, "learning_rate": 6.646294881588999e-06, - "logits/chosen": -2.8647093772888184, - "logits/rejected": -2.8602373600006104, - "logps/chosen": -253.71450805664062, - "logps/rejected": -248.3654022216797, - "loss": 0.369, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.581963300704956, - "rewards/margins": 1.589311957359314, - "rewards/rejected": -2.1712751388549805, + "logits/chosen": -2.318485736846924, + "logits/rejected": -2.2506542205810547, + "logps/chosen": -343.11334228515625, + "logps/rejected": -244.45883178710938, + "loss": 0.0273, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.299825191497803, + "rewards/margins": 10.17300033569336, + "rewards/rejected": -16.472827911376953, "step": 1020 }, { "epoch": 2.12, "learning_rate": 6.493506493506494e-06, - "logits/chosen": -2.837463617324829, - "logits/rejected": -2.817621946334839, - "logps/chosen": -285.40142822265625, - "logps/rejected": -252.75448608398438, - "loss": 0.354, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.5188383460044861, - "rewards/margins": 1.7174046039581299, - "rewards/rejected": -2.2362427711486816, + "logits/chosen": -2.348334550857544, + "logits/rejected": -2.3145151138305664, + "logps/chosen": -362.18927001953125, + "logps/rejected": -244.39047241210938, + "loss": 0.0272, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.444186210632324, + "rewards/margins": 10.421486854553223, + "rewards/rejected": -16.865673065185547, "step": 1030 }, { "epoch": 2.14, "learning_rate": 6.340718105423988e-06, - "logits/chosen": -2.901369571685791, - "logits/rejected": -2.8450589179992676, - "logps/chosen": -270.55145263671875, - "logps/rejected": -257.68060302734375, - "loss": 0.3873, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.28947222232818604, - "rewards/margins": 1.6885324716567993, - "rewards/rejected": -1.9780044555664062, + "logits/chosen": -2.290079116821289, + "logits/rejected": -2.1995062828063965, + "logps/chosen": -333.0499572753906, + "logps/rejected": -232.38363647460938, + "loss": 0.0287, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.774559020996094, + "rewards/margins": 10.013866424560547, + "rewards/rejected": -15.788423538208008, "step": 1040 }, { "epoch": 2.16, "learning_rate": 6.187929717341482e-06, - "logits/chosen": -2.8084139823913574, - "logits/rejected": -2.83860445022583, - "logps/chosen": -272.42822265625, - "logps/rejected": -243.85299682617188, - "loss": 0.3468, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4739387035369873, - "rewards/margins": 1.55910325050354, - "rewards/rejected": -2.0330419540405273, + "logits/chosen": -2.334545612335205, + "logits/rejected": -2.236114501953125, + "logps/chosen": -323.73876953125, + "logps/rejected": -227.9995880126953, + "loss": 0.033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.580103874206543, + "rewards/margins": 10.12531566619873, + "rewards/rejected": -15.705419540405273, "step": 1050 }, { "epoch": 2.19, "learning_rate": 6.0351413292589764e-06, - "logits/chosen": -2.8273186683654785, - "logits/rejected": -2.8402295112609863, - "logps/chosen": -282.76422119140625, - "logps/rejected": -263.18548583984375, - "loss": 0.3782, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.4047788083553314, - "rewards/margins": 1.8494300842285156, - "rewards/rejected": -2.254209041595459, + "logits/chosen": -2.3219540119171143, + "logits/rejected": -2.237586736679077, + "logps/chosen": -338.9471130371094, + "logps/rejected": -236.09146118164062, + "loss": 0.0303, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.9066386222839355, + "rewards/margins": 9.865789413452148, + "rewards/rejected": -15.772427558898926, "step": 1060 }, { "epoch": 2.21, "learning_rate": 5.882352941176471e-06, - "logits/chosen": -2.8585734367370605, - "logits/rejected": -2.8870089054107666, - "logps/chosen": -242.56851196289062, - "logps/rejected": -221.9137725830078, - "loss": 0.3691, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.5514971017837524, - "rewards/margins": 1.47260582447052, - "rewards/rejected": -2.0241026878356934, + "logits/chosen": -2.3054463863372803, + "logits/rejected": -2.2279181480407715, + "logps/chosen": -313.12384033203125, + "logps/rejected": -232.56820678710938, + "loss": 0.0389, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.444895267486572, + "rewards/margins": 9.621152877807617, + "rewards/rejected": -16.066049575805664, "step": 1070 }, { "epoch": 2.23, "learning_rate": 5.729564553093966e-06, - "logits/chosen": -2.8390653133392334, - "logits/rejected": -2.827547788619995, - "logps/chosen": -336.2854309082031, - "logps/rejected": -258.2242126464844, - "loss": 0.373, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.47643327713012695, - "rewards/margins": 1.622065544128418, - "rewards/rejected": -2.098498821258545, + "logits/chosen": -2.3564865589141846, + "logits/rejected": -2.3001480102539062, + "logps/chosen": -335.7102966308594, + "logps/rejected": -231.7644805908203, + "loss": 0.0265, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.110272407531738, + "rewards/margins": 9.538274765014648, + "rewards/rejected": -15.648547172546387, "step": 1080 }, { "epoch": 2.25, "learning_rate": 5.576776165011459e-06, - "logits/chosen": -2.864114284515381, - "logits/rejected": -2.8293545246124268, - "logps/chosen": -273.42315673828125, - "logps/rejected": -267.7647399902344, - "loss": 0.3652, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.6307514905929565, - "rewards/margins": 1.7940418720245361, - "rewards/rejected": -2.424793243408203, + "logits/chosen": -2.276754379272461, + "logits/rejected": -2.2232775688171387, + "logps/chosen": -332.77874755859375, + "logps/rejected": -249.8245849609375, + "loss": 0.03, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.621646881103516, + "rewards/margins": 10.335177421569824, + "rewards/rejected": -16.956823348999023, "step": 1090 }, { "epoch": 2.27, "learning_rate": 5.423987776928954e-06, - "logits/chosen": -2.847562313079834, - "logits/rejected": -2.8787343502044678, - "logps/chosen": -271.5540771484375, - "logps/rejected": -237.20877075195312, - "loss": 0.3705, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.7378746271133423, - "rewards/margins": 1.611467719078064, - "rewards/rejected": -2.3493425846099854, + "logits/chosen": -2.385565996170044, + "logits/rejected": -2.2726476192474365, + "logps/chosen": -356.91094970703125, + "logps/rejected": -251.0599822998047, + "loss": 0.0232, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.774880886077881, + "rewards/margins": 10.560681343078613, + "rewards/rejected": -17.335561752319336, "step": 1100 }, { "epoch": 2.29, "learning_rate": 5.271199388846449e-06, - "logits/chosen": -2.844463348388672, - "logits/rejected": -2.7723591327667236, - "logps/chosen": -289.79632568359375, - "logps/rejected": -266.92529296875, - "loss": 0.3637, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.6152908205986023, - "rewards/margins": 1.741233229637146, - "rewards/rejected": -2.3565244674682617, + "logits/chosen": -2.301403045654297, + "logits/rejected": -2.238607406616211, + "logps/chosen": -329.09088134765625, + "logps/rejected": -261.70184326171875, + "loss": 0.0244, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.769029140472412, + "rewards/margins": 10.928190231323242, + "rewards/rejected": -17.697219848632812, "step": 1110 }, { "epoch": 2.31, "learning_rate": 5.118411000763942e-06, - "logits/chosen": -2.8476827144622803, - "logits/rejected": -2.8521735668182373, - "logps/chosen": -273.8808288574219, - "logps/rejected": -241.986572265625, - "loss": 0.3575, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.7439392805099487, - "rewards/margins": 1.6854461431503296, - "rewards/rejected": -2.4293856620788574, + "logits/chosen": -2.344348907470703, + "logits/rejected": -2.238985061645508, + "logps/chosen": -337.75958251953125, + "logps/rejected": -255.0755157470703, + "loss": 0.0282, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.105334281921387, + "rewards/margins": 11.364806175231934, + "rewards/rejected": -17.470142364501953, "step": 1120 }, { "epoch": 2.33, "learning_rate": 4.965622612681437e-06, - "logits/chosen": -2.8075308799743652, - "logits/rejected": -2.7614858150482178, - "logps/chosen": -285.82550048828125, - "logps/rejected": -258.968994140625, - "loss": 0.3651, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.7141876220703125, - "rewards/margins": 1.7358572483062744, - "rewards/rejected": -2.450045108795166, + "logits/chosen": -2.2969937324523926, + "logits/rejected": -2.2662580013275146, + "logps/chosen": -327.1737060546875, + "logps/rejected": -242.359619140625, + "loss": 0.0343, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.986081600189209, + "rewards/margins": 10.588550567626953, + "rewards/rejected": -16.57463264465332, "step": 1130 }, { "epoch": 2.35, "learning_rate": 4.812834224598931e-06, - "logits/chosen": -2.8139889240264893, - "logits/rejected": -2.8023717403411865, - "logps/chosen": -304.9195251464844, - "logps/rejected": -261.23919677734375, - "loss": 0.3568, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.511367678642273, - "rewards/margins": 1.4749786853790283, - "rewards/rejected": -1.9863464832305908, + "logits/chosen": -2.3156192302703857, + "logits/rejected": -2.268474578857422, + "logps/chosen": -317.2707824707031, + "logps/rejected": -234.99734497070312, + "loss": 0.0266, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.135073661804199, + "rewards/margins": 10.014872550964355, + "rewards/rejected": -16.149948120117188, "step": 1140 }, { "epoch": 2.37, "learning_rate": 4.660045836516425e-06, - "logits/chosen": -2.7972934246063232, - "logits/rejected": -2.7768425941467285, - "logps/chosen": -277.6763916015625, - "logps/rejected": -251.0581817626953, - "loss": 0.3389, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.7810333967208862, - "rewards/margins": 1.623008370399475, - "rewards/rejected": -2.4040417671203613, + "logits/chosen": -2.362359046936035, + "logits/rejected": -2.3042564392089844, + "logps/chosen": -341.3941650390625, + "logps/rejected": -235.54507446289062, + "loss": 0.0371, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.8703131675720215, + "rewards/margins": 10.015909194946289, + "rewards/rejected": -15.886222839355469, "step": 1150 }, { "epoch": 2.39, "learning_rate": 4.5072574484339196e-06, - "logits/chosen": -2.842956781387329, - "logits/rejected": -2.8352267742156982, - "logps/chosen": -246.30056762695312, - "logps/rejected": -273.0400390625, - "loss": 0.3685, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6348632574081421, - "rewards/margins": 1.7184776067733765, - "rewards/rejected": -2.3533406257629395, + "logits/chosen": -2.2984766960144043, + "logits/rejected": -2.271493434906006, + "logps/chosen": -332.32489013671875, + "logps/rejected": -233.0104522705078, + "loss": 0.0332, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.160719394683838, + "rewards/margins": 9.819355010986328, + "rewards/rejected": -15.980074882507324, "step": 1160 }, { "epoch": 2.41, "learning_rate": 4.354469060351414e-06, - "logits/chosen": -2.801945447921753, - "logits/rejected": -2.791245937347412, - "logps/chosen": -279.83709716796875, - "logps/rejected": -284.6191711425781, - "loss": 0.3482, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.7146093249320984, - "rewards/margins": 1.7736566066741943, - "rewards/rejected": -2.4882664680480957, + "logits/chosen": -2.336785316467285, + "logits/rejected": -2.2462456226348877, + "logps/chosen": -321.7502746582031, + "logps/rejected": -244.50613403320312, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7888078689575195, + "rewards/margins": 9.919248580932617, + "rewards/rejected": -15.70805549621582, "step": 1170 }, { "epoch": 2.43, "learning_rate": 4.201680672268908e-06, - "logits/chosen": -2.752314329147339, - "logits/rejected": -2.751218318939209, - "logps/chosen": -285.19122314453125, - "logps/rejected": -262.41162109375, - "loss": 0.3726, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.7081433534622192, - "rewards/margins": 1.631823182106018, - "rewards/rejected": -2.339966297149658, + "logits/chosen": -2.3283114433288574, + "logits/rejected": -2.3021600246429443, + "logps/chosen": -352.926513671875, + "logps/rejected": -241.5557098388672, + "loss": 0.0268, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.513485908508301, + "rewards/margins": 10.350736618041992, + "rewards/rejected": -15.864221572875977, "step": 1180 }, { "epoch": 2.45, "learning_rate": 4.048892284186402e-06, - "logits/chosen": -2.7520546913146973, - "logits/rejected": -2.706789493560791, - "logps/chosen": -274.19903564453125, - "logps/rejected": -271.12530517578125, - "loss": 0.37, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.5612797737121582, - "rewards/margins": 1.6494731903076172, - "rewards/rejected": -2.2107529640197754, + "logits/chosen": -2.353079080581665, + "logits/rejected": -2.2632689476013184, + "logps/chosen": -317.30316162109375, + "logps/rejected": -238.99789428710938, + "loss": 0.0341, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.315035343170166, + "rewards/margins": 10.289588928222656, + "rewards/rejected": -16.60462188720703, "step": 1190 }, { "epoch": 2.47, "learning_rate": 3.896103896103897e-06, - "logits/chosen": -2.8326988220214844, - "logits/rejected": -2.7695236206054688, - "logps/chosen": -272.9900817871094, - "logps/rejected": -257.9608154296875, - "loss": 0.3775, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.7353371381759644, - "rewards/margins": 1.5964215993881226, - "rewards/rejected": -2.331758737564087, + "logits/chosen": -2.347299337387085, + "logits/rejected": -2.2617671489715576, + "logps/chosen": -322.52716064453125, + "logps/rejected": -238.89794921875, + "loss": 0.0291, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.98498010635376, + "rewards/margins": 10.374175071716309, + "rewards/rejected": -16.359155654907227, "step": 1200 }, { "epoch": 2.49, "learning_rate": 3.7433155080213907e-06, - "logits/chosen": -2.7996091842651367, - "logits/rejected": -2.8057217597961426, - "logps/chosen": -290.14324951171875, - "logps/rejected": -257.6935119628906, - "loss": 0.3824, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.5703123807907104, - "rewards/margins": 1.7026172876358032, - "rewards/rejected": -2.2729296684265137, + "logits/chosen": -2.3382067680358887, + "logits/rejected": -2.318660259246826, + "logps/chosen": -314.554443359375, + "logps/rejected": -241.98046875, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.864360332489014, + "rewards/margins": 10.600669860839844, + "rewards/rejected": -16.465030670166016, "step": 1210 }, { "epoch": 2.52, "learning_rate": 3.5905271199388848e-06, - "logits/chosen": -2.834031343460083, - "logits/rejected": -2.8236374855041504, - "logps/chosen": -279.32598876953125, - "logps/rejected": -250.78292846679688, - "loss": 0.376, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.5887583494186401, - "rewards/margins": 1.6018693447113037, - "rewards/rejected": -2.1906275749206543, + "logits/chosen": -2.33272123336792, + "logits/rejected": -2.2567524909973145, + "logps/chosen": -329.4327392578125, + "logps/rejected": -241.38650512695312, + "loss": 0.026, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.32674503326416, + "rewards/margins": 9.740303039550781, + "rewards/rejected": -16.067049026489258, "step": 1220 }, { "epoch": 2.54, "learning_rate": 3.4377387318563792e-06, - "logits/chosen": -2.8245434761047363, - "logits/rejected": -2.852121353149414, - "logps/chosen": -323.82940673828125, - "logps/rejected": -270.3510437011719, - "loss": 0.3703, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.7247987985610962, - "rewards/margins": 1.512142539024353, - "rewards/rejected": -2.236941337585449, + "logits/chosen": -2.304385185241699, + "logits/rejected": -2.242936849594116, + "logps/chosen": -350.80352783203125, + "logps/rejected": -257.4071350097656, + "loss": 0.0303, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.671088218688965, + "rewards/margins": 10.733926773071289, + "rewards/rejected": -17.40501594543457, "step": 1230 }, { "epoch": 2.56, "learning_rate": 3.2849503437738733e-06, - "logits/chosen": -2.842057466506958, - "logits/rejected": -2.855686902999878, - "logps/chosen": -275.85418701171875, - "logps/rejected": -251.3174591064453, - "loss": 0.3878, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.4698792099952698, - "rewards/margins": 1.7577711343765259, - "rewards/rejected": -2.2276504039764404, + "logits/chosen": -2.3363845348358154, + "logits/rejected": -2.2699573040008545, + "logps/chosen": -340.39910888671875, + "logps/rejected": -249.5596160888672, + "loss": 0.0352, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.956596851348877, + "rewards/margins": 10.645536422729492, + "rewards/rejected": -16.60213279724121, "step": 1240 }, { "epoch": 2.58, "learning_rate": 3.1321619556913678e-06, - "logits/chosen": -2.842163562774658, - "logits/rejected": -2.8198578357696533, - "logps/chosen": -293.8838195800781, - "logps/rejected": -260.05194091796875, - "loss": 0.3441, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.48028483986854553, - "rewards/margins": 1.8665335178375244, - "rewards/rejected": -2.346818447113037, + "logits/chosen": -2.289301633834839, + "logits/rejected": -2.259625196456909, + "logps/chosen": -358.3861083984375, + "logps/rejected": -251.97900390625, + "loss": 0.0381, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.069993019104004, + "rewards/margins": 10.830602645874023, + "rewards/rejected": -16.90059471130371, "step": 1250 }, { "epoch": 2.6, "learning_rate": 2.979373567608862e-06, - "logits/chosen": -2.835662364959717, - "logits/rejected": -2.8839526176452637, - "logps/chosen": -257.24334716796875, - "logps/rejected": -238.25588989257812, - "loss": 0.368, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.6586586236953735, - "rewards/margins": 1.7249507904052734, - "rewards/rejected": -2.3836092948913574, + "logits/chosen": -2.3595917224884033, + "logits/rejected": -2.281625270843506, + "logps/chosen": -336.3616638183594, + "logps/rejected": -242.55996704101562, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.833378791809082, + "rewards/margins": 10.298261642456055, + "rewards/rejected": -16.131641387939453, "step": 1260 }, { "epoch": 2.62, "learning_rate": 2.826585179526356e-06, - "logits/chosen": -2.7938365936279297, - "logits/rejected": -2.767805576324463, - "logps/chosen": -249.7235107421875, - "logps/rejected": -234.31283569335938, - "loss": 0.369, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7289739847183228, - "rewards/margins": 1.7214456796646118, - "rewards/rejected": -2.4504194259643555, + "logits/chosen": -2.313523530960083, + "logits/rejected": -2.234619617462158, + "logps/chosen": -321.42913818359375, + "logps/rejected": -222.7084197998047, + "loss": 0.0296, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.917761325836182, + "rewards/margins": 9.67473030090332, + "rewards/rejected": -15.592493057250977, "step": 1270 }, { "epoch": 2.64, "learning_rate": 2.673796791443851e-06, - "logits/chosen": -2.7239506244659424, - "logits/rejected": -2.751171588897705, - "logps/chosen": -329.4594421386719, - "logps/rejected": -260.408935546875, - "loss": 0.3679, - "rewards/accuracies": 0.90625, - "rewards/chosen": -0.6046456694602966, - "rewards/margins": 2.0207152366638184, - "rewards/rejected": -2.6253609657287598, + "logits/chosen": -2.324735164642334, + "logits/rejected": -2.2597193717956543, + "logps/chosen": -338.4864501953125, + "logps/rejected": -254.4070281982422, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.023530006408691, + "rewards/margins": 11.340107917785645, + "rewards/rejected": -17.363636016845703, "step": 1280 }, { "epoch": 2.66, "learning_rate": 2.521008403361345e-06, - "logits/chosen": -2.797870397567749, - "logits/rejected": -2.8259756565093994, - "logps/chosen": -305.6458435058594, - "logps/rejected": -244.69601440429688, - "loss": 0.3668, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.6542243957519531, - "rewards/margins": 1.900058388710022, - "rewards/rejected": -2.5542826652526855, + "logits/chosen": -2.2998228073120117, + "logits/rejected": -2.2498550415039062, + "logps/chosen": -314.4098815917969, + "logps/rejected": -228.45303344726562, + "loss": 0.0318, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -5.6659369468688965, + "rewards/margins": 9.735329627990723, + "rewards/rejected": -15.401266098022461, "step": 1290 }, { "epoch": 2.68, "learning_rate": 2.368220015278839e-06, - "logits/chosen": -2.7966253757476807, - "logits/rejected": -2.8371598720550537, - "logps/chosen": -281.93939208984375, - "logps/rejected": -261.60150146484375, - "loss": 0.3348, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.5757613182067871, - "rewards/margins": 1.6483405828475952, - "rewards/rejected": -2.224102020263672, + "logits/chosen": -2.33604097366333, + "logits/rejected": -2.2575902938842773, + "logps/chosen": -367.1761474609375, + "logps/rejected": -240.1140594482422, + "loss": 0.034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.646463394165039, + "rewards/margins": 10.95536994934082, + "rewards/rejected": -16.60183334350586, "step": 1300 }, { "epoch": 2.7, "learning_rate": 2.2154316271963334e-06, - "logits/chosen": -2.8347060680389404, - "logits/rejected": -2.818382501602173, - "logps/chosen": -266.34979248046875, - "logps/rejected": -248.8013153076172, - "loss": 0.3448, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.5766314268112183, - "rewards/margins": 1.5551669597625732, - "rewards/rejected": -2.131798505783081, + "logits/chosen": -2.279996871948242, + "logits/rejected": -2.220372200012207, + "logps/chosen": -315.04205322265625, + "logps/rejected": -238.9733123779297, + "loss": 0.0338, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.325307369232178, + "rewards/margins": 10.017477035522461, + "rewards/rejected": -16.342784881591797, "step": 1310 }, { "epoch": 2.72, "learning_rate": 2.0626432391138275e-06, - "logits/chosen": -2.780651569366455, - "logits/rejected": -2.7655837535858154, - "logps/chosen": -317.9383544921875, - "logps/rejected": -282.8804931640625, - "loss": 0.3784, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.5049678683280945, - "rewards/margins": 1.9375810623168945, - "rewards/rejected": -2.442549228668213, + "logits/chosen": -2.3077378273010254, + "logits/rejected": -2.2532095909118652, + "logps/chosen": -306.40997314453125, + "logps/rejected": -244.31295776367188, + "loss": 0.0358, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.671900272369385, + "rewards/margins": 9.690256118774414, + "rewards/rejected": -16.36215591430664, "step": 1320 }, { "epoch": 2.74, "learning_rate": 1.9098548510313215e-06, - "logits/chosen": -2.8032174110412598, - "logits/rejected": -2.7949814796447754, - "logps/chosen": -260.9715576171875, - "logps/rejected": -260.29913330078125, - "loss": 0.3628, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6280083060264587, - "rewards/margins": 1.837847113609314, - "rewards/rejected": -2.465855360031128, + "logits/chosen": -2.3265862464904785, + "logits/rejected": -2.26108980178833, + "logps/chosen": -316.51409912109375, + "logps/rejected": -244.35745239257812, + "loss": 0.0299, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.494316577911377, + "rewards/margins": 11.21363353729248, + "rewards/rejected": -16.707950592041016, "step": 1330 }, { "epoch": 2.76, "learning_rate": 1.757066462948816e-06, - "logits/chosen": -2.8185904026031494, - "logits/rejected": -2.7385153770446777, - "logps/chosen": -278.1485290527344, - "logps/rejected": -278.1205139160156, - "loss": 0.3636, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.6104816198348999, - "rewards/margins": 2.0218350887298584, - "rewards/rejected": -2.6323161125183105, + "logits/chosen": -2.313035011291504, + "logits/rejected": -2.2793192863464355, + "logps/chosen": -319.83758544921875, + "logps/rejected": -245.69088745117188, + "loss": 0.0341, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.988718032836914, + "rewards/margins": 10.558996200561523, + "rewards/rejected": -16.547714233398438, "step": 1340 }, { "epoch": 2.78, "learning_rate": 1.6042780748663103e-06, - "logits/chosen": -2.8218092918395996, - "logits/rejected": -2.7967371940612793, - "logps/chosen": -297.7966613769531, - "logps/rejected": -240.7419891357422, - "loss": 0.3763, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.5741168260574341, - "rewards/margins": 1.761366605758667, - "rewards/rejected": -2.3354835510253906, + "logits/chosen": -2.2976434230804443, + "logits/rejected": -2.2259607315063477, + "logps/chosen": -338.54229736328125, + "logps/rejected": -253.5747528076172, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.011918067932129, + "rewards/margins": 10.417112350463867, + "rewards/rejected": -17.429031372070312, "step": 1350 }, { "epoch": 2.8, "learning_rate": 1.4514896867838045e-06, - "logits/chosen": -2.8747620582580566, - "logits/rejected": -2.818427562713623, - "logps/chosen": -280.43756103515625, - "logps/rejected": -234.6200714111328, - "loss": 0.3859, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.5531826019287109, - "rewards/margins": 1.6504430770874023, - "rewards/rejected": -2.2036256790161133, + "logits/chosen": -2.3067193031311035, + "logits/rejected": -2.264526844024658, + "logps/chosen": -356.2128601074219, + "logps/rejected": -264.0433044433594, + "loss": 0.0272, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.317415237426758, + "rewards/margins": 11.136034965515137, + "rewards/rejected": -17.45345115661621, "step": 1360 }, { "epoch": 2.82, "learning_rate": 1.2987012987012986e-06, - "logits/chosen": -2.8544864654541016, - "logits/rejected": -2.8276607990264893, - "logps/chosen": -298.3465881347656, - "logps/rejected": -288.8724365234375, - "loss": 0.3595, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.5492029786109924, - "rewards/margins": 1.664731740951538, - "rewards/rejected": -2.2139344215393066, + "logits/chosen": -2.2918033599853516, + "logits/rejected": -2.221585512161255, + "logps/chosen": -358.04180908203125, + "logps/rejected": -262.490478515625, + "loss": 0.0298, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.549307346343994, + "rewards/margins": 11.43887996673584, + "rewards/rejected": -17.98818588256836, "step": 1370 }, { "epoch": 2.85, "learning_rate": 1.145912910618793e-06, - "logits/chosen": -2.7500030994415283, - "logits/rejected": -2.7478601932525635, - "logps/chosen": -245.6803741455078, - "logps/rejected": -244.2843475341797, - "loss": 0.3457, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.5498741269111633, - "rewards/margins": 1.8164262771606445, - "rewards/rejected": -2.366300344467163, + "logits/chosen": -2.2997758388519287, + "logits/rejected": -2.217319965362549, + "logps/chosen": -385.3619384765625, + "logps/rejected": -264.56927490234375, + "loss": 0.0247, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.197786808013916, + "rewards/margins": 11.309389114379883, + "rewards/rejected": -17.50717544555664, "step": 1380 }, { "epoch": 2.87, "learning_rate": 9.931245225362874e-07, - "logits/chosen": -2.78765869140625, - "logits/rejected": -2.7899107933044434, - "logps/chosen": -269.9845886230469, - "logps/rejected": -260.02557373046875, - "loss": 0.366, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.5584810376167297, - "rewards/margins": 1.836942434310913, - "rewards/rejected": -2.395423412322998, + "logits/chosen": -2.2803447246551514, + "logits/rejected": -2.218745708465576, + "logps/chosen": -334.13092041015625, + "logps/rejected": -247.9724578857422, + "loss": 0.0343, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.724381446838379, + "rewards/margins": 10.408391952514648, + "rewards/rejected": -17.13277244567871, "step": 1390 }, { "epoch": 2.89, "learning_rate": 8.403361344537816e-07, - "logits/chosen": -2.822734832763672, - "logits/rejected": -2.7889413833618164, - "logps/chosen": -295.0702819824219, - "logps/rejected": -263.552978515625, - "loss": 0.3664, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.6814947128295898, - "rewards/margins": 1.600311040878296, - "rewards/rejected": -2.2818057537078857, + "logits/chosen": -2.2882351875305176, + "logits/rejected": -2.1976969242095947, + "logps/chosen": -343.0736083984375, + "logps/rejected": -260.0362854003906, + "loss": 0.0316, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.6999921798706055, + "rewards/margins": 10.926143646240234, + "rewards/rejected": -17.626136779785156, "step": 1400 }, { "epoch": 2.91, "learning_rate": 6.875477463712758e-07, - "logits/chosen": -2.7872085571289062, - "logits/rejected": -2.813737392425537, - "logps/chosen": -271.7468566894531, - "logps/rejected": -254.4673309326172, - "loss": 0.3564, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.44644269347190857, - "rewards/margins": 1.852097511291504, - "rewards/rejected": -2.298539876937866, + "logits/chosen": -2.273560047149658, + "logits/rejected": -2.230538845062256, + "logps/chosen": -368.9632873535156, + "logps/rejected": -273.01080322265625, + "loss": 0.0258, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.649250030517578, + "rewards/margins": 11.343961715698242, + "rewards/rejected": -17.993213653564453, "step": 1410 }, { "epoch": 2.93, "learning_rate": 5.347593582887701e-07, - "logits/chosen": -2.8677477836608887, - "logits/rejected": -2.8729989528656006, - "logps/chosen": -321.89447021484375, - "logps/rejected": -266.3446350097656, - "loss": 0.3638, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.6643561124801636, - "rewards/margins": 1.5856568813323975, - "rewards/rejected": -2.2500128746032715, + "logits/chosen": -2.295637845993042, + "logits/rejected": -2.2116260528564453, + "logps/chosen": -355.9484558105469, + "logps/rejected": -262.65966796875, + "loss": 0.0361, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.842223167419434, + "rewards/margins": 11.27800178527832, + "rewards/rejected": -18.12022590637207, "step": 1420 }, { "epoch": 2.95, "learning_rate": 3.819709702062643e-07, - "logits/chosen": -2.8617660999298096, - "logits/rejected": -2.8304145336151123, - "logps/chosen": -289.00482177734375, - "logps/rejected": -255.83004760742188, - "loss": 0.3575, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.6355488300323486, - "rewards/margins": 1.5412448644638062, - "rewards/rejected": -2.1767935752868652, + "logits/chosen": -2.2705752849578857, + "logits/rejected": -2.191991090774536, + "logps/chosen": -316.24200439453125, + "logps/rejected": -252.88345336914062, + "loss": 0.032, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.750401973724365, + "rewards/margins": 11.157169342041016, + "rewards/rejected": -17.907573699951172, "step": 1430 }, { "epoch": 2.97, "learning_rate": 2.2918258212375862e-07, - "logits/chosen": -2.813324451446533, - "logits/rejected": -2.822589635848999, - "logps/chosen": -270.77374267578125, - "logps/rejected": -270.3390808105469, - "loss": 0.3543, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6036959886550903, - "rewards/margins": 1.9311864376068115, - "rewards/rejected": -2.5348825454711914, + "logits/chosen": -2.272953748703003, + "logits/rejected": -2.1898062229156494, + "logps/chosen": -342.50482177734375, + "logps/rejected": -259.5894775390625, + "loss": 0.025, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.047539710998535, + "rewards/margins": 10.942033767700195, + "rewards/rejected": -17.989574432373047, "step": 1440 }, { "epoch": 2.99, "learning_rate": 7.639419404125288e-08, - "logits/chosen": -2.8171768188476562, - "logits/rejected": -2.779967784881592, - "logps/chosen": -303.8172912597656, - "logps/rejected": -260.3236389160156, - "loss": 0.3549, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.6331970691680908, - "rewards/margins": 1.8079423904418945, - "rewards/rejected": -2.4411392211914062, + "logits/chosen": -2.346297025680542, + "logits/rejected": -2.2220358848571777, + "logps/chosen": -386.37823486328125, + "logps/rejected": -261.1239929199219, + "loss": 0.0238, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.26646614074707, + "rewards/margins": 11.493976593017578, + "rewards/rejected": -17.76044273376465, "step": 1450 }, { "epoch": 3.0, - "eval_logits/chosen": -2.842160701751709, - "eval_logits/rejected": -2.833141326904297, - "eval_logps/chosen": -279.5245056152344, - "eval_logps/rejected": -246.69915771484375, - "eval_loss": 0.45531293749809265, - "eval_rewards/accuracies": 0.8359375, - "eval_rewards/chosen": -0.5876308083534241, - "eval_rewards/margins": 1.6034575700759888, - "eval_rewards/rejected": -2.1910881996154785, - "eval_runtime": 254.2995, - "eval_samples_per_second": 7.865, - "eval_steps_per_second": 0.063, + "eval_logits/chosen": -2.3036882877349854, + "eval_logits/rejected": -2.218935012817383, + "eval_logps/chosen": -341.342041015625, + "eval_logps/rejected": -273.77117919921875, + "eval_loss": 1.0562912225723267, + "eval_rewards/accuracies": 0.72265625, + "eval_rewards/chosen": -6.750503063201904, + "eval_rewards/margins": 3.3230087757110596, + "eval_rewards/rejected": -10.07351303100586, + "eval_runtime": 258.6775, + "eval_samples_per_second": 7.732, + "eval_steps_per_second": 0.062, "step": 1455 }, { "epoch": 3.0, "step": 1455, "total_flos": 0.0, - "train_loss": 0.43281792414557074, - "train_runtime": 46468.4841, - "train_samples_per_second": 4.001, + "train_loss": 0.07034083745375122, + "train_runtime": 46831.0549, + "train_samples_per_second": 3.976, "train_steps_per_second": 0.031 } ],