diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,7021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.517799352750809, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -2.0985755920410156, + "logits/rejected": -1.9598942995071411, + "logps/chosen": -282.9971618652344, + "logps/rejected": -239.9343719482422, + "loss": 0.6951, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008073234930634499, + "rewards/margins": -0.0036141639575362206, + "rewards/rejected": -0.004459070973098278, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -2.0734448432922363, + "logits/rejected": -2.004692316055298, + "logps/chosen": -277.91009521484375, + "logps/rejected": -271.27777099609375, + "loss": 0.694, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0035435440950095654, + "rewards/margins": -0.0011311531998217106, + "rewards/rejected": 0.004674696363508701, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": -2.2034449577331543, + "logits/rejected": -2.15450382232666, + "logps/chosen": -272.84222412109375, + "logps/rejected": -296.7918701171875, + "loss": 0.6936, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011506916023790836, + "rewards/margins": -0.0003993515856564045, + "rewards/rejected": 0.011906265281140804, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.265322685241699, + "logits/rejected": -2.2147812843322754, + "logps/chosen": -371.73626708984375, + "logps/rejected": -411.03802490234375, + "loss": 0.6923, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00851450115442276, + "rewards/margins": 0.0020747678354382515, + "rewards/rejected": 0.0064397333189845085, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.5e-06, + "logits/chosen": -2.126523971557617, + "logits/rejected": -2.257094383239746, + "logps/chosen": -232.20245361328125, + "logps/rejected": -273.501953125, + "loss": 0.6971, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.012802458368241787, + "rewards/margins": -0.007561136037111282, + "rewards/rejected": -0.005241322796791792, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 3e-06, + "logits/chosen": -2.3206775188446045, + "logits/rejected": -2.236947774887085, + "logps/chosen": -282.0924072265625, + "logps/rejected": -330.6745910644531, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012886857613921165, + "rewards/margins": 0.0036454680375754833, + "rewards/rejected": 0.00924139004200697, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 3.5000000000000004e-06, + "logits/chosen": -2.068502426147461, + "logits/rejected": -2.1196892261505127, + "logps/chosen": -270.6734313964844, + "logps/rejected": -337.42877197265625, + "loss": 0.6981, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01318061351776123, + "rewards/margins": -0.009621287696063519, + "rewards/rejected": -0.003559327684342861, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.100869655609131, + "logits/rejected": -2.2541885375976562, + "logps/chosen": -310.18951416015625, + "logps/rejected": -404.02984619140625, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0011783126974478364, + "rewards/margins": 0.0021168000530451536, + "rewards/rejected": -0.0009384873555973172, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 4.5e-06, + "logits/chosen": -2.0913190841674805, + "logits/rejected": -2.1440823078155518, + "logps/chosen": -293.1015625, + "logps/rejected": -351.9281005859375, + "loss": 0.694, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0038339614402502775, + "rewards/margins": -0.0011605499312281609, + "rewards/rejected": 0.004994511604309082, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "logits/chosen": -2.306243419647217, + "logits/rejected": -2.3045802116394043, + "logps/chosen": -386.077392578125, + "logps/rejected": -367.2294921875, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0069270143285393715, + "rewards/margins": 0.010493995621800423, + "rewards/rejected": -0.01742100901901722, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -2.136928081512451, + "logits/rejected": -2.2102103233337402, + "logps/chosen": -302.4460754394531, + "logps/rejected": -337.15631103515625, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006688881199806929, + "rewards/margins": 0.004262590315192938, + "rewards/rejected": -0.010951472446322441, + "step": 11 + }, + { + "epoch": 0.01, + "learning_rate": 6e-06, + "logits/chosen": -2.064110279083252, + "logits/rejected": -2.200680732727051, + "logps/chosen": -320.602294921875, + "logps/rejected": -369.6840515136719, + "loss": 0.6988, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0003773693460971117, + "rewards/margins": -0.0108009809628129, + "rewards/rejected": 0.011178349144756794, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -2.348961353302002, + "logits/rejected": -2.3161768913269043, + "logps/chosen": -400.9246826171875, + "logps/rejected": -472.69769287109375, + "loss": 0.6845, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0075957290828228, + "rewards/margins": 0.017815779894590378, + "rewards/rejected": -0.010220050811767578, + "step": 13 + }, + { + "epoch": 0.01, + "learning_rate": 7.000000000000001e-06, + "logits/chosen": -2.412529468536377, + "logits/rejected": -2.233689785003662, + "logps/chosen": -422.3519287109375, + "logps/rejected": -368.8478088378906, + "loss": 0.6941, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0051600453443825245, + "rewards/margins": -0.0017115597147494555, + "rewards/rejected": 0.006871605291962624, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 7.5e-06, + "logits/chosen": -2.0220694541931152, + "logits/rejected": -2.0454282760620117, + "logps/chosen": -315.8769836425781, + "logps/rejected": -314.20269775390625, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.372652620077133e-05, + "rewards/margins": -0.00020327605307102203, + "rewards/rejected": 0.000159549992531538, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.0351786613464355, + "logits/rejected": -1.877925992012024, + "logps/chosen": -383.6861572265625, + "logps/rejected": -276.95050048828125, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005419491790235043, + "rewards/margins": -0.005987692158669233, + "rewards/rejected": 0.005445742513984442, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 8.500000000000002e-06, + "logits/chosen": -2.0652737617492676, + "logits/rejected": -2.2235422134399414, + "logps/chosen": -343.7192077636719, + "logps/rejected": -360.5483703613281, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007448338903486729, + "rewards/margins": 0.015540864318609238, + "rewards/rejected": -0.00809252168983221, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 9e-06, + "logits/chosen": -2.2705795764923096, + "logits/rejected": -2.159031867980957, + "logps/chosen": -335.9582824707031, + "logps/rejected": -337.2821044921875, + "loss": 0.6976, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.010970616713166237, + "rewards/margins": -0.008028840646147728, + "rewards/rejected": -0.0029417751356959343, + "step": 18 + }, + { + "epoch": 0.02, + "learning_rate": 9.5e-06, + "logits/chosen": -1.9825160503387451, + "logits/rejected": -2.1957268714904785, + "logps/chosen": -275.072509765625, + "logps/rejected": -364.77215576171875, + "loss": 0.6922, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0009894607355818152, + "rewards/margins": 0.002422523219138384, + "rewards/rejected": -0.0034119843039661646, + "step": 19 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "logits/chosen": -2.0461435317993164, + "logits/rejected": -2.114314556121826, + "logps/chosen": -280.7231140136719, + "logps/rejected": -340.37872314453125, + "loss": 0.6928, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0057319882325828075, + "rewards/margins": 0.0008945947047322989, + "rewards/rejected": 0.004837393760681152, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 1.05e-05, + "logits/chosen": -2.2721879482269287, + "logits/rejected": -2.31955885887146, + "logps/chosen": -350.15765380859375, + "logps/rejected": -346.72943115234375, + "loss": 0.6961, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01111826952546835, + "rewards/margins": -0.005553150083869696, + "rewards/rejected": -0.0055651189759373665, + "step": 21 + }, + { + "epoch": 0.02, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -2.297549247741699, + "logits/rejected": -2.258702516555786, + "logps/chosen": -354.07342529296875, + "logps/rejected": -381.96453857421875, + "loss": 0.7021, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010824179276823997, + "rewards/margins": -0.01675737090408802, + "rewards/rejected": 0.005933189764618874, + "step": 22 + }, + { + "epoch": 0.02, + "learning_rate": 1.1500000000000002e-05, + "logits/chosen": -2.095402240753174, + "logits/rejected": -2.2284393310546875, + "logps/chosen": -367.1632385253906, + "logps/rejected": -395.11016845703125, + "loss": 0.6846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00539558008313179, + "rewards/margins": 0.018006421625614166, + "rewards/rejected": -0.023401999846100807, + "step": 23 + }, + { + "epoch": 0.02, + "learning_rate": 1.2e-05, + "logits/chosen": -2.058290719985962, + "logits/rejected": -2.142444133758545, + "logps/chosen": -208.247314453125, + "logps/rejected": -261.3697204589844, + "loss": 0.6851, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0028132672887295485, + "rewards/margins": 0.01700596883893013, + "rewards/rejected": -0.0198192335665226, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "logits/chosen": -1.9966950416564941, + "logits/rejected": -2.0200271606445312, + "logps/chosen": -390.6391906738281, + "logps/rejected": -335.07867431640625, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008169196546077728, + "rewards/margins": 0.007835723459720612, + "rewards/rejected": -0.01600492000579834, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -2.1417901515960693, + "logits/rejected": -2.102891683578491, + "logps/chosen": -334.50872802734375, + "logps/rejected": -364.5052795410156, + "loss": 0.688, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.019382527098059654, + "rewards/margins": 0.01085577066987753, + "rewards/rejected": -0.03023829497396946, + "step": 26 + }, + { + "epoch": 0.03, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -2.243198871612549, + "logits/rejected": -2.3117640018463135, + "logps/chosen": -343.4504699707031, + "logps/rejected": -356.1696472167969, + "loss": 0.7073, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03033299744129181, + "rewards/margins": -0.027142930775880814, + "rewards/rejected": -0.0031900645699352026, + "step": 27 + }, + { + "epoch": 0.03, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -2.0717036724090576, + "logits/rejected": -2.2167444229125977, + "logps/chosen": -270.87353515625, + "logps/rejected": -327.87640380859375, + "loss": 0.6986, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.032239750027656555, + "rewards/margins": -0.010378909297287464, + "rewards/rejected": -0.021860837936401367, + "step": 28 + }, + { + "epoch": 0.03, + "learning_rate": 1.45e-05, + "logits/chosen": -2.2460880279541016, + "logits/rejected": -2.2309041023254395, + "logps/chosen": -435.2510986328125, + "logps/rejected": -461.74859619140625, + "loss": 0.6958, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.018374010920524597, + "rewards/margins": -0.004351234529167414, + "rewards/rejected": -0.01402278058230877, + "step": 29 + }, + { + "epoch": 0.03, + "learning_rate": 1.5e-05, + "logits/chosen": -2.228811740875244, + "logits/rejected": -2.293030261993408, + "logps/chosen": -382.5074768066406, + "logps/rejected": -371.8452453613281, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004188417922705412, + "rewards/margins": 0.01916835457086563, + "rewards/rejected": -0.014979935251176357, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 1.55e-05, + "logits/chosen": -1.9169931411743164, + "logits/rejected": -2.1672849655151367, + "logps/chosen": -270.2207946777344, + "logps/rejected": -279.6405944824219, + "loss": 0.6917, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002476263325661421, + "rewards/margins": 0.003316545393317938, + "rewards/rejected": -0.0008402818348258734, + "step": 31 + }, + { + "epoch": 0.03, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.349118232727051, + "logits/rejected": -2.260673999786377, + "logps/chosen": -375.1436462402344, + "logps/rejected": -423.8641357421875, + "loss": 0.6963, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.018851473927497864, + "rewards/margins": -0.004983377177268267, + "rewards/rejected": -0.013868091627955437, + "step": 32 + }, + { + "epoch": 0.03, + "learning_rate": 1.65e-05, + "logits/chosen": -2.253589630126953, + "logits/rejected": -2.3596742153167725, + "logps/chosen": -454.2779846191406, + "logps/rejected": -503.1221008300781, + "loss": 0.6843, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004412556067109108, + "rewards/margins": 0.02018122747540474, + "rewards/rejected": -0.024593783542513847, + "step": 33 + }, + { + "epoch": 0.04, + "learning_rate": 1.7000000000000003e-05, + "logits/chosen": -2.041515588760376, + "logits/rejected": -2.1577486991882324, + "logps/chosen": -338.1881103515625, + "logps/rejected": -387.30023193359375, + "loss": 0.6887, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.011815071105957031, + "rewards/margins": 0.009705852717161179, + "rewards/rejected": -0.02152092382311821, + "step": 34 + }, + { + "epoch": 0.04, + "learning_rate": 1.75e-05, + "logits/chosen": -2.2442219257354736, + "logits/rejected": -2.3945209980010986, + "logps/chosen": -293.7795715332031, + "logps/rejected": -396.3448181152344, + "loss": 0.686, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.004475045017898083, + "rewards/margins": 0.015541339293122292, + "rewards/rejected": -0.0200163833796978, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 1.8e-05, + "logits/chosen": -2.0793919563293457, + "logits/rejected": -2.1432011127471924, + "logps/chosen": -322.59869384765625, + "logps/rejected": -437.3168029785156, + "loss": 0.6776, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.006924772635102272, + "rewards/margins": 0.03214583545923233, + "rewards/rejected": -0.02522106282413006, + "step": 36 + }, + { + "epoch": 0.04, + "learning_rate": 1.85e-05, + "logits/chosen": -2.155754804611206, + "logits/rejected": -2.2335851192474365, + "logps/chosen": -295.4434814453125, + "logps/rejected": -359.8757019042969, + "loss": 0.7043, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023294713348150253, + "rewards/margins": -0.02007477357983589, + "rewards/rejected": -0.0032199383713304996, + "step": 37 + }, + { + "epoch": 0.04, + "learning_rate": 1.9e-05, + "logits/chosen": -2.2595698833465576, + "logits/rejected": -2.064648151397705, + "logps/chosen": -310.83416748046875, + "logps/rejected": -272.6891784667969, + "loss": 0.6706, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003548526205122471, + "rewards/margins": 0.058505721390247345, + "rewards/rejected": -0.06205424666404724, + "step": 38 + }, + { + "epoch": 0.04, + "learning_rate": 1.9500000000000003e-05, + "logits/chosen": -2.247586250305176, + "logits/rejected": -2.2411181926727295, + "logps/chosen": -410.89801025390625, + "logps/rejected": -397.5184631347656, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.023591995239257812, + "rewards/margins": 0.0011774520389735699, + "rewards/rejected": -0.024769451469182968, + "step": 39 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "logits/chosen": -2.0857701301574707, + "logits/rejected": -2.3974575996398926, + "logps/chosen": -298.82373046875, + "logps/rejected": -322.1784973144531, + "loss": 0.6681, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.012818144634366035, + "rewards/margins": 0.05216258019208908, + "rewards/rejected": -0.0393444299697876, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 2.05e-05, + "logits/chosen": -2.0326108932495117, + "logits/rejected": -2.021967887878418, + "logps/chosen": -322.1849365234375, + "logps/rejected": -286.97393798828125, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03254096582531929, + "rewards/margins": 0.01258254237473011, + "rewards/rejected": -0.04512350261211395, + "step": 41 + }, + { + "epoch": 0.04, + "learning_rate": 2.1e-05, + "logits/chosen": -2.1398110389709473, + "logits/rejected": -2.1065280437469482, + "logps/chosen": -315.64190673828125, + "logps/rejected": -273.958740234375, + "loss": 0.6843, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06157093122601509, + "rewards/margins": 0.055743757635354996, + "rewards/rejected": -0.11731469631195068, + "step": 42 + }, + { + "epoch": 0.04, + "learning_rate": 2.15e-05, + "logits/chosen": -2.027090549468994, + "logits/rejected": -2.1431775093078613, + "logps/chosen": -328.4605407714844, + "logps/rejected": -336.6142272949219, + "loss": 0.6785, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002649687696248293, + "rewards/margins": 0.030248070135712624, + "rewards/rejected": -0.02759838104248047, + "step": 43 + }, + { + "epoch": 0.05, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -2.1685423851013184, + "logits/rejected": -2.3537817001342773, + "logps/chosen": -280.93017578125, + "logps/rejected": -430.02398681640625, + "loss": 0.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009914875030517578, + "rewards/margins": 0.031635358929634094, + "rewards/rejected": -0.04155023396015167, + "step": 44 + }, + { + "epoch": 0.05, + "learning_rate": 2.25e-05, + "logits/chosen": -2.026183605194092, + "logits/rejected": -2.2760047912597656, + "logps/chosen": -327.8951416015625, + "logps/rejected": -326.314697265625, + "loss": 0.6854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003114223014563322, + "rewards/margins": 0.017296195030212402, + "rewards/rejected": -0.020410416647791862, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 2.3000000000000003e-05, + "logits/chosen": -2.2608642578125, + "logits/rejected": -2.2612316608428955, + "logps/chosen": -313.09063720703125, + "logps/rejected": -299.6613464355469, + "loss": 0.6969, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0639270767569542, + "rewards/margins": -0.005310798529535532, + "rewards/rejected": -0.0586162805557251, + "step": 46 + }, + { + "epoch": 0.05, + "learning_rate": 2.35e-05, + "logits/chosen": -2.0354762077331543, + "logits/rejected": -2.2385144233703613, + "logps/chosen": -361.6187744140625, + "logps/rejected": -416.43804931640625, + "loss": 0.6855, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0550750270485878, + "rewards/margins": 0.021474791690707207, + "rewards/rejected": -0.07654982060194016, + "step": 47 + }, + { + "epoch": 0.05, + "learning_rate": 2.4e-05, + "logits/chosen": -2.1052396297454834, + "logits/rejected": -2.1244544982910156, + "logps/chosen": -401.3201904296875, + "logps/rejected": -394.404052734375, + "loss": 0.6713, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04391036182641983, + "rewards/margins": 0.048601724207401276, + "rewards/rejected": -0.0925120860338211, + "step": 48 + }, + { + "epoch": 0.05, + "learning_rate": 2.45e-05, + "logits/chosen": -2.325778007507324, + "logits/rejected": -2.1452741622924805, + "logps/chosen": -359.43255615234375, + "logps/rejected": -393.9988708496094, + "loss": 0.6997, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03692295402288437, + "rewards/margins": -0.005499981343746185, + "rewards/rejected": -0.031422972679138184, + "step": 49 + }, + { + "epoch": 0.05, + "learning_rate": 2.5e-05, + "logits/chosen": -2.0367650985717773, + "logits/rejected": -2.193774700164795, + "logps/chosen": -318.42266845703125, + "logps/rejected": -370.5718994140625, + "loss": 0.6619, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03530995920300484, + "rewards/margins": 0.06544995307922363, + "rewards/rejected": -0.10075991600751877, + "step": 50 + }, + { + "epoch": 0.05, + "learning_rate": 2.5500000000000003e-05, + "logits/chosen": -2.1022160053253174, + "logits/rejected": -2.0698752403259277, + "logps/chosen": -338.8807678222656, + "logps/rejected": -333.04351806640625, + "loss": 0.6707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04516604542732239, + "rewards/margins": 0.047384001314640045, + "rewards/rejected": -0.09255003929138184, + "step": 51 + }, + { + "epoch": 0.05, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -2.1273610591888428, + "logits/rejected": -2.2891712188720703, + "logps/chosen": -307.1384582519531, + "logps/rejected": -314.24676513671875, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024128681048750877, + "rewards/margins": 0.040785644203424454, + "rewards/rejected": -0.06491431593894958, + "step": 52 + }, + { + "epoch": 0.05, + "learning_rate": 2.6500000000000004e-05, + "logits/chosen": -1.951267957687378, + "logits/rejected": -2.171773672103882, + "logps/chosen": -313.5145568847656, + "logps/rejected": -345.32452392578125, + "loss": 0.6764, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06882210075855255, + "rewards/margins": 0.04536902531981468, + "rewards/rejected": -0.11419112980365753, + "step": 53 + }, + { + "epoch": 0.06, + "learning_rate": 2.7000000000000002e-05, + "logits/chosen": -2.318854808807373, + "logits/rejected": -2.4153428077697754, + "logps/chosen": -407.51080322265625, + "logps/rejected": -407.7454833984375, + "loss": 0.6911, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14593330025672913, + "rewards/margins": 0.0101944450289011, + "rewards/rejected": -0.15612773597240448, + "step": 54 + }, + { + "epoch": 0.06, + "learning_rate": 2.7500000000000004e-05, + "logits/chosen": -2.251836061477661, + "logits/rejected": -2.0312557220458984, + "logps/chosen": -257.6128234863281, + "logps/rejected": -260.79705810546875, + "loss": 0.7024, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12117181718349457, + "rewards/margins": -0.015745995566248894, + "rewards/rejected": -0.10542581230401993, + "step": 55 + }, + { + "epoch": 0.06, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.3783812522888184, + "logits/rejected": -2.2356433868408203, + "logps/chosen": -322.89599609375, + "logps/rejected": -305.649658203125, + "loss": 0.7044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0736481174826622, + "rewards/margins": -0.007364703342318535, + "rewards/rejected": -0.06628341972827911, + "step": 56 + }, + { + "epoch": 0.06, + "learning_rate": 2.8499999999999998e-05, + "logits/chosen": -2.0815675258636475, + "logits/rejected": -1.8037395477294922, + "logps/chosen": -341.66839599609375, + "logps/rejected": -266.3909912109375, + "loss": 0.6625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.028729846701025963, + "rewards/margins": 0.06697390228509903, + "rewards/rejected": -0.09570374339818954, + "step": 57 + }, + { + "epoch": 0.06, + "learning_rate": 2.9e-05, + "logits/chosen": -2.123415946960449, + "logits/rejected": -2.154893398284912, + "logps/chosen": -304.90008544921875, + "logps/rejected": -325.108154296875, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11998450756072998, + "rewards/margins": 0.007698964327573776, + "rewards/rejected": -0.12768347561359406, + "step": 58 + }, + { + "epoch": 0.06, + "learning_rate": 2.95e-05, + "logits/chosen": -2.086604356765747, + "logits/rejected": -2.0624561309814453, + "logps/chosen": -294.25244140625, + "logps/rejected": -324.137939453125, + "loss": 0.676, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06285426020622253, + "rewards/margins": 0.06548047065734863, + "rewards/rejected": -0.12833473086357117, + "step": 59 + }, + { + "epoch": 0.06, + "learning_rate": 3e-05, + "logits/chosen": -2.2143869400024414, + "logits/rejected": -2.3379101753234863, + "logps/chosen": -283.2222900390625, + "logps/rejected": -320.791748046875, + "loss": 0.657, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06308362632989883, + "rewards/margins": 0.09713932871818542, + "rewards/rejected": -0.16022296249866486, + "step": 60 + }, + { + "epoch": 0.06, + "learning_rate": 3.05e-05, + "logits/chosen": -2.117171287536621, + "logits/rejected": -2.3281807899475098, + "logps/chosen": -299.3207702636719, + "logps/rejected": -364.2285461425781, + "loss": 0.7127, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.15081080794334412, + "rewards/margins": -0.02138001285493374, + "rewards/rejected": -0.12943080067634583, + "step": 61 + }, + { + "epoch": 0.06, + "learning_rate": 3.1e-05, + "logits/chosen": -1.974008321762085, + "logits/rejected": -2.116246223449707, + "logps/chosen": -263.60394287109375, + "logps/rejected": -367.9918212890625, + "loss": 0.6806, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1498243808746338, + "rewards/margins": 0.032494522631168365, + "rewards/rejected": -0.18231889605522156, + "step": 62 + }, + { + "epoch": 0.07, + "learning_rate": 3.15e-05, + "logits/chosen": -2.0844931602478027, + "logits/rejected": -2.224573850631714, + "logps/chosen": -280.8338928222656, + "logps/rejected": -339.91009521484375, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1308109611272812, + "rewards/margins": 0.016523031517863274, + "rewards/rejected": -0.14733397960662842, + "step": 63 + }, + { + "epoch": 0.07, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -2.2176733016967773, + "logits/rejected": -2.2587060928344727, + "logps/chosen": -252.23390197753906, + "logps/rejected": -273.116943359375, + "loss": 0.681, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08397925645112991, + "rewards/margins": 0.031244704499840736, + "rewards/rejected": -0.1152239516377449, + "step": 64 + }, + { + "epoch": 0.07, + "learning_rate": 3.2500000000000004e-05, + "logits/chosen": -2.1103031635284424, + "logits/rejected": -2.1916050910949707, + "logps/chosen": -253.43565368652344, + "logps/rejected": -299.1423645019531, + "loss": 0.6848, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11171821504831314, + "rewards/margins": 0.028400154784321785, + "rewards/rejected": -0.14011836051940918, + "step": 65 + }, + { + "epoch": 0.07, + "learning_rate": 3.3e-05, + "logits/chosen": -2.138770580291748, + "logits/rejected": -2.277637481689453, + "logps/chosen": -361.4617919921875, + "logps/rejected": -364.7515869140625, + "loss": 0.6562, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13355040550231934, + "rewards/margins": 0.10390853881835938, + "rewards/rejected": -0.23745892941951752, + "step": 66 + }, + { + "epoch": 0.07, + "learning_rate": 3.35e-05, + "logits/chosen": -2.256972312927246, + "logits/rejected": -2.155823230743408, + "logps/chosen": -385.70989990234375, + "logps/rejected": -342.7967529296875, + "loss": 0.6888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20078128576278687, + "rewards/margins": 0.030581658706068993, + "rewards/rejected": -0.2313629388809204, + "step": 67 + }, + { + "epoch": 0.07, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -2.1459546089172363, + "logits/rejected": -2.092371940612793, + "logps/chosen": -302.3841247558594, + "logps/rejected": -248.30654907226562, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13762035965919495, + "rewards/margins": 0.019860554486513138, + "rewards/rejected": -0.15748091042041779, + "step": 68 + }, + { + "epoch": 0.07, + "learning_rate": 3.45e-05, + "logits/chosen": -1.9295188188552856, + "logits/rejected": -2.1819865703582764, + "logps/chosen": -197.07498168945312, + "logps/rejected": -277.8927917480469, + "loss": 0.7116, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.21833539009094238, + "rewards/margins": -0.016281111165881157, + "rewards/rejected": -0.20205429196357727, + "step": 69 + }, + { + "epoch": 0.07, + "learning_rate": 3.5e-05, + "logits/chosen": -2.0420405864715576, + "logits/rejected": -2.095397710800171, + "logps/chosen": -440.4430236816406, + "logps/rejected": -404.73297119140625, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17053675651550293, + "rewards/margins": 0.025177769362926483, + "rewards/rejected": -0.19571453332901, + "step": 70 + }, + { + "epoch": 0.07, + "learning_rate": 3.55e-05, + "logits/chosen": -1.8378528356552124, + "logits/rejected": -2.188674211502075, + "logps/chosen": -271.0667724609375, + "logps/rejected": -360.34454345703125, + "loss": 0.6789, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08768844604492188, + "rewards/margins": 0.06176728755235672, + "rewards/rejected": -0.1494557410478592, + "step": 71 + }, + { + "epoch": 0.07, + "learning_rate": 3.6e-05, + "logits/chosen": -2.31483793258667, + "logits/rejected": -2.28462290763855, + "logps/chosen": -386.14312744140625, + "logps/rejected": -423.07281494140625, + "loss": 0.7184, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18145808577537537, + "rewards/margins": -0.015067771077156067, + "rewards/rejected": -0.1663903295993805, + "step": 72 + }, + { + "epoch": 0.08, + "learning_rate": 3.65e-05, + "logits/chosen": -2.0742580890655518, + "logits/rejected": -2.2153687477111816, + "logps/chosen": -255.38543701171875, + "logps/rejected": -290.1117248535156, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0764242559671402, + "rewards/margins": 0.01691494509577751, + "rewards/rejected": -0.0933392122387886, + "step": 73 + }, + { + "epoch": 0.08, + "learning_rate": 3.7e-05, + "logits/chosen": -2.0832855701446533, + "logits/rejected": -2.060133934020996, + "logps/chosen": -340.09197998046875, + "logps/rejected": -342.4574890136719, + "loss": 0.6582, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12881942093372345, + "rewards/margins": 0.10232281684875488, + "rewards/rejected": -0.23114225268363953, + "step": 74 + }, + { + "epoch": 0.08, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -2.045681953430176, + "logits/rejected": -2.041499137878418, + "logps/chosen": -399.8717956542969, + "logps/rejected": -384.0579528808594, + "loss": 0.6529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1383177787065506, + "rewards/margins": 0.0989096462726593, + "rewards/rejected": -0.2372274249792099, + "step": 75 + }, + { + "epoch": 0.08, + "learning_rate": 3.8e-05, + "logits/chosen": -2.0136969089508057, + "logits/rejected": -2.131852149963379, + "logps/chosen": -246.70101928710938, + "logps/rejected": -256.7235412597656, + "loss": 0.6548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20958754420280457, + "rewards/margins": 0.1072821319103241, + "rewards/rejected": -0.31686967611312866, + "step": 76 + }, + { + "epoch": 0.08, + "learning_rate": 3.85e-05, + "logits/chosen": -1.8942384719848633, + "logits/rejected": -1.7946527004241943, + "logps/chosen": -363.3137512207031, + "logps/rejected": -288.844482421875, + "loss": 0.7086, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2392420619726181, + "rewards/margins": -0.015016615390777588, + "rewards/rejected": -0.2242254614830017, + "step": 77 + }, + { + "epoch": 0.08, + "learning_rate": 3.9000000000000006e-05, + "logits/chosen": -2.284573793411255, + "logits/rejected": -2.2139668464660645, + "logps/chosen": -291.585205078125, + "logps/rejected": -311.04986572265625, + "loss": 0.6962, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.24729153513908386, + "rewards/margins": 0.011042074300348759, + "rewards/rejected": -0.25833362340927124, + "step": 78 + }, + { + "epoch": 0.08, + "learning_rate": 3.9500000000000005e-05, + "logits/chosen": -2.126842737197876, + "logits/rejected": -2.178943634033203, + "logps/chosen": -289.07049560546875, + "logps/rejected": -267.1867980957031, + "loss": 0.6765, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09771183133125305, + "rewards/margins": 0.04596526175737381, + "rewards/rejected": -0.14367708563804626, + "step": 79 + }, + { + "epoch": 0.08, + "learning_rate": 4e-05, + "logits/chosen": -1.9899426698684692, + "logits/rejected": -2.218869209289551, + "logps/chosen": -348.4210205078125, + "logps/rejected": -453.83306884765625, + "loss": 0.6183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17673715949058533, + "rewards/margins": 0.17254139482975006, + "rewards/rejected": -0.3492785692214966, + "step": 80 + }, + { + "epoch": 0.08, + "learning_rate": 4.05e-05, + "logits/chosen": -2.3394789695739746, + "logits/rejected": -2.3765251636505127, + "logps/chosen": -356.1443786621094, + "logps/rejected": -352.0921936035156, + "loss": 0.6923, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.30028024315834045, + "rewards/margins": 0.05001110956072807, + "rewards/rejected": -0.35029137134552, + "step": 81 + }, + { + "epoch": 0.08, + "learning_rate": 4.1e-05, + "logits/chosen": -2.136016607284546, + "logits/rejected": -2.1530215740203857, + "logps/chosen": -241.19703674316406, + "logps/rejected": -251.56166076660156, + "loss": 0.7461, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3470645248889923, + "rewards/margins": -0.07802443206310272, + "rewards/rejected": -0.2690401077270508, + "step": 82 + }, + { + "epoch": 0.09, + "learning_rate": 4.15e-05, + "logits/chosen": -2.1615593433380127, + "logits/rejected": -2.184434413909912, + "logps/chosen": -338.65057373046875, + "logps/rejected": -285.83734130859375, + "loss": 0.734, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3146396279335022, + "rewards/margins": -0.06066913902759552, + "rewards/rejected": -0.2539704740047455, + "step": 83 + }, + { + "epoch": 0.09, + "learning_rate": 4.2e-05, + "logits/chosen": -2.1108598709106445, + "logits/rejected": -2.383390426635742, + "logps/chosen": -285.7464599609375, + "logps/rejected": -336.65997314453125, + "loss": 0.6534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30576157569885254, + "rewards/margins": 0.13044245541095734, + "rewards/rejected": -0.43620407581329346, + "step": 84 + }, + { + "epoch": 0.09, + "learning_rate": 4.25e-05, + "logits/chosen": -2.185880422592163, + "logits/rejected": -2.2634472846984863, + "logps/chosen": -382.79132080078125, + "logps/rejected": -426.94805908203125, + "loss": 0.6376, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2730083465576172, + "rewards/margins": 0.15455365180969238, + "rewards/rejected": -0.4275619387626648, + "step": 85 + }, + { + "epoch": 0.09, + "learning_rate": 4.3e-05, + "logits/chosen": -2.200338363647461, + "logits/rejected": -2.3442678451538086, + "logps/chosen": -295.3491516113281, + "logps/rejected": -292.9728698730469, + "loss": 0.6193, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1620425134897232, + "rewards/margins": 0.1742367446422577, + "rewards/rejected": -0.3362792432308197, + "step": 86 + }, + { + "epoch": 0.09, + "learning_rate": 4.35e-05, + "logits/chosen": -2.085541248321533, + "logits/rejected": -2.1654982566833496, + "logps/chosen": -313.8997497558594, + "logps/rejected": -406.6915588378906, + "loss": 0.6755, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18983058631420135, + "rewards/margins": 0.09621434658765793, + "rewards/rejected": -0.2860449254512787, + "step": 87 + }, + { + "epoch": 0.09, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -2.0523123741149902, + "logits/rejected": -2.1804168224334717, + "logps/chosen": -278.9588928222656, + "logps/rejected": -358.6983642578125, + "loss": 0.7034, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.369529664516449, + "rewards/margins": 0.06673409789800644, + "rewards/rejected": -0.43626368045806885, + "step": 88 + }, + { + "epoch": 0.09, + "learning_rate": 4.4500000000000004e-05, + "logits/chosen": -1.9767179489135742, + "logits/rejected": -2.073478937149048, + "logps/chosen": -312.8780212402344, + "logps/rejected": -334.3851623535156, + "loss": 0.721, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41693034768104553, + "rewards/margins": -0.03711947053670883, + "rewards/rejected": -0.3798108398914337, + "step": 89 + }, + { + "epoch": 0.09, + "learning_rate": 4.5e-05, + "logits/chosen": -2.34897780418396, + "logits/rejected": -2.1732659339904785, + "logps/chosen": -365.05712890625, + "logps/rejected": -338.4190673828125, + "loss": 0.6986, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4183480143547058, + "rewards/margins": 0.03731951862573624, + "rewards/rejected": -0.45566752552986145, + "step": 90 + }, + { + "epoch": 0.09, + "learning_rate": 4.55e-05, + "logits/chosen": -2.2497520446777344, + "logits/rejected": -2.2454562187194824, + "logps/chosen": -326.4650573730469, + "logps/rejected": -414.0813903808594, + "loss": 0.6715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4516112208366394, + "rewards/margins": 0.08267778903245926, + "rewards/rejected": -0.5342890024185181, + "step": 91 + }, + { + "epoch": 0.1, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -2.0850160121917725, + "logits/rejected": -2.2057738304138184, + "logps/chosen": -356.4827575683594, + "logps/rejected": -372.5768127441406, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45258861780166626, + "rewards/margins": 0.37770116329193115, + "rewards/rejected": -0.8302898406982422, + "step": 92 + }, + { + "epoch": 0.1, + "learning_rate": 4.6500000000000005e-05, + "logits/chosen": -2.3158974647521973, + "logits/rejected": -2.1951651573181152, + "logps/chosen": -314.29595947265625, + "logps/rejected": -327.9600524902344, + "loss": 0.8837, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.591391384601593, + "rewards/margins": -0.23991218209266663, + "rewards/rejected": -0.3514792025089264, + "step": 93 + }, + { + "epoch": 0.1, + "learning_rate": 4.7e-05, + "logits/chosen": -2.166097640991211, + "logits/rejected": -2.2614049911499023, + "logps/chosen": -359.9385986328125, + "logps/rejected": -416.6658020019531, + "loss": 0.6978, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5130646824836731, + "rewards/margins": 0.03734045475721359, + "rewards/rejected": -0.5504050850868225, + "step": 94 + }, + { + "epoch": 0.1, + "learning_rate": 4.75e-05, + "logits/chosen": -2.368900775909424, + "logits/rejected": -2.303640365600586, + "logps/chosen": -379.1260986328125, + "logps/rejected": -359.70281982421875, + "loss": 0.7422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5878180265426636, + "rewards/margins": -0.014551635831594467, + "rewards/rejected": -0.5732664465904236, + "step": 95 + }, + { + "epoch": 0.1, + "learning_rate": 4.8e-05, + "logits/chosen": -1.9750244617462158, + "logits/rejected": -2.13342547416687, + "logps/chosen": -309.6315002441406, + "logps/rejected": -290.8506164550781, + "loss": 0.706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5889778733253479, + "rewards/margins": 0.024408388882875443, + "rewards/rejected": -0.6133862733840942, + "step": 96 + }, + { + "epoch": 0.1, + "learning_rate": 4.85e-05, + "logits/chosen": -2.128643035888672, + "logits/rejected": -2.0525574684143066, + "logps/chosen": -413.3954772949219, + "logps/rejected": -345.6094055175781, + "loss": 0.7972, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7196497917175293, + "rewards/margins": -0.1434842050075531, + "rewards/rejected": -0.5761655569076538, + "step": 97 + }, + { + "epoch": 0.1, + "learning_rate": 4.9e-05, + "logits/chosen": -2.2135891914367676, + "logits/rejected": -2.2574219703674316, + "logps/chosen": -370.8169250488281, + "logps/rejected": -344.8519592285156, + "loss": 0.7296, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5168295502662659, + "rewards/margins": -0.034182578325271606, + "rewards/rejected": -0.48264697194099426, + "step": 98 + }, + { + "epoch": 0.1, + "learning_rate": 4.9500000000000004e-05, + "logits/chosen": -2.0821609497070312, + "logits/rejected": -2.0011813640594482, + "logps/chosen": -249.1553955078125, + "logps/rejected": -292.9418029785156, + "loss": 0.6621, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31434231996536255, + "rewards/margins": 0.10270004719495773, + "rewards/rejected": -0.41704243421554565, + "step": 99 + }, + { + "epoch": 0.1, + "learning_rate": 5e-05, + "logits/chosen": -2.0994839668273926, + "logits/rejected": -2.1740150451660156, + "logps/chosen": -377.63885498046875, + "logps/rejected": -479.538818359375, + "loss": 0.6675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.41925686597824097, + "rewards/margins": 0.06821132451295853, + "rewards/rejected": -0.4874681532382965, + "step": 100 + }, + { + "epoch": 0.1, + "learning_rate": 4.999983511654996e-05, + "logits/chosen": -2.200211524963379, + "logits/rejected": -2.1883296966552734, + "logps/chosen": -384.2947998046875, + "logps/rejected": -448.62957763671875, + "loss": 0.6957, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5274229049682617, + "rewards/margins": 0.025556959211826324, + "rewards/rejected": -0.5529798865318298, + "step": 101 + }, + { + "epoch": 0.11, + "learning_rate": 4.9999340468374787e-05, + "logits/chosen": -2.13332462310791, + "logits/rejected": -2.218052387237549, + "logps/chosen": -311.7938537597656, + "logps/rejected": -272.7942199707031, + "loss": 0.6103, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3114677965641022, + "rewards/margins": 0.20093847811222076, + "rewards/rejected": -0.5124062895774841, + "step": 102 + }, + { + "epoch": 0.11, + "learning_rate": 4.99985160619992e-05, + "logits/chosen": -2.151207447052002, + "logits/rejected": -2.073565721511841, + "logps/chosen": -324.60223388671875, + "logps/rejected": -350.4017028808594, + "loss": 0.6223, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4631275534629822, + "rewards/margins": 0.19337055087089539, + "rewards/rejected": -0.6564981341362, + "step": 103 + }, + { + "epoch": 0.11, + "learning_rate": 4.99973619082977e-05, + "logits/chosen": -2.077446699142456, + "logits/rejected": -2.1615357398986816, + "logps/chosen": -347.063232421875, + "logps/rejected": -355.1671142578125, + "loss": 0.6216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.387503445148468, + "rewards/margins": 0.18972592055797577, + "rewards/rejected": -0.577229380607605, + "step": 104 + }, + { + "epoch": 0.11, + "learning_rate": 4.9995878022494335e-05, + "logits/chosen": -2.173962354660034, + "logits/rejected": -2.1823246479034424, + "logps/chosen": -378.25152587890625, + "logps/rejected": -369.76983642578125, + "loss": 0.7478, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5083476901054382, + "rewards/margins": -0.091352179646492, + "rewards/rejected": -0.41699549555778503, + "step": 105 + }, + { + "epoch": 0.11, + "learning_rate": 4.9994064424162575e-05, + "logits/chosen": -2.2684097290039062, + "logits/rejected": -2.341747760772705, + "logps/chosen": -422.6853332519531, + "logps/rejected": -422.57330322265625, + "loss": 0.6335, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5471891760826111, + "rewards/margins": 0.16060353815555573, + "rewards/rejected": -0.7077926993370056, + "step": 106 + }, + { + "epoch": 0.11, + "learning_rate": 4.9991921137225e-05, + "logits/chosen": -2.0649406909942627, + "logits/rejected": -2.0103793144226074, + "logps/chosen": -398.6349182128906, + "logps/rejected": -321.7645263671875, + "loss": 0.7777, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5084972381591797, + "rewards/margins": -0.11370338499546051, + "rewards/rejected": -0.39479386806488037, + "step": 107 + }, + { + "epoch": 0.11, + "learning_rate": 4.998944818995302e-05, + "logits/chosen": -2.1160833835601807, + "logits/rejected": -2.2650928497314453, + "logps/chosen": -319.59326171875, + "logps/rejected": -399.0347900390625, + "loss": 0.6419, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46814414858818054, + "rewards/margins": 0.12615980207920074, + "rewards/rejected": -0.5943039059638977, + "step": 108 + }, + { + "epoch": 0.11, + "learning_rate": 4.998664561496647e-05, + "logits/chosen": -2.0261175632476807, + "logits/rejected": -2.01719331741333, + "logps/chosen": -352.3540344238281, + "logps/rejected": -404.3323669433594, + "loss": 0.5917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5237185955047607, + "rewards/margins": 0.2665461599826813, + "rewards/rejected": -0.7902647256851196, + "step": 109 + }, + { + "epoch": 0.11, + "learning_rate": 4.998351344923322e-05, + "logits/chosen": -2.1265554428100586, + "logits/rejected": -2.188615322113037, + "logps/chosen": -362.6216125488281, + "logps/rejected": -339.3355407714844, + "loss": 0.7484, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6888318657875061, + "rewards/margins": -0.090579092502594, + "rewards/rejected": -0.5982527732849121, + "step": 110 + }, + { + "epoch": 0.11, + "learning_rate": 4.998005173406865e-05, + "logits/chosen": -2.379279136657715, + "logits/rejected": -2.329848527908325, + "logps/chosen": -325.26385498046875, + "logps/rejected": -324.79656982421875, + "loss": 0.7852, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8365094065666199, + "rewards/margins": -0.1392899453639984, + "rewards/rejected": -0.6972194910049438, + "step": 111 + }, + { + "epoch": 0.12, + "learning_rate": 4.997626051513512e-05, + "logits/chosen": -2.126467227935791, + "logits/rejected": -2.1980350017547607, + "logps/chosen": -376.349609375, + "logps/rejected": -436.8643798828125, + "loss": 0.6107, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.49128276109695435, + "rewards/margins": 0.21428784728050232, + "rewards/rejected": -0.705570638179779, + "step": 112 + }, + { + "epoch": 0.12, + "learning_rate": 4.997213984244138e-05, + "logits/chosen": -2.1278860569000244, + "logits/rejected": -2.218194007873535, + "logps/chosen": -235.55670166015625, + "logps/rejected": -324.1790466308594, + "loss": 0.6201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6700268387794495, + "rewards/margins": 0.23135630786418915, + "rewards/rejected": -0.9013831615447998, + "step": 113 + }, + { + "epoch": 0.12, + "learning_rate": 4.996768977034188e-05, + "logits/chosen": -2.236452579498291, + "logits/rejected": -2.300806999206543, + "logps/chosen": -308.35467529296875, + "logps/rejected": -374.0247497558594, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6465363502502441, + "rewards/margins": 0.05209742486476898, + "rewards/rejected": -0.6986337900161743, + "step": 114 + }, + { + "epoch": 0.12, + "learning_rate": 4.996291035753608e-05, + "logits/chosen": -2.234069347381592, + "logits/rejected": -2.244938611984253, + "logps/chosen": -533.840576171875, + "logps/rejected": -479.3876953125, + "loss": 0.644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6587188243865967, + "rewards/margins": 0.14327090978622437, + "rewards/rejected": -0.801989734172821, + "step": 115 + }, + { + "epoch": 0.12, + "learning_rate": 4.995780166706767e-05, + "logits/chosen": -2.2996180057525635, + "logits/rejected": -2.1304776668548584, + "logps/chosen": -336.5658264160156, + "logps/rejected": -290.9165954589844, + "loss": 0.6859, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6112417578697205, + "rewards/margins": 0.08546795696020126, + "rewards/rejected": -0.6967097520828247, + "step": 116 + }, + { + "epoch": 0.12, + "learning_rate": 4.995236376632373e-05, + "logits/chosen": -2.143672466278076, + "logits/rejected": -2.0870895385742188, + "logps/chosen": -285.63385009765625, + "logps/rejected": -269.767822265625, + "loss": 0.6417, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6124986410140991, + "rewards/margins": 0.17768412828445435, + "rewards/rejected": -0.7901827096939087, + "step": 117 + }, + { + "epoch": 0.12, + "learning_rate": 4.994659672703383e-05, + "logits/chosen": -2.0322999954223633, + "logits/rejected": -2.1645522117614746, + "logps/chosen": -294.4024963378906, + "logps/rejected": -434.4449768066406, + "loss": 0.5761, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6845369338989258, + "rewards/margins": 0.3629693388938904, + "rewards/rejected": -1.047506332397461, + "step": 118 + }, + { + "epoch": 0.12, + "learning_rate": 4.994050062526915e-05, + "logits/chosen": -2.268840789794922, + "logits/rejected": -2.210455894470215, + "logps/chosen": -454.2048034667969, + "logps/rejected": -401.5218200683594, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7424222826957703, + "rewards/margins": 0.03279120847582817, + "rewards/rejected": -0.7752134799957275, + "step": 119 + }, + { + "epoch": 0.12, + "learning_rate": 4.993407554144136e-05, + "logits/chosen": -2.1254310607910156, + "logits/rejected": -2.2457275390625, + "logps/chosen": -263.01318359375, + "logps/rejected": -279.0079040527344, + "loss": 0.6246, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8383140563964844, + "rewards/margins": 0.3438203036785126, + "rewards/rejected": -1.1821343898773193, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 4.9927321560301686e-05, + "logits/chosen": -1.9596202373504639, + "logits/rejected": -1.9732022285461426, + "logps/chosen": -333.146240234375, + "logps/rejected": -317.2575988769531, + "loss": 0.6247, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6822391152381897, + "rewards/margins": 0.18031413853168488, + "rewards/rejected": -0.8625531196594238, + "step": 121 + }, + { + "epoch": 0.13, + "learning_rate": 4.992023877093969e-05, + "logits/chosen": -2.2412517070770264, + "logits/rejected": -2.2476582527160645, + "logps/chosen": -270.1220703125, + "logps/rejected": -297.88116455078125, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1133708953857422, + "rewards/margins": 0.07820569723844528, + "rewards/rejected": -1.1915764808654785, + "step": 122 + }, + { + "epoch": 0.13, + "learning_rate": 4.991282726678215e-05, + "logits/chosen": -2.1082093715667725, + "logits/rejected": -2.260173797607422, + "logps/chosen": -342.9111633300781, + "logps/rejected": -425.59710693359375, + "loss": 0.5823, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1128913164138794, + "rewards/margins": 0.2950645089149475, + "rewards/rejected": -1.4079556465148926, + "step": 123 + }, + { + "epoch": 0.13, + "learning_rate": 4.990508714559182e-05, + "logits/chosen": -1.9671399593353271, + "logits/rejected": -2.216414213180542, + "logps/chosen": -371.3886413574219, + "logps/rejected": -412.00323486328125, + "loss": 0.513, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6249299049377441, + "rewards/margins": 0.6542832255363464, + "rewards/rejected": -2.2792131900787354, + "step": 124 + }, + { + "epoch": 0.13, + "learning_rate": 4.989701850946613e-05, + "logits/chosen": -1.9954829216003418, + "logits/rejected": -2.0708837509155273, + "logps/chosen": -309.5416259765625, + "logps/rejected": -369.55902099609375, + "loss": 0.6247, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3325786590576172, + "rewards/margins": 0.2878706455230713, + "rewards/rejected": -1.6204493045806885, + "step": 125 + }, + { + "epoch": 0.13, + "learning_rate": 4.988862146483585e-05, + "logits/chosen": -2.04398775100708, + "logits/rejected": -2.3061468601226807, + "logps/chosen": -311.40283203125, + "logps/rejected": -330.2856750488281, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9415925741195679, + "rewards/margins": 0.5770285725593567, + "rewards/rejected": -1.5186210870742798, + "step": 126 + }, + { + "epoch": 0.13, + "learning_rate": 4.987989612246368e-05, + "logits/chosen": -2.1247596740722656, + "logits/rejected": -2.293691396713257, + "logps/chosen": -415.2500915527344, + "logps/rejected": -361.0720520019531, + "loss": 0.5442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9840108752250671, + "rewards/margins": 0.4814227521419525, + "rewards/rejected": -1.4654337167739868, + "step": 127 + }, + { + "epoch": 0.13, + "learning_rate": 4.9870842597442755e-05, + "logits/chosen": -2.21590518951416, + "logits/rejected": -2.1943631172180176, + "logps/chosen": -387.32720947265625, + "logps/rejected": -422.17059326171875, + "loss": 0.493, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0035918951034546, + "rewards/margins": 0.6130063533782959, + "rewards/rejected": -1.61659836769104, + "step": 128 + }, + { + "epoch": 0.13, + "learning_rate": 4.9861461009195224e-05, + "logits/chosen": -2.2312891483306885, + "logits/rejected": -2.3044235706329346, + "logps/chosen": -297.28729248046875, + "logps/rejected": -300.91070556640625, + "loss": 0.8, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2972065210342407, + "rewards/margins": -0.13482597470283508, + "rewards/rejected": -1.162380576133728, + "step": 129 + }, + { + "epoch": 0.13, + "learning_rate": 4.9851751481470565e-05, + "logits/chosen": -2.3871798515319824, + "logits/rejected": -2.3541009426116943, + "logps/chosen": -389.8529357910156, + "logps/rejected": -395.540283203125, + "loss": 0.7455, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.4040565490722656, + "rewards/margins": -0.02014276757836342, + "rewards/rejected": -1.3839137554168701, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 4.984171414234401e-05, + "logits/chosen": -2.3224058151245117, + "logits/rejected": -2.5203804969787598, + "logps/chosen": -278.0612487792969, + "logps/rejected": -292.171142578125, + "loss": 0.698, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2112613916397095, + "rewards/margins": 0.22594159841537476, + "rewards/rejected": -1.437203049659729, + "step": 131 + }, + { + "epoch": 0.14, + "learning_rate": 4.983134912421485e-05, + "logits/chosen": -2.1884591579437256, + "logits/rejected": -2.0368237495422363, + "logps/chosen": -277.64117431640625, + "logps/rejected": -261.66094970703125, + "loss": 0.5748, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.172703742980957, + "rewards/margins": 0.33002516627311707, + "rewards/rejected": -1.502728819847107, + "step": 132 + }, + { + "epoch": 0.14, + "learning_rate": 4.982065656380468e-05, + "logits/chosen": -2.079421281814575, + "logits/rejected": -2.2217986583709717, + "logps/chosen": -295.58087158203125, + "logps/rejected": -291.7632751464844, + "loss": 0.5565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9564344882965088, + "rewards/margins": 0.4054575562477112, + "rewards/rejected": -1.3618921041488647, + "step": 133 + }, + { + "epoch": 0.14, + "learning_rate": 4.9809636602155604e-05, + "logits/chosen": -2.1835222244262695, + "logits/rejected": -2.2144436836242676, + "logps/chosen": -248.64321899414062, + "logps/rejected": -231.23239135742188, + "loss": 0.6353, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2899762392044067, + "rewards/margins": 0.29608777165412903, + "rewards/rejected": -1.586064100265503, + "step": 134 + }, + { + "epoch": 0.14, + "learning_rate": 4.9798289384628355e-05, + "logits/chosen": -2.047929048538208, + "logits/rejected": -2.020115852355957, + "logps/chosen": -270.7432556152344, + "logps/rejected": -295.75714111328125, + "loss": 0.7271, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.225740671157837, + "rewards/margins": 0.14127963781356812, + "rewards/rejected": -1.3670202493667603, + "step": 135 + }, + { + "epoch": 0.14, + "learning_rate": 4.978661506090042e-05, + "logits/chosen": -2.268289089202881, + "logits/rejected": -2.264258623123169, + "logps/chosen": -335.73406982421875, + "logps/rejected": -326.88641357421875, + "loss": 0.8802, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.066948890686035, + "rewards/margins": -0.09082351624965668, + "rewards/rejected": -1.9761252403259277, + "step": 136 + }, + { + "epoch": 0.14, + "learning_rate": 4.9774613784964e-05, + "logits/chosen": -2.366272449493408, + "logits/rejected": -2.413400888442993, + "logps/chosen": -275.4363708496094, + "logps/rejected": -316.3116149902344, + "loss": 0.705, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7615418434143066, + "rewards/margins": 0.17158068716526031, + "rewards/rejected": -1.9331226348876953, + "step": 137 + }, + { + "epoch": 0.14, + "learning_rate": 4.9762285715124054e-05, + "logits/chosen": -2.370572328567505, + "logits/rejected": -2.273383617401123, + "logps/chosen": -342.66046142578125, + "logps/rejected": -399.414794921875, + "loss": 1.0301, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.7256594896316528, + "rewards/margins": -0.3764263093471527, + "rewards/rejected": -1.3492331504821777, + "step": 138 + }, + { + "epoch": 0.14, + "learning_rate": 4.974963101399614e-05, + "logits/chosen": -2.196343421936035, + "logits/rejected": -2.460721969604492, + "logps/chosen": -255.3898162841797, + "logps/rejected": -318.3489990234375, + "loss": 0.5858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9483575820922852, + "rewards/margins": 0.5218918919563293, + "rewards/rejected": -1.4702494144439697, + "step": 139 + }, + { + "epoch": 0.14, + "learning_rate": 4.973664984850435e-05, + "logits/chosen": -2.305603265762329, + "logits/rejected": -2.2728540897369385, + "logps/chosen": -351.310791015625, + "logps/rejected": -322.8355712890625, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.272456169128418, + "rewards/margins": 0.04384595528244972, + "rewards/rejected": -1.3163020610809326, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 4.9723342389879e-05, + "logits/chosen": -2.463696241378784, + "logits/rejected": -2.424192190170288, + "logps/chosen": -487.9200439453125, + "logps/rejected": -463.71844482421875, + "loss": 0.5906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.156785011291504, + "rewards/margins": 0.42006048560142517, + "rewards/rejected": -1.5768455266952515, + "step": 141 + }, + { + "epoch": 0.15, + "learning_rate": 4.970970881365449e-05, + "logits/chosen": -2.1991195678710938, + "logits/rejected": -2.2735817432403564, + "logps/chosen": -333.13214111328125, + "logps/rejected": -371.25213623046875, + "loss": 0.59, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0519524812698364, + "rewards/margins": 0.3118639290332794, + "rewards/rejected": -1.363816261291504, + "step": 142 + }, + { + "epoch": 0.15, + "learning_rate": 4.9695749299666894e-05, + "logits/chosen": -2.0732052326202393, + "logits/rejected": -2.0948421955108643, + "logps/chosen": -355.09381103515625, + "logps/rejected": -370.39007568359375, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0995197296142578, + "rewards/margins": 0.12801453471183777, + "rewards/rejected": -1.227534294128418, + "step": 143 + }, + { + "epoch": 0.15, + "learning_rate": 4.9681464032051635e-05, + "logits/chosen": -2.281567335128784, + "logits/rejected": -2.19057559967041, + "logps/chosen": -407.2377624511719, + "logps/rejected": -354.5672302246094, + "loss": 0.8131, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.355703592300415, + "rewards/margins": -0.11926855146884918, + "rewards/rejected": -1.2364351749420166, + "step": 144 + }, + { + "epoch": 0.15, + "learning_rate": 4.966685319924106e-05, + "logits/chosen": -2.3482041358947754, + "logits/rejected": -2.267275094985962, + "logps/chosen": -445.82208251953125, + "logps/rejected": -458.0115051269531, + "loss": 0.7112, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0608644485473633, + "rewards/margins": 0.03552216291427612, + "rewards/rejected": -1.0963865518569946, + "step": 145 + }, + { + "epoch": 0.15, + "learning_rate": 4.965191699396191e-05, + "logits/chosen": -2.1460695266723633, + "logits/rejected": -2.340147018432617, + "logps/chosen": -305.9909362792969, + "logps/rejected": -318.31256103515625, + "loss": 0.7068, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9504708647727966, + "rewards/margins": 0.06828048825263977, + "rewards/rejected": -1.0187513828277588, + "step": 146 + }, + { + "epoch": 0.15, + "learning_rate": 4.963665561323286e-05, + "logits/chosen": -2.2726097106933594, + "logits/rejected": -2.2365224361419678, + "logps/chosen": -287.2446594238281, + "logps/rejected": -313.90203857421875, + "loss": 0.8693, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1069557666778564, + "rewards/margins": -0.2487190216779709, + "rewards/rejected": -0.8582366704940796, + "step": 147 + }, + { + "epoch": 0.15, + "learning_rate": 4.962106925836183e-05, + "logits/chosen": -2.1673455238342285, + "logits/rejected": -2.1659748554229736, + "logps/chosen": -363.5993957519531, + "logps/rejected": -345.0345153808594, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8967840075492859, + "rewards/margins": 0.05857213959097862, + "rewards/rejected": -0.9553561210632324, + "step": 148 + }, + { + "epoch": 0.15, + "learning_rate": 4.9605158134943356e-05, + "logits/chosen": -2.167635679244995, + "logits/rejected": -2.10685396194458, + "logps/chosen": -278.79949951171875, + "logps/rejected": -243.8157501220703, + "loss": 0.8424, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8083875179290771, + "rewards/margins": -0.2297360599040985, + "rewards/rejected": -0.5786514282226562, + "step": 149 + }, + { + "epoch": 0.16, + "learning_rate": 4.9588922452855935e-05, + "logits/chosen": -1.9295530319213867, + "logits/rejected": -2.084747314453125, + "logps/chosen": -350.78887939453125, + "logps/rejected": -416.31549072265625, + "loss": 0.6965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.61838698387146, + "rewards/margins": 0.09959676116704941, + "rewards/rejected": -0.7179837226867676, + "step": 150 + }, + { + "epoch": 0.16, + "learning_rate": 4.9572362426259176e-05, + "logits/chosen": -2.1817588806152344, + "logits/rejected": -2.141252040863037, + "logps/chosen": -347.57403564453125, + "logps/rejected": -378.94281005859375, + "loss": 0.6127, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6715837121009827, + "rewards/margins": 0.36849769949913025, + "rewards/rejected": -1.04008150100708, + "step": 151 + }, + { + "epoch": 0.16, + "learning_rate": 4.955547827359103e-05, + "logits/chosen": -2.249030590057373, + "logits/rejected": -1.901309609413147, + "logps/chosen": -358.05859375, + "logps/rejected": -263.93365478515625, + "loss": 0.7291, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6999510526657104, + "rewards/margins": -0.013744346797466278, + "rewards/rejected": -0.6862066984176636, + "step": 152 + }, + { + "epoch": 0.16, + "learning_rate": 4.953827021756489e-05, + "logits/chosen": -1.9777555465698242, + "logits/rejected": -1.9771215915679932, + "logps/chosen": -374.28192138671875, + "logps/rejected": -446.4751892089844, + "loss": 0.7191, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6933267116546631, + "rewards/margins": 0.10663817822933197, + "rewards/rejected": -0.7999648451805115, + "step": 153 + }, + { + "epoch": 0.16, + "learning_rate": 4.952073848516663e-05, + "logits/chosen": -2.353224515914917, + "logits/rejected": -2.316944122314453, + "logps/chosen": -409.6300048828125, + "logps/rejected": -406.71124267578125, + "loss": 0.7778, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6479594707489014, + "rewards/margins": -0.08428651094436646, + "rewards/rejected": -0.5636729598045349, + "step": 154 + }, + { + "epoch": 0.16, + "learning_rate": 4.9502883307651674e-05, + "logits/chosen": -1.9488294124603271, + "logits/rejected": -1.9255703687667847, + "logps/chosen": -289.2491760253906, + "logps/rejected": -412.5265808105469, + "loss": 0.5643, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4605676531791687, + "rewards/margins": 0.313413143157959, + "rewards/rejected": -0.7739807367324829, + "step": 155 + }, + { + "epoch": 0.16, + "learning_rate": 4.9484704920541856e-05, + "logits/chosen": -1.9965554475784302, + "logits/rejected": -2.217256546020508, + "logps/chosen": -285.490966796875, + "logps/rejected": -356.04559326171875, + "loss": 0.7259, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5536916851997375, + "rewards/margins": 0.03260548785328865, + "rewards/rejected": -0.5862972140312195, + "step": 156 + }, + { + "epoch": 0.16, + "learning_rate": 4.9466203563622424e-05, + "logits/chosen": -2.1669509410858154, + "logits/rejected": -2.293706178665161, + "logps/chosen": -394.7683410644531, + "logps/rejected": -457.5030822753906, + "loss": 0.7323, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6224880814552307, + "rewards/margins": 0.012505665421485901, + "rewards/rejected": -0.6349937915802002, + "step": 157 + }, + { + "epoch": 0.16, + "learning_rate": 4.944737948093876e-05, + "logits/chosen": -1.9717931747436523, + "logits/rejected": -2.068146228790283, + "logps/chosen": -258.3878173828125, + "logps/rejected": -263.28363037109375, + "loss": 0.6178, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25086289644241333, + "rewards/margins": 0.19410811364650726, + "rewards/rejected": -0.4449709951877594, + "step": 158 + }, + { + "epoch": 0.16, + "learning_rate": 4.942823292079325e-05, + "logits/chosen": -2.1289565563201904, + "logits/rejected": -2.1196882724761963, + "logps/chosen": -301.2019958496094, + "logps/rejected": -265.33685302734375, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6575217843055725, + "rewards/margins": 0.06726698577404022, + "rewards/rejected": -0.7247887849807739, + "step": 159 + }, + { + "epoch": 0.17, + "learning_rate": 4.940876413574195e-05, + "logits/chosen": -2.016998291015625, + "logits/rejected": -2.271897077560425, + "logps/chosen": -302.09454345703125, + "logps/rejected": -424.992919921875, + "loss": 0.7552, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4835960865020752, + "rewards/margins": -0.06482114642858505, + "rewards/rejected": -0.41877493262290955, + "step": 160 + }, + { + "epoch": 0.17, + "learning_rate": 4.938897338259132e-05, + "logits/chosen": -2.087447166442871, + "logits/rejected": -1.9530307054519653, + "logps/chosen": -336.6803894042969, + "logps/rejected": -300.6052551269531, + "loss": 0.727, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.46374762058258057, + "rewards/margins": -0.05146384611725807, + "rewards/rejected": -0.4122838079929352, + "step": 161 + }, + { + "epoch": 0.17, + "learning_rate": 4.936886092239475e-05, + "logits/chosen": -2.277801990509033, + "logits/rejected": -2.1965315341949463, + "logps/chosen": -356.08038330078125, + "logps/rejected": -355.2462158203125, + "loss": 0.6807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35021650791168213, + "rewards/margins": 0.04458609223365784, + "rewards/rejected": -0.39480262994766235, + "step": 162 + }, + { + "epoch": 0.17, + "learning_rate": 4.93484270204492e-05, + "logits/chosen": -2.132495403289795, + "logits/rejected": -2.1767892837524414, + "logps/chosen": -416.3873291015625, + "logps/rejected": -446.61444091796875, + "loss": 0.6713, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3273276686668396, + "rewards/margins": 0.06858228892087936, + "rewards/rejected": -0.39590996503829956, + "step": 163 + }, + { + "epoch": 0.17, + "learning_rate": 4.932767194629164e-05, + "logits/chosen": -1.9876537322998047, + "logits/rejected": -2.0633606910705566, + "logps/chosen": -398.9766845703125, + "logps/rejected": -377.8792419433594, + "loss": 0.7513, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6136964559555054, + "rewards/margins": -0.046500250697135925, + "rewards/rejected": -0.567196249961853, + "step": 164 + }, + { + "epoch": 0.17, + "learning_rate": 4.930659597369554e-05, + "logits/chosen": -2.009962320327759, + "logits/rejected": -2.057422399520874, + "logps/chosen": -303.35882568359375, + "logps/rejected": -332.4872131347656, + "loss": 0.6449, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36901330947875977, + "rewards/margins": 0.15158693492412567, + "rewards/rejected": -0.5206002593040466, + "step": 165 + }, + { + "epoch": 0.17, + "learning_rate": 4.928519938066722e-05, + "logits/chosen": -1.9507710933685303, + "logits/rejected": -1.9371974468231201, + "logps/chosen": -350.26031494140625, + "logps/rejected": -332.67572021484375, + "loss": 0.6892, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.47843092679977417, + "rewards/margins": 0.03055078350007534, + "rewards/rejected": -0.5089817047119141, + "step": 166 + }, + { + "epoch": 0.17, + "learning_rate": 4.926348244944221e-05, + "logits/chosen": -1.8575907945632935, + "logits/rejected": -1.8456588983535767, + "logps/chosen": -298.292236328125, + "logps/rejected": -305.32904052734375, + "loss": 0.6269, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5008257031440735, + "rewards/margins": 0.18142402172088623, + "rewards/rejected": -0.6822497248649597, + "step": 167 + }, + { + "epoch": 0.17, + "learning_rate": 4.9241445466481504e-05, + "logits/chosen": -1.9923934936523438, + "logits/rejected": -2.1334385871887207, + "logps/chosen": -278.231689453125, + "logps/rejected": -368.37152099609375, + "loss": 0.7279, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43066713213920593, + "rewards/margins": -0.028812985867261887, + "rewards/rejected": -0.40185415744781494, + "step": 168 + }, + { + "epoch": 0.18, + "learning_rate": 4.921908872246782e-05, + "logits/chosen": -2.099191665649414, + "logits/rejected": -2.299363136291504, + "logps/chosen": -298.4676208496094, + "logps/rejected": -369.09075927734375, + "loss": 0.5683, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.42483800649642944, + "rewards/margins": 0.3080124258995056, + "rewards/rejected": -0.7328504323959351, + "step": 169 + }, + { + "epoch": 0.18, + "learning_rate": 4.91964125123017e-05, + "logits/chosen": -2.2204477787017822, + "logits/rejected": -2.0866146087646484, + "logps/chosen": -417.0639343261719, + "logps/rejected": -406.372802734375, + "loss": 0.7542, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6449086666107178, + "rewards/margins": -0.07101374119520187, + "rewards/rejected": -0.5738948583602905, + "step": 170 + }, + { + "epoch": 0.18, + "learning_rate": 4.9173417135097715e-05, + "logits/chosen": -2.208749294281006, + "logits/rejected": -2.0630850791931152, + "logps/chosen": -286.49749755859375, + "logps/rejected": -274.74822998046875, + "loss": 0.7148, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.36861342191696167, + "rewards/margins": -0.0281071700155735, + "rewards/rejected": -0.3405062258243561, + "step": 171 + }, + { + "epoch": 0.18, + "learning_rate": 4.9150102894180415e-05, + "logits/chosen": -1.9276704788208008, + "logits/rejected": -1.7694021463394165, + "logps/chosen": -305.76702880859375, + "logps/rejected": -299.9190368652344, + "loss": 0.7089, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5562857985496521, + "rewards/margins": 0.01368020474910736, + "rewards/rejected": -0.5699659585952759, + "step": 172 + }, + { + "epoch": 0.18, + "learning_rate": 4.91264700970804e-05, + "logits/chosen": -2.1296586990356445, + "logits/rejected": -2.081202983856201, + "logps/chosen": -253.95947265625, + "logps/rejected": -282.4194641113281, + "loss": 0.6342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3127891719341278, + "rewards/margins": 0.14373984932899475, + "rewards/rejected": -0.45652902126312256, + "step": 173 + }, + { + "epoch": 0.18, + "learning_rate": 4.910251905553025e-05, + "logits/chosen": -2.120836019515991, + "logits/rejected": -2.152477979660034, + "logps/chosen": -461.200927734375, + "logps/rejected": -470.2550048828125, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39715662598609924, + "rewards/margins": 0.12175296247005463, + "rewards/rejected": -0.5189095735549927, + "step": 174 + }, + { + "epoch": 0.18, + "learning_rate": 4.9078250085460384e-05, + "logits/chosen": -2.0472662448883057, + "logits/rejected": -2.0290732383728027, + "logps/chosen": -368.0150451660156, + "logps/rejected": -276.0652770996094, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5132391452789307, + "rewards/margins": 0.08550245314836502, + "rewards/rejected": -0.5987416505813599, + "step": 175 + }, + { + "epoch": 0.18, + "learning_rate": 4.905366350699493e-05, + "logits/chosen": -2.0048584938049316, + "logits/rejected": -2.020286798477173, + "logps/chosen": -341.3944396972656, + "logps/rejected": -444.3612060546875, + "loss": 0.6891, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48147350549697876, + "rewards/margins": 0.06385768949985504, + "rewards/rejected": -0.5453312397003174, + "step": 176 + }, + { + "epoch": 0.18, + "learning_rate": 4.902875964444746e-05, + "logits/chosen": -1.9281437397003174, + "logits/rejected": -2.0846328735351562, + "logps/chosen": -397.08453369140625, + "logps/rejected": -412.8231201171875, + "loss": 0.7184, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5112196207046509, + "rewards/margins": 0.010270453989505768, + "rewards/rejected": -0.5214900374412537, + "step": 177 + }, + { + "epoch": 0.18, + "learning_rate": 4.9003538826316795e-05, + "logits/chosen": -1.9262962341308594, + "logits/rejected": -1.8548663854599, + "logps/chosen": -326.5889587402344, + "logps/rejected": -336.98443603515625, + "loss": 0.6411, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37680521607398987, + "rewards/margins": 0.14640317857265472, + "rewards/rejected": -0.5232084393501282, + "step": 178 + }, + { + "epoch": 0.19, + "learning_rate": 4.897800138528253e-05, + "logits/chosen": -2.234349250793457, + "logits/rejected": -2.1834311485290527, + "logps/chosen": -302.0099182128906, + "logps/rejected": -293.316162109375, + "loss": 0.7266, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42696690559387207, + "rewards/margins": -0.030406557023525238, + "rewards/rejected": -0.39656031131744385, + "step": 179 + }, + { + "epoch": 0.19, + "learning_rate": 4.8952147658200806e-05, + "logits/chosen": -1.9105005264282227, + "logits/rejected": -2.006873607635498, + "logps/chosen": -307.2244873046875, + "logps/rejected": -327.2491455078125, + "loss": 0.6388, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4698525667190552, + "rewards/margins": 0.18418261408805847, + "rewards/rejected": -0.6540351510047913, + "step": 180 + }, + { + "epoch": 0.19, + "learning_rate": 4.892597798609976e-05, + "logits/chosen": -1.8944015502929688, + "logits/rejected": -1.8353430032730103, + "logps/chosen": -372.86474609375, + "logps/rejected": -328.3533020019531, + "loss": 0.7458, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5451046228408813, + "rewards/margins": -0.07491657137870789, + "rewards/rejected": -0.4701881408691406, + "step": 181 + }, + { + "epoch": 0.19, + "learning_rate": 4.889949271417504e-05, + "logits/chosen": -2.0069048404693604, + "logits/rejected": -2.1132149696350098, + "logps/chosen": -313.0219421386719, + "logps/rejected": -377.8055114746094, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4439217150211334, + "rewards/margins": 0.06591986864805222, + "rewards/rejected": -0.5098415613174438, + "step": 182 + }, + { + "epoch": 0.19, + "learning_rate": 4.88726921917853e-05, + "logits/chosen": -1.847022294998169, + "logits/rejected": -1.861661434173584, + "logps/chosen": -222.52427673339844, + "logps/rejected": -241.1929931640625, + "loss": 0.6617, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39014291763305664, + "rewards/margins": 0.09657852351665497, + "rewards/rejected": -0.486721396446228, + "step": 183 + }, + { + "epoch": 0.19, + "learning_rate": 4.884557677244754e-05, + "logits/chosen": -1.9963531494140625, + "logits/rejected": -2.0941479206085205, + "logps/chosen": -274.606689453125, + "logps/rejected": -268.11395263671875, + "loss": 0.6133, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4282204508781433, + "rewards/margins": 0.23392558097839355, + "rewards/rejected": -0.6621460318565369, + "step": 184 + }, + { + "epoch": 0.19, + "learning_rate": 4.881814681383248e-05, + "logits/chosen": -1.8474693298339844, + "logits/rejected": -2.122403144836426, + "logps/chosen": -254.52745056152344, + "logps/rejected": -355.0078430175781, + "loss": 0.6778, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3664317727088928, + "rewards/margins": 0.07878479361534119, + "rewards/rejected": -0.4452165961265564, + "step": 185 + }, + { + "epoch": 0.19, + "learning_rate": 4.879040267775981e-05, + "logits/chosen": -1.891446828842163, + "logits/rejected": -1.794939637184143, + "logps/chosen": -383.87451171875, + "logps/rejected": -405.0089416503906, + "loss": 0.7267, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.592932403087616, + "rewards/margins": -0.023355990648269653, + "rewards/rejected": -0.5695763826370239, + "step": 186 + }, + { + "epoch": 0.19, + "learning_rate": 4.8762344730193445e-05, + "logits/chosen": -1.923872709274292, + "logits/rejected": -2.09379243850708, + "logps/chosen": -251.91183471679688, + "logps/rejected": -267.0522155761719, + "loss": 0.618, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5020468831062317, + "rewards/margins": 0.21398763358592987, + "rewards/rejected": -0.7160345315933228, + "step": 187 + }, + { + "epoch": 0.19, + "learning_rate": 4.873397334123667e-05, + "logits/chosen": -1.7248388528823853, + "logits/rejected": -2.1145920753479004, + "logps/chosen": -270.976806640625, + "logps/rejected": -385.2688293457031, + "loss": 0.6094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5947083234786987, + "rewards/margins": 0.23461143672466278, + "rewards/rejected": -0.8293198347091675, + "step": 188 + }, + { + "epoch": 0.2, + "learning_rate": 4.8705288885127295e-05, + "logits/chosen": -2.289656162261963, + "logits/rejected": -2.2240102291107178, + "logps/chosen": -408.82403564453125, + "logps/rejected": -361.85711669921875, + "loss": 0.7461, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6849729418754578, + "rewards/margins": -0.0317959301173687, + "rewards/rejected": -0.6531770825386047, + "step": 189 + }, + { + "epoch": 0.2, + "learning_rate": 4.867629174023268e-05, + "logits/chosen": -2.281062602996826, + "logits/rejected": -2.0911028385162354, + "logps/chosen": -390.974365234375, + "logps/rejected": -376.9208984375, + "loss": 0.7288, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5994336009025574, + "rewards/margins": -0.05144501477479935, + "rewards/rejected": -0.5479886531829834, + "step": 190 + }, + { + "epoch": 0.2, + "learning_rate": 4.864698228904478e-05, + "logits/chosen": -1.8639394044876099, + "logits/rejected": -1.969814658164978, + "logps/chosen": -390.7828674316406, + "logps/rejected": -316.2579345703125, + "loss": 0.7376, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6630164384841919, + "rewards/margins": -0.02051009237766266, + "rewards/rejected": -0.6425063610076904, + "step": 191 + }, + { + "epoch": 0.2, + "learning_rate": 4.861736091817506e-05, + "logits/chosen": -2.084822654724121, + "logits/rejected": -1.8905832767486572, + "logps/chosen": -373.1632080078125, + "logps/rejected": -256.5694274902344, + "loss": 0.6745, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6736202239990234, + "rewards/margins": 0.06500263512134552, + "rewards/rejected": -0.738622784614563, + "step": 192 + }, + { + "epoch": 0.2, + "learning_rate": 4.858742801834942e-05, + "logits/chosen": -2.0825746059417725, + "logits/rejected": -1.8865240812301636, + "logps/chosen": -371.4250793457031, + "logps/rejected": -295.2694396972656, + "loss": 0.7168, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5424689650535583, + "rewards/margins": -0.019147779792547226, + "rewards/rejected": -0.5233211517333984, + "step": 193 + }, + { + "epoch": 0.2, + "learning_rate": 4.855718398440307e-05, + "logits/chosen": -2.1740431785583496, + "logits/rejected": -1.8387389183044434, + "logps/chosen": -307.1974182128906, + "logps/rejected": -293.7174987792969, + "loss": 0.6362, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6124351620674133, + "rewards/margins": 0.22457748651504517, + "rewards/rejected": -0.8370125889778137, + "step": 194 + }, + { + "epoch": 0.2, + "learning_rate": 4.852662921527522e-05, + "logits/chosen": -2.0800061225891113, + "logits/rejected": -2.2033514976501465, + "logps/chosen": -314.9759216308594, + "logps/rejected": -373.83087158203125, + "loss": 0.6388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7755237221717834, + "rewards/margins": 0.17267946898937225, + "rewards/rejected": -0.9482032060623169, + "step": 195 + }, + { + "epoch": 0.2, + "learning_rate": 4.8495764114003966e-05, + "logits/chosen": -2.0974619388580322, + "logits/rejected": -2.1461567878723145, + "logps/chosen": -360.7484436035156, + "logps/rejected": -402.9318542480469, + "loss": 0.6068, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7390251159667969, + "rewards/margins": 0.220990851521492, + "rewards/rejected": -0.9600158929824829, + "step": 196 + }, + { + "epoch": 0.2, + "learning_rate": 4.8464589087720846e-05, + "logits/chosen": -2.0587756633758545, + "logits/rejected": -1.9800175428390503, + "logps/chosen": -287.4089050292969, + "logps/rejected": -284.52667236328125, + "loss": 0.7495, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6148634552955627, + "rewards/margins": -0.05161774903535843, + "rewards/rejected": -0.5632455945014954, + "step": 197 + }, + { + "epoch": 0.21, + "learning_rate": 4.8433104547645527e-05, + "logits/chosen": -2.166761875152588, + "logits/rejected": -2.1352829933166504, + "logps/chosen": -270.46527099609375, + "logps/rejected": -276.6797180175781, + "loss": 0.6281, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7134184241294861, + "rewards/margins": 0.1845559924840927, + "rewards/rejected": -0.89797443151474, + "step": 198 + }, + { + "epoch": 0.21, + "learning_rate": 4.840131090908038e-05, + "logits/chosen": -2.013166904449463, + "logits/rejected": -2.0285515785217285, + "logps/chosen": -250.39393615722656, + "logps/rejected": -247.9554443359375, + "loss": 0.6774, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5990790128707886, + "rewards/margins": 0.09994714707136154, + "rewards/rejected": -0.6990260481834412, + "step": 199 + }, + { + "epoch": 0.21, + "learning_rate": 4.8369208591404997e-05, + "logits/chosen": -2.070176124572754, + "logits/rejected": -2.184037446975708, + "logps/chosen": -290.0277099609375, + "logps/rejected": -408.5084228515625, + "loss": 0.7326, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8427464962005615, + "rewards/margins": -0.015421424061059952, + "rewards/rejected": -0.8273251056671143, + "step": 200 + }, + { + "epoch": 0.21, + "learning_rate": 4.833679801807064e-05, + "logits/chosen": -1.9874109029769897, + "logits/rejected": -2.082557201385498, + "logps/chosen": -298.5231628417969, + "logps/rejected": -384.05853271484375, + "loss": 0.6293, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9263674020767212, + "rewards/margins": 0.19633004069328308, + "rewards/rejected": -1.1226974725723267, + "step": 201 + }, + { + "epoch": 0.21, + "learning_rate": 4.8304079616594686e-05, + "logits/chosen": -2.2528278827667236, + "logits/rejected": -2.320962905883789, + "logps/chosen": -476.85968017578125, + "logps/rejected": -352.87298583984375, + "loss": 0.7128, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9438900947570801, + "rewards/margins": 0.07938252389431, + "rewards/rejected": -1.0232725143432617, + "step": 202 + }, + { + "epoch": 0.21, + "learning_rate": 4.8271053818554965e-05, + "logits/chosen": -2.0623865127563477, + "logits/rejected": -2.1591854095458984, + "logps/chosen": -272.2887878417969, + "logps/rejected": -340.0892639160156, + "loss": 0.726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6589000821113586, + "rewards/margins": -0.008545447140932083, + "rewards/rejected": -0.6503546833992004, + "step": 203 + }, + { + "epoch": 0.21, + "learning_rate": 4.823772105958408e-05, + "logits/chosen": -2.2021212577819824, + "logits/rejected": -2.2701587677001953, + "logps/chosen": -332.5159912109375, + "logps/rejected": -383.0360412597656, + "loss": 0.5916, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8989717960357666, + "rewards/margins": 0.3648967444896698, + "rewards/rejected": -1.2638685703277588, + "step": 204 + }, + { + "epoch": 0.21, + "learning_rate": 4.820408177936365e-05, + "logits/chosen": -2.264617443084717, + "logits/rejected": -2.322549343109131, + "logps/chosen": -434.6703796386719, + "logps/rejected": -497.743408203125, + "loss": 0.6572, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8982381820678711, + "rewards/margins": 0.15654003620147705, + "rewards/rejected": -1.0547782182693481, + "step": 205 + }, + { + "epoch": 0.21, + "learning_rate": 4.817013642161853e-05, + "logits/chosen": -2.034374237060547, + "logits/rejected": -1.8990505933761597, + "logps/chosen": -335.35137939453125, + "logps/rejected": -308.3555603027344, + "loss": 0.7744, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9858347773551941, + "rewards/margins": -0.07680069655179977, + "rewards/rejected": -0.9090341925621033, + "step": 206 + }, + { + "epoch": 0.21, + "learning_rate": 4.813588543411093e-05, + "logits/chosen": -2.0272958278656006, + "logits/rejected": -2.040910243988037, + "logps/chosen": -263.2490234375, + "logps/rejected": -330.78326416015625, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.806855320930481, + "rewards/margins": 0.22991889715194702, + "rewards/rejected": -1.0367741584777832, + "step": 207 + }, + { + "epoch": 0.22, + "learning_rate": 4.810132926863454e-05, + "logits/chosen": -1.954245924949646, + "logits/rejected": -2.222029685974121, + "logps/chosen": -344.208984375, + "logps/rejected": -378.04827880859375, + "loss": 0.5867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7623510360717773, + "rewards/margins": 0.3259902894496918, + "rewards/rejected": -1.088341236114502, + "step": 208 + }, + { + "epoch": 0.22, + "learning_rate": 4.806646838100852e-05, + "logits/chosen": -1.9601213932037354, + "logits/rejected": -1.9726929664611816, + "logps/chosen": -363.0174255371094, + "logps/rejected": -362.3369140625, + "loss": 0.6792, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.85859614610672, + "rewards/margins": 0.1350986659526825, + "rewards/rejected": -0.9936947822570801, + "step": 209 + }, + { + "epoch": 0.22, + "learning_rate": 4.803130323107157e-05, + "logits/chosen": -2.3228697776794434, + "logits/rejected": -2.4339702129364014, + "logps/chosen": -412.57208251953125, + "logps/rejected": -507.545166015625, + "loss": 0.6152, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8819563984870911, + "rewards/margins": 0.24080882966518402, + "rewards/rejected": -1.1227651834487915, + "step": 210 + }, + { + "epoch": 0.22, + "learning_rate": 4.7995834282675764e-05, + "logits/chosen": -1.9379347562789917, + "logits/rejected": -1.9101741313934326, + "logps/chosen": -336.1128234863281, + "logps/rejected": -344.0860595703125, + "loss": 0.6526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7465636730194092, + "rewards/margins": 0.16460736095905304, + "rewards/rejected": -0.9111709594726562, + "step": 211 + }, + { + "epoch": 0.22, + "learning_rate": 4.796006200368054e-05, + "logits/chosen": -1.9772237539291382, + "logits/rejected": -2.0779454708099365, + "logps/chosen": -300.3962097167969, + "logps/rejected": -379.80230712890625, + "loss": 0.6292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6897981762886047, + "rewards/margins": 0.20077620446681976, + "rewards/rejected": -0.8905743360519409, + "step": 212 + }, + { + "epoch": 0.22, + "learning_rate": 4.79239868659464e-05, + "logits/chosen": -2.019927978515625, + "logits/rejected": -2.08197283744812, + "logps/chosen": -249.34629821777344, + "logps/rejected": -299.1027526855469, + "loss": 0.6968, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8061387538909912, + "rewards/margins": 0.06293849647045135, + "rewards/rejected": -0.8690773248672485, + "step": 213 + }, + { + "epoch": 0.22, + "learning_rate": 4.788760934532883e-05, + "logits/chosen": -2.124903678894043, + "logits/rejected": -2.041473865509033, + "logps/chosen": -268.1578674316406, + "logps/rejected": -329.6964111328125, + "loss": 0.7397, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9142539501190186, + "rewards/margins": 0.021660268306732178, + "rewards/rejected": -0.935914158821106, + "step": 214 + }, + { + "epoch": 0.22, + "learning_rate": 4.785092992167192e-05, + "logits/chosen": -2.1390061378479004, + "logits/rejected": -2.125261068344116, + "logps/chosen": -268.4858703613281, + "logps/rejected": -305.736083984375, + "loss": 0.6402, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7436612248420715, + "rewards/margins": 0.18916505575180054, + "rewards/rejected": -0.9328262209892273, + "step": 215 + }, + { + "epoch": 0.22, + "learning_rate": 4.781394907880204e-05, + "logits/chosen": -2.0572445392608643, + "logits/rejected": -2.2498245239257812, + "logps/chosen": -288.4680480957031, + "logps/rejected": -313.41729736328125, + "loss": 0.5816, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6026442646980286, + "rewards/margins": 0.3183574676513672, + "rewards/rejected": -0.9210017919540405, + "step": 216 + }, + { + "epoch": 0.22, + "learning_rate": 4.777666730452151e-05, + "logits/chosen": -1.8694506883621216, + "logits/rejected": -2.019477367401123, + "logps/chosen": -265.6673889160156, + "logps/rejected": -356.0314025878906, + "loss": 0.5947, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7154449820518494, + "rewards/margins": 0.4170495271682739, + "rewards/rejected": -1.1324944496154785, + "step": 217 + }, + { + "epoch": 0.23, + "learning_rate": 4.7739085090602145e-05, + "logits/chosen": -2.254331111907959, + "logits/rejected": -2.3572165966033936, + "logps/chosen": -305.95074462890625, + "logps/rejected": -340.34893798828125, + "loss": 0.7385, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7424416542053223, + "rewards/margins": 0.05813989043235779, + "rewards/rejected": -0.8005815148353577, + "step": 218 + }, + { + "epoch": 0.23, + "learning_rate": 4.770120293277875e-05, + "logits/chosen": -2.0773184299468994, + "logits/rejected": -2.0856757164001465, + "logps/chosen": -351.6342468261719, + "logps/rejected": -304.958251953125, + "loss": 0.5738, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8276374936103821, + "rewards/margins": 0.34980258345603943, + "rewards/rejected": -1.1774399280548096, + "step": 219 + }, + { + "epoch": 0.23, + "learning_rate": 4.76630213307426e-05, + "logits/chosen": -1.996044635772705, + "logits/rejected": -2.1620826721191406, + "logps/chosen": -286.35565185546875, + "logps/rejected": -381.3570861816406, + "loss": 0.8382, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9452384114265442, + "rewards/margins": -0.1867210865020752, + "rewards/rejected": -0.7585172057151794, + "step": 220 + }, + { + "epoch": 0.23, + "learning_rate": 4.762454078813483e-05, + "logits/chosen": -1.959717035293579, + "logits/rejected": -2.038839340209961, + "logps/chosen": -327.7545471191406, + "logps/rejected": -336.3837585449219, + "loss": 0.7531, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8737196326255798, + "rewards/margins": -0.06513047218322754, + "rewards/rejected": -0.8085891008377075, + "step": 221 + }, + { + "epoch": 0.23, + "learning_rate": 4.758576181253981e-05, + "logits/chosen": -2.221623659133911, + "logits/rejected": -2.0508334636688232, + "logps/chosen": -380.0113830566406, + "logps/rejected": -305.5745544433594, + "loss": 0.6948, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.907548189163208, + "rewards/margins": 0.19952180981636047, + "rewards/rejected": -1.107069969177246, + "step": 222 + }, + { + "epoch": 0.23, + "learning_rate": 4.754668491547845e-05, + "logits/chosen": -2.2138311862945557, + "logits/rejected": -1.9480187892913818, + "logps/chosen": -353.3214111328125, + "logps/rejected": -298.512939453125, + "loss": 0.6672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7623633146286011, + "rewards/margins": 0.12673720717430115, + "rewards/rejected": -0.8891006112098694, + "step": 223 + }, + { + "epoch": 0.23, + "learning_rate": 4.750731061240143e-05, + "logits/chosen": -2.000711679458618, + "logits/rejected": -2.172668933868408, + "logps/chosen": -283.4869384765625, + "logps/rejected": -291.0225830078125, + "loss": 0.6861, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46767231822013855, + "rewards/margins": 0.11719869822263718, + "rewards/rejected": -0.5848710536956787, + "step": 224 + }, + { + "epoch": 0.23, + "learning_rate": 4.746763942268243e-05, + "logits/chosen": -1.973673701286316, + "logits/rejected": -2.03965163230896, + "logps/chosen": -380.54974365234375, + "logps/rejected": -390.66119384765625, + "loss": 0.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5797699093818665, + "rewards/margins": 0.08640918880701065, + "rewards/rejected": -0.6661791205406189, + "step": 225 + }, + { + "epoch": 0.23, + "learning_rate": 4.742767186961125e-05, + "logits/chosen": -2.1579782962799072, + "logits/rejected": -2.12062668800354, + "logps/chosen": -371.7868957519531, + "logps/rejected": -279.1838684082031, + "loss": 0.7117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.43902456760406494, + "rewards/margins": 0.0770123153924942, + "rewards/rejected": -0.5160369277000427, + "step": 226 + }, + { + "epoch": 0.24, + "learning_rate": 4.7387408480386945e-05, + "logits/chosen": -2.1720871925354004, + "logits/rejected": -2.2602319717407227, + "logps/chosen": -340.2154235839844, + "logps/rejected": -415.40625, + "loss": 0.6475, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4649474322795868, + "rewards/margins": 0.14594462513923645, + "rewards/rejected": -0.6108919978141785, + "step": 227 + }, + { + "epoch": 0.24, + "learning_rate": 4.7346849786110834e-05, + "logits/chosen": -1.956856608390808, + "logits/rejected": -2.2350995540618896, + "logps/chosen": -277.75689697265625, + "logps/rejected": -373.5628967285156, + "loss": 0.6256, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.30955129861831665, + "rewards/margins": 0.2196839451789856, + "rewards/rejected": -0.529235303401947, + "step": 228 + }, + { + "epoch": 0.24, + "learning_rate": 4.7305996321779516e-05, + "logits/chosen": -1.9420428276062012, + "logits/rejected": -2.040480613708496, + "logps/chosen": -330.3153991699219, + "logps/rejected": -419.3126220703125, + "loss": 0.6871, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7027783393859863, + "rewards/margins": 0.27350372076034546, + "rewards/rejected": -0.976282000541687, + "step": 229 + }, + { + "epoch": 0.24, + "learning_rate": 4.726484862627779e-05, + "logits/chosen": -2.0949978828430176, + "logits/rejected": -2.049705743789673, + "logps/chosen": -399.0694580078125, + "logps/rejected": -362.0954284667969, + "loss": 0.7168, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26283103227615356, + "rewards/margins": 0.01691259816288948, + "rewards/rejected": -0.27974364161491394, + "step": 230 + }, + { + "epoch": 0.24, + "learning_rate": 4.722340724237159e-05, + "logits/chosen": -1.8245809078216553, + "logits/rejected": -2.162997007369995, + "logps/chosen": -250.3448486328125, + "logps/rejected": -344.0015563964844, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40172523260116577, + "rewards/margins": 0.23018088936805725, + "rewards/rejected": -0.6319061517715454, + "step": 231 + }, + { + "epoch": 0.24, + "learning_rate": 4.718167271670077e-05, + "logits/chosen": -2.0060951709747314, + "logits/rejected": -2.127163887023926, + "logps/chosen": -301.98492431640625, + "logps/rejected": -334.6331481933594, + "loss": 0.5626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17217136919498444, + "rewards/margins": 0.36563175916671753, + "rewards/rejected": -0.5378031730651855, + "step": 232 + }, + { + "epoch": 0.24, + "learning_rate": 4.7139645599771956e-05, + "logits/chosen": -2.229623794555664, + "logits/rejected": -2.2746386528015137, + "logps/chosen": -340.92559814453125, + "logps/rejected": -386.106689453125, + "loss": 0.6761, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6003557443618774, + "rewards/margins": 0.10827502608299255, + "rewards/rejected": -0.7086308002471924, + "step": 233 + }, + { + "epoch": 0.24, + "learning_rate": 4.709732644595122e-05, + "logits/chosen": -2.075230360031128, + "logits/rejected": -1.8063032627105713, + "logps/chosen": -322.08856201171875, + "logps/rejected": -251.28103637695312, + "loss": 0.7255, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5996315479278564, + "rewards/margins": -0.027767587453126907, + "rewards/rejected": -0.5718639492988586, + "step": 234 + }, + { + "epoch": 0.24, + "learning_rate": 4.7054715813456795e-05, + "logits/chosen": -2.074021816253662, + "logits/rejected": -1.947763442993164, + "logps/chosen": -357.5520324707031, + "logps/rejected": -347.5523681640625, + "loss": 0.648, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7392159700393677, + "rewards/margins": 0.21554100513458252, + "rewards/rejected": -0.954757034778595, + "step": 235 + }, + { + "epoch": 0.24, + "learning_rate": 4.701181426435175e-05, + "logits/chosen": -1.9467442035675049, + "logits/rejected": -1.974783182144165, + "logps/chosen": -358.291748046875, + "logps/rejected": -400.4483947753906, + "loss": 0.7035, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.329414039850235, + "rewards/margins": 0.15945447981357574, + "rewards/rejected": -0.4888685643672943, + "step": 236 + }, + { + "epoch": 0.25, + "learning_rate": 4.69686223645365e-05, + "logits/chosen": -2.1252236366271973, + "logits/rejected": -2.0597469806671143, + "logps/chosen": -314.242919921875, + "logps/rejected": -304.12176513671875, + "loss": 0.7022, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.42311131954193115, + "rewards/margins": 0.1071557104587555, + "rewards/rejected": -0.5302670001983643, + "step": 237 + }, + { + "epoch": 0.25, + "learning_rate": 4.692514068374142e-05, + "logits/chosen": -2.0920791625976562, + "logits/rejected": -2.227646827697754, + "logps/chosen": -389.5137634277344, + "logps/rejected": -376.8587341308594, + "loss": 0.6463, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4674568772315979, + "rewards/margins": 0.22089587152004242, + "rewards/rejected": -0.6883527636528015, + "step": 238 + }, + { + "epoch": 0.25, + "learning_rate": 4.6881369795519266e-05, + "logits/chosen": -2.0833654403686523, + "logits/rejected": -2.075024366378784, + "logps/chosen": -489.9222412109375, + "logps/rejected": -418.9756164550781, + "loss": 0.609, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.603772759437561, + "rewards/margins": 0.2230098843574524, + "rewards/rejected": -0.8267825841903687, + "step": 239 + }, + { + "epoch": 0.25, + "learning_rate": 4.683731027723764e-05, + "logits/chosen": -2.1447250843048096, + "logits/rejected": -2.3132801055908203, + "logps/chosen": -311.90301513671875, + "logps/rejected": -416.7982177734375, + "loss": 0.72, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.117811679840088, + "rewards/margins": 0.1118747889995575, + "rewards/rejected": -1.2296864986419678, + "step": 240 + }, + { + "epoch": 0.25, + "learning_rate": 4.679296271007137e-05, + "logits/chosen": -1.8714052438735962, + "logits/rejected": -1.9733188152313232, + "logps/chosen": -378.4604797363281, + "logps/rejected": -340.4316101074219, + "loss": 0.7227, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46567368507385254, + "rewards/margins": 0.10245680809020996, + "rewards/rejected": -0.5681304931640625, + "step": 241 + }, + { + "epoch": 0.25, + "learning_rate": 4.674832767899486e-05, + "logits/chosen": -2.0161495208740234, + "logits/rejected": -2.013415813446045, + "logps/chosen": -334.1321105957031, + "logps/rejected": -480.6463928222656, + "loss": 0.7282, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8040814399719238, + "rewards/margins": 0.06382356584072113, + "rewards/rejected": -0.8679050207138062, + "step": 242 + }, + { + "epoch": 0.25, + "learning_rate": 4.6703405772774325e-05, + "logits/chosen": -2.1323914527893066, + "logits/rejected": -2.001523971557617, + "logps/chosen": -299.8099670410156, + "logps/rejected": -278.1839294433594, + "loss": 0.6323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27376043796539307, + "rewards/margins": 0.16408132016658783, + "rewards/rejected": -0.4378418028354645, + "step": 243 + }, + { + "epoch": 0.25, + "learning_rate": 4.66581975839601e-05, + "logits/chosen": -2.1282427310943604, + "logits/rejected": -2.040194272994995, + "logps/chosen": -349.28826904296875, + "logps/rejected": -320.7705078125, + "loss": 0.6147, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6112862229347229, + "rewards/margins": 0.27279651165008545, + "rewards/rejected": -0.8840827345848083, + "step": 244 + }, + { + "epoch": 0.25, + "learning_rate": 4.661270370887872e-05, + "logits/chosen": -2.2946279048919678, + "logits/rejected": -2.311314105987549, + "logps/chosen": -298.4400634765625, + "logps/rejected": -319.92828369140625, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5072231888771057, + "rewards/margins": 0.09647183120250702, + "rewards/rejected": -0.6036950349807739, + "step": 245 + }, + { + "epoch": 0.25, + "learning_rate": 4.6566924747625176e-05, + "logits/chosen": -2.1643552780151367, + "logits/rejected": -2.13722562789917, + "logps/chosen": -328.9122009277344, + "logps/rejected": -411.6603088378906, + "loss": 0.7268, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5001087188720703, + "rewards/margins": 0.027627088129520416, + "rewards/rejected": -0.527735710144043, + "step": 246 + }, + { + "epoch": 0.26, + "learning_rate": 4.652086130405492e-05, + "logits/chosen": -2.0007681846618652, + "logits/rejected": -2.173635959625244, + "logps/chosen": -344.0201416015625, + "logps/rejected": -489.677978515625, + "loss": 0.5542, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.647387683391571, + "rewards/margins": 0.5166620016098022, + "rewards/rejected": -1.164049744606018, + "step": 247 + }, + { + "epoch": 0.26, + "learning_rate": 4.647451398577589e-05, + "logits/chosen": -2.2768120765686035, + "logits/rejected": -2.2112040519714355, + "logps/chosen": -351.68658447265625, + "logps/rejected": -304.02362060546875, + "loss": 0.8001, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5356206297874451, + "rewards/margins": -0.1319734901189804, + "rewards/rejected": -0.40364715456962585, + "step": 248 + }, + { + "epoch": 0.26, + "learning_rate": 4.6427883404140564e-05, + "logits/chosen": -2.0852530002593994, + "logits/rejected": -2.1016790866851807, + "logps/chosen": -337.05499267578125, + "logps/rejected": -388.4207763671875, + "loss": 0.7004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.922147274017334, + "rewards/margins": 0.11045366525650024, + "rewards/rejected": -1.0326008796691895, + "step": 249 + }, + { + "epoch": 0.26, + "learning_rate": 4.638097017423783e-05, + "logits/chosen": -2.044597625732422, + "logits/rejected": -2.14152193069458, + "logps/chosen": -334.3042297363281, + "logps/rejected": -319.0080871582031, + "loss": 0.5122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36447468400001526, + "rewards/margins": 0.62729412317276, + "rewards/rejected": -0.9917687773704529, + "step": 250 + }, + { + "epoch": 0.26, + "learning_rate": 4.6333774914884897e-05, + "logits/chosen": -2.073789119720459, + "logits/rejected": -2.1668505668640137, + "logps/chosen": -308.4037780761719, + "logps/rejected": -298.2853698730469, + "loss": 0.7838, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6153988242149353, + "rewards/margins": -0.0456385537981987, + "rewards/rejected": -0.569760262966156, + "step": 251 + }, + { + "epoch": 0.26, + "learning_rate": 4.6286298248619144e-05, + "logits/chosen": -2.121100902557373, + "logits/rejected": -2.0319623947143555, + "logps/chosen": -342.6925048828125, + "logps/rejected": -355.6612243652344, + "loss": 0.6799, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8498857617378235, + "rewards/margins": 0.18424755334854126, + "rewards/rejected": -1.0341331958770752, + "step": 252 + }, + { + "epoch": 0.26, + "learning_rate": 4.62385408016899e-05, + "logits/chosen": -2.084768533706665, + "logits/rejected": -2.15020489692688, + "logps/chosen": -250.22640991210938, + "logps/rejected": -272.5914001464844, + "loss": 0.5436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3922913670539856, + "rewards/margins": 0.36925047636032104, + "rewards/rejected": -0.7615418434143066, + "step": 253 + }, + { + "epoch": 0.26, + "learning_rate": 4.619050320405017e-05, + "logits/chosen": -2.3483760356903076, + "logits/rejected": -2.152430772781372, + "logps/chosen": -317.35296630859375, + "logps/rejected": -290.811767578125, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.718281626701355, + "rewards/margins": 0.11232803016901016, + "rewards/rejected": -0.8306095600128174, + "step": 254 + }, + { + "epoch": 0.26, + "learning_rate": 4.614218608934834e-05, + "logits/chosen": -2.1370747089385986, + "logits/rejected": -2.1002144813537598, + "logps/chosen": -395.0451965332031, + "logps/rejected": -446.2868347167969, + "loss": 0.6049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.938556969165802, + "rewards/margins": 0.3290979564189911, + "rewards/rejected": -1.2676548957824707, + "step": 255 + }, + { + "epoch": 0.27, + "learning_rate": 4.60935900949198e-05, + "logits/chosen": -1.9551351070404053, + "logits/rejected": -1.9744064807891846, + "logps/chosen": -372.6743469238281, + "logps/rejected": -486.5342102050781, + "loss": 0.7102, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7985714077949524, + "rewards/margins": 0.2688364088535309, + "rewards/rejected": -1.0674078464508057, + "step": 256 + }, + { + "epoch": 0.27, + "learning_rate": 4.6044715861778596e-05, + "logits/chosen": -2.051593065261841, + "logits/rejected": -2.114675521850586, + "logps/chosen": -294.7435302734375, + "logps/rejected": -326.8503112792969, + "loss": 0.5986, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.621825098991394, + "rewards/margins": 0.25512248277664185, + "rewards/rejected": -0.8769477009773254, + "step": 257 + }, + { + "epoch": 0.27, + "learning_rate": 4.5995564034608884e-05, + "logits/chosen": -2.160278797149658, + "logits/rejected": -2.1137940883636475, + "logps/chosen": -393.886474609375, + "logps/rejected": -350.8844299316406, + "loss": 0.7498, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8584200143814087, + "rewards/margins": -0.06297742575407028, + "rewards/rejected": -0.795442521572113, + "step": 258 + }, + { + "epoch": 0.27, + "learning_rate": 4.5946135261756504e-05, + "logits/chosen": -2.062591791152954, + "logits/rejected": -2.16622257232666, + "logps/chosen": -323.41131591796875, + "logps/rejected": -332.9070739746094, + "loss": 0.6248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5620593428611755, + "rewards/margins": 0.3896656036376953, + "rewards/rejected": -0.9517249464988708, + "step": 259 + }, + { + "epoch": 0.27, + "learning_rate": 4.5896430195220364e-05, + "logits/chosen": -1.9204814434051514, + "logits/rejected": -1.8729685544967651, + "logps/chosen": -288.1690979003906, + "logps/rejected": -313.9109802246094, + "loss": 0.6466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.849123477935791, + "rewards/margins": 0.2756129801273346, + "rewards/rejected": -1.1247365474700928, + "step": 260 + }, + { + "epoch": 0.27, + "learning_rate": 4.584644949064391e-05, + "logits/chosen": -2.176421642303467, + "logits/rejected": -2.2713022232055664, + "logps/chosen": -273.307373046875, + "logps/rejected": -273.297607421875, + "loss": 0.7478, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8873642086982727, + "rewards/margins": -0.060915715992450714, + "rewards/rejected": -0.8264484405517578, + "step": 261 + }, + { + "epoch": 0.27, + "learning_rate": 4.579619380730642e-05, + "logits/chosen": -2.1005711555480957, + "logits/rejected": -2.1496832370758057, + "logps/chosen": -251.0666961669922, + "logps/rejected": -300.4075012207031, + "loss": 0.6459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5288766622543335, + "rewards/margins": 0.20617865025997162, + "rewards/rejected": -0.7350552678108215, + "step": 262 + }, + { + "epoch": 0.27, + "learning_rate": 4.574566380811432e-05, + "logits/chosen": -2.277989387512207, + "logits/rejected": -2.2332725524902344, + "logps/chosen": -357.2762145996094, + "logps/rejected": -382.5482482910156, + "loss": 0.7107, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6581230163574219, + "rewards/margins": 0.030204597860574722, + "rewards/rejected": -0.6883276700973511, + "step": 263 + }, + { + "epoch": 0.27, + "learning_rate": 4.5694860159592465e-05, + "logits/chosen": -1.9560626745224, + "logits/rejected": -2.078892230987549, + "logps/chosen": -314.22076416015625, + "logps/rejected": -294.4151306152344, + "loss": 0.6152, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6360582113265991, + "rewards/margins": 0.2562226951122284, + "rewards/rejected": -0.8922808766365051, + "step": 264 + }, + { + "epoch": 0.27, + "learning_rate": 4.5643783531875323e-05, + "logits/chosen": -2.0318408012390137, + "logits/rejected": -2.1967966556549072, + "logps/chosen": -251.95309448242188, + "logps/rejected": -406.43310546875, + "loss": 0.5652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6249865293502808, + "rewards/margins": 0.6104640960693359, + "rewards/rejected": -1.2354506254196167, + "step": 265 + }, + { + "epoch": 0.28, + "learning_rate": 4.559243459869814e-05, + "logits/chosen": -1.9235318899154663, + "logits/rejected": -2.222059726715088, + "logps/chosen": -282.896728515625, + "logps/rejected": -374.2673645019531, + "loss": 0.6222, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7379283905029297, + "rewards/margins": 0.2646501362323761, + "rewards/rejected": -1.0025784969329834, + "step": 266 + }, + { + "epoch": 0.28, + "learning_rate": 4.5540814037388056e-05, + "logits/chosen": -1.9744443893432617, + "logits/rejected": -1.974491834640503, + "logps/chosen": -377.78466796875, + "logps/rejected": -405.6194152832031, + "loss": 0.9336, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8520271182060242, + "rewards/margins": -0.32225731015205383, + "rewards/rejected": -0.529769778251648, + "step": 267 + }, + { + "epoch": 0.28, + "learning_rate": 4.5488922528855176e-05, + "logits/chosen": -2.0435807704925537, + "logits/rejected": -1.990431308746338, + "logps/chosen": -306.81048583984375, + "logps/rejected": -369.6317138671875, + "loss": 0.6084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.726284921169281, + "rewards/margins": 0.3399674594402313, + "rewards/rejected": -1.06625235080719, + "step": 268 + }, + { + "epoch": 0.28, + "learning_rate": 4.543676075758356e-05, + "logits/chosen": -2.0265307426452637, + "logits/rejected": -1.9903606176376343, + "logps/chosen": -359.7049255371094, + "logps/rejected": -348.72723388671875, + "loss": 0.7122, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7237378358840942, + "rewards/margins": 0.028506018221378326, + "rewards/rejected": -0.7522438764572144, + "step": 269 + }, + { + "epoch": 0.28, + "learning_rate": 4.538432941162226e-05, + "logits/chosen": -2.327871799468994, + "logits/rejected": -2.218278408050537, + "logps/chosen": -379.38812255859375, + "logps/rejected": -380.68609619140625, + "loss": 0.7641, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5538444519042969, + "rewards/margins": -0.09269000589847565, + "rewards/rejected": -0.46115440130233765, + "step": 270 + }, + { + "epoch": 0.28, + "learning_rate": 4.5331629182576153e-05, + "logits/chosen": -2.1100308895111084, + "logits/rejected": -2.077500581741333, + "logps/chosen": -265.77972412109375, + "logps/rejected": -386.9795227050781, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6752620339393616, + "rewards/margins": 0.13913662731647491, + "rewards/rejected": -0.8143986463546753, + "step": 271 + }, + { + "epoch": 0.28, + "learning_rate": 4.5278660765596884e-05, + "logits/chosen": -2.0439562797546387, + "logits/rejected": -2.036978244781494, + "logps/chosen": -364.2665710449219, + "logps/rejected": -373.9276123046875, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7833287119865417, + "rewards/margins": 0.15880194306373596, + "rewards/rejected": -0.9421306848526001, + "step": 272 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-05, + "logits/chosen": -2.2488677501678467, + "logits/rejected": -2.270367383956909, + "logps/chosen": -329.8542175292969, + "logps/rejected": -341.32464599609375, + "loss": 0.6091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5054783225059509, + "rewards/margins": 0.29985660314559937, + "rewards/rejected": -0.8053349256515503, + "step": 273 + }, + { + "epoch": 0.28, + "learning_rate": 4.5171922166124154e-05, + "logits/chosen": -2.2042911052703857, + "logits/rejected": -2.2818048000335693, + "logps/chosen": -345.41571044921875, + "logps/rejected": -371.5228271484375, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5683087110519409, + "rewards/margins": 0.31007736921310425, + "rewards/rejected": -0.8783860802650452, + "step": 274 + }, + { + "epoch": 0.28, + "learning_rate": 4.5118153391584974e-05, + "logits/chosen": -2.283979892730713, + "logits/rejected": -2.3613948822021484, + "logps/chosen": -326.4739074707031, + "logps/rejected": -346.43701171875, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5925798416137695, + "rewards/margins": 0.04503173753619194, + "rewards/rejected": -0.6376115679740906, + "step": 275 + }, + { + "epoch": 0.29, + "learning_rate": 4.5064119245002626e-05, + "logits/chosen": -1.9776369333267212, + "logits/rejected": -2.186469793319702, + "logps/chosen": -316.29217529296875, + "logps/rejected": -339.24908447265625, + "loss": 0.5385, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4294860363006592, + "rewards/margins": 0.5443057417869568, + "rewards/rejected": -0.973791778087616, + "step": 276 + }, + { + "epoch": 0.29, + "learning_rate": 4.500982043912404e-05, + "logits/chosen": -1.8511924743652344, + "logits/rejected": -1.7418498992919922, + "logps/chosen": -221.07847595214844, + "logps/rejected": -236.75009155273438, + "loss": 0.8087, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6640509963035583, + "rewards/margins": -0.18910002708435059, + "rewards/rejected": -0.47495099902153015, + "step": 277 + }, + { + "epoch": 0.29, + "learning_rate": 4.495525769018717e-05, + "logits/chosen": -2.1778125762939453, + "logits/rejected": -2.3195223808288574, + "logps/chosen": -315.2249755859375, + "logps/rejected": -350.7799987792969, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5389224290847778, + "rewards/margins": 0.4431789517402649, + "rewards/rejected": -0.9821013808250427, + "step": 278 + }, + { + "epoch": 0.29, + "learning_rate": 4.490043171791155e-05, + "logits/chosen": -2.139204502105713, + "logits/rejected": -2.0121872425079346, + "logps/chosen": -411.3319396972656, + "logps/rejected": -478.99822998046875, + "loss": 0.542, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5370864272117615, + "rewards/margins": 0.40471482276916504, + "rewards/rejected": -0.9418012499809265, + "step": 279 + }, + { + "epoch": 0.29, + "learning_rate": 4.484534324548883e-05, + "logits/chosen": -1.9133659601211548, + "logits/rejected": -1.7547776699066162, + "logps/chosen": -291.21502685546875, + "logps/rejected": -296.52691650390625, + "loss": 0.7422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7586185932159424, + "rewards/margins": 0.04123706370592117, + "rewards/rejected": -0.7998557686805725, + "step": 280 + }, + { + "epoch": 0.29, + "learning_rate": 4.4789992999573194e-05, + "logits/chosen": -1.887819766998291, + "logits/rejected": -2.074976682662964, + "logps/chosen": -238.8917999267578, + "logps/rejected": -290.9273681640625, + "loss": 0.7374, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8456001281738281, + "rewards/margins": 0.0228101909160614, + "rewards/rejected": -0.8684103488922119, + "step": 281 + }, + { + "epoch": 0.29, + "learning_rate": 4.47343817102718e-05, + "logits/chosen": -1.776401400566101, + "logits/rejected": -1.8924405574798584, + "logps/chosen": -291.739990234375, + "logps/rejected": -354.3161926269531, + "loss": 0.7368, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.637408971786499, + "rewards/margins": -0.0014106258749961853, + "rewards/rejected": -0.6359982490539551, + "step": 282 + }, + { + "epoch": 0.29, + "learning_rate": 4.467851011113515e-05, + "logits/chosen": -2.391042470932007, + "logits/rejected": -2.4325904846191406, + "logps/chosen": -423.8915710449219, + "logps/rejected": -501.38568115234375, + "loss": 0.7102, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9706649780273438, + "rewards/margins": 0.06408338248729706, + "rewards/rejected": -1.0347484350204468, + "step": 283 + }, + { + "epoch": 0.29, + "learning_rate": 4.4622378939147416e-05, + "logits/chosen": -2.258568048477173, + "logits/rejected": -2.2797646522521973, + "logps/chosen": -324.49005126953125, + "logps/rejected": -288.9884948730469, + "loss": 0.6483, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46772387623786926, + "rewards/margins": 0.14248529076576233, + "rewards/rejected": -0.6102092266082764, + "step": 284 + }, + { + "epoch": 0.3, + "learning_rate": 4.456598893471668e-05, + "logits/chosen": -2.1972222328186035, + "logits/rejected": -2.0766568183898926, + "logps/chosen": -391.5879211425781, + "logps/rejected": -395.998779296875, + "loss": 0.6513, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5539337396621704, + "rewards/margins": 0.13942018151283264, + "rewards/rejected": -0.6933539509773254, + "step": 285 + }, + { + "epoch": 0.3, + "learning_rate": 4.450934084166524e-05, + "logits/chosen": -2.177605152130127, + "logits/rejected": -2.378321886062622, + "logps/chosen": -396.26800537109375, + "logps/rejected": -513.7933349609375, + "loss": 0.5245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5082396268844604, + "rewards/margins": 0.6496228575706482, + "rewards/rejected": -1.1578625440597534, + "step": 286 + }, + { + "epoch": 0.3, + "learning_rate": 4.445243540721972e-05, + "logits/chosen": -2.266407012939453, + "logits/rejected": -2.261568069458008, + "logps/chosen": -301.5278015136719, + "logps/rejected": -347.5808410644531, + "loss": 0.6835, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9058988094329834, + "rewards/margins": 0.060255490243434906, + "rewards/rejected": -0.9661542773246765, + "step": 287 + }, + { + "epoch": 0.3, + "learning_rate": 4.4395273382001286e-05, + "logits/chosen": -1.9136242866516113, + "logits/rejected": -2.1110732555389404, + "logps/chosen": -214.5594024658203, + "logps/rejected": -286.8577880859375, + "loss": 0.6752, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7257100343704224, + "rewards/margins": 0.190011665225029, + "rewards/rejected": -0.9157217144966125, + "step": 288 + }, + { + "epoch": 0.3, + "learning_rate": 4.433785552001568e-05, + "logits/chosen": -2.236163854598999, + "logits/rejected": -2.3334312438964844, + "logps/chosen": -378.4542541503906, + "logps/rejected": -426.2008361816406, + "loss": 0.6507, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7116307020187378, + "rewards/margins": 0.14856423437595367, + "rewards/rejected": -0.860194981098175, + "step": 289 + }, + { + "epoch": 0.3, + "learning_rate": 4.428018257864333e-05, + "logits/chosen": -2.0460734367370605, + "logits/rejected": -2.1440231800079346, + "logps/chosen": -280.720703125, + "logps/rejected": -361.7416687011719, + "loss": 0.5533, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24584394693374634, + "rewards/margins": 0.45910245180130005, + "rewards/rejected": -0.7049463987350464, + "step": 290 + }, + { + "epoch": 0.3, + "learning_rate": 4.4222255318629294e-05, + "logits/chosen": -1.7771077156066895, + "logits/rejected": -2.246995210647583, + "logps/chosen": -307.57232666015625, + "logps/rejected": -455.2540283203125, + "loss": 0.7022, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4229920506477356, + "rewards/margins": 0.13145704567432404, + "rewards/rejected": -0.5544491410255432, + "step": 291 + }, + { + "epoch": 0.3, + "learning_rate": 4.4164074504073313e-05, + "logits/chosen": -2.343817949295044, + "logits/rejected": -2.2758233547210693, + "logps/chosen": -376.11419677734375, + "logps/rejected": -370.3990478515625, + "loss": 0.6549, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.604290783405304, + "rewards/margins": 0.13700008392333984, + "rewards/rejected": -0.7412909269332886, + "step": 292 + }, + { + "epoch": 0.3, + "learning_rate": 4.410564090241966e-05, + "logits/chosen": -1.545508623123169, + "logits/rejected": -1.8171346187591553, + "logps/chosen": -221.09950256347656, + "logps/rejected": -334.39544677734375, + "loss": 0.5676, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5746042132377625, + "rewards/margins": 0.4071567952632904, + "rewards/rejected": -0.9817609786987305, + "step": 293 + }, + { + "epoch": 0.3, + "learning_rate": 4.4046955284447044e-05, + "logits/chosen": -2.077967643737793, + "logits/rejected": -2.2846693992614746, + "logps/chosen": -324.32476806640625, + "logps/rejected": -440.4974365234375, + "loss": 0.7404, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6503464579582214, + "rewards/margins": 0.04034698009490967, + "rewards/rejected": -0.6906934976577759, + "step": 294 + }, + { + "epoch": 0.31, + "learning_rate": 4.398801842425842e-05, + "logits/chosen": -2.1588635444641113, + "logits/rejected": -2.1049904823303223, + "logps/chosen": -406.24176025390625, + "logps/rejected": -372.340087890625, + "loss": 0.585, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7770822048187256, + "rewards/margins": 0.31246596574783325, + "rewards/rejected": -1.0895482301712036, + "step": 295 + }, + { + "epoch": 0.31, + "learning_rate": 4.392883109927083e-05, + "logits/chosen": -2.0670382976531982, + "logits/rejected": -1.8806589841842651, + "logps/chosen": -401.965087890625, + "logps/rejected": -393.48516845703125, + "loss": 0.6777, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8736264705657959, + "rewards/margins": 0.1775364875793457, + "rewards/rejected": -1.0511629581451416, + "step": 296 + }, + { + "epoch": 0.31, + "learning_rate": 4.38693940902051e-05, + "logits/chosen": -1.988671064376831, + "logits/rejected": -1.9722682237625122, + "logps/chosen": -387.1339111328125, + "logps/rejected": -409.888916015625, + "loss": 0.6198, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7169901132583618, + "rewards/margins": 0.27162882685661316, + "rewards/rejected": -0.9886189699172974, + "step": 297 + }, + { + "epoch": 0.31, + "learning_rate": 4.3809708181075556e-05, + "logits/chosen": -2.186702013015747, + "logits/rejected": -2.0955049991607666, + "logps/chosen": -395.58294677734375, + "logps/rejected": -347.0909118652344, + "loss": 0.8175, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9574447274208069, + "rewards/margins": -0.11547727137804031, + "rewards/rejected": -0.8419675230979919, + "step": 298 + }, + { + "epoch": 0.31, + "learning_rate": 4.374977415917969e-05, + "logits/chosen": -2.026996612548828, + "logits/rejected": -1.772495985031128, + "logps/chosen": -379.002685546875, + "logps/rejected": -381.895751953125, + "loss": 0.561, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6855794191360474, + "rewards/margins": 0.3581579923629761, + "rewards/rejected": -1.0437374114990234, + "step": 299 + }, + { + "epoch": 0.31, + "learning_rate": 4.3689592815087764e-05, + "logits/chosen": -1.9657152891159058, + "logits/rejected": -1.8686068058013916, + "logps/chosen": -367.0695495605469, + "logps/rejected": -369.2501525878906, + "loss": 0.7995, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0888875722885132, + "rewards/margins": -0.10578227788209915, + "rewards/rejected": -0.9831052422523499, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 4.3629164942632386e-05, + "logits/chosen": -1.7416181564331055, + "logits/rejected": -1.8769294023513794, + "logps/chosen": -255.93417358398438, + "logps/rejected": -316.71331787109375, + "loss": 0.5563, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7211109399795532, + "rewards/margins": 0.501852810382843, + "rewards/rejected": -1.222963809967041, + "step": 301 + }, + { + "epoch": 0.31, + "learning_rate": 4.3568491338898055e-05, + "logits/chosen": -2.201251745223999, + "logits/rejected": -2.0464892387390137, + "logps/chosen": -273.45849609375, + "logps/rejected": -306.01361083984375, + "loss": 0.8629, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.645514965057373, + "rewards/margins": -0.2321767956018448, + "rewards/rejected": -0.41333818435668945, + "step": 302 + }, + { + "epoch": 0.31, + "learning_rate": 4.350757280421061e-05, + "logits/chosen": -1.9395544528961182, + "logits/rejected": -1.923555612564087, + "logps/chosen": -401.73516845703125, + "logps/rejected": -390.874267578125, + "loss": 0.5915, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9947636723518372, + "rewards/margins": 0.2778167724609375, + "rewards/rejected": -1.2725805044174194, + "step": 303 + }, + { + "epoch": 0.31, + "learning_rate": 4.34464101421267e-05, + "logits/chosen": -1.8383352756500244, + "logits/rejected": -1.9566022157669067, + "logps/chosen": -381.46258544921875, + "logps/rejected": -383.20050048828125, + "loss": 0.5982, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5637756586074829, + "rewards/margins": 0.3097578287124634, + "rewards/rejected": -0.8735334277153015, + "step": 304 + }, + { + "epoch": 0.32, + "learning_rate": 4.338500415942319e-05, + "logits/chosen": -2.0855202674865723, + "logits/rejected": -1.8852460384368896, + "logps/chosen": -313.52093505859375, + "logps/rejected": -327.4878234863281, + "loss": 0.601, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6615315079689026, + "rewards/margins": 0.2937285006046295, + "rewards/rejected": -0.9552599787712097, + "step": 305 + }, + { + "epoch": 0.32, + "learning_rate": 4.3323355666086506e-05, + "logits/chosen": -2.2466461658477783, + "logits/rejected": -2.194441556930542, + "logps/chosen": -385.48297119140625, + "logps/rejected": -358.59783935546875, + "loss": 0.7045, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8933424353599548, + "rewards/margins": 0.060363732278347015, + "rewards/rejected": -0.9537062048912048, + "step": 306 + }, + { + "epoch": 0.32, + "learning_rate": 4.326146547530196e-05, + "logits/chosen": -1.9754979610443115, + "logits/rejected": -2.028249502182007, + "logps/chosen": -392.3201599121094, + "logps/rejected": -451.3328552246094, + "loss": 0.5464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7265093922615051, + "rewards/margins": 0.5438079833984375, + "rewards/rejected": -1.2703173160552979, + "step": 307 + }, + { + "epoch": 0.32, + "learning_rate": 4.3199334403442976e-05, + "logits/chosen": -1.9868967533111572, + "logits/rejected": -1.997020959854126, + "logps/chosen": -324.3955078125, + "logps/rejected": -345.5137634277344, + "loss": 0.8214, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8653470873832703, + "rewards/margins": -0.15076835453510284, + "rewards/rejected": -0.7145787477493286, + "step": 308 + }, + { + "epoch": 0.32, + "learning_rate": 4.313696327006042e-05, + "logits/chosen": -2.213704824447632, + "logits/rejected": -2.111079692840576, + "logps/chosen": -404.150634765625, + "logps/rejected": -373.7305603027344, + "loss": 0.7839, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9824564456939697, + "rewards/margins": 0.003427162766456604, + "rewards/rejected": -0.9858837127685547, + "step": 309 + }, + { + "epoch": 0.32, + "learning_rate": 4.3074352897871686e-05, + "logits/chosen": -1.8593621253967285, + "logits/rejected": -2.1451752185821533, + "logps/chosen": -312.2982482910156, + "logps/rejected": -318.39642333984375, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7266647815704346, + "rewards/margins": 0.3218391537666321, + "rewards/rejected": -1.0485039949417114, + "step": 310 + }, + { + "epoch": 0.32, + "learning_rate": 4.301150411274992e-05, + "logits/chosen": -1.75832200050354, + "logits/rejected": -1.8182921409606934, + "logps/chosen": -294.1172790527344, + "logps/rejected": -396.57208251953125, + "loss": 0.6559, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8868988752365112, + "rewards/margins": 0.2270849496126175, + "rewards/rejected": -1.1139838695526123, + "step": 311 + }, + { + "epoch": 0.32, + "learning_rate": 4.294841774371308e-05, + "logits/chosen": -1.9476118087768555, + "logits/rejected": -1.9689973592758179, + "logps/chosen": -333.6837158203125, + "logps/rejected": -371.0637512207031, + "loss": 0.596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.839272677898407, + "rewards/margins": 0.3770362436771393, + "rewards/rejected": -1.2163089513778687, + "step": 312 + }, + { + "epoch": 0.32, + "learning_rate": 4.288509462291302e-05, + "logits/chosen": -2.073699474334717, + "logits/rejected": -1.9370347261428833, + "logps/chosen": -378.5625305175781, + "logps/rejected": -395.1451416015625, + "loss": 0.6107, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7811661958694458, + "rewards/margins": 0.2788164019584656, + "rewards/rejected": -1.0599825382232666, + "step": 313 + }, + { + "epoch": 0.33, + "learning_rate": 4.2821535585624504e-05, + "logits/chosen": -1.9244226217269897, + "logits/rejected": -2.0157883167266846, + "logps/chosen": -374.1487121582031, + "logps/rejected": -391.6187438964844, + "loss": 0.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7071002721786499, + "rewards/margins": 0.08099737763404846, + "rewards/rejected": -0.7880975604057312, + "step": 314 + }, + { + "epoch": 0.33, + "learning_rate": 4.2757741470234214e-05, + "logits/chosen": -2.2348544597625732, + "logits/rejected": -2.087557554244995, + "logps/chosen": -326.5330810546875, + "logps/rejected": -336.2727355957031, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4954121708869934, + "rewards/margins": 0.1729755401611328, + "rewards/rejected": -0.668387770652771, + "step": 315 + }, + { + "epoch": 0.33, + "learning_rate": 4.269371311822965e-05, + "logits/chosen": -2.137007236480713, + "logits/rejected": -2.230994701385498, + "logps/chosen": -402.22711181640625, + "logps/rejected": -455.44390869140625, + "loss": 0.5746, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0957834720611572, + "rewards/margins": 0.4251984655857086, + "rewards/rejected": -1.520982027053833, + "step": 316 + }, + { + "epoch": 0.33, + "learning_rate": 4.2629451374188055e-05, + "logits/chosen": -2.0285918712615967, + "logits/rejected": -2.096553087234497, + "logps/chosen": -352.289306640625, + "logps/rejected": -334.0386962890625, + "loss": 0.7908, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6512686610221863, + "rewards/margins": -0.014007307589054108, + "rewards/rejected": -0.6372612714767456, + "step": 317 + }, + { + "epoch": 0.33, + "learning_rate": 4.256495708576527e-05, + "logits/chosen": -2.0124435424804688, + "logits/rejected": -2.203395128250122, + "logps/chosen": -344.8824768066406, + "logps/rejected": -407.80548095703125, + "loss": 0.5603, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.666597843170166, + "rewards/margins": 0.5224538445472717, + "rewards/rejected": -1.1890517473220825, + "step": 318 + }, + { + "epoch": 0.33, + "learning_rate": 4.250023110368457e-05, + "logits/chosen": -2.0517170429229736, + "logits/rejected": -2.260333299636841, + "logps/chosen": -319.9532470703125, + "logps/rejected": -426.59197998046875, + "loss": 0.5659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0301696062088013, + "rewards/margins": 0.4518246054649353, + "rewards/rejected": -1.4819941520690918, + "step": 319 + }, + { + "epoch": 0.33, + "learning_rate": 4.243527428172541e-05, + "logits/chosen": -1.7961515188217163, + "logits/rejected": -2.0152459144592285, + "logps/chosen": -338.9334411621094, + "logps/rejected": -429.2481384277344, + "loss": 0.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7761648893356323, + "rewards/margins": 0.3605046570301056, + "rewards/rejected": -1.136669635772705, + "step": 320 + }, + { + "epoch": 0.33, + "learning_rate": 4.237008747671217e-05, + "logits/chosen": -2.139997720718384, + "logits/rejected": -2.038111686706543, + "logps/chosen": -311.0231018066406, + "logps/rejected": -327.61065673828125, + "loss": 0.6711, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0172241926193237, + "rewards/margins": 0.20826146006584167, + "rewards/rejected": -1.2254855632781982, + "step": 321 + }, + { + "epoch": 0.33, + "learning_rate": 4.2304671548502896e-05, + "logits/chosen": -1.859197735786438, + "logits/rejected": -2.0383810997009277, + "logps/chosen": -319.2428894042969, + "logps/rejected": -297.6011657714844, + "loss": 0.7805, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34115666151046753, + "rewards/margins": -0.06757514923810959, + "rewards/rejected": -0.27358150482177734, + "step": 322 + }, + { + "epoch": 0.33, + "learning_rate": 4.223902735997788e-05, + "logits/chosen": -2.063511371612549, + "logits/rejected": -2.0269269943237305, + "logps/chosen": -329.1101379394531, + "logps/rejected": -359.3594665527344, + "loss": 0.5742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6491307616233826, + "rewards/margins": 0.3211410343647003, + "rewards/rejected": -0.9702718257904053, + "step": 323 + }, + { + "epoch": 0.34, + "learning_rate": 4.217315577702836e-05, + "logits/chosen": -2.3090927600860596, + "logits/rejected": -2.2255845069885254, + "logps/chosen": -438.4259338378906, + "logps/rejected": -385.08880615234375, + "loss": 0.7974, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6954346895217896, + "rewards/margins": 0.034093111753463745, + "rewards/rejected": -0.7295278310775757, + "step": 324 + }, + { + "epoch": 0.34, + "learning_rate": 4.2107057668545044e-05, + "logits/chosen": -2.138420581817627, + "logits/rejected": -2.3774194717407227, + "logps/chosen": -241.03421020507812, + "logps/rejected": -291.4215087890625, + "loss": 0.5889, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7353395223617554, + "rewards/margins": 0.4763728976249695, + "rewards/rejected": -1.2117124795913696, + "step": 325 + }, + { + "epoch": 0.34, + "learning_rate": 4.204073390640666e-05, + "logits/chosen": -2.095376968383789, + "logits/rejected": -2.1427035331726074, + "logps/chosen": -359.9441223144531, + "logps/rejected": -460.4444580078125, + "loss": 0.6791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9195146560668945, + "rewards/margins": 0.12283174693584442, + "rewards/rejected": -1.042346477508545, + "step": 326 + }, + { + "epoch": 0.34, + "learning_rate": 4.1974185365468467e-05, + "logits/chosen": -2.082658052444458, + "logits/rejected": -2.0861730575561523, + "logps/chosen": -364.9388427734375, + "logps/rejected": -403.16510009765625, + "loss": 0.6084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5946727991104126, + "rewards/margins": 0.3608367443084717, + "rewards/rejected": -0.955509603023529, + "step": 327 + }, + { + "epoch": 0.34, + "learning_rate": 4.19074129235507e-05, + "logits/chosen": -2.088836431503296, + "logits/rejected": -2.0834221839904785, + "logps/chosen": -319.285888671875, + "logps/rejected": -320.6772766113281, + "loss": 0.8216, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.882214367389679, + "rewards/margins": -0.12802943587303162, + "rewards/rejected": -0.7541849613189697, + "step": 328 + }, + { + "epoch": 0.34, + "learning_rate": 4.184041746142702e-05, + "logits/chosen": -2.211498498916626, + "logits/rejected": -2.1443381309509277, + "logps/chosen": -379.1222229003906, + "logps/rejected": -416.7032470703125, + "loss": 0.7431, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7400021553039551, + "rewards/margins": -0.029454410076141357, + "rewards/rejected": -0.7105477452278137, + "step": 329 + }, + { + "epoch": 0.34, + "learning_rate": 4.177319986281285e-05, + "logits/chosen": -1.9428646564483643, + "logits/rejected": -2.0589828491210938, + "logps/chosen": -338.52618408203125, + "logps/rejected": -396.5542297363281, + "loss": 0.6202, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3697546124458313, + "rewards/margins": 0.2811446785926819, + "rewards/rejected": -0.6508992314338684, + "step": 330 + }, + { + "epoch": 0.34, + "learning_rate": 4.170576101435376e-05, + "logits/chosen": -2.2952022552490234, + "logits/rejected": -2.321949005126953, + "logps/chosen": -275.3761901855469, + "logps/rejected": -350.7015075683594, + "loss": 0.6968, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3928203880786896, + "rewards/margins": 0.07630396634340286, + "rewards/rejected": -0.46912431716918945, + "step": 331 + }, + { + "epoch": 0.34, + "learning_rate": 4.163810180561376e-05, + "logits/chosen": -1.8680933713912964, + "logits/rejected": -2.0927417278289795, + "logps/chosen": -303.8800354003906, + "logps/rejected": -341.4588928222656, + "loss": 0.7411, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6674574613571167, + "rewards/margins": -0.02869322896003723, + "rewards/rejected": -0.6387642621994019, + "step": 332 + }, + { + "epoch": 0.34, + "learning_rate": 4.157022312906352e-05, + "logits/chosen": -1.946225881576538, + "logits/rejected": -1.9390500783920288, + "logps/chosen": -340.5247497558594, + "logps/rejected": -309.8658447265625, + "loss": 0.6641, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6571869850158691, + "rewards/margins": 0.13291773200035095, + "rewards/rejected": -0.7901047468185425, + "step": 333 + }, + { + "epoch": 0.35, + "learning_rate": 4.150212588006871e-05, + "logits/chosen": -2.1470143795013428, + "logits/rejected": -2.4466309547424316, + "logps/chosen": -355.8979797363281, + "logps/rejected": -360.3694763183594, + "loss": 0.7295, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8183680772781372, + "rewards/margins": -0.04079904779791832, + "rewards/rejected": -0.7775689363479614, + "step": 334 + }, + { + "epoch": 0.35, + "learning_rate": 4.143381095687805e-05, + "logits/chosen": -1.8094087839126587, + "logits/rejected": -2.024905204772949, + "logps/chosen": -291.0030517578125, + "logps/rejected": -401.3894348144531, + "loss": 0.5393, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30385422706604004, + "rewards/margins": 0.38516706228256226, + "rewards/rejected": -0.6890213489532471, + "step": 335 + }, + { + "epoch": 0.35, + "learning_rate": 4.136527926061157e-05, + "logits/chosen": -2.2869348526000977, + "logits/rejected": -2.3848395347595215, + "logps/chosen": -354.1495361328125, + "logps/rejected": -422.5235900878906, + "loss": 0.7545, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5594637393951416, + "rewards/margins": -0.08133678883314133, + "rewards/rejected": -0.4781269133090973, + "step": 336 + }, + { + "epoch": 0.35, + "learning_rate": 4.1296531695248666e-05, + "logits/chosen": -2.1156575679779053, + "logits/rejected": -2.0625457763671875, + "logps/chosen": -420.004150390625, + "logps/rejected": -347.8268737792969, + "loss": 0.7191, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7436313033103943, + "rewards/margins": 0.03417450189590454, + "rewards/rejected": -0.777805745601654, + "step": 337 + }, + { + "epoch": 0.35, + "learning_rate": 4.1227569167616206e-05, + "logits/chosen": -1.9652526378631592, + "logits/rejected": -2.0387110710144043, + "logps/chosen": -294.15997314453125, + "logps/rejected": -333.1921081542969, + "loss": 0.6429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.547760009765625, + "rewards/margins": 0.18485116958618164, + "rewards/rejected": -0.7326111793518066, + "step": 338 + }, + { + "epoch": 0.35, + "learning_rate": 4.1158392587376536e-05, + "logits/chosen": -1.9591008424758911, + "logits/rejected": -1.9757970571517944, + "logps/chosen": -308.64453125, + "logps/rejected": -373.3291320800781, + "loss": 0.7154, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3303202986717224, + "rewards/margins": 0.0813257172703743, + "rewards/rejected": -0.4116460382938385, + "step": 339 + }, + { + "epoch": 0.35, + "learning_rate": 4.108900286701552e-05, + "logits/chosen": -2.0622799396514893, + "logits/rejected": -2.0668537616729736, + "logps/chosen": -209.19119262695312, + "logps/rejected": -266.43145751953125, + "loss": 0.6393, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.61204993724823, + "rewards/margins": 0.28989237546920776, + "rewards/rejected": -0.901942253112793, + "step": 340 + }, + { + "epoch": 0.35, + "learning_rate": 4.101940092183048e-05, + "logits/chosen": -2.208672523498535, + "logits/rejected": -2.2501301765441895, + "logps/chosen": -449.76763916015625, + "logps/rejected": -333.26617431640625, + "loss": 0.735, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4438176155090332, + "rewards/margins": 0.0284462571144104, + "rewards/rejected": -0.472263902425766, + "step": 341 + }, + { + "epoch": 0.35, + "learning_rate": 4.0949587669918124e-05, + "logits/chosen": -2.2545619010925293, + "logits/rejected": -2.284487009048462, + "logps/chosen": -368.6788635253906, + "logps/rejected": -404.1095275878906, + "loss": 0.6716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6989068984985352, + "rewards/margins": 0.13742834329605103, + "rewards/rejected": -0.8363352417945862, + "step": 342 + }, + { + "epoch": 0.36, + "learning_rate": 4.087956403216243e-05, + "logits/chosen": -2.098728895187378, + "logits/rejected": -1.9426969289779663, + "logps/chosen": -364.287109375, + "logps/rejected": -345.9195556640625, + "loss": 0.6768, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5762618780136108, + "rewards/margins": 0.06659980118274689, + "rewards/rejected": -0.6428617238998413, + "step": 343 + }, + { + "epoch": 0.36, + "learning_rate": 4.0809330932222525e-05, + "logits/chosen": -2.0413217544555664, + "logits/rejected": -1.770898699760437, + "logps/chosen": -338.1258850097656, + "logps/rejected": -310.4683837890625, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.612076461315155, + "rewards/margins": 0.09379884600639343, + "rewards/rejected": -0.7058753371238708, + "step": 344 + }, + { + "epoch": 0.36, + "learning_rate": 4.073888929652048e-05, + "logits/chosen": -1.8582487106323242, + "logits/rejected": -1.9838684797286987, + "logps/chosen": -279.77764892578125, + "logps/rejected": -287.03314208984375, + "loss": 0.6344, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22681152820587158, + "rewards/margins": 0.17675894498825073, + "rewards/rejected": -0.4035705029964447, + "step": 345 + }, + { + "epoch": 0.36, + "learning_rate": 4.066824005422907e-05, + "logits/chosen": -2.2785511016845703, + "logits/rejected": -2.248875379562378, + "logps/chosen": -282.7828369140625, + "logps/rejected": -282.3833923339844, + "loss": 0.6969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5595721006393433, + "rewards/margins": 0.007087539881467819, + "rewards/rejected": -0.5666596293449402, + "step": 346 + }, + { + "epoch": 0.36, + "learning_rate": 4.0597384137259576e-05, + "logits/chosen": -1.8838849067687988, + "logits/rejected": -1.992185115814209, + "logps/chosen": -249.31353759765625, + "logps/rejected": -291.52215576171875, + "loss": 0.639, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25019168853759766, + "rewards/margins": 0.13961070775985718, + "rewards/rejected": -0.3898024260997772, + "step": 347 + }, + { + "epoch": 0.36, + "learning_rate": 4.052632248024943e-05, + "logits/chosen": -2.194199562072754, + "logits/rejected": -2.2093465328216553, + "logps/chosen": -359.925048828125, + "logps/rejected": -341.787109375, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44080251455307007, + "rewards/margins": 0.10257270187139511, + "rewards/rejected": -0.5433753132820129, + "step": 348 + }, + { + "epoch": 0.36, + "learning_rate": 4.045505602054994e-05, + "logits/chosen": -1.9952166080474854, + "logits/rejected": -1.9181216955184937, + "logps/chosen": -272.50146484375, + "logps/rejected": -296.2552795410156, + "loss": 0.7549, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5810081958770752, + "rewards/margins": -0.0541771724820137, + "rewards/rejected": -0.5268309712409973, + "step": 349 + }, + { + "epoch": 0.36, + "learning_rate": 4.0383585698213876e-05, + "logits/chosen": -2.166259527206421, + "logits/rejected": -1.9561712741851807, + "logps/chosen": -392.21014404296875, + "logps/rejected": -360.259033203125, + "loss": 0.723, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7585784196853638, + "rewards/margins": 0.024934954941272736, + "rewards/rejected": -0.7835134267807007, + "step": 350 + }, + { + "epoch": 0.36, + "learning_rate": 4.03119124559831e-05, + "logits/chosen": -2.117828845977783, + "logits/rejected": -2.2696075439453125, + "logps/chosen": -370.1112060546875, + "logps/rejected": -357.4794921875, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7355208396911621, + "rewards/margins": 0.01835598051548004, + "rewards/rejected": -0.7538768649101257, + "step": 351 + }, + { + "epoch": 0.36, + "learning_rate": 4.024003723927614e-05, + "logits/chosen": -2.15438175201416, + "logits/rejected": -2.240237236022949, + "logps/chosen": -291.9135437011719, + "logps/rejected": -306.6573791503906, + "loss": 0.6362, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6013116836547852, + "rewards/margins": 0.29444190859794617, + "rewards/rejected": -0.8957535028457642, + "step": 352 + }, + { + "epoch": 0.37, + "learning_rate": 4.016796099617569e-05, + "logits/chosen": -2.123490571975708, + "logits/rejected": -1.9996310472488403, + "logps/chosen": -320.6844787597656, + "logps/rejected": -328.69732666015625, + "loss": 0.7201, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6908583045005798, + "rewards/margins": 0.018049392849206924, + "rewards/rejected": -0.7089077234268188, + "step": 353 + }, + { + "epoch": 0.37, + "learning_rate": 4.009568467741611e-05, + "logits/chosen": -2.1538658142089844, + "logits/rejected": -2.2801826000213623, + "logps/chosen": -332.2469482421875, + "logps/rejected": -398.9875793457031, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35775309801101685, + "rewards/margins": 0.2206544280052185, + "rewards/rejected": -0.5784075260162354, + "step": 354 + }, + { + "epoch": 0.37, + "learning_rate": 4.0023209236370905e-05, + "logits/chosen": -2.1057112216949463, + "logits/rejected": -1.9648573398590088, + "logps/chosen": -304.4049377441406, + "logps/rejected": -331.1800842285156, + "loss": 0.5937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4884049594402313, + "rewards/margins": 0.28548693656921387, + "rewards/rejected": -0.7738919258117676, + "step": 355 + }, + { + "epoch": 0.37, + "learning_rate": 3.9950535629040154e-05, + "logits/chosen": -2.075382947921753, + "logits/rejected": -2.061739206314087, + "logps/chosen": -287.4841003417969, + "logps/rejected": -279.5746765136719, + "loss": 0.6667, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.42838478088378906, + "rewards/margins": 0.09299005568027496, + "rewards/rejected": -0.5213748216629028, + "step": 356 + }, + { + "epoch": 0.37, + "learning_rate": 3.9877664814037844e-05, + "logits/chosen": -2.041006088256836, + "logits/rejected": -2.146519899368286, + "logps/chosen": -234.5111083984375, + "logps/rejected": -315.4188232421875, + "loss": 0.5746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44257616996765137, + "rewards/margins": 0.30319535732269287, + "rewards/rejected": -0.745771586894989, + "step": 357 + }, + { + "epoch": 0.37, + "learning_rate": 3.98045977525793e-05, + "logits/chosen": -1.9708685874938965, + "logits/rejected": -2.119077444076538, + "logps/chosen": -233.92739868164062, + "logps/rejected": -251.74658203125, + "loss": 0.7386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43048954010009766, + "rewards/margins": 0.0007461756467819214, + "rewards/rejected": -0.43123573064804077, + "step": 358 + }, + { + "epoch": 0.37, + "learning_rate": 3.973133540846844e-05, + "logits/chosen": -2.2124104499816895, + "logits/rejected": -2.3888068199157715, + "logps/chosen": -378.053955078125, + "logps/rejected": -432.74102783203125, + "loss": 0.6094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4846843481063843, + "rewards/margins": 0.2898719012737274, + "rewards/rejected": -0.7745562195777893, + "step": 359 + }, + { + "epoch": 0.37, + "learning_rate": 3.965787874808513e-05, + "logits/chosen": -2.182685375213623, + "logits/rejected": -2.245464324951172, + "logps/chosen": -303.55377197265625, + "logps/rejected": -304.9583435058594, + "loss": 0.7466, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.72776859998703, + "rewards/margins": -0.010826468467712402, + "rewards/rejected": -0.7169421911239624, + "step": 360 + }, + { + "epoch": 0.37, + "learning_rate": 3.958422874037236e-05, + "logits/chosen": -2.2944741249084473, + "logits/rejected": -2.1541600227355957, + "logps/chosen": -325.5491638183594, + "logps/rejected": -359.56488037109375, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6380666494369507, + "rewards/margins": 0.19594302773475647, + "rewards/rejected": -0.8340096473693848, + "step": 361 + }, + { + "epoch": 0.37, + "learning_rate": 3.951038635682353e-05, + "logits/chosen": -2.183659553527832, + "logits/rejected": -2.309375762939453, + "logps/chosen": -233.40463256835938, + "logps/rejected": -258.43792724609375, + "loss": 0.5859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.34500959515571594, + "rewards/margins": 0.3104623556137085, + "rewards/rejected": -0.655471920967102, + "step": 362 + }, + { + "epoch": 0.38, + "learning_rate": 3.943635257146958e-05, + "logits/chosen": -2.189570903778076, + "logits/rejected": -2.307717800140381, + "logps/chosen": -310.58837890625, + "logps/rejected": -396.47412109375, + "loss": 0.5968, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5562077760696411, + "rewards/margins": 0.26935189962387085, + "rewards/rejected": -0.825559675693512, + "step": 363 + }, + { + "epoch": 0.38, + "learning_rate": 3.936212836086621e-05, + "logits/chosen": -2.087996244430542, + "logits/rejected": -2.061500310897827, + "logps/chosen": -320.0885009765625, + "logps/rejected": -367.2314453125, + "loss": 0.5962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5524263381958008, + "rewards/margins": 0.3210427165031433, + "rewards/rejected": -0.8734689354896545, + "step": 364 + }, + { + "epoch": 0.38, + "learning_rate": 3.9287714704080916e-05, + "logits/chosen": -2.2036032676696777, + "logits/rejected": -2.247941017150879, + "logps/chosen": -310.3907470703125, + "logps/rejected": -373.12005615234375, + "loss": 0.6413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7271711230278015, + "rewards/margins": 0.1365385353565216, + "rewards/rejected": -0.8637096881866455, + "step": 365 + }, + { + "epoch": 0.38, + "learning_rate": 3.9213112582680136e-05, + "logits/chosen": -2.071441888809204, + "logits/rejected": -2.2175564765930176, + "logps/chosen": -350.5240478515625, + "logps/rejected": -344.10015869140625, + "loss": 0.819, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7361069321632385, + "rewards/margins": -0.15310856699943542, + "rewards/rejected": -0.5829984545707703, + "step": 366 + }, + { + "epoch": 0.38, + "learning_rate": 3.913832298071629e-05, + "logits/chosen": -2.137769937515259, + "logits/rejected": -2.1863951683044434, + "logps/chosen": -261.62445068359375, + "logps/rejected": -287.54925537109375, + "loss": 0.5843, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5611026287078857, + "rewards/margins": 0.35456323623657227, + "rewards/rejected": -0.915665864944458, + "step": 367 + }, + { + "epoch": 0.38, + "learning_rate": 3.906334688471479e-05, + "logits/chosen": -2.322150230407715, + "logits/rejected": -2.2364730834960938, + "logps/chosen": -372.37115478515625, + "logps/rejected": -441.963623046875, + "loss": 0.7185, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7501680850982666, + "rewards/margins": 0.004912780597805977, + "rewards/rejected": -0.7550809383392334, + "step": 368 + }, + { + "epoch": 0.38, + "learning_rate": 3.8988185283661006e-05, + "logits/chosen": -2.3357200622558594, + "logits/rejected": -2.281803846359253, + "logps/chosen": -349.5588073730469, + "logps/rejected": -427.9027099609375, + "loss": 0.6668, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6943049430847168, + "rewards/margins": 0.08358533680438995, + "rewards/rejected": -0.7778902649879456, + "step": 369 + }, + { + "epoch": 0.38, + "learning_rate": 3.8912839168987286e-05, + "logits/chosen": -2.0027129650115967, + "logits/rejected": -2.201495885848999, + "logps/chosen": -335.1200866699219, + "logps/rejected": -356.2291259765625, + "loss": 0.7201, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.825676441192627, + "rewards/margins": 0.053414199501276016, + "rewards/rejected": -0.8790906667709351, + "step": 370 + }, + { + "epoch": 0.38, + "learning_rate": 3.883730953455981e-05, + "logits/chosen": -2.0758540630340576, + "logits/rejected": -2.2228283882141113, + "logps/chosen": -330.1047058105469, + "logps/rejected": -359.3675842285156, + "loss": 0.6519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9475977420806885, + "rewards/margins": 0.2040516585111618, + "rewards/rejected": -1.1516493558883667, + "step": 371 + }, + { + "epoch": 0.39, + "learning_rate": 3.876159737666551e-05, + "logits/chosen": -2.0982770919799805, + "logits/rejected": -2.1729726791381836, + "logps/chosen": -386.24560546875, + "logps/rejected": -394.7810974121094, + "loss": 0.7604, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8945655226707458, + "rewards/margins": -0.06084037944674492, + "rewards/rejected": -0.8337251543998718, + "step": 372 + }, + { + "epoch": 0.39, + "learning_rate": 3.868570369399894e-05, + "logits/chosen": -2.116682767868042, + "logits/rejected": -2.1849117279052734, + "logps/chosen": -251.51715087890625, + "logps/rejected": -259.1833801269531, + "loss": 0.6078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6918532252311707, + "rewards/margins": 0.33846980333328247, + "rewards/rejected": -1.0303230285644531, + "step": 373 + }, + { + "epoch": 0.39, + "learning_rate": 3.860962948764906e-05, + "logits/chosen": -2.0886361598968506, + "logits/rejected": -2.046086549758911, + "logps/chosen": -292.8084411621094, + "logps/rejected": -357.3808898925781, + "loss": 0.6235, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5758143067359924, + "rewards/margins": 0.24546638131141663, + "rewards/rejected": -0.8212807178497314, + "step": 374 + }, + { + "epoch": 0.39, + "learning_rate": 3.85333757610861e-05, + "logits/chosen": -2.0787835121154785, + "logits/rejected": -2.094371795654297, + "logps/chosen": -318.9442443847656, + "logps/rejected": -354.12188720703125, + "loss": 0.6207, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6455579996109009, + "rewards/margins": 0.18885663151741028, + "rewards/rejected": -0.8344146013259888, + "step": 375 + }, + { + "epoch": 0.39, + "learning_rate": 3.845694352014825e-05, + "logits/chosen": -2.0175139904022217, + "logits/rejected": -2.0191752910614014, + "logps/chosen": -341.6969299316406, + "logps/rejected": -383.448974609375, + "loss": 0.7511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.768385648727417, + "rewards/margins": -0.034262366592884064, + "rewards/rejected": -0.7341232895851135, + "step": 376 + }, + { + "epoch": 0.39, + "learning_rate": 3.838033377302844e-05, + "logits/chosen": -2.143493413925171, + "logits/rejected": -2.2795639038085938, + "logps/chosen": -304.6917724609375, + "logps/rejected": -347.99774169921875, + "loss": 0.7008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7183622717857361, + "rewards/margins": 0.0656728744506836, + "rewards/rejected": -0.7840351462364197, + "step": 377 + }, + { + "epoch": 0.39, + "learning_rate": 3.830354753026102e-05, + "logits/chosen": -2.0630643367767334, + "logits/rejected": -2.1921281814575195, + "logps/chosen": -290.36041259765625, + "logps/rejected": -370.9847717285156, + "loss": 0.6672, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6518306732177734, + "rewards/margins": 0.20937411487102509, + "rewards/rejected": -0.8612047433853149, + "step": 378 + }, + { + "epoch": 0.39, + "learning_rate": 3.8226585804708435e-05, + "logits/chosen": -2.1670899391174316, + "logits/rejected": -2.202104330062866, + "logps/chosen": -399.4073486328125, + "logps/rejected": -372.1466979980469, + "loss": 0.7435, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7544846534729004, + "rewards/margins": -0.017259221524000168, + "rewards/rejected": -0.7372254133224487, + "step": 379 + }, + { + "epoch": 0.39, + "learning_rate": 3.8149449611547886e-05, + "logits/chosen": -2.1285555362701416, + "logits/rejected": -2.118056297302246, + "logps/chosen": -325.7785949707031, + "logps/rejected": -354.8650817871094, + "loss": 0.6628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7010979056358337, + "rewards/margins": 0.0963049978017807, + "rewards/rejected": -0.7974028587341309, + "step": 380 + }, + { + "epoch": 0.39, + "learning_rate": 3.807213996825788e-05, + "logits/chosen": -2.134826183319092, + "logits/rejected": -2.105046272277832, + "logps/chosen": -347.5355529785156, + "logps/rejected": -370.0594482421875, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.640611469745636, + "rewards/margins": 0.43629205226898193, + "rewards/rejected": -1.0769035816192627, + "step": 381 + }, + { + "epoch": 0.4, + "learning_rate": 3.7994657894604906e-05, + "logits/chosen": -1.999627709388733, + "logits/rejected": -2.0539674758911133, + "logps/chosen": -324.84100341796875, + "logps/rejected": -294.4129333496094, + "loss": 0.5771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6821228265762329, + "rewards/margins": 0.3325861096382141, + "rewards/rejected": -1.0147089958190918, + "step": 382 + }, + { + "epoch": 0.4, + "learning_rate": 3.791700441262987e-05, + "logits/chosen": -2.2994258403778076, + "logits/rejected": -2.4870550632476807, + "logps/chosen": -308.85284423828125, + "logps/rejected": -387.4725341796875, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8057706356048584, + "rewards/margins": 0.3048417270183563, + "rewards/rejected": -1.110612392425537, + "step": 383 + }, + { + "epoch": 0.4, + "learning_rate": 3.78391805466347e-05, + "logits/chosen": -2.0464038848876953, + "logits/rejected": -1.9356979131698608, + "logps/chosen": -348.22613525390625, + "logps/rejected": -336.2864685058594, + "loss": 0.7458, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8788054585456848, + "rewards/margins": 0.08573350310325623, + "rewards/rejected": -0.9645389914512634, + "step": 384 + }, + { + "epoch": 0.4, + "learning_rate": 3.7761187323168804e-05, + "logits/chosen": -2.0775394439697266, + "logits/rejected": -2.0725295543670654, + "logps/chosen": -378.3978576660156, + "logps/rejected": -367.72894287109375, + "loss": 0.7507, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8533260822296143, + "rewards/margins": -0.06236880645155907, + "rewards/rejected": -0.7909572124481201, + "step": 385 + }, + { + "epoch": 0.4, + "learning_rate": 3.7683025771015515e-05, + "logits/chosen": -2.129138946533203, + "logits/rejected": -2.243818521499634, + "logps/chosen": -343.4689636230469, + "logps/rejected": -372.4132995605469, + "loss": 0.6416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.645578145980835, + "rewards/margins": 0.2473578155040741, + "rewards/rejected": -0.8929359912872314, + "step": 386 + }, + { + "epoch": 0.4, + "learning_rate": 3.760469692117854e-05, + "logits/chosen": -2.071223735809326, + "logits/rejected": -1.9962562322616577, + "logps/chosen": -256.22509765625, + "logps/rejected": -263.2807312011719, + "loss": 0.6128, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.746870219707489, + "rewards/margins": 0.27368226647377014, + "rewards/rejected": -1.0205525159835815, + "step": 387 + }, + { + "epoch": 0.4, + "learning_rate": 3.752620180686837e-05, + "logits/chosen": -2.195363998413086, + "logits/rejected": -2.3401169776916504, + "logps/chosen": -329.45050048828125, + "logps/rejected": -367.7939758300781, + "loss": 0.6203, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8209664821624756, + "rewards/margins": 0.39947718381881714, + "rewards/rejected": -1.2204437255859375, + "step": 388 + }, + { + "epoch": 0.4, + "learning_rate": 3.744754146348862e-05, + "logits/chosen": -2.3540046215057373, + "logits/rejected": -2.102443218231201, + "logps/chosen": -439.45501708984375, + "logps/rejected": -325.82647705078125, + "loss": 0.7747, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0380897521972656, + "rewards/margins": -0.03507265821099281, + "rewards/rejected": -1.0030171871185303, + "step": 389 + }, + { + "epoch": 0.4, + "learning_rate": 3.736871692862239e-05, + "logits/chosen": -2.0107619762420654, + "logits/rejected": -2.133056879043579, + "logps/chosen": -280.13372802734375, + "logps/rejected": -372.2601623535156, + "loss": 0.5378, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8871760368347168, + "rewards/margins": 0.42683565616607666, + "rewards/rejected": -1.3140116930007935, + "step": 390 + }, + { + "epoch": 0.4, + "learning_rate": 3.7289729242018586e-05, + "logits/chosen": -2.216970682144165, + "logits/rejected": -2.1814701557159424, + "logps/chosen": -248.73269653320312, + "logps/rejected": -246.78436279296875, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7154771089553833, + "rewards/margins": 0.5849171280860901, + "rewards/rejected": -1.3003942966461182, + "step": 391 + }, + { + "epoch": 0.41, + "learning_rate": 3.721057944557819e-05, + "logits/chosen": -2.026475667953491, + "logits/rejected": -2.0671896934509277, + "logps/chosen": -298.8050842285156, + "logps/rejected": -324.1427307128906, + "loss": 0.5981, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.523551344871521, + "rewards/margins": 0.30002254247665405, + "rewards/rejected": -0.823573887348175, + "step": 392 + }, + { + "epoch": 0.41, + "learning_rate": 3.713126858334052e-05, + "logits/chosen": -1.7506847381591797, + "logits/rejected": -1.871716022491455, + "logps/chosen": -272.1562194824219, + "logps/rejected": -365.52752685546875, + "loss": 0.5324, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7902772426605225, + "rewards/margins": 0.6367801427841187, + "rewards/rejected": -1.4270575046539307, + "step": 393 + }, + { + "epoch": 0.41, + "learning_rate": 3.705179770146946e-05, + "logits/chosen": -2.215156316757202, + "logits/rejected": -2.17736554145813, + "logps/chosen": -338.9674987792969, + "logps/rejected": -318.7024230957031, + "loss": 0.7397, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7532888650894165, + "rewards/margins": -0.0309628713876009, + "rewards/rejected": -0.7223260402679443, + "step": 394 + }, + { + "epoch": 0.41, + "learning_rate": 3.697216784823967e-05, + "logits/chosen": -2.0116758346557617, + "logits/rejected": -2.1581830978393555, + "logps/chosen": -242.60418701171875, + "logps/rejected": -268.9742431640625, + "loss": 0.8924, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1447850465774536, + "rewards/margins": -0.10007806122303009, + "rewards/rejected": -1.0447068214416504, + "step": 395 + }, + { + "epoch": 0.41, + "learning_rate": 3.689238007402275e-05, + "logits/chosen": -2.0335168838500977, + "logits/rejected": -1.962857723236084, + "logps/chosen": -239.37826538085938, + "logps/rejected": -250.44223022460938, + "loss": 0.7472, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.880851149559021, + "rewards/margins": 0.022362351417541504, + "rewards/rejected": -0.9032134413719177, + "step": 396 + }, + { + "epoch": 0.41, + "learning_rate": 3.6812435431273374e-05, + "logits/chosen": -2.365676164627075, + "logits/rejected": -2.3459362983703613, + "logps/chosen": -479.81549072265625, + "logps/rejected": -578.3353271484375, + "loss": 0.586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9297402501106262, + "rewards/margins": 0.4031886160373688, + "rewards/rejected": -1.3329288959503174, + "step": 397 + }, + { + "epoch": 0.41, + "learning_rate": 3.673233497451541e-05, + "logits/chosen": -2.2611470222473145, + "logits/rejected": -2.1032679080963135, + "logps/chosen": -301.1606140136719, + "logps/rejected": -320.68115234375, + "loss": 0.9109, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.3444305658340454, + "rewards/margins": -0.23366227746009827, + "rewards/rejected": -1.1107683181762695, + "step": 398 + }, + { + "epoch": 0.41, + "learning_rate": 3.665207976032804e-05, + "logits/chosen": -2.068887948989868, + "logits/rejected": -2.339625835418701, + "logps/chosen": -389.18975830078125, + "logps/rejected": -526.3670654296875, + "loss": 0.526, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0946829319000244, + "rewards/margins": 0.6931131482124329, + "rewards/rejected": -1.7877960205078125, + "step": 399 + }, + { + "epoch": 0.41, + "learning_rate": 3.65716708473318e-05, + "logits/chosen": -2.0643274784088135, + "logits/rejected": -1.928961992263794, + "logps/chosen": -350.87506103515625, + "logps/rejected": -305.41510009765625, + "loss": 0.8925, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5230122804641724, + "rewards/margins": -0.1989414244890213, + "rewards/rejected": -1.324070930480957, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 3.64911092961746e-05, + "logits/chosen": -1.9068621397018433, + "logits/rejected": -2.146777629852295, + "logps/chosen": -401.4383544921875, + "logps/rejected": -405.74755859375, + "loss": 0.6681, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1909916400909424, + "rewards/margins": 0.21687617897987366, + "rewards/rejected": -1.4078677892684937, + "step": 401 + }, + { + "epoch": 0.42, + "learning_rate": 3.641039616951776e-05, + "logits/chosen": -1.996084451675415, + "logits/rejected": -1.9715969562530518, + "logps/chosen": -285.1630859375, + "logps/rejected": -260.9218444824219, + "loss": 0.7071, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1817179918289185, + "rewards/margins": 0.04514620825648308, + "rewards/rejected": -1.226864218711853, + "step": 402 + }, + { + "epoch": 0.42, + "learning_rate": 3.632953253202199e-05, + "logits/chosen": -1.890580177307129, + "logits/rejected": -2.0182905197143555, + "logps/chosen": -329.42694091796875, + "logps/rejected": -450.1892395019531, + "loss": 0.6209, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0331366062164307, + "rewards/margins": 0.35443419218063354, + "rewards/rejected": -1.3875707387924194, + "step": 403 + }, + { + "epoch": 0.42, + "learning_rate": 3.6248519450333315e-05, + "logits/chosen": -2.3349204063415527, + "logits/rejected": -2.1745338439941406, + "logps/chosen": -383.373291015625, + "logps/rejected": -421.9166259765625, + "loss": 0.7308, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1230090856552124, + "rewards/margins": 0.11969134211540222, + "rewards/rejected": -1.2427003383636475, + "step": 404 + }, + { + "epoch": 0.42, + "learning_rate": 3.6167357993069075e-05, + "logits/chosen": -2.124786615371704, + "logits/rejected": -2.2353270053863525, + "logps/chosen": -398.7025146484375, + "logps/rejected": -462.741455078125, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0880743265151978, + "rewards/margins": 0.24901491403579712, + "rewards/rejected": -1.3370893001556396, + "step": 405 + }, + { + "epoch": 0.42, + "learning_rate": 3.608604923080373e-05, + "logits/chosen": -2.124338150024414, + "logits/rejected": -1.9945569038391113, + "logps/chosen": -410.68701171875, + "logps/rejected": -351.4372863769531, + "loss": 0.6415, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3134312629699707, + "rewards/margins": 0.23311342298984528, + "rewards/rejected": -1.5465446710586548, + "step": 406 + }, + { + "epoch": 0.42, + "learning_rate": 3.6004594236054836e-05, + "logits/chosen": -1.9870651960372925, + "logits/rejected": -1.9105538129806519, + "logps/chosen": -301.83245849609375, + "logps/rejected": -309.3512268066406, + "loss": 0.5724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9611720442771912, + "rewards/margins": 0.3760131001472473, + "rewards/rejected": -1.337185263633728, + "step": 407 + }, + { + "epoch": 0.42, + "learning_rate": 3.592299408326883e-05, + "logits/chosen": -2.201324462890625, + "logits/rejected": -2.078688383102417, + "logps/chosen": -331.58795166015625, + "logps/rejected": -408.33770751953125, + "loss": 0.5669, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2536283731460571, + "rewards/margins": 0.36857056617736816, + "rewards/rejected": -1.6221990585327148, + "step": 408 + }, + { + "epoch": 0.42, + "learning_rate": 3.584124984880689e-05, + "logits/chosen": -1.9430594444274902, + "logits/rejected": -2.1800172328948975, + "logps/chosen": -255.10409545898438, + "logps/rejected": -318.5274658203125, + "loss": 0.4678, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7731481194496155, + "rewards/margins": 0.704475462436676, + "rewards/rejected": -1.4776235818862915, + "step": 409 + }, + { + "epoch": 0.42, + "learning_rate": 3.575936261093073e-05, + "logits/chosen": -2.026442766189575, + "logits/rejected": -2.3520658016204834, + "logps/chosen": -224.86386108398438, + "logps/rejected": -284.865234375, + "loss": 0.6649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.988176703453064, + "rewards/margins": 0.25633472204208374, + "rewards/rejected": -1.2445114850997925, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 3.5677333449788374e-05, + "logits/chosen": -2.2224576473236084, + "logits/rejected": -2.2077910900115967, + "logps/chosen": -376.9631042480469, + "logps/rejected": -320.0201721191406, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.145774006843567, + "rewards/margins": 0.23624449968338013, + "rewards/rejected": -1.3820184469223022, + "step": 411 + }, + { + "epoch": 0.43, + "learning_rate": 3.559516344739991e-05, + "logits/chosen": -2.0415990352630615, + "logits/rejected": -2.077162981033325, + "logps/chosen": -276.7152404785156, + "logps/rejected": -296.462646484375, + "loss": 0.573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8756453394889832, + "rewards/margins": 0.3228147625923157, + "rewards/rejected": -1.1984599828720093, + "step": 412 + }, + { + "epoch": 0.43, + "learning_rate": 3.551285368764321e-05, + "logits/chosen": -2.171372890472412, + "logits/rejected": -2.1741371154785156, + "logps/chosen": -273.3794860839844, + "logps/rejected": -258.83477783203125, + "loss": 0.7963, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3004168272018433, + "rewards/margins": 0.03895503282546997, + "rewards/rejected": -1.3393718004226685, + "step": 413 + }, + { + "epoch": 0.43, + "learning_rate": 3.543040525623965e-05, + "logits/chosen": -2.0618252754211426, + "logits/rejected": -2.1750998497009277, + "logps/chosen": -234.17120361328125, + "logps/rejected": -300.029296875, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0354472398757935, + "rewards/margins": 0.45164865255355835, + "rewards/rejected": -1.487095832824707, + "step": 414 + }, + { + "epoch": 0.43, + "learning_rate": 3.534781924073978e-05, + "logits/chosen": -1.8313791751861572, + "logits/rejected": -2.19712233543396, + "logps/chosen": -263.08843994140625, + "logps/rejected": -374.9747314453125, + "loss": 0.6435, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3530242443084717, + "rewards/margins": 0.28404536843299866, + "rewards/rejected": -1.6370694637298584, + "step": 415 + }, + { + "epoch": 0.43, + "learning_rate": 3.5265096730508974e-05, + "logits/chosen": -1.992910385131836, + "logits/rejected": -2.050726890563965, + "logps/chosen": -332.4587707519531, + "logps/rejected": -448.9661560058594, + "loss": 0.5689, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1743048429489136, + "rewards/margins": 0.4015694260597229, + "rewards/rejected": -1.5758743286132812, + "step": 416 + }, + { + "epoch": 0.43, + "learning_rate": 3.518223881671305e-05, + "logits/chosen": -2.2300572395324707, + "logits/rejected": -2.292898416519165, + "logps/chosen": -361.424560546875, + "logps/rejected": -408.99267578125, + "loss": 0.7222, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.523597240447998, + "rewards/margins": 0.003444090485572815, + "rewards/rejected": -1.5270413160324097, + "step": 417 + }, + { + "epoch": 0.43, + "learning_rate": 3.509924659230392e-05, + "logits/chosen": -1.9317662715911865, + "logits/rejected": -2.1398890018463135, + "logps/chosen": -201.24151611328125, + "logps/rejected": -316.9627685546875, + "loss": 0.7506, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2115095853805542, + "rewards/margins": 0.002029839903116226, + "rewards/rejected": -1.2135393619537354, + "step": 418 + }, + { + "epoch": 0.43, + "learning_rate": 3.501612115200512e-05, + "logits/chosen": -1.8960869312286377, + "logits/rejected": -1.8769932985305786, + "logps/chosen": -230.08914184570312, + "logps/rejected": -274.8809814453125, + "loss": 0.7128, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2795968055725098, + "rewards/margins": 0.10936379432678223, + "rewards/rejected": -1.3889607191085815, + "step": 419 + }, + { + "epoch": 0.43, + "learning_rate": 3.4932863592297395e-05, + "logits/chosen": -2.1262502670288086, + "logits/rejected": -2.0916502475738525, + "logps/chosen": -288.3411865234375, + "logps/rejected": -374.0597839355469, + "loss": 0.587, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2651339769363403, + "rewards/margins": 0.49100592732429504, + "rewards/rejected": -1.7561399936676025, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 3.4849475011404246e-05, + "logits/chosen": -1.932436227798462, + "logits/rejected": -2.0958409309387207, + "logps/chosen": -385.390380859375, + "logps/rejected": -420.717529296875, + "loss": 0.6097, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2712079286575317, + "rewards/margins": 0.29501456022262573, + "rewards/rejected": -1.5662224292755127, + "step": 421 + }, + { + "epoch": 0.44, + "learning_rate": 3.476595650927741e-05, + "logits/chosen": -2.2775261402130127, + "logits/rejected": -2.2956340312957764, + "logps/chosen": -357.61767578125, + "logps/rejected": -349.096923828125, + "loss": 0.736, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1353936195373535, + "rewards/margins": 0.031133286654949188, + "rewards/rejected": -1.1665267944335938, + "step": 422 + }, + { + "epoch": 0.44, + "learning_rate": 3.468230918758242e-05, + "logits/chosen": -2.1961703300476074, + "logits/rejected": -2.291398048400879, + "logps/chosen": -308.3431701660156, + "logps/rejected": -314.66851806640625, + "loss": 0.6423, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1682064533233643, + "rewards/margins": 0.12875376641750336, + "rewards/rejected": -1.2969601154327393, + "step": 423 + }, + { + "epoch": 0.44, + "learning_rate": 3.459853414968397e-05, + "logits/chosen": -2.1401329040527344, + "logits/rejected": -2.0424118041992188, + "logps/chosen": -323.16693115234375, + "logps/rejected": -300.6637268066406, + "loss": 0.7578, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.309455156326294, + "rewards/margins": 0.23452845215797424, + "rewards/rejected": -1.5439834594726562, + "step": 424 + }, + { + "epoch": 0.44, + "learning_rate": 3.451463250063146e-05, + "logits/chosen": -2.1163456439971924, + "logits/rejected": -2.154703140258789, + "logps/chosen": -307.1343688964844, + "logps/rejected": -349.54791259765625, + "loss": 0.6657, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2518941164016724, + "rewards/margins": 0.306789755821228, + "rewards/rejected": -1.55868399143219, + "step": 425 + }, + { + "epoch": 0.44, + "learning_rate": 3.443060534714434e-05, + "logits/chosen": -1.820733666419983, + "logits/rejected": -1.8521438837051392, + "logps/chosen": -297.1275939941406, + "logps/rejected": -264.0213623046875, + "loss": 0.761, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.155602216720581, + "rewards/margins": 0.04654591530561447, + "rewards/rejected": -1.2021480798721313, + "step": 426 + }, + { + "epoch": 0.44, + "learning_rate": 3.4346453797597576e-05, + "logits/chosen": -2.069772243499756, + "logits/rejected": -1.9325459003448486, + "logps/chosen": -291.765869140625, + "logps/rejected": -291.6902160644531, + "loss": 0.6282, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4001034498214722, + "rewards/margins": 0.3071330487728119, + "rewards/rejected": -1.7072365283966064, + "step": 427 + }, + { + "epoch": 0.44, + "learning_rate": 3.426217896200699e-05, + "logits/chosen": -1.9241890907287598, + "logits/rejected": -1.9453171491622925, + "logps/chosen": -356.77081298828125, + "logps/rejected": -332.4874572753906, + "loss": 0.7444, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3764888048171997, + "rewards/margins": 0.23870763182640076, + "rewards/rejected": -1.6151964664459229, + "step": 428 + }, + { + "epoch": 0.44, + "learning_rate": 3.417778195201464e-05, + "logits/chosen": -2.045527935028076, + "logits/rejected": -2.2840356826782227, + "logps/chosen": -358.7695617675781, + "logps/rejected": -411.1256408691406, + "loss": 0.6085, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.457747459411621, + "rewards/margins": 0.2899523377418518, + "rewards/rejected": -1.7476999759674072, + "step": 429 + }, + { + "epoch": 0.45, + "learning_rate": 3.4093263880874136e-05, + "logits/chosen": -2.2168030738830566, + "logits/rejected": -2.026329755783081, + "logps/chosen": -391.6089172363281, + "logps/rejected": -444.5746154785156, + "loss": 0.7252, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3314502239227295, + "rewards/margins": 0.08630897104740143, + "rewards/rejected": -1.4177591800689697, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 3.400862586343597e-05, + "logits/chosen": -2.105616569519043, + "logits/rejected": -2.0884623527526855, + "logps/chosen": -383.8026123046875, + "logps/rejected": -367.6466064453125, + "loss": 0.577, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5232832431793213, + "rewards/margins": 0.3972979784011841, + "rewards/rejected": -1.920581340789795, + "step": 431 + }, + { + "epoch": 0.45, + "learning_rate": 3.392386901613282e-05, + "logits/chosen": -2.2307028770446777, + "logits/rejected": -1.9826226234436035, + "logps/chosen": -296.00274658203125, + "logps/rejected": -274.8576354980469, + "loss": 0.8504, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0603615045547485, + "rewards/margins": -0.11922366172075272, + "rewards/rejected": -0.9411377310752869, + "step": 432 + }, + { + "epoch": 0.45, + "learning_rate": 3.383899445696477e-05, + "logits/chosen": -1.8530570268630981, + "logits/rejected": -1.8290507793426514, + "logps/chosen": -352.0951843261719, + "logps/rejected": -424.3826599121094, + "loss": 0.6405, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2079800367355347, + "rewards/margins": 0.37576451897621155, + "rewards/rejected": -1.5837446451187134, + "step": 433 + }, + { + "epoch": 0.45, + "learning_rate": 3.375400330548466e-05, + "logits/chosen": -2.0532162189483643, + "logits/rejected": -1.988155722618103, + "logps/chosen": -418.684814453125, + "logps/rejected": -460.6349792480469, + "loss": 0.8049, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.395944595336914, + "rewards/margins": -0.09554408490657806, + "rewards/rejected": -1.3004004955291748, + "step": 434 + }, + { + "epoch": 0.45, + "learning_rate": 3.366889668278321e-05, + "logits/chosen": -1.9795726537704468, + "logits/rejected": -2.1528823375701904, + "logps/chosen": -267.9879455566406, + "logps/rejected": -301.9024963378906, + "loss": 0.7124, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9974009990692139, + "rewards/margins": 0.12711931765079498, + "rewards/rejected": -1.1245203018188477, + "step": 435 + }, + { + "epoch": 0.45, + "learning_rate": 3.358367571147433e-05, + "logits/chosen": -1.886863112449646, + "logits/rejected": -2.1218717098236084, + "logps/chosen": -370.65704345703125, + "logps/rejected": -416.7871398925781, + "loss": 0.6567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.184512734413147, + "rewards/margins": 0.2182311713695526, + "rewards/rejected": -1.4027438163757324, + "step": 436 + }, + { + "epoch": 0.45, + "learning_rate": 3.3498341515680214e-05, + "logits/chosen": -2.1803653240203857, + "logits/rejected": -2.1077873706817627, + "logps/chosen": -321.48834228515625, + "logps/rejected": -281.7978820800781, + "loss": 0.7596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9977477192878723, + "rewards/margins": -0.019190065562725067, + "rewards/rejected": -0.9785577058792114, + "step": 437 + }, + { + "epoch": 0.45, + "learning_rate": 3.3412895221016605e-05, + "logits/chosen": -1.888815999031067, + "logits/rejected": -2.005762815475464, + "logps/chosen": -184.04368591308594, + "logps/rejected": -236.44418334960938, + "loss": 0.9197, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0804985761642456, + "rewards/margins": -0.14094766974449158, + "rewards/rejected": -0.9395509958267212, + "step": 438 + }, + { + "epoch": 0.45, + "learning_rate": 3.332733795457789e-05, + "logits/chosen": -1.9857516288757324, + "logits/rejected": -1.7923184633255005, + "logps/chosen": -267.1208801269531, + "logps/rejected": -263.0241394042969, + "loss": 0.5819, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9991573095321655, + "rewards/margins": 0.37854859232902527, + "rewards/rejected": -1.3777059316635132, + "step": 439 + }, + { + "epoch": 0.46, + "learning_rate": 3.324167084492226e-05, + "logits/chosen": -1.880125880241394, + "logits/rejected": -2.0244104862213135, + "logps/chosen": -300.0595703125, + "logps/rejected": -503.9610290527344, + "loss": 0.469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1374258995056152, + "rewards/margins": 0.9749306440353394, + "rewards/rejected": -2.112356662750244, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 3.3155895022056784e-05, + "logits/chosen": -2.1099507808685303, + "logits/rejected": -2.1651086807250977, + "logps/chosen": -297.15704345703125, + "logps/rejected": -323.30389404296875, + "loss": 0.7954, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2452776432037354, + "rewards/margins": -0.07082469016313553, + "rewards/rejected": -1.1744530200958252, + "step": 441 + }, + { + "epoch": 0.46, + "learning_rate": 3.3070011617422566e-05, + "logits/chosen": -1.9197720289230347, + "logits/rejected": -1.991129755973816, + "logps/chosen": -326.0094909667969, + "logps/rejected": -374.349609375, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8590810298919678, + "rewards/margins": 0.45791155099868774, + "rewards/rejected": -1.3169926404953003, + "step": 442 + }, + { + "epoch": 0.46, + "learning_rate": 3.2984021763879755e-05, + "logits/chosen": -2.2275571823120117, + "logits/rejected": -2.2043910026550293, + "logps/chosen": -371.75927734375, + "logps/rejected": -395.1778564453125, + "loss": 0.6456, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.126926302909851, + "rewards/margins": 0.2308472990989685, + "rewards/rejected": -1.3577736616134644, + "step": 443 + }, + { + "epoch": 0.46, + "learning_rate": 3.2897926595692664e-05, + "logits/chosen": -2.181673526763916, + "logits/rejected": -2.3108913898468018, + "logps/chosen": -357.79888916015625, + "logps/rejected": -431.6429443359375, + "loss": 0.62, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9887603521347046, + "rewards/margins": 0.32495391368865967, + "rewards/rejected": -1.3137142658233643, + "step": 444 + }, + { + "epoch": 0.46, + "learning_rate": 3.2811727248514754e-05, + "logits/chosen": -2.1919546127319336, + "logits/rejected": -2.1633992195129395, + "logps/chosen": -421.3163757324219, + "logps/rejected": -446.4229431152344, + "loss": 0.8466, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3915138244628906, + "rewards/margins": -0.18495866656303406, + "rewards/rejected": -1.2065550088882446, + "step": 445 + }, + { + "epoch": 0.46, + "learning_rate": 3.272542485937369e-05, + "logits/chosen": -2.2480454444885254, + "logits/rejected": -2.2334165573120117, + "logps/chosen": -532.5480346679688, + "logps/rejected": -461.50408935546875, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.161598563194275, + "rewards/margins": 0.1340804100036621, + "rewards/rejected": -1.2956790924072266, + "step": 446 + }, + { + "epoch": 0.46, + "learning_rate": 3.263902056665631e-05, + "logits/chosen": -2.1135687828063965, + "logits/rejected": -1.9993455410003662, + "logps/chosen": -338.2395324707031, + "logps/rejected": -354.9482727050781, + "loss": 0.7578, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.4973227977752686, + "rewards/margins": -0.08258108794689178, + "rewards/rejected": -1.4147417545318604, + "step": 447 + }, + { + "epoch": 0.46, + "learning_rate": 3.2552515510093674e-05, + "logits/chosen": -1.923673391342163, + "logits/rejected": -2.005218982696533, + "logps/chosen": -276.5377502441406, + "logps/rejected": -285.30633544921875, + "loss": 0.8311, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4315433502197266, + "rewards/margins": 0.01938357949256897, + "rewards/rejected": -1.4509271383285522, + "step": 448 + }, + { + "epoch": 0.46, + "learning_rate": 3.2465910830745924e-05, + "logits/chosen": -2.0653865337371826, + "logits/rejected": -2.0072455406188965, + "logps/chosen": -217.55328369140625, + "logps/rejected": -182.0865936279297, + "loss": 0.8905, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2046016454696655, + "rewards/margins": -0.10025043040513992, + "rewards/rejected": -1.104351282119751, + "step": 449 + }, + { + "epoch": 0.47, + "learning_rate": 3.237920767098735e-05, + "logits/chosen": -2.1028592586517334, + "logits/rejected": -2.201251745223999, + "logps/chosen": -325.9986572265625, + "logps/rejected": -432.1500244140625, + "loss": 0.7399, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1958823204040527, + "rewards/margins": 0.0005789399147033691, + "rewards/rejected": -1.1964612007141113, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 3.229240717449122e-05, + "logits/chosen": -2.1992039680480957, + "logits/rejected": -2.378601312637329, + "logps/chosen": -370.0502014160156, + "logps/rejected": -408.4455871582031, + "loss": 0.7355, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.168852686882019, + "rewards/margins": -0.0028562992811203003, + "rewards/rejected": -1.1659963130950928, + "step": 451 + }, + { + "epoch": 0.47, + "learning_rate": 3.2205510486214777e-05, + "logits/chosen": -2.1426610946655273, + "logits/rejected": -2.160429000854492, + "logps/chosen": -307.41278076171875, + "logps/rejected": -312.3187561035156, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0617390871047974, + "rewards/margins": 0.29938191175460815, + "rewards/rejected": -1.3611209392547607, + "step": 452 + }, + { + "epoch": 0.47, + "learning_rate": 3.211851875238408e-05, + "logits/chosen": -2.1584270000457764, + "logits/rejected": -1.9756699800491333, + "logps/chosen": -247.97222900390625, + "logps/rejected": -275.8572692871094, + "loss": 0.7132, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2110902070999146, + "rewards/margins": 0.21431638300418854, + "rewards/rejected": -1.425406575202942, + "step": 453 + }, + { + "epoch": 0.47, + "learning_rate": 3.203143312047889e-05, + "logits/chosen": -2.284712314605713, + "logits/rejected": -2.316433906555176, + "logps/chosen": -397.7944030761719, + "logps/rejected": -435.2258605957031, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0755599737167358, + "rewards/margins": 0.4171152710914612, + "rewards/rejected": -1.4926753044128418, + "step": 454 + }, + { + "epoch": 0.47, + "learning_rate": 3.1944254739217585e-05, + "logits/chosen": -2.3242409229278564, + "logits/rejected": -2.2877395153045654, + "logps/chosen": -317.1751403808594, + "logps/rejected": -319.5546569824219, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0918810367584229, + "rewards/margins": 0.174340158700943, + "rewards/rejected": -1.266221284866333, + "step": 455 + }, + { + "epoch": 0.47, + "learning_rate": 3.1856984758541924e-05, + "logits/chosen": -2.1573104858398438, + "logits/rejected": -2.1013479232788086, + "logps/chosen": -402.46051025390625, + "logps/rejected": -317.1357421875, + "loss": 0.5846, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2507472038269043, + "rewards/margins": 0.32065922021865845, + "rewards/rejected": -1.5714064836502075, + "step": 456 + }, + { + "epoch": 0.47, + "learning_rate": 3.176962432960197e-05, + "logits/chosen": -2.3160417079925537, + "logits/rejected": -2.1284031867980957, + "logps/chosen": -401.79803466796875, + "logps/rejected": -336.5020446777344, + "loss": 0.8215, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1392488479614258, + "rewards/margins": -0.045140765607357025, + "rewards/rejected": -1.094107985496521, + "step": 457 + }, + { + "epoch": 0.47, + "learning_rate": 3.168217460474081e-05, + "logits/chosen": -2.1930458545684814, + "logits/rejected": -2.081754684448242, + "logps/chosen": -408.9210510253906, + "logps/rejected": -364.5238037109375, + "loss": 0.7234, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2171686887741089, + "rewards/margins": 0.20472437143325806, + "rewards/rejected": -1.4218928813934326, + "step": 458 + }, + { + "epoch": 0.48, + "learning_rate": 3.159463673747945e-05, + "logits/chosen": -1.8765006065368652, + "logits/rejected": -1.8720567226409912, + "logps/chosen": -301.31402587890625, + "logps/rejected": -357.6469421386719, + "loss": 0.6946, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7754146456718445, + "rewards/margins": 0.04356713593006134, + "rewards/rejected": -0.8189818263053894, + "step": 459 + }, + { + "epoch": 0.48, + "learning_rate": 3.150701188250152e-05, + "logits/chosen": -2.129390001296997, + "logits/rejected": -1.9843943119049072, + "logps/chosen": -334.6359558105469, + "logps/rejected": -345.7218322753906, + "loss": 0.6842, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9497905373573303, + "rewards/margins": 0.12164813280105591, + "rewards/rejected": -1.0714386701583862, + "step": 460 + }, + { + "epoch": 0.48, + "learning_rate": 3.141930119563812e-05, + "logits/chosen": -1.9846031665802002, + "logits/rejected": -2.1700026988983154, + "logps/chosen": -316.71832275390625, + "logps/rejected": -362.517822265625, + "loss": 0.6957, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0000622272491455, + "rewards/margins": 0.041672270745038986, + "rewards/rejected": -1.0417344570159912, + "step": 461 + }, + { + "epoch": 0.48, + "learning_rate": 3.133150583385247e-05, + "logits/chosen": -2.246786594390869, + "logits/rejected": -2.29536771774292, + "logps/chosen": -401.23394775390625, + "logps/rejected": -394.4549865722656, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0588868856430054, + "rewards/margins": 0.1614934504032135, + "rewards/rejected": -1.220380425453186, + "step": 462 + }, + { + "epoch": 0.48, + "learning_rate": 3.124362695522476e-05, + "logits/chosen": -1.9954516887664795, + "logits/rejected": -2.391841411590576, + "logps/chosen": -270.8871154785156, + "logps/rejected": -376.65106201171875, + "loss": 0.6857, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9002559781074524, + "rewards/margins": 0.10894503444433212, + "rewards/rejected": -1.0092010498046875, + "step": 463 + }, + { + "epoch": 0.48, + "learning_rate": 3.115566571893681e-05, + "logits/chosen": -2.2636733055114746, + "logits/rejected": -2.159451484680176, + "logps/chosen": -318.85888671875, + "logps/rejected": -295.5828857421875, + "loss": 0.6792, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0352829694747925, + "rewards/margins": 0.12933281064033508, + "rewards/rejected": -1.1646157503128052, + "step": 464 + }, + { + "epoch": 0.48, + "learning_rate": 3.1067623285256766e-05, + "logits/chosen": -2.083453893661499, + "logits/rejected": -2.077559471130371, + "logps/chosen": -277.09197998046875, + "logps/rejected": -313.90728759765625, + "loss": 0.7098, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8074424266815186, + "rewards/margins": 0.011367838829755783, + "rewards/rejected": -0.8188102841377258, + "step": 465 + }, + { + "epoch": 0.48, + "learning_rate": 3.097950081552387e-05, + "logits/chosen": -2.062037467956543, + "logits/rejected": -1.969789981842041, + "logps/chosen": -272.09674072265625, + "logps/rejected": -282.55560302734375, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8802644610404968, + "rewards/margins": 0.13163542747497559, + "rewards/rejected": -1.0118999481201172, + "step": 466 + }, + { + "epoch": 0.48, + "learning_rate": 3.089129947213305e-05, + "logits/chosen": -2.0714964866638184, + "logits/rejected": -2.0289595127105713, + "logps/chosen": -338.4779968261719, + "logps/rejected": -305.9598083496094, + "loss": 0.6018, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8691202998161316, + "rewards/margins": 0.25768163800239563, + "rewards/rejected": -1.1268019676208496, + "step": 467 + }, + { + "epoch": 0.48, + "learning_rate": 3.080302041851966e-05, + "logits/chosen": -2.1845309734344482, + "logits/rejected": -2.330949306488037, + "logps/chosen": -318.61444091796875, + "logps/rejected": -337.0727844238281, + "loss": 0.7127, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0258458852767944, + "rewards/margins": 0.06928322464227676, + "rewards/rejected": -1.0951290130615234, + "step": 468 + }, + { + "epoch": 0.49, + "learning_rate": 3.071466481914409e-05, + "logits/chosen": -1.974360704421997, + "logits/rejected": -1.8974449634552002, + "logps/chosen": -328.3250732421875, + "logps/rejected": -353.5456237792969, + "loss": 0.811, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0966472625732422, + "rewards/margins": -0.16713061928749084, + "rewards/rejected": -0.9295165538787842, + "step": 469 + }, + { + "epoch": 0.49, + "learning_rate": 3.062623383947643e-05, + "logits/chosen": -1.96671462059021, + "logits/rejected": -2.175767421722412, + "logps/chosen": -325.94085693359375, + "logps/rejected": -443.0379943847656, + "loss": 0.7812, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.099274754524231, + "rewards/margins": -0.10562913119792938, + "rewards/rejected": -0.9936455488204956, + "step": 470 + }, + { + "epoch": 0.49, + "learning_rate": 3.053772864598108e-05, + "logits/chosen": -1.8880025148391724, + "logits/rejected": -2.134125232696533, + "logps/chosen": -347.37060546875, + "logps/rejected": -415.47735595703125, + "loss": 0.6779, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.082190990447998, + "rewards/margins": 0.09107710421085358, + "rewards/rejected": -1.1732680797576904, + "step": 471 + }, + { + "epoch": 0.49, + "learning_rate": 3.0449150406101367e-05, + "logits/chosen": -1.7812613248825073, + "logits/rejected": -1.7713969945907593, + "logps/chosen": -293.71978759765625, + "logps/rejected": -305.0588073730469, + "loss": 0.712, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0079898834228516, + "rewards/margins": 0.062352173030376434, + "rewards/rejected": -1.0703420639038086, + "step": 472 + }, + { + "epoch": 0.49, + "learning_rate": 3.0360500288244155e-05, + "logits/chosen": -2.2740883827209473, + "logits/rejected": -2.167978286743164, + "logps/chosen": -434.8294372558594, + "logps/rejected": -422.0832214355469, + "loss": 0.784, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0591871738433838, + "rewards/margins": -0.10172367095947266, + "rewards/rejected": -0.9574634432792664, + "step": 473 + }, + { + "epoch": 0.49, + "learning_rate": 3.0271779461764426e-05, + "logits/chosen": -2.059706926345825, + "logits/rejected": -2.0634119510650635, + "logps/chosen": -401.7886047363281, + "logps/rejected": -436.7006530761719, + "loss": 0.6583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9169749021530151, + "rewards/margins": 0.19741462171077728, + "rewards/rejected": -1.114389419555664, + "step": 474 + }, + { + "epoch": 0.49, + "learning_rate": 3.018298909694986e-05, + "logits/chosen": -2.1678502559661865, + "logits/rejected": -2.2958884239196777, + "logps/chosen": -328.1385498046875, + "logps/rejected": -411.2935485839844, + "loss": 0.6487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.87379390001297, + "rewards/margins": 0.17571806907653809, + "rewards/rejected": -1.0495120286941528, + "step": 475 + }, + { + "epoch": 0.49, + "learning_rate": 3.0094130365005395e-05, + "logits/chosen": -2.092456817626953, + "logits/rejected": -2.266845226287842, + "logps/chosen": -225.6428680419922, + "logps/rejected": -335.9315185546875, + "loss": 0.6278, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7999565601348877, + "rewards/margins": 0.19611772894859314, + "rewards/rejected": -0.996074378490448, + "step": 476 + }, + { + "epoch": 0.49, + "learning_rate": 3.0005204438037765e-05, + "logits/chosen": -2.205846071243286, + "logits/rejected": -2.052035093307495, + "logps/chosen": -361.5894775390625, + "logps/rejected": -309.0181884765625, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9857699871063232, + "rewards/margins": 0.09481573104858398, + "rewards/rejected": -1.0805857181549072, + "step": 477 + }, + { + "epoch": 0.5, + "learning_rate": 2.991621248904007e-05, + "logits/chosen": -2.26173996925354, + "logits/rejected": -2.031365394592285, + "logps/chosen": -369.19915771484375, + "logps/rejected": -294.8736877441406, + "loss": 0.7289, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.128753423690796, + "rewards/margins": -0.01859595626592636, + "rewards/rejected": -1.1101574897766113, + "step": 478 + }, + { + "epoch": 0.5, + "learning_rate": 2.9827155691876262e-05, + "logits/chosen": -1.8554052114486694, + "logits/rejected": -2.2059624195098877, + "logps/chosen": -317.9852600097656, + "logps/rejected": -354.1583251953125, + "loss": 0.5558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9314447045326233, + "rewards/margins": 0.35100990533828735, + "rewards/rejected": -1.2824546098709106, + "step": 479 + }, + { + "epoch": 0.5, + "learning_rate": 2.973803522126571e-05, + "logits/chosen": -1.9700522422790527, + "logits/rejected": -1.8781499862670898, + "logps/chosen": -307.0520935058594, + "logps/rejected": -313.407470703125, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0591685771942139, + "rewards/margins": 0.05153223127126694, + "rewards/rejected": -1.1107008457183838, + "step": 480 + }, + { + "epoch": 0.5, + "learning_rate": 2.9648852252767668e-05, + "logits/chosen": -2.1033730506896973, + "logits/rejected": -2.0786190032958984, + "logps/chosen": -452.3450012207031, + "logps/rejected": -511.24700927734375, + "loss": 0.6695, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.161694884300232, + "rewards/margins": 0.10767564922571182, + "rewards/rejected": -1.2693705558776855, + "step": 481 + }, + { + "epoch": 0.5, + "learning_rate": 2.9559607962765773e-05, + "logits/chosen": -2.103732109069824, + "logits/rejected": -2.215973377227783, + "logps/chosen": -326.31787109375, + "logps/rejected": -398.490478515625, + "loss": 0.6273, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.019250750541687, + "rewards/margins": 0.20697328448295593, + "rewards/rejected": -1.2262240648269653, + "step": 482 + }, + { + "epoch": 0.5, + "learning_rate": 2.947030352845255e-05, + "logits/chosen": -2.1636712551116943, + "logits/rejected": -2.1790904998779297, + "logps/chosen": -335.7892761230469, + "logps/rejected": -397.0627746582031, + "loss": 0.5934, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9784788489341736, + "rewards/margins": 0.27349093556404114, + "rewards/rejected": -1.251969814300537, + "step": 483 + }, + { + "epoch": 0.5, + "learning_rate": 2.9380940127813834e-05, + "logits/chosen": -2.165933609008789, + "logits/rejected": -2.2048957347869873, + "logps/chosen": -421.8327331542969, + "logps/rejected": -405.9539794921875, + "loss": 0.7287, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.179675579071045, + "rewards/margins": 0.05797319859266281, + "rewards/rejected": -1.2376487255096436, + "step": 484 + }, + { + "epoch": 0.5, + "learning_rate": 2.9291518939613315e-05, + "logits/chosen": -2.1944916248321533, + "logits/rejected": -2.3757896423339844, + "logps/chosen": -486.90582275390625, + "logps/rejected": -422.9759521484375, + "loss": 0.703, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3329401016235352, + "rewards/margins": 0.04630117490887642, + "rewards/rejected": -1.3792412281036377, + "step": 485 + }, + { + "epoch": 0.5, + "learning_rate": 2.9202041143376896e-05, + "logits/chosen": -1.9283084869384766, + "logits/rejected": -2.046499490737915, + "logps/chosen": -340.8263244628906, + "logps/rejected": -354.3287658691406, + "loss": 0.6069, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.017566204071045, + "rewards/margins": 0.2470768392086029, + "rewards/rejected": -1.2646431922912598, + "step": 486 + }, + { + "epoch": 0.5, + "learning_rate": 2.9112507919377213e-05, + "logits/chosen": -1.9073665142059326, + "logits/rejected": -1.9913743734359741, + "logps/chosen": -233.45367431640625, + "logps/rejected": -262.6000061035156, + "loss": 0.6771, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9823710918426514, + "rewards/margins": 0.10720720142126083, + "rewards/rejected": -1.0895782709121704, + "step": 487 + }, + { + "epoch": 0.51, + "learning_rate": 2.9022920448618e-05, + "logits/chosen": -2.0144119262695312, + "logits/rejected": -2.010000228881836, + "logps/chosen": -270.91790771484375, + "logps/rejected": -324.4449462890625, + "loss": 0.7047, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.950652539730072, + "rewards/margins": 0.02743140608072281, + "rewards/rejected": -0.9780839681625366, + "step": 488 + }, + { + "epoch": 0.51, + "learning_rate": 2.8933279912818566e-05, + "logits/chosen": -2.200765609741211, + "logits/rejected": -1.9420582056045532, + "logps/chosen": -326.8210144042969, + "logps/rejected": -328.5808410644531, + "loss": 0.6967, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0532310009002686, + "rewards/margins": 0.0682058334350586, + "rewards/rejected": -1.1214368343353271, + "step": 489 + }, + { + "epoch": 0.51, + "learning_rate": 2.8843587494398177e-05, + "logits/chosen": -2.297065496444702, + "logits/rejected": -2.367543935775757, + "logps/chosen": -335.68994140625, + "logps/rejected": -337.0887756347656, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0686800479888916, + "rewards/margins": 0.21843653917312622, + "rewards/rejected": -1.287116527557373, + "step": 490 + }, + { + "epoch": 0.51, + "learning_rate": 2.875384437646046e-05, + "logits/chosen": -2.2880589962005615, + "logits/rejected": -2.273855209350586, + "logps/chosen": -306.8229675292969, + "logps/rejected": -322.33050537109375, + "loss": 0.5281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9717899560928345, + "rewards/margins": 0.5251017808914185, + "rewards/rejected": -1.4968918561935425, + "step": 491 + }, + { + "epoch": 0.51, + "learning_rate": 2.8664051742777803e-05, + "logits/chosen": -2.119661569595337, + "logits/rejected": -1.9243615865707397, + "logps/chosen": -326.88348388671875, + "logps/rejected": -394.1810302734375, + "loss": 0.6364, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9266807436943054, + "rewards/margins": 0.20886686444282532, + "rewards/rejected": -1.1355476379394531, + "step": 492 + }, + { + "epoch": 0.51, + "learning_rate": 2.8574210777775755e-05, + "logits/chosen": -2.2085232734680176, + "logits/rejected": -2.188079357147217, + "logps/chosen": -315.6412353515625, + "logps/rejected": -290.0874938964844, + "loss": 0.7217, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.09775710105896, + "rewards/margins": 0.024385623633861542, + "rewards/rejected": -1.1221426725387573, + "step": 493 + }, + { + "epoch": 0.51, + "learning_rate": 2.8484322666517373e-05, + "logits/chosen": -2.236124277114868, + "logits/rejected": -2.3427038192749023, + "logps/chosen": -274.1993103027344, + "logps/rejected": -298.519775390625, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9323225617408752, + "rewards/margins": 0.053042903542518616, + "rewards/rejected": -0.9853654503822327, + "step": 494 + }, + { + "epoch": 0.51, + "learning_rate": 2.83943885946876e-05, + "logits/chosen": -2.3090035915374756, + "logits/rejected": -2.3383147716522217, + "logps/chosen": -351.7496643066406, + "logps/rejected": -356.39678955078125, + "loss": 0.776, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.235669493675232, + "rewards/margins": -0.09604780375957489, + "rewards/rejected": -1.139621615409851, + "step": 495 + }, + { + "epoch": 0.51, + "learning_rate": 2.8304409748577653e-05, + "logits/chosen": -2.266005516052246, + "logits/rejected": -2.2317018508911133, + "logps/chosen": -342.5093994140625, + "logps/rejected": -360.4115295410156, + "loss": 0.6021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.800186276435852, + "rewards/margins": 0.3189813792705536, + "rewards/rejected": -1.119167685508728, + "step": 496 + }, + { + "epoch": 0.51, + "learning_rate": 2.821438731506933e-05, + "logits/chosen": -2.1894872188568115, + "logits/rejected": -2.2929162979125977, + "logps/chosen": -366.48651123046875, + "logps/rejected": -422.5482177734375, + "loss": 0.6365, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1337993144989014, + "rewards/margins": 0.18402203917503357, + "rewards/rejected": -1.3178215026855469, + "step": 497 + }, + { + "epoch": 0.52, + "learning_rate": 2.8124322481619388e-05, + "logits/chosen": -2.1163060665130615, + "logits/rejected": -2.0039188861846924, + "logps/chosen": -361.08380126953125, + "logps/rejected": -269.610595703125, + "loss": 0.7452, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3107686042785645, + "rewards/margins": -0.045146312564611435, + "rewards/rejected": -1.2656222581863403, + "step": 498 + }, + { + "epoch": 0.52, + "learning_rate": 2.803421643624386e-05, + "logits/chosen": -2.105262041091919, + "logits/rejected": -1.973724126815796, + "logps/chosen": -311.098388671875, + "logps/rejected": -314.92181396484375, + "loss": 0.6544, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0980561971664429, + "rewards/margins": 0.178094744682312, + "rewards/rejected": -1.2761509418487549, + "step": 499 + }, + { + "epoch": 0.52, + "learning_rate": 2.7944070367502402e-05, + "logits/chosen": -2.1020731925964355, + "logits/rejected": -2.2140867710113525, + "logps/chosen": -277.33599853515625, + "logps/rejected": -283.2631530761719, + "loss": 0.6616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.192931056022644, + "rewards/margins": 0.10631629824638367, + "rewards/rejected": -1.2992472648620605, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 965, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}